Added comprehensive production infrastructure:

Systemd Service:
- guruconnect.service with auto-restart, resource limits, security hardening
- setup-systemd.sh installation script

Prometheus Metrics:
- Added prometheus-client dependency
- Created metrics module tracking:
  - HTTP requests (count, latency)
  - Sessions (created, closed, active)
  - Connections (WebSocket, by type)
  - Errors (by type)
  - Database operations (count, latency)
  - Server uptime
- Added /metrics endpoint
- Background task for uptime updates

Monitoring Configuration:
- prometheus.yml with scrape configs for GuruConnect and node_exporter
- alerts.yml with alerting rules
- grafana-dashboard.json with 10 panels
- setup-monitoring.sh installation script

PostgreSQL Backups:
- backup-postgres.sh with gzip compression
- restore-postgres.sh with safety checks
- guruconnect-backup.service and .timer for automated daily backups
- Retention policy: 30 daily, 4 weekly, 6 monthly

Health Monitoring:
- health-monitor.sh checking HTTP, disk, memory, database, metrics
- guruconnect.logrotate for log rotation
- Email alerts on failures

Updated CHECKLIST_STATE.json to reflect Week 1 completion (77%) and Week 2 start.
Created PHASE1_WEEK2_INFRASTRUCTURE.md with comprehensive planning.

Ready for deployment and testing on RMM server.
229 lines
6.1 KiB
JSON
{
|
|
"dashboard": {
|
|
"title": "GuruConnect Monitoring",
|
|
"tags": ["guruconnect", "monitoring"],
|
|
"timezone": "browser",
|
|
"schemaVersion": 16,
|
|
"version": 1,
|
|
"refresh": "10s",
|
|
"panels": [
|
|
{
|
|
"id": 1,
|
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
|
|
"type": "graph",
|
|
"title": "Active Sessions",
|
|
"targets": [
|
|
{
|
|
"expr": "guruconnect_active_sessions",
|
|
"legendFormat": "Active Sessions",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"yaxes": [
|
|
{"label": "Sessions", "show": true},
|
|
{"show": false}
|
|
],
|
|
"lines": true,
|
|
"fill": 1,
|
|
"linewidth": 2,
|
|
"tooltip": {"shared": true}
|
|
},
|
|
{
|
|
"id": 2,
|
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
|
|
"type": "graph",
|
|
"title": "Requests per Second",
|
|
"targets": [
|
|
{
|
|
"expr": "rate(guruconnect_requests_total[1m])",
|
|
"legendFormat": "{{method}} {{path}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"yaxes": [
|
|
{"label": "Requests/sec", "show": true},
|
|
{"show": false}
|
|
],
|
|
"lines": true,
|
|
"fill": 1,
|
|
"linewidth": 2,
|
|
"tooltip": {"shared": true}
|
|
},
|
|
{
|
|
"id": 3,
|
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
|
"type": "graph",
|
|
"title": "Error Rate",
|
|
"targets": [
|
|
{
|
|
"expr": "rate(guruconnect_errors_total[1m])",
|
|
"legendFormat": "{{error_type}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"yaxes": [
|
|
{"label": "Errors/sec", "show": true},
|
|
{"show": false}
|
|
],
|
|
"lines": true,
|
|
"fill": 1,
|
|
"linewidth": 2,
|
|
"tooltip": {"shared": true},
|
|
"alert": {
|
|
"conditions": [
|
|
{
|
|
"evaluator": {"params": [10], "type": "gt"},
|
|
"operator": {"type": "and"},
|
|
"query": {"params": ["A", "1m", "now"]},
|
|
"reducer": {"params": [], "type": "avg"},
|
|
"type": "query"
|
|
}
|
|
],
|
|
"executionErrorState": "alerting",
|
|
"frequency": "60s",
|
|
"handler": 1,
|
|
"name": "High Error Rate",
|
|
"noDataState": "no_data",
|
|
"notifications": []
|
|
}
|
|
},
|
|
{
|
|
"id": 4,
|
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
|
"type": "graph",
|
|
"title": "Request Latency (p50, p95, p99)",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.50, rate(guruconnect_request_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "p50",
|
|
"refId": "A"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(guruconnect_request_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "p95",
|
|
"refId": "B"
|
|
},
|
|
{
|
|
"expr": "histogram_quantile(0.99, rate(guruconnect_request_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "p99",
|
|
"refId": "C"
|
|
}
|
|
],
|
|
"yaxes": [
|
|
{"label": "Latency (seconds)", "show": true, "format": "s"},
|
|
{"show": false}
|
|
],
|
|
"lines": true,
|
|
"fill": 0,
|
|
"linewidth": 2,
|
|
"tooltip": {"shared": true}
|
|
},
|
|
{
|
|
"id": 5,
|
|
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
|
|
"type": "graph",
|
|
"title": "Active Connections by Type",
|
|
"targets": [
|
|
{
|
|
"expr": "guruconnect_active_connections",
|
|
"legendFormat": "{{conn_type}}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"yaxes": [
|
|
{"label": "Connections", "show": true},
|
|
{"show": false}
|
|
],
|
|
"lines": true,
|
|
"fill": 1,
|
|
"linewidth": 2,
|
|
"stack": true,
|
|
"tooltip": {"shared": true}
|
|
},
|
|
{
|
|
"id": 6,
|
|
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
|
|
"type": "graph",
|
|
"title": "Database Query Duration",
|
|
"targets": [
|
|
{
|
|
"expr": "histogram_quantile(0.95, rate(guruconnect_db_query_duration_seconds_bucket[5m]))",
|
|
"legendFormat": "{{operation}} p95",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"yaxes": [
|
|
{"label": "Duration (seconds)", "show": true, "format": "s"},
|
|
{"show": false}
|
|
],
|
|
"lines": true,
|
|
"fill": 0,
|
|
"linewidth": 2,
|
|
"tooltip": {"shared": true}
|
|
},
|
|
{
|
|
"id": 7,
|
|
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 24},
|
|
"type": "singlestat",
|
|
"title": "Server Uptime",
|
|
"targets": [
|
|
{
|
|
"expr": "guruconnect_uptime_seconds",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"format": "s",
|
|
"valueName": "current",
|
|
"sparkline": {"show": true}
|
|
},
|
|
{
|
|
"id": 8,
|
|
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 24},
|
|
"type": "singlestat",
|
|
"title": "Total Sessions Created",
|
|
"targets": [
|
|
{
|
|
"expr": "guruconnect_sessions_total{status=\"created\"}",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"format": "short",
|
|
"valueName": "current",
|
|
"sparkline": {"show": true}
|
|
},
|
|
{
|
|
"id": 9,
|
|
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 24},
|
|
"type": "singlestat",
|
|
"title": "Total Requests",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(guruconnect_requests_total)",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"format": "short",
|
|
"valueName": "current",
|
|
"sparkline": {"show": true}
|
|
},
|
|
{
|
|
"id": 10,
|
|
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 24},
|
|
"type": "singlestat",
|
|
"title": "Total Errors",
|
|
"targets": [
|
|
{
|
|
"expr": "sum(guruconnect_errors_total)",
|
|
"refId": "A"
|
|
}
|
|
],
|
|
"format": "short",
|
|
"valueName": "current",
|
|
"sparkline": {"show": true},
|
|
"thresholds": "10,100",
|
|
"colors": ["#299c46", "#e0b400", "#d44a3a"]
|
|
}
|
|
]
|
|
}
|
|
}
|