Files
claudetools/projects/msp-tools/guru-connect/infrastructure/grafana-dashboard.json
Mike Swanson 8521c95755 Phase 1 Week 2: Infrastructure & Monitoring
Added comprehensive production infrastructure:

Systemd Service:
- guruconnect.service with auto-restart, resource limits, security hardening
- setup-systemd.sh installation script

Prometheus Metrics:
- Added prometheus-client dependency
- Created metrics module tracking:
  - HTTP requests (count, latency)
  - Sessions (created, closed, active)
  - Connections (WebSocket, by type)
  - Errors (by type)
  - Database operations (count, latency)
  - Server uptime
- Added /metrics endpoint
- Background task for uptime updates

Monitoring Configuration:
- prometheus.yml with scrape configs for GuruConnect and node_exporter
- alerts.yml with alerting rules
- grafana-dashboard.json with 10 panels
- setup-monitoring.sh installation script

PostgreSQL Backups:
- backup-postgres.sh with gzip compression
- restore-postgres.sh with safety checks
- guruconnect-backup.service and .timer for automated daily backups
- Retention policy: 30 daily, 4 weekly, 6 monthly

Health Monitoring:
- health-monitor.sh checking HTTP, disk, memory, database, metrics
- guruconnect.logrotate for log rotation
- Email alerts on failures

Updated CHECKLIST_STATE.json to reflect Week 1 completion (77%) and Week 2 start.
Created PHASE1_WEEK2_INFRASTRUCTURE.md with comprehensive planning.

Ready for deployment and testing on RMM server.
2026-01-17 20:24:32 -07:00

229 lines
6.1 KiB
JSON

{
  "dashboard": {
    "title": "GuruConnect Monitoring",
    "tags": ["guruconnect", "monitoring"],
    "timezone": "browser",
    "schemaVersion": 16,
    "version": 1,
    "refresh": "10s",
    "panels": [
      {
        "id": 1,
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
        "type": "graph",
        "title": "Active Sessions",
        "targets": [
          {
            "expr": "guruconnect_active_sessions",
            "legendFormat": "Active Sessions",
            "refId": "A"
          }
        ],
        "yaxes": [
          {"label": "Sessions", "show": true},
          {"show": false}
        ],
        "lines": true,
        "fill": 1,
        "linewidth": 2,
        "tooltip": {"shared": true}
      },
      {
        "id": 2,
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
        "type": "graph",
        "title": "Requests per Second",
        "targets": [
          {
            "expr": "rate(guruconnect_requests_total[1m])",
            "legendFormat": "{{method}} {{path}}",
            "refId": "A"
          }
        ],
        "yaxes": [
          {"label": "Requests/sec", "show": true},
          {"show": false}
        ],
        "lines": true,
        "fill": 1,
        "linewidth": 2,
        "tooltip": {"shared": true}
      },
      {
        "id": 3,
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
        "type": "graph",
        "title": "Error Rate",
        "targets": [
          {
            "expr": "rate(guruconnect_errors_total[1m])",
            "legendFormat": "{{error_type}}",
            "refId": "A"
          }
        ],
        "yaxes": [
          {"label": "Errors/sec", "show": true},
          {"show": false}
        ],
        "lines": true,
        "fill": 1,
        "linewidth": 2,
        "tooltip": {"shared": true},
        "alert": {
          "conditions": [
            {
              "evaluator": {"params": [10], "type": "gt"},
              "operator": {"type": "and"},
              "query": {"params": ["A", "1m", "now"]},
              "reducer": {"params": [], "type": "avg"},
              "type": "query"
            }
          ],
          "executionErrorState": "alerting",
          "frequency": "60s",
          "handler": 1,
          "name": "High Error Rate",
          "noDataState": "no_data",
          "notifications": []
        }
      },
      {
        "id": 4,
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
        "type": "graph",
        "title": "Request Latency (p50, p95, p99)",
        "targets": [
          {
            "expr": "histogram_quantile(0.50, sum by (le) (rate(guruconnect_request_duration_seconds_bucket[5m])))",
            "legendFormat": "p50",
            "refId": "A"
          },
          {
            "expr": "histogram_quantile(0.95, sum by (le) (rate(guruconnect_request_duration_seconds_bucket[5m])))",
            "legendFormat": "p95",
            "refId": "B"
          },
          {
            "expr": "histogram_quantile(0.99, sum by (le) (rate(guruconnect_request_duration_seconds_bucket[5m])))",
            "legendFormat": "p99",
            "refId": "C"
          }
        ],
        "yaxes": [
          {"label": "Latency (seconds)", "show": true, "format": "s"},
          {"show": false}
        ],
        "lines": true,
        "fill": 0,
        "linewidth": 2,
        "tooltip": {"shared": true}
      },
      {
        "id": 5,
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
        "type": "graph",
        "title": "Active Connections by Type",
        "targets": [
          {
            "expr": "guruconnect_active_connections",
            "legendFormat": "{{conn_type}}",
            "refId": "A"
          }
        ],
        "yaxes": [
          {"label": "Connections", "show": true},
          {"show": false}
        ],
        "lines": true,
        "fill": 1,
        "linewidth": 2,
        "stack": true,
        "tooltip": {"shared": true}
      },
      {
        "id": 6,
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
        "type": "graph",
        "title": "Database Query Duration",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum by (le, operation) (rate(guruconnect_db_query_duration_seconds_bucket[5m])))",
            "legendFormat": "{{operation}} p95",
            "refId": "A"
          }
        ],
        "yaxes": [
          {"label": "Duration (seconds)", "show": true, "format": "s"},
          {"show": false}
        ],
        "lines": true,
        "fill": 0,
        "linewidth": 2,
        "tooltip": {"shared": true}
      },
      {
        "id": 7,
        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 24},
        "type": "singlestat",
        "title": "Server Uptime",
        "targets": [
          {
            "expr": "guruconnect_uptime_seconds",
            "refId": "A"
          }
        ],
        "format": "s",
        "valueName": "current",
        "sparkline": {"show": true}
      },
      {
        "id": 8,
        "gridPos": {"h": 4, "w": 6, "x": 6, "y": 24},
        "type": "singlestat",
        "title": "Total Sessions Created",
        "targets": [
          {
            "expr": "sum(guruconnect_sessions_total{status=\"created\"})",
            "refId": "A"
          }
        ],
        "format": "short",
        "valueName": "current",
        "sparkline": {"show": true}
      },
      {
        "id": 9,
        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 24},
        "type": "singlestat",
        "title": "Total Requests",
        "targets": [
          {
            "expr": "sum(guruconnect_requests_total)",
            "refId": "A"
          }
        ],
        "format": "short",
        "valueName": "current",
        "sparkline": {"show": true}
      },
      {
        "id": 10,
        "gridPos": {"h": 4, "w": 6, "x": 18, "y": 24},
        "type": "singlestat",
        "title": "Total Errors",
        "targets": [
          {
            "expr": "sum(guruconnect_errors_total)",
            "refId": "A"
          }
        ],
        "format": "short",
        "valueName": "current",
        "sparkline": {"show": true},
        "colorValue": true,
        "thresholds": "10,100",
        "colors": ["#299c46", "#e0b400", "#d44a3a"]
      }
    ]
  }
}