Phase 1 Week 2: Infrastructure & Monitoring

Added comprehensive production infrastructure:

Systemd Service:
- guruconnect.service with auto-restart, resource limits, security hardening
- setup-systemd.sh installation script

Prometheus Metrics:
- Added prometheus-client dependency
- Created metrics module tracking:
  - HTTP requests (count, latency)
  - Sessions (created, closed, active)
  - Connections (WebSocket, by type)
  - Errors (by type)
  - Database operations (count, latency)
  - Server uptime
- Added /metrics endpoint
- Background task for uptime updates

Monitoring Configuration:
- prometheus.yml with scrape configs for GuruConnect and node_exporter
- alerts.yml with alerting rules
- grafana-dashboard.json with 10 panels
- setup-monitoring.sh installation script

PostgreSQL Backups:
- backup-postgres.sh with gzip compression
- restore-postgres.sh with safety checks
- guruconnect-backup.service and .timer for automated daily backups
- Retention policy: 30 daily, 4 weekly, 6 monthly

Health Monitoring:
- health-monitor.sh checking HTTP, disk, memory, database, metrics
- guruconnect.logrotate for log rotation
- Email alerts on failures

Updated CHECKLIST_STATE.json to reflect Week 1 completion (77%) and Week 2 start.
Created PHASE1_WEEK2_INFRASTRUCTURE.md with comprehensive planning.

Ready for deployment and testing on RMM server.
2026-01-17 20:24:32 -07:00
parent 2481b54a65
commit 8521c95755
17 changed files with 1877 additions and 25 deletions
--- a/projects/msp-tools/guru-connect/infrastructure/alerts.yml
+++ b/projects/msp-tools/guru-connect/infrastructure/alerts.yml
@@ -0,0 +1,68 @@
+# GuruConnect alerting rules for Prometheus
+#
+# One rule group covering GuruConnect availability, errors, load, latency and restarts.
+# Install as /etc/prometheus/alerts.yml and list it under rule_files in prometheus.yml
+
+groups:
+  - name: guruconnect_alerts
+    interval: 30s
+    rules:
+      # The guruconnect scrape target has stopped answering
+      - alert: GuruConnectDown
+        expr: 'up{job="guruconnect"} == 0'
+        for: 1m
+        annotations:
+          summary: 'GuruConnect server is down'
+          description: 'GuruConnect server on {{ $labels.instance }} has been down for more than 1 minute'
+        labels:
+          severity: critical
+
+      # Sustained error rate above 10 errors/second
+      - alert: HighErrorRate
+        expr: 'rate(guruconnect_errors_total[5m]) > 10'
+        for: 5m
+        annotations:
+          summary: 'High error rate detected'
+          description: 'Error rate is {{ $value | humanize }} errors/second over the last 5 minutes'
+        labels:
+          severity: warning
+
+      # More than 100 concurrently active sessions
+      - alert: TooManyActiveSessions
+        expr: 'guruconnect_active_sessions > 100'
+        for: 5m
+        annotations:
+          summary: 'Too many active sessions'
+          description: 'There are {{ $value }} active sessions, exceeding threshold of 100'
+        labels:
+          severity: warning
+
+      # 95th percentile request latency above 1 second
+      - alert: HighRequestLatency
+        expr: 'histogram_quantile(0.95, rate(guruconnect_request_duration_seconds_bucket[5m])) > 1'
+        for: 5m
+        annotations:
+          summary: 'High request latency'
+          description: '95th percentile request latency is {{ $value | humanize }}s'
+        labels:
+          severity: warning
+
+      # Database operations erroring at more than 1 per second
+      - alert: DatabaseOperationsFailure
+        expr: 'rate(guruconnect_db_operations_total{status="error"}[5m]) > 1'
+        for: 5m
+        annotations:
+          summary: 'Database operations failing'
+          description: 'Database error rate is {{ $value | humanize }} errors/second'
+        labels:
+          severity: critical
+
+      # Reported uptime under 5 minutes, i.e. the server restarted recently
+      - alert: ServerRestarted
+        expr: 'guruconnect_uptime_seconds < 300'
+        for: 1m
+        annotations:
+          summary: 'Server recently restarted'
+          description: 'Server uptime is only {{ $value | humanize }}s, indicating a recent restart'
+        labels:
+          severity: info
--- a/projects/msp-tools/guru-connect/infrastructure/grafana-dashboard.json
+++ b/projects/msp-tools/guru-connect/infrastructure/grafana-dashboard.json
@@ -0,0 +1,228 @@
+{
+  "dashboard": {
+    "title": "GuruConnect Monitoring",
+    "tags": ["guruconnect", "monitoring"],
+    "timezone": "browser",
+    "schemaVersion": 16,
+    "version": 1,
+    "refresh": "10s",
+    "panels": [
+      {
+        "id": 1,
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
+        "type": "graph",
+        "title": "Active Sessions",
+        "targets": [
+          {
+            "expr": "guruconnect_active_sessions",
+            "legendFormat": "Active Sessions",
+            "refId": "A"
+          }
+        ],
+        "yaxes": [
+          {"label": "Sessions", "show": true},
+          {"show": false}
+        ],
+        "lines": true,
+        "fill": 1,
+        "linewidth": 2,
+        "tooltip": {"shared": true}
+      },
+      {
+        "id": 2,
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
+        "type": "graph",
+        "title": "Requests per Second",
+        "targets": [
+          {
+            "expr": "rate(guruconnect_requests_total[1m])",
+            "legendFormat": "{{method}} {{path}}",
+            "refId": "A"
+          }
+        ],
+        "yaxes": [
+          {"label": "Requests/sec", "show": true},
+          {"show": false}
+        ],
+        "lines": true,
+        "fill": 1,
+        "linewidth": 2,
+        "tooltip": {"shared": true}
+      },
+      {
+        "id": 3,
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
+        "type": "graph",
+        "title": "Error Rate",
+        "targets": [
+          {
+            "expr": "rate(guruconnect_errors_total[1m])",
+            "legendFormat": "{{error_type}}",
+            "refId": "A"
+          }
+        ],
+        "yaxes": [
+          {"label": "Errors/sec", "show": true},
+          {"show": false}
+        ],
+        "lines": true,
+        "fill": 1,
+        "linewidth": 2,
+        "tooltip": {"shared": true},
+        "alert": {
+          "conditions": [
+            {
+              "evaluator": {"params": [10], "type": "gt"},
+              "operator": {"type": "and"},
+              "query": {"params": ["A", "1m", "now"]},
+              "reducer": {"params": [], "type": "avg"},
+              "type": "query"
+            }
+          ],
+          "executionErrorState": "alerting",
+          "frequency": "60s",
+          "handler": 1,
+          "name": "High Error Rate",
+          "noDataState": "no_data",
+          "notifications": []
+        }
+      },
+      {
+        "id": 4,
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
+        "type": "graph",
+        "title": "Request Latency (p50, p95, p99)",
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.50, rate(guruconnect_request_duration_seconds_bucket[5m]))",
+            "legendFormat": "p50",
+            "refId": "A"
+          },
+          {
+            "expr": "histogram_quantile(0.95, rate(guruconnect_request_duration_seconds_bucket[5m]))",
+            "legendFormat": "p95",
+            "refId": "B"
+          },
+          {
+            "expr": "histogram_quantile(0.99, rate(guruconnect_request_duration_seconds_bucket[5m]))",
+            "legendFormat": "p99",
+            "refId": "C"
+          }
+        ],
+        "yaxes": [
+          {"label": "Latency (seconds)", "show": true, "format": "s"},
+          {"show": false}
+        ],
+        "lines": true,
+        "fill": 0,
+        "linewidth": 2,
+        "tooltip": {"shared": true}
+      },
+      {
+        "id": 5,
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
+        "type": "graph",
+        "title": "Active Connections by Type",
+        "targets": [
+          {
+            "expr": "guruconnect_active_connections",
+            "legendFormat": "{{conn_type}}",
+            "refId": "A"
+          }
+        ],
+        "yaxes": [
+          {"label": "Connections", "show": true},
+          {"show": false}
+        ],
+        "lines": true,
+        "fill": 1,
+        "linewidth": 2,
+        "stack": true,
+        "tooltip": {"shared": true}
+      },
+      {
+        "id": 6,
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
+        "type": "graph",
+        "title": "Database Query Duration",
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.95, rate(guruconnect_db_query_duration_seconds_bucket[5m]))",
+            "legendFormat": "{{operation}} p95",
+            "refId": "A"
+          }
+        ],
+        "yaxes": [
+          {"label": "Duration (seconds)", "show": true, "format": "s"},
+          {"show": false}
+        ],
+        "lines": true,
+        "fill": 0,
+        "linewidth": 2,
+        "tooltip": {"shared": true}
+      },
+      {
+        "id": 7,
+        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 24},
+        "type": "singlestat",
+        "title": "Server Uptime",
+        "targets": [
+          {
+            "expr": "guruconnect_uptime_seconds",
+            "refId": "A"
+          }
+        ],
+        "format": "s",
+        "valueName": "current",
+        "sparkline": {"show": true}
+      },
+      {
+        "id": 8,
+        "gridPos": {"h": 4, "w": 6, "x": 6, "y": 24},
+        "type": "singlestat",
+        "title": "Total Sessions Created",
+        "targets": [
+          {
+            "expr": "guruconnect_sessions_total{status=\"created\"}",
+            "refId": "A"
+          }
+        ],
+        "format": "short",
+        "valueName": "current",
+        "sparkline": {"show": true}
+      },
+      {
+        "id": 9,
+        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 24},
+        "type": "singlestat",
+        "title": "Total Requests",
+        "targets": [
+          {
+            "expr": "sum(guruconnect_requests_total)",
+            "refId": "A"
+          }
+        ],
+        "format": "short",
+        "valueName": "current",
+        "sparkline": {"show": true}
+      },
+      {
+        "id": 10,
+        "gridPos": {"h": 4, "w": 6, "x": 18, "y": 24},
+        "type": "singlestat",
+        "title": "Total Errors",
+        "targets": [
+          {
+            "expr": "sum(guruconnect_errors_total)",
+            "refId": "A"
+          }
+        ],
+        "format": "short",
+        "valueName": "current",
+        "sparkline": {"show": true},
+        "thresholds": "10,100",
+        "colors": ["#299c46", "#e0b400", "#d44a3a"]
+      }
+    ]
+  }
+}
--- a/projects/msp-tools/guru-connect/infrastructure/prometheus.yml
+++ b/projects/msp-tools/guru-connect/infrastructure/prometheus.yml
@@ -0,0 +1,45 @@
+# Prometheus configuration for GuruConnect
+#
+# Install Prometheus:
+#   sudo apt-get install prometheus
+#
+# Copy this file to:
+#   sudo cp prometheus.yml /etc/prometheus/prometheus.yml
+#
+# Restart Prometheus:
+#   sudo systemctl restart prometheus
+
+global:
+  scrape_interval: 15s  # Scrape metrics every 15 seconds
+  evaluation_interval: 15s  # Default rule evaluation interval (a rule group may override it)
+  external_labels:
+    cluster: 'guruconnect-production'
+    environment: 'production'
+
+# Scrape configurations
+scrape_configs:
+  # GuruConnect server metrics
+  - job_name: 'guruconnect'
+    static_configs:
+      - targets: ['172.16.3.30:3002']
+        labels:
+          service: 'guruconnect-server'
+          instance: 'rmm-server'
+
+  # Node Exporter (system metrics)
+  # Install: sudo apt-get install prometheus-node-exporter
+  - job_name: 'node_exporter'
+    static_configs:
+      - targets: ['172.16.3.30:9100']
+        labels:
+          instance: 'rmm-server'
+
+# Alert rules (setup-monitoring.sh installs alerts.yml at this path)
+rule_files:
+  - '/etc/prometheus/alerts.yml'
+
+# Alertmanager configuration (optional)
+# alerting:
+#   alertmanagers:
+#     - static_configs:
+#         - targets: ['localhost:9093']
--- a/projects/msp-tools/guru-connect/infrastructure/setup-monitoring.sh
+++ b/projects/msp-tools/guru-connect/infrastructure/setup-monitoring.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# GuruConnect Monitoring Setup Script
+# Installs and configures Prometheus and Grafana
+
+set -euo pipefail  # also abort on unset variables and on a failure inside a pipeline
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+echo "========================================="
+echo "GuruConnect Monitoring Setup"
+echo "========================================="
+
+# Package installation and writes under /etc require root; report on stderr
+if [ "$EUID" -ne 0 ]; then
+    echo -e "${RED}ERROR: This script must be run as root (sudo)${NC}" >&2
+    exit 1
+fi
+
+# Update package list
+echo "Updating package list..."
+apt-get update
+
+# Install Prometheus
+echo ""
+echo "Installing Prometheus..."
+apt-get install -y prometheus prometheus-node-exporter
+
+# Copy Prometheus configuration. Paths are resolved relative to this script's
+# own directory so the script also works when invoked from another directory.
+echo "Copying Prometheus configuration..."
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cp "$SCRIPT_DIR/prometheus.yml" /etc/prometheus/prometheus.yml
+if [ -f "$SCRIPT_DIR/alerts.yml" ]; then
+    cp "$SCRIPT_DIR/alerts.yml" /etc/prometheus/alerts.yml
+fi
+
+# Set permissions
+chown prometheus:prometheus /etc/prometheus/prometheus.yml
+if [ -f "/etc/prometheus/alerts.yml" ]; then
+    chown prometheus:prometheus /etc/prometheus/alerts.yml
+fi
+
+# Restart Prometheus and the node exporter, then enable both at boot
+echo "Restarting Prometheus..."
+systemctl restart prometheus prometheus-node-exporter
+systemctl enable prometheus prometheus-node-exporter
+
+# Install Grafana: key goes in a dedicated keyring (apt-key is deprecated), then the repo is added
+echo ""
+echo "Installing Grafana..."
+apt-get install -y apt-transport-https wget gnupg curl
+wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor > /usr/share/keyrings/grafana.gpg
+echo "deb [signed-by=/usr/share/keyrings/grafana.gpg] https://apt.grafana.com stable main" > /etc/apt/sources.list.d/grafana.list
+apt-get update
+apt-get install -y grafana
+
+# Start Grafana
+echo "Starting Grafana..."
+systemctl start grafana-server
+systemctl enable grafana-server
+
+# Wait for Grafana's HTTP API to answer (up to 30s) instead of a fixed sleep
+timeout 30 bash -c 'until curl -fs -o /dev/null http://localhost:3000/api/health; do sleep 1; done' || true
+
+# Configure Grafana data source (Prometheus); best-effort, but warn instead of failing silently
+echo ""
+echo "Configuring Grafana data source..."
+curl -X POST -H "Content-Type: application/json" \
+    -d '{
+        "name":"Prometheus",
+        "type":"prometheus",
+        "url":"http://localhost:9090",
+        "access":"proxy",
+        "isDefault":true
+    }' \
+    http://admin:admin@localhost:3000/api/datasources || echo -e "${YELLOW}WARNING: could not create the Grafana data source; add it manually${NC}"
+
+# Completion banner, service URLs and follow-up steps; each argument prints as one line
+printf '%s\n' \
+    "" \
+    "=========================================" \
+    "Monitoring Setup Complete!" \
+    "=========================================" \
+    "" \
+    "Services:" \
+    "  Prometheus:  http://172.16.3.30:9090" \
+    "  Grafana:     http://172.16.3.30:3000  (default login: admin/admin)" \
+    "  Node Exporter: http://172.16.3.30:9100/metrics" \
+    "" "Next steps:" \
+    "1. Access Grafana at http://172.16.3.30:3000" \
+    "2. Login with default credentials (admin/admin)" \
+    "3. Change the default password" \
+    "4. Import the dashboard from grafana-dashboard.json" \
+    "5. Configure alerting (optional)" \
+    "" "To import the dashboard:" \
+    "  Grafana > Dashboards > Import > Upload JSON file" \
+    "  Select: infrastructure/grafana-dashboard.json" \
+    ""