Phase 1 Week 2: Infrastructure & Monitoring

Added comprehensive production infrastructure:

Systemd Service:
- guruconnect.service with auto-restart, resource limits, security hardening
- setup-systemd.sh installation script

Prometheus Metrics:
- Added prometheus-client dependency
- Created metrics module tracking:
  - HTTP requests (count, latency)
  - Sessions (created, closed, active)
  - Connections (WebSocket, by type)
  - Errors (by type)
  - Database operations (count, latency)
  - Server uptime
- Added /metrics endpoint
- Background task for uptime updates

Monitoring Configuration:
- prometheus.yml with scrape configs for GuruConnect and node_exporter
- alerts.yml with alerting rules
- grafana-dashboard.json with 10 panels
- setup-monitoring.sh installation script

PostgreSQL Backups:
- backup-postgres.sh with gzip compression
- restore-postgres.sh with safety checks
- guruconnect-backup.service and .timer for automated daily backups
- Retention policy: 30 daily, 4 weekly, 6 monthly

Health Monitoring:
- health-monitor.sh checking HTTP, disk, memory, database, metrics
- guruconnect.logrotate for log rotation
- Email alerts on failures

Updated CHECKLIST_STATE.json to reflect Week 1 completion (77%) and Week 2 start.
Created PHASE1_WEEK2_INFRASTRUCTURE.md with comprehensive planning.

Ready for deployment and testing on RMM server.
2026-01-17 20:24:32 -07:00
parent 2481b54a65
commit 8521c95755
17 changed files with 1877 additions and 25 deletions
--- a/projects/msp-tools/guru-connect/server/health-monitor.sh
+++ b/projects/msp-tools/guru-connect/server/health-monitor.sh
@@ -0,0 +1,148 @@
+#!/bin/bash
+# GuruConnect Health Monitoring Script
+# Checks server health and sends alerts if issues detected
+
+set -e
+
+# Configuration
+HEALTH_URL="http://172.16.3.30:3002/health"
+ALERT_EMAIL="admin@azcomputerguru.com"
+LOG_FILE="/var/log/guruconnect/health-monitor.log"
+
+# Thresholds
+MAX_DISK_USAGE=90
+MAX_MEMORY_USAGE=90
+MAX_SESSIONS=100
+
+# Colors
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m'
+
+# Logging function
+log() {
+    echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
+}
+
+# Health check result
+HEALTH_STATUS="OK"
+HEALTH_ISSUES=()
+
+log "========================================="
+log "GuruConnect Health Check"
+log "========================================="
+
+# Check 1: HTTP health endpoint
+log "Checking HTTP health endpoint..."
+if HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$HEALTH_URL" --max-time 5); then
+    if [ "$HTTP_STATUS" = "200" ]; then
+        log "[OK] HTTP health endpoint responding (HTTP $HTTP_STATUS)"
+    else
+        log "[ERROR] HTTP health endpoint returned HTTP $HTTP_STATUS"
+        HEALTH_STATUS="ERROR"
+        HEALTH_ISSUES+=("HTTP health endpoint returned HTTP $HTTP_STATUS")
+    fi
+else
+    log "[ERROR] HTTP health endpoint not reachable"
+    HEALTH_STATUS="ERROR"
+    HEALTH_ISSUES+=("HTTP health endpoint not reachable")
+fi
+
+# Check 2: Systemd service status
+# Asks systemd whether the guruconnect unit is active; anything else is an ERROR.
+log "Checking systemd service status..."
+if ! systemctl is-active --quiet guruconnect 2>/dev/null; then
+    log "[ERROR] guruconnect service is not running"
+    HEALTH_STATUS="ERROR"
+    HEALTH_ISSUES+=("guruconnect service is not running")
+else
+    log "[OK] guruconnect service is running"
+fi
+
+# Check 3: Disk space
+log "Checking disk space..."
+DISK_USAGE=$(df -h / | awk 'NR==2 {print $5}' | sed 's/%//')
+if [ "$DISK_USAGE" -lt "$MAX_DISK_USAGE" ]; then
+    log "[OK] Disk usage: ${DISK_USAGE}% (threshold: ${MAX_DISK_USAGE}%)"
+else
+    log "[WARNING] Disk usage: ${DISK_USAGE}% (threshold: ${MAX_DISK_USAGE}%)"
+    HEALTH_STATUS="WARNING"
+    HEALTH_ISSUES+=("Disk usage ${DISK_USAGE}% exceeds threshold")
+fi
+
+# Check 4: Memory usage
+log "Checking memory usage..."
+MEMORY_USAGE=$(free | awk 'NR==2 {printf "%.0f", $3/$2 * 100.0}')
+if [ "$MEMORY_USAGE" -lt "$MAX_MEMORY_USAGE" ]; then
+    log "[OK] Memory usage: ${MEMORY_USAGE}% (threshold: ${MAX_MEMORY_USAGE}%)"
+else
+    log "[WARNING] Memory usage: ${MEMORY_USAGE}% (threshold: ${MAX_MEMORY_USAGE}%)"
+    HEALTH_STATUS="WARNING"
+    HEALTH_ISSUES+=("Memory usage ${MEMORY_USAGE}% exceeds threshold")
+fi
+
+# Check 5: Database connectivity
+# Only verifies that the postgresql systemd unit is active; no connection to
+# the database is attempted. An inactive unit is recorded as a WARNING.
+log "Checking database connectivity..."
+if systemctl is-active --quiet postgresql 2>/dev/null; then
+    log "[OK] PostgreSQL service is running"
+else
+    log "[WARNING] PostgreSQL service is not running"
+    # Never downgrade: an ERROR set by an earlier check must survive so the
+    # script still exits with the ERROR code.
+    [ "$HEALTH_STATUS" = "ERROR" ] || HEALTH_STATUS="WARNING"
+    HEALTH_ISSUES+=("PostgreSQL service is not running")
+fi
+
+# Check 6: Metrics endpoint
+log "Checking Prometheus metrics endpoint..."
+if METRICS=$(curl -s "http://172.16.3.30:3002/metrics" --max-time 5); then
+    if echo "$METRICS" | grep -q "guruconnect_uptime_seconds"; then
+        log "[OK] Prometheus metrics endpoint working"
+    else
+        log "[WARNING] Prometheus metrics endpoint not returning expected data"
+        HEALTH_STATUS="WARNING"
+        HEALTH_ISSUES+=("Prometheus metrics endpoint not returning expected data")
+    fi
+else
+    log "[ERROR] Prometheus metrics endpoint not reachable"
+    HEALTH_STATUS="ERROR"
+    HEALTH_ISSUES+=("Prometheus metrics endpoint not reachable")
+fi
+
+# Summary
+# Logs the overall status and every recorded issue. When issues exist and a
+# `mail` command is installed, the same list is emailed to ALERT_EMAIL.
+log "========================================="
+log "Health Check Summary"
+log "========================================="
+log "Status: $HEALTH_STATUS"
+
+if [ "${#HEALTH_ISSUES[@]}" -gt 0 ]; then
+    log "Issues found:"
+    for issue in "${HEALTH_ISSUES[@]}"; do
+        log "  - $issue"
+    done
+
+    # Send alert email (if configured)
+    if command -v mail &> /dev/null; then
+        # The pipeline is an `if` condition on purpose: under `set -e` a
+        # failing `mail` would otherwise terminate the script right here,
+        # skipping the status-based exit code at the end of the script.
+        # "sent" is logged only when mail reports success.
+        if {
+            printf '%s\n' "GuruConnect Health Check FAILED"
+            printf '\n'
+            printf 'Status: %s\n' "$HEALTH_STATUS"
+            printf 'Date: %s\n' "$(date)"
+            printf '\n'
+            printf '%s\n' "Issues:"
+            for issue in "${HEALTH_ISSUES[@]}"; do
+                printf '  - %s\n' "$issue"
+            done
+        } | mail -s "GuruConnect Health Check Alert" "$ALERT_EMAIL"; then
+            log "Alert email sent to $ALERT_EMAIL"
+        else
+            log "[WARNING] Failed to send alert email to $ALERT_EMAIL"
+        fi
+    else
+        # Make the skipped alert visible in the log instead of silent.
+        log "[WARNING] mail command not found; alert email not sent"
+    fi
+else
+    log "All checks passed!"
+fi
+
+# Exit with appropriate code
+if [ "$HEALTH_STATUS" = "ERROR" ]; then
+    exit 2
+elif [ "$HEALTH_STATUS" = "WARNING" ]; then
+    exit 1
+else
+    exit 0
+fi