Phase 1 Week 2: Infrastructure & Monitoring

Added comprehensive production infrastructure:

Systemd Service:
- guruconnect.service with auto-restart, resource limits, security hardening
- setup-systemd.sh installation script

Prometheus Metrics:
- Added prometheus-client dependency
- Created metrics module tracking:
  - HTTP requests (count, latency)
  - Sessions (created, closed, active)
  - Connections (WebSocket, by type)
  - Errors (by type)
  - Database operations (count, latency)
  - Server uptime
- Added /metrics endpoint
- Background task for uptime updates

Monitoring Configuration:
- prometheus.yml with scrape configs for GuruConnect and node_exporter
- alerts.yml with alerting rules
- grafana-dashboard.json with 10 panels
- setup-monitoring.sh installation script

PostgreSQL Backups:
- backup-postgres.sh with gzip compression
- restore-postgres.sh with safety checks
- guruconnect-backup.service and .timer for automated daily backups
- Retention policy: 30 daily, 4 weekly, 6 monthly

Health Monitoring:
- health-monitor.sh checking HTTP, disk, memory, database, metrics
- guruconnect.logrotate for log rotation
- Email alerts on failures

Updated CHECKLIST_STATE.json to reflect Week 1 completion (77%) and Week 2 start.
Created PHASE1_WEEK2_INFRASTRUCTURE.md with comprehensive planning.

Ready for deployment and testing on RMM server.
This commit is contained in:
2026-01-17 20:24:32 -07:00
parent 2481b54a65
commit 8521c95755
17 changed files with 1877 additions and 25 deletions

View File

@@ -0,0 +1,68 @@
# Prometheus Alert Rules for GuruConnect
#
# This file defines alerting rules for monitoring GuruConnect health and performance.
# Copy to /etc/prometheus/alerts.yml and list that path under `rule_files:` in prometheus.yml
groups:
  # Single rule group; its rules are evaluated every 30 seconds.
  - name: guruconnect_alerts
    interval: 30s
    rules:
      # Availability: the "guruconnect" scrape job has reported up == 0
      # continuously for one minute.
      - alert: GuruConnectDown
        expr: up{job="guruconnect"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'GuruConnect server is down'
          description: 'GuruConnect server on {{ $labels.instance }} has been down for more than 1 minute'

      # Errors: per-second rate of guruconnect_errors_total (5-minute window)
      # stays above 10 for five minutes.
      - alert: HighErrorRate
        expr: rate(guruconnect_errors_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High error rate detected'
          description: 'Error rate is {{ $value | humanize }} errors/second over the last 5 minutes'

      # Load: the active-session gauge stays above 100 for five minutes.
      - alert: TooManyActiveSessions
        expr: guruconnect_active_sessions > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Too many active sessions'
          description: 'There are {{ $value }} active sessions, exceeding threshold of 100'

      # Latency: the 95th-percentile request duration, estimated from the
      # histogram buckets over a 5-minute window, stays above 1 second.
      - alert: HighRequestLatency
        expr: histogram_quantile(0.95, rate(guruconnect_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High request latency'
          description: '95th percentile request latency is {{ $value | humanize }}s'

      # Database: operations labelled status="error" occur at more than
      # 1 per second (5-minute window) for five minutes.
      - alert: DatabaseOperationsFailure
        expr: rate(guruconnect_db_operations_total{status="error"}[5m]) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Database operations failing'
          description: 'Database error rate is {{ $value | humanize }} errors/second'

      # Restart notice: reported uptime has been below 300 seconds for at
      # least one minute.
      - alert: ServerRestarted
        expr: guruconnect_uptime_seconds < 300
        for: 1m
        labels:
          severity: info
        annotations:
          summary: 'Server recently restarted'
          description: 'Server uptime is only {{ $value | humanize }}s, indicating a recent restart'

View File

@@ -0,0 +1,228 @@
{
"dashboard": {
"title": "GuruConnect Monitoring",
"tags": ["guruconnect", "monitoring"],
"timezone": "browser",
"schemaVersion": 16,
"version": 1,
"refresh": "10s",
"panels": [
{
"id": 1,
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"type": "graph",
"title": "Active Sessions",
"targets": [
{
"expr": "guruconnect_active_sessions",
"legendFormat": "Active Sessions",
"refId": "A"
}
],
"yaxes": [
{"label": "Sessions", "show": true},
{"show": false}
],
"lines": true,
"fill": 1,
"linewidth": 2,
"tooltip": {"shared": true}
},
{
"id": 2,
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"type": "graph",
"title": "Requests per Second",
"targets": [
{
"expr": "rate(guruconnect_requests_total[1m])",
"legendFormat": "{{method}} {{path}}",
"refId": "A"
}
],
"yaxes": [
{"label": "Requests/sec", "show": true},
{"show": false}
],
"lines": true,
"fill": 1,
"linewidth": 2,
"tooltip": {"shared": true}
},
{
"id": 3,
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"type": "graph",
"title": "Error Rate",
"targets": [
{
"expr": "rate(guruconnect_errors_total[1m])",
"legendFormat": "{{error_type}}",
"refId": "A"
}
],
"yaxes": [
{"label": "Errors/sec", "show": true},
{"show": false}
],
"lines": true,
"fill": 1,
"linewidth": 2,
"tooltip": {"shared": true},
"alert": {
"conditions": [
{
"evaluator": {"params": [10], "type": "gt"},
"operator": {"type": "and"},
"query": {"params": ["A", "1m", "now"]},
"reducer": {"params": [], "type": "avg"},
"type": "query"
}
],
"executionErrorState": "alerting",
"frequency": "60s",
"handler": 1,
"name": "High Error Rate",
"noDataState": "no_data",
"notifications": []
}
},
{
"id": 4,
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
"type": "graph",
"title": "Request Latency (p50, p95, p99)",
"targets": [
{
"expr": "histogram_quantile(0.50, rate(guruconnect_request_duration_seconds_bucket[5m]))",
"legendFormat": "p50",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, rate(guruconnect_request_duration_seconds_bucket[5m]))",
"legendFormat": "p95",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, rate(guruconnect_request_duration_seconds_bucket[5m]))",
"legendFormat": "p99",
"refId": "C"
}
],
"yaxes": [
{"label": "Latency (seconds)", "show": true, "format": "s"},
{"show": false}
],
"lines": true,
"fill": 0,
"linewidth": 2,
"tooltip": {"shared": true}
},
{
"id": 5,
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
"type": "graph",
"title": "Active Connections by Type",
"targets": [
{
"expr": "guruconnect_active_connections",
"legendFormat": "{{conn_type}}",
"refId": "A"
}
],
"yaxes": [
{"label": "Connections", "show": true},
{"show": false}
],
"lines": true,
"fill": 1,
"linewidth": 2,
"stack": true,
"tooltip": {"shared": true}
},
{
"id": 6,
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
"type": "graph",
"title": "Database Query Duration",
"targets": [
{
"expr": "histogram_quantile(0.95, rate(guruconnect_db_query_duration_seconds_bucket[5m]))",
"legendFormat": "{{operation}} p95",
"refId": "A"
}
],
"yaxes": [
{"label": "Duration (seconds)", "show": true, "format": "s"},
{"show": false}
],
"lines": true,
"fill": 0,
"linewidth": 2,
"tooltip": {"shared": true}
},
{
"id": 7,
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 24},
"type": "singlestat",
"title": "Server Uptime",
"targets": [
{
"expr": "guruconnect_uptime_seconds",
"refId": "A"
}
],
"format": "s",
"valueName": "current",
"sparkline": {"show": true}
},
{
"id": 8,
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 24},
"type": "singlestat",
"title": "Total Sessions Created",
"targets": [
{
"expr": "guruconnect_sessions_total{status=\"created\"}",
"refId": "A"
}
],
"format": "short",
"valueName": "current",
"sparkline": {"show": true}
},
{
"id": 9,
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 24},
"type": "singlestat",
"title": "Total Requests",
"targets": [
{
"expr": "sum(guruconnect_requests_total)",
"refId": "A"
}
],
"format": "short",
"valueName": "current",
"sparkline": {"show": true}
},
{
"id": 10,
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 24},
"type": "singlestat",
"title": "Total Errors",
"targets": [
{
"expr": "sum(guruconnect_errors_total)",
"refId": "A"
}
],
"format": "short",
"valueName": "current",
"sparkline": {"show": true},
"thresholds": "10,100",
"colors": ["#299c46", "#e0b400", "#d44a3a"]
}
]
}
}

View File

@@ -0,0 +1,45 @@
# Prometheus configuration for GuruConnect
#
# Install Prometheus:
# sudo apt-get install prometheus
#
# Copy this file to:
# sudo cp prometheus.yml /etc/prometheus/prometheus.yml
#
# Restart Prometheus:
# sudo systemctl restart prometheus
global:
  scrape_interval: 15s      # Scrape metrics every 15 seconds
  evaluation_interval: 15s  # Evaluate rules every 15 seconds
  external_labels:
    cluster: 'guruconnect-production'
    environment: 'production'

# Alert rules
# setup-monitoring.sh installs alerts.yml to this path. Without this entry
# Prometheus never loads the rules, so none of the GuruConnect alerts would
# ever be evaluated.
rule_files:
  - '/etc/prometheus/alerts.yml'

# Scrape configurations
scrape_configs:
  # GuruConnect server metrics
  - job_name: 'guruconnect'
    static_configs:
      - targets: ['172.16.3.30:3002']
        labels:
          service: 'guruconnect-server'
          instance: 'rmm-server'

  # Node Exporter (system metrics)
  # Install: sudo apt-get install prometheus-node-exporter
  - job_name: 'node_exporter'
    static_configs:
      - targets: ['172.16.3.30:9100']
        labels:
          instance: 'rmm-server'

# Alertmanager configuration (optional)
# Until this is enabled, firing alerts are only visible in the Prometheus UI;
# no notifications are sent.
# alerting:
#   alertmanagers:
#     - static_configs:
#         - targets: ['localhost:9093']

View File

@@ -0,0 +1,102 @@
#!/bin/bash
# GuruConnect Monitoring Setup Script
# Installs and configures Prometheus, node_exporter and Grafana using apt-get
# and systemd. Must be run as root.
#
# prometheus.yml (required) and alerts.yml (optional) are read from the
# current directory, or from the directory containing this script if the
# current directory has no prometheus.yml.
#
# Optional environment overrides:
#   GRAFANA_ADMIN_USER      Grafana API user                     (default: admin)
#   GRAFANA_ADMIN_PASSWORD  Grafana API password                 (default: admin)
#   GRAFANA_WAIT_SECONDS    Max seconds to wait for Grafana start (default: 60)

# -e: abort on the first failing command; -u: unset variables are errors;
# pipefail: a failed key download in "wget | gpg" fails the whole pipeline.
set -euo pipefail

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

GRAFANA_URL="http://localhost:3000"
GRAFANA_ADMIN_USER="${GRAFANA_ADMIN_USER:-admin}"
GRAFANA_ADMIN_PASSWORD="${GRAFANA_ADMIN_PASSWORD:-admin}"
GRAFANA_WAIT_SECONDS="${GRAFANA_WAIT_SECONDS:-60}"

echo "========================================="
echo "GuruConnect Monitoring Setup"
echo "========================================="

# Check if running as root
if [ "$EUID" -ne 0 ]; then
    echo -e "${RED}ERROR: This script must be run as root (sudo)${NC}"
    exit 1
fi

# Locate the configuration files before installing anything, so a wrong
# working directory fails fast instead of after the package installs.
CONFIG_DIR="$PWD"
if [ ! -f "${CONFIG_DIR}/prometheus.yml" ]; then
    CONFIG_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
fi
if [ ! -f "${CONFIG_DIR}/prometheus.yml" ]; then
    echo -e "${RED}ERROR: prometheus.yml not found in ${PWD} or next to this script${NC}"
    exit 1
fi

# Update package list
echo "Updating package list..."
apt-get update

# Install Prometheus
echo ""
echo "Installing Prometheus..."
apt-get install -y prometheus prometheus-node-exporter

# Validate the configuration with promtool (when available) BEFORE it
# replaces the live files, so a broken file never reaches /etc/prometheus.
# alerts.yml is installed first so a prometheus.yml that references
# /etc/prometheus/alerts.yml validates against the new rules.
echo "Copying Prometheus configuration..."
if [ -f "${CONFIG_DIR}/alerts.yml" ]; then
    if command -v promtool >/dev/null 2>&1; then
        promtool check rules "${CONFIG_DIR}/alerts.yml" || {
            echo -e "${RED}ERROR: alerts.yml failed validation; nothing was installed${NC}"
            exit 1
        }
    fi
    # install = copy + set owner and mode in one step
    install -o prometheus -g prometheus -m 0644 \
        "${CONFIG_DIR}/alerts.yml" /etc/prometheus/alerts.yml
fi
if command -v promtool >/dev/null 2>&1; then
    promtool check config "${CONFIG_DIR}/prometheus.yml" || {
        echo -e "${RED}ERROR: prometheus.yml failed validation; not installed${NC}"
        exit 1
    }
fi
install -o prometheus -g prometheus -m 0644 \
    "${CONFIG_DIR}/prometheus.yml" /etc/prometheus/prometheus.yml

# Restart Prometheus
echo "Restarting Prometheus..."
systemctl restart prometheus
systemctl enable prometheus
systemctl restart prometheus-node-exporter
systemctl enable prometheus-node-exporter

# Install Grafana
# The signing key is stored in a dedicated keyring and installed BEFORE the
# repository is registered: apt refuses to update from a repository whose
# key it does not know, and apt-key is deprecated.
echo ""
echo "Installing Grafana..."
apt-get install -y apt-transport-https wget gnupg curl
install -d -m 0755 /etc/apt/keyrings
wget -q -O - https://apt.grafana.com/gpg.key | gpg --dearmor > /etc/apt/keyrings/grafana.gpg
# Overwrite (not append) so re-running the script does not duplicate the entry.
echo "deb [signed-by=/etc/apt/keyrings/grafana.gpg] https://apt.grafana.com stable main" \
    > /etc/apt/sources.list.d/grafana.list
apt-get update
apt-get install -y grafana

# Start Grafana
echo "Starting Grafana..."
systemctl start grafana-server
systemctl enable grafana-server

# Wait for Grafana to start: poll its health endpoint instead of sleeping a
# fixed time, since first start can take longer than a few seconds.
echo "Waiting for Grafana to become ready..."
grafana_ready=false
for ((attempt = 0; attempt < GRAFANA_WAIT_SECONDS; attempt++)); do
    if curl -fs -o /dev/null "${GRAFANA_URL}/api/health"; then
        grafana_ready=true
        break
    fi
    sleep 1
done

# Configure Grafana data source (Prometheus)
# Best effort: a failure here (e.g. data source already exists, or the admin
# password was changed) is reported as a warning and does not abort setup.
# NOTE: defaults to Grafana's factory admin/admin credentials; change the
# password after first login.
echo ""
echo "Configuring Grafana data source..."
if [ "$grafana_ready" = true ]; then
    if curl -fsS -X POST -H "Content-Type: application/json" \
        -u "${GRAFANA_ADMIN_USER}:${GRAFANA_ADMIN_PASSWORD}" \
        -d '{
        "name":"Prometheus",
        "type":"prometheus",
        "url":"http://localhost:9090",
        "access":"proxy",
        "isDefault":true
        }' \
        "${GRAFANA_URL}/api/datasources"; then
        echo ""
        echo -e "${GREEN}Grafana data source configured${NC}"
    else
        echo -e "${YELLOW}WARNING: Could not create the Prometheus data source (it may already exist, or the credentials are wrong). Add it manually in Grafana if needed.${NC}"
    fi
else
    echo -e "${YELLOW}WARNING: Grafana did not become ready within ${GRAFANA_WAIT_SECONDS}s; skipping data source setup. Add the Prometheus data source manually.${NC}"
fi

echo ""
echo "========================================="
echo "Monitoring Setup Complete!"
echo "========================================="
echo ""
echo "Services:"
echo " Prometheus: http://172.16.3.30:9090"
echo " Grafana: http://172.16.3.30:3000 (default login: admin/admin)"
echo " Node Exporter: http://172.16.3.30:9100/metrics"
echo ""
echo "Next steps:"
echo "1. Access Grafana at http://172.16.3.30:3000"
echo "2. Login with default credentials (admin/admin)"
echo "3. Change the default password"
echo "4. Import the dashboard from grafana-dashboard.json"
echo "5. Configure alerting (optional)"
echo ""
echo "To import the dashboard:"
echo " Grafana > Dashboards > Import > Upload JSON file"
echo " Select: infrastructure/grafana-dashboard.json"
echo ""