Phase 1 Week 2: Infrastructure & Monitoring
Added comprehensive production infrastructure:

Systemd Service:
- guruconnect.service with auto-restart, resource limits, security hardening
- setup-systemd.sh installation script

Prometheus Metrics:
- Added prometheus-client dependency
- Created metrics module tracking:
  - HTTP requests (count, latency)
  - Sessions (created, closed, active)
  - Connections (WebSocket, by type)
  - Errors (by type)
  - Database operations (count, latency)
  - Server uptime
- Added /metrics endpoint
- Background task for uptime updates

Monitoring Configuration:
- prometheus.yml with scrape configs for GuruConnect and node_exporter
- alerts.yml with alerting rules
- grafana-dashboard.json with 10 panels
- setup-monitoring.sh installation script

PostgreSQL Backups:
- backup-postgres.sh with gzip compression
- restore-postgres.sh with safety checks
- guruconnect-backup.service and .timer for automated daily backups
- Retention policy: 30 daily, 4 weekly, 6 monthly

Health Monitoring:
- health-monitor.sh checking HTTP, disk, memory, database, metrics
- guruconnect.logrotate for log rotation
- Email alerts on failures

Updated CHECKLIST_STATE.json to reflect Week 1 completion (77%) and Week 2 start.
Created PHASE1_WEEK2_INFRASTRUCTURE.md with comprehensive planning.

Ready for deployment and testing on RMM server.
This commit is contained in:
68
projects/msp-tools/guru-connect/infrastructure/alerts.yml
Normal file
68
projects/msp-tools/guru-connect/infrastructure/alerts.yml
Normal file
@@ -0,0 +1,68 @@
|
||||
# Prometheus Alert Rules for GuruConnect
|
||||
#
|
||||
# This file defines alerting rules for monitoring GuruConnect health and performance.
|
||||
# Copy to /etc/prometheus/alerts.yml and reference in prometheus.yml
|
||||
|
||||
groups:
  # Single rule group holding every GuruConnect alert; evaluated every 30s.
  - name: guruconnect_alerts
    interval: 30s
    rules:
      # The guruconnect scrape job has reported up == 0 for a full minute.
      - alert: GuruConnectDown
        expr: 'up{job="guruconnect"} == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'GuruConnect server is down'
          description: 'GuruConnect server on {{ $labels.instance }} has been down for more than 1 minute'

      # Per-second error rate (5m window) above 10, sustained for 5 minutes.
      # NOTE(review): the comparison is applied to each series separately; if
      # guruconnect_errors_total carries labels, every label set is checked
      # against the threshold on its own - confirm that is intended.
      - alert: HighErrorRate
        expr: 'rate(guruconnect_errors_total[5m]) > 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High error rate detected'
          description: 'Error rate is {{ $value | humanize }} errors/second over the last 5 minutes'

      # More than 100 concurrently active sessions for 5 minutes.
      - alert: TooManyActiveSessions
        expr: 'guruconnect_active_sessions > 100'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Too many active sessions'
          description: 'There are {{ $value }} active sessions, exceeding threshold of 100'

      # 95th percentile of request duration above 1 second for 5 minutes.
      - alert: HighRequestLatency
        expr: 'histogram_quantile(0.95, rate(guruconnect_request_duration_seconds_bucket[5m])) > 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High request latency'
          description: '95th percentile request latency is {{ $value | humanize }}s'

      # Database operations with status="error" exceed 1 per second for 5 minutes.
      - alert: DatabaseOperationsFailure
        expr: 'rate(guruconnect_db_operations_total{status="error"}[5m]) > 1'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Database operations failing'
          description: 'Database error rate is {{ $value | humanize }} errors/second'

      # Reported uptime below 300 seconds, i.e. the server restarted recently.
      - alert: ServerRestarted
        expr: 'guruconnect_uptime_seconds < 300'
        for: 1m
        labels:
          severity: info
        annotations:
          summary: 'Server recently restarted'
          description: 'Server uptime is only {{ $value | humanize }}s, indicating a recent restart'
|
||||
@@ -0,0 +1,228 @@
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "GuruConnect Monitoring",
|
||||
"tags": ["guruconnect", "monitoring"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 16,
|
||||
"version": 1,
|
||||
"refresh": "10s",
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
|
||||
"type": "graph",
|
||||
"title": "Active Sessions",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "guruconnect_active_sessions",
|
||||
"legendFormat": "Active Sessions",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"yaxes": [
|
||||
{"label": "Sessions", "show": true},
|
||||
{"show": false}
|
||||
],
|
||||
"lines": true,
|
||||
"fill": 1,
|
||||
"linewidth": 2,
|
||||
"tooltip": {"shared": true}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
|
||||
"type": "graph",
|
||||
"title": "Requests per Second",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(guruconnect_requests_total[1m])",
|
||||
"legendFormat": "{{method}} {{path}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"yaxes": [
|
||||
{"label": "Requests/sec", "show": true},
|
||||
{"show": false}
|
||||
],
|
||||
"lines": true,
|
||||
"fill": 1,
|
||||
"linewidth": 2,
|
||||
"tooltip": {"shared": true}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
||||
"type": "graph",
|
||||
"title": "Error Rate",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(guruconnect_errors_total[1m])",
|
||||
"legendFormat": "{{error_type}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"yaxes": [
|
||||
{"label": "Errors/sec", "show": true},
|
||||
{"show": false}
|
||||
],
|
||||
"lines": true,
|
||||
"fill": 1,
|
||||
"linewidth": 2,
|
||||
"tooltip": {"shared": true},
|
||||
"alert": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {"params": [10], "type": "gt"},
|
||||
"operator": {"type": "and"},
|
||||
"query": {"params": ["A", "1m", "now"]},
|
||||
"reducer": {"params": [], "type": "avg"},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"executionErrorState": "alerting",
|
||||
"frequency": "60s",
|
||||
"handler": 1,
|
||||
"name": "High Error Rate",
|
||||
"noDataState": "no_data",
|
||||
"notifications": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
||||
"type": "graph",
|
||||
"title": "Request Latency (p50, p95, p99)",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.50, rate(guruconnect_request_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(guruconnect_request_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "histogram_quantile(0.99, rate(guruconnect_request_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"yaxes": [
|
||||
{"label": "Latency (seconds)", "show": true, "format": "s"},
|
||||
{"show": false}
|
||||
],
|
||||
"lines": true,
|
||||
"fill": 0,
|
||||
"linewidth": 2,
|
||||
"tooltip": {"shared": true}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
|
||||
"type": "graph",
|
||||
"title": "Active Connections by Type",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "guruconnect_active_connections",
|
||||
"legendFormat": "{{conn_type}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"yaxes": [
|
||||
{"label": "Connections", "show": true},
|
||||
{"show": false}
|
||||
],
|
||||
"lines": true,
|
||||
"fill": 1,
|
||||
"linewidth": 2,
|
||||
"stack": true,
|
||||
"tooltip": {"shared": true}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
|
||||
"type": "graph",
|
||||
"title": "Database Query Duration",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "histogram_quantile(0.95, rate(guruconnect_db_query_duration_seconds_bucket[5m]))",
|
||||
"legendFormat": "{{operation}} p95",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"yaxes": [
|
||||
{"label": "Duration (seconds)", "show": true, "format": "s"},
|
||||
{"show": false}
|
||||
],
|
||||
"lines": true,
|
||||
"fill": 0,
|
||||
"linewidth": 2,
|
||||
"tooltip": {"shared": true}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"gridPos": {"h": 4, "w": 6, "x": 0, "y": 24},
|
||||
"type": "singlestat",
|
||||
"title": "Server Uptime",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "guruconnect_uptime_seconds",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"format": "s",
|
||||
"valueName": "current",
|
||||
"sparkline": {"show": true}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"gridPos": {"h": 4, "w": 6, "x": 6, "y": 24},
|
||||
"type": "singlestat",
|
||||
"title": "Total Sessions Created",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "guruconnect_sessions_total{status=\"created\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"format": "short",
|
||||
"valueName": "current",
|
||||
"sparkline": {"show": true}
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"gridPos": {"h": 4, "w": 6, "x": 12, "y": 24},
|
||||
"type": "singlestat",
|
||||
"title": "Total Requests",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(guruconnect_requests_total)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"format": "short",
|
||||
"valueName": "current",
|
||||
"sparkline": {"show": true}
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"gridPos": {"h": 4, "w": 6, "x": 18, "y": 24},
|
||||
"type": "singlestat",
|
||||
"title": "Total Errors",
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(guruconnect_errors_total)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"format": "short",
|
||||
"valueName": "current",
|
||||
"sparkline": {"show": true},
|
||||
"thresholds": "10,100",
|
||||
"colors": ["#299c46", "#e0b400", "#d44a3a"]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,45 @@
|
||||
# Prometheus configuration for GuruConnect
|
||||
#
|
||||
# Install Prometheus:
|
||||
# sudo apt-get install prometheus
|
||||
#
|
||||
# Copy this file to:
|
||||
# sudo cp prometheus.yml /etc/prometheus/prometheus.yml
|
||||
#
|
||||
# Restart Prometheus:
|
||||
# sudo systemctl restart prometheus
|
||||
|
||||
global:
  # Scrape metrics from every target every 15 seconds.
  scrape_interval: 15s
  # Evaluate rules every 15 seconds.
  evaluation_interval: 15s
  # Labels listed under external_labels for this Prometheus server.
  external_labels:
    cluster: guruconnect-production
    environment: production
|
||||
|
||||
# Scrape configurations
|
||||
scrape_configs:
  # Metrics exposed by the GuruConnect server itself.
  - job_name: guruconnect
    static_configs:
      - targets:
          - '172.16.3.30:3002'
        labels:
          service: guruconnect-server
          instance: rmm-server

  # System metrics from Node Exporter.
  # Install: sudo apt-get install prometheus-node-exporter
  - job_name: node_exporter
    static_configs:
      - targets:
          - '172.16.3.30:9100'
        labels:
          instance: rmm-server
|
||||
|
||||
# Alert rules
# setup-monitoring.sh copies alerts.yml to /etc/prometheus/alerts.yml; without
# this entry those rules are installed but never loaded by Prometheus.
rule_files:
  - '/etc/prometheus/alerts.yml'
|
||||
|
||||
# Alertmanager configuration (optional)
|
||||
# alerting:
|
||||
# alertmanagers:
|
||||
# - static_configs:
|
||||
# - targets: ['localhost:9093']
|
||||
@@ -0,0 +1,102 @@
|
||||
#!/bin/bash
#
# GuruConnect Monitoring Setup Script
# Installs and configures Prometheus and Grafana.

# Abort as soon as any command fails.
set -e

# ANSI colour escape sequences used for terminal output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Opening banner.
banner_rule="========================================="
printf '%s\n' "$banner_rule" "GuruConnect Monitoring Setup" "$banner_rule"

# Refuse to continue unless the effective user is root.
if (( EUID != 0 )); then
    printf '%b\n' "${RED}ERROR: This script must be run as root (sudo)${NC}"
    exit 1
fi
|
||||
|
||||
# Directory containing this script. The bundled prometheus.yml / alerts.yml
# are looked up here so the script works from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Update package list
echo "Updating package list..."
apt-get update

# Install Prometheus and the node exporter
echo ""
echo "Installing Prometheus..."
apt-get install -y prometheus prometheus-node-exporter

# Copy Prometheus configuration (alerts.yml is optional)
echo "Copying Prometheus configuration..."
cp "${SCRIPT_DIR}/prometheus.yml" /etc/prometheus/prometheus.yml
if [ -f "${SCRIPT_DIR}/alerts.yml" ]; then
    cp "${SCRIPT_DIR}/alerts.yml" /etc/prometheus/alerts.yml
fi

# Set permissions; an alerts.yml already present on the host is covered too
chown prometheus:prometheus /etc/prometheus/prometheus.yml
if [ -f "/etc/prometheus/alerts.yml" ]; then
    chown prometheus:prometheus /etc/prometheus/alerts.yml
fi

# Restart the services so the new configuration is picked up, and enable
# them at boot
echo "Restarting Prometheus..."
systemctl restart prometheus
systemctl enable prometheus
systemctl restart prometheus-node-exporter
systemctl enable prometheus-node-exporter
|
||||
|
||||
# Install Grafana
#
# The repository signing key is stored in a dedicated keyring and referenced
# with signed-by. It is installed BEFORE the source entry is written, so the
# following "apt-get update" can verify the repository. This replaces the
# deprecated apt-key workflow and the legacy packages.grafana.com host.
echo ""
echo "Installing Grafana..."
GRAFANA_KEYRING=/etc/apt/keyrings/grafana.gpg
GRAFANA_SOURCE_LIST=/etc/apt/sources.list.d/grafana.list

apt-get install -y apt-transport-https wget gnupg
install -d -m 0755 /etc/apt/keyrings

# Download to a temp file first: a failed download then aborts the script
# (set -e) instead of being hidden inside a pipeline.
grafana_key_tmp="$(mktemp)"
wget -q -O "$grafana_key_tmp" https://apt.grafana.com/gpg.key
gpg --yes -o "$GRAFANA_KEYRING" --dearmor "$grafana_key_tmp"
rm -f "$grafana_key_tmp"
chmod 0644 "$GRAFANA_KEYRING"

# Overwrite (not append) so re-running the script does not duplicate the entry.
echo "deb [signed-by=${GRAFANA_KEYRING}] https://apt.grafana.com stable main" > "$GRAFANA_SOURCE_LIST"

apt-get update
apt-get install -y grafana

# Start Grafana now and enable it at boot
echo "Starting Grafana..."
systemctl start grafana-server
systemctl enable grafana-server
|
||||
|
||||
# Wait for Grafana to start
# A fixed delay may be too short, so poll the health endpoint once per second
# for up to 60 seconds and continue as soon as it answers.
GRAFANA_URL="http://localhost:3000"
for _ in $(seq 1 60); do
    if curl -fs -o /dev/null "${GRAFANA_URL}/api/health"; then
        break
    fi
    sleep 1
done

# Configure Grafana data source (Prometheus)
# Best effort: a failure here (for example when the data source already
# exists or the admin password was changed) must not abort the script, but it
# is reported instead of being silently ignored.
echo ""
echo "Configuring Grafana data source..."
if ! curl -fsS -X POST \
    -u admin:admin \
    -H "Content-Type: application/json" \
    -d '{
        "name":"Prometheus",
        "type":"prometheus",
        "url":"http://localhost:9090",
        "access":"proxy",
        "isDefault":true
    }' \
    "${GRAFANA_URL}/api/datasources"; then
    echo ""
    echo -e "${YELLOW}WARNING: Could not create the Prometheus data source automatically; add it manually in Grafana if it does not exist yet${NC}"
fi
|
||||
|
||||
# Final summary: service URLs and the manual follow-up steps.
# The here-document delimiter is quoted, so the text is printed verbatim.
cat <<'SUMMARY'

=========================================
Monitoring Setup Complete!
=========================================

Services:
 Prometheus: http://172.16.3.30:9090
 Grafana: http://172.16.3.30:3000 (default login: admin/admin)
 Node Exporter: http://172.16.3.30:9100/metrics

Next steps:
1. Access Grafana at http://172.16.3.30:3000
2. Login with default credentials (admin/admin)
3. Change the default password
4. Import the dashboard from grafana-dashboard.json
5. Configure alerting (optional)

To import the dashboard:
 Grafana > Dashboards > Import > Upload JSON file
 Select: infrastructure/grafana-dashboard.json

SUMMARY
|
||||
Reference in New Issue
Block a user