Phase 1 Week 2: Infrastructure & Monitoring
Added comprehensive production infrastructure:

Systemd Service:
- guruconnect.service with auto-restart, resource limits, security hardening
- setup-systemd.sh installation script

Prometheus Metrics:
- Added prometheus-client dependency
- Created metrics module tracking:
  - HTTP requests (count, latency)
  - Sessions (created, closed, active)
  - Connections (WebSocket, by type)
  - Errors (by type)
  - Database operations (count, latency)
  - Server uptime
- Added /metrics endpoint
- Background task for uptime updates

Monitoring Configuration:
- prometheus.yml with scrape configs for GuruConnect and node_exporter
- alerts.yml with alerting rules
- grafana-dashboard.json with 10 panels
- setup-monitoring.sh installation script

PostgreSQL Backups:
- backup-postgres.sh with gzip compression
- restore-postgres.sh with safety checks
- guruconnect-backup.service and .timer for automated daily backups
- Retention policy: 30 daily, 4 weekly, 6 monthly

Health Monitoring:
- health-monitor.sh checking HTTP, disk, memory, database, metrics
- guruconnect.logrotate for log rotation
- Email alerts on failures

Updated CHECKLIST_STATE.json to reflect Week 1 completion (77%) and Week 2 start.
Created PHASE1_WEEK2_INFRASTRUCTURE.md with comprehensive planning.

Ready for deployment and testing on RMM server.
This commit is contained in:
68
projects/msp-tools/guru-connect/infrastructure/alerts.yml
Normal file
68
projects/msp-tools/guru-connect/infrastructure/alerts.yml
Normal file
@@ -0,0 +1,68 @@
|
||||
# Prometheus Alert Rules for GuruConnect
#
# This file defines alerting rules for monitoring GuruConnect health and performance.
# Copy to /etc/prometheus/alerts.yml and reference in prometheus.yml
#
# Every rule follows the same shape:
#   expr        - PromQL condition; each series it returns is a potential alert
#   for         - how long the condition must hold before the alert fires
#   labels      - extra labels attached to the alert (severity is set on every rule)
#   annotations - human-readable text; {{ $value }} is the expression result and
#                 {{ $labels.<name> }} is a label of the series that matched

groups:
  - name: guruconnect_alerts
    # Evaluation interval for the rules in this group.
    interval: 30s
    rules:
      # GuruConnect is down: the "guruconnect" scrape job reports up == 0
      # continuously for 1 minute.
      - alert: GuruConnectDown
        expr: up{job="guruconnect"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "GuruConnect server is down"
          description: "GuruConnect server on {{ $labels.instance }} has been down for more than 1 minute"

      # High error rate: per-second rate of guruconnect_errors_total over a
      # 5-minute window stays above 10 for 5 minutes.
      # NOTE(review): the threshold is applied to each series separately; if the
      # counter carries extra labels (e.g. an error type), each label value is
      # compared against 10/s on its own - confirm that is the intent.
      - alert: HighErrorRate
        expr: rate(guruconnect_errors_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanize }} errors/second over the last 5 minutes"

      # Too many active sessions: the active-session gauge stays above 100
      # for 5 minutes.
      - alert: TooManyActiveSessions
        expr: guruconnect_active_sessions > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Too many active sessions"
          description: "There are {{ $value }} active sessions, exceeding threshold of 100"

      # High request latency: 95th percentile of request duration over a
      # 5-minute window stays above 1 second for 5 minutes.
      # Bucket rates are summed per target before taking the quantile so the
      # result is the overall p95 for the instance rather than one quantile per
      # label combination of the histogram. "le" must be kept in the grouping:
      # histogram_quantile needs it to identify the buckets.
      - alert: HighRequestLatency
        expr: |
          histogram_quantile(
            0.95,
            sum by (job, instance, le) (
              rate(guruconnect_request_duration_seconds_bucket[5m])
            )
          ) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High request latency"
          description: "95th percentile request latency is {{ $value | humanize }}s"

      # Database operations failing: per-second rate of DB operations with
      # status="error" over a 5-minute window stays above 1 for 5 minutes.
      # NOTE(review): relies on the counter exposing a status="error" label
      # value, and is evaluated per series - verify against the metrics module.
      - alert: DatabaseOperationsFailure
        expr: rate(guruconnect_db_operations_total{status="error"}[5m]) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Database operations failing"
          description: "Database error rate is {{ $value | humanize }} errors/second"

      # Server uptime low (recent restart): reported uptime is below 300 seconds
      # and has been for 1 minute. Informational only.
      - alert: ServerRestarted
        expr: guruconnect_uptime_seconds < 300
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "Server recently restarted"
          description: "Server uptime is only {{ $value | humanize }}s, indicating a recent restart"
|
||||
Reference in New Issue
Block a user