# Prometheus Alert Rules for GuruConnect
#
# This file defines alerting rules for monitoring GuruConnect health and
# performance.
# Copy to /etc/prometheus/alerts.yml and reference in prometheus.yml

groups:
  - name: guruconnect_alerts
    # How often the rules in this group are evaluated.
    interval: 30s
    rules:
      # GuruConnect is down: the `up` series for the job reports 0.
      - alert: GuruConnectDown
        expr: up{job="guruconnect"} == 0
        # The condition must hold continuously for this long before firing.
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "GuruConnect server is down"
          description: "GuruConnect server on {{ $labels.instance }} has been down for more than 1 minute"

      # High error rate: per-second rate of the error counter, averaged over
      # a 5-minute window, is above 10.
      - alert: HighErrorRate
        expr: rate(guruconnect_errors_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanize }} errors/second over the last 5 minutes"

      # Too many active sessions: the session gauge is above 100.
      - alert: TooManyActiveSessions
        expr: guruconnect_active_sessions > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Too many active sessions"
          description: "There are {{ $value }} active sessions, exceeding threshold of 100"

      # High request latency: 95th percentile, estimated from the request
      # duration histogram buckets over 5 minutes, is above 1 second.
      - alert: HighRequestLatency
        expr: histogram_quantile(0.95, rate(guruconnect_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High request latency"
          description: "95th percentile request latency is {{ $value | humanize }}s"

      # Database operations failing: more than 1 operation per second with
      # status="error" over a 5-minute window.
      - alert: DatabaseOperationsFailure
        expr: rate(guruconnect_db_operations_total{status="error"}[5m]) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Database operations failing"
          description: "Database error rate is {{ $value | humanize }} errors/second"

      # Server uptime low (recent restart): uptime is under 300 seconds.
      - alert: ServerRestarted
        expr: guruconnect_uptime_seconds < 300
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "Server recently restarted"
          description: "Server uptime is only {{ $value | humanize }}s, indicating a recent restart"