# Prometheus Alert Rules for GuruConnect
#
# This file defines alerting rules for monitoring GuruConnect health and performance.
# Copy it to /etc/prometheus/alerts.yml and list that path under `rule_files` in prometheus.yml.

# Alerting rule groups.
groups:
  # Group holding the GuruConnect health and performance alerts.
  - name: guruconnect_alerts
    # Rules in this group are evaluated every 30 seconds.
    interval: 30s
    rules:
      # Availability: a scrape target of the "guruconnect" job has reported
      # up == 0 continuously for one minute. The expression only matches
      # targets that still have an `up` series.
      - alert: GuruConnectDown
        expr: 'up{job="guruconnect"} == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'GuruConnect server is down'
          description: >-
            GuruConnect server on {{ $labels.instance }} has been down
            for more than 1 minute

      # High error rate
      #
      # Fires when a guruconnect_errors_total series has averaged more than
      # 10 errors/second over a 5-minute window, and that has held for 5 minutes.
      # The description names the affected instance so the notification is
      # actionable when more than one GuruConnect server is scraped.
      - alert: HighErrorRate
        expr: rate(guruconnect_errors_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate on {{ $labels.instance }} is {{ $value | humanize }} errors/second over the last 5 minutes"

      # Too many active sessions
      #
      # Fires when guruconnect_active_sessions stays above 100 for 5 minutes.
      # The threshold in the description is hard-coded; keep it in sync with
      # the value used in `expr`.
      # The description names the affected instance so the notification is
      # actionable when more than one GuruConnect server is scraped.
      - alert: TooManyActiveSessions
        expr: guruconnect_active_sessions > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Too many active sessions"
          description: "There are {{ $value }} active sessions on {{ $labels.instance }}, exceeding threshold of 100"

      # High request latency
      #
      # Fires when the 95th percentile of request duration, estimated from the
      # histogram buckets over a 5-minute window, stays above 1 second for
      # 5 minutes. The bucket rates are not aggregated, so the quantile is
      # computed (and an alert raised) separately for each label combination
      # of the histogram.
      # The description names the affected instance so the notification is
      # actionable when more than one GuruConnect server is scraped.
      - alert: HighRequestLatency
        expr: histogram_quantile(0.95, rate(guruconnect_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High request latency"
          description: "95th percentile request latency on {{ $labels.instance }} is {{ $value | humanize }}s"

      # Database operations failing
      #
      # Fires when database operations labelled status="error" have averaged
      # more than 1 per second over a 5-minute window, sustained for 5 minutes.
      # The description names the affected instance so the notification is
      # actionable when more than one GuruConnect server is scraped.
      - alert: DatabaseOperationsFailure
        expr: rate(guruconnect_db_operations_total{status="error"}[5m]) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Database operations failing"
          description: "Database error rate on {{ $labels.instance }} is {{ $value | humanize }} errors/second"

      # Server uptime low (recent restart)
      #
      # Informational: fires once guruconnect_uptime_seconds has been below 300
      # for one minute, and clears when the value reaches 300 again.
      # NOTE(review): presumably the metric counts seconds since process start;
      # confirm against the exporter.
      # The description names the affected instance and renders the uptime with
      # humanizeDuration (e.g. "2m 5s") instead of a bare number of seconds.
      - alert: ServerRestarted
        expr: guruconnect_uptime_seconds < 300
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "Server recently restarted"
          description: "Server {{ $labels.instance }} uptime is only {{ $value | humanizeDuration }}, indicating a recent restart"