Some checks failed
Build and Test / Build Server (Linux) (push) Has been cancelled
Build and Test / Build Agent (Windows) (push) Has been cancelled
Build and Test / Security Audit (push) Has been cancelled
Build and Test / Build Summary (push) Has been cancelled
Run Tests / Test Server (push) Has been cancelled
Run Tests / Test Agent (push) Has been cancelled
Run Tests / Code Coverage (push) Has been cancelled
Run Tests / Lint and Format Check (push) Has been cancelled
Brings azcomputerguru/guru-connect up to the authoritative working copy that had been maintained in the claudetools monorepo: Phase 1 security and infrastructure (middleware, metrics, utils, token blacklist, deployment scripts, security audits) plus the native-remote-control integration spec. Preserves the repo .gitignore, .cargo, and server/static/downloads. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
69 lines
2.3 KiB
YAML
69 lines
2.3 KiB
YAML
# Prometheus Alert Rules for GuruConnect
#
# This file defines alerting rules for monitoring GuruConnect health and performance.
# Copy to /etc/prometheus/alerts.yml and reference in prometheus.yml
#
# Layout: one rule group (guruconnect_alerts) holding six alerting rules.
# Each rule has an expression (`expr`), a hold time (`for`) that the
# expression must stay true before the alert fires, a `severity` label
# (critical / warning / info), and templated `summary` / `description`
# annotations.

groups:
  - name: guruconnect_alerts
    # Every rule in this group is evaluated once per 30 seconds.
    interval: 30s
    rules:
      # GuruConnect is down
      # Fires when the `up` series for job "guruconnect" has been 0 for 1 minute.
      - alert: GuruConnectDown
        expr: up{job="guruconnect"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "GuruConnect server is down"
          description: "GuruConnect server on {{ $labels.instance }} has been down for more than 1 minute"

      # High error rate
      # Fires when the per-second rate of guruconnect_errors_total, averaged
      # over a 5-minute window, stays above 10 for 5 minutes.
      - alert: HighErrorRate
        expr: rate(guruconnect_errors_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanize }} errors/second over the last 5 minutes"

      # Too many active sessions
      # Fires when guruconnect_active_sessions stays above 100 for 5 minutes.
      # The threshold is repeated in the description text; keep both in sync.
      - alert: TooManyActiveSessions
        expr: guruconnect_active_sessions > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Too many active sessions"
          description: "There are {{ $value }} active sessions, exceeding threshold of 100"

      # High request latency
      # Fires when the 95th-percentile request duration, estimated from the
      # histogram buckets over a 5-minute window, stays above 1 second for
      # 5 minutes.
      # NOTE(review): the buckets are not aggregated with `sum by (le)`, so the
      # quantile is computed separately for each label combination on the
      # histogram - confirm that per-series alerting is intended.
      - alert: HighRequestLatency
        expr: histogram_quantile(0.95, rate(guruconnect_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High request latency"
          description: "95th percentile request latency is {{ $value | humanize }}s"

      # Database operations failing
      # Fires when operations labelled status="error" occur at more than
      # 1 per second (5-minute rate) for 5 minutes.
      - alert: DatabaseOperationsFailure
        expr: rate(guruconnect_db_operations_total{status="error"}[5m]) > 1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Database operations failing"
          description: "Database error rate is {{ $value | humanize }} errors/second"

      # Server uptime low (recent restart)
      # Fires when guruconnect_uptime_seconds has been below 300 for 1 minute.
      # Informational only (severity: info).
      - alert: ServerRestarted
        expr: guruconnect_uptime_seconds < 300
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "Server recently restarted"
          description: "Server uptime is only {{ $value | humanize }}s, indicating a recent restart"