#!/bin/bash
# GuruConnect Health Monitoring Script
# Checks server health and sends alerts if issues detected
#
# Every setting in the Configuration and Thresholds sections can be
# overridden from the environment; the values shown are the defaults.
#
# Exit status: 0 when the final status is OK, 1 for WARNING, 2 for ERROR.

set -e

# Configuration
HEALTH_URL="${HEALTH_URL:-http://172.16.3.30:3002/health}"
ALERT_EMAIL="${ALERT_EMAIL:-admin@azcomputerguru.com}"
LOG_FILE="${LOG_FILE:-/var/log/guruconnect/health-monitor.log}"

# Thresholds
MAX_DISK_USAGE="${MAX_DISK_USAGE:-90}"
MAX_MEMORY_USAGE="${MAX_MEMORY_USAGE:-90}"
MAX_SESSIONS="${MAX_SESSIONS:-100}"

# Colors (ANSI escape sequences; kept for compatibility, not referenced by
# the checks in this part of the script)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Make sure the log file can be written before the first log call. This is
# best-effort: a missing or read-only log directory must not stop the health
# checks themselves from running.
LOG_DIR=$(dirname -- "$LOG_FILE")
if ! { mkdir -p -- "$LOG_DIR" && touch -- "$LOG_FILE"; } 2>/dev/null; then
  printf 'warning: cannot write to %s; logging to stdout only\n' "$LOG_FILE" >&2
fi

#######################################
# Logging function: print a timestamped message and append it to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   the timestamped line on stdout
# Returns:   always 0; a failed append to LOG_FILE is ignored on purpose so
#            that `set -e` does not abort the monitor because of logging.
#######################################
log() {
  local line
  line="[$(date +'%Y-%m-%d %H:%M:%S')] $1"
  printf '%s\n' "$line"
  { printf '%s\n' "$line" >>"$LOG_FILE"; } 2>/dev/null || true
}

# Health check result
HEALTH_STATUS="OK"
HEALTH_ISSUES=()

log "========================================="
log "GuruConnect Health Check"
log "========================================="

# Check 1: HTTP health endpoint
# curl prints only the HTTP status code; a non-zero curl exit status
# (connection refused, timeout, ...) selects the "not reachable" branch.
log "Checking HTTP health endpoint..."
if HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "$HEALTH_URL"); then
  if [ "$HTTP_STATUS" = "200" ]; then
    log "[OK] HTTP health endpoint responding (HTTP $HTTP_STATUS)"
  else
    log "[ERROR] HTTP health endpoint returned HTTP $HTTP_STATUS"
    HEALTH_STATUS="ERROR"
    HEALTH_ISSUES+=("HTTP health endpoint returned HTTP $HTTP_STATUS")
  fi
else
  log "[ERROR] HTTP health endpoint not reachable"
  HEALTH_STATUS="ERROR"
  HEALTH_ISSUES+=("HTTP health endpoint not reachable")
fi

# Check 2: Systemd service status
log "Checking systemd service status..."
if systemctl is-active --quiet guruconnect 2>/dev/null; then
  log "[OK] guruconnect service is running"
else
  log "[ERROR] guruconnect service is not running"
  HEALTH_STATUS="ERROR"
  HEALTH_ISSUES+=("guruconnect service is not running")
fi

# Check 3: Disk space
log "Checking disk space..."
#######################################
# Record a WARNING-level finding without downgrading an earlier ERROR.
# Globals:   HEALTH_STATUS (read/written), HEALTH_ISSUES (appended)
# Arguments: $1 - issue description
# Returns:   0
#######################################
record_warning() {
  HEALTH_ISSUES+=("$1")
  if [ "$HEALTH_STATUS" != "ERROR" ]; then
    HEALTH_STATUS="WARNING"
  fi
}

# Disk usage of / as an integer percentage. `df -P` forces the POSIX
# one-line-per-filesystem layout so a long device name cannot push the
# figures onto a third line and break the NR==2 selection.
DISK_USAGE=$(df -P / 2>/dev/null | awk 'NR==2 {sub(/%/, "", $5); print $5}')
if ! [[ "$DISK_USAGE" =~ ^[0-9]+$ ]]; then
  # An empty or non-numeric reading is reported as such rather than being
  # passed to an integer comparison.
  log "[WARNING] Unable to determine disk usage"
  record_warning "Unable to determine disk usage"
elif [ "$DISK_USAGE" -lt "$MAX_DISK_USAGE" ]; then
  log "[OK] Disk usage: ${DISK_USAGE}% (threshold: ${MAX_DISK_USAGE}%)"
else
  log "[WARNING] Disk usage: ${DISK_USAGE}% (threshold: ${MAX_DISK_USAGE}%)"
  record_warning "Disk usage ${DISK_USAGE}% exceeds threshold"
fi

# Check 4: Memory usage
# Percentage is column 3 divided by column 2 of the second line of `free`.
# The `$2 > 0` guard keeps awk from failing on a division by zero.
log "Checking memory usage..."
MEMORY_USAGE=$(free 2>/dev/null | awk 'NR==2 && $2 > 0 {printf "%.0f", $3/$2 * 100.0}')
if ! [[ "$MEMORY_USAGE" =~ ^[0-9]+$ ]]; then
  log "[WARNING] Unable to determine memory usage"
  record_warning "Unable to determine memory usage"
elif [ "$MEMORY_USAGE" -lt "$MAX_MEMORY_USAGE" ]; then
  log "[OK] Memory usage: ${MEMORY_USAGE}% (threshold: ${MAX_MEMORY_USAGE}%)"
else
  log "[WARNING] Memory usage: ${MEMORY_USAGE}% (threshold: ${MAX_MEMORY_USAGE}%)"
  record_warning "Memory usage ${MEMORY_USAGE}% exceeds threshold"
fi

# Check 5: Database connectivity
# Only the systemd unit state is inspected; no connection is opened.
log "Checking database connectivity..."
if systemctl is-active --quiet postgresql 2>/dev/null; then
  log "[OK] PostgreSQL service is running"
else
  log "[WARNING] PostgreSQL service is not running"
  record_warning "PostgreSQL service is not running"
fi

# Check 6: Metrics endpoint
log "Checking Prometheus metrics endpoint..."
# URL of the Prometheus metrics endpoint; overridable from the environment.
METRICS_URL="${METRICS_URL:-http://172.16.3.30:3002/metrics}"

# A non-zero curl exit status means the endpoint could not be reached; a
# response without the expected metric name is only a WARNING.
if METRICS=$(curl -s --max-time 5 "$METRICS_URL"); then
  if [[ "$METRICS" == *guruconnect_uptime_seconds* ]]; then
    log "[OK] Prometheus metrics endpoint working"
  else
    log "[WARNING] Prometheus metrics endpoint not returning expected data"
    # Never downgrade an ERROR recorded by an earlier check.
    if [ "$HEALTH_STATUS" != "ERROR" ]; then
      HEALTH_STATUS="WARNING"
    fi
    HEALTH_ISSUES+=("Prometheus metrics endpoint not returning expected data")
  fi
else
  log "[ERROR] Prometheus metrics endpoint not reachable"
  HEALTH_STATUS="ERROR"
  HEALTH_ISSUES+=("Prometheus metrics endpoint not reachable")
fi

# Summary
log "========================================="
log "Health Check Summary"
log "========================================="
log "Status: $HEALTH_STATUS"

if [ "${#HEALTH_ISSUES[@]}" -gt 0 ]; then
  log "Issues found:"
  for issue in "${HEALTH_ISSUES[@]}"; do
    log "  - $issue"
  done

  # Send alert email (if configured)
  # The pipeline is the condition of an `if`, so a failing `mail` is logged
  # instead of aborting the script under `set -e` before the exit code
  # below is produced.
  if command -v mail &>/dev/null; then
    if {
      echo "GuruConnect Health Check FAILED"
      echo ""
      echo "Status: $HEALTH_STATUS"
      echo "Date: $(date)"
      echo ""
      echo "Issues:"
      for issue in "${HEALTH_ISSUES[@]}"; do
        echo "  - $issue"
      done
    } | mail -s "GuruConnect Health Check Alert" "$ALERT_EMAIL"; then
      log "Alert email sent to $ALERT_EMAIL"
    else
      log "[WARNING] Failed to send alert email to $ALERT_EMAIL"
    fi
  fi
else
  log "All checks passed!"
fi

# Exit with appropriate code: 2 for ERROR, 1 for WARNING, 0 otherwise.
case "$HEALTH_STATUS" in
  ERROR) exit 2 ;;
  WARNING) exit 1 ;;
  *) exit 0 ;;
esac