claudetools/projects/msp-tools/guru-connect/server/health-monitor.sh

#!/bin/bash
# GuruConnect Health Monitoring Script
# Checks server health and sends alerts if issues detected

# Abort the run as soon as a command fails outside of a conditional context.
set -o errexit

# Configuration: where the log goes, who is alerted, which endpoint is probed.
LOG_FILE='/var/log/guruconnect/health-monitor.log'
ALERT_EMAIL='admin@azcomputerguru.com'
HEALTH_URL='http://172.16.3.30:3002/health'

# Thresholds. Disk and memory values are compared against integer percentages.
# NOTE(review): MAX_SESSIONS is not referenced by any check visible in this script.
MAX_SESSIONS=100
MAX_MEMORY_USAGE=90
MAX_DISK_USAGE=90

# Colors, kept as literal backslash sequences (not expanded escape characters).
# NOTE(review): none of these are referenced in the visible part of the script.
NC='\033[0m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
RED='\033[0;31m'

# Logging function
# Prints a timestamped message to stdout and appends the same line to LOG_FILE.
# Globals:   LOG_FILE (read), LOG_WRITE_WARNED (written once on write failure)
# Arguments: $1 - message text
# Outputs:   the timestamped line on stdout; a one-time warning on stderr when
#            the log file cannot be written
# Returns:   0 always, so an unwritable log file cannot abort the script under
#            `set -e` before any health check has run
log() {
    local entry log_dir
    entry="[$(date +'%Y-%m-%d %H:%M:%S')] $1"
    printf '%s\n' "$entry"

    # Create the log directory on demand; failure is handled by the write below.
    log_dir=$(dirname -- "$LOG_FILE")
    [ -d "$log_dir" ] || mkdir -p -- "$log_dir" 2>/dev/null || true

    # The brace group keeps the shell's own redirection error off the terminal.
    if ! { printf '%s\n' "$entry" >> "$LOG_FILE"; } 2>/dev/null; then
        if [ -z "${LOG_WRITE_WARNED:-}" ]; then
            printf 'health-monitor: cannot write to %s; logging to stdout only\n' "$LOG_FILE" >&2
            LOG_WRITE_WARNED=1
        fi
    fi
    return 0
}

# Health check result: overall severity plus one description per problem found.
declare -a HEALTH_ISSUES=()
HEALTH_STATUS="OK"

# Opening banner for this run.
banner_rule="========================================="
log "$banner_rule"
log "GuruConnect Health Check"
log "$banner_rule"

# Check 1: HTTP health endpoint
# Requests HEALTH_URL (5 second limit) and keeps only the HTTP status code.
# A curl failure or any status other than 200 is recorded as an ERROR.
log "Checking HTTP health endpoint..."
if ! HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$HEALTH_URL" --max-time 5); then
    log "[ERROR] HTTP health endpoint not reachable"
    HEALTH_ISSUES+=("HTTP health endpoint not reachable")
    HEALTH_STATUS="ERROR"
elif [ "$HTTP_STATUS" != "200" ]; then
    log "[ERROR] HTTP health endpoint returned HTTP $HTTP_STATUS"
    HEALTH_ISSUES+=("HTTP health endpoint returned HTTP $HTTP_STATUS")
    HEALTH_STATUS="ERROR"
else
    log "[OK] HTTP health endpoint responding (HTTP $HTTP_STATUS)"
fi

# Check 2: Systemd service status
# Asks systemctl whether the guruconnect unit is active; anything else is an ERROR.
log "Checking systemd service status..."
if ! systemctl is-active --quiet guruconnect 2>/dev/null; then
    log "[ERROR] guruconnect service is not running"
    HEALTH_ISSUES+=("guruconnect service is not running")
    HEALTH_STATUS="ERROR"
else
    log "[OK] guruconnect service is running"
fi

# Check 3: Disk space
# Reads the used percentage of the root filesystem and compares it with
# MAX_DISK_USAGE. Problems are recorded as WARNING, never replacing an ERROR
# recorded by an earlier check.
log "Checking disk space..."
# -P selects the POSIX output format (one line per filesystem), so the data row
# is always line 2 even when the device name is long.
DISK_USAGE=$(df -P / | awk 'NR==2 {sub(/%/, "", $5); print $5}')
case "$DISK_USAGE" in
    '' | *[!0-9]*)
        # df produced nothing usable; report that instead of a blank percentage.
        log "[WARNING] Unable to determine disk usage"
        [ "$HEALTH_STATUS" = "ERROR" ] || HEALTH_STATUS="WARNING"
        HEALTH_ISSUES+=("Unable to determine disk usage")
        ;;
    *)
        if [ "$DISK_USAGE" -lt "$MAX_DISK_USAGE" ]; then
            log "[OK] Disk usage: ${DISK_USAGE}% (threshold: ${MAX_DISK_USAGE}%)"
        else
            log "[WARNING] Disk usage: ${DISK_USAGE}% (threshold: ${MAX_DISK_USAGE}%)"
            # Keep an existing ERROR so the final exit code is not downgraded.
            [ "$HEALTH_STATUS" = "ERROR" ] || HEALTH_STATUS="WARNING"
            HEALTH_ISSUES+=("Disk usage ${DISK_USAGE}% exceeds threshold")
        fi
        ;;
esac

# Check 4: Memory usage
# Computes column 3 / column 2 of the second line of `free` output as a rounded
# percentage and compares it with MAX_MEMORY_USAGE. Problems are recorded as
# WARNING, never replacing an ERROR recorded by an earlier check.
log "Checking memory usage..."
# The `$2 > 0` guard avoids an awk division-by-zero; no output means "unknown".
MEMORY_USAGE=$(free | awk 'NR==2 && $2 > 0 {printf "%.0f", $3 / $2 * 100.0}')
case "$MEMORY_USAGE" in
    '' | *[!0-9]*)
        # free was missing or unparsable; report that instead of a blank value.
        log "[WARNING] Unable to determine memory usage"
        [ "$HEALTH_STATUS" = "ERROR" ] || HEALTH_STATUS="WARNING"
        HEALTH_ISSUES+=("Unable to determine memory usage")
        ;;
    *)
        if [ "$MEMORY_USAGE" -lt "$MAX_MEMORY_USAGE" ]; then
            log "[OK] Memory usage: ${MEMORY_USAGE}% (threshold: ${MAX_MEMORY_USAGE}%)"
        else
            log "[WARNING] Memory usage: ${MEMORY_USAGE}% (threshold: ${MAX_MEMORY_USAGE}%)"
            # Keep an existing ERROR so the final exit code is not downgraded.
            [ "$HEALTH_STATUS" = "ERROR" ] || HEALTH_STATUS="WARNING"
            HEALTH_ISSUES+=("Memory usage ${MEMORY_USAGE}% exceeds threshold")
        fi
        ;;
esac

# Check 5: Database connectivity
# Checks only that the postgresql systemd unit is active; no database
# connection is opened. An inactive unit is a WARNING.
log "Checking database connectivity..."
if systemctl is-active --quiet postgresql 2>/dev/null; then
    log "[OK] PostgreSQL service is running"
else
    log "[WARNING] PostgreSQL service is not running"
    # Keep an existing ERROR so the final exit code is not downgraded.
    [ "$HEALTH_STATUS" = "ERROR" ] || HEALTH_STATUS="WARNING"
    HEALTH_ISSUES+=("PostgreSQL service is not running")
fi

# Check 6: Metrics endpoint
# Fetches the metrics page (5 second limit) and looks for the
# guruconnect_uptime_seconds metric name. A curl failure is an ERROR; a
# response without the expected metric is a WARNING.
log "Checking Prometheus metrics endpoint..."
if METRICS=$(curl -s "http://172.16.3.30:3002/metrics" --max-time 5); then
    # Here-string instead of echo|grep: no extra process, no echo option parsing.
    if grep -q "guruconnect_uptime_seconds" <<<"$METRICS"; then
        log "[OK] Prometheus metrics endpoint working"
    else
        log "[WARNING] Prometheus metrics endpoint not returning expected data"
        # Keep an existing ERROR so the final exit code is not downgraded.
        [ "$HEALTH_STATUS" = "ERROR" ] || HEALTH_STATUS="WARNING"
        HEALTH_ISSUES+=("Prometheus metrics endpoint not returning expected data")
    fi
else
    log "[ERROR] Prometheus metrics endpoint not reachable"
    HEALTH_STATUS="ERROR"
    HEALTH_ISSUES+=("Prometheus metrics endpoint not reachable")
fi

# Summary
# Logs the overall status and every recorded issue. When issues exist and a
# `mail` command is available, the same list is mailed to ALERT_EMAIL.
log "========================================="
log "Health Check Summary"
log "========================================="
log "Status: $HEALTH_STATUS"

if [ "${#HEALTH_ISSUES[@]}" -gt 0 ]; then
    log "Issues found:"
    for issue in "${HEALTH_ISSUES[@]}"; do
        log "  - $issue"
    done

    # Send alert email (if configured)
    if command -v mail &> /dev/null; then
        # Run the pipeline as an `if` condition: a failing mail command must not
        # trip `set -e`, which would replace the health exit code with mail's.
        if {
            printf '%s\n' "GuruConnect Health Check FAILED"
            printf '\n'
            printf '%s\n' "Status: $HEALTH_STATUS"
            printf '%s\n' "Date: $(date)"
            printf '\n'
            printf '%s\n' "Issues:"
            for issue in "${HEALTH_ISSUES[@]}"; do
                printf '%s\n' "  - $issue"
            done
        } | mail -s "GuruConnect Health Check Alert" "$ALERT_EMAIL"; then
            log "Alert email sent to $ALERT_EMAIL"
        else
            log "[WARNING] Failed to send alert email to $ALERT_EMAIL"
        fi
    fi
else
    log "All checks passed!"
fi

# Exit with appropriate code
# ERROR maps to 2, WARNING to 1, and any other status to 0.
case "$HEALTH_STATUS" in
    ERROR)
        exit 2
        ;;
    WARNING)
        exit 1
        ;;
    *)
        exit 0
        ;;
esac