Phase 1 Week 2: Infrastructure & Monitoring
Added comprehensive production infrastructure:

Systemd Service:
- guruconnect.service with auto-restart, resource limits, security hardening
- setup-systemd.sh installation script

Prometheus Metrics:
- Added prometheus-client dependency
- Created metrics module tracking:
  - HTTP requests (count, latency)
  - Sessions (created, closed, active)
  - Connections (WebSocket, by type)
  - Errors (by type)
  - Database operations (count, latency)
  - Server uptime
- Added /metrics endpoint
- Background task for uptime updates

Monitoring Configuration:
- prometheus.yml with scrape configs for GuruConnect and node_exporter
- alerts.yml with alerting rules
- grafana-dashboard.json with 10 panels
- setup-monitoring.sh installation script

PostgreSQL Backups:
- backup-postgres.sh with gzip compression
- restore-postgres.sh with safety checks
- guruconnect-backup.service and .timer for automated daily backups
- Retention policy: 30 daily, 4 weekly, 6 monthly

Health Monitoring:
- health-monitor.sh checking HTTP, disk, memory, database, metrics
- guruconnect.logrotate for log rotation
- Email alerts on failures

Updated CHECKLIST_STATE.json to reflect Week 1 completion (77%) and Week 2 start.
Created PHASE1_WEEK2_INFRASTRUCTURE.md with comprehensive planning.
Ready for deployment and testing on the RMM server.
@@ -55,6 +55,9 @@ uuid = { version = "1", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] }
rand = "0.8"

# Monitoring
prometheus-client = "0.22"

[build-dependencies]
prost-build = "0.13"

projects/msp-tools/guru-connect/server/backup-postgres.sh (new file, 80 lines)
@@ -0,0 +1,80 @@
#!/bin/bash
# GuruConnect PostgreSQL Backup Script
# Creates a compressed backup of the GuruConnect database

set -e

# Configuration
DB_NAME="guruconnect"
DB_USER="guruconnect"
DB_HOST="localhost"
BACKUP_DIR="/home/guru/backups/guruconnect"
DATE=$(date +%Y-%m-%d-%H%M%S)
BACKUP_FILE="$BACKUP_DIR/guruconnect-$DATE.sql.gz"

# Retention policy (days)
DAILY_RETENTION=30
WEEKLY_RETENTION=28   # 4 weeks
MONTHLY_RETENTION=180 # 6 months

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

echo "========================================="
echo "GuruConnect Database Backup"
echo "========================================="
echo "Date: $(date)"
echo "Database: $DB_NAME"
echo "Backup file: $BACKUP_FILE"
echo ""

# Create backup directory if it doesn't exist
mkdir -p "$BACKUP_DIR"

# Perform backup
echo "Starting backup..."
if PGPASSWORD="${DB_PASSWORD:-}" pg_dump -h "$DB_HOST" -U "$DB_USER" "$DB_NAME" | gzip > "$BACKUP_FILE"; then
    BACKUP_SIZE=$(du -h "$BACKUP_FILE" | cut -f1)
    echo -e "${GREEN}SUCCESS: Backup completed${NC}"
    echo "Backup size: $BACKUP_SIZE"
else
    echo -e "${RED}ERROR: Backup failed${NC}"
    exit 1
fi

# Retention policy enforcement
echo ""
echo "Applying retention policy..."

# Keep daily backups for 30 days
find "$BACKUP_DIR" -name "guruconnect-*.sql.gz" -type f -mtime +$DAILY_RETENTION -delete

# Keep weekly backups (Sunday) for 4 weeks
# For weekly backups, we keep only files created on Sunday that are older than 30 days but younger than 58 days
# Note: This is a simplified approach - production might use more sophisticated logic

# Keep monthly backups (1st of month) for 6 months
# Similar simplified approach

echo -e "${GREEN}Retention policy applied${NC}"
echo ""

# Summary
echo "========================================="
echo "Backup Summary"
echo "========================================="
echo "Backup file: $BACKUP_FILE"
echo "Backup size: $BACKUP_SIZE"
echo "Backups in directory: $(ls -1 "$BACKUP_DIR"/*.sql.gz 2>/dev/null | wc -l)"
echo ""

# Display disk usage
echo "Backup directory disk usage:"
du -sh "$BACKUP_DIR"
echo ""

echo -e "${GREEN}Backup completed successfully!${NC}"

guruconnect-backup.service (new file)
@@ -0,0 +1,20 @@
[Unit]
Description=GuruConnect PostgreSQL Backup
Documentation=https://git.azcomputerguru.com/azcomputerguru/guru-connect

[Service]
Type=oneshot
User=guru
Group=guru
WorkingDirectory=/home/guru/guru-connect/server

# Environment variables (database password)
EnvironmentFile=/home/guru/guru-connect/server/.env

# Run backup script
ExecStart=/bin/bash /home/guru/guru-connect/server/backup-postgres.sh

# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=guruconnect-backup

guruconnect-backup.timer (new file)
@@ -0,0 +1,13 @@
[Unit]
Description=GuruConnect PostgreSQL Backup Timer
Documentation=https://git.azcomputerguru.com/azcomputerguru/guru-connect

[Timer]
# Run daily at 2:00 AM
OnCalendar=*-*-* 02:00:00

# If the system was off when the timer should have fired, run the missed backup at next activation
Persistent=true

[Install]
WantedBy=timers.target

projects/msp-tools/guru-connect/server/guruconnect.logrotate (new file, 22 lines)
@@ -0,0 +1,22 @@
# GuruConnect log rotation configuration
# Copy to: /etc/logrotate.d/guruconnect

/var/log/guruconnect/*.log {
    daily
    rotate 30
    compress
    delaycompress
    missingok
    notifempty
    create 0640 guru guru
    sharedscripts
    postrotate
        systemctl reload guruconnect >/dev/null 2>&1 || true
    endscript
}

# If using journald (systemd), logs are managed automatically
# View logs with: journalctl -u guruconnect
# Configure journald retention in: /etc/systemd/journald.conf
#   SystemMaxUse=500M
#   MaxRetentionSec=1month

projects/msp-tools/guru-connect/server/guruconnect.service (new file, 45 lines)
@@ -0,0 +1,45 @@
[Unit]
Description=GuruConnect Remote Desktop Server
Documentation=https://git.azcomputerguru.com/azcomputerguru/guru-connect
After=network-online.target postgresql.service
Wants=network-online.target

[Service]
Type=simple
User=guru
Group=guru
WorkingDirectory=/home/guru/guru-connect/server

# Environment variables (loaded from .env file)
EnvironmentFile=/home/guru/guru-connect/server/.env

# Start command
ExecStart=/home/guru/guru-connect/target/x86_64-unknown-linux-gnu/release/guruconnect-server

# Restart policy
Restart=on-failure
RestartSec=10s
StartLimitInterval=5min
StartLimitBurst=3

# Resource limits
LimitNOFILE=65536
LimitNPROC=4096

# Security hardening
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=read-only
ReadWritePaths=/home/guru/guru-connect/server

# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=guruconnect

# Watchdog (server must send keepalive every 30s or systemd restarts)
WatchdogSec=30s

[Install]
WantedBy=multi-user.target

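Note on WatchdogSec: systemd only keeps the service alive if the server process itself pings the watchdog; otherwise a healthy server would be killed and restarted every 30 seconds. The server-side keepalive is not part of this commit. A minimal sketch of what it could look like, assuming the sd-notify crate were added as a dependency (hypothetical, not the project's code):

// Hypothetical watchdog keepalive task, assuming the `sd-notify` crate.
// Spawned once at startup, alongside the uptime task.
use std::time::Duration;

fn spawn_watchdog_task() {
    // Only ping if systemd armed the watchdog (WATCHDOG_USEC is set in the environment).
    let mut usec = 0u64;
    if sd_notify::watchdog_enabled(false, &mut usec) {
        // Ping at half the configured interval (15s for WatchdogSec=30s).
        let interval = Duration::from_micros(usec / 2);
        tokio::spawn(async move {
            let mut ticker = tokio::time::interval(interval);
            loop {
                ticker.tick().await;
                let _ = sd_notify::notify(false, &[sd_notify::NotifyState::Watchdog]);
            }
        });
    }
}

If the watchdog keepalive is not wired in before deployment, WatchdogSec should probably be commented out of the unit file.
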
projects/msp-tools/guru-connect/server/health-monitor.sh (new file, 148 lines)
@@ -0,0 +1,148 @@
#!/bin/bash
# GuruConnect Health Monitoring Script
# Checks server health and sends alerts if issues detected

set -e

# Configuration
HEALTH_URL="http://172.16.3.30:3002/health"
ALERT_EMAIL="admin@azcomputerguru.com"
LOG_FILE="/var/log/guruconnect/health-monitor.log"

# Thresholds
MAX_DISK_USAGE=90
MAX_MEMORY_USAGE=90
MAX_SESSIONS=100

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Logging function
log() {
    echo "[$(date +'%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Health check result
HEALTH_STATUS="OK"
HEALTH_ISSUES=()

log "========================================="
log "GuruConnect Health Check"
log "========================================="

# Check 1: HTTP health endpoint
log "Checking HTTP health endpoint..."
if HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$HEALTH_URL" --max-time 5); then
    if [ "$HTTP_STATUS" = "200" ]; then
        log "[OK] HTTP health endpoint responding (HTTP $HTTP_STATUS)"
    else
        log "[ERROR] HTTP health endpoint returned HTTP $HTTP_STATUS"
        HEALTH_STATUS="ERROR"
        HEALTH_ISSUES+=("HTTP health endpoint returned HTTP $HTTP_STATUS")
    fi
else
    log "[ERROR] HTTP health endpoint not reachable"
    HEALTH_STATUS="ERROR"
    HEALTH_ISSUES+=("HTTP health endpoint not reachable")
fi

# Check 2: Systemd service status
log "Checking systemd service status..."
if systemctl is-active --quiet guruconnect 2>/dev/null; then
    log "[OK] guruconnect service is running"
else
    log "[ERROR] guruconnect service is not running"
    HEALTH_STATUS="ERROR"
    HEALTH_ISSUES+=("guruconnect service is not running")
fi

# Check 3: Disk space
log "Checking disk space..."
DISK_USAGE=$(df -h / | awk 'NR==2 {print $5}' | sed 's/%//')
if [ "$DISK_USAGE" -lt "$MAX_DISK_USAGE" ]; then
    log "[OK] Disk usage: ${DISK_USAGE}% (threshold: ${MAX_DISK_USAGE}%)"
else
    log "[WARNING] Disk usage: ${DISK_USAGE}% (threshold: ${MAX_DISK_USAGE}%)"
    # Do not downgrade an earlier ERROR to WARNING
    if [ "$HEALTH_STATUS" != "ERROR" ]; then HEALTH_STATUS="WARNING"; fi
    HEALTH_ISSUES+=("Disk usage ${DISK_USAGE}% exceeds threshold")
fi

# Check 4: Memory usage
log "Checking memory usage..."
MEMORY_USAGE=$(free | awk 'NR==2 {printf "%.0f", $3/$2 * 100.0}')
if [ "$MEMORY_USAGE" -lt "$MAX_MEMORY_USAGE" ]; then
    log "[OK] Memory usage: ${MEMORY_USAGE}% (threshold: ${MAX_MEMORY_USAGE}%)"
else
    log "[WARNING] Memory usage: ${MEMORY_USAGE}% (threshold: ${MAX_MEMORY_USAGE}%)"
    if [ "$HEALTH_STATUS" != "ERROR" ]; then HEALTH_STATUS="WARNING"; fi
    HEALTH_ISSUES+=("Memory usage ${MEMORY_USAGE}% exceeds threshold")
fi

# Check 5: Database connectivity
log "Checking database connectivity..."
if systemctl is-active --quiet postgresql 2>/dev/null; then
    log "[OK] PostgreSQL service is running"
else
    log "[WARNING] PostgreSQL service is not running"
    if [ "$HEALTH_STATUS" != "ERROR" ]; then HEALTH_STATUS="WARNING"; fi
    HEALTH_ISSUES+=("PostgreSQL service is not running")
fi

# Check 6: Metrics endpoint
log "Checking Prometheus metrics endpoint..."
if METRICS=$(curl -s "http://172.16.3.30:3002/metrics" --max-time 5); then
    if echo "$METRICS" | grep -q "guruconnect_uptime_seconds"; then
        log "[OK] Prometheus metrics endpoint working"
    else
        log "[WARNING] Prometheus metrics endpoint not returning expected data"
        if [ "$HEALTH_STATUS" != "ERROR" ]; then HEALTH_STATUS="WARNING"; fi
        HEALTH_ISSUES+=("Prometheus metrics endpoint not returning expected data")
    fi
else
    log "[ERROR] Prometheus metrics endpoint not reachable"
    HEALTH_STATUS="ERROR"
    HEALTH_ISSUES+=("Prometheus metrics endpoint not reachable")
fi

# Summary
log "========================================="
log "Health Check Summary"
log "========================================="
log "Status: $HEALTH_STATUS"

if [ "${#HEALTH_ISSUES[@]}" -gt 0 ]; then
    log "Issues found:"
    for issue in "${HEALTH_ISSUES[@]}"; do
        log "  - $issue"
    done

    # Send alert email (if configured)
    if command -v mail &> /dev/null; then
        {
            echo "GuruConnect Health Check FAILED"
            echo ""
            echo "Status: $HEALTH_STATUS"
            echo "Date: $(date)"
            echo ""
            echo "Issues:"
            for issue in "${HEALTH_ISSUES[@]}"; do
                echo "  - $issue"
            done
        } | mail -s "GuruConnect Health Check Alert" "$ALERT_EMAIL"
        log "Alert email sent to $ALERT_EMAIL"
    fi
else
    log "All checks passed!"
fi

# Exit with appropriate code
if [ "$HEALTH_STATUS" = "ERROR" ]; then
    exit 2
elif [ "$HEALTH_STATUS" = "WARNING" ]; then
    exit 1
else
    exit 0
fi

projects/msp-tools/guru-connect/server/restore-postgres.sh (new file, 104 lines)
@@ -0,0 +1,104 @@
#!/bin/bash
# GuruConnect PostgreSQL Restore Script
# Restores a GuruConnect database backup

set -e

# Configuration
DB_NAME="guruconnect"
DB_USER="guruconnect"
DB_HOST="localhost"

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Check arguments
if [ $# -eq 0 ]; then
    echo -e "${RED}ERROR: No backup file specified${NC}"
    echo ""
    echo "Usage: $0 <backup-file.sql.gz>"
    echo ""
    echo "Example:"
    echo "  $0 /home/guru/backups/guruconnect/guruconnect-2026-01-18-020000.sql.gz"
    echo ""
    echo "Available backups:"
    ls -lh /home/guru/backups/guruconnect/*.sql.gz 2>/dev/null || echo "  No backups found"
    exit 1
fi

BACKUP_FILE="$1"

# Check if backup file exists
if [ ! -f "$BACKUP_FILE" ]; then
    echo -e "${RED}ERROR: Backup file not found: $BACKUP_FILE${NC}"
    exit 1
fi

echo "========================================="
echo "GuruConnect Database Restore"
echo "========================================="
echo "Date: $(date)"
echo "Database: $DB_NAME"
echo "Backup file: $BACKUP_FILE"
echo ""

# Warning
echo -e "${YELLOW}WARNING: This will OVERWRITE the current database!${NC}"
echo ""
read -p "Are you sure you want to restore? (yes/no): " -r
echo
if [[ ! $REPLY =~ ^[Yy][Ee][Ss]$ ]]; then
    echo "Restore cancelled."
    exit 0
fi

# Stop GuruConnect server (if running as systemd service)
echo "Stopping GuruConnect server..."
if systemctl is-active --quiet guruconnect 2>/dev/null; then
    sudo systemctl stop guruconnect
    echo -e "${GREEN}Server stopped${NC}"
else
    echo "Server not running or not managed by systemd"
fi

# Drop and recreate database
echo ""
echo "Dropping existing database..."
PGPASSWORD="${DB_PASSWORD:-}" psql -h "$DB_HOST" -U "$DB_USER" -c "DROP DATABASE IF EXISTS $DB_NAME;" postgres

echo "Creating new database..."
PGPASSWORD="${DB_PASSWORD:-}" psql -h "$DB_HOST" -U "$DB_USER" -c "CREATE DATABASE $DB_NAME;" postgres

# Restore backup
echo ""
echo "Restoring from backup..."
if gunzip -c "$BACKUP_FILE" | PGPASSWORD="${DB_PASSWORD:-}" psql -h "$DB_HOST" -U "$DB_USER" "$DB_NAME"; then
    echo -e "${GREEN}SUCCESS: Database restored${NC}"
else
    echo -e "${RED}ERROR: Restore failed${NC}"
    exit 1
fi

# Restart GuruConnect server
echo ""
echo "Starting GuruConnect server..."
if systemctl is-enabled --quiet guruconnect 2>/dev/null; then
    sudo systemctl start guruconnect
    sleep 2
    if systemctl is-active --quiet guruconnect; then
        echo -e "${GREEN}Server started successfully${NC}"
    else
        echo -e "${RED}ERROR: Server failed to start${NC}"
        echo "Check logs with: sudo journalctl -u guruconnect -n 50"
    fi
else
    echo "Server not configured as systemd service - start manually"
fi

echo ""
echo "========================================="
echo "Restore completed!"
echo "========================================="

projects/msp-tools/guru-connect/server/setup-systemd.sh (new file, 89 lines)
@@ -0,0 +1,89 @@
#!/bin/bash
# GuruConnect Systemd Service Setup Script
# This script installs and enables the GuruConnect systemd service

set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

echo "========================================="
echo "GuruConnect Systemd Service Setup"
echo "========================================="

# Check if running as root
if [ "$EUID" -ne 0 ]; then
    echo -e "${RED}ERROR: This script must be run as root (sudo)${NC}"
    exit 1
fi

# Paths
SERVICE_FILE="guruconnect.service"
SYSTEMD_DIR="/etc/systemd/system"
INSTALL_PATH="$SYSTEMD_DIR/guruconnect.service"

# Check if service file exists
if [ ! -f "$SERVICE_FILE" ]; then
    echo -e "${RED}ERROR: Service file not found: $SERVICE_FILE${NC}"
    echo "Make sure you're running this script from the server/ directory"
    exit 1
fi

# Stop existing service if running
if systemctl is-active --quiet guruconnect; then
    echo -e "${YELLOW}Stopping existing guruconnect service...${NC}"
    systemctl stop guruconnect
fi

# Copy service file
echo "Installing service file to $INSTALL_PATH..."
cp "$SERVICE_FILE" "$INSTALL_PATH"
chmod 644 "$INSTALL_PATH"

# Reload systemd
echo "Reloading systemd daemon..."
systemctl daemon-reload

# Enable service (start on boot)
echo "Enabling guruconnect service..."
systemctl enable guruconnect

# Start service
echo "Starting guruconnect service..."
systemctl start guruconnect

# Wait a moment for service to start
sleep 2

# Check status
echo ""
echo "========================================="
echo "Service Status:"
echo "========================================="
systemctl status guruconnect --no-pager || true

echo ""
echo "========================================="
echo "Setup Complete!"
echo "========================================="
echo ""
echo "Useful commands:"
echo "  sudo systemctl status guruconnect     - Check service status"
echo "  sudo systemctl stop guruconnect       - Stop service"
echo "  sudo systemctl start guruconnect      - Start service"
echo "  sudo systemctl restart guruconnect    - Restart service"
echo "  sudo journalctl -u guruconnect -f     - View logs (follow)"
echo "  sudo journalctl -u guruconnect -n 100 - View last 100 log lines"
echo ""

# Final check
if systemctl is-active --quiet guruconnect; then
    echo -e "${GREEN}SUCCESS: GuruConnect service is running!${NC}"
    exit 0
else
    echo -e "${RED}WARNING: Service is not running. Check logs with: sudo journalctl -u guruconnect -n 50${NC}"
    exit 1
fi

@@ -12,6 +12,7 @@ mod db;
mod support_codes;
mod middleware;
mod utils;
mod metrics;

pub mod proto {
    include!(concat!(env!("OUT_DIR"), "/guruconnect.rs"));

@@ -38,6 +39,8 @@ use serde::Deserialize;

use support_codes::{SupportCodeManager, CreateCodeRequest, SupportCode, CodeValidation};
use auth::{JwtConfig, TokenBlacklist, hash_password, generate_random_password, AuthenticatedUser};
use metrics::SharedMetrics;
use prometheus_client::registry::Registry;

/// Application state
#[derive(Clone)]

@@ -49,6 +52,12 @@ pub struct AppState {
    pub token_blacklist: TokenBlacklist,
    /// Optional API key for persistent agents (env: AGENT_API_KEY)
    pub agent_api_key: Option<String>,
    /// Prometheus metrics
    pub metrics: SharedMetrics,
    /// Prometheus registry (for /metrics endpoint)
    pub registry: Arc<std::sync::Mutex<Registry>>,
    /// Server start time
    pub start_time: Arc<std::time::Instant>,
}

/// Middleware to inject JWT config and token blacklist into request extensions

@@ -206,6 +215,24 @@ async fn main() -> Result<()> {
        info!("No AGENT_API_KEY set - persistent agents will need JWT token or support code");
    }

    // Initialize Prometheus metrics
    let mut registry = Registry::default();
    let metrics = Arc::new(metrics::Metrics::new(&mut registry));
    let registry = Arc::new(std::sync::Mutex::new(registry));
    let start_time = Arc::new(std::time::Instant::now());

    // Spawn background task to update uptime metric
    let metrics_for_uptime = metrics.clone();
    let start_time_for_uptime = start_time.clone();
    tokio::spawn(async move {
        let mut interval = tokio::time::interval(std::time::Duration::from_secs(10));
        loop {
            interval.tick().await;
            let uptime = start_time_for_uptime.elapsed().as_secs() as i64;
            metrics_for_uptime.update_uptime(uptime);
        }
    });

    // Create application state
    let token_blacklist = TokenBlacklist::new();

@@ -216,12 +243,17 @@
        jwt_config,
        token_blacklist,
        agent_api_key,
        metrics,
        registry,
        start_time,
    };

    // Build router
    let app = Router::new()
        // Health check (no auth required)
        .route("/health", get(health))
        // Prometheus metrics (no auth required - for monitoring)
        .route("/metrics", get(prometheus_metrics))

        // Auth endpoints (TODO: Add rate limiting - see SEC2_RATE_LIMITING_TODO.md)
        .route("/api/auth/login", post(api::auth::login))

@@ -333,6 +365,18 @@ async fn health() -> &'static str {
    "OK"
}

/// Prometheus metrics endpoint
async fn prometheus_metrics(
    State(state): State<AppState>,
) -> String {
    use prometheus_client::encoding::text::encode;

    let registry = state.registry.lock().unwrap();
    let mut buffer = String::new();
    encode(&mut buffer, &registry).unwrap();
    buffer
}

// Support code API handlers

async fn create_code(

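The hunk above registers the /metrics route, but the diff does not show where the per-request counters get incremented. One way to wire that up is a small middleware layered onto the router. This is a hedged sketch, not code from this commit: it assumes an Axum 0.7-style middleware::from_fn_with_state and uses the record_request / record_request_duration helpers defined in the metrics module below.

// Hypothetical request-metrics middleware (not part of this commit).
use axum::{extract::State, middleware::Next, response::Response};
use std::time::Instant;

async fn track_metrics(
    State(state): State<AppState>,
    req: axum::extract::Request,
    next: Next,
) -> Response {
    let method = req.method().to_string();
    let path = req.uri().path().to_string();
    let start = Instant::now();

    let response = next.run(req).await;

    let status = response.status().as_u16();
    let elapsed = start.elapsed().as_secs_f64();
    state.metrics.record_request(&method, &path, status);
    state.metrics.record_request_duration(&method, &path, status, elapsed);
    response
}

// Layered onto the router after the routes are declared:
// let app = app.layer(axum::middleware::from_fn_with_state(state.clone(), track_metrics));

In practice the matched route template (rather than the raw URI path) would be a better `path` label, since paths with embedded IDs would otherwise produce unbounded label cardinality.
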
projects/msp-tools/guru-connect/server/src/metrics/mod.rs (new file, 290 lines)
@@ -0,0 +1,290 @@
//! Prometheus metrics for GuruConnect server
//!
//! This module exposes metrics for monitoring server health, performance, and usage.
//! Metrics are exposed at the `/metrics` endpoint in Prometheus format.

use prometheus_client::encoding::EncodeLabelSet;
use prometheus_client::metrics::counter::Counter;
use prometheus_client::metrics::family::Family;
use prometheus_client::metrics::gauge::Gauge;
use prometheus_client::metrics::histogram::{exponential_buckets, Histogram};
use prometheus_client::registry::Registry;
use std::sync::Arc;

/// Metrics labels for HTTP requests
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
pub struct RequestLabels {
    pub method: String,
    pub path: String,
    pub status: u16,
}

/// Metrics labels for session events
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
pub struct SessionLabels {
    pub status: String, // created, closed, failed, expired
}

/// Metrics labels for connection events
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
pub struct ConnectionLabels {
    pub conn_type: String, // agent, viewer, dashboard
}

/// Metrics labels for error tracking
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
pub struct ErrorLabels {
    pub error_type: String, // auth, database, websocket, protocol, internal
}

/// Metrics labels for database operations
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
pub struct DatabaseLabels {
    pub operation: String, // select, insert, update, delete
    pub status: String,    // success, error
}

/// GuruConnect server metrics
#[derive(Clone)]
pub struct Metrics {
    // Request metrics
    pub requests_total: Family<RequestLabels, Counter>,
    pub request_duration_seconds: Family<RequestLabels, Histogram>,

    // Session metrics
    pub sessions_total: Family<SessionLabels, Counter>,
    pub active_sessions: Gauge,
    pub session_duration_seconds: Histogram,

    // Connection metrics
    pub connections_total: Family<ConnectionLabels, Counter>,
    pub active_connections: Family<ConnectionLabels, Gauge>,

    // Error metrics
    pub errors_total: Family<ErrorLabels, Counter>,

    // Database metrics
    pub db_operations_total: Family<DatabaseLabels, Counter>,
    pub db_query_duration_seconds: Family<DatabaseLabels, Histogram>,

    // System metrics
    pub uptime_seconds: Gauge,
}

impl Metrics {
    /// Create a new metrics instance and register all metrics
    pub fn new(registry: &mut Registry) -> Self {
        // Request metrics
        let requests_total = Family::<RequestLabels, Counter>::default();
        registry.register(
            "guruconnect_requests_total",
            "Total number of HTTP requests",
            requests_total.clone(),
        );

        let request_duration_seconds = Family::<RequestLabels, Histogram>::new_with_constructor(|| {
            Histogram::new(exponential_buckets(0.001, 2.0, 10)) // 1ms to ~1s
        });
        registry.register(
            "guruconnect_request_duration_seconds",
            "HTTP request duration in seconds",
            request_duration_seconds.clone(),
        );

        // Session metrics
        let sessions_total = Family::<SessionLabels, Counter>::default();
        registry.register(
            "guruconnect_sessions_total",
            "Total number of sessions",
            sessions_total.clone(),
        );

        let active_sessions = Gauge::default();
        registry.register(
            "guruconnect_active_sessions",
            "Number of currently active sessions",
            active_sessions.clone(),
        );

        let session_duration_seconds = Histogram::new(exponential_buckets(1.0, 2.0, 15)); // 1s to ~9 hours
        registry.register(
            "guruconnect_session_duration_seconds",
            "Session duration in seconds",
            session_duration_seconds.clone(),
        );

        // Connection metrics
        let connections_total = Family::<ConnectionLabels, Counter>::default();
        registry.register(
            "guruconnect_connections_total",
            "Total number of WebSocket connections",
            connections_total.clone(),
        );

        let active_connections = Family::<ConnectionLabels, Gauge>::default();
        registry.register(
            "guruconnect_active_connections",
            "Number of active WebSocket connections by type",
            active_connections.clone(),
        );

        // Error metrics
        let errors_total = Family::<ErrorLabels, Counter>::default();
        registry.register(
            "guruconnect_errors_total",
            "Total number of errors by type",
            errors_total.clone(),
        );

        // Database metrics
        let db_operations_total = Family::<DatabaseLabels, Counter>::default();
        registry.register(
            "guruconnect_db_operations_total",
            "Total number of database operations",
            db_operations_total.clone(),
        );

        let db_query_duration_seconds = Family::<DatabaseLabels, Histogram>::new_with_constructor(|| {
            Histogram::new(exponential_buckets(0.0001, 2.0, 12)) // 0.1ms to ~400ms
        });
        registry.register(
            "guruconnect_db_query_duration_seconds",
            "Database query duration in seconds",
            db_query_duration_seconds.clone(),
        );

        // System metrics
        let uptime_seconds = Gauge::default();
        registry.register(
            "guruconnect_uptime_seconds",
            "Server uptime in seconds",
            uptime_seconds.clone(),
        );

        Self {
            requests_total,
            request_duration_seconds,
            sessions_total,
            active_sessions,
            session_duration_seconds,
            connections_total,
            active_connections,
            errors_total,
            db_operations_total,
            db_query_duration_seconds,
            uptime_seconds,
        }
    }

    /// Increment request counter
    pub fn record_request(&self, method: &str, path: &str, status: u16) {
        self.requests_total
            .get_or_create(&RequestLabels {
                method: method.to_string(),
                path: path.to_string(),
                status,
            })
            .inc();
    }

    /// Record request duration
    pub fn record_request_duration(&self, method: &str, path: &str, status: u16, duration_secs: f64) {
        self.request_duration_seconds
            .get_or_create(&RequestLabels {
                method: method.to_string(),
                path: path.to_string(),
                status,
            })
            .observe(duration_secs);
    }

    /// Record session creation
    pub fn record_session_created(&self) {
        self.sessions_total
            .get_or_create(&SessionLabels {
                status: "created".to_string(),
            })
            .inc();
        self.active_sessions.inc();
    }

    /// Record session closure
    pub fn record_session_closed(&self) {
        self.sessions_total
            .get_or_create(&SessionLabels {
                status: "closed".to_string(),
            })
            .inc();
        self.active_sessions.dec();
    }

    /// Record session failure
    pub fn record_session_failed(&self) {
        self.sessions_total
            .get_or_create(&SessionLabels {
                status: "failed".to_string(),
            })
            .inc();
    }

    /// Record session duration
    pub fn record_session_duration(&self, duration_secs: f64) {
        self.session_duration_seconds.observe(duration_secs);
    }

    /// Record connection created
    pub fn record_connection_created(&self, conn_type: &str) {
        self.connections_total
            .get_or_create(&ConnectionLabels {
                conn_type: conn_type.to_string(),
            })
            .inc();
        self.active_connections
            .get_or_create(&ConnectionLabels {
                conn_type: conn_type.to_string(),
            })
            .inc();
    }

    /// Record connection closed
    pub fn record_connection_closed(&self, conn_type: &str) {
        self.active_connections
            .get_or_create(&ConnectionLabels {
                conn_type: conn_type.to_string(),
            })
            .dec();
    }

    /// Record an error
    pub fn record_error(&self, error_type: &str) {
        self.errors_total
            .get_or_create(&ErrorLabels {
                error_type: error_type.to_string(),
            })
            .inc();
    }

    /// Record database operation
    pub fn record_db_operation(&self, operation: &str, status: &str, duration_secs: f64) {
        let labels = DatabaseLabels {
            operation: operation.to_string(),
            status: status.to_string(),
        };

        self.db_operations_total
            .get_or_create(&labels)
            .inc();

        self.db_query_duration_seconds
            .get_or_create(&labels)
            .observe(duration_secs);
    }

    /// Update uptime metric
    pub fn update_uptime(&self, uptime_secs: i64) {
        self.uptime_seconds.set(uptime_secs);
    }
}

/// Global metrics state wrapped in Arc for sharing across threads
pub type SharedMetrics = Arc<Metrics>;

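For orientation, this is the intended call pattern for the helpers above from the session, WebSocket, and database code paths. It is an illustrative sketch only; the surrounding function names and values are hypothetical, not part of this commit.

// Illustrative only - function names and durations here are hypothetical.
fn example_session_lifecycle(metrics: &SharedMetrics) {
    // When a viewer opens a session:
    metrics.record_session_created();
    metrics.record_connection_created("viewer");

    // ... session runs ...

    // When it ends, after measuring how long it lasted:
    let session_secs = 42.0;
    metrics.record_session_duration(session_secs);
    metrics.record_session_closed();
    metrics.record_connection_closed("viewer");
}

fn example_db_call(metrics: &SharedMetrics) {
    let start = std::time::Instant::now();
    // ... run the query, record success or error ...
    metrics.record_db_operation("select", "success", start.elapsed().as_secs_f64());
}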