Phase 1 Week 2: Infrastructure & Monitoring

Added comprehensive production infrastructure:

Systemd Service:
- guruconnect.service with auto-restart, resource limits, security hardening
- setup-systemd.sh installation script

Prometheus Metrics:
- Added prometheus-client dependency
- Created metrics module tracking:
  - HTTP requests (count, latency)
  - Sessions (created, closed, active)
  - Connections (WebSocket, by type)
  - Errors (by type)
  - Database operations (count, latency)
  - Server uptime
- Added /metrics endpoint
- Background task for uptime updates

Monitoring Configuration:
- prometheus.yml with scrape configs for GuruConnect and node_exporter
- alerts.yml with alerting rules
- grafana-dashboard.json with 10 panels
- setup-monitoring.sh installation script

PostgreSQL Backups:
- backup-postgres.sh with gzip compression
- restore-postgres.sh with safety checks
- guruconnect-backup.service and .timer for automated daily backups
- Retention policy: 30 daily, 4 weekly, 6 monthly

Health Monitoring:
- health-monitor.sh checking HTTP, disk, memory, database, metrics
- guruconnect.logrotate for log rotation
- Email alerts on failures

Updated CHECKLIST_STATE.json to reflect Week 1 completion (77%) and Week 2 start.
Created PHASE1_WEEK2_INFRASTRUCTURE.md with comprehensive planning.

Ready for deployment and testing on RMM server.
This commit is contained in:
2026-01-17 20:24:32 -07:00
parent 2481b54a65
commit 8521c95755
17 changed files with 1877 additions and 25 deletions

View File

@@ -12,6 +12,7 @@ mod db;
mod support_codes;
mod middleware;
mod utils;
mod metrics;
pub mod proto {
include!(concat!(env!("OUT_DIR"), "/guruconnect.rs"));
@@ -38,6 +39,8 @@ use serde::Deserialize;
use support_codes::{SupportCodeManager, CreateCodeRequest, SupportCode, CodeValidation};
use auth::{JwtConfig, TokenBlacklist, hash_password, generate_random_password, AuthenticatedUser};
use metrics::SharedMetrics;
use prometheus_client::registry::Registry;
/// Application state
#[derive(Clone)]
@@ -49,6 +52,12 @@ pub struct AppState {
pub token_blacklist: TokenBlacklist,
/// Optional API key for persistent agents (env: AGENT_API_KEY)
pub agent_api_key: Option<String>,
/// Prometheus metrics
pub metrics: SharedMetrics,
/// Prometheus registry (for /metrics endpoint)
pub registry: Arc<std::sync::Mutex<Registry>>,
/// Server start time
pub start_time: Arc<std::time::Instant>,
}
/// Middleware to inject JWT config and token blacklist into request extensions
@@ -206,6 +215,24 @@ async fn main() -> Result<()> {
info!("No AGENT_API_KEY set - persistent agents will need JWT token or support code");
}
// Initialize Prometheus metrics
let mut registry = Registry::default();
let metrics = Arc::new(metrics::Metrics::new(&mut registry));
let registry = Arc::new(std::sync::Mutex::new(registry));
let start_time = Arc::new(std::time::Instant::now());
// Spawn background task to update uptime metric
let metrics_for_uptime = metrics.clone();
let start_time_for_uptime = start_time.clone();
tokio::spawn(async move {
let mut interval = tokio::time::interval(std::time::Duration::from_secs(10));
loop {
interval.tick().await;
let uptime = start_time_for_uptime.elapsed().as_secs() as i64;
metrics_for_uptime.update_uptime(uptime);
}
});
// Create application state
let token_blacklist = TokenBlacklist::new();
@@ -216,12 +243,17 @@ async fn main() -> Result<()> {
jwt_config,
token_blacklist,
agent_api_key,
metrics,
registry,
start_time,
};
// Build router
let app = Router::new()
// Health check (no auth required)
.route("/health", get(health))
// Prometheus metrics (no auth required - for monitoring)
.route("/metrics", get(prometheus_metrics))
// Auth endpoints (TODO: Add rate limiting - see SEC2_RATE_LIMITING_TODO.md)
.route("/api/auth/login", post(api::auth::login))
@@ -333,6 +365,18 @@ async fn health() -> &'static str {
"OK"
}
/// Prometheus metrics endpoint
///
/// Encodes every metric registered in `state.registry` with prometheus-client's
/// text encoder and returns the encoded text as the response body.
///
/// # Panics
///
/// Panics only if the encoder reports a formatting error while writing into
/// the in-memory `String` buffer.
async fn prometheus_metrics(
    State(state): State<AppState>,
) -> String {
    use prometheus_client::encoding::text::encode;

    let mut buffer = String::new();
    {
        // This handler only reads the registry, so a poisoned lock (some
        // earlier holder panicked) says nothing about the data being broken.
        // Recover the guard instead of panicking on every subsequent scrape.
        let registry = state
            .registry
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        encode(&mut buffer, &registry)
            .expect("encoding metrics into an in-memory String should not fail");
        // Guard is dropped at the end of this scope, before the body is returned.
    }
    buffer
}
// Support code API handlers
async fn create_code(

View File

@@ -0,0 +1,290 @@
//! Prometheus metrics for GuruConnect server
//!
//! This module exposes metrics for monitoring server health, performance, and usage.
//! Metrics are exposed at the `/metrics` endpoint in Prometheus format.
use prometheus_client::encoding::EncodeLabelSet;
use prometheus_client::metrics::counter::Counter;
use prometheus_client::metrics::family::Family;
use prometheus_client::metrics::gauge::Gauge;
use prometheus_client::metrics::histogram::{exponential_buckets, Histogram};
use prometheus_client::registry::Registry;
use std::sync::Arc;
/// Metrics labels for HTTP requests
///
/// Label set used as the key of the `requests_total` and
/// `request_duration_seconds` families in `Metrics`. Every distinct
/// (method, path, status) combination passed to `get_or_create` is tracked
/// as its own entry in those families.
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
pub struct RequestLabels {
/// Request method, stored exactly as the caller passes it.
pub method: String,
/// Request path, stored exactly as the caller passes it.
// NOTE(review): if callers pass raw request paths that embed IDs or codes,
// every distinct path becomes a separate series — confirm callers pass a
// bounded set of route names.
pub path: String,
/// Response status (presumably the HTTP status code — confirm against callers).
pub status: u16,
}
/// Metrics labels for session events
///
/// Label set used as the key of the `sessions_total` family in `Metrics`.
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
pub struct SessionLabels {
/// Session event name. The `Metrics` helpers in this module emit
/// `created`, `closed` and `failed`; `expired` is also an intended value,
/// but no helper in this module records it.
pub status: String,
}
/// Metrics labels for connection events
///
/// Label set used as the key of the `connections_total` and
/// `active_connections` families in `Metrics`.
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
pub struct ConnectionLabels {
/// Connection kind, stored exactly as the caller passes it.
/// Intended values: `agent`, `viewer`, `dashboard` (not enforced here).
pub conn_type: String,
}
/// Metrics labels for error tracking
///
/// Label set used as the key of the `errors_total` family in `Metrics`.
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
pub struct ErrorLabels {
/// Error category, stored exactly as the caller passes it.
/// Intended values: `auth`, `database`, `websocket`, `protocol`,
/// `internal` (not enforced here).
pub error_type: String,
}
/// Metrics labels for database operations
///
/// Label set used as the key of the `db_operations_total` and
/// `db_query_duration_seconds` families in `Metrics`.
#[derive(Clone, Debug, Hash, PartialEq, Eq, EncodeLabelSet)]
pub struct DatabaseLabels {
/// Kind of operation, stored exactly as the caller passes it.
/// Intended values: `select`, `insert`, `update`, `delete` (not enforced here).
pub operation: String,
/// Outcome of the operation, stored exactly as the caller passes it.
/// Intended values: `success`, `error` (not enforced here).
pub status: String,
}
/// GuruConnect server metrics
///
/// Holds one handle per metric. `Metrics::new` creates each metric and
/// registers it with a Prometheus `Registry`; the `record_*` methods and
/// `update_uptime` are the helpers that update them.
#[derive(Clone)]
pub struct Metrics {
// Request metrics
/// Count of HTTP requests, keyed by `RequestLabels`.
pub requests_total: Family<RequestLabels, Counter>,
/// HTTP request duration in seconds, keyed by `RequestLabels`.
pub request_duration_seconds: Family<RequestLabels, Histogram>,
// Session metrics
/// Count of session events, keyed by `SessionLabels`.
pub sessions_total: Family<SessionLabels, Counter>,
/// Number of currently active sessions; incremented by
/// `record_session_created` and decremented by `record_session_closed`.
pub active_sessions: Gauge,
/// Session duration in seconds (no labels).
pub session_duration_seconds: Histogram,
// Connection metrics
/// Count of WebSocket connections, keyed by `ConnectionLabels`.
pub connections_total: Family<ConnectionLabels, Counter>,
/// Number of active WebSocket connections per connection type.
pub active_connections: Family<ConnectionLabels, Gauge>,
// Error metrics
/// Count of errors, keyed by `ErrorLabels`.
pub errors_total: Family<ErrorLabels, Counter>,
// Database metrics
/// Count of database operations, keyed by `DatabaseLabels`.
pub db_operations_total: Family<DatabaseLabels, Counter>,
/// Database query duration in seconds, keyed by `DatabaseLabels`.
pub db_query_duration_seconds: Family<DatabaseLabels, Histogram>,
// System metrics
/// Server uptime in seconds, overwritten by `update_uptime`.
pub uptime_seconds: Gauge,
}
impl Metrics {
/// Create a new metrics instance and register all metrics
pub fn new(registry: &mut Registry) -> Self {
// Request metrics
let requests_total = Family::<RequestLabels, Counter>::default();
registry.register(
"guruconnect_requests_total",
"Total number of HTTP requests",
requests_total.clone(),
);
let request_duration_seconds = Family::<RequestLabels, Histogram>::new_with_constructor(|| {
Histogram::new(exponential_buckets(0.001, 2.0, 10)) // 1ms to ~1s
});
registry.register(
"guruconnect_request_duration_seconds",
"HTTP request duration in seconds",
request_duration_seconds.clone(),
);
// Session metrics
let sessions_total = Family::<SessionLabels, Counter>::default();
registry.register(
"guruconnect_sessions_total",
"Total number of sessions",
sessions_total.clone(),
);
let active_sessions = Gauge::default();
registry.register(
"guruconnect_active_sessions",
"Number of currently active sessions",
active_sessions.clone(),
);
let session_duration_seconds = Histogram::new(exponential_buckets(1.0, 2.0, 15)); // 1s to ~9 hours
registry.register(
"guruconnect_session_duration_seconds",
"Session duration in seconds",
session_duration_seconds.clone(),
);
// Connection metrics
let connections_total = Family::<ConnectionLabels, Counter>::default();
registry.register(
"guruconnect_connections_total",
"Total number of WebSocket connections",
connections_total.clone(),
);
let active_connections = Family::<ConnectionLabels, Gauge>::default();
registry.register(
"guruconnect_active_connections",
"Number of active WebSocket connections by type",
active_connections.clone(),
);
// Error metrics
let errors_total = Family::<ErrorLabels, Counter>::default();
registry.register(
"guruconnect_errors_total",
"Total number of errors by type",
errors_total.clone(),
);
// Database metrics
let db_operations_total = Family::<DatabaseLabels, Counter>::default();
registry.register(
"guruconnect_db_operations_total",
"Total number of database operations",
db_operations_total.clone(),
);
let db_query_duration_seconds = Family::<DatabaseLabels, Histogram>::new_with_constructor(|| {
Histogram::new(exponential_buckets(0.0001, 2.0, 12)) // 0.1ms to ~400ms
});
registry.register(
"guruconnect_db_query_duration_seconds",
"Database query duration in seconds",
db_query_duration_seconds.clone(),
);
// System metrics
let uptime_seconds = Gauge::default();
registry.register(
"guruconnect_uptime_seconds",
"Server uptime in seconds",
uptime_seconds.clone(),
);
Self {
requests_total,
request_duration_seconds,
sessions_total,
active_sessions,
session_duration_seconds,
connections_total,
active_connections,
errors_total,
db_operations_total,
db_query_duration_seconds,
uptime_seconds,
}
}
/// Increment request counter
pub fn record_request(&self, method: &str, path: &str, status: u16) {
self.requests_total
.get_or_create(&RequestLabels {
method: method.to_string(),
path: path.to_string(),
status,
})
.inc();
}
/// Record request duration
pub fn record_request_duration(&self, method: &str, path: &str, status: u16, duration_secs: f64) {
self.request_duration_seconds
.get_or_create(&RequestLabels {
method: method.to_string(),
path: path.to_string(),
status,
})
.observe(duration_secs);
}
/// Record session creation
pub fn record_session_created(&self) {
self.sessions_total
.get_or_create(&SessionLabels {
status: "created".to_string(),
})
.inc();
self.active_sessions.inc();
}
/// Record session closure
pub fn record_session_closed(&self) {
self.sessions_total
.get_or_create(&SessionLabels {
status: "closed".to_string(),
})
.inc();
self.active_sessions.dec();
}
/// Record session failure
pub fn record_session_failed(&self) {
self.sessions_total
.get_or_create(&SessionLabels {
status: "failed".to_string(),
})
.inc();
}
/// Record session duration
pub fn record_session_duration(&self, duration_secs: f64) {
self.session_duration_seconds.observe(duration_secs);
}
/// Record connection created
pub fn record_connection_created(&self, conn_type: &str) {
self.connections_total
.get_or_create(&ConnectionLabels {
conn_type: conn_type.to_string(),
})
.inc();
self.active_connections
.get_or_create(&ConnectionLabels {
conn_type: conn_type.to_string(),
})
.inc();
}
/// Record connection closed
pub fn record_connection_closed(&self, conn_type: &str) {
self.active_connections
.get_or_create(&ConnectionLabels {
conn_type: conn_type.to_string(),
})
.dec();
}
/// Record an error
pub fn record_error(&self, error_type: &str) {
self.errors_total
.get_or_create(&ErrorLabels {
error_type: error_type.to_string(),
})
.inc();
}
/// Record database operation
pub fn record_db_operation(&self, operation: &str, status: &str, duration_secs: f64) {
let labels = DatabaseLabels {
operation: operation.to_string(),
status: status.to_string(),
};
self.db_operations_total
.get_or_create(&labels.clone())
.inc();
self.db_query_duration_seconds
.get_or_create(&labels)
.observe(duration_secs);
}
/// Update uptime metric
pub fn update_uptime(&self, uptime_secs: i64) {
self.uptime_seconds.set(uptime_secs);
}
}
/// Shared handle to the server's `Metrics`.
///
/// This is only a type alias for `Arc<Metrics>`, not a global: the value is
/// created by whoever calls `Metrics::new` and wraps the result in an `Arc`,
/// and is then cloned wherever the metrics need to be recorded.
pub type SharedMetrics = Arc<Metrics>;