Phase 1 Week 2: Infrastructure & Monitoring
Added comprehensive production infrastructure: Systemd Service: - guruconnect.service with auto-restart, resource limits, security hardening - setup-systemd.sh installation script Prometheus Metrics: - Added prometheus-client dependency - Created metrics module tracking: - HTTP requests (count, latency) - Sessions (created, closed, active) - Connections (WebSocket, by type) - Errors (by type) - Database operations (count, latency) - Server uptime - Added /metrics endpoint - Background task for uptime updates Monitoring Configuration: - prometheus.yml with scrape configs for GuruConnect and node_exporter - alerts.yml with alerting rules - grafana-dashboard.json with 10 panels - setup-monitoring.sh installation script PostgreSQL Backups: - backup-postgres.sh with gzip compression - restore-postgres.sh with safety checks - guruconnect-backup.service and .timer for automated daily backups - Retention policy: 30 daily, 4 weekly, 6 monthly Health Monitoring: - health-monitor.sh checking HTTP, disk, memory, database, metrics - guruconnect.logrotate for log rotation - Email alerts on failures Updated CHECKLIST_STATE.json to reflect Week 1 completion (77%) and Week 2 start. Created PHASE1_WEEK2_INFRASTRUCTURE.md with comprehensive planning. Ready for deployment and testing on RMM server.
This commit is contained in:
@@ -12,6 +12,7 @@ mod db;
|
||||
mod support_codes;
|
||||
mod middleware;
|
||||
mod utils;
|
||||
mod metrics;
|
||||
|
||||
pub mod proto {
|
||||
include!(concat!(env!("OUT_DIR"), "/guruconnect.rs"));
|
||||
@@ -38,6 +39,8 @@ use serde::Deserialize;
|
||||
|
||||
use support_codes::{SupportCodeManager, CreateCodeRequest, SupportCode, CodeValidation};
|
||||
use auth::{JwtConfig, TokenBlacklist, hash_password, generate_random_password, AuthenticatedUser};
|
||||
use metrics::SharedMetrics;
|
||||
use prometheus_client::registry::Registry;
|
||||
|
||||
/// Application state
|
||||
#[derive(Clone)]
|
||||
@@ -49,6 +52,12 @@ pub struct AppState {
|
||||
pub token_blacklist: TokenBlacklist,
|
||||
/// Optional API key for persistent agents (env: AGENT_API_KEY)
|
||||
pub agent_api_key: Option<String>,
|
||||
/// Prometheus metrics
|
||||
pub metrics: SharedMetrics,
|
||||
/// Prometheus registry (for /metrics endpoint)
|
||||
pub registry: Arc<std::sync::Mutex<Registry>>,
|
||||
/// Server start time
|
||||
pub start_time: Arc<std::time::Instant>,
|
||||
}
|
||||
|
||||
/// Middleware to inject JWT config and token blacklist into request extensions
|
||||
@@ -206,6 +215,24 @@ async fn main() -> Result<()> {
|
||||
info!("No AGENT_API_KEY set - persistent agents will need JWT token or support code");
|
||||
}
|
||||
|
||||
// Initialize Prometheus metrics
|
||||
let mut registry = Registry::default();
|
||||
let metrics = Arc::new(metrics::Metrics::new(&mut registry));
|
||||
let registry = Arc::new(std::sync::Mutex::new(registry));
|
||||
let start_time = Arc::new(std::time::Instant::now());
|
||||
|
||||
// Spawn background task to update uptime metric
|
||||
let metrics_for_uptime = metrics.clone();
|
||||
let start_time_for_uptime = start_time.clone();
|
||||
tokio::spawn(async move {
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(10));
|
||||
loop {
|
||||
interval.tick().await;
|
||||
let uptime = start_time_for_uptime.elapsed().as_secs() as i64;
|
||||
metrics_for_uptime.update_uptime(uptime);
|
||||
}
|
||||
});
|
||||
|
||||
// Create application state
|
||||
let token_blacklist = TokenBlacklist::new();
|
||||
|
||||
@@ -216,12 +243,17 @@ async fn main() -> Result<()> {
|
||||
jwt_config,
|
||||
token_blacklist,
|
||||
agent_api_key,
|
||||
metrics,
|
||||
registry,
|
||||
start_time,
|
||||
};
|
||||
|
||||
// Build router
|
||||
let app = Router::new()
|
||||
// Health check (no auth required)
|
||||
.route("/health", get(health))
|
||||
// Prometheus metrics (no auth required - for monitoring)
|
||||
.route("/metrics", get(prometheus_metrics))
|
||||
|
||||
// Auth endpoints (TODO: Add rate limiting - see SEC2_RATE_LIMITING_TODO.md)
|
||||
.route("/api/auth/login", post(api::auth::login))
|
||||
@@ -333,6 +365,18 @@ async fn health() -> &'static str {
|
||||
"OK"
|
||||
}
|
||||
|
||||
/// Prometheus metrics endpoint
|
||||
async fn prometheus_metrics(
|
||||
State(state): State<AppState>,
|
||||
) -> String {
|
||||
use prometheus_client::encoding::text::encode;
|
||||
|
||||
let registry = state.registry.lock().unwrap();
|
||||
let mut buffer = String::new();
|
||||
encode(&mut buffer, ®istry).unwrap();
|
||||
buffer
|
||||
}
|
||||
|
||||
// Support code API handlers
|
||||
|
||||
async fn create_code(
|
||||
|
||||
Reference in New Issue
Block a user