// guru-connect/server/src/main.rs

//! GuruConnect Server - WebSocket Relay Server
//!
//! Handles connections from both agents and dashboard viewers,
//! relaying video frames and input events between them.

mod api;
mod auth;
mod config;
mod db;
mod metrics;
mod middleware;
mod relay;
mod session;
mod support_codes;
mod utils;

/// Generated wire-message types, textually included at compile time from the
/// build output file `$OUT_DIR/guruconnect.rs`.
///
/// NOTE(review): presumably produced from the project's protobuf schema by the
/// crate's build script — confirm against `build.rs`.
pub mod proto {
    include!(concat!(env!("OUT_DIR"), "/guruconnect.rs"));
}

use anyhow::Result;
use axum::http::{HeaderValue, Method};
use axum::{
    extract::{ConnectInfo, Json, Path, Request, State},
    http::StatusCode,
    middleware::{self as axum_middleware, Next},
    response::IntoResponse,
    routing::{any, delete, get, post, put},
    Router,
};
use serde::Deserialize;
use std::net::SocketAddr;
use std::sync::Arc;
use tower_http::cors::CorsLayer;
use tower_http::services::{ServeDir, ServeFile};
use tower_http::trace::TraceLayer;
use tracing::{info, Level};
use tracing_subscriber::FmtSubscriber;

use auth::{
    generate_random_password, hash_password, AuthenticatedUser, JwtConfig, TokenBlacklist,
    ViewerTokenRegistry,
};

/// Root of the static asset tree, relative to the server's working directory.
/// Holds the agent `downloads/` tree AND the v2 SPA build under `app/`.
///
/// `SPA_DIR` and `SPA_INDEX` repeat this literal rather than deriving from it,
/// so all three constants must be changed together.
const STATIC_DIR: &str = "static";

/// Directory the React/Vite SPA is built into (`dashboard/` Vite `build.outDir`
/// points here). The Axum `fallback_service` serves this tree at the server
/// root, so `npm run build` lands the SPA exactly where it is served — no copy
/// step. A dedicated subdir so the Vite build's `emptyOutDir` clears only the
/// SPA, never the agent downloads tree in the static root.
///
/// Must stay equal to `STATIC_DIR` followed by `/app`.
const SPA_DIR: &str = "static/app";

/// The SPA entry document. Returned (with 200) for any unmatched, non-API,
/// non-WS, non-asset GET so `BrowserRouter` deep links (`/machines`,
/// `/sessions`, `/login`) survive a hard reload.
///
/// Must stay equal to `SPA_DIR` followed by `/index.html`.
const SPA_INDEX: &str = "static/app/index.html";

/// How long an OFFLINE persistent session may sit idle before the reaper removes
/// it (v2-stable-identity Task 4). Measured on the session's monotonic
/// `last_heartbeat_instant`. Ten minutes is well past the agent's 30s heartbeat /
/// 90s timeout, so only genuinely-gone machines age out — a brief reconnect blip
/// never reaps a real session.
///
/// Passed to `SessionManager::reap_stale_persistent` on every sweep of the
/// reaper task spawned in `main`.
const PERSISTENT_SESSION_TTL: std::time::Duration = std::time::Duration::from_secs(10 * 60);

/// Cadence of the stale-session reaper sweep (v2-stable-identity Task 4).
///
/// Used as the period of the `tokio::time::interval` driving the reaper task in
/// `main`; that task discards the interval's immediate first tick, so the first
/// sweep runs one full period after the task starts.
const PERSISTENT_SESSION_REAP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(60);
use metrics::SharedMetrics;
use prometheus_client::registry::Registry;
use support_codes::{CodeValidation, CreateCodeRequest, SupportCode, SupportCodeManager};

/// Application state shared by every handler and middleware.
///
/// Derives `Clone`; the router and the per-route rate-limit layers each receive
/// their own clone (see `main`).
#[derive(Clone)]
pub struct AppState {
    /// In-memory session manager. Seeded at startup with offline sessions for
    /// the machines stored in the database and also shared with the
    /// stale-session reaper task (see `main`).
    sessions: session::SessionManager,
    /// In-memory support-code manager backing the `/api/codes` handlers
    /// (create / list / validate / cancel).
    support_codes: SupportCodeManager,
    /// Database handle. `None` when no database URL is configured or the
    /// startup connection attempt failed, in which case the server runs without
    /// persistence.
    db: Option<db::Database>,
    /// JWT configuration built at startup from `JWT_SECRET` and
    /// `JWT_EXPIRY_HOURS`; `auth_layer` copies it into request extensions.
    pub jwt_config: Arc<JwtConfig>,
    /// Token blacklist; `auth_layer` places an `Arc`-wrapped clone of it into
    /// request extensions on every request.
    pub token_blacklist: TokenBlacklist,
    /// Per-user registry of outstanding session-scoped viewer tokens. Minting a
    /// viewer token registers it here under the minting user's `sub`; logout
    /// drains the user's registered viewer tokens into `token_blacklist` so a
    /// just-logged-out user cannot keep a live viewer/remote-control plane until
    /// the token's natural 5-minute expiry. The viewer WS already blacklist-
    /// checks the exact token string, so no WS change is needed.
    pub viewer_tokens: ViewerTokenRegistry,
    /// Optional API key for persistent agents (env: AGENT_API_KEY)
    pub agent_api_key: Option<String>,
    /// Prometheus metrics
    pub metrics: SharedMetrics,
    /// Prometheus registry (for /metrics endpoint)
    pub registry: Arc<std::sync::Mutex<Registry>>,
    /// Server start time
    pub start_time: Arc<std::time::Instant>,
    /// Per-IP rate limiters + brute-force lockout (Task 4). Shared (Arc-backed
    /// internally) so cloning AppState shares the same counters.
    pub rate_limits: middleware::RateLimitState,
    /// Trusted reverse-proxy allowlist for client-IP extraction. Forwarding
    /// headers (`X-Forwarded-For` / `X-Real-IP`) are honored ONLY when the TCP
    /// peer is in this set; otherwise the peer IP is used. Parsed once at startup
    /// from `CONNECT_TRUSTED_PROXIES` (default: loopback). See
    /// `utils::ip_extract::client_ip`.
    pub trusted_proxies: Arc<utils::ip_extract::TrustedProxies>,
}

/// Request middleware that stores the shared JWT configuration and an
/// `Arc`-wrapped copy of the token blacklist in the request's extensions, then
/// hands the request to the rest of the stack.
async fn auth_layer(
    State(state): State<AppState>,
    mut request: Request,
    next: Next,
) -> impl IntoResponse {
    let jwt_config = Arc::clone(&state.jwt_config);
    let blacklist = Arc::new(state.token_blacklist.clone());

    // Extensions are keyed by type, so the two inserts never collide.
    let extensions = request.extensions_mut();
    extensions.insert(jwt_config);
    extensions.insert(blacklist);

    next.run(request).await
}

#[tokio::main]
async fn main() -> Result<()> {
    // Initialize logging
    FmtSubscriber::builder()
        .with_max_level(Level::INFO)
        .with_target(true)
        .init();

    info!("GuruConnect Server v{}", env!("CARGO_PKG_VERSION"));

    // Load configuration
    let config = config::Config::load()?;

    // Use port 3002 for GuruConnect
    let listen_addr = std::env::var("LISTEN_ADDR").unwrap_or_else(|_| "0.0.0.0:3002".to_string());
    info!("Loaded configuration, listening on {}", listen_addr);

    // JWT configuration - REQUIRED for security
    let jwt_secret = std::env::var("JWT_SECRET").expect(
        "JWT_SECRET environment variable must be set! Generate one with: openssl rand -base64 64",
    );

    if jwt_secret.len() < 32 {
        panic!("JWT_SECRET must be at least 32 characters long for security!");
    }

    let jwt_expiry_hours = std::env::var("JWT_EXPIRY_HOURS")
        .ok()
        .and_then(|s| s.parse().ok())
        .unwrap_or(24i64);
    let jwt_config = Arc::new(JwtConfig::new(jwt_secret, jwt_expiry_hours));

    // Initialize database if configured
    let database = if let Some(ref db_url) = config.database_url {
        match db::Database::connect(db_url, config.database_max_connections).await {
            Ok(db) => {
                // Run migrations
                if let Err(e) = db.migrate().await {
                    tracing::error!("Failed to run migrations: {}", e);
                    return Err(e);
                }
                Some(db)
            }
            Err(e) => {
                tracing::warn!(
                    "Failed to connect to database: {}. Running without persistence.",
                    e
                );
                None
            }
        }
    } else {
        info!("No DATABASE_URL set, running without persistence");
        None
    };

    // Create initial admin user if no users exist
    if let Some(ref db) = database {
        match db::count_users(db.pool()).await {
            Ok(0) => {
                info!("No users found, creating initial admin user...");
                let password = generate_random_password(16);
                let password_hash = hash_password(&password)?;

                match db::create_user(db.pool(), "admin", &password_hash, None, "admin").await {
                    Ok(user) => {
                        // Set admin permissions
                        let perms = vec![
                            "view".to_string(),
                            "control".to_string(),
                            "transfer".to_string(),
                            "manage_users".to_string(),
                            "manage_clients".to_string(),
                        ];
                        let _ = db::set_user_permissions(db.pool(), user.id, &perms).await;

                        // SEC-6: Write credentials to secure file instead of logging
                        let creds_file = ".admin-credentials";
                        match std::fs::write(creds_file, format!("Username: admin\nPassword: {}\n\nWARNING: Change this password immediately after first login!\nDelete this file after copying the password.\n", password)) {
                            Ok(_) => {
                                // Set restrictive permissions (Unix only)
                                #[cfg(unix)]
                                {
                                    use std::os::unix::fs::PermissionsExt;
                                    let _ = std::fs::set_permissions(creds_file, std::fs::Permissions::from_mode(0o600));
                                }

                                info!("========================================");
                                info!("  INITIAL ADMIN USER CREATED");
                                info!("  Credentials written to: {}", creds_file);
                                info!("  (Read file, change password, then delete file)");
                                info!("========================================");
                            }
                            Err(e) => {
                                // Fallback to logging if file write fails (but warn about security)
                                tracing::warn!("Could not write credentials file: {}", e);
                                info!("========================================");
                                info!("  INITIAL ADMIN USER CREATED");
                                info!("  Username: admin");
                                info!("  Password: {}", password);
                                info!("  WARNING: Password logged due to file write failure!");
                                info!("  (Change this password immediately!)");
                                info!("========================================");
                            }
                        }
                    }
                    Err(e) => {
                        tracing::error!("Failed to create initial admin user: {}", e);
                    }
                }
            }
            Ok(count) => {
                info!("{} user(s) in database", count);
            }
            Err(e) => {
                tracing::warn!("Could not check user count: {}", e);
            }
        }
    }

    // Create session manager
    let sessions = session::SessionManager::new();

    // Reconcile managed (persistent) sessions from the database on startup so
    // they are not orphaned after a server restart (Task 3F). Each persistent
    // machine is reloaded into the in-memory SessionManager as an OFFLINE
    // session; when the agent reconnects with its per-agent key, `register_agent`
    // reattaches to this preserved session (now bound to the authenticated
    // identity — see relay::agent_ws_handler). Support-code (attended) sessions
    // are intentionally NOT reconciled: they are ephemeral and end on disconnect.
    if let Some(ref db) = database {
        match db::machines::get_all_machines(db.pool()).await {
            Ok(machines) => {
                info!(
                    "Reconciling {} managed session(s) from database",
                    machines.len()
                );
                // Machines bound to an active `cak_` key. For these the key→machine
                // binding is authoritative (SPEC-004 Task 2), so we must NOT index a
                // restored keyed session by its stored `machine_uid`: doing so would
                // let an un-keyed agent spoofing that uid reattach the keyed machine's
                // offline session after a restart. The connect path never writes a uid
                // for keyed agents, so a non-NULL uid on a keyed row can only come from
                // a legacy pre-keying row — but close the gap regardless. On query
                // failure, fail closed (treat all machines as keyed: index none by uid)
                // rather than risk indexing a keyed machine.
                let keyed_ids = match db::agent_keys::keyed_machine_ids(db.pool()).await {
                    Ok(ids) => ids,
                    Err(e) => {
                        tracing::warn!(
                            "Could not load keyed-machine set; suppressing uid reattach index for all restored machines: {}",
                            e
                        );
                        machines.iter().map(|m| m.id).collect()
                    }
                };
                for machine in machines {
                    // Keyed machines get None (uid reattach disabled); un-keyed
                    // machines keep their stored uid for legitimate reattach.
                    let restore_uid = if keyed_ids.contains(&machine.id) {
                        None
                    } else {
                        machine.machine_uid.as_deref()
                    };
                    sessions
                        .restore_offline_machine(&machine.agent_id, &machine.hostname, restore_uid)
                        .await;
                }
            }
            Err(e) => {
                tracing::warn!("Failed to reconcile managed sessions: {}", e);
            }
        }
    }

    // Spawn the stale-session reaper (v2-stable-identity Task 4). Periodically
    // removes OFFLINE persistent sessions that have aged out past
    // PERSISTENT_SESSION_TTL with no viewer attached, purging both the agent_id and
    // machine_uid indexes via SessionManager::remove_session. Spawned AFTER the
    // startup restore so restored-offline rows are present (they age out once they
    // pass the TTL, per plan). The task holds only a clone of the SessionManager
    // (Arc-backed internally), so it shares the live session map.
    {
        let reaper_sessions = sessions.clone();
        tokio::spawn(async move {
            let mut interval = tokio::time::interval(PERSISTENT_SESSION_REAP_INTERVAL);
            // Skip the immediate first tick so we do not sweep before the server is
            // even serving; the first real sweep happens one interval in.
            interval.tick().await;
            loop {
                interval.tick().await;
                let reaped = reaper_sessions
                    .reap_stale_persistent(PERSISTENT_SESSION_TTL)
                    .await;
                if reaped > 0 {
                    info!("Stale-session reaper removed {} offline session(s)", reaped);
                }
            }
        });
    }
    info!(
        "Stale-session reaper started (ttl {}s, sweep every {}s)",
        PERSISTENT_SESSION_TTL.as_secs(),
        PERSISTENT_SESSION_REAP_INTERVAL.as_secs()
    );

    // Agent API key for persistent agents (optional)
    let agent_api_key = std::env::var("AGENT_API_KEY").ok();
    if let Some(ref key) = agent_api_key {
        // Validate API key strength for security
        utils::validation::validate_api_key_strength(key)?;
        info!("AGENT_API_KEY configured for persistent agents (validated)");
    } else {
        info!("No AGENT_API_KEY set - persistent agents will need JWT token or support code");
    }

    // Trusted reverse-proxy allowlist for real-client-IP extraction.
    // GuruConnect sits behind NPM on loopback, so axum's ConnectInfo reports the
    // proxy peer (127.0.0.1/::1), not the client. We only honor X-Forwarded-For /
    // X-Real-IP when the TCP peer is a trusted proxy; otherwise the header is
    // attacker-spoofable and is ignored. Default trust set is loopback; override
    // with CONNECT_TRUSTED_PROXIES (comma-separated IPs).
    let trusted_proxies = Arc::new(utils::ip_extract::TrustedProxies::from_env_value(
        std::env::var("CONNECT_TRUSTED_PROXIES").ok().as_deref(),
    ));
    info!(
        "Trusted reverse-proxy set for client-IP extraction: [{}]",
        trusted_proxies.describe()
    );

    // Initialize Prometheus metrics
    let mut registry = Registry::default();
    let metrics = Arc::new(metrics::Metrics::new(&mut registry));
    let registry = Arc::new(std::sync::Mutex::new(registry));
    let start_time = Arc::new(std::time::Instant::now());

    // Spawn background task to update uptime metric
    let metrics_for_uptime = metrics.clone();
    let start_time_for_uptime = start_time.clone();
    tokio::spawn(async move {
        let mut interval = tokio::time::interval(std::time::Duration::from_secs(10));
        loop {
            interval.tick().await;
            let uptime = start_time_for_uptime.elapsed().as_secs() as i64;
            metrics_for_uptime.update_uptime(uptime);
        }
    });

    // Create application state
    let token_blacklist = TokenBlacklist::new();
    let viewer_tokens = ViewerTokenRegistry::new();

    let state = AppState {
        sessions,
        support_codes: SupportCodeManager::new(),
        db: database,
        jwt_config,
        token_blacklist,
        viewer_tokens,
        agent_api_key,
        metrics,
        registry,
        start_time,
        rate_limits: middleware::RateLimitState::new(),
        trusted_proxies,
    };

    // Build router
    let app = Router::new()
        // Health check (no auth required)
        .route("/health", get(health))
        // Prometheus metrics (no auth required - for monitoring)
        .route("/metrics", get(prometheus_metrics))
        // Auth endpoints. Per-IP rate limiting (Task 4) is attached per-route via
        // `route_layer` so it applies ONLY to these endpoints, not the whole app.
        .route(
            "/api/auth/login",
            post(api::auth::login).route_layer(axum_middleware::from_fn_with_state(
                state.clone(),
                middleware::login_rate_limit,
            )),
        )
        .route(
            "/api/auth/change-password",
            post(api::auth::change_password).route_layer(axum_middleware::from_fn_with_state(
                state.clone(),
                middleware::change_password_rate_limit,
            )),
        )
        .route("/api/auth/me", get(api::auth::get_me))
        .route("/api/auth/logout", post(api::auth_logout::logout))
        .route(
            "/api/auth/revoke-token",
            post(api::auth_logout::revoke_own_token),
        )
        .route(
            "/api/auth/admin/revoke-user",
            post(api::auth_logout::revoke_user_tokens),
        )
        .route(
            "/api/auth/blacklist/stats",
            get(api::auth_logout::get_blacklist_stats),
        )
        .route(
            "/api/auth/blacklist/cleanup",
            post(api::auth_logout::cleanup_blacklist),
        )
        // User management (admin only)
        .route("/api/users", get(api::users::list_users))
        .route("/api/users", post(api::users::create_user))
        .route("/api/users/:id", get(api::users::get_user))
        .route("/api/users/:id", put(api::users::update_user))
        .route("/api/users/:id", delete(api::users::delete_user))
        .route(
            "/api/users/:id/permissions",
            put(api::users::set_permissions),
        )
        .route("/api/users/:id/clients", put(api::users::set_client_access))
        // Portal API - Support codes. The unauthenticated validate route is rate
        // limited + brute-force locked out per IP (Task 4).
        .route("/api/codes", post(create_code))
        .route("/api/codes", get(list_codes))
        .route(
            "/api/codes/:code/validate",
            get(validate_code).route_layer(axum_middleware::from_fn_with_state(
                state.clone(),
                middleware::code_validate_rate_limit,
            )),
        )
        .route("/api/codes/:code/cancel", post(cancel_code))
        // Zero-touch enrollment (SPEC-016). PUBLIC: no JWT — the per-site enrollment
        // key in the body is the gate, and the handler applies its own
        // per-(site_code, IP) rate limit / lockout (defense-in-depth). Mounted with
        // the other public API routes.
        .route("/api/enroll", post(api::enroll::enroll))
        // WebSocket endpoints
        .route("/ws/agent", get(relay::agent_ws_handler))
        .route("/ws/viewer", get(relay::viewer_ws_handler))
        // REST API - Sessions
        .route("/api/sessions", get(list_sessions))
        .route("/api/sessions/:id", get(get_session))
        // DELETE: live-only disconnect by default; `?purge=true` soft-deletes +
        // removes in-memory + audits (admin-only). Task 5 (api::removal).
        .route("/api/sessions/:id", delete(api::removal::remove_session))
        // Session-scoped viewer-token minting (dashboard JWT; bound to one session)
        .route(
            "/api/sessions/:id/viewer-token",
            post(api::sessions::mint_viewer_token),
        )
        // REST API - Machines
        .route("/api/machines", get(list_machines))
        // Bulk operator removal (admin-only). Registered before the `:agent_id`
        // routes; matchit (axum 0.7) prefers the static `bulk-remove` segment over
        // the `:agent_id` capture, so it never shadows a real agent_id. Task 5.
        .route(
            "/api/machines/bulk-remove",
            post(api::removal::bulk_remove_machines),
        )
        .route("/api/machines/:agent_id", get(get_machine))
        // DELETE: legacy hard-delete by default; `?purge=true` soft-deletes +
        // removes in-memory + audits (admin-only). Task 5 (api::removal).
        .route(
            "/api/machines/:agent_id",
            delete(api::removal::remove_machine),
        )
        .route("/api/machines/:agent_id/history", get(get_machine_history))
        .route(
            "/api/machines/:agent_id/update",
            post(trigger_machine_update),
        )
        // Per-agent key issuance (admin only). `:agent_id` matches the param
        // name used by the other /api/machines/:agent_id routes — matchit 0.7
        // panics if the same path position uses two different param names.
        .route(
            "/api/machines/:agent_id/keys",
            post(api::machine_keys::create_key),
        )
        .route(
            "/api/machines/:agent_id/keys",
            get(api::machine_keys::list_keys),
        )
        .route(
            "/api/machines/:agent_id/keys/:key_id",
            delete(api::machine_keys::revoke_key),
        )
        // Per-site enrollment key administration (SPEC-016, admin-only / JWT).
        // Rotate regenerates the cek_ secret + fingerprint (old installers can no
        // longer enroll new machines); GET returns the current non-secret
        // fingerprint/version. Both gated by the AdminUser extractor.
        .route(
            "/api/sites/:id/enrollment-key",
            get(api::sites::get_enrollment_key),
        )
        .route(
            "/api/sites/:id/enrollment-key/rotate",
            post(api::sites::rotate_enrollment_key),
        )
        // REST API - Releases and Version
        .route("/api/version", get(api::releases::get_version)) // No auth - for agent polling
        .route("/api/releases", get(api::releases::list_releases))
        .route("/api/releases", post(api::releases::create_release))
        .route("/api/releases/:version", get(api::releases::get_release))
        .route("/api/releases/:version", put(api::releases::update_release))
        .route(
            "/api/releases/:version",
            delete(api::releases::delete_release),
        )
        // Changelog (no auth - public, like /api/version)
        // Single route: version == "latest" selects the latest file; axum 0.7 / matchit 0.7
        // panics if a static segment and a path param share this position, so do not split it.
        .route(
            "/api/changelog/:component/:version",
            get(api::changelog::get),
        )
        // Agent downloads (no auth - public download links)
        .route("/api/download/viewer", get(api::downloads::download_viewer))
        .route(
            "/api/download/support",
            get(api::downloads::download_support),
        )
        .route("/api/download/agent", get(api::downloads::download_agent))
        // Namespace 404 guards. These wildcard routes catch any /api/* or /ws/*
        // path that no explicit route above matched, returning a JSON 404 so the
        // SPA fallback_service never answers an API/WS path with index.html. They
        // are intentionally the LEAST specific routes in each namespace: matchit
        // (axum 0.7) prefers a static segment over a `*` capture, so every real
        // route above still wins. `any(...)` covers every method (a bad WS path
        // is a GET, but POST/PUT/etc. to a dead /api/* path must 404 too, not 405).
        .route("/api/*rest", any(api_not_found))
        .route("/ws/*rest", any(api_not_found))
        // Public agent download tree (e.g. /downloads/guruconnect.exe). Mounted
        // explicitly so it keeps working after the v2 SPA takes over the root
        // fallback below — CLAUDE.md documents this as the public download URL.
        // `nest_service` is matched BEFORE `fallback_service`, so these binaries
        // are served from disk and never fall through to the SPA index.html.
        .nest_service(
            "/downloads",
            ServeDir::new(format!("{STATIC_DIR}/downloads")),
        )
        // NOTE: there are intentionally no /login, /dashboard, /users routes.
        // The v2 SPA (BrowserRouter) owns those paths and resolves them via the
        // fallback_service below; registering server-side handlers for them would
        // shadow the SPA on a hard reload.
        // State and middleware
        .with_state(state.clone())
        .layer(axum_middleware::from_fn_with_state(state, auth_layer))
        // SPA fallback: serve the React/Vite build from SPA_DIR and, for any
        // unmatched path, return the SPA index.html WITH 200 (via `.fallback`,
        // not `.not_found_service` which would force a 404) so BrowserRouter
        // deep links resolve. This is the Router's `fallback_service`, so it runs
        // ONLY after every explicit /api/*, /ws/*, /health, /metrics route and
        // the /downloads nest fail to match. An unknown /api/... path therefore
        // never reaches here — it hits the per-router 404 and returns the normal
        // (non-HTML) 404 the typed client expects. Real assets under /assets/*
        // are served from disk by ServeDir with correct content-types; only
        // genuinely missing files fall through to index.html.
        .fallback_service(
            ServeDir::new(SPA_DIR)
                .append_index_html_on_directories(true)
                .fallback(ServeFile::new(SPA_INDEX)),
        )
        // Middleware
        .layer(axum_middleware::from_fn(middleware::add_security_headers)) // SEC-7 & SEC-12
        .layer(TraceLayer::new_for_http())
        // SEC-11: Restricted CORS configuration
        .layer({
            CorsLayer::new()
                // Allow requests from the production domain and localhost (for development)
                .allow_origin([
                    "https://connect.azcomputerguru.com"
                        .parse::<HeaderValue>()
                        .unwrap(),
                    "http://localhost:3002".parse::<HeaderValue>().unwrap(),
                    "http://127.0.0.1:3002".parse::<HeaderValue>().unwrap(),
                ])
                // Allow only necessary HTTP methods
                .allow_methods([
                    Method::GET,
                    Method::POST,
                    Method::PUT,
                    Method::DELETE,
                    Method::OPTIONS,
                ])
                // Allow common headers needed for API requests
                .allow_headers([
                    axum::http::header::AUTHORIZATION,
                    axum::http::header::CONTENT_TYPE,
                    axum::http::header::ACCEPT,
                ])
                // Allow credentials (cookies, auth headers)
                .allow_credentials(true)
        });

    // Start server
    let addr: SocketAddr = listen_addr.parse()?;
    let listener = tokio::net::TcpListener::bind(addr).await?;

    info!("Server listening on {}", addr);

    // Use into_make_service_with_connect_info to enable IP address extraction
    axum::serve(
        listener,
        app.into_make_service_with_connect_info::<SocketAddr>(),
    )
    .await?;

    Ok(())
}

/// Liveness probe handler: always resolves immediately to the literal body `OK`.
async fn health() -> &'static str {
    const HEALTHY_BODY: &str = "OK";
    HEALTHY_BODY
}

/// JSON 404 responder mounted on the `/api/*rest` and `/ws/*rest` catch-alls.
///
/// Why it exists: an unrouted `/api/...` or `/ws/...` request would otherwise
/// reach the SPA `fallback_service` and come back as `index.html` with HTTP 200
/// and `text/html`, hiding the real 404 from the dashboard's typed client, which
/// expects a JSON error envelope. Because matchit resolves static segments ahead
/// of a `*` capture, every concrete route in those namespaces still takes
/// precedence; only genuinely unknown paths land here.
async fn api_not_found() -> impl IntoResponse {
    const NOT_FOUND_BODY: &str = r#"{"error":"Not Found","status_code":404}"#;

    let content_type = [(axum::http::header::CONTENT_TYPE, "application/json")];
    (StatusCode::NOT_FOUND, content_type, NOT_FOUND_BODY)
}

/// Prometheus metrics endpoint.
///
/// Encodes every metric in the shared registry into the text exposition format
/// and returns it as the response body.
///
/// A poisoned registry mutex is recovered rather than unwrapped: this handler
/// only reads the registry, so the data behind a poisoned lock is still usable,
/// and without recovery a single panic while the lock was held would make every
/// later `/metrics` request panic as well.
///
/// # Panics
/// Panics if the encoder reports a formatting error. Writing into a `String`
/// does not fail on its own, so this would indicate a bug in a metric encoder.
async fn prometheus_metrics(State(state): State<AppState>) -> String {
    use prometheus_client::encoding::text::encode;

    // Take the guard even if an earlier holder panicked (see doc comment).
    let registry = state
        .registry
        .lock()
        .unwrap_or_else(std::sync::PoisonError::into_inner);
    let mut buffer = String::new();
    encode(&mut buffer, &registry).expect("encoding metrics into a String should not fail");
    buffer
}

// Support code API handlers

async fn create_code(
    _user: AuthenticatedUser, // Require authentication
    State(state): State<AppState>,
    Json(request): Json<CreateCodeRequest>,
) -> Json<SupportCode> {
    let code = state.support_codes.create_code(request).await;

    // Persist the code to the database so the DURABLE single-use guard
    // (`db::support_codes::consume_code_for_bind`, Task 4) has a row to act on at
    // agent-bind time. The in-memory manager remains the live source of truth for
    // the auth decision; the DB row is the durable single-use record (and audit
    // trail) that also survives a server restart. A DB failure here is non-fatal:
    // the in-memory single-use consume still protects against reuse within this
    // process lifetime.
    if let Some(ref db) = state.db {
        if let Err(e) =
            db::support_codes::create_support_code(db.pool(), &code.code, &code.created_by).await
        {
            tracing::warn!("Failed to persist support code to database: {}", e);
        }
    }

    // Do not log the code value (it is a bearer credential for the session).
    info!("Created support code for {}", code.created_by);
    Json(code)
}

/// Returns the support codes the in-memory manager currently reports as active.
async fn list_codes(
    _user: AuthenticatedUser, // Require authentication
    State(state): State<AppState>,
) -> Json<Vec<SupportCode>> {
    let active_codes = state.support_codes.list_active_codes().await;
    Json(active_codes)
}

/// Deserializable holder for a single support-code string.
///
/// NOTE(review): nothing in the visible part of this file constructs or reads
/// it — `validate_code` takes the code from the URL path instead — which is why
/// the dead-code lint is silenced below. Confirm it is still needed.
#[derive(Deserialize)]
#[allow(dead_code)] // TODO(native-remote-control): consumed by the integration API; see docs/specs/native-remote-control/
struct ValidateParams {
    /// The support code text.
    code: String,
}

/// Unauthenticated preview of a support code's validity.
///
/// Returns the manager's validation result as JSON and records the outcome in
/// the per-IP brute-force lockout.
async fn validate_code(
    State(state): State<AppState>,
    ConnectInfo(addr): ConnectInfo<SocketAddr>,
    headers: axum::http::HeaderMap,
    Path(code): Path<String>,
) -> Json<CodeValidation> {
    // Resolve the real client IP with the trusted-proxy-aware extractor. It has
    // to be the same key the lockout middleware (`code_validate_rate_limit`)
    // uses; otherwise outcomes would be counted in one bucket while the lockout
    // is enforced on another.
    let ip = utils::ip_extract::client_ip(&addr, &headers, &state.trusted_proxies);

    // PREVIEW ONLY: this looks at the in-memory code state and does NOT consume
    // the code. Single-use consumption happens at agent BIND, in
    // relay::handle_agent_connection, so a valid preview must never flip the
    // code to connected/consumed.
    let result = state.support_codes.validate_code(&code).await;

    // Feed the per-IP brute-force lockout (Task 4): failures extend the streak,
    // a success clears it. The middleware enforces the lockout BEFORE this
    // handler runs, so an IP that is already locked out never gets this far.
    let lockout = &state.rate_limits.code_validate_lockout;
    if !result.valid {
        lockout.record_failure(ip);
        tracing::warn!("Failed support-code validation from {}", ip);
    } else {
        lockout.record_success(ip);
    }

    Json(result)
}

/// Cancels the support code named in the path.
///
/// Responds `200 OK` when `cancel_code` reports success and `400 Bad Request`
/// otherwise. Only authenticated callers reach this handler.
async fn cancel_code(
    _user: AuthenticatedUser, // Require authentication
    State(state): State<AppState>,
    Path(code): Path<String>,
) -> impl IntoResponse {
    let cancelled = state.support_codes.cancel_code(&code).await;

    if !cancelled {
        return (StatusCode::BAD_REQUEST, "Cannot cancel code");
    }

    (StatusCode::OK, "Code cancelled")
}

// Session API handlers (updated to use AppState)

/// Lists every session known to the session manager, converted to the API
/// representation (`api::SessionInfo`). Requires an authenticated caller.
async fn list_sessions(
    _user: AuthenticatedUser, // Require authentication
    State(state): State<AppState>,
) -> Json<Vec<api::SessionInfo>> {
    let summaries: Vec<api::SessionInfo> = state
        .sessions
        .list_sessions()
        .await
        .into_iter()
        .map(api::SessionInfo::from)
        .collect();

    Json(summaries)
}

/// Looks up a single session by its UUID path segment.
///
/// # Errors
/// * `400 Bad Request` — the path segment is not a valid UUID.
/// * `404 Not Found` — no session with that id exists.
async fn get_session(
    _user: AuthenticatedUser, // Require authentication
    State(state): State<AppState>,
    Path(id): Path<String>,
) -> Result<Json<api::SessionInfo>, (StatusCode, &'static str)> {
    // Reject malformed ids before touching the session manager.
    let Ok(session_id) = uuid::Uuid::parse_str(&id) else {
        return Err((StatusCode::BAD_REQUEST, "Invalid session ID"));
    };

    match state.sessions.get_session(session_id).await {
        Some(found) => Ok(Json(api::SessionInfo::from(found))),
        None => Err((StatusCode::NOT_FOUND, "Session not found")),
    }
}

// Machine API handlers

async fn list_machines(
    _user: AuthenticatedUser, // Require authentication
    State(state): State<AppState>,
) -> Result<Json<Vec<api::MachineInfo>>, (StatusCode, &'static str)> {
    let db = state
        .db
        .as_ref()
        .ok_or((StatusCode::SERVICE_UNAVAILABLE, "Database not available"))?;

    let machines = db::machines::get_all_machines(db.pool())
        .await
        .map_err(|_| (StatusCode::INTERNAL_SERVER_ERROR, "Database error"))?;

    Ok(Json(
        machines.into_iter().map(api::MachineInfo::from).collect(),
    ))
}

async fn get_machine(
    _user: AuthenticatedUser, // Require authentication
    State(state): State<AppState>,
    Path(agent_id): Path<String>,
) -> Result<Json<api::MachineInfo>, (StatusCode, &'static str)> {
    let db = state
        .db
        .as_ref()
        .ok_or((StatusCode::SERVICE_UNAVAILABLE, "Database not available"))?;

    let machine = db::machines::get_machine_by_agent_id(db.pool(), &agent_id)
        .await
        .map_err(|_| (StatusCode::INTERNAL_SERVER_ERROR, "Database error"))?
        .ok_or((StatusCode::NOT_FOUND, "Machine not found"))?;

    Ok(Json(api::MachineInfo::from(machine)))
}

async fn get_machine_history(
    _user: AuthenticatedUser, // Require authentication
    State(state): State<AppState>,
    Path(agent_id): Path<String>,
) -> Result<Json<api::MachineHistory>, (StatusCode, &'static str)> {
    let db = state
        .db
        .as_ref()
        .ok_or((StatusCode::SERVICE_UNAVAILABLE, "Database not available"))?;

    // Get machine
    let machine = db::machines::get_machine_by_agent_id(db.pool(), &agent_id)
        .await
        .map_err(|_| (StatusCode::INTERNAL_SERVER_ERROR, "Database error"))?
        .ok_or((StatusCode::NOT_FOUND, "Machine not found"))?;

    // Get sessions for this machine
    let sessions = db::sessions::get_sessions_for_machine(db.pool(), machine.id)
        .await
        .map_err(|_| (StatusCode::INTERNAL_SERVER_ERROR, "Database error"))?;

    // Get events for this machine
    let events = db::events::get_events_for_machine(db.pool(), machine.id)
        .await
        .map_err(|_| (StatusCode::INTERNAL_SERVER_ERROR, "Database error"))?;

    let history = api::MachineHistory {
        machine: api::MachineInfo::from(machine),
        sessions: sessions.into_iter().map(api::SessionRecord::from).collect(),
        events: events.into_iter().map(api::EventRecord::from).collect(),
        exported_at: chrono::Utc::now().to_rfc3339(),
    };

    Ok(Json(history))
}

/// Update trigger request: the JSON body accepted by `trigger_machine_update`.
#[derive(Deserialize)]
struct TriggerUpdateRequest {
    /// Target version (optional, defaults to latest stable)
    version: Option<String>,
}

/// Trigger update on a specific machine
async fn trigger_machine_update(
    _user: AuthenticatedUser, // Require authentication
    State(state): State<AppState>,
    Path(agent_id): Path<String>,
    Json(request): Json<TriggerUpdateRequest>,
) -> Result<impl IntoResponse, (StatusCode, &'static str)> {
    let db = state
        .db
        .as_ref()
        .ok_or((StatusCode::SERVICE_UNAVAILABLE, "Database not available"))?;

    // Get the target release (either specified or latest stable)
    let release = if let Some(version) = request.version {
        db::releases::get_release_by_version(db.pool(), &version)
            .await
            .map_err(|_| (StatusCode::INTERNAL_SERVER_ERROR, "Database error"))?
            .ok_or((StatusCode::NOT_FOUND, "Release version not found"))?
    } else {
        db::releases::get_latest_stable_release(db.pool())
            .await
            .map_err(|_| (StatusCode::INTERNAL_SERVER_ERROR, "Database error"))?
            .ok_or((StatusCode::NOT_FOUND, "No stable release available"))?
    };

    // Find session for this agent
    let session = state
        .sessions
        .get_session_by_agent(&agent_id)
        .await
        .ok_or((StatusCode::NOT_FOUND, "Agent not found or offline"))?;

    if !session.is_online {
        return Err((StatusCode::BAD_REQUEST, "Agent is offline"));
    }

    // Send update command via WebSocket
    // For now, we send admin command - later we'll include UpdateInfo in the message
    let sent = state
        .sessions
        .send_admin_command(
            session.id,
            proto::AdminCommandType::AdminUpdate,
            &format!("Update to version {}", release.version),
        )
        .await;

    if sent {
        info!(
            "Sent update command to agent {} (version {})",
            agent_id, release.version
        );

        // Update machine update status in database
        let _ =
            db::releases::update_machine_update_status(db.pool(), &agent_id, "downloading").await;

        Ok((StatusCode::OK, "Update command sent"))
    } else {
        Err((
            StatusCode::INTERNAL_SERVER_ERROR,
            "Failed to send update command",
        ))
    }
}