feat(server): v2 secure-session-core Task 4 - rate limit + single-use codes

SPEC-002 Phase 1 Task 4 (the final keystone task), code-reviewed APPROVED. Closes the audit's reusable-code HIGH and rate-limiting-disabled HIGH.

- Rebuilt rate limiting as a self-contained in-memory per-IP limiter (replaces the non-compiling tower_governor; removed that dep). Fixed-window caps wired to login (8/min), change-password (5/min), code-validate (15/min) -> 429; per-IP lockout after 10 consecutive failed code validations (15-min cooldown).
- Single-use support codes: atomic consume on first agent bind (in-memory Pending->Connected under write lock + DB conditional UPDATE), rejecting a second presenter; validate/preview does not consume.
- Widened code format: XXX-XXX-XXX, 31-char unambiguous alphabet (no 0/O/1/I/L), CSPRNG + rejection sampling, ~44.6 bits (replaces 6-digit numeric); migration 006 widens the code columns to TEXT.

Completes the keystone (Tasks 1-4): every audit CRITICAL + HIGH in the secure auth/session core is now addressed.

Known follow-up todos (not blocking): (1) trusted-proxy client-IP extraction (NPM-on-loopback collapses clients to 127.0.0.1); (2) multi-instance fail-closed DB single-use gate.

Not cargo-check-verified locally - build-host/CI verification follows this commit.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-05-29 21:04:54 -07:00
parent 8a0193577b
commit bfcdbb5379
9 changed files with 1026 additions and 130 deletions
--- a/server/src/main.rs
+++ b/server/src/main.rs
@@ -21,7 +21,7 @@ pub mod proto {
 use anyhow::Result;
 use axum::http::{HeaderValue, Method};
 use axum::{
-    extract::{Json, Path, Query, Request, State},
+    extract::{ConnectInfo, Json, Path, Query, Request, State},
    http::StatusCode,
    middleware::{self as axum_middleware, Next},
    response::{Html, IntoResponse},
@@ -58,6 +58,9 @@ pub struct AppState {
    pub registry: Arc<std::sync::Mutex<Registry>>,
    /// Server start time
    pub start_time: Arc<std::time::Instant>,
+    /// Per-IP rate limiters + brute-force lockout (Task 4). Shared (Arc-backed
+    /// internally) so cloning AppState shares the same counters.
+    pub rate_limits: middleware::RateLimitState,
 }

 /// Middleware to inject JWT config and token blacklist into request extensions
@@ -263,6 +266,7 @@ async fn main() -> Result<()> {
        metrics,
        registry,
        start_time,
+        rate_limits: middleware::RateLimitState::new(),
    };

    // Build router
@@ -271,11 +275,21 @@ async fn main() -> Result<()> {
        .route("/health", get(health))
        // Prometheus metrics (no auth required - for monitoring)
        .route("/metrics", get(prometheus_metrics))
-        // Auth endpoints (TODO: Add rate limiting - see SEC2_RATE_LIMITING_TODO.md)
-        .route("/api/auth/login", post(api::auth::login))
+        // Auth endpoints. Per-IP rate limiting (Task 4) is attached per-route via
+        // `route_layer` so it applies ONLY to these endpoints, not the whole app.
+        .route(
+            "/api/auth/login",
+            post(api::auth::login).route_layer(axum_middleware::from_fn_with_state(
+                state.clone(),
+                middleware::login_rate_limit,
+            )),
+        )
        .route(
            "/api/auth/change-password",
-            post(api::auth::change_password),
+            post(api::auth::change_password).route_layer(axum_middleware::from_fn_with_state(
+                state.clone(),
+                middleware::change_password_rate_limit,
+            )),
        )
        .route("/api/auth/me", get(api::auth::get_me))
        .route("/api/auth/logout", post(api::auth_logout::logout))
@@ -306,10 +320,17 @@ async fn main() -> Result<()> {
            put(api::users::set_permissions),
        )
        .route("/api/users/:id/clients", put(api::users::set_client_access))
-        // Portal API - Support codes (TODO: Add rate limiting)
+        // Portal API - Support codes. The unauthenticated validate route is rate
+        // limited + brute-force locked out per IP (Task 4).
        .route("/api/codes", post(create_code))
        .route("/api/codes", get(list_codes))
-        .route("/api/codes/:code/validate", get(validate_code))
+        .route(
+            "/api/codes/:code/validate",
+            get(validate_code).route_layer(axum_middleware::from_fn_with_state(
+                state.clone(),
+                middleware::code_validate_rate_limit,
+            )),
+        )
        .route("/api/codes/:code/cancel", post(cancel_code))
        // WebSocket endpoints
        .route("/ws/agent", get(relay::agent_ws_handler))
@@ -450,7 +471,24 @@ async fn create_code(
    Json(request): Json<CreateCodeRequest>,
 ) -> Json<SupportCode> {
    let code = state.support_codes.create_code(request).await;
-    info!("Created support code: {}", code.code);
+
+    // Persist the code to the database so the DURABLE single-use guard
+    // (`db::support_codes::consume_code_for_bind`, Task 4) has a row to act on at
+    // agent-bind time. The in-memory manager remains the live source of truth for
+    // the auth decision; the DB row is the durable single-use record (and audit
+    // trail) that also survives a server restart. A DB failure here is non-fatal:
+    // the in-memory single-use consume still protects against reuse within this
+    // process lifetime.
+    if let Some(ref db) = state.db {
+        if let Err(e) =
+            db::support_codes::create_support_code(db.pool(), &code.code, &code.created_by).await
+        {
+            tracing::warn!("Failed to persist support code to database: {}", e);
+        }
+    }
+
+    // Do not log the code value (it is a bearer credential for the session).
+    info!("Created support code for {}", code.created_by);
    Json(code)
 }

@@ -469,9 +507,29 @@ struct ValidateParams {

 async fn validate_code(
    State(state): State<AppState>,
+    ConnectInfo(addr): ConnectInfo<SocketAddr>,
    Path(code): Path<String>,
 ) -> Json<CodeValidation> {
-    Json(state.support_codes.validate_code(&code).await)
+    let ip = addr.ip();
+
+    // PREVIEW ONLY: validate_code inspects the in-memory code state and does NOT
+    // consume the code (single-use consumption happens at agent BIND, in
+    // relay::handle_agent_connection). A valid preview here must not flip the
+    // code to connected/consumed.
+    let result = state.support_codes.validate_code(&code).await;
+
+    // Feed the per-IP brute-force lockout (Task 4): a failed validation counts
+    // toward the streak; a success resets it. The middleware
+    // (`code_validate_rate_limit`) enforces the lockout BEFORE this handler runs,
+    // so an already-locked IP never reaches here.
+    if result.valid {
+        state.rate_limits.code_validate_lockout.record_success(ip);
+    } else {
+        state.rate_limits.code_validate_lockout.record_failure(ip);
+        tracing::warn!("Failed support-code validation from {}", ip);
+    }
+
+    Json(result)
 }

 async fn cancel_code(