From 0f258788f97526af08c57581a721c1dad4dc9f2e Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Fri, 29 May 2026 19:13:03 -0700 Subject: [PATCH] feat(server): v2 secure-session-core Task 3 - secure relay WS SPEC-002 Phase 1 Task 3 (specs/v2-secure-session-core), code-reviewed APPROVED. - viewer_ws_handler: verify the session-scoped VIEWER token (validate_viewer_token sig+exp+purpose) + token_blacklist.is_revoked + session_id claim == requested session, before upgrade. Raw login JWTs no longer accepted on the viewer plane (closes audit CRITICAL #2; closes the *mechanism* of CRITICAL #1). - mint_viewer_token: authz gate is_admin() || has_permission("view") -> 403. - Agent identity binding: validate_agent_api_key returns AgentKeyAuth; a cak_- verified agent rebinds to the key's machine identity (fails closed if unresolvable), so a key for machine X cannot seize machine Y's session slot. - Frame caps on both WS upgrades (agent 4 MiB, viewer 64 KiB) - closes WS-OOM HIGH. - Viewer->agent input throttle (200 ev/s token bucket, bounded try_send) - closes input-injection MEDIUM. - Startup managed-session reconcile clarified. KNOWN FOLLOW-UPS (tracked todos): (1) authz STRENGTH - the "view" permission is held by every default role incl. viewer, and a viewer token grants input control, so the gate should be "control" or a VIEW_ONLY/CONTROL token split; CRITICAL #1 is mechanism-closed, strength pending decision. (2) revoke minted viewer tokens on logout (currently bounded only by 5-min TTL). Not cargo-check-verified (no toolchain on the authoring host). Co-Authored-By: Claude Opus 4.8 (1M context) --- server/src/api/sessions.rs | 42 +++- server/src/auth/jwt.rs | 1 - server/src/auth/mod.rs | 6 +- server/src/db/machines.rs | 16 ++ server/src/main.rs | 12 +- server/src/relay/mod.rs | 287 ++++++++++++++++++++++----- specs/v2-secure-session-core/plan.md | 40 +++- 7 files changed, 340 insertions(+), 64 deletions(-) diff --git a/server/src/api/sessions.rs b/server/src/api/sessions.rs index 96a2025..6905b4d 100644 --- a/server/src/api/sessions.rs +++ b/server/src/api/sessions.rs @@ -8,11 +8,14 @@ //! matches the requested session. //! //! Authorization (Phase 1): the requester must present a valid dashboard JWT -//! (the [`AuthenticatedUser`] extractor) AND the target session must exist in -//! the live session manager. Per-tenant / per-machine ACL narrowing is a -//! Phase-4 concern; the tenancy claim is already carried so the WS and future -//! phases can enforce it. Minting is itself the authorization gate — only an -//! authenticated user can obtain a token, and only for a real session. +//! (the [`AuthenticatedUser`] extractor), MUST be authorized to view sessions +//! (`is_admin()` OR the `view` permission — see [`SESSION_VIEW_PERMISSION`]), +//! AND the target session must exist in the live session manager. This minting +//! endpoint is the authorization decision point: it is what actually closes +//! audit CRITICAL #1 (any authenticated user could previously obtain viewer +//! access to any session). The WS layer then trusts the session-scoped token. +//! Per-tenant / per-machine ACL narrowing is a Phase-4 concern; the tenancy +//! claim is already carried so the WS and future phases can enforce it. use axum::{ extract::{Path, State}, @@ -30,6 +33,15 @@ use super::machine_keys::ApiError; type ApiResult = Result)>; +/// Permission (in GC's existing catalog — see `api::users` `valid_permissions`: +/// `view`, `control`, `transfer`, `manage_users`, `manage_clients`) that gates +/// obtaining a viewer token. `view` is the established "may view sessions" +/// permission, held by every non-degraded role (admin/operator/viewer) and +/// required of `viewer`. Admins bypass it via `is_admin()`. No NEW permission is +/// introduced: the intra-tenant role distinction is honored using the existing +/// `view` grant. (Policy DECIDED — Mike, 2026-05-29: admin-or-view-permission.) +const SESSION_VIEW_PERMISSION: &str = "view"; + /// Response carrying a freshly minted viewer token. #[derive(Debug, Serialize)] pub struct ViewerTokenResponse { @@ -65,7 +77,25 @@ pub async fn mint_viewer_token( let session_id = Uuid::parse_str(&id) .map_err(|_| err(StatusCode::BAD_REQUEST, "INVALID_SESSION_ID", "Invalid session ID"))?; - // Authorization gate: the session must exist (live session manager is the + // AUTHORIZATION GATE (closes audit CRITICAL #1). Authentication alone is not + // enough: the user must be an admin OR hold the `view` permission. A user + // with no view grant cannot obtain a viewer token, and therefore cannot join + // any session's viewer WS (which now requires this session-scoped token). + if !(user.is_admin() || user.has_permission(SESSION_VIEW_PERMISSION)) { + tracing::warn!( + "User {} denied viewer-token mint for session {} (lacks '{}' permission)", + user.username, + session_id, + SESSION_VIEW_PERMISSION + ); + return Err(err( + StatusCode::FORBIDDEN, + "FORBIDDEN", + "You do not have permission to view sessions", + )); + } + + // The session must exist (live session manager is the // source of truth for joinable sessions, matching GET /api/sessions/:id). let session = state.sessions.get_session(session_id).await.ok_or_else(|| { err( diff --git a/server/src/auth/jwt.rs b/server/src/auth/jwt.rs index 01024c2..faaf391 100644 --- a/server/src/auth/jwt.rs +++ b/server/src/auth/jwt.rs @@ -207,7 +207,6 @@ impl JwtConfig { /// symmetrically, a viewer token can never satisfy [`Self::validate_token`] /// because it lacks the login claim fields. The WS layer (Task 3) /// additionally checks the blacklist and the `session_id` match. - #[allow(dead_code)] // Consumed by the viewer WebSocket handler in Task 3. pub fn validate_viewer_token(&self, token: &str) -> Result { let mut validation = Validation::default(); validation.validate_exp = true; diff --git a/server/src/auth/mod.rs b/server/src/auth/mod.rs index 2a1934a..3782d97 100644 --- a/server/src/auth/mod.rs +++ b/server/src/auth/mod.rs @@ -30,8 +30,10 @@ pub struct AuthenticatedUser { } impl AuthenticatedUser { - /// Check if user has a specific permission - #[allow(dead_code)] // TODO(native-remote-control): consumed by the integration API; see docs/specs/native-remote-control/ + /// Check if user has a specific permission. + /// + /// Admins implicitly hold every permission. Consumed by the viewer-token + /// authorization gate (`api::sessions::mint_viewer_token`). pub fn has_permission(&self, permission: &str) -> bool { if self.role == "admin" { return true; diff --git a/server/src/db/machines.rs b/server/src/db/machines.rs index 9680d9f..a250aa1 100644 --- a/server/src/db/machines.rs +++ b/server/src/db/machines.rs @@ -112,6 +112,22 @@ pub async fn get_machine_by_agent_id( .await } +/// Get machine by its primary-key UUID (`connect_machines.id`). +/// +/// Used by the agent WS plane to resolve the trusted `machine_id` returned by +/// `verify_agent_key` back to its canonical `agent_id`, so persistent reattach +/// binds to the authenticated identity rather than a client-supplied query +/// param (Task 3 identity binding). +pub async fn get_machine_by_id( + pool: &PgPool, + machine_id: Uuid, +) -> Result, sqlx::Error> { + sqlx::query_as::<_, Machine>("SELECT * FROM connect_machines WHERE id = $1") + .bind(machine_id) + .fetch_optional(pool) + .await +} + /// Mark machine as offline pub async fn mark_machine_offline(pool: &PgPool, agent_id: &str) -> Result<(), sqlx::Error> { sqlx::query( diff --git a/server/src/main.rs b/server/src/main.rs index 2813279..5ad020d 100644 --- a/server/src/main.rs +++ b/server/src/main.rs @@ -196,12 +196,18 @@ async fn main() -> Result<()> { // Create session manager let sessions = session::SessionManager::new(); - // Restore persistent machines from database + // Reconcile managed (persistent) sessions from the database on startup so + // they are not orphaned after a server restart (Task 3F). Each persistent + // machine is reloaded into the in-memory SessionManager as an OFFLINE + // session; when the agent reconnects with its per-agent key, `register_agent` + // reattaches to this preserved session (now bound to the authenticated + // identity — see relay::agent_ws_handler). Support-code (attended) sessions + // are intentionally NOT reconciled: they are ephemeral and end on disconnect. if let Some(ref db) = database { match db::machines::get_all_machines(db.pool()).await { Ok(machines) => { info!( - "Restoring {} persistent machines from database", + "Reconciling {} managed session(s) from database", machines.len() ); for machine in machines { @@ -211,7 +217,7 @@ async fn main() -> Result<()> { } } Err(e) => { - tracing::warn!("Failed to restore machines: {}", e); + tracing::warn!("Failed to reconcile managed sessions: {}", e); } } } diff --git a/server/src/relay/mod.rs b/server/src/relay/mod.rs index 62acf23..ae6cf58 100644 --- a/server/src/relay/mod.rs +++ b/server/src/relay/mod.rs @@ -23,6 +23,31 @@ use crate::proto; use crate::session::SessionManager; use crate::AppState; +/// Maximum size of a single inbound WebSocket message on the AGENT plane. +/// +/// Agents stream encoded video frames; a single frame (full-screen keyframe at +/// high resolution, raw/Zstd or H.264) must fit, but anything beyond a few MB is +/// hostile. 4 MiB is comfortably above a realistic worst-case frame while +/// bounding the per-frame `to_vec()` + broadcast allocation. Frames larger than +/// this are rejected by the WebSocket layer before reaching `to_vec()`/broadcast. +/// (Closes the WS-OOM HIGH on the agent plane.) +const AGENT_WS_MAX_MESSAGE_BYTES: usize = 4 * 1024 * 1024; + +/// Maximum size of a single inbound WebSocket message on the VIEWER plane. +/// +/// Viewers send only small control messages (mouse/key events, chat). 64 KiB is +/// already generous for these; a viewer has no legitimate reason to push large +/// payloads up the relay. (Closes the WS-OOM HIGH on the viewer plane and bounds +/// the input path together with the rate limiter below.) +const VIEWER_WS_MAX_MESSAGE_BYTES: usize = 64 * 1024; + +/// Maximum viewer→agent input events forwarded per second, per viewer +/// connection. A human technician generates well under this; the cap exists to +/// stop a compromised/hostile viewer from flooding the target with injected +/// input (`SendInput`). Excess events are DROPPED (coalesced away), never +/// buffered unboundedly. (Closes the input-injection MEDIUM.) +const VIEWER_INPUT_EVENTS_PER_SEC: u32 = 200; + #[derive(Debug, Deserialize)] pub struct AgentParams { agent_id: String, @@ -58,7 +83,11 @@ pub async fn agent_ws_handler( ConnectInfo(addr): ConnectInfo, Query(params): Query, ) -> Result { - let agent_id = params.agent_id.clone(); + // The CLIENT-SUPPLIED agent_id. For a per-agent-key (managed) agent this is + // untrusted until it is reconciled against the key's machine identity below; + // for a support-code/attended agent it is used as-is. `agent_id` is rebound + // to the AUTHENTICATED identity for `cak_`-keyed agents (Task 3 binding). + let mut agent_id = params.agent_id.clone(); let agent_name = params .hostname .clone() @@ -172,41 +201,84 @@ pub async fn agent_ws_handler( // Agent-plane auth ONLY: a per-agent `cak_` key (hash-compared against // connect_agent_keys, rejecting revoked) or the deprecated shared // AGENT_API_KEY fallback. A dashboard/user JWT is NEVER accepted here. - if !validate_agent_api_key(&state, key).await { - warn!( - "Agent connection rejected: {} from {} - invalid API key", - agent_id, client_ip - ); - - // Log failed connection attempt - if let Some(ref db) = state.db { - let _ = db::events::log_event( - db.pool(), - Uuid::new_v4(), - db::events::EventTypes::CONNECTION_REJECTED_INVALID_API_KEY, - None, - Some(&agent_id), - Some(serde_json::json!({ - "reason": "invalid_api_key", - "agent_id": agent_id - })), - Some(client_ip), - ) - .await; + match validate_agent_api_key(&state, key).await { + AgentKeyAuth::PerAgentKey(Some(trusted_agent_id)) => { + // IDENTITY BINDING (Task 3): persistent reattach must bind to the + // authenticated machine identity, NOT a client-supplied agent_id. + // If the client claimed a different agent_id, ignore the claim and + // use the key's machine identity — a valid key for machine X can + // never seize machine Y's persistent session slot. + if trusted_agent_id != agent_id { + warn!( + "Agent from {} presented agent_id '{}' but its key authenticates \ + machine '{}'; binding to the authenticated identity", + client_ip, agent_id, trusted_agent_id + ); + } + agent_id = trusted_agent_id; + info!( + "Agent {} from {} authenticated via per-agent key", + agent_id, client_ip + ); } + AgentKeyAuth::PerAgentKey(None) => { + // Key verified but the owning machine could not be resolved + // (transient DB error). Fail closed rather than reattach to an + // unverified slot. + warn!( + "Agent connection rejected from {}: key authenticated but machine \ + identity could not be resolved", + client_ip + ); + return Err(StatusCode::SERVICE_UNAVAILABLE); + } + AgentKeyAuth::SharedKey => { + // Deprecated shared key: no per-agent identity; legacy behavior + // uses the client-supplied agent_id as-is. + info!( + "Agent {} from {} authenticated via DEPRECATED shared API key", + agent_id, client_ip + ); + } + AgentKeyAuth::Invalid => { + warn!( + "Agent connection rejected: {} from {} - invalid API key", + agent_id, client_ip + ); - return Err(StatusCode::UNAUTHORIZED); + // Log failed connection attempt + if let Some(ref db) = state.db { + let _ = db::events::log_event( + db.pool(), + Uuid::new_v4(), + db::events::EventTypes::CONNECTION_REJECTED_INVALID_API_KEY, + None, + Some(&agent_id), + Some(serde_json::json!({ + "reason": "invalid_api_key", + "agent_id": agent_id + })), + Some(client_ip), + ) + .await; + } + + return Err(StatusCode::UNAUTHORIZED); + } } - info!( - "Agent {} from {} authenticated via API key", - agent_id, client_ip - ); } let sessions = state.sessions.clone(); let support_codes = state.support_codes.clone(); let db = state.db.clone(); + // Bounded relay: cap inbound frame/message size before the socket is upgraded + // so oversized agent frames are rejected by the WS layer, never `to_vec()`'d + // and broadcast. (WS-OOM HIGH.) + let ws = ws + .max_message_size(AGENT_WS_MAX_MESSAGE_BYTES) + .max_frame_size(AGENT_WS_MAX_MESSAGE_BYTES); + Ok(ws.on_upgrade(move |socket| { handle_agent_connection( socket, @@ -221,6 +293,27 @@ pub async fn agent_ws_handler( })) } +/// Outcome of validating an agent key presented on the agent plane. +enum AgentKeyAuth { + /// Authenticated by a per-agent `cak_` key. Carries the TRUSTED machine + /// identity (the canonical `agent_id` of the machine the key belongs to), + /// resolved from `connect_machines` via the key's `machine_id`. The WS + /// handler binds persistent reattach to THIS identity, not the client's + /// query-string `agent_id`. + /// + /// `None` inside means the key verified but the owning machine row could not + /// be resolved (e.g. transient DB error) — treated as authenticated but with + /// no trusted identity to bind, so the connection is rejected rather than + /// allowed to reattach to an unverified slot. + PerAgentKey(Option), + /// Authenticated by the DEPRECATED shared `AGENT_API_KEY`. No per-agent + /// identity is established; the client-supplied `agent_id` is used as-is + /// (legacy behavior, sunset path). + SharedKey, + /// Not a valid agent key. + Invalid, +} + /// Validate an agent key presented on the agent plane. /// /// SECURITY (v2): a dashboard/user JWT is NEVER a valid agent credential — the @@ -228,21 +321,33 @@ pub async fn agent_ws_handler( /// Accepts, in order: /// 1. A per-agent `cak_` key: SHA-256 hash compared against /// `connect_agent_keys`; revoked keys are rejected (the DB query filters -/// `revoked_at IS NULL`). This is the supported path. +/// `revoked_at IS NULL`). This is the supported path. On success the +/// owning machine's canonical `agent_id` is resolved and returned so the +/// handler can bind the session to the AUTHENTICATED identity. /// 2. The shared `AGENT_API_KEY` env value — DEPRECATED fallback, retained /// only for not-yet-migrated agents. Its use is logged at WARNING and it /// should be removed once all managed agents carry per-agent keys. /// /// Never logs the presented key or any hash. -async fn validate_agent_api_key(state: &AppState, api_key: &str) -> bool { +async fn validate_agent_api_key(state: &AppState, api_key: &str) -> AgentKeyAuth { // 1. Per-agent key (the supported path). Requires a database; without one, // only the deprecated shared-key fallback below can apply. if let Some(ref db) = state.db { - if crate::auth::agent_keys::verify_agent_key(db.pool(), api_key) - .await - .is_some() + if let Some(machine_id) = + crate::auth::agent_keys::verify_agent_key(db.pool(), api_key).await { - return true; + // Resolve the trusted identity from the authenticated key's machine. + let trusted_agent_id = match db::machines::get_machine_by_id(db.pool(), machine_id) + .await + { + Ok(Some(machine)) => Some(machine.agent_id), + Ok(None) => None, + Err(e) => { + tracing::error!("Failed to resolve machine for authenticated agent key: {}", e); + None + } + }; + return AgentKeyAuth::PerAgentKey(trusted_agent_id); } } @@ -256,11 +361,11 @@ async fn validate_agent_api_key(state: &AppState, api_key: &str) -> bool { fallback. Migrate this agent to a per-agent cak_ key; the shared key \ will be removed." ); - return true; + return AgentKeyAuth::SharedKey; } } - false + AgentKeyAuth::Invalid } /// WebSocket handler for viewer connections @@ -272,27 +377,71 @@ pub async fn viewer_ws_handler( ) -> Result { let client_ip = addr.ip(); - // Require JWT token for viewers + // The session the viewer is asking to join. Parse early — a malformed UUID + // can never match a viewer token's `session_id` claim, so reject up front. + let requested_session_id = Uuid::parse_str(¶ms.session_id).map_err(|_| { + warn!( + "Viewer connection rejected from {}: invalid session_id", + client_ip + ); + StatusCode::BAD_REQUEST + })?; + + // Require a session-scoped VIEWER token (minted by `mint_viewer_token`), + // NOT a raw login JWT. This is the v2 mechanism that, together with the + // authorization gate at mint time, closes the any-JWT-joins-any-session and + // blacklist-bypass CRITICALs. let token = params.token.ok_or_else(|| { warn!( - "Viewer connection rejected from {}: missing token", + "Viewer connection rejected from {}: missing viewer token", client_ip ); StatusCode::UNAUTHORIZED })?; - // Validate the token - let claims = state.jwt_config.validate_token(&token).map_err(|e| { + // 1. Signature + expiry + `purpose == "viewer"`. A login JWT fails this + // (wrong claim shape / no `purpose`), so login tokens are no longer + // accepted on the viewer plane. + let claims = state.jwt_config.validate_viewer_token(&token).map_err(|e| { warn!( - "Viewer connection rejected from {}: invalid token: {}", + "Viewer connection rejected from {}: invalid viewer token: {}", client_ip, e ); StatusCode::UNAUTHORIZED })?; + // 2. Revocation check on the WS plane (CRITICAL #2): a logged-out / revoked + // token must not grant live remote control even before natural expiry. + if state.token_blacklist.is_revoked(&token).await { + warn!( + "Viewer connection rejected from {}: viewer token revoked", + client_ip + ); + return Err(StatusCode::UNAUTHORIZED); + } + + // 3. Bind the token to THIS session (CRITICAL #1): the token's `session_id` + // claim must equal the session being joined. A token minted for session A + // cannot be replayed against session B. + let claim_session_id = claims.session_uuid().map_err(|e| { + warn!( + "Viewer connection rejected from {}: malformed session_id claim: {}", + client_ip, e + ); + StatusCode::UNAUTHORIZED + })?; + if claim_session_id != requested_session_id { + warn!( + "Viewer connection rejected from {}: token session mismatch \ + (token scoped to a different session than requested {})", + client_ip, requested_session_id + ); + return Err(StatusCode::FORBIDDEN); + } + info!( - "Viewer {} authenticated via JWT from {}", - claims.username, client_ip + "Viewer (user {}) authenticated via session-scoped token for session {} from {}", + claims.sub, requested_session_id, client_ip ); let session_id = params.session_id; @@ -300,6 +449,13 @@ pub async fn viewer_ws_handler( let sessions = state.sessions.clone(); let db = state.db.clone(); + // Bounded relay: cap inbound message size on the viewer plane. Viewers send + // only small control messages, so a tight cap both prevents OOM and bounds + // the input path. (WS-OOM HIGH.) + let ws = ws + .max_message_size(VIEWER_WS_MAX_MESSAGE_BYTES) + .max_frame_size(VIEWER_WS_MAX_MESSAGE_BYTES); + Ok(ws.on_upgrade(move |socket| { handle_viewer_connection( socket, @@ -683,6 +839,18 @@ async fn handle_viewer_connection( let viewer_id_cleanup = viewer_id.clone(); let viewer_name_cleanup = viewer_name.clone(); + // Per-viewer input rate limiter (input-injection MEDIUM). A simple + // refilling token bucket: at most `VIEWER_INPUT_EVENTS_PER_SEC` input events + // are forwarded per second. Excess events are DROPPED (coalesced away) rather + // than buffered — the session `input_tx` channel is already bounded + // (capacity 64), and `try_send` never blocks, so a flood can neither stall + // the relay nor grow memory. Chat is not throttled here (it is not an + // injected-input vector). Only MouseEvent/KeyEvent/SpecialKey count against + // the bucket. + let mut input_tokens: f64 = VIEWER_INPUT_EVENTS_PER_SEC as f64; + let mut last_refill = std::time::Instant::now(); + let mut dropped_input: u64 = 0; + // Main loop: receive input from viewer and forward to agent while let Some(msg) = ws_receiver.next().await { match msg { @@ -694,13 +862,40 @@ async fn handle_viewer_connection( Some(proto::message::Payload::MouseEvent(_)) | Some(proto::message::Payload::KeyEvent(_)) | Some(proto::message::Payload::SpecialKey(_)) => { - // Forward input to agent - let _ = input_tx.send(data.to_vec()).await; + // Refill the token bucket based on elapsed time, + // capped at one second's worth of capacity. + let elapsed = last_refill.elapsed().as_secs_f64(); + if elapsed > 0.0 { + input_tokens = (input_tokens + + elapsed * VIEWER_INPUT_EVENTS_PER_SEC as f64) + .min(VIEWER_INPUT_EVENTS_PER_SEC as f64); + last_refill = std::time::Instant::now(); + } + + if input_tokens >= 1.0 { + input_tokens -= 1.0; + // Non-blocking, bounded send: drop on a full + // queue rather than awaiting/buffering. + if input_tx.try_send(data.to_vec()).is_err() { + dropped_input += 1; + } + } else { + // Over the per-second cap: drop (coalesce). + dropped_input += 1; + if dropped_input.is_power_of_two() { + warn!( + "Viewer {} exceeding input rate cap on session {} \ + ({} events dropped so far)", + viewer_id, session_id, dropped_input + ); + } + } } Some(proto::message::Payload::ChatMessage(chat)) => { - // Forward chat message to agent + // Forward chat message to agent (not throttled — + // not an injected-input vector). Bounded send. info!("Chat from technician: {}", chat.content); - let _ = input_tx.send(data.to_vec()).await; + let _ = input_tx.try_send(data.to_vec()); } _ => {} } diff --git a/specs/v2-secure-session-core/plan.md b/specs/v2-secure-session-core/plan.md index 9a91d71..c8b0c89 100644 --- a/specs/v2-secure-session-core/plan.md +++ b/specs/v2-secure-session-core/plan.md @@ -1,10 +1,14 @@ # v2 Secure Session Core — Implementation Plan > Spec created: 2026-05-29 -> Status: in progress — Tasks 1-2 DONE 2026-05-29; Task 3 (relay WS) next. +> Status: in progress — Tasks 1-3 DONE 2026-05-29 (Task 3 code-reviewed APPROVED). REQUIRED follow-up +> before Phase-1 exit: viewer-token authz STRENGTH — the gate uses `view` (held by EVERY default role +> incl. `viewer`) but a viewer token grants input CONTROL; flip to `control`, or split VIEW_ONLY/CONTROL +> tokens (proto already models SCREEN_CONTROL vs VIEW_ONLY). PENDING Mike. Also: nothing revokes a minted +> viewer token on logout (bounded by 5-min TTL) — follow-up todo. Task 4 (rate limiting + single-use codes) next. > CARRY-FORWARD: Task 3 MUST add a viewer-token AUTHORIZATION check (admin/permission gate) — Task 2 -> fixed only the token *mechanism*; the authz gate is what actually closes audit CRITICAL #1. Policy -> (admin-only vs admin-or-view-permission) pending Mike's decision. +> fixed only the token *mechanism*; the authz gate is what actually closes audit CRITICAL #1. +> Policy DECIDED (Mike, 2026-05-29): admin-or-view-permission (`is_admin() || has_permission(...)`). > Parent: `docs/specs/SPEC-002-v2-modernization-architecture.md` (Phase 1) > Keystone: Tasks 1–4 are the "get-right-first" secure auth/session core — every audit CRITICAL/HIGH > is closed there. Tasks 5–7 deliver the product capability on top. Do them in order. @@ -87,7 +91,29 @@ Reference: `relay/mod.rs:224` (`validate_agent_api_key` — the CRITICAL), `auth --- -## Task 3 (KEYSTONE): Secure relay WS handlers + bounded relay +## Task 3 (KEYSTONE) [IMPLEMENTED 2026-05-29 — self-reviewed; no Rust toolchain on this machine, not yet `cargo check`-verified]: Secure relay WS handlers + bounded relay + +> [IMPLEMENTED] Viewer WS now verifies the session-scoped VIEWER token +> (`validate_viewer_token`: sig+exp+`purpose`) + `token_blacklist.is_revoked` + +> `session_id` claim == requested session, before upgrade — raw login JWTs are no +> longer accepted (closes CRITICAL #1 mechanism + #2). Authz gate added to +> `mint_viewer_token`: `is_admin() || has_permission("view")` → 403 envelope on +> failure (closes CRITICAL #1 — uses the EXISTING `view` permission from GC's +> catalog; no new permission defined). Agent WS now binds persistent reattach to +> the authenticated machine identity: `validate_agent_api_key` returns an +> `AgentKeyAuth` enum carrying the `cak_` key's machine `agent_id` (resolved via +> new `db::machines::get_machine_by_id`); a mismatched query-string `agent_id` is +> ignored, a per-agent key whose machine can't be resolved fails closed +> (503). Frame caps set on BOTH upgrades (agent 4 MiB, viewer 64 KiB via +> `max_message_size`/`max_frame_size`) (closes WS-OOM HIGH). Viewer→agent input +> throttled to 200 events/sec/viewer via a refilling token bucket + non-blocking +> bounded `try_send` (drop/coalesce on overflow) (closes input-injection MEDIUM). +> Startup managed-session reconcile retained + clarified (persistent machines → +> offline in-memory sessions). Removed `#[allow(dead_code)]` on +> `validate_viewer_token` and `AuthenticatedUser::has_permission`. No token/secret +> logged; runtime `sqlx::query`/`query_as`. Files: `server/src/relay/mod.rs`, +> `server/src/api/sessions.rs`, `server/src/db/machines.rs`, `server/src/auth/mod.rs`, +> `server/src/auth/jwt.rs`, `server/src/main.rs`. Files touched: `server/src/relay/mod.rs`, `server/src/session/mod.rs`. @@ -99,8 +125,10 @@ Files touched: `server/src/relay/mod.rs`, `server/src/session/mod.rs`. must enforce a real permission predicate, not just `AuthenticatedUser`: `user.is_admin() || user.has_permission()`. GC's role model (`admin|operator|viewer`) + permissions table already exist (`server/src/auth/mod.rs`), so honoring the intra-tenant role distinction is cheap. **Policy - decision (Mike): admin-only, or admin-or-`view_sessions`-permission.** Multi-tenant client-access - isolation stays deferred to Phase 4; this is only the intra-tenant role gate. + DECIDED (Mike, 2026-05-29): admin-or-view-permission** — `user.is_admin() || user.has_permission()`. Enforce + at the minting endpoint `mint_viewer_token` (the authz decision point); the WS then trusts the + session-scoped token. Multi-tenant client-access isolation stays deferred to Phase 4. - **`agent_ws_handler`** (`relay/mod.rs:55`): authenticate via per-agent key OR support code only (Task 2). Persistent reattach must bind to the authenticated machine identity, not a query-string `agent_id` alone (`session/mod.rs:98`).