From 59e40c80196f02dfe04f9f58e8f190c85ee3b828 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Tue, 2 Jun 2026 10:12:10 -0700 Subject: [PATCH 1/3] =?UTF-8?q?feat(enroll):=20SPEC-016=20Phase=20A=20?= =?UTF-8?q?=E2=80=94=20enrollment=20backend=20+=20migration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Server-side zero-touch per-site enrollment (Phase A: backend + DB only; agent-side machine_uid derivation is Phase B, server treats it as opaque). Migration 010_spec016_enrollment.sql: - connect_sites: relational site anchor (site_code natural key, per-tenant unique). The spec assumed a sites table existed; it did not (site/company were free-text columns on connect_machines), so this creates a minimal one. - site_enrollment_keys: rotatable, Argon2id-hashed cek_ secret + monotonic version + hex fingerprint + active flag; one-active-per-site partial unique. - connect_machines: + site_id (FK), + enrollment_state ('active'|'pending') collision gate, + per-tenant (tenant_id, machine_uid) unique index added ALONGSIDE the 008 global index (the connect-path upsert_machine ON CONFLICT arbiter binds to 008 — dropping it would break live reconnect). - connect_sites.enrollment_policy: reserved (default auto-approve), not enforced. auth/enrollment_keys.rs: cek_ mint (256-bit, OS CSPRNG), Argon2id hash/verify (reuses auth::password), and hex fingerprint vN (XXXX) per resolved-decision #3. db/sites.rs + db/enrollment_keys.rs: runtime sqlx persistence; rotate_key deactivates+inserts in one tx to hold the one-active-key invariant. POST /api/enroll (public, api/enroll.rs): site_code+cek_ verify against active key -> dedup on (tenant, machine_uid) -> new / reuse / site-move / collision. Collision gate (PROVISIONAL heuristic: online existing row + different hostname) -> pending, no usable cak_, alert. Mints cak_ via existing agent_keys path in the exact form relay::validate_agent_api_key expects. Per-(site_code,IP) rate-limit + lockout (EnrollLimiter). Audit events + [ENROLL] alert markers with TODO(SPEC-016) #dev-alerts notes. Admin (JWT) api/sites.rs: POST /api/sites/:id/enrollment-key/rotate (plaintext + fingerprint once) and GET .../enrollment-key (fingerprint/version, no secret). Routes wired in main.rs (enroll public, rotation admin). 13 new unit tests; full server suite 99 passing. cargo check + clippy clean on the host (Windows) target — Linux cross-target not installed here; server crate is platform-neutral Rust. No sqlx offline cache needed (codebase uses runtime queries, no query!). Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.lock | 4 +- server/migrations/010_spec016_enrollment.sql | 159 ++++++ server/src/api/enroll.rs | 562 +++++++++++++++++++ server/src/api/mod.rs | 2 + server/src/api/sites.rs | 217 +++++++ server/src/auth/enrollment_keys.rs | 188 +++++++ server/src/auth/mod.rs | 1 + server/src/db/enrollment_keys.rs | 142 +++++ server/src/db/events.rs | 59 ++ server/src/db/machines.rs | 142 +++++ server/src/db/mod.rs | 2 + server/src/db/sites.rs | 94 ++++ server/src/main.rs | 17 + server/src/middleware/rate_limit.rs | 213 +++++++ 14 files changed, 1800 insertions(+), 2 deletions(-) create mode 100644 server/migrations/010_spec016_enrollment.sql create mode 100644 server/src/api/enroll.rs create mode 100644 server/src/api/sites.rs create mode 100644 server/src/auth/enrollment_keys.rs create mode 100644 server/src/db/enrollment_keys.rs create mode 100644 server/src/db/sites.rs diff --git a/Cargo.lock b/Cargo.lock index 58d8190..b352724 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1407,7 +1407,7 @@ dependencies = [ [[package]] name = "guruconnect" -version = "0.2.0" +version = "0.3.0" dependencies = [ "anyhow", "bytes", @@ -1447,7 +1447,7 @@ dependencies = [ [[package]] name = "guruconnect-server" -version = "0.2.0" +version = "0.3.0" dependencies = [ "anyhow", "argon2", diff --git a/server/migrations/010_spec016_enrollment.sql b/server/migrations/010_spec016_enrollment.sql new file mode 100644 index 0000000..db95ba9 --- /dev/null +++ b/server/migrations/010_spec016_enrollment.sql @@ -0,0 +1,159 @@ +-- Migration: 010_spec016_enrollment.sql +-- Purpose: SPEC-016 zero-touch per-site agent enrollment — server-side data model. +-- +-- Adds the per-site enrollment-key table, a minimal sites table to anchor it, +-- and the machine-side columns the collision-gated self-registration flow needs. +-- +-- Two-tier credential model (SPEC-016 §Security): a low-sensitivity, rotatable, +-- per-site ENROLLMENT KEY (the `cek_` secret stored hashed here) gates "may this +-- machine register at all", while the high-sensitivity per-machine `cak_` +-- operating credential (connect_agent_keys, migration 004) is minted on a +-- successful enroll. Compromise of an enrollment key is recovered by rotating one +-- site, not a fleet-wide re-key. +-- +-- DEVIATION FROM SPEC (documented): SPEC-016 §DB-migration describes +-- `site_enrollment_keys.site_id` as `fk -> sites`, assuming a sites table already +-- exists. It does NOT — in the current schema "site" and "company/organization" are +-- free-text columns on connect_machines (migration 005), there is no relational +-- sites entity. This migration therefore CREATES a minimal `connect_sites` table +-- (the relational anchor the enrollment-key FK and the dashboard per-site key +-- display both require) keyed by a natural `site_code` and scoped per-tenant. It is +-- intentionally minimal (code + display name + tenant); richer site/company +-- modeling is left to future work. The free-text connect_machines.site / +-- .organization columns are untouched and continue to carry agent-reported labels. +-- +-- Idempotent: CREATE TABLE/INDEX IF NOT EXISTS, ADD COLUMN IF NOT EXISTS. Applied on +-- server startup by sqlx::migrate!(); never pre-applied via psql. Ordered after 009. +-- See .claude/standards/gururmm/sqlx-migrations.md. + +-- pgcrypto provides gen_random_uuid(); enabled in 001/004 but re-asserted for safety. +CREATE EXTENSION IF NOT EXISTS "pgcrypto"; + +-- ============================================================================ +-- connect_sites — relational anchor for per-site enrollment (see DEVIATION above) +-- ============================================================================ +-- A site is the unit a single signed installer targets. `site_code` is the +-- non-secret, operator-facing identifier the installer carries and the agent sends +-- at /api/enroll (e.g. "ACME-PHX"). Uniqueness is per-tenant: the same human-chosen +-- code may legitimately exist in two tenants. tenant_id mirrors the nullable, +-- default-tenant-backfilled tenancy column used on every other scoped table +-- (migration 004); db::tenancy::current_tenant_id() resolves it for now. + +CREATE TABLE IF NOT EXISTS connect_sites ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + -- Operator-facing site identifier the installer carries. Non-secret. + site_code TEXT NOT NULL, + -- Human-readable site / company display name for the dashboard. + display_name TEXT, + -- Default company label applied to machines enrolled at this site (mirrors the + -- free-text connect_machines.organization the agent otherwise self-reports). + company TEXT, + -- Tenancy-ready (Phase 4). Backfilled to the default tenant below. + tenant_id UUID, + -- RESERVED for future per-site enrollment POLICY work (SPEC-016 §out-of-scope): + -- default 'auto-approve'; a future 'pending-approval' value will gate new + -- enrollments. NOT enforced in Phase A — present so the policy SPEC needs no + -- schema change. Do not branch on this column yet. + enrollment_policy TEXT DEFAULT 'auto-approve', + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +-- Per-tenant uniqueness of the natural site_code so /api/enroll can resolve a site +-- deterministically within a tenant while the same code may exist across tenants. +-- COALESCE keeps the index usable while tenant_id is still nullable (Phase 1). +CREATE UNIQUE INDEX IF NOT EXISTS idx_connect_sites_tenant_code + ON connect_sites (COALESCE(tenant_id, '00000000-0000-0000-0000-000000000001'::uuid), site_code); + +-- Backfill the sites tenant_id to the default tenant (table is empty on a fresh DB; +-- no-op there, but keeps the migration self-consistent). +UPDATE connect_sites +SET tenant_id = '00000000-0000-0000-0000-000000000001' +WHERE tenant_id IS NULL; + +-- ============================================================================ +-- site_enrollment_keys — rotatable, hashed per-site enrollment secret + fingerprint +-- ============================================================================ +-- Stores ONLY the Argon2id hash of the `cek_` secret; the plaintext is shown once +-- at issue/rotate and never recoverable. `version` is the monotonic rotation +-- counter; `fingerprint` is the non-secret short hex shown as `vN (XXXX)` in the +-- dashboard and baked into the installer filename. `active` marks the current key — +-- rotation flips the old key to active=false (blocking NEW enrollments from old +-- installers) and inserts a new active row; already-enrolled agents holding their +-- own `cak_` are unaffected. Multiple inactive (historical) rows may coexist per +-- site; at most one active row is intended (enforced by a partial unique index). + +CREATE TABLE IF NOT EXISTS site_enrollment_keys ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + site_id UUID NOT NULL REFERENCES connect_sites(id) ON DELETE CASCADE, + -- Argon2id hash of the `cek_` enrollment secret. Never the plaintext. + key_hash TEXT NOT NULL, + -- Monotonic rotation version (1, 2, 3, ...). + version INTEGER NOT NULL, + -- Non-secret short hex fingerprint code (the XXXX in `vN (XXXX)`), derived from + -- the secret. Stored so the dashboard / GET endpoint can show it without the + -- secret. + fingerprint TEXT NOT NULL, + active BOOLEAN NOT NULL DEFAULT true, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + -- Set when this key is rotated out (active flipped to false). + rotated_at TIMESTAMPTZ +); + +-- Lookup index for the enroll hot path: resolve the active key for a site. +CREATE INDEX IF NOT EXISTS idx_site_enrollment_keys_site_active + ON site_enrollment_keys (site_id, active); + +-- At most one ACTIVE enrollment key per site (the "current" installer key). +-- Partial unique index so any number of inactive historical rows may coexist. +CREATE UNIQUE INDEX IF NOT EXISTS idx_site_enrollment_keys_one_active + ON site_enrollment_keys (site_id) + WHERE active; + +-- ============================================================================ +-- connect_machines — site binding + enrollment-state collision gate +-- ============================================================================ +-- machine_uid already exists (migration 008) with a partial UNIQUE index on +-- (machine_uid) WHERE machine_uid IS NOT NULL. SPEC-016 §item-1 / resolved-decision #4 +-- call for the dedup key to be PER-TENANT — (tenant_id, machine_uid) — so the same +-- hardware legitimately present in two tenants stays two rows. tenant_id is the +-- scoping column that exists on connect_machines (migration 004); machines have no +-- direct site_id today, so site is tracked separately (site_id below) and tenancy is +-- the uniqueness scope, exactly as the spec states. +-- +-- CRITICAL CONSTRAINT (why we ADD rather than REPLACE the 008 index here): +-- db::machines::upsert_machine (the live connect-path upsert) uses +-- `ON CONFLICT (machine_uid) WHERE machine_uid IS NOT NULL` as its conflict arbiter. +-- Postgres matches that arbiter to the EXACT index from migration 008. Dropping that +-- index would make the live upsert fail to find an arbiter and error at runtime — +-- breaking every un-keyed agent reconnect. So migration 008's global index is LEFT +-- IN PLACE (the connect path keeps working unchanged) and the per-tenant index is +-- added ALONGSIDE it. In single-tenant Phase 1 the two are equivalent (every row's +-- tenant_id is the default tenant), so the per-tenant index adds the SPEC-016 dedup +-- semantics without a redundant-uniqueness conflict: a (tenant, uid) pair that is +-- unique is also globally unique today. When multi-tenancy activates AND +-- upsert_machine's ON CONFLICT is updated to name (tenant_id, machine_uid), a future +-- migration drops the global 008 index. Documented as deferred; do not drop it now. + +-- Optional FK to the site a machine enrolled under (NULL for legacy / support-code +-- machines that never enrolled through /api/enroll). A site change on re-enroll is +-- the "site move" SPEC-016 audits. +ALTER TABLE connect_machines ADD COLUMN IF NOT EXISTS site_id UUID REFERENCES connect_sites(id) ON DELETE SET NULL; + +-- enrollment_state: the collision gate (SPEC-016 §item-1/6). 'active' = live and +-- controllable (auto-approve posture); 'pending' = a machine_uid collision was +-- detected at enroll and an operator must confirm in the dashboard before the +-- endpoint may be controlled. Default 'active' so every legacy/connect-path row is +-- unaffected. +ALTER TABLE connect_machines + ADD COLUMN IF NOT EXISTS enrollment_state TEXT NOT NULL DEFAULT 'active' + CHECK (enrollment_state IN ('active', 'pending')); + +-- Per-tenant machine_uid uniqueness (SPEC-016). Added ALONGSIDE migration 008's +-- global (machine_uid) index (see CRITICAL CONSTRAINT above — the connect-path +-- upsert's ON CONFLICT arbiter binds to the 008 index, which must survive). COALESCE +-- folds a NULL tenant_id to the default tenant so the index is well-defined while +-- tenancy is single-tenant (Phase 1); the WHERE clause excludes NULL machine_uid so +-- legacy un-keyed rows coexist freely. +CREATE UNIQUE INDEX IF NOT EXISTS idx_connect_machines_tenant_machine_uid + ON connect_machines (COALESCE(tenant_id, '00000000-0000-0000-0000-000000000001'::uuid), machine_uid) + WHERE machine_uid IS NOT NULL; diff --git a/server/src/api/enroll.rs b/server/src/api/enroll.rs new file mode 100644 index 0000000..b961e0f --- /dev/null +++ b/server/src/api/enroll.rs @@ -0,0 +1,562 @@ +//! Zero-touch per-site self-registration endpoint (SPEC-016, Phase A). +//! +//! `POST /api/enroll` is the PUBLIC (no-JWT) door a managed agent walks through on +//! first run: it presents its site's `site_code` + the long per-site enrollment +//! key (`cek_`) and its machine-derived `machine_uid`, and the server — if the key +//! verifies — dedups on `(tenant, machine_uid)`, creates or reuses the machine row, +//! and mints the per-machine `cak_` operating credential, returning the plaintext +//! `cak_` exactly once. +//! +//! Two-tier credential model (SPEC-016 §Security): the enrollment key is the +//! low-sensitivity, rotatable, per-site GATE ("may register"); the minted `cak_` is +//! the high-sensitivity, per-machine, independently-revocable OPERATING credential +//! the relay (`relay::validate_agent_api_key`) already accepts. This handler only +//! MINTS a `cak_` in the exact stored form `verify_agent_key` expects (SHA-256 hash +//! in `connect_agent_keys`) — it does not touch the relay auth path. +//! +//! AUTH POSTURE: auto-approve (ScreenConnect parity) — a clean enroll is live and +//! controllable immediately, with the new-enrollment alert as the tripwire. The one +//! exception is a detected `machine_uid` collision, which gates the machine to +//! `enrollment_state = 'pending'` and withholds a usable `cak_` until an operator +//! confirms it in the dashboard. +//! +//! SECURITY: never log the enrollment key, the minted `cak_`, or any hash. The +//! plaintext `cak_` appears only in the success response body, once. + +use std::net::IpAddr; +use std::net::SocketAddr; + +use axum::{ + extract::{ConnectInfo, State}, + http::{HeaderMap, StatusCode}, + Json, +}; +use serde::{Deserialize, Serialize}; +use serde_json::json; +use uuid::Uuid; + +use crate::auth::{agent_keys, enrollment_keys}; +use crate::db; +use crate::AppState; + +/// Standard error envelope (see `.claude/standards/api/response-format.md`), +/// matching `api::machine_keys::ApiError`. +#[derive(Debug, Serialize)] +pub struct ApiError { + pub detail: String, + pub error_code: String, + pub status_code: u16, +} + +impl ApiError { + fn new(status: StatusCode, code: &str, detail: &str) -> (StatusCode, Json) { + ( + status, + Json(ApiError { + detail: detail.to_string(), + error_code: code.to_string(), + status_code: status.as_u16(), + }), + ) + } +} + +type ApiResult = Result)>; + +/// Labels an installer carries for the machines it enrolls (SPEC-016 §3). +/// +/// All optional: a thin installer may carry only company/site. `company` -> +/// `connect_machines.organization`; `site` -> `connect_machines.site` (the +/// free-text label, distinct from the relational site binding resolved from +/// `site_code`). `department` / `device_type` are reserved label fields (SPEC-007 +/// AgentStatus parity) — accepted and folded into `tags` for now (no dedicated +/// columns yet), so they are not silently dropped. +#[derive(Debug, Default, Deserialize)] +pub struct EnrollLabels { + #[serde(default)] + pub company: Option, + #[serde(default)] + pub site: Option, + #[serde(default)] + pub department: Option, + #[serde(default)] + pub device_type: Option, + #[serde(default)] + pub tags: Vec, +} + +/// `POST /api/enroll` request body (SPEC-016 §3). +#[derive(Debug, Deserialize)] +pub struct EnrollRequest { + pub site_code: String, + /// The per-site enrollment secret (`cek_`). Verified against the site's active + /// hashed key; never logged. + pub enrollment_key: String, + /// Opaque caller-supplied stable machine identity. Phase A treats this as an + /// opaque string; the hardware-salted derivation is Phase B (agent-side). + pub machine_uid: String, + pub hostname: String, + #[serde(default)] + pub labels: EnrollLabels, +} + +/// `POST /api/enroll` success response. +/// +/// On a clean (active) enroll, `key` carries the plaintext `cak_` ONCE. On a +/// collision-gated `pending` enroll, `key` is `None` and `enrollment_state` is +/// `"pending"` — no usable operating credential is issued until an operator +/// confirms the endpoint in the dashboard. +#[derive(Debug, Serialize)] +pub struct EnrollResponse { + /// `connect_machines.id` for the enrolled machine. + pub machine_id: Uuid, + /// The minted plaintext `cak_`, present ONLY for an active enroll, ONLY here. + #[serde(skip_serializing_if = "Option::is_none")] + pub key: Option, + /// `"active"` (live, controllable, key issued) or `"pending"` (collision-gated; + /// awaiting operator confirmation; no key issued). + pub enrollment_state: String, + /// Disposition: `"new"` | `"reuse"` | `"site_move"` | `"collision_pending"`. + pub disposition: String, +} + +fn require_db(state: &AppState) -> ApiResult<&db::Database> { + state.db.as_ref().ok_or_else(|| { + ApiError::new( + StatusCode::SERVICE_UNAVAILABLE, + "DATABASE_UNAVAILABLE", + "Database not available", + ) + }) +} + +/// Collision-gate heuristic (PROVISIONAL — SPEC-016 §item-1/6). +/// +/// The residual collision case is template-cloned VMs that share a hardware UUID +/// (some hypervisors clone the SMBIOS UUID), so a *different* physical endpoint +/// resolves to an existing `machine_uid`. We cannot distinguish that from a benign +/// re-image purely from a client-asserted uid, so the gate is intentionally +/// CONSERVATIVE and the heuristic is provisional, to be tightened in planning +/// (which durable hardware signals feed the uid, and the hypervisor behavior +/// matrix — see SPEC-016 §Remaining-for-planning). +/// +/// PROVISIONAL heuristic chosen for Phase A: treat it as a collision when the +/// matched existing machine +/// (a) is currently considered ONLINE (status == "online"), AND +/// (b) reports a DIFFERENT hostname than the incoming request, +/// i.e. an apparently-live box already owns this uid yet a second box with a +/// different name is enrolling against it concurrently — the clone signature. A +/// case-insensitive hostname compare avoids false positives from case drift. An +/// OFFLINE matched row (the common re-image / re-install case) is NOT treated as a +/// collision — that is the legitimate reuse path. A same-hostname match is reuse, +/// never a collision. +/// +/// Rationale for conservatism: a false POSITIVE merely sends a real machine to +/// `pending` (an operator clicks confirm — annoying, recoverable); a false NEGATIVE +/// would auto-activate a cloned endpoint (worse). When the salt set is finalized in +/// planning this should become "uid is stable hardware, so a genuine clone is +/// expected to be rare; gate on the salt's distinguishing component instead." +fn is_collision(existing: &db::machines::Machine, incoming_hostname: &str) -> bool { + let online = existing.status.eq_ignore_ascii_case("online"); + let different_host = !existing + .hostname + .eq_ignore_ascii_case(incoming_hostname.trim()); + online && different_host +} + +/// Mint a `cak_`, store its hash bound to `machine_id` + tenant, and return the +/// plaintext. Shared by the new/reuse/move active paths. +async fn mint_cak( + db: &db::Database, + machine_id: Uuid, + tenant_id: Uuid, +) -> ApiResult { + let plaintext = agent_keys::generate_agent_key(); + let key_hash = agent_keys::hash_agent_key(&plaintext); + db::agent_keys::insert_agent_key(db.pool(), machine_id, &key_hash, Some(tenant_id)) + .await + .map_err(|e| { + tracing::error!("[ENROLL] DB error minting agent key: {}", e); + ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Failed to mint machine credential", + ) + })?; + Ok(plaintext) +} + +/// `POST /api/enroll` — public self-registration (SPEC-016 §3). +pub async fn enroll( + State(state): State, + ConnectInfo(addr): ConnectInfo, + headers: HeaderMap, + Json(req): Json, +) -> ApiResult<(StatusCode, Json)> { + let db = require_db(&state)?; + let tenant_id = db::tenancy::current_tenant_id(); + + // Real client IP via the shared trusted-proxy-aware extractor (same source the + // relay / rate limiter / audit log use, so the buckets never drift). + let ip: IpAddr = crate::utils::ip_extract::client_ip(&addr, &headers, &state.trusted_proxies); + + // Basic input hygiene before any DB/KDF work. + let site_code = req.site_code.trim(); + let hostname = req.hostname.trim(); + let machine_uid = req.machine_uid.trim(); + if site_code.is_empty() || hostname.is_empty() || machine_uid.is_empty() { + return Err(ApiError::new( + StatusCode::BAD_REQUEST, + "INVALID_REQUEST", + "site_code, hostname, and machine_uid are required", + )); + } + + // Defense-in-depth rate limit / lockout per (site_code, IP). The 256-bit + // enrollment key is the load-bearing gate; this throttles brute-force/abuse. + if !state.rate_limits.enroll.check(site_code, ip) { + tracing::warn!( + "[ENROLL] rate-limited/locked-out enroll for site_code={} from {}", + site_code, + ip + ); + return Err(ApiError::new( + StatusCode::TOO_MANY_REQUESTS, + "RATE_LIMITED", + "Too many enrollment attempts. Please try again later.", + )); + } + + // Resolve the site by code (per-tenant). + let site = match db::sites::get_site_by_code(db.pool(), site_code, tenant_id).await { + Ok(Some(s)) => s, + Ok(None) => { + state.rate_limits.enroll.record_failure(site_code, ip); + audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ + "reason": "unknown_site_code", + "site_code": site_code, + "machine_uid": machine_uid, + })) + .await; + tracing::warn!("[ENROLL] unknown site_code={} from {}", site_code, ip); + // Same opaque rejection shape as a bad key — do not reveal which of the + // two failed (avoids a site_code enumeration oracle). + return Err(ApiError::new( + StatusCode::UNAUTHORIZED, + "ENROLL_REJECTED", + "Invalid site code or enrollment key", + )); + } + Err(e) => { + tracing::error!("[ENROLL] DB error resolving site: {}", e); + return Err(ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Internal server error", + )); + } + }; + + // Verify the enrollment key against the site's ACTIVE key. A rotated-out (old) + // installer's key is inactive and rejected here — old installers cannot enroll + // NEW machines after rotation (SPEC-016 success-criterion #3). + let active_key = match db::enrollment_keys::get_active_for_site(db.pool(), site.id).await { + Ok(Some(k)) => k, + Ok(None) => { + state.rate_limits.enroll.record_failure(site_code, ip); + audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ + "reason": "no_active_key", + "site_code": site_code, + "machine_uid": machine_uid, + })) + .await; + tracing::warn!("[ENROLL] no active enrollment key for site_code={}", site_code); + return Err(ApiError::new( + StatusCode::UNAUTHORIZED, + "ENROLL_REJECTED", + "Invalid site code or enrollment key", + )); + } + Err(e) => { + tracing::error!("[ENROLL] DB error loading active enrollment key: {}", e); + return Err(ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Internal server error", + )); + } + }; + + if !enrollment_keys::verify_enrollment_key(&req.enrollment_key, &active_key.key_hash) { + state.rate_limits.enroll.record_failure(site_code, ip); + audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ + "reason": "bad_enrollment_key", + "site_code": site_code, + "machine_uid": machine_uid, + })) + .await; + tracing::warn!("[ENROLL] bad enrollment key for site_code={} from {}", site_code, ip); + return Err(ApiError::new( + StatusCode::UNAUTHORIZED, + "ENROLL_REJECTED", + "Invalid site code or enrollment key", + )); + } + + // Key verified — this is a legitimate attempt; reset the failure streak. + state.rate_limits.enroll.record_success(site_code, ip); + + // Build the label/identity params shared by the create/update paths. + let tags = effective_tags(&req.labels); + let company = req.labels.company.as_deref().map(str::trim).filter(|s| !s.is_empty()); + let site_label = req + .labels + .site + .as_deref() + .map(str::trim) + .filter(|s| !s.is_empty()); + + // Dedup on (tenant, machine_uid). + let existing = + match db::machines::get_machine_by_tenant_uid(db.pool(), tenant_id, machine_uid).await { + Ok(e) => e, + Err(e) => { + tracing::error!("[ENROLL] DB error on dedup lookup: {}", e); + return Err(ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Internal server error", + )); + } + }; + + match existing { + // -- Reuse / site-move / collision path ------------------------------- + Some(existing) => { + // Collision gate: a seemingly-different live endpoint resolving to an + // existing uid -> pending, alert, NO usable cak_ minted. + if is_collision(&existing, hostname) { + let params = enroll_params( + &existing.agent_id, + hostname, + machine_uid, + tenant_id, + site.id, + company, + site_label, + &tags, + "pending", + ); + let machine = db::machines::update_enrolled_machine(db.pool(), existing.id, ¶ms) + .await + .map_err(map_update_err)?; + + audit(db, db::events::EventTypes::ENROLL_COLLISION_PENDING, ip, json!({ + "machine_id": machine.id, + "machine_uid": machine_uid, + "site_code": site_code, + "existing_hostname": existing.hostname, + "incoming_hostname": hostname, + "heuristic": "online_existing_with_different_hostname (PROVISIONAL)", + })) + .await; + // TODO(SPEC-016): wire to #dev-alerts — collision requires operator + // confirmation in the dashboard before this endpoint may activate. + tracing::warn!( + "[ENROLL] machine_uid collision -> PENDING: machine_id={} site_code={} \ + existing_host={} incoming_host={} from {}", + machine.id, site_code, existing.hostname, hostname, ip + ); + + return Ok(( + StatusCode::ACCEPTED, + Json(EnrollResponse { + machine_id: machine.id, + key: None, + enrollment_state: "pending".to_string(), + disposition: "collision_pending".to_string(), + }), + )); + } + + // Legitimate reuse (re-image / re-install) or a site move. + let is_move = existing.site_id.map(|s| s != site.id).unwrap_or(true); + let params = enroll_params( + &existing.agent_id, + hostname, + machine_uid, + tenant_id, + site.id, + company, + site_label, + &tags, + "active", + ); + let machine = db::machines::update_enrolled_machine(db.pool(), existing.id, ¶ms) + .await + .map_err(map_update_err)?; + + let cak = mint_cak(db, machine.id, tenant_id).await?; + + if is_move { + audit(db, db::events::EventTypes::ENROLL_SITE_MOVE, ip, json!({ + "machine_id": machine.id, + "machine_uid": machine_uid, + "from_site_id": existing.site_id, + "to_site_code": site_code, + "to_site_id": site.id, + })) + .await; + // TODO(SPEC-016): wire to #dev-alerts — a machine moved between sites. + tracing::info!( + "[ENROLL] site-move: machine_id={} machine_uid present, to site_code={} from {}", + machine.id, site_code, ip + ); + } else { + audit(db, db::events::EventTypes::ENROLL_REUSE, ip, json!({ + "machine_id": machine.id, + "machine_uid": machine_uid, + "site_code": site_code, + })) + .await; + tracing::info!( + "[ENROLL] reuse: machine_id={} re-enrolled at site_code={} from {}", + machine.id, site_code, ip + ); + } + + Ok(( + StatusCode::OK, + Json(EnrollResponse { + machine_id: machine.id, + key: Some(cak), + enrollment_state: "active".to_string(), + disposition: if is_move { "site_move" } else { "reuse" }.to_string(), + }), + )) + } + + // -- New enrollment ---------------------------------------------------- + None => { + // Fresh opaque agent_id for the new row's `agent_id UNIQUE` column. The + // agent's own config-UUID story is Phase B; the server only needs a + // unique non-null value here, and the authoritative identity is the + // minted cak_ -> machine binding. + let agent_id = format!("enroll-{}", Uuid::new_v4()); + let params = enroll_params( + &agent_id, + hostname, + machine_uid, + tenant_id, + site.id, + company, + site_label, + &tags, + "active", + ); + let machine = db::machines::insert_enrolled_machine(db.pool(), ¶ms) + .await + .map_err(|e| { + tracing::error!("[ENROLL] DB error inserting enrolled machine: {}", e); + ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Failed to register machine", + ) + })?; + + let cak = mint_cak(db, machine.id, tenant_id).await?; + + audit(db, db::events::EventTypes::ENROLL_NEW, ip, json!({ + "machine_id": machine.id, + "machine_uid": machine_uid, + "site_code": site_code, + "hostname": hostname, + })) + .await; + // TODO(SPEC-016): wire to #dev-alerts — new-enrollment tripwire. + tracing::info!( + "[ENROLL] new: machine_id={} hostname={} site_code={} from {}", + machine.id, hostname, site_code, ip + ); + + Ok(( + StatusCode::CREATED, + Json(EnrollResponse { + machine_id: machine.id, + key: Some(cak), + enrollment_state: "active".to_string(), + disposition: "new".to_string(), + }), + )) + } + } +} + +/// Fold `department` / `device_type` (no dedicated columns yet — SPEC-007) into the +/// tag set as `department:` / `device_type:` so they are preserved rather than +/// dropped, alongside any explicit tags. Empty/whitespace values are skipped. +fn effective_tags(labels: &EnrollLabels) -> Vec { + let mut tags: Vec = labels + .tags + .iter() + .map(|t| t.trim().to_string()) + .filter(|t| !t.is_empty()) + .collect(); + if let Some(d) = labels.department.as_deref().map(str::trim).filter(|s| !s.is_empty()) { + tags.push(format!("department:{}", d)); + } + if let Some(d) = labels + .device_type + .as_deref() + .map(str::trim) + .filter(|s| !s.is_empty()) + { + tags.push(format!("device_type:{}", d)); + } + tags +} + +/// Assemble [`db::machines::EnrollMachineParams`] from the resolved pieces. +#[allow(clippy::too_many_arguments)] +fn enroll_params<'a>( + agent_id: &'a str, + hostname: &'a str, + machine_uid: &'a str, + tenant_id: Uuid, + site_id: Uuid, + company: Option<&'a str>, + site_label: Option<&'a str>, + tags: &'a [String], + enrollment_state: &'a str, +) -> db::machines::EnrollMachineParams<'a> { + db::machines::EnrollMachineParams { + agent_id, + hostname, + machine_uid, + tenant_id, + site_id, + company, + site_label, + tags, + enrollment_state, + } +} + +/// Best-effort enrollment audit write — a failure here never fails the enroll. +async fn audit(db: &db::Database, event_type: &str, ip: IpAddr, details: serde_json::Value) { + if let Err(e) = db::events::log_enrollment_event(db.pool(), event_type, details, Some(ip)).await + { + tracing::warn!("[ENROLL] failed to write {} audit event: {}", event_type, e); + } +} + +/// Map a DB error from the existing-row update to the standard 500 envelope. +fn map_update_err(e: sqlx::Error) -> (StatusCode, Json) { + tracing::error!("[ENROLL] DB error updating enrolled machine: {}", e); + ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Failed to update machine", + ) +} diff --git a/server/src/api/mod.rs b/server/src/api/mod.rs index 5d99dbe..3c28950 100644 --- a/server/src/api/mod.rs +++ b/server/src/api/mod.rs @@ -4,10 +4,12 @@ pub mod auth; pub mod auth_logout; pub mod changelog; pub mod downloads; +pub mod enroll; pub mod machine_keys; pub mod releases; pub mod removal; pub mod sessions; +pub mod sites; pub mod users; use axum::{ diff --git a/server/src/api/sites.rs b/server/src/api/sites.rs new file mode 100644 index 0000000..01bdb36 --- /dev/null +++ b/server/src/api/sites.rs @@ -0,0 +1,217 @@ +//! Site enrollment-key administration (SPEC-016, admin plane). +//! +//! Admin (dashboard JWT + admin role) endpoints for the per-site enrollment key +//! the dashboard surfaces and rotates: +//! +//! - `POST /api/sites/:id/enrollment-key/rotate` — regenerate the `cek_` secret, +//! bump the monotonic version, derive a new fingerprint, deactivate the prior +//! active key, and return the plaintext + fingerprint ONCE. Old installers can no +//! longer enroll NEW machines after this; already-enrolled agents (holding their +//! own `cak_`) are unaffected (SPEC-016 success-criterion #3). Doubles as +//! first-issue when a site has no key yet. +//! - `GET /api/sites/:id/enrollment-key` — read the CURRENT non-secret fingerprint +//! + version (never the secret). 404 if the site has no active key yet. +//! +//! Auth mirrors `api::machine_keys`: the [`crate::auth::AdminUser`] extractor gates +//! both routes, and they are mounted behind the JWT `auth_layer`. +//! +//! SECURITY: the plaintext `cek_` is returned exactly once (rotate response), +//! never persisted in plaintext and never logged. Read responses expose only the +//! version + fingerprint. + +use axum::{ + extract::{Path, State}, + http::StatusCode, + Json, +}; +use serde::Serialize; +use uuid::Uuid; + +use crate::auth::{enrollment_keys, AdminUser}; +use crate::db; +use crate::AppState; + +/// Standard error envelope (matches `api::machine_keys::ApiError`). +#[derive(Debug, Serialize)] +pub struct ApiError { + pub detail: String, + pub error_code: String, + pub status_code: u16, +} + +impl ApiError { + fn new(status: StatusCode, code: &str, detail: &str) -> (StatusCode, Json) { + ( + status, + Json(ApiError { + detail: detail.to_string(), + error_code: code.to_string(), + status_code: status.as_u16(), + }), + ) + } +} + +type ApiResult = Result)>; + +/// Response for a freshly rotated/issued enrollment key. `key` is present ONLY +/// here, once. +#[derive(Debug, Serialize)] +pub struct RotatedEnrollmentKey { + pub site_id: Uuid, + /// The plaintext `cek_` enrollment key. Shown exactly once — bake it into the + /// site installer now; the server keeps only its hash. + pub key: String, + /// Monotonic rotation version. + pub version: i32, + /// The non-secret short hex code (the `XXXX` in `vN (XXXX)`). + pub fingerprint: String, + /// Fully rendered operator-facing fingerprint, e.g. `v3 (7F2A)`. + pub fingerprint_label: String, +} + +/// Non-secret current-key view for the GET endpoint. +#[derive(Debug, Serialize)] +pub struct EnrollmentKeyView { + pub site_id: Uuid, + pub version: i32, + pub fingerprint: String, + pub fingerprint_label: String, + pub active: bool, +} + +fn require_db(state: &AppState) -> ApiResult<&db::Database> { + state.db.as_ref().ok_or_else(|| { + ApiError::new( + StatusCode::SERVICE_UNAVAILABLE, + "DATABASE_UNAVAILABLE", + "Database not available", + ) + }) +} + +/// Resolve a site by its UUID path segment, or a 404 envelope. +async fn resolve_site(db: &db::Database, site_id: Uuid) -> ApiResult { + db::sites::get_site_by_id(db.pool(), site_id) + .await + .map_err(|e| { + tracing::error!("DB error resolving site: {}", e); + ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Internal server error", + ) + })? + .ok_or_else(|| ApiError::new(StatusCode::NOT_FOUND, "SITE_NOT_FOUND", "Site not found")) +} + +/// POST /api/sites/:id/enrollment-key/rotate — rotate (or first-issue) a site's +/// enrollment key. Returns the plaintext `cek_` + fingerprint once. +pub async fn rotate_enrollment_key( + AdminUser(admin): AdminUser, + State(state): State, + Path(site_id): Path, +) -> ApiResult<(StatusCode, Json)> { + let db = require_db(&state)?; + let site = resolve_site(db, site_id).await?; + + // Mint plaintext + Argon2id hash + fingerprint. Only the hash + fingerprint + // are persisted; the plaintext is surfaced once below. + let plaintext = enrollment_keys::generate_enrollment_key(); + let key_hash = enrollment_keys::hash_enrollment_key(&plaintext).map_err(|e| { + tracing::error!("Failed to hash enrollment key: {}", e); + ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Failed to hash enrollment key", + ) + })?; + let fingerprint = enrollment_keys::compute_fingerprint(&plaintext); + + let new_key = db::enrollment_keys::rotate_key(db.pool(), site.id, &key_hash, &fingerprint) + .await + .map_err(|e| { + tracing::error!("DB error rotating enrollment key: {}", e); + ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Failed to rotate enrollment key", + ) + })?; + + let fingerprint_label = + enrollment_keys::render_fingerprint(new_key.version, &new_key.fingerprint); + + // Audit WITHOUT key material (no plaintext, no hash). + if let Err(e) = db::events::log_enrollment_event( + db.pool(), + db::events::EventTypes::ENROLLMENT_KEY_ROTATED, + serde_json::json!({ + "site_id": site.id, + "site_code": site.site_code, + "version": new_key.version, + "fingerprint": new_key.fingerprint, + "rotated_by": admin.username, + }), + None, + ) + .await + { + tracing::warn!("[ENROLL] failed to write key-rotate audit event: {}", e); + } + tracing::info!( + "Admin {} rotated enrollment key for site {} to {}", + admin.username, + site.site_code, + fingerprint_label + ); + + Ok(( + StatusCode::CREATED, + Json(RotatedEnrollmentKey { + site_id: site.id, + key: plaintext, + version: new_key.version, + fingerprint: new_key.fingerprint, + fingerprint_label, + }), + )) +} + +/// GET /api/sites/:id/enrollment-key — current non-secret fingerprint + version. +pub async fn get_enrollment_key( + AdminUser(_admin): AdminUser, + State(state): State, + Path(site_id): Path, +) -> ApiResult> { + let db = require_db(&state)?; + let site = resolve_site(db, site_id).await?; + + let key = db::enrollment_keys::get_active_for_site(db.pool(), site.id) + .await + .map_err(|e| { + tracing::error!("DB error loading enrollment key: {}", e); + ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Internal server error", + ) + })? + .ok_or_else(|| { + ApiError::new( + StatusCode::NOT_FOUND, + "NO_ENROLLMENT_KEY", + "Site has no active enrollment key", + ) + })?; + + let fingerprint_label = enrollment_keys::render_fingerprint(key.version, &key.fingerprint); + + Ok(Json(EnrollmentKeyView { + site_id: site.id, + version: key.version, + fingerprint: key.fingerprint, + fingerprint_label, + active: key.active, + })) +} diff --git a/server/src/auth/enrollment_keys.rs b/server/src/auth/enrollment_keys.rs new file mode 100644 index 0000000..2b63b38 --- /dev/null +++ b/server/src/auth/enrollment_keys.rs @@ -0,0 +1,188 @@ +//! Per-site enrollment key minting, hashing, verification, and fingerprinting +//! (SPEC-016 zero-touch enrollment, auth layer). +//! +//! This is the low-sensitivity, rotatable side of the two-tier credential model +//! (SPEC-016 §Security). A per-site ENROLLMENT key (`cek_` prefix) gates "may +//! this machine register at all" at `POST /api/enroll`; a successful enroll mints +//! the high-sensitivity per-machine `cak_` operating credential +//! ([`crate::auth::agent_keys`]). Compromise of an enrollment key is contained to +//! one site and recovered by rotating it. +//! +//! Lifecycle owned here (the secret side): +//! +//! - [`generate_enrollment_key`] mints a high-entropy, `cek_`-prefixed plaintext +//! secret. Mirrors [`crate::auth::agent_keys::generate_agent_key`]'s entropy +//! approach (32 random bytes from the OS CSPRNG, hex-encoded) with a DISTINCT +//! prefix so the two key kinds are never confused in logs or storage. The +//! plaintext is shown to the operator exactly once at issue/rotate and is NEVER +//! persisted or logged. +//! - [`hash_enrollment_key`] / [`verify_enrollment_key`] use **Argon2id** (via +//! [`crate::auth::password`]). This DIFFERS from `cak_` (which uses SHA-256 for +//! a constant-shape equality lookup): SPEC-016 §2 explicitly requires the +//! enrollment key be "stored hashed (Argon2id, same as `cak_`/passwords)". The +//! trade-off is deliberate — enrollment keys are looked up by `(site, active)` +//! first (a small candidate set, usually one row) and only then verified, so the +//! per-verify KDF cost is bounded and not on a high-QPS path, while Argon2id +//! gives salted, GPU-resistant storage matching the password posture. +//! - [`compute_fingerprint`] derives the non-secret short HEX code shown as +//! `vN (XXXX)` (SPEC-016 resolved-decision #3 — hex, deliberately NOT the +//! GuruRMM word-style code, so the two products' artifacts are never visually +//! conflated). +//! +//! SECURITY: never log a plaintext key or its hash. Functions here return the +//! plaintext to the caller (issue/rotate endpoint) but emit no `tracing` output +//! containing key material. + +use anyhow::Result; +use rand::RngCore; +use ring::digest; + +/// Prefix marking a GuruConnect per-site enrollment key. Distinct from the +/// per-agent `cak_` prefix so the two key kinds are never confused. +pub const ENROLLMENT_KEY_PREFIX: &str = "cek_"; + +/// Number of random bytes behind an enrollment key (256 bits of entropy), matching +/// [`crate::auth::agent_keys`]. SPEC-016 §2 requires ≥256-bit. +const ENROLLMENT_KEY_RANDOM_BYTES: usize = 32; + +/// Number of hex characters in the fingerprint code (the `XXXX` in `vN (XXXX)`). +/// Four hex chars = 16 bits — ample to let an operator tell two installers apart at +/// a glance; it is a non-secret display aid, not a security control. +const FINGERPRINT_HEX_LEN: usize = 4; + +/// Generate a new high-entropy, `cek_`-prefixed per-site enrollment key (plaintext). +/// +/// The returned string is the ONLY time the plaintext exists; the caller must +/// surface it to the operator once and store only [`hash_enrollment_key`] of it. +/// Uses the OS CSPRNG via `rand::rngs::OsRng`. +pub fn generate_enrollment_key() -> String { + let mut bytes = [0u8; ENROLLMENT_KEY_RANDOM_BYTES]; + rand::rngs::OsRng.fill_bytes(&mut bytes); + format!("{}{}", ENROLLMENT_KEY_PREFIX, hex_encode(&bytes)) +} + +/// Hash an enrollment key for storage using Argon2id (SPEC-016 §2). +/// +/// Delegates to [`crate::auth::password::hash_password`] so the KDF parameters and +/// salt generation match the password posture exactly. Returns the PHC-format +/// string Postgres stores in `site_enrollment_keys.key_hash`. +pub fn hash_enrollment_key(plaintext: &str) -> Result { + crate::auth::password::hash_password(plaintext) +} + +/// Verify a presented enrollment key against a stored Argon2id hash. +/// +/// Returns `Ok(true)` on a match. A malformed stored hash or a mismatch yields +/// `Ok(false)` / an `Err` from the underlying verifier; the caller treats any +/// non-`Ok(true)` as a rejection. A cheap structural reject (`cek_` prefix) runs +/// first to skip the KDF on obviously-bogus input. +/// +/// SECURITY: only compares; never logs the presented key or the hash. +pub fn verify_enrollment_key(presented: &str, stored_hash: &str) -> bool { + if !presented.starts_with(ENROLLMENT_KEY_PREFIX) { + return false; + } + crate::auth::password::verify_password(presented, stored_hash).unwrap_or(false) +} + +/// Compute the non-secret short HEX fingerprint code for an enrollment key. +/// +/// Derived as the first [`FINGERPRINT_HEX_LEN`] hex chars of the SHA-256 of the +/// plaintext secret, uppercased. This is a stable, non-reversible tag of the secret +/// (knowing the code does not reveal the key) used purely for display. Pair it with +/// the monotonic version via [`render_fingerprint`]. +pub fn compute_fingerprint(plaintext: &str) -> String { + let d = digest::digest(&digest::SHA256, plaintext.as_bytes()); + let hex = hex_encode(d.as_ref()); + hex[..FINGERPRINT_HEX_LEN].to_ascii_uppercase() +} + +/// Render the operator-facing fingerprint string `vN (XXXX)` (SPEC-016 §2). +/// +/// `version` is the monotonic rotation counter; `code` is [`compute_fingerprint`]. +/// Example: `render_fingerprint(3, "7F2A")` -> `"v3 (7F2A)"`. +pub fn render_fingerprint(version: i32, code: &str) -> String { + format!("v{} ({})", version, code) +} + +/// Lowercase hex encoding without pulling in the `hex` crate (mirrors +/// [`crate::auth::agent_keys`]). +fn hex_encode(bytes: &[u8]) -> String { + use std::fmt::Write; + let mut s = String::with_capacity(bytes.len() * 2); + for b in bytes { + let _ = write!(s, "{:02x}", b); + } + s +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn generated_key_is_prefixed_and_high_entropy() { + let key = generate_enrollment_key(); + assert!(key.starts_with(ENROLLMENT_KEY_PREFIX)); + assert_eq!( + key.len(), + ENROLLMENT_KEY_PREFIX.len() + ENROLLMENT_KEY_RANDOM_BYTES * 2 + ); + } + + #[test] + fn generated_keys_are_unique() { + assert_ne!(generate_enrollment_key(), generate_enrollment_key()); + } + + #[test] + fn hash_and_verify_roundtrip() { + let key = generate_enrollment_key(); + let hash = hash_enrollment_key(&key).expect("hash"); + assert!(verify_enrollment_key(&key, &hash)); + } + + #[test] + fn verify_rejects_wrong_key() { + let key = generate_enrollment_key(); + let other = generate_enrollment_key(); + let hash = hash_enrollment_key(&key).expect("hash"); + assert!(!verify_enrollment_key(&other, &hash)); + } + + #[test] + fn verify_rejects_unprefixed_input_without_touching_kdf() { + let key = generate_enrollment_key(); + let hash = hash_enrollment_key(&key).expect("hash"); + // A value lacking the cek_ prefix is structurally rejected before the KDF. + assert!(!verify_enrollment_key("not-a-key", &hash)); + } + + #[test] + fn verify_rejects_malformed_stored_hash() { + let key = generate_enrollment_key(); + // A garbage stored hash must not panic and must reject. + assert!(!verify_enrollment_key(&key, "not-a-phc-hash")); + } + + #[test] + fn fingerprint_is_stable_uppercase_hex_of_expected_len() { + let key = "cek_deadbeef"; + let f1 = compute_fingerprint(key); + let f2 = compute_fingerprint(key); + assert_eq!(f1, f2); + assert_eq!(f1.len(), FINGERPRINT_HEX_LEN); + assert!(f1.chars().all(|c| c.is_ascii_hexdigit())); + assert_eq!(f1, f1.to_ascii_uppercase()); + } + + #[test] + fn fingerprint_differs_per_key() { + assert_ne!(compute_fingerprint("cek_aaa"), compute_fingerprint("cek_bbb")); + } + + #[test] + fn render_fingerprint_matches_spec_shape() { + assert_eq!(render_fingerprint(3, "7F2A"), "v3 (7F2A)"); + } +} diff --git a/server/src/auth/mod.rs b/server/src/auth/mod.rs index 8cebc3d..28da8ae 100644 --- a/server/src/auth/mod.rs +++ b/server/src/auth/mod.rs @@ -4,6 +4,7 @@ //! validation for agents. pub mod agent_keys; +pub mod enrollment_keys; pub mod jwt; pub mod password; pub mod token_blacklist; diff --git a/server/src/db/enrollment_keys.rs b/server/src/db/enrollment_keys.rs new file mode 100644 index 0000000..96c27bd --- /dev/null +++ b/server/src/db/enrollment_keys.rs @@ -0,0 +1,142 @@ +//! Per-site enrollment key database operations (SPEC-016 zero-touch enrollment). +//! +//! Backs the `site_enrollment_keys` table (migration 010). Stores ONLY the +//! Argon2id hash of the `cek_` secret plus the non-secret rotation metadata +//! (version, fingerprint, active flag). Computing the hash and minting the +//! plaintext is [`crate::auth::enrollment_keys`]'s job; this module is +//! hash-agnostic persistence and takes already-hashed values. +//! +//! Rotation invariant: at most one `active` row per site (enforced by a partial +//! unique index in migration 010). [`rotate_key`] deactivates the current active +//! row and inserts a new active one inside a single transaction so the invariant +//! is never transiently violated. +//! +//! All queries use runtime `sqlx::query()` / `sqlx::query_as()` per the codebase +//! convention (no compile-time `query!` macros, no `.sqlx` offline cache). + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use sqlx::PgPool; +use uuid::Uuid; + +/// Per-site enrollment key record. +/// +/// `key_hash` is the only representation of the secret the server stores; the +/// plaintext is shown once at issue/rotate and never persisted. +#[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)] +pub struct EnrollmentKey { + pub id: Uuid, + pub site_id: Uuid, + pub key_hash: String, + pub version: i32, + pub fingerprint: String, + pub active: bool, + pub created_at: DateTime, + pub rotated_at: Option>, +} + +/// Fetch the active enrollment key for a site, if any. +/// +/// This is the `/api/enroll` hot path: resolve the one active key whose hash the +/// presented `cek_` is verified against. The partial unique index guarantees at +/// most one active row, so `fetch_optional` is correct. +pub async fn get_active_for_site( + pool: &PgPool, + site_id: Uuid, +) -> Result, sqlx::Error> { + sqlx::query_as::<_, EnrollmentKey>( + r#" + SELECT id, site_id, key_hash, version, fingerprint, active, created_at, rotated_at + FROM site_enrollment_keys + WHERE site_id = $1 AND active + "#, + ) + .bind(site_id) + .fetch_optional(pool) + .await +} + +/// Insert the FIRST enrollment key for a site at version 1 (initial issue). +/// +/// Use [`rotate_key`] for subsequent rotations. Errors with a unique violation if +/// the site already has an active key (the caller should rotate instead). +#[allow(dead_code)] // Wired by site-admin issue flow; Phase A exposes rotation (which also covers first issue when none exists). +pub async fn insert_initial_key( + pool: &PgPool, + site_id: Uuid, + key_hash: &str, + fingerprint: &str, +) -> Result { + sqlx::query_as::<_, EnrollmentKey>( + r#" + INSERT INTO site_enrollment_keys (site_id, key_hash, version, fingerprint, active) + VALUES ($1, $2, 1, $3, true) + RETURNING id, site_id, key_hash, version, fingerprint, active, created_at, rotated_at + "#, + ) + .bind(site_id) + .bind(key_hash) + .bind(fingerprint) + .fetch_one(pool) + .await +} + +/// Rotate a site's enrollment key (SPEC-016 §2): deactivate the current active key +/// (if any) and insert a new active key at the next monotonic version, all in one +/// transaction. +/// +/// Returns the newly-created active key. If the site has no key yet, this issues +/// version 1 (so rotation also serves as first-issue). The caller passes the +/// already-hashed new secret and its fingerprint; the plaintext is surfaced once by +/// the caller and never reaches this layer. +/// +/// The transaction is what keeps the "at most one active key per site" invariant +/// (partial unique index) from being transiently violated between the UPDATE and +/// the INSERT. +pub async fn rotate_key( + pool: &PgPool, + site_id: Uuid, + new_key_hash: &str, + new_fingerprint: &str, +) -> Result { + let mut tx = pool.begin().await?; + + // Highest existing version for this site (NULL -> 0 so the first key is v1). + let current_max: Option = sqlx::query_scalar( + "SELECT MAX(version) FROM site_enrollment_keys WHERE site_id = $1", + ) + .bind(site_id) + .fetch_one(&mut *tx) + .await?; + let next_version = current_max.unwrap_or(0) + 1; + + // Deactivate the current active key (if any), stamping rotated_at. + sqlx::query( + r#" + UPDATE site_enrollment_keys + SET active = false, rotated_at = NOW() + WHERE site_id = $1 AND active + "#, + ) + .bind(site_id) + .execute(&mut *tx) + .await?; + + // Insert the new active key at the next version. + let new_key = sqlx::query_as::<_, EnrollmentKey>( + r#" + INSERT INTO site_enrollment_keys (site_id, key_hash, version, fingerprint, active) + VALUES ($1, $2, $3, $4, true) + RETURNING id, site_id, key_hash, version, fingerprint, active, created_at, rotated_at + "#, + ) + .bind(site_id) + .bind(new_key_hash) + .bind(next_version) + .bind(new_fingerprint) + .fetch_one(&mut *tx) + .await?; + + tx.commit().await?; + Ok(new_key) +} diff --git a/server/src/db/events.rs b/server/src/db/events.rs index 8d14aff..f42fb39 100644 --- a/server/src/db/events.rs +++ b/server/src/db/events.rs @@ -69,6 +69,29 @@ impl EventTypes { pub const MACHINE_REMOVED: &'static str = "machine_removed"; /// An administrator soft-deleted (purged) a session and dropped it in-memory. pub const SESSION_REMOVED: &'static str = "session_removed"; + + // Zero-touch enrollment events (SPEC-016). Written by POST /api/enroll and the + // site enrollment-key rotation endpoint. These carry no session, so they are + // logged via `log_enrollment_event` with `session_id = NULL`; the structured + // detail (machine_uid, site_code, fingerprint, etc.) goes in `details` and the + // source IP in `ip_address`. + /// A new machine self-registered at a site and was minted its first `cak_`. + pub const ENROLL_NEW: &'static str = "enroll_new"; + /// An existing machine_uid re-enrolled at the SAME site — the row was reused and + /// a fresh `cak_` minted (re-image / re-install). + pub const ENROLL_REUSE: &'static str = "enroll_reuse"; + /// An existing machine_uid enrolled under a DIFFERENT site — the machine's site + /// binding was updated (a "site move"). Fires an alert. + pub const ENROLL_SITE_MOVE: &'static str = "enroll_site_move"; + /// A machine_uid collision was detected at enroll — the endpoint dropped to + /// `pending` and awaits operator confirmation in the dashboard. Fires an alert. + pub const ENROLL_COLLISION_PENDING: &'static str = "enroll_collision_pending"; + /// An enroll attempt failed enrollment-key verification (wrong/inactive key or + /// unknown site_code). Security audit trail for the open-registration surface. + pub const ENROLL_REJECTED: &'static str = "enroll_rejected"; + /// An administrator rotated a site's enrollment key (new version + fingerprint; + /// old installers can no longer enroll NEW machines). + pub const ENROLLMENT_KEY_ROTATED: &'static str = "enrollment_key_rotated"; } /// Log a session event @@ -154,6 +177,42 @@ pub async fn log_admin_removal( Ok(result) } +/// Log a zero-touch enrollment audit event (SPEC-016). +/// +/// Shares the `connect_session_events` audit table but carries no session +/// (`session_id = NULL`, the FK column is nullable) and no viewer — enrollment is +/// an unauthenticated agent action, not a viewer/session event. The structured +/// detail (machine_uid, site_code, fingerprint version, decision, etc.) goes in +/// `details` and the agent's source IP in `ip_address`. +/// +/// Best-effort: a failure to write the audit row must NOT fail the enroll (the +/// machine row and `cak_` already exist); the caller logs the error and proceeds, +/// matching how the relay and Task-5 removal treat audit writes. +pub async fn log_enrollment_event( + pool: &PgPool, + event_type: &str, + details: JsonValue, + ip_address: Option, +) -> Result { + let ip_str = ip_address.map(|ip| ip.to_string()); + + let result = sqlx::query_scalar::<_, i64>( + r#" + INSERT INTO connect_session_events + (session_id, event_type, viewer_id, viewer_name, details, ip_address) + VALUES (NULL, $1, NULL, NULL, $2, $3::inet) + RETURNING id + "#, + ) + .bind(event_type) + .bind(details) + .bind(ip_str) + .fetch_one(pool) + .await?; + + Ok(result) +} + /// Get events for a session #[allow(dead_code)] // TODO(native-remote-control): consumed by the integration API; see docs/specs/native-remote-control/ pub async fn get_session_events( diff --git a/server/src/db/machines.rs b/server/src/db/machines.rs index 901260b..51c3609 100644 --- a/server/src/db/machines.rs +++ b/server/src/db/machines.rs @@ -64,6 +64,16 @@ pub struct Machine { /// history) is retained. NULL = live. Nullable, so it is read NULL-tolerantly /// in the manual `FromRow` below. pub deleted_at: Option>, + /// Relational site binding for a machine enrolled via `/api/enroll` (SPEC-016, + /// migration 010). NULL for legacy / support-code / connect-path machines that + /// never enrolled through the zero-touch flow. A change of this on re-enroll is + /// the "site move" the enroll path audits. + pub site_id: Option, + /// Collision-gate state (SPEC-016, migration 010): `'active'` (live, auto-approve) + /// or `'pending'` (a machine_uid collision was detected at enroll; awaiting + /// operator confirmation before the endpoint may be controlled). Non-null with a + /// default of `'active'`; read NULL-tolerantly below for defense in depth. + pub enrollment_state: String, } impl<'r> FromRow<'r, PgRow> for Machine { @@ -83,6 +93,13 @@ impl<'r> FromRow<'r, PgRow> for Machine { machine_uid: row.try_get("machine_uid")?, // Schema-nullable (migration 009); decode directly as Option. deleted_at: row.try_get("deleted_at")?, + // Schema-nullable (migration 010); decode directly as Option. + site_id: row.try_get("site_id")?, + // Non-null with default 'active' (migration 010); read NULL-tolerantly + // (older snapshots / partial rows) and fall back to 'active'. + enrollment_state: row + .try_get::, _>("enrollment_state")? + .unwrap_or_else(|| "active".to_string()), // Nullable-with-default columns mapped to non-`Option` Rust types: read as // `Option` and fall back to the type default so a NULL cell never errors. is_elevated: row @@ -207,6 +224,131 @@ pub async fn upsert_machine( } } +/// Find a machine by the SPEC-016 per-tenant dedup key `(tenant_id, machine_uid)`. +/// +/// This is the enroll-time dedup lookup: the same hardware re-enrolling (re-image / +/// re-install) resolves to its existing row within the tenant, while the same +/// hardware in a DIFFERENT tenant is a distinct row (resolved-decision #4). Tenant +/// scoping uses the same default-tenant fold as the unique index so the lookup +/// matches the uniqueness guarantee. +/// +/// Unlike `get_machine_by_agent_id`, this deliberately does NOT filter +/// `deleted_at IS NULL`: a previously operator-purged machine that legitimately +/// re-enrolls must be found so the enroll path can revive it (clearing +/// `deleted_at`), mirroring the connect-path revive in `upsert_machine`. +pub async fn get_machine_by_tenant_uid( + pool: &PgPool, + tenant_id: Uuid, + machine_uid: &str, +) -> Result, sqlx::Error> { + sqlx::query_as::<_, Machine>( + r#" + SELECT * FROM connect_machines + WHERE machine_uid = $1 + AND COALESCE(tenant_id, '00000000-0000-0000-0000-000000000001'::uuid) = $2 + "#, + ) + .bind(machine_uid) + .bind(tenant_id) + .fetch_optional(pool) + .await +} + +/// Parameters for an enroll-time machine create/update (SPEC-016 `/api/enroll`). +/// +/// `agent_id` is a freshly minted opaque id for a NEW enrollment (the agent's +/// config UUID story is Phase B; the server only needs a unique non-null value for +/// the `agent_id UNIQUE` column). On REUSE/MOVE the existing row's `agent_id` is +/// preserved (the FK target of any already-minted `cak_`), so the update path does +/// not touch it. +pub struct EnrollMachineParams<'a> { + pub agent_id: &'a str, + pub hostname: &'a str, + pub machine_uid: &'a str, + pub tenant_id: Uuid, + pub site_id: Uuid, + /// Company label (-> connect_machines.organization). + pub company: Option<&'a str>, + /// Site label (-> connect_machines.site) — the free-text label, distinct from + /// the relational site_id binding. + pub site_label: Option<&'a str>, + pub tags: &'a [String], + /// 'active' (auto-approve) or 'pending' (collision-gated). + pub enrollment_state: &'a str, +} + +/// Insert a NEW machine row for a first-time enrollment (SPEC-016). +/// +/// Carries the labels, the relational `site_id`, the per-tenant `machine_uid`, and +/// the collision-gate `enrollment_state`. Persistent + online. Returns the created +/// row (its `id` is the FK target for the `cak_` the caller mints next). +pub async fn insert_enrolled_machine( + pool: &PgPool, + p: &EnrollMachineParams<'_>, +) -> Result { + sqlx::query_as::<_, Machine>( + r#" + INSERT INTO connect_machines + (agent_id, hostname, is_persistent, status, last_seen, machine_uid, + tenant_id, site_id, organization, site, tags, enrollment_state) + VALUES ($1, $2, true, 'online', NOW(), $3, $4, $5, $6, $7, $8, $9) + RETURNING * + "#, + ) + .bind(p.agent_id) + .bind(p.hostname) + .bind(p.machine_uid) + .bind(p.tenant_id) + .bind(p.site_id) + .bind(p.company) + .bind(p.site_label) + .bind(p.tags) + .bind(p.enrollment_state) + .fetch_one(pool) + .await +} + +/// Update an EXISTING machine row on re-enroll / reuse / site-move (SPEC-016). +/// +/// Refreshes hostname, site binding (`site_id`), labels, and `enrollment_state`, +/// and revives a soft-deleted row (`deleted_at = NULL`) — a re-enroll of a purged +/// host means it is live again, mirroring `upsert_machine`'s revive. Deliberately +/// does NOT change `agent_id`: the existing id is the FK target of any prior `cak_`. +/// Labels are COALESCE-merged so an enroll that omits a label does not wipe an +/// existing value; `tags` is overwritten only when a non-empty set is supplied +/// (matching `update_machine_metadata`'s convention). +pub async fn update_enrolled_machine( + pool: &PgPool, + machine_id: Uuid, + p: &EnrollMachineParams<'_>, +) -> Result { + sqlx::query_as::<_, Machine>( + r#" + UPDATE connect_machines SET + hostname = $2, + site_id = $3, + organization = COALESCE($4, organization), + site = COALESCE($5, site), + tags = CASE WHEN $6::text[] = '{}' THEN tags ELSE $6 END, + enrollment_state = $7, + status = 'online', + last_seen = NOW(), + deleted_at = NULL + WHERE id = $1 + RETURNING * + "#, + ) + .bind(machine_id) + .bind(p.hostname) + .bind(p.site_id) + .bind(p.company) + .bind(p.site_label) + .bind(p.tags) + .bind(p.enrollment_state) + .fetch_one(pool) + .await +} + /// Update machine status and info #[allow(dead_code)] // TODO(native-remote-control): consumed by the integration API; see docs/specs/native-remote-control/ pub async fn update_machine_status( diff --git a/server/src/db/mod.rs b/server/src/db/mod.rs index 655ef81..97afb9e 100644 --- a/server/src/db/mod.rs +++ b/server/src/db/mod.rs @@ -4,8 +4,10 @@ //! Optional - server works without database if DATABASE_URL not set. pub mod agent_keys; +pub mod enrollment_keys; pub mod events; pub mod machines; +pub mod sites; pub mod releases; pub mod sessions; pub mod support_codes; diff --git a/server/src/db/sites.rs b/server/src/db/sites.rs new file mode 100644 index 0000000..01784cc --- /dev/null +++ b/server/src/db/sites.rs @@ -0,0 +1,94 @@ +//! Site database operations (SPEC-016 zero-touch enrollment). +//! +//! Backs the `connect_sites` table (migration 010): the relational anchor a +//! per-site enrollment key hangs off and the `/api/enroll` flow resolves by +//! `site_code`. See the migration header for why this table exists (the prior +//! schema modeled "site" only as a free-text column on `connect_machines`). +//! +//! All queries use runtime `sqlx::query()` / `sqlx::query_as()` per the codebase +//! convention (no compile-time `query!` macros, no `.sqlx` offline cache). + +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use sqlx::PgPool; +use uuid::Uuid; + +/// Site record from the database. +#[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)] +pub struct Site { + pub id: Uuid, + pub site_code: String, + pub display_name: Option, + pub company: Option, + pub tenant_id: Option, + /// RESERVED for future per-site enrollment POLICY work (SPEC-016 §out-of-scope). + /// Not enforced in Phase A. + pub enrollment_policy: Option, + pub created_at: DateTime, +} + +/// Resolve a site by its operator-facing `site_code`, scoped to the given tenant. +/// +/// Tenant scoping uses the same default-tenant fold as the unique index so the +/// lookup matches the uniqueness guarantee: `(COALESCE(tenant_id, default), +/// site_code)`. Returns `None` if no site with that code exists in the tenant. +pub async fn get_site_by_code( + pool: &PgPool, + site_code: &str, + tenant_id: Uuid, +) -> Result, sqlx::Error> { + sqlx::query_as::<_, Site>( + r#" + SELECT id, site_code, display_name, company, tenant_id, enrollment_policy, created_at + FROM connect_sites + WHERE site_code = $1 + AND COALESCE(tenant_id, '00000000-0000-0000-0000-000000000001'::uuid) = $2 + "#, + ) + .bind(site_code) + .bind(tenant_id) + .fetch_optional(pool) + .await +} + +/// Fetch a site by its primary-key UUID. +pub async fn get_site_by_id(pool: &PgPool, id: Uuid) -> Result, sqlx::Error> { + sqlx::query_as::<_, Site>( + r#" + SELECT id, site_code, display_name, company, tenant_id, enrollment_policy, created_at + FROM connect_sites + WHERE id = $1 + "#, + ) + .bind(id) + .fetch_optional(pool) + .await +} + +/// Insert a new site, returning the created row. +/// +/// `tenant_id` is `None`-tolerant and resolved via `db::tenancy::current_tenant_id()` +/// at the call site. Errors with a unique-violation if `(tenant, site_code)` already +/// exists (the caller maps that to a 409). +#[allow(dead_code)] // Wired by the site-admin API (dashboard site CRUD); Phase A exposes key rotation, not site CRUD. +pub async fn insert_site( + pool: &PgPool, + site_code: &str, + display_name: Option<&str>, + company: Option<&str>, + tenant_id: Option, +) -> Result { + sqlx::query_as::<_, Site>( + r#" + INSERT INTO connect_sites (site_code, display_name, company, tenant_id) + VALUES ($1, $2, $3, $4) + RETURNING id, site_code, display_name, company, tenant_id, enrollment_policy, created_at + "#, + ) + .bind(site_code) + .bind(display_name) + .bind(company) + .bind(tenant_id) + .fetch_one(pool) + .await +} diff --git a/server/src/main.rs b/server/src/main.rs index e94e7e6..ed2eca2 100644 --- a/server/src/main.rs +++ b/server/src/main.rs @@ -448,6 +448,11 @@ async fn main() -> Result<()> { )), ) .route("/api/codes/:code/cancel", post(cancel_code)) + // Zero-touch enrollment (SPEC-016). PUBLIC: no JWT — the per-site enrollment + // key in the body is the gate, and the handler applies its own + // per-(site_code, IP) rate limit / lockout (defense-in-depth). Mounted with + // the other public API routes. + .route("/api/enroll", post(api::enroll::enroll)) // WebSocket endpoints .route("/ws/agent", get(relay::agent_ws_handler)) .route("/ws/viewer", get(relay::viewer_ws_handler)) @@ -498,6 +503,18 @@ async fn main() -> Result<()> { "/api/machines/:agent_id/keys/:key_id", delete(api::machine_keys::revoke_key), ) + // Per-site enrollment key administration (SPEC-016, admin-only / JWT). + // Rotate regenerates the cek_ secret + fingerprint (old installers can no + // longer enroll new machines); GET returns the current non-secret + // fingerprint/version. Both gated by the AdminUser extractor. + .route( + "/api/sites/:id/enrollment-key", + get(api::sites::get_enrollment_key), + ) + .route( + "/api/sites/:id/enrollment-key/rotate", + post(api::sites::rotate_enrollment_key), + ) // REST API - Releases and Version .route("/api/version", get(api::releases::get_version)) // No auth - for agent polling .route("/api/releases", get(api::releases::list_releases)) diff --git a/server/src/middleware/rate_limit.rs b/server/src/middleware/rate_limit.rs index cf3433f..72f21e7 100644 --- a/server/src/middleware/rate_limit.rs +++ b/server/src/middleware/rate_limit.rs @@ -77,6 +77,19 @@ pub const CODE_VALIDATE_MAX_FAILURES: u32 = 10; /// Support-code validate: how long an IP stays locked out once tripped. pub const CODE_VALIDATE_LOCKOUT: Duration = Duration::from_secs(15 * 60); +/// Enroll (`POST /api/enroll`, SPEC-016): window length. +pub const ENROLL_WINDOW: Duration = Duration::from_secs(60); +/// Enroll: max requests per window per `(site_code, IP)`. A zero-touch site push +/// drives N machines through enroll near-simultaneously, so this is generous +/// (mass-deploy friendly) while still capping a runaway loop. Defense-in-depth: the +/// 256-bit enrollment key is the load-bearing gate, not this cap. +pub const ENROLL_MAX_PER_WINDOW: u32 = 60; +/// Enroll: consecutive FAILED enroll attempts (bad/inactive key, unknown site) from +/// one `(site_code, IP)` that trip the lockout. +pub const ENROLL_MAX_FAILURES: u32 = 20; +/// Enroll: how long a `(site_code, IP)` stays locked out once tripped. +pub const ENROLL_LOCKOUT: Duration = Duration::from_secs(15 * 60); + /// Hard cap on the number of distinct IPs tracked by any single limiter map. /// Prevents an IP-rotating attacker from growing memory without bound. When the /// cap is hit, the oldest-windowed entries are pruned. Generous for a real MSP @@ -260,6 +273,150 @@ impl FailureLockout { } } +// ============================================================================ +// Composite-key limiter for enrollment (keyed by (site_code, IP)) — SPEC-016 +// ============================================================================ +// +// The login / change-password / code-validate limiters above key purely on IP. +// SPEC-016 §3 wants the enroll defense keyed on `(site_code, source-IP)` so a noisy +// site push from one office IP cannot lock out a different site enrolling from the +// same egress IP. Rather than overload the IP-only maps, this is a small dedicated +// composite-key limiter + lockout. It is invoked from the enroll HANDLER (not a +// `from_fn` layer) because the `site_code` lives in the JSON body, which a +// pre-handler middleware cannot read without consuming it. Documented as +// defense-in-depth: the 256-bit enrollment key is the real gate. + +/// Composite limiter key: the site_code and the real client IP. +type EnrollKey = (String, IpAddr); + +/// Per-`(site_code, IP)` fixed-window limiter + consecutive-failure lockout. +/// +/// Combines both protections behind one lock-guarded map so the enroll handler +/// makes a single allow/deny decision and reports success/failure into the same +/// structure. Self-pruning and size-capped, like the IP-only limiters. +#[derive(Clone)] +pub struct EnrollLimiter { + inner: std::sync::Arc>>, + max_per_window: u32, + window: Duration, + max_failures: u32, + cooldown: Duration, +} + +#[derive(Debug, Clone, Copy)] +struct EnrollEntry { + window_started: Instant, + count: u32, + failures: u32, + locked_until: Option, + last_seen: Instant, +} + +impl EnrollLimiter { + pub fn new( + max_per_window: u32, + window: Duration, + max_failures: u32, + cooldown: Duration, + ) -> Self { + Self { + inner: std::sync::Arc::new(Mutex::new(HashMap::new())), + max_per_window, + window, + max_failures, + cooldown, + } + } + + fn entry_now() -> EnrollEntry { + let now = Instant::now(); + EnrollEntry { + window_started: now, + count: 0, + failures: 0, + locked_until: None, + last_seen: now, + } + } + + /// Admit one enroll attempt for `(site_code, ip)`. Returns `true` if allowed + /// (and counts it). Returns `false` if the key is currently locked out OR over + /// the per-window request cap. Clock injected for tests. + fn check_at(&self, site_code: &str, ip: IpAddr, now: Instant) -> bool { + let mut map = self.inner.lock().unwrap_or_else(|e| e.into_inner()); + + if map.len() >= MAX_TRACKED_IPS { + let window = self.window; + let cooldown = self.cooldown; + map.retain(|_, e| { + e.locked_until.map(|u| now < u).unwrap_or(false) + || now.duration_since(e.window_started) < window + || now.duration_since(e.last_seen) < cooldown + }); + } + + let key = (site_code.to_string(), ip); + let e = map.entry(key).or_insert_with(Self::entry_now); + e.last_seen = now; + + // Lockout takes precedence. + if let Some(until) = e.locked_until { + if now < until { + return false; + } + // Cooldown elapsed — clear it for a fresh start. + e.locked_until = None; + e.failures = 0; + } + + // Roll the fixed window forward if elapsed. + if now.duration_since(e.window_started) >= self.window { + e.window_started = now; + e.count = 0; + } + + if e.count >= self.max_per_window { + false + } else { + e.count += 1; + true + } + } + + /// Admit one enroll attempt (real clock). + pub fn check(&self, site_code: &str, ip: IpAddr) -> bool { + self.check_at(site_code, ip, Instant::now()) + } + + fn record_failure_at(&self, site_code: &str, ip: IpAddr, now: Instant) { + let mut map = self.inner.lock().unwrap_or_else(|e| e.into_inner()); + let key = (site_code.to_string(), ip); + let e = map.entry(key).or_insert_with(Self::entry_now); + e.last_seen = now; + e.failures = e.failures.saturating_add(1); + if e.failures >= self.max_failures { + e.locked_until = Some(now + self.cooldown); + } + } + + /// Record a FAILED enroll attempt (bad key / unknown site) for the key, + /// tripping the lockout once the streak reaches `max_failures`. + pub fn record_failure(&self, site_code: &str, ip: IpAddr) { + self.record_failure_at(site_code, ip, Instant::now()); + } + + /// Record a SUCCESSFUL enroll for the key, resetting its failure streak. + pub fn record_success(&self, site_code: &str, ip: IpAddr) { + let mut map = self.inner.lock().unwrap_or_else(|e| e.into_inner()); + let key = (site_code.to_string(), ip); + if let Some(e) = map.get_mut(&key) { + e.failures = 0; + e.locked_until = None; + e.last_seen = Instant::now(); + } + } +} + // ============================================================================ // Shared rate-limit state (lives in AppState) // ============================================================================ @@ -275,6 +432,9 @@ pub struct RateLimitState { pub code_validate: RateLimiter, /// Per-IP lockout on repeated failed code validations (brute-force defense). pub code_validate_lockout: FailureLockout, + /// `POST /api/enroll` (SPEC-016): per-`(site_code, IP)` request cap + + /// consecutive-failure lockout. Invoked from the enroll handler. + pub enroll: EnrollLimiter, } impl RateLimitState { @@ -290,6 +450,12 @@ impl RateLimitState { CODE_VALIDATE_MAX_FAILURES, CODE_VALIDATE_LOCKOUT, ), + enroll: EnrollLimiter::new( + ENROLL_MAX_PER_WINDOW, + ENROLL_WINDOW, + ENROLL_MAX_FAILURES, + ENROLL_LOCKOUT, + ), } } } @@ -524,4 +690,51 @@ mod tests { assert!(lockout.is_locked_at(ip(8), t0)); assert!(!lockout.is_locked_at(ip(9), t0)); // ip9 unaffected } + + // -- EnrollLimiter (composite (site_code, IP) key) -------------------------- + + #[test] + fn enroll_window_allows_up_to_cap_then_blocks() { + let lim = EnrollLimiter::new(2, Duration::from_secs(60), 100, Duration::from_secs(600)); + let t0 = Instant::now(); + assert!(lim.check_at("SITE-A", ip(1), t0)); // 1 + assert!(lim.check_at("SITE-A", ip(1), t0)); // 2 + assert!(!lim.check_at("SITE-A", ip(1), t0)); // over cap + } + + #[test] + fn enroll_is_keyed_by_site_and_ip() { + let lim = EnrollLimiter::new(1, Duration::from_secs(60), 100, Duration::from_secs(600)); + let t0 = Instant::now(); + assert!(lim.check_at("SITE-A", ip(1), t0)); + assert!(!lim.check_at("SITE-A", ip(1), t0)); // same key over cap + // Different site, same IP -> independent bucket. + assert!(lim.check_at("SITE-B", ip(1), t0)); + // Same site, different IP -> independent bucket. + assert!(lim.check_at("SITE-A", ip(2), t0)); + } + + #[test] + fn enroll_lockout_trips_after_failures_and_blocks_check() { + let lim = EnrollLimiter::new(100, Duration::from_secs(60), 3, Duration::from_secs(600)); + let t0 = Instant::now(); + lim.record_failure_at("SITE-A", ip(1), t0); + lim.record_failure_at("SITE-A", ip(1), t0); + // Not yet tripped: a check still admits. + assert!(lim.check_at("SITE-A", ip(1), t0)); + lim.record_failure_at("SITE-A", ip(1), t0); // 3rd -> trips + // Now locked out: check denies even though under the request cap. + assert!(!lim.check_at("SITE-A", ip(1), t0)); + } + + #[test] + fn enroll_success_resets_failure_streak() { + let lim = EnrollLimiter::new(100, Duration::from_secs(60), 2, Duration::from_secs(600)); + let t0 = Instant::now(); + lim.record_failure_at("SITE-A", ip(1), t0); + lim.record_success("SITE-A", ip(1)); // reset + lim.record_failure_at("SITE-A", ip(1), t0); + // Only one failure since reset -> not locked. + assert!(lim.check_at("SITE-A", ip(1), t0)); + } } From 0f02f2376518b5f706de7343f7239fb0804dac73 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Tue, 2 Jun 2026 10:28:31 -0700 Subject: [PATCH 2/3] fix(enroll): SPEC-016 Phase A review fixes (cross-site guard, timing oracle, TOCTOU) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Applies the four review fixes to POST /api/enroll, all in server/src/api/enroll.rs (+ a new ENROLL_SITE_CONFLICT event type in server/src/db/events.rs): 1. HIGH — close the within-tenant cross-site silent-move hijack. A valid key for site B presented for a machine_uid already bound to a DIFFERENT site is now REFUSED (409 ENROLL_SITE_CONFLICT) instead of silently repointing the row and minting a fresh cak_. No move, no key. Emits an ENROLL_SITE_CONFLICT audit event + alert TODO. Same-site match still resolves to reuse; a NULL prior site_id is a first relational bind, not a move. The unauthenticated site_move mint path is removed; deliberate moves are deferred to the Phase-B --reassign flow + dashboard. 2. MEDIUM — kill the timing/enumeration oracle. Unknown site_code and no-active-key early rejects now pay a dummy Argon2id verify against a fixed, valid throwaway PHC constant (TIMING_EQUALIZER_PHC) before returning the identical 401, so every rejection path pays one KDF. The constant is asserted valid + verifying in tests. 3. LOW — fix the new-enroll TOCTOU. The dedup lookup + INSERT is wrapped in a bounded retry loop: a concurrent first-enroll of the same machine_uid whose INSERT loses the unique-index race (classified by is_machine_uid_conflict on SQLSTATE 23505 + machine_uid constraint) now re-looks-up and converges to reuse instead of 500ing. A non-machine_uid unique violation still surfaces as 500. 4. LOW — make the collision-gate doc honest + leave an enforcement TODO. The module doc now states the gate withholds only a NEWLY minted cak_ (a prior clean cak_ survives) and that nothing consults enrollment_state at control time yet, with a TODO(SPEC-016 Phase B/D) marker for relay/control-plane enforcement + revocation. Verify: cargo check, cargo clippy --all-targets, and cargo test all clean on this Windows host (104 tests pass). Two DB-gated tests (cross-site bound-site_id exposure, machine_uid-vs-agent_id conflict classification) no-op without TEST_DATABASE_URL and run against real Postgres in CI; the Linux target / real-Postgres handler path is validated there, not on this host. Co-Authored-By: Claude Opus 4.8 (1M context) --- server/src/api/enroll.rs | 495 ++++++++++++++++++++++++++++++++++----- server/src/db/events.rs | 11 + 2 files changed, 447 insertions(+), 59 deletions(-) diff --git a/server/src/api/enroll.rs b/server/src/api/enroll.rs index b961e0f..b0cf69f 100644 --- a/server/src/api/enroll.rs +++ b/server/src/api/enroll.rs @@ -17,8 +17,26 @@ //! AUTH POSTURE: auto-approve (ScreenConnect parity) — a clean enroll is live and //! controllable immediately, with the new-enrollment alert as the tripwire. The one //! exception is a detected `machine_uid` collision, which gates the machine to -//! `enrollment_state = 'pending'` and withholds a usable `cak_` until an operator -//! confirms it in the dashboard. +//! `enrollment_state = 'pending'`. +//! +//! What the collision gate actually does in Phase A (be precise — it is NOT a full +//! quarantine yet): it sets `enrollment_state = 'pending'` and withholds a *newly +//! minted* `cak_` (the response carries no key). It does NOT revoke a `cak_` the +//! machine may already hold from a prior clean enroll, and NOTHING on the relay / +//! control plane consults `enrollment_state` yet — so a colliding machine that was +//! already enrolled keeps its working credential and remains controllable until the +//! enforcement below lands. The gate today is an audit + alert tripwire that asks an +//! operator to confirm in the dashboard, not a hard control-plane block. +//! +//! TODO(SPEC-016 Phase B/D): relay/control plane must refuse machines with +//! enrollment_state='pending'; consider revoking existing cak_ on collision. +//! +//! CROSS-SITE: Phase A does NOT move a machine between sites. A valid key for site B +//! presented for a `machine_uid` already bound to site A is REFUSED +//! (`ENROLL_SITE_CONFLICT`, 409) — no move, no key — as the accidental-move / +//! cross-site-hijack guard (`machine_uid` is a raw, spoofable client string in Phase +//! A). Deliberate moves arrive with the Phase-B `--reassign` flow + dashboard +//! (SPEC-016 §"Explicitly out of scope" / `--reassign`). //! //! SECURITY: never log the enrollment key, the minted `cak_`, or any hash. The //! plaintext `cak_` appears only in the success response body, once. @@ -39,6 +57,37 @@ use crate::auth::{agent_keys, enrollment_keys}; use crate::db; use crate::AppState; +/// A fixed, valid Argon2id PHC hash used ONLY to equalize timing on the early-reject +/// enroll paths (unknown `site_code`, no active key). Those paths would otherwise +/// return before paying the KDF, while a wrong key pays the full Argon2id verify — +/// a timing oracle that distinguishes "this site_code/active-key exists" from +/// "rejected at the key check". On every early reject we run a throwaway verify of +/// the supplied key against THIS constant and discard the result, so all rejection +/// paths pay one Argon2id verify. +/// +/// It is the Argon2id (V0x13, default params — matching [`crate::auth::password`]) +/// hash of the byte string `"enroll-timing-equalizer-throwaway"` under a fixed salt +/// — generated offline, committed verbatim, and asserted valid + verifying in the +/// tests below. It guards NOTHING: it is never a credential, never compared for +/// auth, and the password it encodes is public. A real key can never equal it +/// (Phase A keys are `cek_`-prefixed 256-bit randoms), so the dummy verify always +/// returns `false`, exactly as the real reject paths do. +const TIMING_EQUALIZER_PHC: &str = + "$argon2id$v=19$m=19456,t=2,p=1$ZW5yb2xsdGltaW5nZXF6$tXiQXmQUAUVszrdp5HrVGIJtsQTidLuSKld0ITNv2Es"; + +/// Pay one Argon2id verify against [`TIMING_EQUALIZER_PHC`] and discard the result. +/// +/// Called on the early-reject enroll paths so they cost the same as a real +/// wrong-key reject (which pays the KDF in `verify_enrollment_key`). The boolean is +/// intentionally ignored — this exists purely for its side effect (CPU time). +#[inline] +fn equalize_reject_timing(presented_key: &str) { + // `verify_password` parses the PHC and runs Argon2id; a malformed constant would + // make it return Err and skip the KDF, defeating the purpose, so the constant is + // a known-valid PHC string (asserted in tests). + let _ = crate::auth::password::verify_password(presented_key, TIMING_EQUALIZER_PHC); +} + /// Standard error envelope (see `.claude/standards/api/response-format.md`), /// matching `api::machine_keys::ApiError`. #[derive(Debug, Serialize)] @@ -116,7 +165,10 @@ pub struct EnrollResponse { /// `"active"` (live, controllable, key issued) or `"pending"` (collision-gated; /// awaiting operator confirmation; no key issued). pub enrollment_state: String, - /// Disposition: `"new"` | `"reuse"` | `"site_move"` | `"collision_pending"`. + /// Disposition: `"new"` | `"reuse"` | `"collision_pending"`. (`"site_move"` is + /// NOT reachable on the Phase A unauthenticated path — a cross-site enroll is + /// refused with `409 ENROLL_SITE_CONFLICT` before any disposition is returned; + /// deliberate moves are the Phase-B `--reassign` flow.) pub disposition: String, } @@ -231,6 +283,9 @@ pub async fn enroll( let site = match db::sites::get_site_by_code(db.pool(), site_code, tenant_id).await { Ok(Some(s)) => s, Ok(None) => { + // Pay the Argon2id cost a real wrong-key reject would, so an unknown + // site_code is not distinguishable by timing (enumeration oracle). + equalize_reject_timing(&req.enrollment_key); state.rate_limits.enroll.record_failure(site_code, ip); audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ "reason": "unknown_site_code", @@ -239,8 +294,8 @@ pub async fn enroll( })) .await; tracing::warn!("[ENROLL] unknown site_code={} from {}", site_code, ip); - // Same opaque rejection shape as a bad key — do not reveal which of the - // two failed (avoids a site_code enumeration oracle). + // Same opaque rejection shape AND the same KDF cost as a bad key — do not + // reveal (by body or by timing) which of the two failed. return Err(ApiError::new( StatusCode::UNAUTHORIZED, "ENROLL_REJECTED", @@ -263,6 +318,9 @@ pub async fn enroll( let active_key = match db::enrollment_keys::get_active_for_site(db.pool(), site.id).await { Ok(Some(k)) => k, Ok(None) => { + // Pay the Argon2id cost a real wrong-key reject would, so "site exists but + // has no active key" is not distinguishable by timing from a bad key. + equalize_reject_timing(&req.enrollment_key); state.rate_limits.enroll.record_failure(site_code, ip); audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ "reason": "no_active_key", @@ -316,22 +374,33 @@ pub async fn enroll( .map(str::trim) .filter(|s| !s.is_empty()); - // Dedup on (tenant, machine_uid). - let existing = - match db::machines::get_machine_by_tenant_uid(db.pool(), tenant_id, machine_uid).await { - Ok(e) => e, - Err(e) => { - tracing::error!("[ENROLL] DB error on dedup lookup: {}", e); - return Err(ApiError::new( - StatusCode::INTERNAL_SERVER_ERROR, - "INTERNAL_ERROR", - "Internal server error", - )); - } - }; + // Dedup on (tenant, machine_uid). Wrapped in a bounded retry loop to resolve the + // first-enroll TOCTOU (SPEC-016 Phase A, LOW): the `None` branch does a lookup + // then an INSERT with no in-between lock, so two concurrent first-time enrolls of + // the same `machine_uid` both see `None` and both INSERT. The partial unique index + // on `machine_uid` makes the loser's INSERT raise a unique violation; instead of + // 500ing, the loser loops back, the lookup now returns `Some`, and it converges to + // the reuse path (one row, no error). One retry is sufficient — after a unique + // violation the winner's row is committed and visible — but we cap iterations + // defensively so a pathological repeat can never spin forever. + let mut attempts = 0u8; + loop { + attempts += 1; + let existing = + match db::machines::get_machine_by_tenant_uid(db.pool(), tenant_id, machine_uid).await { + Ok(e) => e, + Err(e) => { + tracing::error!("[ENROLL] DB error on dedup lookup: {}", e); + return Err(ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Internal server error", + )); + } + }; - match existing { - // -- Reuse / site-move / collision path ------------------------------- + match existing { + // -- Reuse / collision path ------------------------------------------- Some(existing) => { // Collision gate: a seemingly-different live endpoint resolving to an // existing uid -> pending, alert, NO usable cak_ minted. @@ -379,8 +448,53 @@ pub async fn enroll( )); } - // Legitimate reuse (re-image / re-install) or a site move. - let is_move = existing.site_id.map(|s| s != site.id).unwrap_or(true); + // Cross-site guard (SPEC-016 Phase A, HIGH): if this machine_uid is + // already bound to a DIFFERENT site, REFUSE — do not move it and do not + // mint a key. `machine_uid` is a raw, spoofable client string in Phase A, + // so a silent move here is a hijack vector: a holder of site B's key could + // claim a site-A machine merely by presenting its uid, repointing the row + // and minting a fresh `cak_` on site B. Deliberate moves are out of scope + // for Phase A and arrive with the Phase-B `--reassign` flow + dashboard + // (SPEC-016 §"Explicitly out of scope"); until then this is the explicit + // accidental-move / hijack guard the spec calls for. + // + // Only `Some(other_site)` is a conflict. `None` (a legacy / connect-path / + // support-code row that never had a relational site binding) is treated as + // a first-time bind to the enrolling site, NOT a cross-site move — there is + // no prior owning site to hijack from. + if let Some(existing_site) = existing.site_id { + if existing_site != site.id { + audit(db, db::events::EventTypes::ENROLL_SITE_CONFLICT, ip, json!({ + "machine_id": existing.id, + "machine_uid": machine_uid, + // Record the bound site id for the operator audit trail; the + // response body below leaks nothing about the other site. + "bound_site_id": existing_site, + "attempted_site_code": site_code, + "attempted_site_id": site.id, + })) + .await; + // TODO(SPEC-016): wire to #dev-alerts — cross-site enroll refused + // (possible accidental move or a cross-site claim attempt). + tracing::warn!( + "[ENROLL] cross-site conflict REFUSED: machine_id={} already bound to \ + a different site; attempted site_code={} from {}", + existing.id, site_code, ip + ); + // Opaque-enough: states the machine is already enrolled elsewhere + // and how to move it deliberately, without naming the other site. + return Err(ApiError::new( + StatusCode::CONFLICT, + "ENROLL_SITE_CONFLICT", + "This machine is already enrolled at another site. A deliberate \ + move requires the operator-initiated reassignment flow.", + )); + } + } + + // Same-site reuse (re-image / re-install) or first relational bind of a + // legacy/connect-path row (existing.site_id was NULL). No site move occurs + // on the unauthenticated path in Phase A. let params = enroll_params( &existing.agent_id, hostname, @@ -398,42 +512,26 @@ pub async fn enroll( let cak = mint_cak(db, machine.id, tenant_id).await?; - if is_move { - audit(db, db::events::EventTypes::ENROLL_SITE_MOVE, ip, json!({ - "machine_id": machine.id, - "machine_uid": machine_uid, - "from_site_id": existing.site_id, - "to_site_code": site_code, - "to_site_id": site.id, - })) - .await; - // TODO(SPEC-016): wire to #dev-alerts — a machine moved between sites. - tracing::info!( - "[ENROLL] site-move: machine_id={} machine_uid present, to site_code={} from {}", - machine.id, site_code, ip - ); - } else { - audit(db, db::events::EventTypes::ENROLL_REUSE, ip, json!({ - "machine_id": machine.id, - "machine_uid": machine_uid, - "site_code": site_code, - })) - .await; - tracing::info!( - "[ENROLL] reuse: machine_id={} re-enrolled at site_code={} from {}", - machine.id, site_code, ip - ); - } + audit(db, db::events::EventTypes::ENROLL_REUSE, ip, json!({ + "machine_id": machine.id, + "machine_uid": machine_uid, + "site_code": site_code, + })) + .await; + tracing::info!( + "[ENROLL] reuse: machine_id={} re-enrolled at site_code={} from {}", + machine.id, site_code, ip + ); - Ok(( + return Ok(( StatusCode::OK, Json(EnrollResponse { machine_id: machine.id, key: Some(cak), enrollment_state: "active".to_string(), - disposition: if is_move { "site_move" } else { "reuse" }.to_string(), + disposition: "reuse".to_string(), }), - )) + )); } // -- New enrollment ---------------------------------------------------- @@ -454,16 +552,31 @@ pub async fn enroll( &tags, "active", ); - let machine = db::machines::insert_enrolled_machine(db.pool(), ¶ms) - .await - .map_err(|e| { + let machine = match db::machines::insert_enrolled_machine(db.pool(), ¶ms).await { + Ok(m) => m, + // TOCTOU loser: a concurrent first-enroll of the same machine_uid won + // the race and committed its row between our lookup and this INSERT, so + // the partial unique index on `machine_uid` rejects ours. Loop back: the + // re-lookup now finds the winner's row and we converge to reuse instead + // of 500ing. Capped so a non-uid unique violation (or any persistent + // conflict) surfaces as a 500 rather than spinning. + Err(e) if is_machine_uid_conflict(&e) && attempts < 2 => { + tracing::info!( + "[ENROLL] concurrent first-enroll race on machine_uid; \ + retrying as reuse (site_code={} from {})", + site_code, ip + ); + continue; + } + Err(e) => { tracing::error!("[ENROLL] DB error inserting enrolled machine: {}", e); - ApiError::new( + return Err(ApiError::new( StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", "Failed to register machine", - ) - })?; + )); + } + }; let cak = mint_cak(db, machine.id, tenant_id).await?; @@ -480,7 +593,7 @@ pub async fn enroll( machine.id, hostname, site_code, ip ); - Ok(( + return Ok(( StatusCode::CREATED, Json(EnrollResponse { machine_id: machine.id, @@ -488,11 +601,31 @@ pub async fn enroll( enrollment_state: "active".to_string(), disposition: "new".to_string(), }), - )) + )); + } } } } +/// Is this DB error a unique-constraint violation involving the `machine_uid` +/// (the first-enroll TOCTOU race)? Postgres reports a unique violation as SQLSTATE +/// `23505`; we additionally require the constraint/message to reference `machine_uid` +/// so a violation on a different unique column (e.g. `agent_id`) is NOT swallowed as +/// a benign race — those are genuine errors and must still surface as 500. +fn is_machine_uid_conflict(e: &sqlx::Error) -> bool { + e.as_database_error() + .filter(|db_err| db_err.code().as_deref() == Some("23505")) + .map(|db_err| { + db_err + .constraint() + .map(|c| c.contains("machine_uid")) + // Fall back to the message when the driver exposes no constraint name + // (e.g. a partial-index violation), so the uid race is still caught. + .unwrap_or_else(|| db_err.message().contains("machine_uid")) + }) + .unwrap_or(false) +} + /// Fold `department` / `device_type` (no dedicated columns yet — SPEC-007) into the /// tag set as `department:` / `device_type:` so they are preserved rather than /// dropped, alongside any explicit tags. Empty/whitespace values are skipped. @@ -560,3 +693,247 @@ fn map_update_err(e: sqlx::Error) -> (StatusCode, Json) { "Failed to update machine", ) } + +#[cfg(test)] +mod tests { + use super::*; + + // ---- Fix #2: the timing-equalizer PHC constant is a real, valid Argon2id hash. + // If this constant were malformed, `verify_password` would return `Err` and skip + // the KDF entirely — defeating the whole point (the early-reject paths would NOT + // pay the Argon2id cost and the timing oracle would reopen). These tests are the + // standing guard that the constant keeps paying the KDF. + + #[test] + fn timing_equalizer_phc_is_a_valid_parseable_hash() { + // A wrong password must verify to `false` WITHOUT erroring — proving the PHC + // parsed and the KDF actually ran (a parse failure would surface as `Err`). + let res = crate::auth::password::verify_password("cek_anything_at_all", TIMING_EQUALIZER_PHC); + assert!( + res.is_ok(), + "TIMING_EQUALIZER_PHC must be a valid PHC string so the KDF runs; got Err: {:?}", + res.err() + ); + assert!( + !res.unwrap(), + "an arbitrary key must NOT match the throwaway equalizer hash" + ); + } + + #[test] + fn equalize_reject_timing_runs_without_panicking() { + // The early-reject helper must always complete (it pays the KDF and discards + // the result); a panic here would 500 a rejection path. + equalize_reject_timing("cek_some_presented_value"); + equalize_reject_timing(""); // even degenerate input must be safe + } + + // ---- Fix #1 support: the collision heuristic truth table is unchanged. ------- + + fn machine_with(status: &str, hostname: &str) -> db::machines::Machine { + db::machines::Machine { + id: Uuid::new_v4(), + agent_id: "agent-x".to_string(), + hostname: hostname.to_string(), + os_version: None, + is_elevated: false, + is_persistent: true, + first_seen: chrono::Utc::now(), + last_seen: chrono::Utc::now(), + last_session_id: None, + status: status.to_string(), + created_at: chrono::Utc::now(), + updated_at: chrono::Utc::now(), + tenant_id: None, + organization: None, + site: None, + tags: Vec::new(), + machine_uid: Some("uid-1".to_string()), + deleted_at: None, + site_id: None, + enrollment_state: "active".to_string(), + } + } + + #[test] + fn collision_only_when_online_and_different_host() { + // Online + different hostname => collision (the clone signature). + assert!(is_collision(&machine_with("online", "HOST-A"), "HOST-B")); + // Online + same hostname (case-insensitive) => reuse, not a collision. + assert!(!is_collision(&machine_with("online", "HOST-A"), "host-a")); + // Offline (the common re-image case) => never a collision, even if renamed. + assert!(!is_collision(&machine_with("offline", "HOST-A"), "HOST-B")); + } + + // ---- DB-gated tests (skip without TEST_DATABASE_URL; run in CI on Postgres). - + // These validate the mechanisms fixes #1 and #3 rely on against a REAL Postgres + // error/row, which cannot be faked on a workstation. The full handler is exercised + // end-to-end in CI; here we pin the load-bearing primitives. + + use sqlx::postgres::PgPoolOptions; + use sqlx::PgPool; + + async fn test_pool() -> Option { + let url = std::env::var("TEST_DATABASE_URL").ok()?; + let pool = PgPoolOptions::new() + .max_connections(2) + .connect(&url) + .await + .expect("connect to TEST_DATABASE_URL"); + sqlx::migrate!("./migrations") + .run(&pool) + .await + .expect("apply migrations to the test database"); + Some(pool) + } + + async fn cleanup(pool: &PgPool, uids: &[&str], site_codes: &[&str]) { + for uid in uids { + let _ = sqlx::query("DELETE FROM connect_machines WHERE machine_uid = $1") + .bind(uid) + .execute(pool) + .await; + } + for code in site_codes { + let _ = sqlx::query("DELETE FROM connect_sites WHERE site_code = $1") + .bind(code) + .execute(pool) + .await; + } + } + + /// Fix #1: after a machine is enrolled at site A, the dedup lookup returns the row + /// carrying `site_id == site_a`, which is exactly what the cross-site guard compares + /// against `site.id`. This pins the precondition that makes the refusal possible: + /// a site-B enroll resolves the SAME row and sees a DIFFERENT bound site_id. + #[tokio::test] + async fn enrolled_machine_exposes_bound_site_id_for_cross_site_guard() { + let Some(pool) = test_pool().await else { + return; // no TEST_DATABASE_URL: skip (runs in CI) + }; + let tenant = db::tenancy::current_tenant_id(); + let uid = "test-xsite-uid-001"; + cleanup(&pool, &[uid], &["test-xsite-A", "test-xsite-B"]).await; + + let site_a = db::sites::insert_site(&pool, "test-xsite-A", None, None, Some(tenant)) + .await + .expect("insert site A"); + let site_b = db::sites::insert_site(&pool, "test-xsite-B", None, None, Some(tenant)) + .await + .expect("insert site B"); + assert_ne!(site_a.id, site_b.id); + + let agent_id_a = format!("enroll-{}", Uuid::new_v4()); + let params = enroll_params( + &agent_id_a, + "HOST-XSITE", + uid, + tenant, + site_a.id, + None, + None, + &[], + "active", + ); + db::machines::insert_enrolled_machine(&pool, ¶ms) + .await + .expect("enroll at site A"); + + // A subsequent (site-B) enroll dedups on (tenant, uid) and resolves THIS row. + let found = db::machines::get_machine_by_tenant_uid(&pool, tenant, uid) + .await + .expect("dedup lookup") + .expect("row exists"); + + // The guard compares found.site_id (Some(A)) against the enrolling site (B); + // they differ => refusal. Prove the comparison the guard makes evaluates true. + assert_eq!(found.site_id, Some(site_a.id), "row is bound to site A"); + assert!( + found.site_id.is_some_and(|s| s != site_b.id), + "cross-site guard condition (bound to a different site) must hold" + ); + + cleanup(&pool, &[uid], &["test-xsite-A", "test-xsite-B"]).await; + } + + /// Fix #3: a duplicate first-enroll of the same `machine_uid` raises a Postgres + /// unique violation that `is_machine_uid_conflict` classifies as `true` (the race + /// the loop retries as reuse), while a DIFFERENT unique violation (`agent_id`) + /// classifies as `false` (a genuine error that must still 500). This is the exact + /// branch predicate of the TOCTOU retry, validated against real driver errors. + #[tokio::test] + async fn machine_uid_conflict_is_classified_but_agent_id_conflict_is_not() { + let Some(pool) = test_pool().await else { + return; // no TEST_DATABASE_URL: skip (runs in CI) + }; + let tenant = db::tenancy::current_tenant_id(); + let uid = "test-toctou-uid-001"; + cleanup(&pool, &[uid], &["test-toctou-A"]).await; + + let site = db::sites::insert_site(&pool, "test-toctou-A", None, None, Some(tenant)) + .await + .expect("insert site"); + + let shared_agent_id = format!("enroll-{}", Uuid::new_v4()); + let first = enroll_params( + &shared_agent_id, + "HOST-TOCTOU", + uid, + tenant, + site.id, + None, + None, + &[], + "active", + ); + db::machines::insert_enrolled_machine(&pool, &first) + .await + .expect("first enroll wins the race"); + + // Loser re-inserts the SAME machine_uid (fresh agent_id) -> machine_uid unique + // violation -> must be classified as the retryable race. + let loser_agent_id = format!("enroll-{}", Uuid::new_v4()); + let uid_dup = enroll_params( + &loser_agent_id, + "HOST-TOCTOU-2", + uid, + tenant, + site.id, + None, + None, + &[], + "active", + ); + let err = db::machines::insert_enrolled_machine(&pool, &uid_dup) + .await + .expect_err("duplicate machine_uid must violate the unique index"); + assert!( + is_machine_uid_conflict(&err), + "a machine_uid unique violation must be classified as the retryable race; got: {err:?}" + ); + + // A DIFFERENT unique violation (reusing the agent_id with a NEW uid) must NOT be + // swallowed as the race — it is a genuine error that still surfaces as 500. + let uid_other = "test-toctou-uid-002"; + let agent_dup = enroll_params( + &shared_agent_id, // collides on agent_id UNIQUE, not machine_uid + "HOST-TOCTOU-3", + uid_other, + tenant, + site.id, + None, + None, + &[], + "active", + ); + let err2 = db::machines::insert_enrolled_machine(&pool, &agent_dup) + .await + .expect_err("duplicate agent_id must violate its unique constraint"); + assert!( + !is_machine_uid_conflict(&err2), + "an agent_id unique violation must NOT be misclassified as the machine_uid race; got: {err2:?}" + ); + + cleanup(&pool, &[uid, uid_other], &["test-toctou-A"]).await; + } +} diff --git a/server/src/db/events.rs b/server/src/db/events.rs index f42fb39..b850ef4 100644 --- a/server/src/db/events.rs +++ b/server/src/db/events.rs @@ -82,7 +82,18 @@ impl EventTypes { pub const ENROLL_REUSE: &'static str = "enroll_reuse"; /// An existing machine_uid enrolled under a DIFFERENT site — the machine's site /// binding was updated (a "site move"). Fires an alert. + /// + /// NOTE (SPEC-016 Phase A): the unauthenticated enroll path does NOT perform this + /// move — a cross-site enroll is REFUSED (`ENROLL_SITE_CONFLICT`) rather than + /// silently repointing the machine. This event is reserved for the deliberate + /// Phase-B `--reassign` flow (and the dashboard move action) that supersede it. + #[allow(dead_code)] // reserved for Phase-B --reassign; not emitted by Phase A enroll pub const ENROLL_SITE_MOVE: &'static str = "enroll_site_move"; + /// An existing machine_uid presented a valid key for a DIFFERENT site than the one + /// the machine is currently bound to. Phase A REFUSES this (no move, no key minted) + /// as the accidental-move / cross-site-hijack guard; the deliberate move arrives + /// with the Phase-B `--reassign` flow + dashboard. Fires an alert. + pub const ENROLL_SITE_CONFLICT: &'static str = "enroll_site_conflict"; /// A machine_uid collision was detected at enroll — the endpoint dropped to /// `pending` and awaits operator confirmation in the dashboard. Fires an alert. pub const ENROLL_COLLISION_PENDING: &'static str = "enroll_collision_pending"; From 4106fc4bc4edd752f098aaf59b3b542091892a4c Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Tue, 2 Jun 2026 10:48:51 -0700 Subject: [PATCH 3/3] style(enroll): cargo fmt --all (satisfy CI fmt gate) The Phase A work passed cargo check + clippy + tests locally but missed `cargo fmt --all -- --check` (the first step of the Linux CI job): module ordering in db/mod.rs and two trailing-comment alignments in rate_limit.rs. No logic change. Agent build failure on the prior run was transient infra (verified: agent crate compiles clean locally). Co-Authored-By: Claude Opus 4.8 (1M context) --- server/src/api/enroll.rs | 499 ++++++++++++++++------------ server/src/auth/enrollment_keys.rs | 5 +- server/src/db/enrollment_keys.rs | 11 +- server/src/db/mod.rs | 2 +- server/src/middleware/rate_limit.rs | 4 +- 5 files changed, 296 insertions(+), 225 deletions(-) diff --git a/server/src/api/enroll.rs b/server/src/api/enroll.rs index b0cf69f..2aaa1eb 100644 --- a/server/src/api/enroll.rs +++ b/server/src/api/enroll.rs @@ -218,11 +218,7 @@ fn is_collision(existing: &db::machines::Machine, incoming_hostname: &str) -> bo /// Mint a `cak_`, store its hash bound to `machine_id` + tenant, and return the /// plaintext. Shared by the new/reuse/move active paths. -async fn mint_cak( - db: &db::Database, - machine_id: Uuid, - tenant_id: Uuid, -) -> ApiResult { +async fn mint_cak(db: &db::Database, machine_id: Uuid, tenant_id: Uuid) -> ApiResult { let plaintext = agent_keys::generate_agent_key(); let key_hash = agent_keys::hash_agent_key(&plaintext); db::agent_keys::insert_agent_key(db.pool(), machine_id, &key_hash, Some(tenant_id)) @@ -287,11 +283,16 @@ pub async fn enroll( // site_code is not distinguishable by timing (enumeration oracle). equalize_reject_timing(&req.enrollment_key); state.rate_limits.enroll.record_failure(site_code, ip); - audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ - "reason": "unknown_site_code", - "site_code": site_code, - "machine_uid": machine_uid, - })) + audit( + db, + db::events::EventTypes::ENROLL_REJECTED, + ip, + json!({ + "reason": "unknown_site_code", + "site_code": site_code, + "machine_uid": machine_uid, + }), + ) .await; tracing::warn!("[ENROLL] unknown site_code={} from {}", site_code, ip); // Same opaque rejection shape AND the same KDF cost as a bad key — do not @@ -322,13 +323,21 @@ pub async fn enroll( // has no active key" is not distinguishable by timing from a bad key. equalize_reject_timing(&req.enrollment_key); state.rate_limits.enroll.record_failure(site_code, ip); - audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ - "reason": "no_active_key", - "site_code": site_code, - "machine_uid": machine_uid, - })) + audit( + db, + db::events::EventTypes::ENROLL_REJECTED, + ip, + json!({ + "reason": "no_active_key", + "site_code": site_code, + "machine_uid": machine_uid, + }), + ) .await; - tracing::warn!("[ENROLL] no active enrollment key for site_code={}", site_code); + tracing::warn!( + "[ENROLL] no active enrollment key for site_code={}", + site_code + ); return Err(ApiError::new( StatusCode::UNAUTHORIZED, "ENROLL_REJECTED", @@ -347,13 +356,22 @@ pub async fn enroll( if !enrollment_keys::verify_enrollment_key(&req.enrollment_key, &active_key.key_hash) { state.rate_limits.enroll.record_failure(site_code, ip); - audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ - "reason": "bad_enrollment_key", - "site_code": site_code, - "machine_uid": machine_uid, - })) + audit( + db, + db::events::EventTypes::ENROLL_REJECTED, + ip, + json!({ + "reason": "bad_enrollment_key", + "site_code": site_code, + "machine_uid": machine_uid, + }), + ) .await; - tracing::warn!("[ENROLL] bad enrollment key for site_code={} from {}", site_code, ip); + tracing::warn!( + "[ENROLL] bad enrollment key for site_code={} from {}", + site_code, + ip + ); return Err(ApiError::new( StatusCode::UNAUTHORIZED, "ENROLL_REJECTED", @@ -366,7 +384,12 @@ pub async fn enroll( // Build the label/identity params shared by the create/update paths. let tags = effective_tags(&req.labels); - let company = req.labels.company.as_deref().map(str::trim).filter(|s| !s.is_empty()); + let company = req + .labels + .company + .as_deref() + .map(str::trim) + .filter(|s| !s.is_empty()); let site_label = req .labels .site @@ -386,25 +409,137 @@ pub async fn enroll( let mut attempts = 0u8; loop { attempts += 1; - let existing = - match db::machines::get_machine_by_tenant_uid(db.pool(), tenant_id, machine_uid).await { - Ok(e) => e, - Err(e) => { - tracing::error!("[ENROLL] DB error on dedup lookup: {}", e); - return Err(ApiError::new( - StatusCode::INTERNAL_SERVER_ERROR, - "INTERNAL_ERROR", - "Internal server error", - )); - } - }; + let existing = match db::machines::get_machine_by_tenant_uid( + db.pool(), + tenant_id, + machine_uid, + ) + .await + { + Ok(e) => e, + Err(e) => { + tracing::error!("[ENROLL] DB error on dedup lookup: {}", e); + return Err(ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Internal server error", + )); + } + }; match existing { - // -- Reuse / collision path ------------------------------------------- - Some(existing) => { - // Collision gate: a seemingly-different live endpoint resolving to an - // existing uid -> pending, alert, NO usable cak_ minted. - if is_collision(&existing, hostname) { + // -- Reuse / collision path ------------------------------------------- + Some(existing) => { + // Collision gate: a seemingly-different live endpoint resolving to an + // existing uid -> pending, alert, NO usable cak_ minted. + if is_collision(&existing, hostname) { + let params = enroll_params( + &existing.agent_id, + hostname, + machine_uid, + tenant_id, + site.id, + company, + site_label, + &tags, + "pending", + ); + let machine = + db::machines::update_enrolled_machine(db.pool(), existing.id, ¶ms) + .await + .map_err(map_update_err)?; + + audit( + db, + db::events::EventTypes::ENROLL_COLLISION_PENDING, + ip, + json!({ + "machine_id": machine.id, + "machine_uid": machine_uid, + "site_code": site_code, + "existing_hostname": existing.hostname, + "incoming_hostname": hostname, + "heuristic": "online_existing_with_different_hostname (PROVISIONAL)", + }), + ) + .await; + // TODO(SPEC-016): wire to #dev-alerts — collision requires operator + // confirmation in the dashboard before this endpoint may activate. + tracing::warn!( + "[ENROLL] machine_uid collision -> PENDING: machine_id={} site_code={} \ + existing_host={} incoming_host={} from {}", + machine.id, + site_code, + existing.hostname, + hostname, + ip + ); + + return Ok(( + StatusCode::ACCEPTED, + Json(EnrollResponse { + machine_id: machine.id, + key: None, + enrollment_state: "pending".to_string(), + disposition: "collision_pending".to_string(), + }), + )); + } + + // Cross-site guard (SPEC-016 Phase A, HIGH): if this machine_uid is + // already bound to a DIFFERENT site, REFUSE — do not move it and do not + // mint a key. `machine_uid` is a raw, spoofable client string in Phase A, + // so a silent move here is a hijack vector: a holder of site B's key could + // claim a site-A machine merely by presenting its uid, repointing the row + // and minting a fresh `cak_` on site B. Deliberate moves are out of scope + // for Phase A and arrive with the Phase-B `--reassign` flow + dashboard + // (SPEC-016 §"Explicitly out of scope"); until then this is the explicit + // accidental-move / hijack guard the spec calls for. + // + // Only `Some(other_site)` is a conflict. `None` (a legacy / connect-path / + // support-code row that never had a relational site binding) is treated as + // a first-time bind to the enrolling site, NOT a cross-site move — there is + // no prior owning site to hijack from. + if let Some(existing_site) = existing.site_id { + if existing_site != site.id { + audit( + db, + db::events::EventTypes::ENROLL_SITE_CONFLICT, + ip, + json!({ + "machine_id": existing.id, + "machine_uid": machine_uid, + // Record the bound site id for the operator audit trail; the + // response body below leaks nothing about the other site. + "bound_site_id": existing_site, + "attempted_site_code": site_code, + "attempted_site_id": site.id, + }), + ) + .await; + // TODO(SPEC-016): wire to #dev-alerts — cross-site enroll refused + // (possible accidental move or a cross-site claim attempt). + tracing::warn!( + "[ENROLL] cross-site conflict REFUSED: machine_id={} already bound to \ + a different site; attempted site_code={} from {}", + existing.id, + site_code, + ip + ); + // Opaque-enough: states the machine is already enrolled elsewhere + // and how to move it deliberately, without naming the other site. + return Err(ApiError::new( + StatusCode::CONFLICT, + "ENROLL_SITE_CONFLICT", + "This machine is already enrolled at another site. A deliberate \ + move requires the operator-initiated reassignment flow.", + )); + } + } + + // Same-site reuse (re-image / re-install) or first relational bind of a + // legacy/connect-path row (existing.site_id was NULL). No site move occurs + // on the unauthenticated path in Phase A. let params = enroll_params( &existing.agent_id, hostname, @@ -414,195 +549,123 @@ pub async fn enroll( company, site_label, &tags, - "pending", + "active", ); - let machine = db::machines::update_enrolled_machine(db.pool(), existing.id, ¶ms) - .await - .map_err(map_update_err)?; + let machine = + db::machines::update_enrolled_machine(db.pool(), existing.id, ¶ms) + .await + .map_err(map_update_err)?; - audit(db, db::events::EventTypes::ENROLL_COLLISION_PENDING, ip, json!({ - "machine_id": machine.id, - "machine_uid": machine_uid, - "site_code": site_code, - "existing_hostname": existing.hostname, - "incoming_hostname": hostname, - "heuristic": "online_existing_with_different_hostname (PROVISIONAL)", - })) + let cak = mint_cak(db, machine.id, tenant_id).await?; + + audit( + db, + db::events::EventTypes::ENROLL_REUSE, + ip, + json!({ + "machine_id": machine.id, + "machine_uid": machine_uid, + "site_code": site_code, + }), + ) .await; - // TODO(SPEC-016): wire to #dev-alerts — collision requires operator - // confirmation in the dashboard before this endpoint may activate. - tracing::warn!( - "[ENROLL] machine_uid collision -> PENDING: machine_id={} site_code={} \ - existing_host={} incoming_host={} from {}", - machine.id, site_code, existing.hostname, hostname, ip + tracing::info!( + "[ENROLL] reuse: machine_id={} re-enrolled at site_code={} from {}", + machine.id, + site_code, + ip ); return Ok(( - StatusCode::ACCEPTED, + StatusCode::OK, Json(EnrollResponse { machine_id: machine.id, - key: None, - enrollment_state: "pending".to_string(), - disposition: "collision_pending".to_string(), + key: Some(cak), + enrollment_state: "active".to_string(), + disposition: "reuse".to_string(), }), )); } - // Cross-site guard (SPEC-016 Phase A, HIGH): if this machine_uid is - // already bound to a DIFFERENT site, REFUSE — do not move it and do not - // mint a key. `machine_uid` is a raw, spoofable client string in Phase A, - // so a silent move here is a hijack vector: a holder of site B's key could - // claim a site-A machine merely by presenting its uid, repointing the row - // and minting a fresh `cak_` on site B. Deliberate moves are out of scope - // for Phase A and arrive with the Phase-B `--reassign` flow + dashboard - // (SPEC-016 §"Explicitly out of scope"); until then this is the explicit - // accidental-move / hijack guard the spec calls for. - // - // Only `Some(other_site)` is a conflict. `None` (a legacy / connect-path / - // support-code row that never had a relational site binding) is treated as - // a first-time bind to the enrolling site, NOT a cross-site move — there is - // no prior owning site to hijack from. - if let Some(existing_site) = existing.site_id { - if existing_site != site.id { - audit(db, db::events::EventTypes::ENROLL_SITE_CONFLICT, ip, json!({ - "machine_id": existing.id, - "machine_uid": machine_uid, - // Record the bound site id for the operator audit trail; the - // response body below leaks nothing about the other site. - "bound_site_id": existing_site, - "attempted_site_code": site_code, - "attempted_site_id": site.id, - })) - .await; - // TODO(SPEC-016): wire to #dev-alerts — cross-site enroll refused - // (possible accidental move or a cross-site claim attempt). - tracing::warn!( - "[ENROLL] cross-site conflict REFUSED: machine_id={} already bound to \ - a different site; attempted site_code={} from {}", - existing.id, site_code, ip - ); - // Opaque-enough: states the machine is already enrolled elsewhere - // and how to move it deliberately, without naming the other site. - return Err(ApiError::new( - StatusCode::CONFLICT, - "ENROLL_SITE_CONFLICT", - "This machine is already enrolled at another site. A deliberate \ - move requires the operator-initiated reassignment flow.", - )); - } - } - - // Same-site reuse (re-image / re-install) or first relational bind of a - // legacy/connect-path row (existing.site_id was NULL). No site move occurs - // on the unauthenticated path in Phase A. - let params = enroll_params( - &existing.agent_id, - hostname, - machine_uid, - tenant_id, - site.id, - company, - site_label, - &tags, - "active", - ); - let machine = db::machines::update_enrolled_machine(db.pool(), existing.id, ¶ms) - .await - .map_err(map_update_err)?; - - let cak = mint_cak(db, machine.id, tenant_id).await?; - - audit(db, db::events::EventTypes::ENROLL_REUSE, ip, json!({ - "machine_id": machine.id, - "machine_uid": machine_uid, - "site_code": site_code, - })) - .await; - tracing::info!( - "[ENROLL] reuse: machine_id={} re-enrolled at site_code={} from {}", - machine.id, site_code, ip - ); - - return Ok(( - StatusCode::OK, - Json(EnrollResponse { - machine_id: machine.id, - key: Some(cak), - enrollment_state: "active".to_string(), - disposition: "reuse".to_string(), - }), - )); - } - - // -- New enrollment ---------------------------------------------------- - None => { - // Fresh opaque agent_id for the new row's `agent_id UNIQUE` column. The - // agent's own config-UUID story is Phase B; the server only needs a - // unique non-null value here, and the authoritative identity is the - // minted cak_ -> machine binding. - let agent_id = format!("enroll-{}", Uuid::new_v4()); - let params = enroll_params( - &agent_id, - hostname, - machine_uid, - tenant_id, - site.id, - company, - site_label, - &tags, - "active", - ); - let machine = match db::machines::insert_enrolled_machine(db.pool(), ¶ms).await { - Ok(m) => m, - // TOCTOU loser: a concurrent first-enroll of the same machine_uid won - // the race and committed its row between our lookup and this INSERT, so - // the partial unique index on `machine_uid` rejects ours. Loop back: the - // re-lookup now finds the winner's row and we converge to reuse instead - // of 500ing. Capped so a non-uid unique violation (or any persistent - // conflict) surfaces as a 500 rather than spinning. - Err(e) if is_machine_uid_conflict(&e) && attempts < 2 => { - tracing::info!( - "[ENROLL] concurrent first-enroll race on machine_uid; \ + // -- New enrollment ---------------------------------------------------- + None => { + // Fresh opaque agent_id for the new row's `agent_id UNIQUE` column. The + // agent's own config-UUID story is Phase B; the server only needs a + // unique non-null value here, and the authoritative identity is the + // minted cak_ -> machine binding. + let agent_id = format!("enroll-{}", Uuid::new_v4()); + let params = enroll_params( + &agent_id, + hostname, + machine_uid, + tenant_id, + site.id, + company, + site_label, + &tags, + "active", + ); + let machine = match db::machines::insert_enrolled_machine(db.pool(), ¶ms).await + { + Ok(m) => m, + // TOCTOU loser: a concurrent first-enroll of the same machine_uid won + // the race and committed its row between our lookup and this INSERT, so + // the partial unique index on `machine_uid` rejects ours. Loop back: the + // re-lookup now finds the winner's row and we converge to reuse instead + // of 500ing. Capped so a non-uid unique violation (or any persistent + // conflict) surfaces as a 500 rather than spinning. + Err(e) if is_machine_uid_conflict(&e) && attempts < 2 => { + tracing::info!( + "[ENROLL] concurrent first-enroll race on machine_uid; \ retrying as reuse (site_code={} from {})", - site_code, ip - ); - continue; - } - Err(e) => { - tracing::error!("[ENROLL] DB error inserting enrolled machine: {}", e); - return Err(ApiError::new( - StatusCode::INTERNAL_SERVER_ERROR, - "INTERNAL_ERROR", - "Failed to register machine", - )); - } - }; + site_code, + ip + ); + continue; + } + Err(e) => { + tracing::error!("[ENROLL] DB error inserting enrolled machine: {}", e); + return Err(ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Failed to register machine", + )); + } + }; - let cak = mint_cak(db, machine.id, tenant_id).await?; + let cak = mint_cak(db, machine.id, tenant_id).await?; - audit(db, db::events::EventTypes::ENROLL_NEW, ip, json!({ - "machine_id": machine.id, - "machine_uid": machine_uid, - "site_code": site_code, - "hostname": hostname, - })) - .await; - // TODO(SPEC-016): wire to #dev-alerts — new-enrollment tripwire. - tracing::info!( - "[ENROLL] new: machine_id={} hostname={} site_code={} from {}", - machine.id, hostname, site_code, ip - ); + audit( + db, + db::events::EventTypes::ENROLL_NEW, + ip, + json!({ + "machine_id": machine.id, + "machine_uid": machine_uid, + "site_code": site_code, + "hostname": hostname, + }), + ) + .await; + // TODO(SPEC-016): wire to #dev-alerts — new-enrollment tripwire. + tracing::info!( + "[ENROLL] new: machine_id={} hostname={} site_code={} from {}", + machine.id, + hostname, + site_code, + ip + ); - return Ok(( - StatusCode::CREATED, - Json(EnrollResponse { - machine_id: machine.id, - key: Some(cak), - enrollment_state: "active".to_string(), - disposition: "new".to_string(), - }), - )); - } + return Ok(( + StatusCode::CREATED, + Json(EnrollResponse { + machine_id: machine.id, + key: Some(cak), + enrollment_state: "active".to_string(), + disposition: "new".to_string(), + }), + )); + } } } } @@ -636,7 +699,12 @@ fn effective_tags(labels: &EnrollLabels) -> Vec { .map(|t| t.trim().to_string()) .filter(|t| !t.is_empty()) .collect(); - if let Some(d) = labels.department.as_deref().map(str::trim).filter(|s| !s.is_empty()) { + if let Some(d) = labels + .department + .as_deref() + .map(str::trim) + .filter(|s| !s.is_empty()) + { tags.push(format!("department:{}", d)); } if let Some(d) = labels @@ -708,7 +776,8 @@ mod tests { fn timing_equalizer_phc_is_a_valid_parseable_hash() { // A wrong password must verify to `false` WITHOUT erroring — proving the PHC // parsed and the KDF actually ran (a parse failure would surface as `Err`). - let res = crate::auth::password::verify_password("cek_anything_at_all", TIMING_EQUALIZER_PHC); + let res = + crate::auth::password::verify_password("cek_anything_at_all", TIMING_EQUALIZER_PHC); assert!( res.is_ok(), "TIMING_EQUALIZER_PHC must be a valid PHC string so the KDF runs; got Err: {:?}", diff --git a/server/src/auth/enrollment_keys.rs b/server/src/auth/enrollment_keys.rs index 2b63b38..e32e2d0 100644 --- a/server/src/auth/enrollment_keys.rs +++ b/server/src/auth/enrollment_keys.rs @@ -178,7 +178,10 @@ mod tests { #[test] fn fingerprint_differs_per_key() { - assert_ne!(compute_fingerprint("cek_aaa"), compute_fingerprint("cek_bbb")); + assert_ne!( + compute_fingerprint("cek_aaa"), + compute_fingerprint("cek_bbb") + ); } #[test] diff --git a/server/src/db/enrollment_keys.rs b/server/src/db/enrollment_keys.rs index 96c27bd..82715f8 100644 --- a/server/src/db/enrollment_keys.rs +++ b/server/src/db/enrollment_keys.rs @@ -102,12 +102,11 @@ pub async fn rotate_key( let mut tx = pool.begin().await?; // Highest existing version for this site (NULL -> 0 so the first key is v1). - let current_max: Option = sqlx::query_scalar( - "SELECT MAX(version) FROM site_enrollment_keys WHERE site_id = $1", - ) - .bind(site_id) - .fetch_one(&mut *tx) - .await?; + let current_max: Option = + sqlx::query_scalar("SELECT MAX(version) FROM site_enrollment_keys WHERE site_id = $1") + .bind(site_id) + .fetch_one(&mut *tx) + .await?; let next_version = current_max.unwrap_or(0) + 1; // Deactivate the current active key (if any), stamping rotated_at. diff --git a/server/src/db/mod.rs b/server/src/db/mod.rs index 97afb9e..a4432a3 100644 --- a/server/src/db/mod.rs +++ b/server/src/db/mod.rs @@ -7,9 +7,9 @@ pub mod agent_keys; pub mod enrollment_keys; pub mod events; pub mod machines; -pub mod sites; pub mod releases; pub mod sessions; +pub mod sites; pub mod support_codes; pub mod tenancy; pub mod users; diff --git a/server/src/middleware/rate_limit.rs b/server/src/middleware/rate_limit.rs index 72f21e7..eeb420e 100644 --- a/server/src/middleware/rate_limit.rs +++ b/server/src/middleware/rate_limit.rs @@ -708,7 +708,7 @@ mod tests { let t0 = Instant::now(); assert!(lim.check_at("SITE-A", ip(1), t0)); assert!(!lim.check_at("SITE-A", ip(1), t0)); // same key over cap - // Different site, same IP -> independent bucket. + // Different site, same IP -> independent bucket. assert!(lim.check_at("SITE-B", ip(1), t0)); // Same site, different IP -> independent bucket. assert!(lim.check_at("SITE-A", ip(2), t0)); @@ -723,7 +723,7 @@ mod tests { // Not yet tripped: a check still admits. assert!(lim.check_at("SITE-A", ip(1), t0)); lim.record_failure_at("SITE-A", ip(1), t0); // 3rd -> trips - // Now locked out: check denies even though under the request cap. + // Now locked out: check denies even though under the request cap. assert!(!lim.check_at("SITE-A", ip(1), t0)); }