//! Zero-touch per-site self-registration endpoint (SPEC-016, Phase A). //! //! `POST /api/enroll` is the PUBLIC (no-JWT) door a managed agent walks through on //! first run: it presents its site's `site_code` + the long per-site enrollment //! key (`cek_`) and its machine-derived `machine_uid`, and the server — if the key //! verifies — dedups on `(tenant, machine_uid)`, creates or reuses the machine row, //! and mints the per-machine `cak_` operating credential, returning the plaintext //! `cak_` exactly once. //! //! Two-tier credential model (SPEC-016 §Security): the enrollment key is the //! low-sensitivity, rotatable, per-site GATE ("may register"); the minted `cak_` is //! the high-sensitivity, per-machine, independently-revocable OPERATING credential //! the relay (`relay::validate_agent_api_key`) already accepts. This handler only //! MINTS a `cak_` in the exact stored form `verify_agent_key` expects (SHA-256 hash //! in `connect_agent_keys`) — it does not touch the relay auth path. //! //! AUTH POSTURE: auto-approve (ScreenConnect parity) — a clean enroll is live and //! controllable immediately, with the new-enrollment alert as the tripwire. The one //! exception is a detected `machine_uid` collision, which gates the machine to //! `enrollment_state = 'pending'`. //! //! What the collision gate actually does in Phase A (be precise — it is NOT a full //! quarantine yet): it sets `enrollment_state = 'pending'` and withholds a *newly //! minted* `cak_` (the response carries no key). It does NOT revoke a `cak_` the //! machine may already hold from a prior clean enroll, and NOTHING on the relay / //! control plane consults `enrollment_state` yet — so a colliding machine that was //! already enrolled keeps its working credential and remains controllable until the //! enforcement below lands. The gate today is an audit + alert tripwire that asks an //! operator to confirm in the dashboard, not a hard control-plane block. //! //! TODO(SPEC-016 Phase B/D): relay/control plane must refuse machines with //! enrollment_state='pending'; consider revoking existing cak_ on collision. //! //! CROSS-SITE: Phase A does NOT move a machine between sites. A valid key for site B //! presented for a `machine_uid` already bound to site A is REFUSED //! (`ENROLL_SITE_CONFLICT`, 409) — no move, no key — as the accidental-move / //! cross-site-hijack guard (`machine_uid` is a raw, spoofable client string in Phase //! A). Deliberate moves arrive with the Phase-B `--reassign` flow + dashboard //! (SPEC-016 §"Explicitly out of scope" / `--reassign`). //! //! SECURITY: never log the enrollment key, the minted `cak_`, or any hash. The //! plaintext `cak_` appears only in the success response body, once. use std::net::IpAddr; use std::net::SocketAddr; use axum::{ extract::{ConnectInfo, State}, http::{HeaderMap, StatusCode}, Json, }; use serde::{Deserialize, Serialize}; use serde_json::json; use uuid::Uuid; use crate::auth::{agent_keys, enrollment_keys}; use crate::db; use crate::AppState; /// A fixed, valid Argon2id PHC hash used ONLY to equalize timing on the early-reject /// enroll paths (unknown `site_code`, no active key). Those paths would otherwise /// return before paying the KDF, while a wrong key pays the full Argon2id verify — /// a timing oracle that distinguishes "this site_code/active-key exists" from /// "rejected at the key check". On every early reject we run a throwaway verify of /// the supplied key against THIS constant and discard the result, so all rejection /// paths pay one Argon2id verify. /// /// It is the Argon2id (V0x13, default params — matching [`crate::auth::password`]) /// hash of the byte string `"enroll-timing-equalizer-throwaway"` under a fixed salt /// — generated offline, committed verbatim, and asserted valid + verifying in the /// tests below. It guards NOTHING: it is never a credential, never compared for /// auth, and the password it encodes is public. A real key can never equal it /// (Phase A keys are `cek_`-prefixed 256-bit randoms), so the dummy verify always /// returns `false`, exactly as the real reject paths do. const TIMING_EQUALIZER_PHC: &str = "$argon2id$v=19$m=19456,t=2,p=1$ZW5yb2xsdGltaW5nZXF6$tXiQXmQUAUVszrdp5HrVGIJtsQTidLuSKld0ITNv2Es"; /// Pay one Argon2id verify against [`TIMING_EQUALIZER_PHC`] and discard the result. /// /// Called on the early-reject enroll paths so they cost the same as a real /// wrong-key reject (which pays the KDF in `verify_enrollment_key`). The boolean is /// intentionally ignored — this exists purely for its side effect (CPU time). #[inline] fn equalize_reject_timing(presented_key: &str) { // `verify_password` parses the PHC and runs Argon2id; a malformed constant would // make it return Err and skip the KDF, defeating the purpose, so the constant is // a known-valid PHC string (asserted in tests). let _ = crate::auth::password::verify_password(presented_key, TIMING_EQUALIZER_PHC); } /// Standard error envelope (see `.claude/standards/api/response-format.md`), /// matching `api::machine_keys::ApiError`. #[derive(Debug, Serialize)] pub struct ApiError { pub detail: String, pub error_code: String, pub status_code: u16, } impl ApiError { fn new(status: StatusCode, code: &str, detail: &str) -> (StatusCode, Json) { ( status, Json(ApiError { detail: detail.to_string(), error_code: code.to_string(), status_code: status.as_u16(), }), ) } } type ApiResult = Result)>; /// Labels an installer carries for the machines it enrolls (SPEC-016 §3). /// /// All optional: a thin installer may carry only company/site. `company` -> /// `connect_machines.organization`; `site` -> `connect_machines.site` (the /// free-text label, distinct from the relational site binding resolved from /// `site_code`). `department` / `device_type` are reserved label fields (SPEC-007 /// AgentStatus parity) — accepted and folded into `tags` for now (no dedicated /// columns yet), so they are not silently dropped. #[derive(Debug, Default, Deserialize)] pub struct EnrollLabels { #[serde(default)] pub company: Option, #[serde(default)] pub site: Option, #[serde(default)] pub department: Option, #[serde(default)] pub device_type: Option, #[serde(default)] pub tags: Vec, } /// `POST /api/enroll` request body (SPEC-016 §3). #[derive(Debug, Deserialize)] pub struct EnrollRequest { pub site_code: String, /// The per-site enrollment secret (`cek_`). Verified against the site's active /// hashed key; never logged. pub enrollment_key: String, /// Opaque caller-supplied stable machine identity. Phase A treats this as an /// opaque string; the hardware-salted derivation is Phase B (agent-side). pub machine_uid: String, pub hostname: String, #[serde(default)] pub labels: EnrollLabels, } /// `POST /api/enroll` success response. /// /// On a clean (active) enroll, `key` carries the plaintext `cak_` ONCE. On a /// collision-gated `pending` enroll, `key` is `None` and `enrollment_state` is /// `"pending"` — no usable operating credential is issued until an operator /// confirms the endpoint in the dashboard. #[derive(Debug, Serialize)] pub struct EnrollResponse { /// `connect_machines.id` for the enrolled machine. pub machine_id: Uuid, /// The minted plaintext `cak_`, present ONLY for an active enroll, ONLY here. #[serde(skip_serializing_if = "Option::is_none")] pub key: Option, /// `"active"` (live, controllable, key issued) or `"pending"` (collision-gated; /// awaiting operator confirmation; no key issued). pub enrollment_state: String, /// Disposition: `"new"` | `"reuse"` | `"collision_pending"`. (`"site_move"` is /// NOT reachable on the Phase A unauthenticated path — a cross-site enroll is /// refused with `409 ENROLL_SITE_CONFLICT` before any disposition is returned; /// deliberate moves are the Phase-B `--reassign` flow.) pub disposition: String, } fn require_db(state: &AppState) -> ApiResult<&db::Database> { state.db.as_ref().ok_or_else(|| { ApiError::new( StatusCode::SERVICE_UNAVAILABLE, "DATABASE_UNAVAILABLE", "Database not available", ) }) } /// Collision-gate heuristic (PROVISIONAL — SPEC-016 §item-1/6). /// /// The residual collision case is template-cloned VMs that share a hardware UUID /// (some hypervisors clone the SMBIOS UUID), so a *different* physical endpoint /// resolves to an existing `machine_uid`. We cannot distinguish that from a benign /// re-image purely from a client-asserted uid, so the gate is intentionally /// CONSERVATIVE and the heuristic is provisional, to be tightened in planning /// (which durable hardware signals feed the uid, and the hypervisor behavior /// matrix — see SPEC-016 §Remaining-for-planning). /// /// PROVISIONAL heuristic chosen for Phase A: treat it as a collision when the /// matched existing machine /// (a) is currently considered ONLINE (status == "online"), AND /// (b) reports a DIFFERENT hostname than the incoming request, /// i.e. an apparently-live box already owns this uid yet a second box with a /// different name is enrolling against it concurrently — the clone signature. A /// case-insensitive hostname compare avoids false positives from case drift. An /// OFFLINE matched row (the common re-image / re-install case) is NOT treated as a /// collision — that is the legitimate reuse path. A same-hostname match is reuse, /// never a collision. /// /// Rationale for conservatism: a false POSITIVE merely sends a real machine to /// `pending` (an operator clicks confirm — annoying, recoverable); a false NEGATIVE /// would auto-activate a cloned endpoint (worse). When the salt set is finalized in /// planning this should become "uid is stable hardware, so a genuine clone is /// expected to be rare; gate on the salt's distinguishing component instead." fn is_collision(existing: &db::machines::Machine, incoming_hostname: &str) -> bool { let online = existing.status.eq_ignore_ascii_case("online"); let different_host = !existing .hostname .eq_ignore_ascii_case(incoming_hostname.trim()); online && different_host } /// Mint a `cak_`, store its hash bound to `machine_id` + tenant, and return the /// plaintext. Shared by the new/reuse/move active paths. async fn mint_cak( db: &db::Database, machine_id: Uuid, tenant_id: Uuid, ) -> ApiResult { let plaintext = agent_keys::generate_agent_key(); let key_hash = agent_keys::hash_agent_key(&plaintext); db::agent_keys::insert_agent_key(db.pool(), machine_id, &key_hash, Some(tenant_id)) .await .map_err(|e| { tracing::error!("[ENROLL] DB error minting agent key: {}", e); ApiError::new( StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", "Failed to mint machine credential", ) })?; Ok(plaintext) } /// `POST /api/enroll` — public self-registration (SPEC-016 §3). pub async fn enroll( State(state): State, ConnectInfo(addr): ConnectInfo, headers: HeaderMap, Json(req): Json, ) -> ApiResult<(StatusCode, Json)> { let db = require_db(&state)?; let tenant_id = db::tenancy::current_tenant_id(); // Real client IP via the shared trusted-proxy-aware extractor (same source the // relay / rate limiter / audit log use, so the buckets never drift). let ip: IpAddr = crate::utils::ip_extract::client_ip(&addr, &headers, &state.trusted_proxies); // Basic input hygiene before any DB/KDF work. let site_code = req.site_code.trim(); let hostname = req.hostname.trim(); let machine_uid = req.machine_uid.trim(); if site_code.is_empty() || hostname.is_empty() || machine_uid.is_empty() { return Err(ApiError::new( StatusCode::BAD_REQUEST, "INVALID_REQUEST", "site_code, hostname, and machine_uid are required", )); } // Defense-in-depth rate limit / lockout per (site_code, IP). The 256-bit // enrollment key is the load-bearing gate; this throttles brute-force/abuse. if !state.rate_limits.enroll.check(site_code, ip) { tracing::warn!( "[ENROLL] rate-limited/locked-out enroll for site_code={} from {}", site_code, ip ); return Err(ApiError::new( StatusCode::TOO_MANY_REQUESTS, "RATE_LIMITED", "Too many enrollment attempts. Please try again later.", )); } // Resolve the site by code (per-tenant). let site = match db::sites::get_site_by_code(db.pool(), site_code, tenant_id).await { Ok(Some(s)) => s, Ok(None) => { // Pay the Argon2id cost a real wrong-key reject would, so an unknown // site_code is not distinguishable by timing (enumeration oracle). equalize_reject_timing(&req.enrollment_key); state.rate_limits.enroll.record_failure(site_code, ip); audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ "reason": "unknown_site_code", "site_code": site_code, "machine_uid": machine_uid, })) .await; tracing::warn!("[ENROLL] unknown site_code={} from {}", site_code, ip); // Same opaque rejection shape AND the same KDF cost as a bad key — do not // reveal (by body or by timing) which of the two failed. return Err(ApiError::new( StatusCode::UNAUTHORIZED, "ENROLL_REJECTED", "Invalid site code or enrollment key", )); } Err(e) => { tracing::error!("[ENROLL] DB error resolving site: {}", e); return Err(ApiError::new( StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", "Internal server error", )); } }; // Verify the enrollment key against the site's ACTIVE key. A rotated-out (old) // installer's key is inactive and rejected here — old installers cannot enroll // NEW machines after rotation (SPEC-016 success-criterion #3). let active_key = match db::enrollment_keys::get_active_for_site(db.pool(), site.id).await { Ok(Some(k)) => k, Ok(None) => { // Pay the Argon2id cost a real wrong-key reject would, so "site exists but // has no active key" is not distinguishable by timing from a bad key. equalize_reject_timing(&req.enrollment_key); state.rate_limits.enroll.record_failure(site_code, ip); audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ "reason": "no_active_key", "site_code": site_code, "machine_uid": machine_uid, })) .await; tracing::warn!("[ENROLL] no active enrollment key for site_code={}", site_code); return Err(ApiError::new( StatusCode::UNAUTHORIZED, "ENROLL_REJECTED", "Invalid site code or enrollment key", )); } Err(e) => { tracing::error!("[ENROLL] DB error loading active enrollment key: {}", e); return Err(ApiError::new( StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", "Internal server error", )); } }; if !enrollment_keys::verify_enrollment_key(&req.enrollment_key, &active_key.key_hash) { state.rate_limits.enroll.record_failure(site_code, ip); audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ "reason": "bad_enrollment_key", "site_code": site_code, "machine_uid": machine_uid, })) .await; tracing::warn!("[ENROLL] bad enrollment key for site_code={} from {}", site_code, ip); return Err(ApiError::new( StatusCode::UNAUTHORIZED, "ENROLL_REJECTED", "Invalid site code or enrollment key", )); } // Key verified — this is a legitimate attempt; reset the failure streak. state.rate_limits.enroll.record_success(site_code, ip); // Build the label/identity params shared by the create/update paths. let tags = effective_tags(&req.labels); let company = req.labels.company.as_deref().map(str::trim).filter(|s| !s.is_empty()); let site_label = req .labels .site .as_deref() .map(str::trim) .filter(|s| !s.is_empty()); // Dedup on (tenant, machine_uid). Wrapped in a bounded retry loop to resolve the // first-enroll TOCTOU (SPEC-016 Phase A, LOW): the `None` branch does a lookup // then an INSERT with no in-between lock, so two concurrent first-time enrolls of // the same `machine_uid` both see `None` and both INSERT. The partial unique index // on `machine_uid` makes the loser's INSERT raise a unique violation; instead of // 500ing, the loser loops back, the lookup now returns `Some`, and it converges to // the reuse path (one row, no error). One retry is sufficient — after a unique // violation the winner's row is committed and visible — but we cap iterations // defensively so a pathological repeat can never spin forever. let mut attempts = 0u8; loop { attempts += 1; let existing = match db::machines::get_machine_by_tenant_uid(db.pool(), tenant_id, machine_uid).await { Ok(e) => e, Err(e) => { tracing::error!("[ENROLL] DB error on dedup lookup: {}", e); return Err(ApiError::new( StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", "Internal server error", )); } }; match existing { // -- Reuse / collision path ------------------------------------------- Some(existing) => { // Collision gate: a seemingly-different live endpoint resolving to an // existing uid -> pending, alert, NO usable cak_ minted. if is_collision(&existing, hostname) { let params = enroll_params( &existing.agent_id, hostname, machine_uid, tenant_id, site.id, company, site_label, &tags, "pending", ); let machine = db::machines::update_enrolled_machine(db.pool(), existing.id, ¶ms) .await .map_err(map_update_err)?; audit(db, db::events::EventTypes::ENROLL_COLLISION_PENDING, ip, json!({ "machine_id": machine.id, "machine_uid": machine_uid, "site_code": site_code, "existing_hostname": existing.hostname, "incoming_hostname": hostname, "heuristic": "online_existing_with_different_hostname (PROVISIONAL)", })) .await; // TODO(SPEC-016): wire to #dev-alerts — collision requires operator // confirmation in the dashboard before this endpoint may activate. tracing::warn!( "[ENROLL] machine_uid collision -> PENDING: machine_id={} site_code={} \ existing_host={} incoming_host={} from {}", machine.id, site_code, existing.hostname, hostname, ip ); return Ok(( StatusCode::ACCEPTED, Json(EnrollResponse { machine_id: machine.id, key: None, enrollment_state: "pending".to_string(), disposition: "collision_pending".to_string(), }), )); } // Cross-site guard (SPEC-016 Phase A, HIGH): if this machine_uid is // already bound to a DIFFERENT site, REFUSE — do not move it and do not // mint a key. `machine_uid` is a raw, spoofable client string in Phase A, // so a silent move here is a hijack vector: a holder of site B's key could // claim a site-A machine merely by presenting its uid, repointing the row // and minting a fresh `cak_` on site B. Deliberate moves are out of scope // for Phase A and arrive with the Phase-B `--reassign` flow + dashboard // (SPEC-016 §"Explicitly out of scope"); until then this is the explicit // accidental-move / hijack guard the spec calls for. // // Only `Some(other_site)` is a conflict. `None` (a legacy / connect-path / // support-code row that never had a relational site binding) is treated as // a first-time bind to the enrolling site, NOT a cross-site move — there is // no prior owning site to hijack from. if let Some(existing_site) = existing.site_id { if existing_site != site.id { audit(db, db::events::EventTypes::ENROLL_SITE_CONFLICT, ip, json!({ "machine_id": existing.id, "machine_uid": machine_uid, // Record the bound site id for the operator audit trail; the // response body below leaks nothing about the other site. "bound_site_id": existing_site, "attempted_site_code": site_code, "attempted_site_id": site.id, })) .await; // TODO(SPEC-016): wire to #dev-alerts — cross-site enroll refused // (possible accidental move or a cross-site claim attempt). tracing::warn!( "[ENROLL] cross-site conflict REFUSED: machine_id={} already bound to \ a different site; attempted site_code={} from {}", existing.id, site_code, ip ); // Opaque-enough: states the machine is already enrolled elsewhere // and how to move it deliberately, without naming the other site. return Err(ApiError::new( StatusCode::CONFLICT, "ENROLL_SITE_CONFLICT", "This machine is already enrolled at another site. A deliberate \ move requires the operator-initiated reassignment flow.", )); } } // Same-site reuse (re-image / re-install) or first relational bind of a // legacy/connect-path row (existing.site_id was NULL). No site move occurs // on the unauthenticated path in Phase A. let params = enroll_params( &existing.agent_id, hostname, machine_uid, tenant_id, site.id, company, site_label, &tags, "active", ); let machine = db::machines::update_enrolled_machine(db.pool(), existing.id, ¶ms) .await .map_err(map_update_err)?; let cak = mint_cak(db, machine.id, tenant_id).await?; audit(db, db::events::EventTypes::ENROLL_REUSE, ip, json!({ "machine_id": machine.id, "machine_uid": machine_uid, "site_code": site_code, })) .await; tracing::info!( "[ENROLL] reuse: machine_id={} re-enrolled at site_code={} from {}", machine.id, site_code, ip ); return Ok(( StatusCode::OK, Json(EnrollResponse { machine_id: machine.id, key: Some(cak), enrollment_state: "active".to_string(), disposition: "reuse".to_string(), }), )); } // -- New enrollment ---------------------------------------------------- None => { // Fresh opaque agent_id for the new row's `agent_id UNIQUE` column. The // agent's own config-UUID story is Phase B; the server only needs a // unique non-null value here, and the authoritative identity is the // minted cak_ -> machine binding. let agent_id = format!("enroll-{}", Uuid::new_v4()); let params = enroll_params( &agent_id, hostname, machine_uid, tenant_id, site.id, company, site_label, &tags, "active", ); let machine = match db::machines::insert_enrolled_machine(db.pool(), ¶ms).await { Ok(m) => m, // TOCTOU loser: a concurrent first-enroll of the same machine_uid won // the race and committed its row between our lookup and this INSERT, so // the partial unique index on `machine_uid` rejects ours. Loop back: the // re-lookup now finds the winner's row and we converge to reuse instead // of 500ing. Capped so a non-uid unique violation (or any persistent // conflict) surfaces as a 500 rather than spinning. Err(e) if is_machine_uid_conflict(&e) && attempts < 2 => { tracing::info!( "[ENROLL] concurrent first-enroll race on machine_uid; \ retrying as reuse (site_code={} from {})", site_code, ip ); continue; } Err(e) => { tracing::error!("[ENROLL] DB error inserting enrolled machine: {}", e); return Err(ApiError::new( StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", "Failed to register machine", )); } }; let cak = mint_cak(db, machine.id, tenant_id).await?; audit(db, db::events::EventTypes::ENROLL_NEW, ip, json!({ "machine_id": machine.id, "machine_uid": machine_uid, "site_code": site_code, "hostname": hostname, })) .await; // TODO(SPEC-016): wire to #dev-alerts — new-enrollment tripwire. tracing::info!( "[ENROLL] new: machine_id={} hostname={} site_code={} from {}", machine.id, hostname, site_code, ip ); return Ok(( StatusCode::CREATED, Json(EnrollResponse { machine_id: machine.id, key: Some(cak), enrollment_state: "active".to_string(), disposition: "new".to_string(), }), )); } } } } /// Is this DB error a unique-constraint violation involving the `machine_uid` /// (the first-enroll TOCTOU race)? Postgres reports a unique violation as SQLSTATE /// `23505`; we additionally require the constraint/message to reference `machine_uid` /// so a violation on a different unique column (e.g. `agent_id`) is NOT swallowed as /// a benign race — those are genuine errors and must still surface as 500. fn is_machine_uid_conflict(e: &sqlx::Error) -> bool { e.as_database_error() .filter(|db_err| db_err.code().as_deref() == Some("23505")) .map(|db_err| { db_err .constraint() .map(|c| c.contains("machine_uid")) // Fall back to the message when the driver exposes no constraint name // (e.g. a partial-index violation), so the uid race is still caught. .unwrap_or_else(|| db_err.message().contains("machine_uid")) }) .unwrap_or(false) } /// Fold `department` / `device_type` (no dedicated columns yet — SPEC-007) into the /// tag set as `department:` / `device_type:` so they are preserved rather than /// dropped, alongside any explicit tags. Empty/whitespace values are skipped. fn effective_tags(labels: &EnrollLabels) -> Vec { let mut tags: Vec = labels .tags .iter() .map(|t| t.trim().to_string()) .filter(|t| !t.is_empty()) .collect(); if let Some(d) = labels.department.as_deref().map(str::trim).filter(|s| !s.is_empty()) { tags.push(format!("department:{}", d)); } if let Some(d) = labels .device_type .as_deref() .map(str::trim) .filter(|s| !s.is_empty()) { tags.push(format!("device_type:{}", d)); } tags } /// Assemble [`db::machines::EnrollMachineParams`] from the resolved pieces. #[allow(clippy::too_many_arguments)] fn enroll_params<'a>( agent_id: &'a str, hostname: &'a str, machine_uid: &'a str, tenant_id: Uuid, site_id: Uuid, company: Option<&'a str>, site_label: Option<&'a str>, tags: &'a [String], enrollment_state: &'a str, ) -> db::machines::EnrollMachineParams<'a> { db::machines::EnrollMachineParams { agent_id, hostname, machine_uid, tenant_id, site_id, company, site_label, tags, enrollment_state, } } /// Best-effort enrollment audit write — a failure here never fails the enroll. async fn audit(db: &db::Database, event_type: &str, ip: IpAddr, details: serde_json::Value) { if let Err(e) = db::events::log_enrollment_event(db.pool(), event_type, details, Some(ip)).await { tracing::warn!("[ENROLL] failed to write {} audit event: {}", event_type, e); } } /// Map a DB error from the existing-row update to the standard 500 envelope. fn map_update_err(e: sqlx::Error) -> (StatusCode, Json) { tracing::error!("[ENROLL] DB error updating enrolled machine: {}", e); ApiError::new( StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", "Failed to update machine", ) } #[cfg(test)] mod tests { use super::*; // ---- Fix #2: the timing-equalizer PHC constant is a real, valid Argon2id hash. // If this constant were malformed, `verify_password` would return `Err` and skip // the KDF entirely — defeating the whole point (the early-reject paths would NOT // pay the Argon2id cost and the timing oracle would reopen). These tests are the // standing guard that the constant keeps paying the KDF. #[test] fn timing_equalizer_phc_is_a_valid_parseable_hash() { // A wrong password must verify to `false` WITHOUT erroring — proving the PHC // parsed and the KDF actually ran (a parse failure would surface as `Err`). let res = crate::auth::password::verify_password("cek_anything_at_all", TIMING_EQUALIZER_PHC); assert!( res.is_ok(), "TIMING_EQUALIZER_PHC must be a valid PHC string so the KDF runs; got Err: {:?}", res.err() ); assert!( !res.unwrap(), "an arbitrary key must NOT match the throwaway equalizer hash" ); } #[test] fn equalize_reject_timing_runs_without_panicking() { // The early-reject helper must always complete (it pays the KDF and discards // the result); a panic here would 500 a rejection path. equalize_reject_timing("cek_some_presented_value"); equalize_reject_timing(""); // even degenerate input must be safe } // ---- Fix #1 support: the collision heuristic truth table is unchanged. ------- fn machine_with(status: &str, hostname: &str) -> db::machines::Machine { db::machines::Machine { id: Uuid::new_v4(), agent_id: "agent-x".to_string(), hostname: hostname.to_string(), os_version: None, is_elevated: false, is_persistent: true, first_seen: chrono::Utc::now(), last_seen: chrono::Utc::now(), last_session_id: None, status: status.to_string(), created_at: chrono::Utc::now(), updated_at: chrono::Utc::now(), tenant_id: None, organization: None, site: None, tags: Vec::new(), machine_uid: Some("uid-1".to_string()), deleted_at: None, site_id: None, enrollment_state: "active".to_string(), } } #[test] fn collision_only_when_online_and_different_host() { // Online + different hostname => collision (the clone signature). assert!(is_collision(&machine_with("online", "HOST-A"), "HOST-B")); // Online + same hostname (case-insensitive) => reuse, not a collision. assert!(!is_collision(&machine_with("online", "HOST-A"), "host-a")); // Offline (the common re-image case) => never a collision, even if renamed. assert!(!is_collision(&machine_with("offline", "HOST-A"), "HOST-B")); } // ---- DB-gated tests (skip without TEST_DATABASE_URL; run in CI on Postgres). - // These validate the mechanisms fixes #1 and #3 rely on against a REAL Postgres // error/row, which cannot be faked on a workstation. The full handler is exercised // end-to-end in CI; here we pin the load-bearing primitives. use sqlx::postgres::PgPoolOptions; use sqlx::PgPool; async fn test_pool() -> Option { let url = std::env::var("TEST_DATABASE_URL").ok()?; let pool = PgPoolOptions::new() .max_connections(2) .connect(&url) .await .expect("connect to TEST_DATABASE_URL"); sqlx::migrate!("./migrations") .run(&pool) .await .expect("apply migrations to the test database"); Some(pool) } async fn cleanup(pool: &PgPool, uids: &[&str], site_codes: &[&str]) { for uid in uids { let _ = sqlx::query("DELETE FROM connect_machines WHERE machine_uid = $1") .bind(uid) .execute(pool) .await; } for code in site_codes { let _ = sqlx::query("DELETE FROM connect_sites WHERE site_code = $1") .bind(code) .execute(pool) .await; } } /// Fix #1: after a machine is enrolled at site A, the dedup lookup returns the row /// carrying `site_id == site_a`, which is exactly what the cross-site guard compares /// against `site.id`. This pins the precondition that makes the refusal possible: /// a site-B enroll resolves the SAME row and sees a DIFFERENT bound site_id. #[tokio::test] async fn enrolled_machine_exposes_bound_site_id_for_cross_site_guard() { let Some(pool) = test_pool().await else { return; // no TEST_DATABASE_URL: skip (runs in CI) }; let tenant = db::tenancy::current_tenant_id(); let uid = "test-xsite-uid-001"; cleanup(&pool, &[uid], &["test-xsite-A", "test-xsite-B"]).await; let site_a = db::sites::insert_site(&pool, "test-xsite-A", None, None, Some(tenant)) .await .expect("insert site A"); let site_b = db::sites::insert_site(&pool, "test-xsite-B", None, None, Some(tenant)) .await .expect("insert site B"); assert_ne!(site_a.id, site_b.id); let agent_id_a = format!("enroll-{}", Uuid::new_v4()); let params = enroll_params( &agent_id_a, "HOST-XSITE", uid, tenant, site_a.id, None, None, &[], "active", ); db::machines::insert_enrolled_machine(&pool, ¶ms) .await .expect("enroll at site A"); // A subsequent (site-B) enroll dedups on (tenant, uid) and resolves THIS row. let found = db::machines::get_machine_by_tenant_uid(&pool, tenant, uid) .await .expect("dedup lookup") .expect("row exists"); // The guard compares found.site_id (Some(A)) against the enrolling site (B); // they differ => refusal. Prove the comparison the guard makes evaluates true. assert_eq!(found.site_id, Some(site_a.id), "row is bound to site A"); assert!( found.site_id.is_some_and(|s| s != site_b.id), "cross-site guard condition (bound to a different site) must hold" ); cleanup(&pool, &[uid], &["test-xsite-A", "test-xsite-B"]).await; } /// Fix #3: a duplicate first-enroll of the same `machine_uid` raises a Postgres /// unique violation that `is_machine_uid_conflict` classifies as `true` (the race /// the loop retries as reuse), while a DIFFERENT unique violation (`agent_id`) /// classifies as `false` (a genuine error that must still 500). This is the exact /// branch predicate of the TOCTOU retry, validated against real driver errors. #[tokio::test] async fn machine_uid_conflict_is_classified_but_agent_id_conflict_is_not() { let Some(pool) = test_pool().await else { return; // no TEST_DATABASE_URL: skip (runs in CI) }; let tenant = db::tenancy::current_tenant_id(); let uid = "test-toctou-uid-001"; cleanup(&pool, &[uid], &["test-toctou-A"]).await; let site = db::sites::insert_site(&pool, "test-toctou-A", None, None, Some(tenant)) .await .expect("insert site"); let shared_agent_id = format!("enroll-{}", Uuid::new_v4()); let first = enroll_params( &shared_agent_id, "HOST-TOCTOU", uid, tenant, site.id, None, None, &[], "active", ); db::machines::insert_enrolled_machine(&pool, &first) .await .expect("first enroll wins the race"); // Loser re-inserts the SAME machine_uid (fresh agent_id) -> machine_uid unique // violation -> must be classified as the retryable race. let loser_agent_id = format!("enroll-{}", Uuid::new_v4()); let uid_dup = enroll_params( &loser_agent_id, "HOST-TOCTOU-2", uid, tenant, site.id, None, None, &[], "active", ); let err = db::machines::insert_enrolled_machine(&pool, &uid_dup) .await .expect_err("duplicate machine_uid must violate the unique index"); assert!( is_machine_uid_conflict(&err), "a machine_uid unique violation must be classified as the retryable race; got: {err:?}" ); // A DIFFERENT unique violation (reusing the agent_id with a NEW uid) must NOT be // swallowed as the race — it is a genuine error that still surfaces as 500. let uid_other = "test-toctou-uid-002"; let agent_dup = enroll_params( &shared_agent_id, // collides on agent_id UNIQUE, not machine_uid "HOST-TOCTOU-3", uid_other, tenant, site.id, None, None, &[], "active", ); let err2 = db::machines::insert_enrolled_machine(&pool, &agent_dup) .await .expect_err("duplicate agent_id must violate its unique constraint"); assert!( !is_machine_uid_conflict(&err2), "an agent_id unique violation must NOT be misclassified as the machine_uid race; got: {err2:?}" ); cleanup(&pool, &[uid, uid_other], &["test-toctou-A"]).await; } }