From 0f02f2376518b5f706de7343f7239fb0804dac73 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Tue, 2 Jun 2026 10:28:31 -0700 Subject: [PATCH] fix(enroll): SPEC-016 Phase A review fixes (cross-site guard, timing oracle, TOCTOU) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Applies the four review fixes to POST /api/enroll, all in server/src/api/enroll.rs (+ a new ENROLL_SITE_CONFLICT event type in server/src/db/events.rs): 1. HIGH — close the within-tenant cross-site silent-move hijack. A valid key for site B presented for a machine_uid already bound to a DIFFERENT site is now REFUSED (409 ENROLL_SITE_CONFLICT) instead of silently repointing the row and minting a fresh cak_. No move, no key. Emits an ENROLL_SITE_CONFLICT audit event + alert TODO. Same-site match still resolves to reuse; a NULL prior site_id is a first relational bind, not a move. The unauthenticated site_move mint path is removed; deliberate moves are deferred to the Phase-B --reassign flow + dashboard. 2. MEDIUM — kill the timing/enumeration oracle. Unknown site_code and no-active-key early rejects now pay a dummy Argon2id verify against a fixed, valid throwaway PHC constant (TIMING_EQUALIZER_PHC) before returning the identical 401, so every rejection path pays one KDF. The constant is asserted valid + verifying in tests. 3. LOW — fix the new-enroll TOCTOU. The dedup lookup + INSERT is wrapped in a bounded retry loop: a concurrent first-enroll of the same machine_uid whose INSERT loses the unique-index race (classified by is_machine_uid_conflict on SQLSTATE 23505 + machine_uid constraint) now re-looks-up and converges to reuse instead of 500ing. A non-machine_uid unique violation still surfaces as 500. 4. LOW — make the collision-gate doc honest + leave an enforcement TODO. The module doc now states the gate withholds only a NEWLY minted cak_ (a prior clean cak_ survives) and that nothing consults enrollment_state at control time yet, with a TODO(SPEC-016 Phase B/D) marker for relay/control-plane enforcement + revocation. Verify: cargo check, cargo clippy --all-targets, and cargo test all clean on this Windows host (104 tests pass). Two DB-gated tests (cross-site bound-site_id exposure, machine_uid-vs-agent_id conflict classification) no-op without TEST_DATABASE_URL and run against real Postgres in CI; the Linux target / real-Postgres handler path is validated there, not on this host. Co-Authored-By: Claude Opus 4.8 (1M context) --- server/src/api/enroll.rs | 495 ++++++++++++++++++++++++++++++++++----- server/src/db/events.rs | 11 + 2 files changed, 447 insertions(+), 59 deletions(-) diff --git a/server/src/api/enroll.rs b/server/src/api/enroll.rs index b961e0f..b0cf69f 100644 --- a/server/src/api/enroll.rs +++ b/server/src/api/enroll.rs @@ -17,8 +17,26 @@ //! AUTH POSTURE: auto-approve (ScreenConnect parity) — a clean enroll is live and //! controllable immediately, with the new-enrollment alert as the tripwire. The one //! exception is a detected `machine_uid` collision, which gates the machine to -//! `enrollment_state = 'pending'` and withholds a usable `cak_` until an operator -//! confirms it in the dashboard. +//! `enrollment_state = 'pending'`. +//! +//! What the collision gate actually does in Phase A (be precise — it is NOT a full +//! quarantine yet): it sets `enrollment_state = 'pending'` and withholds a *newly +//! minted* `cak_` (the response carries no key). It does NOT revoke a `cak_` the +//! machine may already hold from a prior clean enroll, and NOTHING on the relay / +//! control plane consults `enrollment_state` yet — so a colliding machine that was +//! already enrolled keeps its working credential and remains controllable until the +//! enforcement below lands. The gate today is an audit + alert tripwire that asks an +//! operator to confirm in the dashboard, not a hard control-plane block. +//! +//! TODO(SPEC-016 Phase B/D): relay/control plane must refuse machines with +//! enrollment_state='pending'; consider revoking existing cak_ on collision. +//! +//! CROSS-SITE: Phase A does NOT move a machine between sites. A valid key for site B +//! presented for a `machine_uid` already bound to site A is REFUSED +//! (`ENROLL_SITE_CONFLICT`, 409) — no move, no key — as the accidental-move / +//! cross-site-hijack guard (`machine_uid` is a raw, spoofable client string in Phase +//! A). Deliberate moves arrive with the Phase-B `--reassign` flow + dashboard +//! (SPEC-016 §"Explicitly out of scope" / `--reassign`). //! //! SECURITY: never log the enrollment key, the minted `cak_`, or any hash. The //! plaintext `cak_` appears only in the success response body, once. @@ -39,6 +57,37 @@ use crate::auth::{agent_keys, enrollment_keys}; use crate::db; use crate::AppState; +/// A fixed, valid Argon2id PHC hash used ONLY to equalize timing on the early-reject +/// enroll paths (unknown `site_code`, no active key). Those paths would otherwise +/// return before paying the KDF, while a wrong key pays the full Argon2id verify — +/// a timing oracle that distinguishes "this site_code/active-key exists" from +/// "rejected at the key check". On every early reject we run a throwaway verify of +/// the supplied key against THIS constant and discard the result, so all rejection +/// paths pay one Argon2id verify. +/// +/// It is the Argon2id (V0x13, default params — matching [`crate::auth::password`]) +/// hash of the byte string `"enroll-timing-equalizer-throwaway"` under a fixed salt +/// — generated offline, committed verbatim, and asserted valid + verifying in the +/// tests below. It guards NOTHING: it is never a credential, never compared for +/// auth, and the password it encodes is public. A real key can never equal it +/// (Phase A keys are `cek_`-prefixed 256-bit randoms), so the dummy verify always +/// returns `false`, exactly as the real reject paths do. +const TIMING_EQUALIZER_PHC: &str = + "$argon2id$v=19$m=19456,t=2,p=1$ZW5yb2xsdGltaW5nZXF6$tXiQXmQUAUVszrdp5HrVGIJtsQTidLuSKld0ITNv2Es"; + +/// Pay one Argon2id verify against [`TIMING_EQUALIZER_PHC`] and discard the result. +/// +/// Called on the early-reject enroll paths so they cost the same as a real +/// wrong-key reject (which pays the KDF in `verify_enrollment_key`). The boolean is +/// intentionally ignored — this exists purely for its side effect (CPU time). +#[inline] +fn equalize_reject_timing(presented_key: &str) { + // `verify_password` parses the PHC and runs Argon2id; a malformed constant would + // make it return Err and skip the KDF, defeating the purpose, so the constant is + // a known-valid PHC string (asserted in tests). + let _ = crate::auth::password::verify_password(presented_key, TIMING_EQUALIZER_PHC); +} + /// Standard error envelope (see `.claude/standards/api/response-format.md`), /// matching `api::machine_keys::ApiError`. #[derive(Debug, Serialize)] @@ -116,7 +165,10 @@ pub struct EnrollResponse { /// `"active"` (live, controllable, key issued) or `"pending"` (collision-gated; /// awaiting operator confirmation; no key issued). pub enrollment_state: String, - /// Disposition: `"new"` | `"reuse"` | `"site_move"` | `"collision_pending"`. + /// Disposition: `"new"` | `"reuse"` | `"collision_pending"`. (`"site_move"` is + /// NOT reachable on the Phase A unauthenticated path — a cross-site enroll is + /// refused with `409 ENROLL_SITE_CONFLICT` before any disposition is returned; + /// deliberate moves are the Phase-B `--reassign` flow.) pub disposition: String, } @@ -231,6 +283,9 @@ pub async fn enroll( let site = match db::sites::get_site_by_code(db.pool(), site_code, tenant_id).await { Ok(Some(s)) => s, Ok(None) => { + // Pay the Argon2id cost a real wrong-key reject would, so an unknown + // site_code is not distinguishable by timing (enumeration oracle). + equalize_reject_timing(&req.enrollment_key); state.rate_limits.enroll.record_failure(site_code, ip); audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ "reason": "unknown_site_code", @@ -239,8 +294,8 @@ pub async fn enroll( })) .await; tracing::warn!("[ENROLL] unknown site_code={} from {}", site_code, ip); - // Same opaque rejection shape as a bad key — do not reveal which of the - // two failed (avoids a site_code enumeration oracle). + // Same opaque rejection shape AND the same KDF cost as a bad key — do not + // reveal (by body or by timing) which of the two failed. return Err(ApiError::new( StatusCode::UNAUTHORIZED, "ENROLL_REJECTED", @@ -263,6 +318,9 @@ pub async fn enroll( let active_key = match db::enrollment_keys::get_active_for_site(db.pool(), site.id).await { Ok(Some(k)) => k, Ok(None) => { + // Pay the Argon2id cost a real wrong-key reject would, so "site exists but + // has no active key" is not distinguishable by timing from a bad key. + equalize_reject_timing(&req.enrollment_key); state.rate_limits.enroll.record_failure(site_code, ip); audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({ "reason": "no_active_key", @@ -316,22 +374,33 @@ pub async fn enroll( .map(str::trim) .filter(|s| !s.is_empty()); - // Dedup on (tenant, machine_uid). - let existing = - match db::machines::get_machine_by_tenant_uid(db.pool(), tenant_id, machine_uid).await { - Ok(e) => e, - Err(e) => { - tracing::error!("[ENROLL] DB error on dedup lookup: {}", e); - return Err(ApiError::new( - StatusCode::INTERNAL_SERVER_ERROR, - "INTERNAL_ERROR", - "Internal server error", - )); - } - }; + // Dedup on (tenant, machine_uid). Wrapped in a bounded retry loop to resolve the + // first-enroll TOCTOU (SPEC-016 Phase A, LOW): the `None` branch does a lookup + // then an INSERT with no in-between lock, so two concurrent first-time enrolls of + // the same `machine_uid` both see `None` and both INSERT. The partial unique index + // on `machine_uid` makes the loser's INSERT raise a unique violation; instead of + // 500ing, the loser loops back, the lookup now returns `Some`, and it converges to + // the reuse path (one row, no error). One retry is sufficient — after a unique + // violation the winner's row is committed and visible — but we cap iterations + // defensively so a pathological repeat can never spin forever. + let mut attempts = 0u8; + loop { + attempts += 1; + let existing = + match db::machines::get_machine_by_tenant_uid(db.pool(), tenant_id, machine_uid).await { + Ok(e) => e, + Err(e) => { + tracing::error!("[ENROLL] DB error on dedup lookup: {}", e); + return Err(ApiError::new( + StatusCode::INTERNAL_SERVER_ERROR, + "INTERNAL_ERROR", + "Internal server error", + )); + } + }; - match existing { - // -- Reuse / site-move / collision path ------------------------------- + match existing { + // -- Reuse / collision path ------------------------------------------- Some(existing) => { // Collision gate: a seemingly-different live endpoint resolving to an // existing uid -> pending, alert, NO usable cak_ minted. @@ -379,8 +448,53 @@ pub async fn enroll( )); } - // Legitimate reuse (re-image / re-install) or a site move. - let is_move = existing.site_id.map(|s| s != site.id).unwrap_or(true); + // Cross-site guard (SPEC-016 Phase A, HIGH): if this machine_uid is + // already bound to a DIFFERENT site, REFUSE — do not move it and do not + // mint a key. `machine_uid` is a raw, spoofable client string in Phase A, + // so a silent move here is a hijack vector: a holder of site B's key could + // claim a site-A machine merely by presenting its uid, repointing the row + // and minting a fresh `cak_` on site B. Deliberate moves are out of scope + // for Phase A and arrive with the Phase-B `--reassign` flow + dashboard + // (SPEC-016 §"Explicitly out of scope"); until then this is the explicit + // accidental-move / hijack guard the spec calls for. + // + // Only `Some(other_site)` is a conflict. `None` (a legacy / connect-path / + // support-code row that never had a relational site binding) is treated as + // a first-time bind to the enrolling site, NOT a cross-site move — there is + // no prior owning site to hijack from. + if let Some(existing_site) = existing.site_id { + if existing_site != site.id { + audit(db, db::events::EventTypes::ENROLL_SITE_CONFLICT, ip, json!({ + "machine_id": existing.id, + "machine_uid": machine_uid, + // Record the bound site id for the operator audit trail; the + // response body below leaks nothing about the other site. + "bound_site_id": existing_site, + "attempted_site_code": site_code, + "attempted_site_id": site.id, + })) + .await; + // TODO(SPEC-016): wire to #dev-alerts — cross-site enroll refused + // (possible accidental move or a cross-site claim attempt). + tracing::warn!( + "[ENROLL] cross-site conflict REFUSED: machine_id={} already bound to \ + a different site; attempted site_code={} from {}", + existing.id, site_code, ip + ); + // Opaque-enough: states the machine is already enrolled elsewhere + // and how to move it deliberately, without naming the other site. + return Err(ApiError::new( + StatusCode::CONFLICT, + "ENROLL_SITE_CONFLICT", + "This machine is already enrolled at another site. A deliberate \ + move requires the operator-initiated reassignment flow.", + )); + } + } + + // Same-site reuse (re-image / re-install) or first relational bind of a + // legacy/connect-path row (existing.site_id was NULL). No site move occurs + // on the unauthenticated path in Phase A. let params = enroll_params( &existing.agent_id, hostname, @@ -398,42 +512,26 @@ pub async fn enroll( let cak = mint_cak(db, machine.id, tenant_id).await?; - if is_move { - audit(db, db::events::EventTypes::ENROLL_SITE_MOVE, ip, json!({ - "machine_id": machine.id, - "machine_uid": machine_uid, - "from_site_id": existing.site_id, - "to_site_code": site_code, - "to_site_id": site.id, - })) - .await; - // TODO(SPEC-016): wire to #dev-alerts — a machine moved between sites. - tracing::info!( - "[ENROLL] site-move: machine_id={} machine_uid present, to site_code={} from {}", - machine.id, site_code, ip - ); - } else { - audit(db, db::events::EventTypes::ENROLL_REUSE, ip, json!({ - "machine_id": machine.id, - "machine_uid": machine_uid, - "site_code": site_code, - })) - .await; - tracing::info!( - "[ENROLL] reuse: machine_id={} re-enrolled at site_code={} from {}", - machine.id, site_code, ip - ); - } + audit(db, db::events::EventTypes::ENROLL_REUSE, ip, json!({ + "machine_id": machine.id, + "machine_uid": machine_uid, + "site_code": site_code, + })) + .await; + tracing::info!( + "[ENROLL] reuse: machine_id={} re-enrolled at site_code={} from {}", + machine.id, site_code, ip + ); - Ok(( + return Ok(( StatusCode::OK, Json(EnrollResponse { machine_id: machine.id, key: Some(cak), enrollment_state: "active".to_string(), - disposition: if is_move { "site_move" } else { "reuse" }.to_string(), + disposition: "reuse".to_string(), }), - )) + )); } // -- New enrollment ---------------------------------------------------- @@ -454,16 +552,31 @@ pub async fn enroll( &tags, "active", ); - let machine = db::machines::insert_enrolled_machine(db.pool(), ¶ms) - .await - .map_err(|e| { + let machine = match db::machines::insert_enrolled_machine(db.pool(), ¶ms).await { + Ok(m) => m, + // TOCTOU loser: a concurrent first-enroll of the same machine_uid won + // the race and committed its row between our lookup and this INSERT, so + // the partial unique index on `machine_uid` rejects ours. Loop back: the + // re-lookup now finds the winner's row and we converge to reuse instead + // of 500ing. Capped so a non-uid unique violation (or any persistent + // conflict) surfaces as a 500 rather than spinning. + Err(e) if is_machine_uid_conflict(&e) && attempts < 2 => { + tracing::info!( + "[ENROLL] concurrent first-enroll race on machine_uid; \ + retrying as reuse (site_code={} from {})", + site_code, ip + ); + continue; + } + Err(e) => { tracing::error!("[ENROLL] DB error inserting enrolled machine: {}", e); - ApiError::new( + return Err(ApiError::new( StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", "Failed to register machine", - ) - })?; + )); + } + }; let cak = mint_cak(db, machine.id, tenant_id).await?; @@ -480,7 +593,7 @@ pub async fn enroll( machine.id, hostname, site_code, ip ); - Ok(( + return Ok(( StatusCode::CREATED, Json(EnrollResponse { machine_id: machine.id, @@ -488,11 +601,31 @@ pub async fn enroll( enrollment_state: "active".to_string(), disposition: "new".to_string(), }), - )) + )); + } } } } +/// Is this DB error a unique-constraint violation involving the `machine_uid` +/// (the first-enroll TOCTOU race)? Postgres reports a unique violation as SQLSTATE +/// `23505`; we additionally require the constraint/message to reference `machine_uid` +/// so a violation on a different unique column (e.g. `agent_id`) is NOT swallowed as +/// a benign race — those are genuine errors and must still surface as 500. +fn is_machine_uid_conflict(e: &sqlx::Error) -> bool { + e.as_database_error() + .filter(|db_err| db_err.code().as_deref() == Some("23505")) + .map(|db_err| { + db_err + .constraint() + .map(|c| c.contains("machine_uid")) + // Fall back to the message when the driver exposes no constraint name + // (e.g. a partial-index violation), so the uid race is still caught. + .unwrap_or_else(|| db_err.message().contains("machine_uid")) + }) + .unwrap_or(false) +} + /// Fold `department` / `device_type` (no dedicated columns yet — SPEC-007) into the /// tag set as `department:` / `device_type:` so they are preserved rather than /// dropped, alongside any explicit tags. Empty/whitespace values are skipped. @@ -560,3 +693,247 @@ fn map_update_err(e: sqlx::Error) -> (StatusCode, Json) { "Failed to update machine", ) } + +#[cfg(test)] +mod tests { + use super::*; + + // ---- Fix #2: the timing-equalizer PHC constant is a real, valid Argon2id hash. + // If this constant were malformed, `verify_password` would return `Err` and skip + // the KDF entirely — defeating the whole point (the early-reject paths would NOT + // pay the Argon2id cost and the timing oracle would reopen). These tests are the + // standing guard that the constant keeps paying the KDF. + + #[test] + fn timing_equalizer_phc_is_a_valid_parseable_hash() { + // A wrong password must verify to `false` WITHOUT erroring — proving the PHC + // parsed and the KDF actually ran (a parse failure would surface as `Err`). + let res = crate::auth::password::verify_password("cek_anything_at_all", TIMING_EQUALIZER_PHC); + assert!( + res.is_ok(), + "TIMING_EQUALIZER_PHC must be a valid PHC string so the KDF runs; got Err: {:?}", + res.err() + ); + assert!( + !res.unwrap(), + "an arbitrary key must NOT match the throwaway equalizer hash" + ); + } + + #[test] + fn equalize_reject_timing_runs_without_panicking() { + // The early-reject helper must always complete (it pays the KDF and discards + // the result); a panic here would 500 a rejection path. + equalize_reject_timing("cek_some_presented_value"); + equalize_reject_timing(""); // even degenerate input must be safe + } + + // ---- Fix #1 support: the collision heuristic truth table is unchanged. ------- + + fn machine_with(status: &str, hostname: &str) -> db::machines::Machine { + db::machines::Machine { + id: Uuid::new_v4(), + agent_id: "agent-x".to_string(), + hostname: hostname.to_string(), + os_version: None, + is_elevated: false, + is_persistent: true, + first_seen: chrono::Utc::now(), + last_seen: chrono::Utc::now(), + last_session_id: None, + status: status.to_string(), + created_at: chrono::Utc::now(), + updated_at: chrono::Utc::now(), + tenant_id: None, + organization: None, + site: None, + tags: Vec::new(), + machine_uid: Some("uid-1".to_string()), + deleted_at: None, + site_id: None, + enrollment_state: "active".to_string(), + } + } + + #[test] + fn collision_only_when_online_and_different_host() { + // Online + different hostname => collision (the clone signature). + assert!(is_collision(&machine_with("online", "HOST-A"), "HOST-B")); + // Online + same hostname (case-insensitive) => reuse, not a collision. + assert!(!is_collision(&machine_with("online", "HOST-A"), "host-a")); + // Offline (the common re-image case) => never a collision, even if renamed. + assert!(!is_collision(&machine_with("offline", "HOST-A"), "HOST-B")); + } + + // ---- DB-gated tests (skip without TEST_DATABASE_URL; run in CI on Postgres). - + // These validate the mechanisms fixes #1 and #3 rely on against a REAL Postgres + // error/row, which cannot be faked on a workstation. The full handler is exercised + // end-to-end in CI; here we pin the load-bearing primitives. + + use sqlx::postgres::PgPoolOptions; + use sqlx::PgPool; + + async fn test_pool() -> Option { + let url = std::env::var("TEST_DATABASE_URL").ok()?; + let pool = PgPoolOptions::new() + .max_connections(2) + .connect(&url) + .await + .expect("connect to TEST_DATABASE_URL"); + sqlx::migrate!("./migrations") + .run(&pool) + .await + .expect("apply migrations to the test database"); + Some(pool) + } + + async fn cleanup(pool: &PgPool, uids: &[&str], site_codes: &[&str]) { + for uid in uids { + let _ = sqlx::query("DELETE FROM connect_machines WHERE machine_uid = $1") + .bind(uid) + .execute(pool) + .await; + } + for code in site_codes { + let _ = sqlx::query("DELETE FROM connect_sites WHERE site_code = $1") + .bind(code) + .execute(pool) + .await; + } + } + + /// Fix #1: after a machine is enrolled at site A, the dedup lookup returns the row + /// carrying `site_id == site_a`, which is exactly what the cross-site guard compares + /// against `site.id`. This pins the precondition that makes the refusal possible: + /// a site-B enroll resolves the SAME row and sees a DIFFERENT bound site_id. + #[tokio::test] + async fn enrolled_machine_exposes_bound_site_id_for_cross_site_guard() { + let Some(pool) = test_pool().await else { + return; // no TEST_DATABASE_URL: skip (runs in CI) + }; + let tenant = db::tenancy::current_tenant_id(); + let uid = "test-xsite-uid-001"; + cleanup(&pool, &[uid], &["test-xsite-A", "test-xsite-B"]).await; + + let site_a = db::sites::insert_site(&pool, "test-xsite-A", None, None, Some(tenant)) + .await + .expect("insert site A"); + let site_b = db::sites::insert_site(&pool, "test-xsite-B", None, None, Some(tenant)) + .await + .expect("insert site B"); + assert_ne!(site_a.id, site_b.id); + + let agent_id_a = format!("enroll-{}", Uuid::new_v4()); + let params = enroll_params( + &agent_id_a, + "HOST-XSITE", + uid, + tenant, + site_a.id, + None, + None, + &[], + "active", + ); + db::machines::insert_enrolled_machine(&pool, ¶ms) + .await + .expect("enroll at site A"); + + // A subsequent (site-B) enroll dedups on (tenant, uid) and resolves THIS row. + let found = db::machines::get_machine_by_tenant_uid(&pool, tenant, uid) + .await + .expect("dedup lookup") + .expect("row exists"); + + // The guard compares found.site_id (Some(A)) against the enrolling site (B); + // they differ => refusal. Prove the comparison the guard makes evaluates true. + assert_eq!(found.site_id, Some(site_a.id), "row is bound to site A"); + assert!( + found.site_id.is_some_and(|s| s != site_b.id), + "cross-site guard condition (bound to a different site) must hold" + ); + + cleanup(&pool, &[uid], &["test-xsite-A", "test-xsite-B"]).await; + } + + /// Fix #3: a duplicate first-enroll of the same `machine_uid` raises a Postgres + /// unique violation that `is_machine_uid_conflict` classifies as `true` (the race + /// the loop retries as reuse), while a DIFFERENT unique violation (`agent_id`) + /// classifies as `false` (a genuine error that must still 500). This is the exact + /// branch predicate of the TOCTOU retry, validated against real driver errors. + #[tokio::test] + async fn machine_uid_conflict_is_classified_but_agent_id_conflict_is_not() { + let Some(pool) = test_pool().await else { + return; // no TEST_DATABASE_URL: skip (runs in CI) + }; + let tenant = db::tenancy::current_tenant_id(); + let uid = "test-toctou-uid-001"; + cleanup(&pool, &[uid], &["test-toctou-A"]).await; + + let site = db::sites::insert_site(&pool, "test-toctou-A", None, None, Some(tenant)) + .await + .expect("insert site"); + + let shared_agent_id = format!("enroll-{}", Uuid::new_v4()); + let first = enroll_params( + &shared_agent_id, + "HOST-TOCTOU", + uid, + tenant, + site.id, + None, + None, + &[], + "active", + ); + db::machines::insert_enrolled_machine(&pool, &first) + .await + .expect("first enroll wins the race"); + + // Loser re-inserts the SAME machine_uid (fresh agent_id) -> machine_uid unique + // violation -> must be classified as the retryable race. + let loser_agent_id = format!("enroll-{}", Uuid::new_v4()); + let uid_dup = enroll_params( + &loser_agent_id, + "HOST-TOCTOU-2", + uid, + tenant, + site.id, + None, + None, + &[], + "active", + ); + let err = db::machines::insert_enrolled_machine(&pool, &uid_dup) + .await + .expect_err("duplicate machine_uid must violate the unique index"); + assert!( + is_machine_uid_conflict(&err), + "a machine_uid unique violation must be classified as the retryable race; got: {err:?}" + ); + + // A DIFFERENT unique violation (reusing the agent_id with a NEW uid) must NOT be + // swallowed as the race — it is a genuine error that still surfaces as 500. + let uid_other = "test-toctou-uid-002"; + let agent_dup = enroll_params( + &shared_agent_id, // collides on agent_id UNIQUE, not machine_uid + "HOST-TOCTOU-3", + uid_other, + tenant, + site.id, + None, + None, + &[], + "active", + ); + let err2 = db::machines::insert_enrolled_machine(&pool, &agent_dup) + .await + .expect_err("duplicate agent_id must violate its unique constraint"); + assert!( + !is_machine_uid_conflict(&err2), + "an agent_id unique violation must NOT be misclassified as the machine_uid race; got: {err2:?}" + ); + + cleanup(&pool, &[uid, uid_other], &["test-toctou-A"]).await; + } +} diff --git a/server/src/db/events.rs b/server/src/db/events.rs index f42fb39..b850ef4 100644 --- a/server/src/db/events.rs +++ b/server/src/db/events.rs @@ -82,7 +82,18 @@ impl EventTypes { pub const ENROLL_REUSE: &'static str = "enroll_reuse"; /// An existing machine_uid enrolled under a DIFFERENT site — the machine's site /// binding was updated (a "site move"). Fires an alert. + /// + /// NOTE (SPEC-016 Phase A): the unauthenticated enroll path does NOT perform this + /// move — a cross-site enroll is REFUSED (`ENROLL_SITE_CONFLICT`) rather than + /// silently repointing the machine. This event is reserved for the deliberate + /// Phase-B `--reassign` flow (and the dashboard move action) that supersede it. + #[allow(dead_code)] // reserved for Phase-B --reassign; not emitted by Phase A enroll pub const ENROLL_SITE_MOVE: &'static str = "enroll_site_move"; + /// An existing machine_uid presented a valid key for a DIFFERENT site than the one + /// the machine is currently bound to. Phase A REFUSES this (no move, no key minted) + /// as the accidental-move / cross-site-hijack guard; the deliberate move arrives + /// with the Phase-B `--reassign` flow + dashboard. Fires an alert. + pub const ENROLL_SITE_CONFLICT: &'static str = "enroll_site_conflict"; /// A machine_uid collision was detected at enroll — the endpoint dropped to /// `pending` and awaits operator confirmation in the dashboard. Fires an alert. pub const ENROLL_COLLISION_PENDING: &'static str = "enroll_collision_pending";