guru-connect/server/src/api/enroll.rs

//! Zero-touch per-site self-registration endpoint (SPEC-016, Phase A).
//!
//! `POST /api/enroll` is the PUBLIC (no-JWT) door a managed agent walks through on
//! first run: it presents its site's `site_code` + the long per-site enrollment
//! key (`cek_`) and its machine-derived `machine_uid`, and the server — if the key
//! verifies — dedups on `(tenant, machine_uid)`, creates or reuses the machine row,
//! and mints the per-machine `cak_` operating credential, returning the plaintext
//! `cak_` exactly once.
//!
//! Two-tier credential model (SPEC-016 §Security): the enrollment key is the
//! low-sensitivity, rotatable, per-site GATE ("may register"); the minted `cak_` is
//! the high-sensitivity, per-machine, independently-revocable OPERATING credential
//! the relay (`relay::validate_agent_api_key`) already accepts. This handler only
//! MINTS a `cak_` in the exact stored form `verify_agent_key` expects (SHA-256 hash
//! in `connect_agent_keys`) — it does not touch the relay auth path.
//!
//! AUTH POSTURE: auto-approve (ScreenConnect parity) — a clean enroll is live and
//! controllable immediately, with the new-enrollment alert as the tripwire. The one
//! exception is a detected `machine_uid` collision, which gates the machine to
//! `enrollment_state = 'pending'`.
//!
//! What the collision gate actually does in Phase A (be precise — it is NOT a full
//! quarantine yet): it sets `enrollment_state = 'pending'` and withholds a *newly
//! minted* `cak_` (the response carries no key). It does NOT revoke a `cak_` the
//! machine may already hold from a prior clean enroll, and NOTHING on the relay /
//! control plane consults `enrollment_state` yet — so a colliding machine that was
//! already enrolled keeps its working credential and remains controllable until the
//! enforcement below lands. The gate today is an audit + alert tripwire that asks an
//! operator to confirm in the dashboard, not a hard control-plane block.
//!
//! TODO(SPEC-016 Phase B/D): relay/control plane must refuse machines with
//! enrollment_state='pending'; consider revoking existing cak_ on collision.
//!
//! CROSS-SITE: Phase A does NOT move a machine between sites. A valid key for site B
//! presented for a `machine_uid` already bound to site A is REFUSED
//! (`ENROLL_SITE_CONFLICT`, 409) — no move, no key — as the accidental-move /
//! cross-site-hijack guard (`machine_uid` is a raw, spoofable client string in Phase
//! A). Deliberate moves arrive with the Phase-B `--reassign` flow + dashboard
//! (SPEC-016 §"Explicitly out of scope" / `--reassign`).
//!
//! SECURITY: never log the enrollment key, the minted `cak_`, or any hash. The
//! plaintext `cak_` appears only in the success response body, once.

use std::net::IpAddr;
use std::net::SocketAddr;

use axum::{
    extract::{ConnectInfo, State},
    http::{HeaderMap, StatusCode},
    Json,
};
use serde::{Deserialize, Serialize};
use serde_json::json;
use uuid::Uuid;

use crate::auth::{agent_keys, enrollment_keys};
use crate::db;
use crate::AppState;

/// A fixed, valid Argon2id PHC hash used ONLY to equalize timing on the early-reject
/// enroll paths (unknown `site_code`, no active key). Those paths would otherwise
/// return before paying the KDF, while a wrong key pays the full Argon2id verify —
/// a timing oracle that distinguishes "this site_code/active-key exists" from
/// "rejected at the key check". On every early reject we run a throwaway verify of
/// the supplied key against THIS constant and discard the result, so all rejection
/// paths pay one Argon2id verify.
///
/// It is the Argon2id (V0x13, default params — matching [`crate::auth::password`])
/// hash of the byte string `"enroll-timing-equalizer-throwaway"` under a fixed salt
/// — generated offline, committed verbatim, and asserted valid + verifying in the
/// tests below. It guards NOTHING: it is never a credential, never compared for
/// auth, and the password it encodes is public. A real key can never equal it
/// (Phase A keys are `cek_`-prefixed 256-bit randoms), so the dummy verify always
/// returns `false`, exactly as the real reject paths do.
const TIMING_EQUALIZER_PHC: &str =
    "$argon2id$v=19$m=19456,t=2,p=1$ZW5yb2xsdGltaW5nZXF6$tXiQXmQUAUVszrdp5HrVGIJtsQTidLuSKld0ITNv2Es";

/// Pay one Argon2id verify against [`TIMING_EQUALIZER_PHC`] and discard the result.
///
/// Called on the early-reject enroll paths so they cost the same as a real
/// wrong-key reject (which pays the KDF in `verify_enrollment_key`). The boolean is
/// intentionally ignored — this exists purely for its side effect (CPU time).
#[inline]
fn equalize_reject_timing(presented_key: &str) {
    // `verify_password` parses the PHC and runs Argon2id; a malformed constant would
    // make it return Err and skip the KDF, defeating the purpose, so the constant is
    // a known-valid PHC string (asserted in tests).
    let _ = crate::auth::password::verify_password(presented_key, TIMING_EQUALIZER_PHC);
}

/// Standard error envelope (see `.claude/standards/api/response-format.md`),
/// matching `api::machine_keys::ApiError`.
#[derive(Debug, Serialize)]
pub struct ApiError {
    pub detail: String,
    pub error_code: String,
    pub status_code: u16,
}

impl ApiError {
    fn new(status: StatusCode, code: &str, detail: &str) -> (StatusCode, Json<ApiError>) {
        (
            status,
            Json(ApiError {
                detail: detail.to_string(),
                error_code: code.to_string(),
                status_code: status.as_u16(),
            }),
        )
    }
}

type ApiResult<T> = Result<T, (StatusCode, Json<ApiError>)>;

/// Labels an installer carries for the machines it enrolls (SPEC-016 §3).
///
/// All optional: a thin installer may carry only company/site. `company` ->
/// `connect_machines.organization`; `site` -> `connect_machines.site` (the
/// free-text label, distinct from the relational site binding resolved from
/// `site_code`). `department` / `device_type` are reserved label fields (SPEC-007
/// AgentStatus parity) — accepted and folded into `tags` for now (no dedicated
/// columns yet), so they are not silently dropped.
#[derive(Debug, Default, Deserialize)]
pub struct EnrollLabels {
    #[serde(default)]
    pub company: Option<String>,
    #[serde(default)]
    pub site: Option<String>,
    #[serde(default)]
    pub department: Option<String>,
    #[serde(default)]
    pub device_type: Option<String>,
    #[serde(default)]
    pub tags: Vec<String>,
}

/// `POST /api/enroll` request body (SPEC-016 §3).
#[derive(Debug, Deserialize)]
pub struct EnrollRequest {
    pub site_code: String,
    /// The per-site enrollment secret (`cek_`). Verified against the site's active
    /// hashed key; never logged.
    pub enrollment_key: String,
    /// Opaque caller-supplied stable machine identity. Phase A treats this as an
    /// opaque string; the hardware-salted derivation is Phase B (agent-side).
    pub machine_uid: String,
    pub hostname: String,
    #[serde(default)]
    pub labels: EnrollLabels,
}

/// `POST /api/enroll` success response.
///
/// On a clean (active) enroll, `key` carries the plaintext `cak_` ONCE. On a
/// collision-gated `pending` enroll, `key` is `None` and `enrollment_state` is
/// `"pending"` — no usable operating credential is issued until an operator
/// confirms the endpoint in the dashboard.
#[derive(Debug, Serialize)]
pub struct EnrollResponse {
    /// `connect_machines.id` for the enrolled machine.
    pub machine_id: Uuid,
    /// The minted plaintext `cak_`, present ONLY for an active enroll, ONLY here.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub key: Option<String>,
    /// `"active"` (live, controllable, key issued) or `"pending"` (collision-gated;
    /// awaiting operator confirmation; no key issued).
    pub enrollment_state: String,
    /// Disposition: `"new"` | `"reuse"` | `"collision_pending"`. (`"site_move"` is
    /// NOT reachable on the Phase A unauthenticated path — a cross-site enroll is
    /// refused with `409 ENROLL_SITE_CONFLICT` before any disposition is returned;
    /// deliberate moves are the Phase-B `--reassign` flow.)
    pub disposition: String,
}

fn require_db(state: &AppState) -> ApiResult<&db::Database> {
    state.db.as_ref().ok_or_else(|| {
        ApiError::new(
            StatusCode::SERVICE_UNAVAILABLE,
            "DATABASE_UNAVAILABLE",
            "Database not available",
        )
    })
}

/// Collision-gate heuristic (PROVISIONAL — SPEC-016 §item-1/6).
///
/// The residual collision case is template-cloned VMs that share a hardware UUID
/// (some hypervisors clone the SMBIOS UUID), so a *different* physical endpoint
/// resolves to an existing `machine_uid`. We cannot distinguish that from a benign
/// re-image purely from a client-asserted uid, so the gate is intentionally
/// CONSERVATIVE and the heuristic is provisional, to be tightened in planning
/// (which durable hardware signals feed the uid, and the hypervisor behavior
/// matrix — see SPEC-016 §Remaining-for-planning).
///
/// PROVISIONAL heuristic chosen for Phase A: treat it as a collision when the
/// matched existing machine
///   (a) is currently considered ONLINE (status == "online"), AND
///   (b) reports a DIFFERENT hostname than the incoming request,
/// i.e. an apparently-live box already owns this uid yet a second box with a
/// different name is enrolling against it concurrently — the clone signature. A
/// case-insensitive hostname compare avoids false positives from case drift. An
/// OFFLINE matched row (the common re-image / re-install case) is NOT treated as a
/// collision — that is the legitimate reuse path. A same-hostname match is reuse,
/// never a collision.
///
/// Rationale for conservatism: a false POSITIVE merely sends a real machine to
/// `pending` (an operator clicks confirm — annoying, recoverable); a false NEGATIVE
/// would auto-activate a cloned endpoint (worse). When the salt set is finalized in
/// planning this should become "uid is stable hardware, so a genuine clone is
/// expected to be rare; gate on the salt's distinguishing component instead."
fn is_collision(existing: &db::machines::Machine, incoming_hostname: &str) -> bool {
    let online = existing.status.eq_ignore_ascii_case("online");
    let different_host = !existing
        .hostname
        .eq_ignore_ascii_case(incoming_hostname.trim());
    online && different_host
}

/// Mint a `cak_`, store its hash bound to `machine_id` + tenant, and return the
/// plaintext. Shared by the new/reuse/move active paths.
async fn mint_cak(
    db: &db::Database,
    machine_id: Uuid,
    tenant_id: Uuid,
) -> ApiResult<String> {
    let plaintext = agent_keys::generate_agent_key();
    let key_hash = agent_keys::hash_agent_key(&plaintext);
    db::agent_keys::insert_agent_key(db.pool(), machine_id, &key_hash, Some(tenant_id))
        .await
        .map_err(|e| {
            tracing::error!("[ENROLL] DB error minting agent key: {}", e);
            ApiError::new(
                StatusCode::INTERNAL_SERVER_ERROR,
                "INTERNAL_ERROR",
                "Failed to mint machine credential",
            )
        })?;
    Ok(plaintext)
}

/// `POST /api/enroll` — public self-registration (SPEC-016 §3).
pub async fn enroll(
    State(state): State<AppState>,
    ConnectInfo(addr): ConnectInfo<SocketAddr>,
    headers: HeaderMap,
    Json(req): Json<EnrollRequest>,
) -> ApiResult<(StatusCode, Json<EnrollResponse>)> {
    let db = require_db(&state)?;
    let tenant_id = db::tenancy::current_tenant_id();

    // Real client IP via the shared trusted-proxy-aware extractor (same source the
    // relay / rate limiter / audit log use, so the buckets never drift).
    let ip: IpAddr = crate::utils::ip_extract::client_ip(&addr, &headers, &state.trusted_proxies);

    // Basic input hygiene before any DB/KDF work.
    let site_code = req.site_code.trim();
    let hostname = req.hostname.trim();
    let machine_uid = req.machine_uid.trim();
    if site_code.is_empty() || hostname.is_empty() || machine_uid.is_empty() {
        return Err(ApiError::new(
            StatusCode::BAD_REQUEST,
            "INVALID_REQUEST",
            "site_code, hostname, and machine_uid are required",
        ));
    }

    // Defense-in-depth rate limit / lockout per (site_code, IP). The 256-bit
    // enrollment key is the load-bearing gate; this throttles brute-force/abuse.
    if !state.rate_limits.enroll.check(site_code, ip) {
        tracing::warn!(
            "[ENROLL] rate-limited/locked-out enroll for site_code={} from {}",
            site_code,
            ip
        );
        return Err(ApiError::new(
            StatusCode::TOO_MANY_REQUESTS,
            "RATE_LIMITED",
            "Too many enrollment attempts. Please try again later.",
        ));
    }

    // Resolve the site by code (per-tenant).
    let site = match db::sites::get_site_by_code(db.pool(), site_code, tenant_id).await {
        Ok(Some(s)) => s,
        Ok(None) => {
            // Pay the Argon2id cost a real wrong-key reject would, so an unknown
            // site_code is not distinguishable by timing (enumeration oracle).
            equalize_reject_timing(&req.enrollment_key);
            state.rate_limits.enroll.record_failure(site_code, ip);
            audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({
                "reason": "unknown_site_code",
                "site_code": site_code,
                "machine_uid": machine_uid,
            }))
            .await;
            tracing::warn!("[ENROLL] unknown site_code={} from {}", site_code, ip);
            // Same opaque rejection shape AND the same KDF cost as a bad key — do not
            // reveal (by body or by timing) which of the two failed.
            return Err(ApiError::new(
                StatusCode::UNAUTHORIZED,
                "ENROLL_REJECTED",
                "Invalid site code or enrollment key",
            ));
        }
        Err(e) => {
            tracing::error!("[ENROLL] DB error resolving site: {}", e);
            return Err(ApiError::new(
                StatusCode::INTERNAL_SERVER_ERROR,
                "INTERNAL_ERROR",
                "Internal server error",
            ));
        }
    };

    // Verify the enrollment key against the site's ACTIVE key. A rotated-out (old)
    // installer's key is inactive and rejected here — old installers cannot enroll
    // NEW machines after rotation (SPEC-016 success-criterion #3).
    let active_key = match db::enrollment_keys::get_active_for_site(db.pool(), site.id).await {
        Ok(Some(k)) => k,
        Ok(None) => {
            // Pay the Argon2id cost a real wrong-key reject would, so "site exists but
            // has no active key" is not distinguishable by timing from a bad key.
            equalize_reject_timing(&req.enrollment_key);
            state.rate_limits.enroll.record_failure(site_code, ip);
            audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({
                "reason": "no_active_key",
                "site_code": site_code,
                "machine_uid": machine_uid,
            }))
            .await;
            tracing::warn!("[ENROLL] no active enrollment key for site_code={}", site_code);
            return Err(ApiError::new(
                StatusCode::UNAUTHORIZED,
                "ENROLL_REJECTED",
                "Invalid site code or enrollment key",
            ));
        }
        Err(e) => {
            tracing::error!("[ENROLL] DB error loading active enrollment key: {}", e);
            return Err(ApiError::new(
                StatusCode::INTERNAL_SERVER_ERROR,
                "INTERNAL_ERROR",
                "Internal server error",
            ));
        }
    };

    if !enrollment_keys::verify_enrollment_key(&req.enrollment_key, &active_key.key_hash) {
        state.rate_limits.enroll.record_failure(site_code, ip);
        audit(db, db::events::EventTypes::ENROLL_REJECTED, ip, json!({
            "reason": "bad_enrollment_key",
            "site_code": site_code,
            "machine_uid": machine_uid,
        }))
        .await;
        tracing::warn!("[ENROLL] bad enrollment key for site_code={} from {}", site_code, ip);
        return Err(ApiError::new(
            StatusCode::UNAUTHORIZED,
            "ENROLL_REJECTED",
            "Invalid site code or enrollment key",
        ));
    }

    // Key verified — this is a legitimate attempt; reset the failure streak.
    state.rate_limits.enroll.record_success(site_code, ip);

    // Build the label/identity params shared by the create/update paths.
    let tags = effective_tags(&req.labels);
    let company = req.labels.company.as_deref().map(str::trim).filter(|s| !s.is_empty());
    let site_label = req
        .labels
        .site
        .as_deref()
        .map(str::trim)
        .filter(|s| !s.is_empty());

    // Dedup on (tenant, machine_uid). Wrapped in a bounded retry loop to resolve the
    // first-enroll TOCTOU (SPEC-016 Phase A, LOW): the `None` branch does a lookup
    // then an INSERT with no in-between lock, so two concurrent first-time enrolls of
    // the same `machine_uid` both see `None` and both INSERT. The partial unique index
    // on `machine_uid` makes the loser's INSERT raise a unique violation; instead of
    // 500ing, the loser loops back, the lookup now returns `Some`, and it converges to
    // the reuse path (one row, no error). One retry is sufficient — after a unique
    // violation the winner's row is committed and visible — but we cap iterations
    // defensively so a pathological repeat can never spin forever.
    let mut attempts = 0u8;
    loop {
        attempts += 1;
        let existing =
            match db::machines::get_machine_by_tenant_uid(db.pool(), tenant_id, machine_uid).await {
                Ok(e) => e,
                Err(e) => {
                    tracing::error!("[ENROLL] DB error on dedup lookup: {}", e);
                    return Err(ApiError::new(
                        StatusCode::INTERNAL_SERVER_ERROR,
                        "INTERNAL_ERROR",
                        "Internal server error",
                    ));
                }
            };

        match existing {
        // -- Reuse / collision path -------------------------------------------
        Some(existing) => {
            // Collision gate: a seemingly-different live endpoint resolving to an
            // existing uid -> pending, alert, NO usable cak_ minted.
            if is_collision(&existing, hostname) {
                let params = enroll_params(
                    &existing.agent_id,
                    hostname,
                    machine_uid,
                    tenant_id,
                    site.id,
                    company,
                    site_label,
                    &tags,
                    "pending",
                );
                let machine = db::machines::update_enrolled_machine(db.pool(), existing.id, &params)
                    .await
                    .map_err(map_update_err)?;

                audit(db, db::events::EventTypes::ENROLL_COLLISION_PENDING, ip, json!({
                    "machine_id": machine.id,
                    "machine_uid": machine_uid,
                    "site_code": site_code,
                    "existing_hostname": existing.hostname,
                    "incoming_hostname": hostname,
                    "heuristic": "online_existing_with_different_hostname (PROVISIONAL)",
                }))
                .await;
                // TODO(SPEC-016): wire to #dev-alerts — collision requires operator
                // confirmation in the dashboard before this endpoint may activate.
                tracing::warn!(
                    "[ENROLL] machine_uid collision -> PENDING: machine_id={} site_code={} \
                     existing_host={} incoming_host={} from {}",
                    machine.id, site_code, existing.hostname, hostname, ip
                );

                return Ok((
                    StatusCode::ACCEPTED,
                    Json(EnrollResponse {
                        machine_id: machine.id,
                        key: None,
                        enrollment_state: "pending".to_string(),
                        disposition: "collision_pending".to_string(),
                    }),
                ));
            }

            // Cross-site guard (SPEC-016 Phase A, HIGH): if this machine_uid is
            // already bound to a DIFFERENT site, REFUSE — do not move it and do not
            // mint a key. `machine_uid` is a raw, spoofable client string in Phase A,
            // so a silent move here is a hijack vector: a holder of site B's key could
            // claim a site-A machine merely by presenting its uid, repointing the row
            // and minting a fresh `cak_` on site B. Deliberate moves are out of scope
            // for Phase A and arrive with the Phase-B `--reassign` flow + dashboard
            // (SPEC-016 §"Explicitly out of scope"); until then this is the explicit
            // accidental-move / hijack guard the spec calls for.
            //
            // Only `Some(other_site)` is a conflict. `None` (a legacy / connect-path /
            // support-code row that never had a relational site binding) is treated as
            // a first-time bind to the enrolling site, NOT a cross-site move — there is
            // no prior owning site to hijack from.
            if let Some(existing_site) = existing.site_id {
                if existing_site != site.id {
                    audit(db, db::events::EventTypes::ENROLL_SITE_CONFLICT, ip, json!({
                        "machine_id": existing.id,
                        "machine_uid": machine_uid,
                        // Record the bound site id for the operator audit trail; the
                        // response body below leaks nothing about the other site.
                        "bound_site_id": existing_site,
                        "attempted_site_code": site_code,
                        "attempted_site_id": site.id,
                    }))
                    .await;
                    // TODO(SPEC-016): wire to #dev-alerts — cross-site enroll refused
                    // (possible accidental move or a cross-site claim attempt).
                    tracing::warn!(
                        "[ENROLL] cross-site conflict REFUSED: machine_id={} already bound to \
                         a different site; attempted site_code={} from {}",
                        existing.id, site_code, ip
                    );
                    // Opaque-enough: states the machine is already enrolled elsewhere
                    // and how to move it deliberately, without naming the other site.
                    return Err(ApiError::new(
                        StatusCode::CONFLICT,
                        "ENROLL_SITE_CONFLICT",
                        "This machine is already enrolled at another site. A deliberate \
                         move requires the operator-initiated reassignment flow.",
                    ));
                }
            }

            // Same-site reuse (re-image / re-install) or first relational bind of a
            // legacy/connect-path row (existing.site_id was NULL). No site move occurs
            // on the unauthenticated path in Phase A.
            let params = enroll_params(
                &existing.agent_id,
                hostname,
                machine_uid,
                tenant_id,
                site.id,
                company,
                site_label,
                &tags,
                "active",
            );
            let machine = db::machines::update_enrolled_machine(db.pool(), existing.id, &params)
                .await
                .map_err(map_update_err)?;

            let cak = mint_cak(db, machine.id, tenant_id).await?;

            audit(db, db::events::EventTypes::ENROLL_REUSE, ip, json!({
                "machine_id": machine.id,
                "machine_uid": machine_uid,
                "site_code": site_code,
            }))
            .await;
            tracing::info!(
                "[ENROLL] reuse: machine_id={} re-enrolled at site_code={} from {}",
                machine.id, site_code, ip
            );

            return Ok((
                StatusCode::OK,
                Json(EnrollResponse {
                    machine_id: machine.id,
                    key: Some(cak),
                    enrollment_state: "active".to_string(),
                    disposition: "reuse".to_string(),
                }),
            ));
        }

        // -- New enrollment ----------------------------------------------------
        None => {
            // Fresh opaque agent_id for the new row's `agent_id UNIQUE` column. The
            // agent's own config-UUID story is Phase B; the server only needs a
            // unique non-null value here, and the authoritative identity is the
            // minted cak_ -> machine binding.
            let agent_id = format!("enroll-{}", Uuid::new_v4());
            let params = enroll_params(
                &agent_id,
                hostname,
                machine_uid,
                tenant_id,
                site.id,
                company,
                site_label,
                &tags,
                "active",
            );
            let machine = match db::machines::insert_enrolled_machine(db.pool(), &params).await {
                Ok(m) => m,
                // TOCTOU loser: a concurrent first-enroll of the same machine_uid won
                // the race and committed its row between our lookup and this INSERT, so
                // the partial unique index on `machine_uid` rejects ours. Loop back: the
                // re-lookup now finds the winner's row and we converge to reuse instead
                // of 500ing. Capped so a non-uid unique violation (or any persistent
                // conflict) surfaces as a 500 rather than spinning.
                Err(e) if is_machine_uid_conflict(&e) && attempts < 2 => {
                    tracing::info!(
                        "[ENROLL] concurrent first-enroll race on machine_uid; \
                         retrying as reuse (site_code={} from {})",
                        site_code, ip
                    );
                    continue;
                }
                Err(e) => {
                    tracing::error!("[ENROLL] DB error inserting enrolled machine: {}", e);
                    return Err(ApiError::new(
                        StatusCode::INTERNAL_SERVER_ERROR,
                        "INTERNAL_ERROR",
                        "Failed to register machine",
                    ));
                }
            };

            let cak = mint_cak(db, machine.id, tenant_id).await?;

            audit(db, db::events::EventTypes::ENROLL_NEW, ip, json!({
                "machine_id": machine.id,
                "machine_uid": machine_uid,
                "site_code": site_code,
                "hostname": hostname,
            }))
            .await;
            // TODO(SPEC-016): wire to #dev-alerts — new-enrollment tripwire.
            tracing::info!(
                "[ENROLL] new: machine_id={} hostname={} site_code={} from {}",
                machine.id, hostname, site_code, ip
            );

            return Ok((
                StatusCode::CREATED,
                Json(EnrollResponse {
                    machine_id: machine.id,
                    key: Some(cak),
                    enrollment_state: "active".to_string(),
                    disposition: "new".to_string(),
                }),
            ));
        }
        }
    }
}

/// Is this DB error a unique-constraint violation involving the `machine_uid`
/// (the first-enroll TOCTOU race)? Postgres reports a unique violation as SQLSTATE
/// `23505`; we additionally require the constraint/message to reference `machine_uid`
/// so a violation on a different unique column (e.g. `agent_id`) is NOT swallowed as
/// a benign race — those are genuine errors and must still surface as 500.
fn is_machine_uid_conflict(e: &sqlx::Error) -> bool {
    e.as_database_error()
        .filter(|db_err| db_err.code().as_deref() == Some("23505"))
        .map(|db_err| {
            db_err
                .constraint()
                .map(|c| c.contains("machine_uid"))
                // Fall back to the message when the driver exposes no constraint name
                // (e.g. a partial-index violation), so the uid race is still caught.
                .unwrap_or_else(|| db_err.message().contains("machine_uid"))
        })
        .unwrap_or(false)
}

/// Fold `department` / `device_type` (no dedicated columns yet — SPEC-007) into the
/// tag set as `department:<x>` / `device_type:<x>` so they are preserved rather than
/// dropped, alongside any explicit tags. Empty/whitespace values are skipped.
fn effective_tags(labels: &EnrollLabels) -> Vec<String> {
    let mut tags: Vec<String> = labels
        .tags
        .iter()
        .map(|t| t.trim().to_string())
        .filter(|t| !t.is_empty())
        .collect();
    if let Some(d) = labels.department.as_deref().map(str::trim).filter(|s| !s.is_empty()) {
        tags.push(format!("department:{}", d));
    }
    if let Some(d) = labels
        .device_type
        .as_deref()
        .map(str::trim)
        .filter(|s| !s.is_empty())
    {
        tags.push(format!("device_type:{}", d));
    }
    tags
}

/// Assemble [`db::machines::EnrollMachineParams`] from the resolved pieces.
#[allow(clippy::too_many_arguments)]
fn enroll_params<'a>(
    agent_id: &'a str,
    hostname: &'a str,
    machine_uid: &'a str,
    tenant_id: Uuid,
    site_id: Uuid,
    company: Option<&'a str>,
    site_label: Option<&'a str>,
    tags: &'a [String],
    enrollment_state: &'a str,
) -> db::machines::EnrollMachineParams<'a> {
    db::machines::EnrollMachineParams {
        agent_id,
        hostname,
        machine_uid,
        tenant_id,
        site_id,
        company,
        site_label,
        tags,
        enrollment_state,
    }
}

/// Best-effort enrollment audit write — a failure here never fails the enroll.
async fn audit(db: &db::Database, event_type: &str, ip: IpAddr, details: serde_json::Value) {
    if let Err(e) = db::events::log_enrollment_event(db.pool(), event_type, details, Some(ip)).await
    {
        tracing::warn!("[ENROLL] failed to write {} audit event: {}", event_type, e);
    }
}

/// Map a DB error from the existing-row update to the standard 500 envelope.
fn map_update_err(e: sqlx::Error) -> (StatusCode, Json<ApiError>) {
    tracing::error!("[ENROLL] DB error updating enrolled machine: {}", e);
    ApiError::new(
        StatusCode::INTERNAL_SERVER_ERROR,
        "INTERNAL_ERROR",
        "Failed to update machine",
    )
}

#[cfg(test)]
mod tests {
    use super::*;

    // ---- Fix #2: the timing-equalizer PHC constant is a real, valid Argon2id hash.
    // If this constant were malformed, `verify_password` would return `Err` and skip
    // the KDF entirely — defeating the whole point (the early-reject paths would NOT
    // pay the Argon2id cost and the timing oracle would reopen). These tests are the
    // standing guard that the constant keeps paying the KDF.

    #[test]
    fn timing_equalizer_phc_is_a_valid_parseable_hash() {
        // A wrong password must verify to `false` WITHOUT erroring — proving the PHC
        // parsed and the KDF actually ran (a parse failure would surface as `Err`).
        let res = crate::auth::password::verify_password("cek_anything_at_all", TIMING_EQUALIZER_PHC);
        assert!(
            res.is_ok(),
            "TIMING_EQUALIZER_PHC must be a valid PHC string so the KDF runs; got Err: {:?}",
            res.err()
        );
        assert!(
            !res.unwrap(),
            "an arbitrary key must NOT match the throwaway equalizer hash"
        );
    }

    #[test]
    fn equalize_reject_timing_runs_without_panicking() {
        // The early-reject helper must always complete (it pays the KDF and discards
        // the result); a panic here would 500 a rejection path.
        equalize_reject_timing("cek_some_presented_value");
        equalize_reject_timing(""); // even degenerate input must be safe
    }

    // ---- Fix #1 support: the collision heuristic truth table is unchanged. -------

    fn machine_with(status: &str, hostname: &str) -> db::machines::Machine {
        db::machines::Machine {
            id: Uuid::new_v4(),
            agent_id: "agent-x".to_string(),
            hostname: hostname.to_string(),
            os_version: None,
            is_elevated: false,
            is_persistent: true,
            first_seen: chrono::Utc::now(),
            last_seen: chrono::Utc::now(),
            last_session_id: None,
            status: status.to_string(),
            created_at: chrono::Utc::now(),
            updated_at: chrono::Utc::now(),
            tenant_id: None,
            organization: None,
            site: None,
            tags: Vec::new(),
            machine_uid: Some("uid-1".to_string()),
            deleted_at: None,
            site_id: None,
            enrollment_state: "active".to_string(),
        }
    }

    #[test]
    fn collision_only_when_online_and_different_host() {
        // Online + different hostname => collision (the clone signature).
        assert!(is_collision(&machine_with("online", "HOST-A"), "HOST-B"));
        // Online + same hostname (case-insensitive) => reuse, not a collision.
        assert!(!is_collision(&machine_with("online", "HOST-A"), "host-a"));
        // Offline (the common re-image case) => never a collision, even if renamed.
        assert!(!is_collision(&machine_with("offline", "HOST-A"), "HOST-B"));
    }

    // ---- DB-gated tests (skip without TEST_DATABASE_URL; run in CI on Postgres). -
    // These validate the mechanisms fixes #1 and #3 rely on against a REAL Postgres
    // error/row, which cannot be faked on a workstation. The full handler is exercised
    // end-to-end in CI; here we pin the load-bearing primitives.

    use sqlx::postgres::PgPoolOptions;
    use sqlx::PgPool;

    async fn test_pool() -> Option<PgPool> {
        let url = std::env::var("TEST_DATABASE_URL").ok()?;
        let pool = PgPoolOptions::new()
            .max_connections(2)
            .connect(&url)
            .await
            .expect("connect to TEST_DATABASE_URL");
        sqlx::migrate!("./migrations")
            .run(&pool)
            .await
            .expect("apply migrations to the test database");
        Some(pool)
    }

    async fn cleanup(pool: &PgPool, uids: &[&str], site_codes: &[&str]) {
        for uid in uids {
            let _ = sqlx::query("DELETE FROM connect_machines WHERE machine_uid = $1")
                .bind(uid)
                .execute(pool)
                .await;
        }
        for code in site_codes {
            let _ = sqlx::query("DELETE FROM connect_sites WHERE site_code = $1")
                .bind(code)
                .execute(pool)
                .await;
        }
    }

    /// Fix #1: after a machine is enrolled at site A, the dedup lookup returns the row
    /// carrying `site_id == site_a`, which is exactly what the cross-site guard compares
    /// against `site.id`. This pins the precondition that makes the refusal possible:
    /// a site-B enroll resolves the SAME row and sees a DIFFERENT bound site_id.
    #[tokio::test]
    async fn enrolled_machine_exposes_bound_site_id_for_cross_site_guard() {
        let Some(pool) = test_pool().await else {
            return; // no TEST_DATABASE_URL: skip (runs in CI)
        };
        let tenant = db::tenancy::current_tenant_id();
        let uid = "test-xsite-uid-001";
        cleanup(&pool, &[uid], &["test-xsite-A", "test-xsite-B"]).await;

        let site_a = db::sites::insert_site(&pool, "test-xsite-A", None, None, Some(tenant))
            .await
            .expect("insert site A");
        let site_b = db::sites::insert_site(&pool, "test-xsite-B", None, None, Some(tenant))
            .await
            .expect("insert site B");
        assert_ne!(site_a.id, site_b.id);

        let agent_id_a = format!("enroll-{}", Uuid::new_v4());
        let params = enroll_params(
            &agent_id_a,
            "HOST-XSITE",
            uid,
            tenant,
            site_a.id,
            None,
            None,
            &[],
            "active",
        );
        db::machines::insert_enrolled_machine(&pool, &params)
            .await
            .expect("enroll at site A");

        // A subsequent (site-B) enroll dedups on (tenant, uid) and resolves THIS row.
        let found = db::machines::get_machine_by_tenant_uid(&pool, tenant, uid)
            .await
            .expect("dedup lookup")
            .expect("row exists");

        // The guard compares found.site_id (Some(A)) against the enrolling site (B);
        // they differ => refusal. Prove the comparison the guard makes evaluates true.
        assert_eq!(found.site_id, Some(site_a.id), "row is bound to site A");
        assert!(
            found.site_id.is_some_and(|s| s != site_b.id),
            "cross-site guard condition (bound to a different site) must hold"
        );

        cleanup(&pool, &[uid], &["test-xsite-A", "test-xsite-B"]).await;
    }

    /// Fix #3: a duplicate first-enroll of the same `machine_uid` raises a Postgres
    /// unique violation that `is_machine_uid_conflict` classifies as `true` (the race
    /// the loop retries as reuse), while a DIFFERENT unique violation (`agent_id`)
    /// classifies as `false` (a genuine error that must still 500). This is the exact
    /// branch predicate of the TOCTOU retry, validated against real driver errors.
    #[tokio::test]
    async fn machine_uid_conflict_is_classified_but_agent_id_conflict_is_not() {
        let Some(pool) = test_pool().await else {
            return; // no TEST_DATABASE_URL: skip (runs in CI)
        };
        let tenant = db::tenancy::current_tenant_id();
        let uid = "test-toctou-uid-001";
        cleanup(&pool, &[uid], &["test-toctou-A"]).await;

        let site = db::sites::insert_site(&pool, "test-toctou-A", None, None, Some(tenant))
            .await
            .expect("insert site");

        let shared_agent_id = format!("enroll-{}", Uuid::new_v4());
        let first = enroll_params(
            &shared_agent_id,
            "HOST-TOCTOU",
            uid,
            tenant,
            site.id,
            None,
            None,
            &[],
            "active",
        );
        db::machines::insert_enrolled_machine(&pool, &first)
            .await
            .expect("first enroll wins the race");

        // Loser re-inserts the SAME machine_uid (fresh agent_id) -> machine_uid unique
        // violation -> must be classified as the retryable race.
        let loser_agent_id = format!("enroll-{}", Uuid::new_v4());
        let uid_dup = enroll_params(
            &loser_agent_id,
            "HOST-TOCTOU-2",
            uid,
            tenant,
            site.id,
            None,
            None,
            &[],
            "active",
        );
        let err = db::machines::insert_enrolled_machine(&pool, &uid_dup)
            .await
            .expect_err("duplicate machine_uid must violate the unique index");
        assert!(
            is_machine_uid_conflict(&err),
            "a machine_uid unique violation must be classified as the retryable race; got: {err:?}"
        );

        // A DIFFERENT unique violation (reusing the agent_id with a NEW uid) must NOT be
        // swallowed as the race — it is a genuine error that still surfaces as 500.
        let uid_other = "test-toctou-uid-002";
        let agent_dup = enroll_params(
            &shared_agent_id, // collides on agent_id UNIQUE, not machine_uid
            "HOST-TOCTOU-3",
            uid_other,
            tenant,
            site.id,
            None,
            None,
            &[],
            "active",
        );
        let err2 = db::machines::insert_enrolled_machine(&pool, &agent_dup)
            .await
            .expect_err("duplicate agent_id must violate its unique constraint");
        assert!(
            !is_machine_uid_conflict(&err2),
            "an agent_id unique violation must NOT be misclassified as the machine_uid race; got: {err2:?}"
        );

        cleanup(&pool, &[uid, uid_other], &["test-toctou-A"]).await;
    }
}