Merge pull request 'SPEC-016 Phase A: zero-touch enrollment backend + migration' (#5) from feat/spec-016-enrollment into main
All checks were successful
Build and Test / Build Agent (Windows) (push) Successful in 10m37s
Build and Test / Build Server (Linux) (push) Successful in 15m25s
Build and Test / Security Audit (push) Successful in 5m28s
Build and Test / Build Summary (push) Successful in 23s

This commit was merged in pull request #5.
This commit is contained in:
2026-06-02 11:19:37 -07:00
14 changed files with 2259 additions and 2 deletions

4
Cargo.lock generated
View File

@@ -1407,7 +1407,7 @@ dependencies = [
[[package]]
name = "guruconnect"
version = "0.2.0"
version = "0.3.0"
dependencies = [
"anyhow",
"bytes",
@@ -1447,7 +1447,7 @@ dependencies = [
[[package]]
name = "guruconnect-server"
version = "0.2.0"
version = "0.3.0"
dependencies = [
"anyhow",
"argon2",

View File

@@ -0,0 +1,159 @@
-- Migration: 010_spec016_enrollment.sql
-- Purpose: SPEC-016 zero-touch per-site agent enrollment — server-side data model.
--
-- Adds the per-site enrollment-key table, a minimal sites table to anchor it,
-- and the machine-side columns the collision-gated self-registration flow needs.
--
-- Two-tier credential model (SPEC-016 §Security): a low-sensitivity, rotatable,
-- per-site ENROLLMENT KEY (the `cek_` secret stored hashed here) gates "may this
-- machine register at all", while the high-sensitivity per-machine `cak_`
-- operating credential (connect_agent_keys, migration 004) is minted on a
-- successful enroll. Compromise of an enrollment key is recovered by rotating one
-- site, not a fleet-wide re-key.
--
-- DEVIATION FROM SPEC (documented): SPEC-016 §DB-migration describes
-- `site_enrollment_keys.site_id` as `fk -> sites`, assuming a sites table already
-- exists. It does NOT — in the current schema "site" and "company/organization" are
-- free-text columns on connect_machines (migration 005), there is no relational
-- sites entity. This migration therefore CREATES a minimal `connect_sites` table
-- (the relational anchor the enrollment-key FK and the dashboard per-site key
-- display both require) keyed by a natural `site_code` and scoped per-tenant. It is
-- intentionally minimal (code + display name + tenant); richer site/company
-- modeling is left to future work. The free-text connect_machines.site /
-- .organization columns are untouched and continue to carry agent-reported labels.
--
-- Idempotent: CREATE TABLE/INDEX IF NOT EXISTS, ADD COLUMN IF NOT EXISTS. Applied on
-- server startup by sqlx::migrate!(); never pre-applied via psql. Ordered after 009.
-- See .claude/standards/gururmm/sqlx-migrations.md.
-- pgcrypto provides gen_random_uuid(); enabled in 001/004 but re-asserted for safety.
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
-- ============================================================================
-- connect_sites — relational anchor for per-site enrollment (see DEVIATION above)
-- ============================================================================
-- A site is the unit a single signed installer targets. `site_code` is the
-- non-secret, operator-facing identifier the installer carries and the agent sends
-- at /api/enroll (e.g. "ACME-PHX"). Uniqueness is per-tenant: the same human-chosen
-- code may legitimately exist in two tenants. tenant_id mirrors the nullable,
-- default-tenant-backfilled tenancy column used on every other scoped table
-- (migration 004); db::tenancy::current_tenant_id() resolves it for now.
CREATE TABLE IF NOT EXISTS connect_sites (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- Operator-facing site identifier the installer carries. Non-secret.
site_code TEXT NOT NULL,
-- Human-readable site / company display name for the dashboard.
display_name TEXT,
-- Default company label applied to machines enrolled at this site (mirrors the
-- free-text connect_machines.organization the agent otherwise self-reports).
company TEXT,
-- Tenancy-ready (Phase 4). Backfilled to the default tenant below.
tenant_id UUID,
-- RESERVED for future per-site enrollment POLICY work (SPEC-016 §out-of-scope):
-- default 'auto-approve'; a future 'pending-approval' value will gate new
-- enrollments. NOT enforced in Phase A — present so the policy SPEC needs no
-- schema change. Do not branch on this column yet.
enrollment_policy TEXT DEFAULT 'auto-approve',
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Per-tenant uniqueness of the natural site_code so /api/enroll can resolve a site
-- deterministically within a tenant while the same code may exist across tenants.
-- COALESCE keeps the index usable while tenant_id is still nullable (Phase 1).
CREATE UNIQUE INDEX IF NOT EXISTS idx_connect_sites_tenant_code
ON connect_sites (COALESCE(tenant_id, '00000000-0000-0000-0000-000000000001'::uuid), site_code);
-- Backfill the sites tenant_id to the default tenant (table is empty on a fresh DB;
-- no-op there, but keeps the migration self-consistent).
UPDATE connect_sites
SET tenant_id = '00000000-0000-0000-0000-000000000001'
WHERE tenant_id IS NULL;
-- ============================================================================
-- site_enrollment_keys — rotatable, hashed per-site enrollment secret + fingerprint
-- ============================================================================
-- Stores ONLY the Argon2id hash of the `cek_` secret; the plaintext is shown once
-- at issue/rotate and never recoverable. `version` is the monotonic rotation
-- counter; `fingerprint` is the non-secret short hex shown as `vN (XXXX)` in the
-- dashboard and baked into the installer filename. `active` marks the current key —
-- rotation flips the old key to active=false (blocking NEW enrollments from old
-- installers) and inserts a new active row; already-enrolled agents holding their
-- own `cak_` are unaffected. Multiple inactive (historical) rows may coexist per
-- site; at most one active row is intended (enforced by a partial unique index).
CREATE TABLE IF NOT EXISTS site_enrollment_keys (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
site_id UUID NOT NULL REFERENCES connect_sites(id) ON DELETE CASCADE,
-- Argon2id hash of the `cek_` enrollment secret. Never the plaintext.
key_hash TEXT NOT NULL,
-- Monotonic rotation version (1, 2, 3, ...).
version INTEGER NOT NULL,
-- Non-secret short hex fingerprint code (the XXXX in `vN (XXXX)`), derived from
-- the secret. Stored so the dashboard / GET endpoint can show it without the
-- secret.
fingerprint TEXT NOT NULL,
active BOOLEAN NOT NULL DEFAULT true,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
-- Set when this key is rotated out (active flipped to false).
rotated_at TIMESTAMPTZ
);
-- Lookup index for the enroll hot path: resolve the active key for a site.
CREATE INDEX IF NOT EXISTS idx_site_enrollment_keys_site_active
ON site_enrollment_keys (site_id, active);
-- At most one ACTIVE enrollment key per site (the "current" installer key).
-- Partial unique index so any number of inactive historical rows may coexist.
CREATE UNIQUE INDEX IF NOT EXISTS idx_site_enrollment_keys_one_active
ON site_enrollment_keys (site_id)
WHERE active;
-- ============================================================================
-- connect_machines — site binding + enrollment-state collision gate
-- ============================================================================
-- machine_uid already exists (migration 008) with a partial UNIQUE index on
-- (machine_uid) WHERE machine_uid IS NOT NULL. SPEC-016 §item-1 / resolved-decision #4
-- call for the dedup key to be PER-TENANT — (tenant_id, machine_uid) — so the same
-- hardware legitimately present in two tenants stays two rows. tenant_id is the
-- scoping column that exists on connect_machines (migration 004); machines have no
-- direct site_id today, so site is tracked separately (site_id below) and tenancy is
-- the uniqueness scope, exactly as the spec states.
--
-- CRITICAL CONSTRAINT (why we ADD rather than REPLACE the 008 index here):
-- db::machines::upsert_machine (the live connect-path upsert) uses
-- `ON CONFLICT (machine_uid) WHERE machine_uid IS NOT NULL` as its conflict arbiter.
-- Postgres matches that arbiter to the EXACT index from migration 008. Dropping that
-- index would make the live upsert fail to find an arbiter and error at runtime —
-- breaking every un-keyed agent reconnect. So migration 008's global index is LEFT
-- IN PLACE (the connect path keeps working unchanged) and the per-tenant index is
-- added ALONGSIDE it. In single-tenant Phase 1 the two are equivalent (every row's
-- tenant_id is the default tenant), so the per-tenant index adds the SPEC-016 dedup
-- semantics without a redundant-uniqueness conflict: a (tenant, uid) pair that is
-- unique is also globally unique today. When multi-tenancy activates AND
-- upsert_machine's ON CONFLICT is updated to name (tenant_id, machine_uid), a future
-- migration drops the global 008 index. Documented as deferred; do not drop it now.
-- Optional FK to the site a machine enrolled under (NULL for legacy / support-code
-- machines that never enrolled through /api/enroll). A site change on re-enroll is
-- the "site move" SPEC-016 audits.
ALTER TABLE connect_machines ADD COLUMN IF NOT EXISTS site_id UUID REFERENCES connect_sites(id) ON DELETE SET NULL;
-- enrollment_state: the collision gate (SPEC-016 §item-1/6). 'active' = live and
-- controllable (auto-approve posture); 'pending' = a machine_uid collision was
-- detected at enroll and an operator must confirm in the dashboard before the
-- endpoint may be controlled. Default 'active' so every legacy/connect-path row is
-- unaffected.
ALTER TABLE connect_machines
ADD COLUMN IF NOT EXISTS enrollment_state TEXT NOT NULL DEFAULT 'active'
CHECK (enrollment_state IN ('active', 'pending'));
-- Per-tenant machine_uid uniqueness (SPEC-016). Added ALONGSIDE migration 008's
-- global (machine_uid) index (see CRITICAL CONSTRAINT above — the connect-path
-- upsert's ON CONFLICT arbiter binds to the 008 index, which must survive). COALESCE
-- folds a NULL tenant_id to the default tenant so the index is well-defined while
-- tenancy is single-tenant (Phase 1); the WHERE clause excludes NULL machine_uid so
-- legacy un-keyed rows coexist freely.
CREATE UNIQUE INDEX IF NOT EXISTS idx_connect_machines_tenant_machine_uid
ON connect_machines (COALESCE(tenant_id, '00000000-0000-0000-0000-000000000001'::uuid), machine_uid)
WHERE machine_uid IS NOT NULL;

1008
server/src/api/enroll.rs Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -4,10 +4,12 @@ pub mod auth;
pub mod auth_logout;
pub mod changelog;
pub mod downloads;
pub mod enroll;
pub mod machine_keys;
pub mod releases;
pub mod removal;
pub mod sessions;
pub mod sites;
pub mod users;
use axum::{

217
server/src/api/sites.rs Normal file
View File

@@ -0,0 +1,217 @@
//! Site enrollment-key administration (SPEC-016, admin plane).
//!
//! Admin (dashboard JWT + admin role) endpoints for the per-site enrollment key
//! the dashboard surfaces and rotates:
//!
//! - `POST /api/sites/:id/enrollment-key/rotate` — regenerate the `cek_` secret,
//! bump the monotonic version, derive a new fingerprint, deactivate the prior
//! active key, and return the plaintext + fingerprint ONCE. Old installers can no
//! longer enroll NEW machines after this; already-enrolled agents (holding their
//! own `cak_`) are unaffected (SPEC-016 success-criterion #3). Doubles as
//! first-issue when a site has no key yet.
//! - `GET /api/sites/:id/enrollment-key` — read the CURRENT non-secret fingerprint
//! + version (never the secret). 404 if the site has no active key yet.
//!
//! Auth mirrors `api::machine_keys`: the [`crate::auth::AdminUser`] extractor gates
//! both routes, and they are mounted behind the JWT `auth_layer`.
//!
//! SECURITY: the plaintext `cek_` is returned exactly once (rotate response),
//! never persisted in plaintext and never logged. Read responses expose only the
//! version + fingerprint.
use axum::{
extract::{Path, State},
http::StatusCode,
Json,
};
use serde::Serialize;
use uuid::Uuid;
use crate::auth::{enrollment_keys, AdminUser};
use crate::db;
use crate::AppState;
/// Standard error envelope (matches `api::machine_keys::ApiError`).
///
/// Serialized as the JSON body of the error responses built by `ApiError::new`.
#[derive(Debug, Serialize)]
pub struct ApiError {
    /// Human-readable description of the failure.
    pub detail: String,
    /// Machine-readable error code (e.g. `SITE_NOT_FOUND`, `INTERNAL_ERROR`).
    pub error_code: String,
    /// Numeric HTTP status, mirrored into the body alongside the response status.
    pub status_code: u16,
}
impl ApiError {
    /// Build the `(status, JSON body)` pair that handlers return on failure.
    ///
    /// `code` is stored as `error_code`, `detail` as `detail`, and the numeric
    /// form of `status` is copied into the body's `status_code`.
    fn new(status: StatusCode, code: &str, detail: &str) -> (StatusCode, Json<ApiError>) {
        let envelope = ApiError {
            detail: detail.to_owned(),
            error_code: code.to_owned(),
            status_code: status.as_u16(),
        };
        (status, Json(envelope))
    }
}
/// Handler result: the success payload, or an HTTP status paired with the JSON
/// [`ApiError`] envelope.
type ApiResult<T> = Result<T, (StatusCode, Json<ApiError>)>;
/// Response for a freshly rotated/issued enrollment key. `key` is present ONLY
/// here, once.
#[derive(Debug, Serialize)]
pub struct RotatedEnrollmentKey {
    /// UUID of the site the new key belongs to.
    pub site_id: Uuid,
    /// The plaintext `cek_` enrollment key. Shown exactly once — bake it into the
    /// site installer now; the server keeps only its hash.
    // NOTE(review): the derived `Debug` prints this field verbatim; never
    // `{:?}`-log a value of this type.
    pub key: String,
    /// Monotonic rotation version.
    pub version: i32,
    /// The non-secret short hex code (the `XXXX` in `vN (XXXX)`).
    pub fingerprint: String,
    /// Fully rendered operator-facing fingerprint, e.g. `v3 (7F2A)`.
    pub fingerprint_label: String,
}
/// Non-secret current-key view for the GET endpoint.
///
/// Carries no key material: neither the plaintext nor the stored hash.
#[derive(Debug, Serialize)]
pub struct EnrollmentKeyView {
    /// UUID of the site whose key is described.
    pub site_id: Uuid,
    /// Monotonic rotation version of the key.
    pub version: i32,
    /// The non-secret short hex code (the `XXXX` in `vN (XXXX)`).
    pub fingerprint: String,
    /// Fully rendered operator-facing fingerprint, e.g. `v3 (7F2A)`.
    pub fingerprint_label: String,
    /// The row's stored `active` flag, copied through as-is.
    pub active: bool,
}
/// Borrow the database handle out of the shared application state.
///
/// Returns a 503 `DATABASE_UNAVAILABLE` envelope when `state.db` is `None`.
fn require_db(state: &AppState) -> ApiResult<&db::Database> {
    match state.db.as_ref() {
        Some(database) => Ok(database),
        None => Err(ApiError::new(
            StatusCode::SERVICE_UNAVAILABLE,
            "DATABASE_UNAVAILABLE",
            "Database not available",
        )),
    }
}
/// Look up a site by the UUID taken from the request path.
///
/// - Found: returns the site record.
/// - Not found: 404 `SITE_NOT_FOUND`.
/// - Database error: logged, then mapped to a generic 500 `INTERNAL_ERROR` so no
///   database detail reaches the client.
async fn resolve_site(db: &db::Database, site_id: Uuid) -> ApiResult<db::sites::Site> {
    match db::sites::get_site_by_id(db.pool(), site_id).await {
        Ok(Some(site)) => Ok(site),
        Ok(None) => Err(ApiError::new(
            StatusCode::NOT_FOUND,
            "SITE_NOT_FOUND",
            "Site not found",
        )),
        Err(e) => {
            tracing::error!("DB error resolving site: {}", e);
            Err(ApiError::new(
                StatusCode::INTERNAL_SERVER_ERROR,
                "INTERNAL_ERROR",
                "Internal server error",
            ))
        }
    }
}
/// POST /api/sites/:id/enrollment-key/rotate — rotate (or first-issue) a site's
/// enrollment key.
///
/// Steps: resolve the site, mint a new `cek_` secret, persist only its hash and
/// fingerprint via `db::enrollment_keys::rotate_key`, write a best-effort audit
/// event, and answer `201 Created` with the plaintext key. That response is the
/// only place the plaintext is ever returned.
///
/// Errors: 503 when no database is available, 404 when the site does not exist,
/// 500 when hashing or the rotation query fails.
pub async fn rotate_enrollment_key(
    AdminUser(admin): AdminUser,
    State(state): State<AppState>,
    Path(site_id): Path<Uuid>,
) -> ApiResult<(StatusCode, Json<RotatedEnrollmentKey>)> {
    let database = require_db(&state)?;
    let site = resolve_site(database, site_id).await?;

    // Mint the secret. Only the hash and fingerprint are stored; the plaintext
    // travels no further than the response body below.
    let plaintext = enrollment_keys::generate_enrollment_key();
    let key_hash = match enrollment_keys::hash_enrollment_key(&plaintext) {
        Ok(hash) => hash,
        Err(e) => {
            tracing::error!("Failed to hash enrollment key: {}", e);
            return Err(ApiError::new(
                StatusCode::INTERNAL_SERVER_ERROR,
                "INTERNAL_ERROR",
                "Failed to hash enrollment key",
            ));
        }
    };
    let fingerprint = enrollment_keys::compute_fingerprint(&plaintext);

    let rotation =
        db::enrollment_keys::rotate_key(database.pool(), site.id, &key_hash, &fingerprint).await;
    let new_key = match rotation {
        Ok(row) => row,
        Err(e) => {
            tracing::error!("DB error rotating enrollment key: {}", e);
            return Err(ApiError::new(
                StatusCode::INTERNAL_SERVER_ERROR,
                "INTERNAL_ERROR",
                "Failed to rotate enrollment key",
            ));
        }
    };

    let fingerprint_label =
        enrollment_keys::render_fingerprint(new_key.version, &new_key.fingerprint);

    // The audit record deliberately carries no key material (no plaintext, no hash).
    let audit_details = serde_json::json!({
        "site_id": site.id,
        "site_code": site.site_code,
        "version": new_key.version,
        "fingerprint": new_key.fingerprint,
        "rotated_by": admin.username,
    });
    let audit_outcome = db::events::log_enrollment_event(
        database.pool(),
        db::events::EventTypes::ENROLLMENT_KEY_ROTATED,
        audit_details,
        None,
    )
    .await;
    // Best-effort: the rotation is already committed, so an audit failure is only
    // logged and does not fail the request.
    if let Err(e) = audit_outcome {
        tracing::warn!("[ENROLL] failed to write key-rotate audit event: {}", e);
    }

    tracing::info!(
        "Admin {} rotated enrollment key for site {} to {}",
        admin.username,
        site.site_code,
        fingerprint_label
    );

    let body = RotatedEnrollmentKey {
        site_id: site.id,
        key: plaintext,
        version: new_key.version,
        fingerprint: new_key.fingerprint,
        fingerprint_label,
    };
    Ok((StatusCode::CREATED, Json(body)))
}
/// GET /api/sites/:id/enrollment-key — report the site's current key by its
/// non-secret version and fingerprint.
///
/// Errors: 503 when no database is available, 404 `SITE_NOT_FOUND` for an unknown
/// site, 404 `NO_ENROLLMENT_KEY` when the site has no active key, 500 on a
/// database error.
pub async fn get_enrollment_key(
    AdminUser(_admin): AdminUser,
    State(state): State<AppState>,
    Path(site_id): Path<Uuid>,
) -> ApiResult<Json<EnrollmentKeyView>> {
    let database = require_db(&state)?;
    let site = resolve_site(database, site_id).await?;

    let lookup = db::enrollment_keys::get_active_for_site(database.pool(), site.id).await;
    let key = match lookup {
        Ok(Some(row)) => row,
        Ok(None) => {
            return Err(ApiError::new(
                StatusCode::NOT_FOUND,
                "NO_ENROLLMENT_KEY",
                "Site has no active enrollment key",
            ));
        }
        Err(e) => {
            tracing::error!("DB error loading enrollment key: {}", e);
            return Err(ApiError::new(
                StatusCode::INTERNAL_SERVER_ERROR,
                "INTERNAL_ERROR",
                "Internal server error",
            ));
        }
    };

    let fingerprint_label = enrollment_keys::render_fingerprint(key.version, &key.fingerprint);
    let view = EnrollmentKeyView {
        site_id: site.id,
        version: key.version,
        fingerprint: key.fingerprint,
        fingerprint_label,
        active: key.active,
    };
    Ok(Json(view))
}

View File

@@ -0,0 +1,191 @@
//! Per-site enrollment key minting, hashing, verification, and fingerprinting
//! (SPEC-016 zero-touch enrollment, auth layer).
//!
//! This is the low-sensitivity, rotatable side of the two-tier credential model
//! (SPEC-016 §Security). A per-site ENROLLMENT key (`cek_` prefix) gates "may
//! this machine register at all" at `POST /api/enroll`; a successful enroll mints
//! the high-sensitivity per-machine `cak_` operating credential
//! ([`crate::auth::agent_keys`]). Compromise of an enrollment key is contained to
//! one site and recovered by rotating it.
//!
//! Lifecycle owned here (the secret side):
//!
//! - [`generate_enrollment_key`] mints a high-entropy, `cek_`-prefixed plaintext
//! secret. Mirrors [`crate::auth::agent_keys::generate_agent_key`]'s entropy
//! approach (32 random bytes from the OS CSPRNG, hex-encoded) with a DISTINCT
//! prefix so the two key kinds are never confused in logs or storage. The
//! plaintext is shown to the operator exactly once at issue/rotate and is NEVER
//! persisted or logged.
//! - [`hash_enrollment_key`] / [`verify_enrollment_key`] use **Argon2id** (via
//! [`crate::auth::password`]). This DIFFERS from `cak_` (which uses SHA-256 for
//! a constant-shape equality lookup): SPEC-016 §2 explicitly requires the
//! enrollment key be "stored hashed (Argon2id, same as `cak_`/passwords)". (The
//! spec's "same as `cak_`" wording does not match the SHA-256 scheme described
//! here for `cak_`; the password posture is the one this module actually shares.)
//! The trade-off is deliberate — enrollment keys are looked up by `(site, active)`
//! first (a small candidate set, usually one row) and only then verified, so the
//! per-verify KDF cost is bounded and not on a high-QPS path, while Argon2id
//! gives salted, GPU-resistant storage matching the password posture.
//! - [`compute_fingerprint`] derives the non-secret short HEX code shown as
//! `vN (XXXX)` (SPEC-016 resolved-decision #3 — hex, deliberately NOT the
//! GuruRMM word-style code, so the two products' artifacts are never visually
//! conflated).
//!
//! SECURITY: never log a plaintext key or its hash. Functions here return the
//! plaintext to the caller (issue/rotate endpoint) but emit no `tracing` output
//! containing key material.
use anyhow::Result;
use rand::RngCore;
use ring::digest;
/// Prefix marking a GuruConnect per-site enrollment key. Distinct from the
/// per-agent `cak_` prefix so the two key kinds are never confused.
pub const ENROLLMENT_KEY_PREFIX: &str = "cek_";
/// Number of random bytes behind an enrollment key (256 bits of entropy), matching
/// [`crate::auth::agent_keys`]. SPEC-016 §2 requires ≥256-bit.
///
/// The bytes are hex-encoded, so a complete key is the prefix followed by twice
/// this many characters.
const ENROLLMENT_KEY_RANDOM_BYTES: usize = 32;
/// Number of hex characters in the fingerprint code (the `XXXX` in `vN (XXXX)`).
/// Four hex chars = 16 bits — ample to let an operator tell two installers apart at
/// a glance; it is a non-secret display aid, not a security control.
///
/// [`compute_fingerprint`] slices this many characters off a 64-character SHA-256
/// hex digest, so the value must not exceed 64.
const FINGERPRINT_HEX_LEN: usize = 4;
/// Mint a fresh `cek_`-prefixed per-site enrollment key and return its plaintext.
///
/// The key is [`ENROLLMENT_KEY_PREFIX`] followed by the lowercase hex encoding of
/// [`ENROLLMENT_KEY_RANDOM_BYTES`] bytes drawn from `rand::rngs::OsRng`.
///
/// Nothing is stored here: the caller is expected to show the plaintext to the
/// operator once and persist only [`hash_enrollment_key`] of it.
pub fn generate_enrollment_key() -> String {
    let mut secret = [0u8; ENROLLMENT_KEY_RANDOM_BYTES];
    rand::rngs::OsRng.fill_bytes(&mut secret);

    let mut key = String::from(ENROLLMENT_KEY_PREFIX);
    key.push_str(&hex_encode(&secret));
    key
}
/// Produce the storable hash of an enrollment key (Argon2id, per SPEC-016 §2).
///
/// This is a thin delegation to [`crate::auth::password::hash_password`], so
/// enrollment keys are hashed exactly as passwords are. The returned string is
/// what gets written to `site_enrollment_keys.key_hash`.
///
/// # Errors
///
/// Propagates any error reported by the password hasher.
pub fn hash_enrollment_key(plaintext: &str) -> Result<String> {
    let stored_form = crate::auth::password::hash_password(plaintext)?;
    Ok(stored_form)
}
/// Verify a presented enrollment key against a stored hash.
///
/// Returns `true` only when `presented` carries the `cek_` prefix AND the
/// underlying password verifier reports a match. Every other outcome — missing
/// prefix, mismatch, or a verifier error such as a malformed stored hash — is
/// collapsed to `false`. This function never returns an error and never panics
/// on bad input, so callers can treat `false` as "reject".
///
/// The prefix check runs first as a cheap structural reject, so obviously bogus
/// input never reaches the KDF.
///
/// SECURITY: only compares; never logs the presented key or the hash.
pub fn verify_enrollment_key(presented: &str, stored_hash: &str) -> bool {
    if !presented.starts_with(ENROLLMENT_KEY_PREFIX) {
        return false;
    }
    // `Ok(true)` is the single accepting outcome; `Ok(false)` and `Err(_)` both reject.
    matches!(
        crate::auth::password::verify_password(presented, stored_hash),
        Ok(true)
    )
}
/// Derive the short, non-secret HEX fingerprint code of an enrollment key.
///
/// The code is the leading [`FINGERPRINT_HEX_LEN`] hex characters of
/// SHA-256(`plaintext`), uppercased. The same plaintext always yields the same
/// code, and the code is for display only. Combine it with the rotation version
/// through [`render_fingerprint`].
pub fn compute_fingerprint(plaintext: &str) -> String {
    let mut code = hex_encode(digest::digest(&digest::SHA256, plaintext.as_bytes()).as_ref());
    // Keep only the leading characters of the 64-character digest, then uppercase
    // them in place.
    code.truncate(FINGERPRINT_HEX_LEN);
    code.make_ascii_uppercase();
    code
}
/// Format the operator-facing fingerprint label `vN (XXXX)` (SPEC-016 §2).
///
/// `version` is the rotation counter and `code` the short hex code produced by
/// `compute_fingerprint`; neither is validated or altered.
/// Example: `render_fingerprint(3, "7F2A")` -> `"v3 (7F2A)"`.
pub fn render_fingerprint(version: i32, code: &str) -> String {
    ["v", version.to_string().as_str(), " (", code, ")"].concat()
}
/// Encode `bytes` as lowercase hex, two characters per byte, without depending on
/// the `hex` crate (mirrors [`crate::auth::agent_keys`]).
fn hex_encode(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        // High nibble first, then low nibble.
        out.push(DIGITS[usize::from(byte >> 4)] as char);
        out.push(DIGITS[usize::from(byte & 0x0f)] as char);
    }
    out
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Mint a key and return it together with its stored hash.
    fn minted_with_hash() -> (String, String) {
        let minted = generate_enrollment_key();
        let stored = hash_enrollment_key(&minted).expect("hash");
        (minted, stored)
    }

    /// A minted key is the prefix followed by two hex chars per random byte.
    #[test]
    fn generated_key_is_prefixed_and_high_entropy() {
        let minted = generate_enrollment_key();
        let expected_len = ENROLLMENT_KEY_PREFIX.len() + ENROLLMENT_KEY_RANDOM_BYTES * 2;
        assert!(minted.starts_with(ENROLLMENT_KEY_PREFIX));
        assert_eq!(minted.len(), expected_len);
    }

    /// Two consecutive mints never collide.
    #[test]
    fn generated_keys_are_unique() {
        let first = generate_enrollment_key();
        let second = generate_enrollment_key();
        assert_ne!(first, second);
    }

    /// A key verifies against its own hash.
    #[test]
    fn hash_and_verify_roundtrip() {
        let (minted, stored) = minted_with_hash();
        assert!(verify_enrollment_key(&minted, &stored));
    }

    /// A different, well-formed key does not verify.
    #[test]
    fn verify_rejects_wrong_key() {
        let (_minted, stored) = minted_with_hash();
        let impostor = generate_enrollment_key();
        assert!(!verify_enrollment_key(&impostor, &stored));
    }

    /// Input without the `cek_` prefix is rejected structurally, before the KDF.
    #[test]
    fn verify_rejects_unprefixed_input_without_touching_kdf() {
        let (_minted, stored) = minted_with_hash();
        assert!(!verify_enrollment_key("not-a-key", &stored));
    }

    /// A garbage stored hash must reject rather than panic.
    #[test]
    fn verify_rejects_malformed_stored_hash() {
        let minted = generate_enrollment_key();
        assert!(!verify_enrollment_key(&minted, "not-a-phc-hash"));
    }

    /// The fingerprint is deterministic, the configured length, and uppercase hex.
    #[test]
    fn fingerprint_is_stable_uppercase_hex_of_expected_len() {
        let sample = "cek_deadbeef";
        let first = compute_fingerprint(sample);
        let second = compute_fingerprint(sample);
        assert_eq!(first, second);
        assert_eq!(first.len(), FINGERPRINT_HEX_LEN);
        assert!(first.chars().all(|c| c.is_ascii_hexdigit()));
        assert_eq!(first, first.to_ascii_uppercase());
    }

    /// Distinct keys yield distinct codes (for these fixed samples).
    #[test]
    fn fingerprint_differs_per_key() {
        let left = compute_fingerprint("cek_aaa");
        let right = compute_fingerprint("cek_bbb");
        assert_ne!(left, right);
    }

    /// The rendered label follows the `vN (XXXX)` shape from the spec.
    #[test]
    fn render_fingerprint_matches_spec_shape() {
        assert_eq!(render_fingerprint(3, "7F2A"), "v3 (7F2A)");
    }
}

View File

@@ -4,6 +4,7 @@
//! validation for agents.
pub mod agent_keys;
pub mod enrollment_keys;
pub mod jwt;
pub mod password;
pub mod token_blacklist;

View File

@@ -0,0 +1,141 @@
//! Per-site enrollment key database operations (SPEC-016 zero-touch enrollment).
//!
//! Backs the `site_enrollment_keys` table (migration 010). Stores ONLY the
//! Argon2id hash of the `cek_` secret plus the non-secret rotation metadata
//! (version, fingerprint, active flag). Computing the hash and minting the
//! plaintext is [`crate::auth::enrollment_keys`]'s job; this module is
//! hash-agnostic persistence and takes already-hashed values.
//!
//! Rotation invariant: at most one `active` row per site (enforced by a partial
//! unique index in migration 010). [`rotate_key`] deactivates the current active
//! row and inserts a new active one inside a single transaction so the invariant
//! is never transiently violated.
//!
//! All queries use runtime `sqlx::query()` / `sqlx::query_as()` per the codebase
//! convention (no compile-time `query!` macros, no `.sqlx` offline cache).
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use uuid::Uuid;
/// Per-site enrollment key record.
///
/// `key_hash` is the only representation of the secret the server stores; the
/// plaintext is shown once at issue/rotate and never persisted.
#[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)]
pub struct EnrollmentKey {
    /// Row primary key.
    pub id: Uuid,
    /// Site (`connect_sites.id`) this key belongs to.
    pub site_id: Uuid,
    /// Stored hash of the `cek_` secret; never the plaintext.
    // NOTE(review): the derived `Debug` and `Serialize` both include this hash —
    // confirm no caller logs or serializes the whole record.
    pub key_hash: String,
    /// Monotonic rotation version (1, 2, 3, ...).
    pub version: i32,
    /// Non-secret short hex code shown to operators.
    pub fingerprint: String,
    /// `true` for the site's current key; rotation flips the previous key to `false`.
    pub active: bool,
    /// When the row was inserted.
    pub created_at: DateTime<Utc>,
    /// When the key was rotated out; `None` until rotation deactivates it.
    pub rotated_at: Option<DateTime<Utc>>,
}
/// Load the active enrollment key for `site_id`, or `None` if the site has none.
///
/// Used on the `/api/enroll` hot path to find the one key whose hash the presented
/// `cek_` is verified against. Because the partial unique index allows at most one
/// active row per site, `fetch_optional` cannot silently drop a second match.
pub async fn get_active_for_site(
    pool: &PgPool,
    site_id: Uuid,
) -> Result<Option<EnrollmentKey>, sqlx::Error> {
    const SELECT_ACTIVE: &str = r#"
        SELECT id, site_id, key_hash, version, fingerprint, active, created_at, rotated_at
        FROM site_enrollment_keys
        WHERE site_id = $1 AND active
        "#;

    let lookup = sqlx::query_as::<_, EnrollmentKey>(SELECT_ACTIVE).bind(site_id);
    lookup.fetch_optional(pool).await
}
/// Create a site's FIRST enrollment key, fixed at version 1 and marked active.
///
/// Later rotations go through [`rotate_key`]. If the site already has an active
/// key, the insert fails with a unique violation from the one-active-key index;
/// callers should rotate instead.
#[allow(dead_code)] // Wired by site-admin issue flow; Phase A exposes rotation (which also covers first issue when none exists).
pub async fn insert_initial_key(
    pool: &PgPool,
    site_id: Uuid,
    key_hash: &str,
    fingerprint: &str,
) -> Result<EnrollmentKey, sqlx::Error> {
    const INSERT_FIRST: &str = r#"
        INSERT INTO site_enrollment_keys (site_id, key_hash, version, fingerprint, active)
        VALUES ($1, $2, 1, $3, true)
        RETURNING id, site_id, key_hash, version, fingerprint, active, created_at, rotated_at
        "#;

    let insert = sqlx::query_as::<_, EnrollmentKey>(INSERT_FIRST)
        .bind(site_id)
        .bind(key_hash)
        .bind(fingerprint);
    insert.fetch_one(pool).await
}
/// Rotate a site's enrollment key (SPEC-016 §2): deactivate the current active key
/// (if any) and insert a new active key at the next monotonic version, all in one
/// transaction.
///
/// Returns the newly-created active key. If the site has no key yet, this issues
/// version 1 (so rotation also serves as first-issue). The caller passes the
/// already-hashed new secret and its fingerprint; the plaintext is surfaced once by
/// the caller and never reaches this layer.
///
/// # Concurrency
///
/// Rotations for the same site are serialized by a row lock on the parent
/// `connect_sites` row, taken first inside the transaction. Without it, two
/// concurrent rotations (or two concurrent first-issues) would both read the same
/// `MAX(version)`, and the slower one would fail its INSERT on the one-active-key
/// partial unique index. With the lock, the second rotation waits for the first to
/// commit and then rotates on top of it. `FOR NO KEY UPDATE` is used so the lock
/// does not block foreign-key checks from unrelated inserts that reference the site.
///
/// # Errors
///
/// Returns any `sqlx::Error` from the transaction. A `site_id` with no matching
/// `connect_sites` row takes no lock and fails at the INSERT with a foreign-key
/// violation. On any error the transaction is dropped uncommitted, so nothing
/// changes.
pub async fn rotate_key(
    pool: &PgPool,
    site_id: Uuid,
    new_key_hash: &str,
    new_fingerprint: &str,
) -> Result<EnrollmentKey, sqlx::Error> {
    let mut tx = pool.begin().await?;

    // Serialize rotations per site. The lock is held until commit/rollback; the
    // returned row itself is not needed.
    let _site_lock = sqlx::query("SELECT id FROM connect_sites WHERE id = $1 FOR NO KEY UPDATE")
        .bind(site_id)
        .fetch_optional(&mut *tx)
        .await?;

    // Highest existing version for this site (NULL -> 0 so the first key is v1).
    // Read after the lock so it reflects any rotation that just committed.
    let current_max: Option<i32> =
        sqlx::query_scalar("SELECT MAX(version) FROM site_enrollment_keys WHERE site_id = $1")
            .bind(site_id)
            .fetch_one(&mut *tx)
            .await?;
    let next_version = current_max.unwrap_or(0) + 1;

    // Deactivate the current active key (if any), stamping rotated_at. This runs
    // before the INSERT so the one-active-key index is never violated.
    sqlx::query(
        r#"
        UPDATE site_enrollment_keys
        SET active = false, rotated_at = NOW()
        WHERE site_id = $1 AND active
        "#,
    )
    .bind(site_id)
    .execute(&mut *tx)
    .await?;

    // Insert the new active key at the next version.
    let new_key = sqlx::query_as::<_, EnrollmentKey>(
        r#"
        INSERT INTO site_enrollment_keys (site_id, key_hash, version, fingerprint, active)
        VALUES ($1, $2, $3, $4, true)
        RETURNING id, site_id, key_hash, version, fingerprint, active, created_at, rotated_at
        "#,
    )
    .bind(site_id)
    .bind(new_key_hash)
    .bind(next_version)
    .bind(new_fingerprint)
    .fetch_one(&mut *tx)
    .await?;

    tx.commit().await?;
    Ok(new_key)
}

View File

@@ -69,6 +69,40 @@ impl EventTypes {
pub const MACHINE_REMOVED: &'static str = "machine_removed";
/// An administrator soft-deleted (purged) a session and dropped it in-memory.
pub const SESSION_REMOVED: &'static str = "session_removed";
// Zero-touch enrollment events (SPEC-016). Written by POST /api/enroll and the
// site enrollment-key rotation endpoint. These carry no session, so they are
// logged via `log_enrollment_event` with `session_id = NULL`; the structured
// detail (machine_uid, site_code, fingerprint, etc.) goes in `details` and the
// source IP in `ip_address`.
/// A new machine self-registered at a site and was minted its first `cak_`.
pub const ENROLL_NEW: &'static str = "enroll_new";
/// An existing machine_uid re-enrolled at the SAME site — the row was reused and
/// a fresh `cak_` minted (re-image / re-install).
pub const ENROLL_REUSE: &'static str = "enroll_reuse";
/// An existing machine_uid enrolled under a DIFFERENT site — the machine's site
/// binding was updated (a "site move"). Fires an alert.
///
/// NOTE (SPEC-016 Phase A): the unauthenticated enroll path does NOT perform this
/// move — a cross-site enroll is REFUSED (`ENROLL_SITE_CONFLICT`) rather than
/// silently repointing the machine. This event is reserved for the deliberate
/// Phase-B `--reassign` flow (and the dashboard move action) that supersede it.
#[allow(dead_code)] // reserved for Phase-B --reassign; not emitted by Phase A enroll
pub const ENROLL_SITE_MOVE: &'static str = "enroll_site_move";
/// An existing machine_uid presented a valid key for a DIFFERENT site than the one
/// the machine is currently bound to. Phase A REFUSES this (no move, no key minted)
/// as the accidental-move / cross-site-hijack guard; the deliberate move arrives
/// with the Phase-B `--reassign` flow + dashboard. Fires an alert.
pub const ENROLL_SITE_CONFLICT: &'static str = "enroll_site_conflict";
/// A machine_uid collision was detected at enroll — the endpoint dropped to
/// `pending` and awaits operator confirmation in the dashboard. Fires an alert.
pub const ENROLL_COLLISION_PENDING: &'static str = "enroll_collision_pending";
/// An enroll attempt failed enrollment-key verification (wrong/inactive key or
/// unknown site_code). Security audit trail for the open-registration surface.
pub const ENROLL_REJECTED: &'static str = "enroll_rejected";
/// An administrator rotated a site's enrollment key (new version + fingerprint;
/// old installers can no longer enroll NEW machines).
pub const ENROLLMENT_KEY_ROTATED: &'static str = "enrollment_key_rotated";
}
/// Log a session event
@@ -154,6 +188,42 @@ pub async fn log_admin_removal(
Ok(result)
}
/// Record a SPEC-016 zero-touch enrollment event in the audit log.
///
/// The row lands in the shared `connect_session_events` table, but every
/// session/viewer column is written as `NULL`: enrollment is an unauthenticated
/// agent action and has neither a session nor a viewer. The structured context
/// (machine_uid, site_code, fingerprint version, decision, ...) travels in
/// `details`; the agent's source address, when known, goes in `ip_address`.
///
/// Returns the `id` of the inserted audit row.
///
/// # Errors
///
/// Returns the underlying `sqlx::Error` if the insert fails. Callers treat this
/// as best-effort: the machine row and its `cak_` already exist by the time the
/// event is written, so a failed audit write is logged and the enroll proceeds.
pub async fn log_enrollment_event(
    pool: &PgPool,
    event_type: &str,
    details: JsonValue,
    ip_address: Option<IpAddr>,
) -> Result<i64, sqlx::Error> {
    const INSERT_EVENT: &str = r#"
INSERT INTO connect_session_events
(session_id, event_type, viewer_id, viewer_name, details, ip_address)
VALUES (NULL, $1, NULL, NULL, $2, $3::inet)
RETURNING id
"#;

    // The address is bound as text and cast to `inet` by the statement itself.
    let source_ip = ip_address.map(|addr| addr.to_string());

    sqlx::query_scalar::<_, i64>(INSERT_EVENT)
        .bind(event_type)
        .bind(details)
        .bind(source_ip)
        .fetch_one(pool)
        .await
}
/// Get events for a session
#[allow(dead_code)] // TODO(native-remote-control): consumed by the integration API; see docs/specs/native-remote-control/
pub async fn get_session_events(

View File

@@ -64,6 +64,16 @@ pub struct Machine {
/// history) is retained. NULL = live. Nullable, so it is read NULL-tolerantly
/// in the manual `FromRow` below.
pub deleted_at: Option<DateTime<Utc>>,
/// Relational site binding for a machine enrolled via `/api/enroll` (SPEC-016,
/// migration 010). NULL for legacy / support-code / connect-path machines that
/// never enrolled through the zero-touch flow. A change of this on re-enroll is
/// the "site move" the enroll path audits.
pub site_id: Option<Uuid>,
/// Collision-gate state (SPEC-016, migration 010): `'active'` (live, auto-approve)
/// or `'pending'` (a machine_uid collision was detected at enroll; awaiting
/// operator confirmation before the endpoint may be controlled). Non-null with a
/// default of `'active'`; read NULL-tolerantly below for defense in depth.
pub enrollment_state: String,
}
impl<'r> FromRow<'r, PgRow> for Machine {
@@ -83,6 +93,13 @@ impl<'r> FromRow<'r, PgRow> for Machine {
machine_uid: row.try_get("machine_uid")?,
// Schema-nullable (migration 009); decode directly as Option.
deleted_at: row.try_get("deleted_at")?,
// Schema-nullable (migration 010); decode directly as Option.
site_id: row.try_get("site_id")?,
// Non-null with default 'active' (migration 010); read NULL-tolerantly
// (older snapshots / partial rows) and fall back to 'active'.
enrollment_state: row
.try_get::<Option<String>, _>("enrollment_state")?
.unwrap_or_else(|| "active".to_string()),
// Nullable-with-default columns mapped to non-`Option` Rust types: read as
// `Option<T>` and fall back to the type default so a NULL cell never errors.
is_elevated: row
@@ -207,6 +224,131 @@ pub async fn upsert_machine(
}
}
/// Look up a machine by the SPEC-016 per-tenant dedup key `(tenant_id, machine_uid)`.
///
/// Used at enroll time: hardware that re-enrolls (re-image / re-install) resolves
/// to its existing row inside the tenant, while the same hardware under a
/// different tenant is a separate row. A row whose `tenant_id` is NULL is folded
/// onto the default tenant UUID before comparison, the same fold the unique index
/// uses, so the lookup and the uniqueness guarantee agree.
///
/// Soft-deleted rows are intentionally included (there is no `deleted_at IS NULL`
/// filter, unlike `get_machine_by_agent_id`): a purged machine that legitimately
/// re-enrolls has to be found so the enroll path can revive it.
///
/// Returns `Ok(None)` when no row matches.
pub async fn get_machine_by_tenant_uid(
    pool: &PgPool,
    tenant_id: Uuid,
    machine_uid: &str,
) -> Result<Option<Machine>, sqlx::Error> {
    const SELECT_BY_TENANT_UID: &str = r#"
SELECT * FROM connect_machines
WHERE machine_uid = $1
AND COALESCE(tenant_id, '00000000-0000-0000-0000-000000000001'::uuid) = $2
"#;

    let lookup = sqlx::query_as::<_, Machine>(SELECT_BY_TENANT_UID)
        .bind(machine_uid)
        .bind(tenant_id);

    lookup.fetch_optional(pool).await
}
/// Parameters for an enroll-time machine create/update (SPEC-016 `/api/enroll`).
///
/// One borrowed bundle shared by `insert_enrolled_machine` (first enrollment) and
/// `update_enrolled_machine` (re-enroll of an existing row).
///
/// `agent_id` is a freshly minted opaque id for a NEW enrollment (the agent's
/// config UUID story is Phase B; the server only needs a unique non-null value for
/// the `agent_id UNIQUE` column). On REUSE/MOVE the existing row's `agent_id` is
/// preserved (the FK target of any already-minted `cak_`), so the update path does
/// not touch it.
pub struct EnrollMachineParams<'a> {
    /// Opaque agent id; written by the insert path only, ignored by the update path.
    pub agent_id: &'a str,
    /// Hostname reported at enroll; written on both insert and update.
    pub hostname: &'a str,
    /// Per-tenant dedup key; written by the insert path only.
    pub machine_uid: &'a str,
    /// Owning tenant; written by the insert path only.
    pub tenant_id: Uuid,
    /// Relational site binding (-> connect_machines.site_id).
    pub site_id: Uuid,
    /// Company label (-> connect_machines.organization).
    pub company: Option<&'a str>,
    /// Site label (-> connect_machines.site) — the free-text label, distinct from
    /// the relational site_id binding.
    pub site_label: Option<&'a str>,
    /// Tag set. On update an empty slice leaves the stored tags untouched.
    pub tags: &'a [String],
    /// 'active' (auto-approve) or 'pending' (collision-gated).
    pub enrollment_state: &'a str,
}
/// Create the machine row for a first-time enrollment (SPEC-016).
///
/// Writes the agent id, hostname, per-tenant `machine_uid`, tenant, relational
/// `site_id`, the company/site labels, tags and the collision-gate
/// `enrollment_state`. The row is always created persistent, with status
/// `'online'` and `last_seen` set to the database's current time.
///
/// Returns the inserted row; its `id` is the FK target for the `cak_` the caller
/// mints next.
///
/// # Errors
///
/// Propagates any `sqlx::Error` raised by the insert.
pub async fn insert_enrolled_machine(
    pool: &PgPool,
    p: &EnrollMachineParams<'_>,
) -> Result<Machine, sqlx::Error> {
    const INSERT_MACHINE: &str = r#"
INSERT INTO connect_machines
(agent_id, hostname, is_persistent, status, last_seen, machine_uid,
tenant_id, site_id, organization, site, tags, enrollment_state)
VALUES ($1, $2, true, 'online', NOW(), $3, $4, $5, $6, $7, $8, $9)
RETURNING *
"#;

    // Bind order follows the placeholders $1..$9 in the statement above.
    let insert = sqlx::query_as::<_, Machine>(INSERT_MACHINE)
        .bind(p.agent_id)
        .bind(p.hostname)
        .bind(p.machine_uid)
        .bind(p.tenant_id)
        .bind(p.site_id)
        .bind(p.company)
        .bind(p.site_label)
        .bind(p.tags)
        .bind(p.enrollment_state);

    insert.fetch_one(pool).await
}
/// Refresh an EXISTING machine row on re-enroll / reuse / site-move (SPEC-016).
///
/// What the statement does to the row identified by `machine_id`:
///
/// * overwrites `hostname`, `site_id` and `enrollment_state`;
/// * COALESCE-merges the `organization` / `site` labels, so an enroll that omits
///   a label keeps the stored value;
/// * replaces `tags` only when a non-empty set is supplied (the same convention
///   as `update_machine_metadata`);
/// * marks the machine `'online'`, bumps `last_seen`, and clears `deleted_at` so a
///   previously purged host is live again (mirrors `upsert_machine`'s revive).
///
/// `agent_id` is deliberately left alone: the existing id is the FK target of any
/// prior `cak_`. `p.agent_id`, `p.machine_uid` and `p.tenant_id` are not used here.
///
/// # Errors
///
/// Propagates any `sqlx::Error`; because the row is fetched with `fetch_one`, an
/// unknown `machine_id` surfaces as an error rather than `None`.
pub async fn update_enrolled_machine(
    pool: &PgPool,
    machine_id: Uuid,
    p: &EnrollMachineParams<'_>,
) -> Result<Machine, sqlx::Error> {
    const UPDATE_MACHINE: &str = r#"
UPDATE connect_machines SET
hostname = $2,
site_id = $3,
organization = COALESCE($4, organization),
site = COALESCE($5, site),
tags = CASE WHEN $6::text[] = '{}' THEN tags ELSE $6 END,
enrollment_state = $7,
status = 'online',
last_seen = NOW(),
deleted_at = NULL
WHERE id = $1
RETURNING *
"#;

    // Bind order follows the placeholders $1..$7 in the statement above.
    let update = sqlx::query_as::<_, Machine>(UPDATE_MACHINE)
        .bind(machine_id)
        .bind(p.hostname)
        .bind(p.site_id)
        .bind(p.company)
        .bind(p.site_label)
        .bind(p.tags)
        .bind(p.enrollment_state);

    update.fetch_one(pool).await
}
/// Update machine status and info
#[allow(dead_code)] // TODO(native-remote-control): consumed by the integration API; see docs/specs/native-remote-control/
pub async fn update_machine_status(

View File

@@ -4,10 +4,12 @@
//! Optional - server works without database if DATABASE_URL not set.
pub mod agent_keys;
pub mod enrollment_keys;
pub mod events;
pub mod machines;
pub mod releases;
pub mod sessions;
pub mod sites;
pub mod support_codes;
pub mod tenancy;
pub mod users;

94
server/src/db/sites.rs Normal file
View File

@@ -0,0 +1,94 @@
//! Site database operations (SPEC-016 zero-touch enrollment).
//!
//! Backs the `connect_sites` table (migration 010): the relational anchor a
//! per-site enrollment key hangs off and the `/api/enroll` flow resolves by
//! `site_code`. See the migration header for why this table exists (the prior
//! schema modeled "site" only as a free-text column on `connect_machines`).
//!
//! All queries use runtime `sqlx::query()` / `sqlx::query_as()` per the codebase
//! convention (no compile-time `query!` macros, no `.sqlx` offline cache).
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use sqlx::PgPool;
use uuid::Uuid;
/// Site record from the database (one row of `connect_sites`).
///
/// Decoded by the derived `sqlx::FromRow`, so every field name must match a
/// column selected by the queries in this module.
#[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)]
pub struct Site {
    /// Primary-key UUID of the site.
    pub id: Uuid,
    /// Operator-facing code that `/api/enroll` resolves the site by.
    pub site_code: String,
    /// Optional display name for the site.
    pub display_name: Option<String>,
    /// Optional company label for the site.
    pub company: Option<String>,
    /// Owning tenant. NULL is folded onto the default tenant by the lookup in
    /// `get_site_by_code`.
    pub tenant_id: Option<Uuid>,
    /// RESERVED for future per-site enrollment POLICY work (SPEC-016 §out-of-scope).
    /// Not enforced in Phase A.
    pub enrollment_policy: Option<String>,
    /// Row creation time. Not supplied by `insert_site`, so presumably filled by a
    /// column default — confirm against migration 010.
    pub created_at: DateTime<Utc>,
}
/// Find a site by its operator-facing `site_code` within one tenant.
///
/// A row whose `tenant_id` is NULL is folded onto the default tenant UUID before
/// the comparison, the same fold the unique index uses, so the lookup key is
/// `(COALESCE(tenant_id, default), site_code)` and matches the uniqueness
/// guarantee.
///
/// Returns `Ok(None)` when the tenant has no site with that code.
pub async fn get_site_by_code(
    pool: &PgPool,
    site_code: &str,
    tenant_id: Uuid,
) -> Result<Option<Site>, sqlx::Error> {
    const SELECT_BY_CODE: &str = r#"
SELECT id, site_code, display_name, company, tenant_id, enrollment_policy, created_at
FROM connect_sites
WHERE site_code = $1
AND COALESCE(tenant_id, '00000000-0000-0000-0000-000000000001'::uuid) = $2
"#;

    let lookup = sqlx::query_as::<_, Site>(SELECT_BY_CODE)
        .bind(site_code)
        .bind(tenant_id);

    lookup.fetch_optional(pool).await
}
/// Load a single site by its primary-key UUID.
///
/// Returns `Ok(None)` when no row has that `id`. No tenant filter is applied
/// here; the id alone selects the row.
pub async fn get_site_by_id(pool: &PgPool, id: Uuid) -> Result<Option<Site>, sqlx::Error> {
    const SELECT_BY_ID: &str = r#"
SELECT id, site_code, display_name, company, tenant_id, enrollment_policy, created_at
FROM connect_sites
WHERE id = $1
"#;

    let lookup = sqlx::query_as::<_, Site>(SELECT_BY_ID).bind(id);

    lookup.fetch_optional(pool).await
}
/// Create a site and return the stored row.
///
/// Only `site_code`, `display_name`, `company` and `tenant_id` are written; the
/// remaining columns come back from the database through `RETURNING`.
/// `tenant_id` may be `None`; the call site resolves it via
/// `db::tenancy::current_tenant_id()`.
///
/// # Errors
///
/// Fails with a unique-violation when `(tenant, site_code)` already exists (the
/// caller maps that to a 409), and propagates any other `sqlx::Error`.
#[allow(dead_code)] // Wired by the site-admin API (dashboard site CRUD); Phase A exposes key rotation, not site CRUD.
pub async fn insert_site(
    pool: &PgPool,
    site_code: &str,
    display_name: Option<&str>,
    company: Option<&str>,
    tenant_id: Option<Uuid>,
) -> Result<Site, sqlx::Error> {
    const INSERT_SITE: &str = r#"
INSERT INTO connect_sites (site_code, display_name, company, tenant_id)
VALUES ($1, $2, $3, $4)
RETURNING id, site_code, display_name, company, tenant_id, enrollment_policy, created_at
"#;

    let insert = sqlx::query_as::<_, Site>(INSERT_SITE)
        .bind(site_code)
        .bind(display_name)
        .bind(company)
        .bind(tenant_id);

    insert.fetch_one(pool).await
}

View File

@@ -448,6 +448,11 @@ async fn main() -> Result<()> {
)),
)
.route("/api/codes/:code/cancel", post(cancel_code))
// Zero-touch enrollment (SPEC-016). PUBLIC: no JWT — the per-site enrollment
// key in the body is the gate, and the handler applies its own
// per-(site_code, IP) rate limit / lockout (defense-in-depth). Mounted with
// the other public API routes.
.route("/api/enroll", post(api::enroll::enroll))
// WebSocket endpoints
.route("/ws/agent", get(relay::agent_ws_handler))
.route("/ws/viewer", get(relay::viewer_ws_handler))
@@ -498,6 +503,18 @@ async fn main() -> Result<()> {
"/api/machines/:agent_id/keys/:key_id",
delete(api::machine_keys::revoke_key),
)
// Per-site enrollment key administration (SPEC-016, admin-only / JWT).
// Rotate regenerates the cek_ secret + fingerprint (old installers can no
// longer enroll new machines); GET returns the current non-secret
// fingerprint/version. Both gated by the AdminUser extractor.
.route(
"/api/sites/:id/enrollment-key",
get(api::sites::get_enrollment_key),
)
.route(
"/api/sites/:id/enrollment-key/rotate",
post(api::sites::rotate_enrollment_key),
)
// REST API - Releases and Version
.route("/api/version", get(api::releases::get_version)) // No auth - for agent polling
.route("/api/releases", get(api::releases::list_releases))

View File

@@ -77,6 +77,19 @@ pub const CODE_VALIDATE_MAX_FAILURES: u32 = 10;
/// Support-code validate: how long an IP stays locked out once tripped.
pub const CODE_VALIDATE_LOCKOUT: Duration = Duration::from_secs(15 * 60);
/// Enroll (`POST /api/enroll`, SPEC-016): fixed-window length (60 seconds).
pub const ENROLL_WINDOW: Duration = Duration::from_secs(60);
/// Enroll: max requests per window per `(site_code, IP)`. A zero-touch site push
/// drives N machines through enroll near-simultaneously, so this is generous
/// (mass-deploy friendly) while still capping a runaway loop. Defense-in-depth: the
/// 256-bit enrollment key is the load-bearing gate, not this cap.
pub const ENROLL_MAX_PER_WINDOW: u32 = 60;
/// Enroll: consecutive FAILED enroll attempts (bad/inactive key, unknown site) from
/// one `(site_code, IP)` that trip the lockout. A successful enroll resets the
/// streak.
pub const ENROLL_MAX_FAILURES: u32 = 20;
/// Enroll: how long a `(site_code, IP)` stays locked out once tripped (15 minutes).
pub const ENROLL_LOCKOUT: Duration = Duration::from_secs(15 * 60);
/// Hard cap on the number of distinct IPs tracked by any single limiter map.
/// Prevents an IP-rotating attacker from growing memory without bound. When the
/// cap is hit, the oldest-windowed entries are pruned. Generous for a real MSP
@@ -260,6 +273,150 @@ impl FailureLockout {
}
}
// ============================================================================
// Composite-key limiter for enrollment (keyed by (site_code, IP)) — SPEC-016
// ============================================================================
//
// The login / change-password / code-validate limiters above key purely on IP.
// SPEC-016 §3 wants the enroll defense keyed on `(site_code, source-IP)` so a noisy
// site push from one office IP cannot lock out a different site enrolling from the
// same egress IP. Rather than overload the IP-only maps, this is a small dedicated
// composite-key limiter + lockout. It is invoked from the enroll HANDLER (not a
// `from_fn` layer) because the `site_code` lives in the JSON body, which a
// pre-handler middleware cannot read without consuming it. Documented as
// defense-in-depth: the 256-bit enrollment key is the real gate.
/// Composite limiter key: the site_code and the real client IP.
type EnrollKey = (String, IpAddr);
/// Per-`(site_code, IP)` fixed-window limiter + consecutive-failure lockout.
///
/// Combines both protections behind one lock-guarded map so the enroll handler
/// makes a single allow/deny decision and reports success/failure into the same
/// structure. Self-pruning and size-capped, like the IP-only limiters.
#[derive(Clone)]
pub struct EnrollLimiter {
inner: std::sync::Arc<Mutex<HashMap<EnrollKey, EnrollEntry>>>,
max_per_window: u32,
window: Duration,
max_failures: u32,
cooldown: Duration,
}
#[derive(Debug, Clone, Copy)]
struct EnrollEntry {
window_started: Instant,
count: u32,
failures: u32,
locked_until: Option<Instant>,
last_seen: Instant,
}
impl EnrollLimiter {
pub fn new(
max_per_window: u32,
window: Duration,
max_failures: u32,
cooldown: Duration,
) -> Self {
Self {
inner: std::sync::Arc::new(Mutex::new(HashMap::new())),
max_per_window,
window,
max_failures,
cooldown,
}
}
fn entry_now() -> EnrollEntry {
let now = Instant::now();
EnrollEntry {
window_started: now,
count: 0,
failures: 0,
locked_until: None,
last_seen: now,
}
}
/// Admit one enroll attempt for `(site_code, ip)`. Returns `true` if allowed
/// (and counts it). Returns `false` if the key is currently locked out OR over
/// the per-window request cap. Clock injected for tests.
fn check_at(&self, site_code: &str, ip: IpAddr, now: Instant) -> bool {
let mut map = self.inner.lock().unwrap_or_else(|e| e.into_inner());
if map.len() >= MAX_TRACKED_IPS {
let window = self.window;
let cooldown = self.cooldown;
map.retain(|_, e| {
e.locked_until.map(|u| now < u).unwrap_or(false)
|| now.duration_since(e.window_started) < window
|| now.duration_since(e.last_seen) < cooldown
});
}
let key = (site_code.to_string(), ip);
let e = map.entry(key).or_insert_with(Self::entry_now);
e.last_seen = now;
// Lockout takes precedence.
if let Some(until) = e.locked_until {
if now < until {
return false;
}
// Cooldown elapsed — clear it for a fresh start.
e.locked_until = None;
e.failures = 0;
}
// Roll the fixed window forward if elapsed.
if now.duration_since(e.window_started) >= self.window {
e.window_started = now;
e.count = 0;
}
if e.count >= self.max_per_window {
false
} else {
e.count += 1;
true
}
}
/// Admit one enroll attempt (real clock).
pub fn check(&self, site_code: &str, ip: IpAddr) -> bool {
self.check_at(site_code, ip, Instant::now())
}
fn record_failure_at(&self, site_code: &str, ip: IpAddr, now: Instant) {
let mut map = self.inner.lock().unwrap_or_else(|e| e.into_inner());
let key = (site_code.to_string(), ip);
let e = map.entry(key).or_insert_with(Self::entry_now);
e.last_seen = now;
e.failures = e.failures.saturating_add(1);
if e.failures >= self.max_failures {
e.locked_until = Some(now + self.cooldown);
}
}
/// Record a FAILED enroll attempt (bad key / unknown site) for the key,
/// tripping the lockout once the streak reaches `max_failures`.
pub fn record_failure(&self, site_code: &str, ip: IpAddr) {
self.record_failure_at(site_code, ip, Instant::now());
}
/// Record a SUCCESSFUL enroll for the key, resetting its failure streak.
pub fn record_success(&self, site_code: &str, ip: IpAddr) {
let mut map = self.inner.lock().unwrap_or_else(|e| e.into_inner());
let key = (site_code.to_string(), ip);
if let Some(e) = map.get_mut(&key) {
e.failures = 0;
e.locked_until = None;
e.last_seen = Instant::now();
}
}
}
// ============================================================================
// Shared rate-limit state (lives in AppState)
// ============================================================================
@@ -275,6 +432,9 @@ pub struct RateLimitState {
pub code_validate: RateLimiter,
/// Per-IP lockout on repeated failed code validations (brute-force defense).
pub code_validate_lockout: FailureLockout,
/// `POST /api/enroll` (SPEC-016): per-`(site_code, IP)` request cap +
/// consecutive-failure lockout. Invoked from the enroll handler.
pub enroll: EnrollLimiter,
}
impl RateLimitState {
@@ -290,6 +450,12 @@ impl RateLimitState {
CODE_VALIDATE_MAX_FAILURES,
CODE_VALIDATE_LOCKOUT,
),
enroll: EnrollLimiter::new(
ENROLL_MAX_PER_WINDOW,
ENROLL_WINDOW,
ENROLL_MAX_FAILURES,
ENROLL_LOCKOUT,
),
}
}
}
@@ -524,4 +690,51 @@ mod tests {
assert!(lockout.is_locked_at(ip(8), t0));
assert!(!lockout.is_locked_at(ip(9), t0)); // ip9 unaffected
}
// -- EnrollLimiter (composite (site_code, IP) key) --------------------------
#[test]
fn enroll_window_allows_up_to_cap_then_blocks() {
    // Cap of two requests per window; the failure threshold is set high enough
    // that the lockout never interferes.
    let limiter = EnrollLimiter::new(2, Duration::from_secs(60), 100, Duration::from_secs(600));
    let now = Instant::now();

    // The first two attempts fit under the cap.
    for _ in 0..2 {
        assert!(limiter.check_at("SITE-A", ip(1), now));
    }
    // The third attempt in the same window is rejected.
    assert!(!limiter.check_at("SITE-A", ip(1), now));
}
#[test]
fn enroll_is_keyed_by_site_and_ip() {
    let window = Duration::from_secs(60);
    let cooldown = Duration::from_secs(600);
    let limiter = EnrollLimiter::new(1, window, 100, cooldown);
    let now = Instant::now();

    // Exhaust the single-request budget for (SITE-A, ip1).
    assert!(limiter.check_at("SITE-A", ip(1), now));
    assert!(!limiter.check_at("SITE-A", ip(1), now));

    // Changing only the site gives an independent bucket.
    assert!(limiter.check_at("SITE-B", ip(1), now));

    // Changing only the IP gives an independent bucket as well.
    assert!(limiter.check_at("SITE-A", ip(2), now));
}
#[test]
fn enroll_lockout_trips_after_failures_and_blocks_check() {
    // Generous request cap, lockout after three consecutive failures.
    let limiter = EnrollLimiter::new(100, Duration::from_secs(60), 3, Duration::from_secs(600));
    let now = Instant::now();

    // Two failures stay below the threshold, so an attempt is still admitted.
    for _ in 0..2 {
        limiter.record_failure_at("SITE-A", ip(1), now);
    }
    assert!(limiter.check_at("SITE-A", ip(1), now));

    // The third failure trips the lockout.
    limiter.record_failure_at("SITE-A", ip(1), now);

    // Locked keys are denied even though the request cap is far from reached.
    assert!(!limiter.check_at("SITE-A", ip(1), now));
}
#[test]
fn enroll_success_resets_failure_streak() {
    // Lockout threshold of two consecutive failures.
    let limiter = EnrollLimiter::new(100, Duration::from_secs(60), 2, Duration::from_secs(600));
    let now = Instant::now();

    // failure -> success -> failure: the success in the middle clears the streak.
    limiter.record_failure_at("SITE-A", ip(1), now);
    limiter.record_success("SITE-A", ip(1));
    limiter.record_failure_at("SITE-A", ip(1), now);

    // The streak stands at one, below the threshold, so the key is not locked.
    assert!(limiter.check_at("SITE-A", ip(1), now));
}
}