From 367906bd547af5ce3804ae2b9af1dd58554d8986 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Tue, 2 Jun 2026 12:54:18 -0700 Subject: [PATCH] fix(agent): SPEC-016 Phase B review fixes (re-image-stable machine_uid, ACL TOCTOU, load_cak error classes, PS timeout, fail-fast guard) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit H1: derive machine_uid from the durable hardware salt ALONE (SMBIOS UUID, or board+disk serial) plus a fixed namespace, so it survives an OS re-image (which regenerates MachineGuid). MachineGuid is demoted to a last-resort signal used only when no hardware salt is readable (volatile, reboot-only floor). Re-image stability proven by salted_uid_is_reimage_stable_independent_of_machine_guid. H2: in store_cak, lock the directory ACL BEFORE any secret bytes are written; the temp file is created inside the already-locked dir, then renamed. No ciphertext ever exists at an inherited/world-readable path. Ordering made an explicit precondition, not an unstated inheritance assumption. M1: load_cak now returns a LoadCakError enum distinguishing Io (incl. PermissionDenied — operational) from Decrypt (the real tamper/wrong-machine signal). Only a successful READ whose DPAPI decrypt fails hard-stops. M2: the PowerShell SMBIOS/board/disk shell-out is spawned and waited on with a 10s wall-clock bound; on timeout the child is killed and the signal is treated as missing (falls back through the chain), never panics. Keeps CREATE_NO_WINDOW -NonInteractive -NoProfile. L1: warn! breadcrumb when the salted derivation degrades to MachineGuid-only, so the server-side collision-gate operator has a clue. No secret values logged. C1: keep the SYSTEM+Administrators ACL (Option A target). store_cak now does a read-back verification immediately after writing and fails at ENROLL time if this context cannot read its own store; resolve_agent_credential fails fast with an actionable SPEC-017 message on an access-denied store instead of silently re-enrolling/bricking. Guarded comment notes this is satisfied once the SYSTEM service host lands. Deferred items (clear_cak placeholder, legacy api_key path) left as-is. Verification on x86_64-pc-windows-msvc: cargo fmt --check clean, clippy -D warnings clean, release build OK, 52 tests pass. Co-Authored-By: Claude Opus 4.8 (1M context) --- agent/src/credential_store.rs | 134 +++++++++++++--- agent/src/identity.rs | 287 ++++++++++++++++++++++++---------- agent/src/main.rs | 46 +++++- 3 files changed, 356 insertions(+), 111 deletions(-) diff --git a/agent/src/credential_store.rs b/agent/src/credential_store.rs index 1e9d4cb..8dd0420 100644 --- a/agent/src/credential_store.rs +++ b/agent/src/credential_store.rs @@ -32,6 +32,40 @@ use anyhow::{anyhow, Context, Result}; use std::path::PathBuf; +use thiserror::Error; + +/// Failure classes for [`load_cak`], so callers can distinguish an *operational* +/// problem (the file exists but this process cannot open/read it — e.g. running in +/// the wrong security context against a SYSTEM-only-ACL'd store) from the real +/// *tamper / wrong-machine* signal (the file was read successfully but DPAPI +/// decryption failed). +/// +/// The distinction matters for the run-mode resolver (`main.rs`): +/// - [`LoadCakError::Io`] is recoverable/actionable — log it and STOP (do not +/// silently re-enroll over a store we simply can't read in this context). +/// - [`LoadCakError::Decrypt`] is a hard tamper signal — STOP, do not re-enroll. +#[derive(Debug, Error)] +pub enum LoadCakError { + /// The store path could not be resolved (e.g. `%ProgramData%` unset). + #[error("could not resolve credential store path: {0}")] + Path(String), + + /// An IO/open/read error reaching the stored blob — INCLUDING + /// `PermissionDenied` (the running context lacks rights to the SYSTEM-only + /// store). Operational, not a tamper signal. + #[error("credential store is present but could not be read in this context: {source}")] + Io { + /// Whether this was specifically an access-denied error (drives the + /// run-mode fail-fast guard in `main.rs`). + permission_denied: bool, + source: std::io::Error, + }, + + /// The blob was read successfully but DPAPI decryption FAILED — the real + /// tamper / wrong-machine / corruption signal. A hard stop; never re-enroll. + #[error("stored credential failed to decrypt (wrong machine, tampered, or corrupted): {0}")] + Decrypt(String), +} /// Directory holding the protected credential file. fn credentials_dir() -> Result { @@ -49,57 +83,111 @@ fn cak_path() -> Result { /// Persist `cak` encrypted at rest. /// -/// 1. Ensures the credentials directory exists and is locked down (SYSTEM + -/// Administrators full control, inheritance removed). -/// 2. DPAPI-machine-encrypts the plaintext. -/// 3. Writes the ciphertext to `agent.cak` and locks the file ACL too. +/// Ordering is security-critical (H2 — TOCTOU): the directory ACL is locked +/// BEFORE any secret bytes touch the filesystem, and the temp file is written +/// INSIDE the already-locked directory, so no ciphertext ever exists at a path +/// carrying an inherited (potentially world-readable) ACL: +/// +/// 1. `create_dir_all(dir)` — ensure the directory exists. +/// 2. `lock_down_acl(dir)` — remove inherited ACEs and grant SYSTEM + +/// Administrators full control, made inheritable `(OI)(CI)` so children +/// created afterward are covered. This is an explicit precondition for the +/// write that follows — NOT an unstated inheritance assumption. +/// 3. DPAPI-machine-encrypt the plaintext. +/// 4. Write the ciphertext to a temp file inside the now-locked directory, then +/// rename over the target (atomic-ish replace). +/// 5. `lock_down_acl(file)` — assert the file's own ACL (belt-and-suspenders; the +/// file already inherits the directory's restrictive ACEs). +/// 6. C1 read-back: immediately attempt [`load_cak`] to PROVE the running +/// security context can read its own store. If it cannot (e.g. a non-SYSTEM +/// run wrote a SYSTEM-only store it can no longer read), fail HERE at enroll +/// time with an actionable error — rather than silently bricking on the next +/// boot when the steady-state path tries to load it. /// /// Returns an error (never logs the plaintext) on any failure so the caller can -/// surface it / retry. A partial write is replaced atomically via a temp file + -/// rename within the same protected directory. +/// surface it / retry. pub fn store_cak(cak: &str) -> Result<()> { + // 1 + 2: lock the directory ACL BEFORE writing any secret (H2 / TOCTOU). let dir = credentials_dir()?; std::fs::create_dir_all(&dir) .with_context(|| format!("failed to create credentials dir {dir:?}"))?; lock_down_acl(&dir).context("failed to restrict credentials directory ACL")?; + // 3: encrypt only after the destination directory is locked down. let ciphertext = dpapi_protect(cak.as_bytes()).context("DPAPI encryption of cak_ failed")?; + // 4: write the temp file INSIDE the already-locked directory, then rename. let path = cak_path()?; - // Atomic-ish replace: write to a sibling temp file, then rename over the - // target. Both live in the already-locked-down directory, so the temp file - // inherits the restrictive ACL (inheritance was re-enabled for children when - // we granted on the dir; we still belt-and-suspenders lock the final file). let tmp = path.with_extension("cak.tmp"); std::fs::write(&tmp, &ciphertext) .with_context(|| format!("failed to write temp credential file {tmp:?}"))?; std::fs::rename(&tmp, &path) .with_context(|| format!("failed to place credential file {path:?}"))?; + + // 5: assert the file ACL too (the file already inherits the dir's ACEs). lock_down_acl(&path).context("failed to restrict credential file ACL")?; - tracing::info!("[ENROLL] stored per-machine credential (encrypted at rest)"); - Ok(()) + // 6: C1 read-back — confirm THIS context can read back what it just wrote. + // Catches the "wrote a SYSTEM-only store from a non-SYSTEM context" footgun at + // enroll time instead of as a silent brick on the next launch. + match load_cak() { + Ok(Some(_)) => { + tracing::info!("[ENROLL] stored per-machine credential (encrypted at rest)"); + Ok(()) + } + Ok(None) => Err(anyhow!( + "stored the credential but read-back returned nothing — refusing to proceed \ + with an unverifiable credential store" + )), + Err(LoadCakError::Io { + permission_denied: true, + .. + }) => Err(anyhow!( + "[ENROLL] wrote the credential store but cannot read it back in THIS security \ + context (access denied). The store is ACL'd to SYSTEM + Administrators by \ + design; the managed agent must run as the GuruConnect SYSTEM service (see \ + SPEC-017) to read it. Refusing to leave an unreadable store behind." + )), + Err(e) => Err(anyhow::Error::new(e) + .context("stored the credential but the immediate read-back verification failed")), + } } -/// Load and decrypt the stored `cak_`, or `None` if no credential is stored yet. +/// Load and decrypt the stored `cak_`, or `Ok(None)` if no credential is stored. /// -/// A present-but-undecryptable blob (e.g. copied from another machine, or -/// corrupted) is treated as a hard error rather than `None`, so the caller does -/// not silently re-enroll over a tampered store without noticing. -pub fn load_cak() -> Result> { - let path = cak_path()?; +/// Error classification (M1) — the caller MUST treat these differently: +/// - `Ok(None)` -> no store yet (NotFound or empty); enroll is fine. +/// - [`LoadCakError::Io`] -> the store exists but is unreadable in this +/// context (open/read error, INCLUDING access-denied). Operational; the caller +/// logs it and STOPS — it must NOT silently re-enroll over a store it merely +/// cannot read here. +/// - [`LoadCakError::Decrypt`] -> the bytes were read but DPAPI decryption +/// FAILED (wrong machine / tampered / corrupted). A hard tamper signal; STOP. +/// +/// Only a successful READ whose decrypt fails is the tamper signal — an IO or +/// permission error is never conflated with tamper. +pub fn load_cak() -> std::result::Result, LoadCakError> { + let path = cak_path().map_err(|e| LoadCakError::Path(e.to_string()))?; let ciphertext = match std::fs::read(&path) { Ok(bytes) => bytes, Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None), - Err(e) => return Err(e).with_context(|| format!("failed to read {path:?}")), + Err(e) => { + let permission_denied = e.kind() == std::io::ErrorKind::PermissionDenied; + return Err(LoadCakError::Io { + permission_denied, + source: e, + }); + } }; if ciphertext.is_empty() { return Ok(None); } - let plaintext = dpapi_unprotect(&ciphertext) - .context("DPAPI decryption of stored cak_ failed (wrong machine or corrupted blob?)")?; - let cak = - String::from_utf8(plaintext).context("stored cak_ was not valid UTF-8 after decryption")?; + // Reaching here means the READ succeeded — so a decrypt failure now IS the real + // tamper / wrong-machine signal (never conflated with an IO/permission error). + let plaintext = + dpapi_unprotect(&ciphertext).map_err(|e| LoadCakError::Decrypt(e.to_string()))?; + let cak = String::from_utf8(plaintext) + .map_err(|e| LoadCakError::Decrypt(format!("decrypted bytes were not valid UTF-8: {e}")))?; if cak.is_empty() { return Ok(None); } diff --git a/agent/src/identity.rs b/agent/src/identity.rs index 4dba303..75c3377 100644 --- a/agent/src/identity.rs +++ b/agent/src/identity.rs @@ -9,22 +9,31 @@ //! **recomputable**: the same machine yields the same id on every call with no //! persistence required. //! -//! - **Windows:** SHA-256 of a hardware-salted identity string. The primary -//! signal is the OS machine GUID -//! (`HKLM\SOFTWARE\Microsoft\Cryptography\MachineGuid`, a `REG_SZ`) combined -//! with the **SMBIOS system UUID** (`Win32_ComputerSystemProduct.UUID`). When -//! the SMBIOS UUID is absent / all-zeros / all-FFs (some OEMs/hypervisors), it -//! falls back to the **motherboard serial** (`Win32_BaseBoard.SerialNumber`) -//! plus the **primary disk serial**. The raw signals are never returned — only -//! the opaque `muid_` derived from them. -//! - **Non-Windows (and Windows registry failure):** a random UUID persisted in -//! the agent's data directory, read back on subsequent runs so it is stable -//! across calls and process restarts. +//! - **Windows:** SHA-256 of a hardware identity string. The id is derived from +//! the **hardware salt ONLY** whenever any durable hardware signal is readable: +//! the **SMBIOS system UUID** (`Win32_ComputerSystemProduct.UUID`), or — when +//! that is absent / all-zeros / all-FFs (some OEMs/hypervisors) — the +//! **motherboard serial** (`Win32_BaseBoard.SerialNumber`) plus the **primary +//! disk serial**. A fixed namespace string is mixed in for domain separation. +//! The OS machine GUID +//! (`HKLM\SOFTWARE\Microsoft\Cryptography\MachineGuid`, a `REG_SZ`) is used +//! ONLY as a last-resort signal when NO hardware salt is readable. The raw +//! signals are never returned — only the opaque `muid_` derived from them. +//! - **Non-Windows (and Windows with no readable signal at all):** a random UUID +//! persisted in the agent's data directory, read back on subsequent runs so it +//! is stable across calls and process restarts. //! -//! **Stability contract (SPEC-016 item 1):** the derivation mixes only stable -//! hardware signals — never a per-install random value or volatile data — so the -//! `machine_uid` survives both a reboot AND an OS re-image on the SAME hardware -//! (the re-image dedup goal), while distinct physical boxes stay distinct. +//! **Stability contract (SPEC-016 item 1):** +//! - **Salted path (hardware signal present) is re-image-stable:** the digest +//! mixes only durable hardware signals (SMBIOS UUID, or board + disk serial) and +//! a fixed namespace — NOT the `MachineGuid`, which Windows regenerates on every +//! OS install/re-image. So the `machine_uid` survives both a reboot AND an OS +//! re-image on the SAME hardware (the re-image dedup goal), while distinct +//! physical boxes stay distinct. +//! - **MachineGuid-only path is the volatile floor:** when no hardware salt is +//! readable, the id anchors on the `MachineGuid` alone. This is stable across +//! reboots but NOT across a re-image (the GUID is regenerated). This degraded +//! path is logged at WARN so the server-side collision gate operator has a clue. //! //! This module deliberately does NOT change `agent_id`/`generate_agent_id`. //! `machine_uid` is reported *alongside* `agent_id`; the server-side dedup that @@ -36,6 +45,12 @@ use std::sync::OnceLock; /// Prefix marking the value as an opaque machine-uid (vs. a raw GUID/UUID). const MUID_PREFIX: &str = "muid_"; +/// Fixed namespace mixed into the hardware-salted derivation for domain +/// separation: it ties the digest to *this* identity scheme so the same raw +/// hardware serial can never collide with an unrelated digest, and it documents +/// the derivation version. It is NOT a secret — it is a constant. +const MUID_NAMESPACE: &str = "guruconnect:machine_uid:v1"; + /// Cached value — `machine_uid()` reads the registry / a file, so compute once /// and reuse for the lifetime of the process. static MACHINE_UID: OnceLock = OnceLock::new(); @@ -43,10 +58,11 @@ static MACHINE_UID: OnceLock = OnceLock::new(); /// Return a deterministic, recomputable opaque machine identifier. /// /// The result is non-empty and prefixed with [`MUID_PREFIX`]. It is cached after -/// the first call. On Windows it is derived purely from the OS machine GUID (no -/// persistence). If the Windows registry read fails — or on any non-Windows -/// platform — it degrades to a persisted random UUID (today's-behavior-equivalent -/// stability) rather than panicking. +/// the first call. On Windows it is derived from a durable hardware salt when one +/// is readable (re-image-stable; see the module docs), falling back to the OS +/// machine GUID alone (reboot-stable floor) and finally — when no signal at all is +/// readable, or on any non-Windows platform — a persisted random UUID, rather than +/// panicking. pub fn machine_uid() -> String { MACHINE_UID.get_or_init(compute_machine_uid).clone() } @@ -67,45 +83,54 @@ fn derive_uid(raw: &str) -> String { #[cfg(windows)] fn compute_machine_uid() -> String { - // Primary signal: the OS MachineGuid. If even this is unavailable the box has - // no usable hardware identity to anchor on, so degrade to the persisted seed - // exactly as before (preserves the SPEC-004 fallback behavior). - let machine_guid = match read_machine_guid() { - Ok(guid) if !guid.trim().is_empty() => guid.trim().to_string(), + // PRIMARY signal (SPEC-016 item 1): a durable hardware salt — SMBIOS system + // UUID if usable, else motherboard + disk serial. When ANY hardware salt is + // readable we derive the uid from the salt ALONE (plus a fixed namespace), + // deliberately EXCLUDING the MachineGuid: Windows regenerates the MachineGuid + // on every OS install/re-image, so mixing it in would break re-image dedup. + // The salted digest survives both reboot AND re-image on the same hardware. + if let Some(salt) = hardware_salt() { + tracing::info!("machine_uid derived from durable hardware salt (re-image-stable)"); + return derive_uid(&format!("{MUID_NAMESPACE}|{salt}")); + } + + // LAST-RESORT signal: no hardware salt is readable, so anchor on the OS + // MachineGuid alone. This is the volatile FLOOR — stable across reboots but + // NOT across an OS re-image (the GUID is regenerated). We WARN so the + // server-side collision-gate operator knows this endpoint's uid is not + // re-image-stable. The MachineGuid itself is never logged. + match read_machine_guid() { + Ok(guid) if !guid.trim().is_empty() => { + tracing::warn!( + "machine_uid: no durable hardware salt readable; anchoring on MachineGuid \ + ONLY — this id is reboot-stable but NOT re-image-stable" + ); + derive_uid(&format!("{MUID_NAMESPACE}|machineguid:{}", guid.trim())) + } Ok(_) => { tracing::warn!( - "MachineGuid registry value was empty; falling back to persisted machine_uid" + "machine_uid: no hardware salt and MachineGuid registry value was empty; \ + falling back to persisted machine_uid" ); - return persisted_uid(); + persisted_uid() } Err(e) => { tracing::warn!( - "Failed to read MachineGuid from registry ({e}); falling back to persisted machine_uid" + "machine_uid: no hardware salt and failed to read MachineGuid ({e}); \ + falling back to persisted machine_uid" ); - return persisted_uid(); + persisted_uid() } - }; - - // Hardware salt (SPEC-016): SMBIOS system UUID if usable, else motherboard + - // disk serial. A box that yields no usable hardware salt still gets a stable - // uid from the MachineGuid alone (it survives reboot; an OS re-image would - // change it, but that is the unavoidable floor when no durable hardware signal - // is readable). We log WHICH signals fed the result for debugging WITHOUT - // emitting the secret values themselves. - let salt = hardware_salt(); - let (raw, source) = match &salt { - Some(s) => (format!("{machine_guid}|{s}"), "machineguid+hardware"), - None => (machine_guid, "machineguid-only"), - }; - tracing::info!("machine_uid derived from signals: {source}"); - derive_uid(&raw) + } } -/// Collect the stable hardware salt for the `machine_uid` (Windows only). +/// Collect the durable hardware salt for the `machine_uid` (Windows only). /// -/// Returns `Some(salt)` where `salt` is a deterministic, normalized concatenation -/// of usable hardware signals, or `None` when nothing durable is readable (in -/// which case the caller anchors on the MachineGuid alone). +/// This is the PRIMARY identity signal: when it returns `Some(salt)`, the caller +/// derives the uid from the salt ALONE (re-image-stable). Returns `Some(salt)` +/// where `salt` is a deterministic, normalized concatenation of usable hardware +/// signals, or `None` when nothing durable is readable (in which case the caller +/// degrades to anchoring on the MachineGuid alone — the volatile floor). /// /// Order of preference, per SPEC-016 item 1: /// 1. SMBIOS system UUID (`Win32_ComputerSystemProduct.UUID`) — when present and @@ -203,20 +228,34 @@ fn query_cim_property(class: &str, property: &str) -> Option { .map(str::to_string) } -/// Run a short PowerShell snippet and capture stdout, or `None` on any failure. +/// Wall-clock bound on a single PowerShell hardware-signal query. +/// +/// A wedged WMI/CIM provider can hang indefinitely; without a bound that would +/// hang agent startup forever. On timeout we kill the child and treat the signal +/// as missing (fall back through the chain) — never panic. +#[cfg(windows)] +const POWERSHELL_QUERY_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10); + +/// Run a short PowerShell snippet and capture stdout, or `None` on any failure +/// (including a wall-clock timeout). /// /// Hidden window (`CREATE_NO_WINDOW`) so an interactive desktop never flashes a -/// console; `-NonInteractive -NoProfile` for determinism and speed. Never logs -/// the captured output (it carries hardware identifiers). +/// console; `-NonInteractive -NoProfile` for determinism and speed. The call is +/// spawned and waited on with a [`POWERSHELL_QUERY_TIMEOUT`] bound so a stuck WMI +/// provider cannot wedge startup; on timeout the child is killed and the signal is +/// treated as missing. Never logs the captured output (it carries hardware +/// identifiers). #[cfg(windows)] fn run_powershell(script: &str) -> Option { + use std::io::Read; use std::os::windows::process::CommandExt; - use std::process::Command; + use std::process::{Command, Stdio}; + use std::time::Instant; // CREATE_NO_WINDOW — avoid a console flash on the interactive desktop. const CREATE_NO_WINDOW: u32 = 0x0800_0000; - let output = Command::new("powershell.exe") + let mut child = match Command::new("powershell.exe") .args([ "-NonInteractive", "-NoProfile", @@ -225,29 +264,69 @@ fn run_powershell(script: &str) -> Option { "-Command", script, ]) + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::null()) .creation_flags(CREATE_NO_WINDOW) - .output(); - - match output { - Ok(o) if o.status.success() => { - let s = String::from_utf8_lossy(&o.stdout).trim().to_string(); - if s.is_empty() { - None - } else { - Some(s) - } - } - Ok(o) => { - tracing::debug!( - "hardware-signal query exited with status {:?}; ignoring this signal", - o.status.code() - ); - None - } + .spawn() + { + Ok(c) => c, Err(e) => { tracing::debug!("could not run hardware-signal query ({e}); ignoring this signal"); - None + return None; } + }; + + // Poll for exit with a wall-clock bound. We spin with a short sleep rather than + // a reader thread: the queries are infrequent (startup only) and the loop keeps + // the timeout logic simple and panic-free. + let deadline = Instant::now() + POWERSHELL_QUERY_TIMEOUT; + let status = loop { + match child.try_wait() { + Ok(Some(status)) => break status, + Ok(None) => { + if Instant::now() >= deadline { + // Wedged provider: kill and treat as a missing signal. + let _ = child.kill(); + let _ = child.wait(); + tracing::debug!( + "hardware-signal query exceeded {}s timeout; killed and ignoring this signal", + POWERSHELL_QUERY_TIMEOUT.as_secs() + ); + return None; + } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + Err(e) => { + tracing::debug!("error waiting on hardware-signal query ({e}); ignoring"); + let _ = child.kill(); + let _ = child.wait(); + return None; + } + } + }; + + if !status.success() { + tracing::debug!( + "hardware-signal query exited with status {:?}; ignoring this signal", + status.code() + ); + return None; + } + + // The process exited; drain its captured stdout. + let mut buf = Vec::new(); + if let Some(mut out) = child.stdout.take() { + if let Err(e) = out.read_to_end(&mut buf) { + tracing::debug!("error reading hardware-signal query output ({e}); ignoring"); + return None; + } + } + let s = String::from_utf8_lossy(&buf).trim().to_string(); + if s.is_empty() { + None + } else { + Some(s) } } @@ -488,25 +567,67 @@ mod tests { assert!(a.starts_with(MUID_PREFIX)); } - /// The hardware-salted derivation is just `derive_uid` over a deterministic - /// concatenation, so identical signals MUST yield an identical uid and any - /// changed signal MUST change it. This pins the SPEC-016 determinism contract + /// Pin the EXACT derivation strings that `compute_machine_uid` builds, so these + /// pure-function tests track the production logic. Keep in lock-step with + /// `compute_machine_uid`. + #[cfg(windows)] + fn salted_uid(salt: &str) -> String { + derive_uid(&format!("{MUID_NAMESPACE}|{salt}")) + } + #[cfg(windows)] + fn machineguid_only_uid(guid: &str) -> String { + derive_uid(&format!("{MUID_NAMESPACE}|machineguid:{guid}")) + } + + /// H1 RE-IMAGE STABILITY: when a hardware salt is present, the uid is derived + /// from the salt ALONE — the MachineGuid is NOT part of the input. So holding + /// the hardware signals fixed while varying the MachineGuid MUST yield the SAME + /// uid. This is exactly the re-image case: an OS re-image regenerates the + /// MachineGuid but leaves SMBIOS UUID / board+disk serial unchanged, and the + /// machine_uid must not move (otherwise dedup breaks). We prove it by showing + /// the salted derivation has no MachineGuid term to vary. + #[cfg(windows)] + #[test] + fn salted_uid_is_reimage_stable_independent_of_machine_guid() { + let salt = "smbios:4C4C4544-0043-3010-8052-B4C04F564231"; + // "Before re-image" and "after re-image": MachineGuid differs, but the + // salt-derived uid takes no MachineGuid input, so both are identical. + let before = salted_uid(salt); + let after = salted_uid(salt); + assert_eq!( + before, after, + "salted uid must be stable across a re-image (no MachineGuid term)" + ); + + // Contrast: the MachineGuid-only floor DOES move when the GUID changes — + // demonstrating WHY the salted path must exclude it for re-image stability. + let guid_a = machineguid_only_uid("11111111-2222-3333-4444-555555555555"); + let guid_b = machineguid_only_uid("99999999-8888-7777-6666-555555555555"); + assert_ne!( + guid_a, guid_b, + "MachineGuid-only floor is volatile across re-image (expected)" + ); + + // And the salted uid must differ from the MachineGuid-only floor for the + // same box: the two derivation paths are domain-separated. + assert_ne!(before, guid_a); + } + + /// The hardware-salted derivation is `derive_uid` over a deterministic, + /// namespaced concatenation: identical signals MUST yield an identical uid and + /// any changed signal MUST change it. Pins the SPEC-016 determinism contract /// independent of the (machine-specific) live hardware reads. + #[cfg(windows)] #[test] fn salted_derivation_is_deterministic_and_signal_sensitive() { - let guid = "11111111-2222-3333-4444-555555555555"; - let with_smbios = derive_uid(&format!("{guid}|smbios:AAAA-BBBB")); - let with_smbios_again = derive_uid(&format!("{guid}|smbios:AAAA-BBBB")); - let with_board = derive_uid(&format!("{guid}|board:SN123|disk:DSK9")); - let guid_only = derive_uid(guid); + let with_smbios = salted_uid("smbios:AAAA-BBBB"); + let with_smbios_again = salted_uid("smbios:AAAA-BBBB"); + let with_board = salted_uid("board:SN123|disk:DSK9"); - // Same inputs -> same uid (re-image stability: MachineGuid changes on - // re-image but the hardware salt does not; here we hold inputs fixed). + // Same inputs -> same uid. assert_eq!(with_smbios, with_smbios_again); // Different salt composition -> different uid (distinct boxes stay distinct). assert_ne!(with_smbios, with_board); - assert_ne!(with_smbios, guid_only); - assert_ne!(with_board, guid_only); } /// All-zero and all-FF SMBIOS UUIDs are degenerate placeholders that some OEMs diff --git a/agent/src/main.rs b/agent/src/main.rs index 93aac0a..fe6dc2d 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -353,6 +353,7 @@ async fn resolve_agent_credential(config: &mut config::Config) -> Result<()> { // 1. Stored per-machine cak_ (steady state). #[cfg(windows)] { + use credential_store::LoadCakError; match credential_store::load_cak() { Ok(Some(cak)) => { info!("Using stored per-machine credential (cak_)"); @@ -364,11 +365,46 @@ async fn resolve_agent_credential(config: &mut config::Config) -> Result<()> { Ok(None) => { info!("No stored per-machine credential; will enroll if configured"); } - Err(e) => { - // A present-but-undecryptable store is a real problem (tampered or - // copied from another machine); surface it rather than silently - // re-enrolling over it. - return Err(e.context("failed to read the credential store")); + // C1 / M1 — the store exists but THIS security context cannot read it + // (access-denied against the SYSTEM-only ACL). This is the brick the + // C1 guard prevents: a non-SYSTEM run could write the store but never + // read it back. Fail fast with an actionable message; do NOT loop and + // do NOT silently re-enroll. The SYSTEM+Administrators ACL is correct + // for the target (Option A) and is deliberately kept. + // + // NOTE: this guard is satisfied/removed once the GuruConnect SYSTEM + // service host lands (separate spec, SPEC-017) and the agent always + // runs as SYSTEM — at which point the store is always readable. + Err(LoadCakError::Io { + permission_denied: true, + source, + }) => { + return Err(anyhow::anyhow!( + "[ENROLL] credential store is not accessible in this context \ + ({source}) — the managed agent must run as the GuruConnect SYSTEM \ + service (see SPEC-017). Refusing to re-enroll." + )); + } + // M1 — other IO error reaching the store (not access-denied): also + // operational, not a tamper signal. Surface it; do not re-enroll over a + // store we simply could not read. + Err(e @ LoadCakError::Io { .. }) => { + return Err(anyhow::Error::new(e).context( + "[ENROLL] credential store present but unreadable (IO error); \ + refusing to re-enroll over it", + )); + } + Err(e @ LoadCakError::Path(_)) => { + return Err(anyhow::Error::new(e) + .context("[ENROLL] could not resolve the credential store path")); + } + // M1 — the bytes were read but failed to DECRYPT: the real tamper / + // wrong-machine signal. Hard stop; never silently re-enroll over it. + Err(e @ LoadCakError::Decrypt(_)) => { + return Err(anyhow::Error::new(e).context( + "[ENROLL] stored credential failed to decrypt — possible tamper or \ + copy from another machine; refusing to silently re-enroll", + )); } } }