diff --git a/agent/src/enroll.rs b/agent/src/enroll.rs new file mode 100644 index 0000000..dae9250 --- /dev/null +++ b/agent/src/enroll.rs @@ -0,0 +1,384 @@ +//! First-run self-enrollment client (SPEC-016 Phase B, item 4). +//! +//! When the agent runs as a persistent (`PermanentAgent`) install with NO stored +//! `cak_` but WITH an `enrollment_key` + `site_code`, it walks through the +//! public, unauthenticated `POST /api/enroll` door: it presents its site +//! credentials and its hardware-derived `machine_uid`, and — on success — the +//! server mints and returns a per-machine `cak_` operating credential exactly +//! once. The agent persists that `cak_` encrypted at rest +//! ([`crate::credential_store`]) and connects with it; on every later run it uses +//! the stored `cak_` directly and never re-enrolls. +//! +//! Server contract consumed (must match `server/src/api/enroll.rs`): +//! - Request: `{ site_code, enrollment_key, machine_uid, hostname, +//! labels:{company,site,department,device_type,tags} }`. +//! - `201 Created` -> new enrollment; body has `key` (the `cak_`). +//! - `200 OK` -> reuse (re-image / re-install); body has `key`. +//! - `202 Accepted` -> `collision_pending`; NO key — operator must confirm in +//! the dashboard before the endpoint can connect. +//! - `401 Unauthorized` -> `ENROLL_REJECTED` (bad/rotated key or unknown site): +//! terminal-ish config problem, back off long. +//! - `409 Conflict` -> `ENROLL_SITE_CONFLICT` (machine bound to another site): +//! terminal-ish, requires the operator reassignment flow; back off long. +//! - `429 Too Many Requests` -> rate-limited; back off and retry. +//! +//! SECURITY: never log the `enrollment_key` or the minted `cak_`. Only states, +//! dispositions, and the (non-secret) `machine_uid`/`site_code` are logged. 
+
+use anyhow::{anyhow, Context, Result};
+use serde::{Deserialize, Serialize};
+use std::time::Duration;
+
+use crate::config::Config;
+
+/// `POST /api/enroll` request body — mirrors `enroll::EnrollRequest`.
+///
+/// `Debug` is written by hand rather than derived so that formatting this struct
+/// with `{:?}` can never print the `enrollment_key`.
+#[derive(Serialize)]
+struct EnrollRequest<'a> {
+    site_code: &'a str,
+    enrollment_key: &'a str,
+    machine_uid: &'a str,
+    hostname: &'a str,
+    labels: EnrollLabels<'a>,
+}
+
+impl std::fmt::Debug for EnrollRequest<'_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("EnrollRequest")
+            .field("site_code", &self.site_code)
+            // Secret: show that the field exists, never its value.
+            .field("enrollment_key", &"<redacted>")
+            .field("machine_uid", &self.machine_uid)
+            .field("hostname", &self.hostname)
+            .field("labels", &self.labels)
+            .finish()
+    }
+}
+
+/// Labels carried at enrollment — mirrors `enroll::EnrollLabels`.
+///
+/// Every `None` label and an empty `tags` slice are left out of the JSON body.
+#[derive(Debug, Serialize)]
+struct EnrollLabels<'a> {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    company: Option<&'a str>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    site: Option<&'a str>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    department: Option<&'a str>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    device_type: Option<&'a str>,
+    #[serde(skip_serializing_if = "slice_is_empty")]
+    tags: &'a [String],
+}
+
+/// `skip_serializing_if` predicate for the `tags` slice — `Vec::is_empty` cannot
+/// bind a `&&[String]`, so use a slice-typed helper.
+fn slice_is_empty(s: &[String]) -> bool {
+    s.is_empty()
+}
+
+/// `POST /api/enroll` success body — mirrors `enroll::EnrollResponse`.
+///
+/// `key` is absent on a keyless (pending) body, hence `Option` + `serde(default)`.
+/// `Debug` is written by hand so the minted `cak_` is never printed.
+#[derive(Deserialize)]
+struct EnrollResponse {
+    #[allow(dead_code)]
+    machine_id: String,
+    #[serde(default)]
+    key: Option<String>,
+    enrollment_state: String,
+    disposition: String,
+}
+
+impl std::fmt::Debug for EnrollResponse {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("EnrollResponse")
+            .field("machine_id", &self.machine_id)
+            // Secret: report only whether a key was present.
+            .field("key", &self.key.as_ref().map(|_| "<redacted>"))
+            .field("enrollment_state", &self.enrollment_state)
+            .field("disposition", &self.disposition)
+            .finish()
+    }
+}
+
+/// Backoff after a retryable failure (429 / network / 5xx).
+const RETRYABLE_BACKOFF: Duration = Duration::from_secs(30);
+/// Backoff after a terminal-ish config failure (401 / 409) or collision-pending.
+/// These won't fix themselves without operator action, so retry slowly rather
+/// than hot-looping while still recovering automatically once it IS fixed.
+const TERMINAL_BACKOFF: Duration = Duration::from_secs(300);
+
+/// Drive enrollment until a `cak_` is issued, persisting it into the credential
+/// store on success and loading it into `config.api_key`.
+///
+/// Loops with backoff across retryable failures (it must not give up — a managed
+/// machine left running should eventually enroll once the server/site is healthy)
+/// and across collision-pending (HTTP 202: it keeps re-checking on a slow cadence
+/// until an operator confirms the endpoint in the dashboard and the server begins
+/// issuing a key). Returns `Ok(())` only once a `cak_` is stored. The only `Err`
+/// returns are unrecoverable local faults (missing config, an un-persistable
+/// credential) — network/HTTP failures are retried, never propagated.
+///
+/// A `429` is retried after the server's `Retry-After` delay when one is sent
+/// (bounded by `rate_limit_backoff`); every other transient failure waits
+/// `RETRYABLE_BACKOFF`. The credential-store write is compiled only on Windows
+/// (`#[cfg(windows)]`); on other targets the key is only loaded into `config`.
+pub async fn run_enrollment(config: &mut Config) -> Result<()> {
+    let site_code = config
+        .site_code
+        .clone()
+        .ok_or_else(|| anyhow!("enrollment requested but no site_code is configured"))?;
+    let enrollment_key = config
+        .enrollment_key
+        .clone()
+        .ok_or_else(|| anyhow!("enrollment requested but no enrollment_key is configured"))?;
+
+    let https_base = config.https_base()?;
+    let machine_uid = crate::identity::machine_uid();
+    let hostname = config.hostname();
+
+    tracing::info!(
+        "[ENROLL] first-run enrollment: site_code={} machine_uid={} hostname={}",
+        site_code,
+        machine_uid,
+        hostname
+    );
+
+    loop {
+        match attempt_enroll(
+            &https_base,
+            &site_code,
+            &enrollment_key,
+            &machine_uid,
+            &hostname,
+            config,
+        )
+        .await
+        {
+            Ok(AttemptResult::Issued(cak)) => {
+                // Persist encrypted-at-rest, then load into the live config so the
+                // transport authenticates with the new per-machine credential.
+                #[cfg(windows)]
+                crate::credential_store::store_cak(&cak)
+                    .context("failed to persist issued cak_ to the credential store")?;
+                config.api_key = cak;
+                // Enrollment material is single-use; drop it so it is not retained
+                // in memory or accidentally reused.
+                config.enrollment_key = None;
+                tracing::info!("[ENROLL] enrollment complete; connecting with per-machine key");
+                return Ok(());
+            }
+            Ok(AttemptResult::Pending) => {
+                tracing::warn!(
+                    "[ENROLL] pending operator confirmation (machine_uid collision); \
+                     this machine cannot connect until confirmed in the dashboard. \
+                     Re-checking in {}s.",
+                    TERMINAL_BACKOFF.as_secs()
+                );
+                tokio::time::sleep(TERMINAL_BACKOFF).await;
+            }
+            Err(AttemptError::Terminal(msg)) => {
+                tracing::error!(
+                    "[ENROLL] enrollment refused (operator action required): {msg}. \
+                     Retrying in {}s.",
+                    TERMINAL_BACKOFF.as_secs()
+                );
+                tokio::time::sleep(TERMINAL_BACKOFF).await;
+            }
+            Err(AttemptError::RateLimited(retry_after)) => {
+                // Wait for as long as the server asked (within bounds) instead of
+                // the fixed transient backoff, so we do not retry into the limit.
+                let delay = rate_limit_backoff(retry_after);
+                tracing::warn!(
+                    "[ENROLL] transient enrollment failure: RATE_LIMITED. Retrying in {}s.",
+                    delay.as_secs()
+                );
+                tokio::time::sleep(delay).await;
+            }
+            Err(AttemptError::Retryable(msg)) => {
+                tracing::warn!(
+                    "[ENROLL] transient enrollment failure: {msg}. Retrying in {}s.",
+                    RETRYABLE_BACKOFF.as_secs()
+                );
+                tokio::time::sleep(RETRYABLE_BACKOFF).await;
+            }
+        }
+    }
+}
+
+/// Result of one HTTP enrollment attempt.
+enum AttemptResult {
+    /// A `cak_` was issued (201/200). Carries the plaintext (never logged).
+    Issued(String),
+    /// Collision-gated (202): no key issued.
+    Pending,
+}
+
+/// Failure classes that drive the backoff policy.
+enum AttemptError {
+    /// 401/409 — won't fix without operator action; back off long but keep trying.
+    Terminal(String),
+    /// 429 — rate-limited. Carries the server's `Retry-After` delay when the
+    /// header was present and expressed in whole seconds.
+    RateLimited(Option<Duration>),
+    /// network / 5xx / decode — transient; short backoff.
+    Retryable(String),
+}
+
+/// Upper bound on a server-requested `Retry-After` delay, so a bad or hostile
+/// header value cannot park the agent indefinitely.
+const MAX_RATE_LIMIT_BACKOFF: Duration = Duration::from_secs(3600);
+
+/// Pick the sleep after a 429: the server's `Retry-After` when given, otherwise
+/// `RETRYABLE_BACKOFF`, always kept within
+/// `RETRYABLE_BACKOFF..=MAX_RATE_LIMIT_BACKOFF`.
+fn rate_limit_backoff(retry_after: Option<Duration>) -> Duration {
+    retry_after
+        .unwrap_or(RETRYABLE_BACKOFF)
+        .clamp(RETRYABLE_BACKOFF, MAX_RATE_LIMIT_BACKOFF)
+}
+
+/// Make one `POST /api/enroll` call and classify the response per the contract.
+///
+/// Never returns the secrets in an error message: only status codes, states and
+/// dispositions are formatted into the `AttemptError` text.
+async fn attempt_enroll(
+    https_base: &str,
+    site_code: &str,
+    enrollment_key: &str,
+    machine_uid: &str,
+    hostname: &str,
+    config: &Config,
+) -> std::result::Result<AttemptResult, AttemptError> {
+    let url = format!("{}/api/enroll", https_base.trim_end_matches('/'));
+
+    let body = EnrollRequest {
+        site_code,
+        enrollment_key,
+        machine_uid,
+        hostname,
+        labels: EnrollLabels {
+            // Empty label strings are sent as absent rather than as "".
+            company: config.company.as_deref().filter(|s| !s.is_empty()),
+            site: config.site.as_deref().filter(|s| !s.is_empty()),
+            department: config.department.as_deref().filter(|s| !s.is_empty()),
+            device_type: config.device_type.as_deref().filter(|s| !s.is_empty()),
+            tags: &config.tags,
+        },
+    };
+
+    let client = build_client().map_err(|e| AttemptError::Retryable(e.to_string()))?;
+
+    let response = client
+        .post(&url)
+        .json(&body)
+        .timeout(Duration::from_secs(30))
+        .send()
+        .await
+        .map_err(|e| AttemptError::Retryable(format!("request to {url} failed: {e}")))?;
+
+    let status = response.status();
+    match status.as_u16() {
+        // New (201) or reuse (200): body carries the cak_.
+        200 | 201 => {
+            let parsed: EnrollResponse = response
+                .json()
+                .await
+                .map_err(|e| AttemptError::Retryable(format!("malformed success body: {e}")))?;
+            match parsed.key {
+                Some(cak) if !cak.is_empty() => {
+                    tracing::info!(
+                        "[ENROLL] server accepted enrollment: state={} disposition={}",
+                        parsed.enrollment_state,
+                        parsed.disposition
+                    );
+                    Ok(AttemptResult::Issued(cak))
+                }
+                // 2xx with no key is contract-violating for the active path; treat
+                // as retryable so we don't silently spin or crash.
+                _ => Err(AttemptError::Retryable(format!(
+                    "server returned {} with no key (state={}, disposition={})",
+                    status, parsed.enrollment_state, parsed.disposition
+                ))),
+            }
+        }
+
+        // Collision-gated: pending operator confirmation, no key.
+        202 => {
+            // Body decode is best-effort here; the status alone is authoritative.
+            Ok(AttemptResult::Pending)
+        }
+
+        // Bad/rotated enrollment key or unknown site code.
+        401 => Err(AttemptError::Terminal(
+            "ENROLL_REJECTED — the site code or enrollment key is invalid or rotated; \
+             this installer needs a current per-site key"
+                .to_string(),
+        )),
+
+        // Machine already enrolled at a different site.
+        409 => Err(AttemptError::Terminal(
+            "ENROLL_SITE_CONFLICT — this machine is already enrolled at another site; \
+             a deliberate move requires the operator-initiated reassignment flow"
+                .to_string(),
+        )),
+
+        // Rate-limited / locked out — hand Retry-After to the caller so the sleep
+        // actually honors it. Only the delay-seconds form is understood; an
+        // HTTP-date (or anything unparsable) falls back to the default backoff.
+        429 => {
+            let retry_after = response
+                .headers()
+                .get(reqwest::header::RETRY_AFTER)
+                .and_then(|v| v.to_str().ok())
+                .and_then(|s| s.trim().parse::<u64>().ok())
+                .map(Duration::from_secs);
+            Err(AttemptError::RateLimited(retry_after))
+        }
+
+        // 5xx or anything else — transient from the agent's perspective.
+        _ => Err(AttemptError::Retryable(format!(
+            "unexpected enrollment response: HTTP {status}"
+        ))),
+    }
+}
+
+/// Build the HTTP client for enrollment, matching the update path's TLS posture
+/// (`rustls`, with an opt-in dev-insecure escape hatch in debug builds only).
+fn build_client() -> Result<reqwest::Client> {
+    reqwest::Client::builder()
+        .danger_accept_invalid_certs(dev_insecure_tls())
+        .build()
+        .context("failed to build enrollment HTTP client")
+}
+
+/// Dev-only TLS bypass — identical policy to `update::dev_insecure_tls`: only in
+/// debug builds AND only when `GURUCONNECT_DEV_INSECURE_TLS` is set. NEVER active
+/// in a release build.
+fn dev_insecure_tls() -> bool {
+    // `cfg!(debug_assertions)` is a compile-time constant, so in a release build
+    // this condition is always false regardless of the environment.
+    if cfg!(debug_assertions) && std::env::var("GURUCONNECT_DEV_INSECURE_TLS").is_ok() {
+        tracing::warn!(
+            "[ENROLL] TLS verification DISABLED (dev-insecure mode) — DO NOT use in production"
+        );
+        true
+    } else {
+        false
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// The request body must serialize to exactly the field names the Phase A
+    /// server deserializes (`enroll::EnrollRequest` / `EnrollLabels`). A drift here
+    /// is a silent enrollment failure, so pin the wire shape.
+    #[test]
+    fn request_serializes_to_the_server_contract() {
+        let tags = vec!["prod".to_string()];
+        let req = EnrollRequest {
+            site_code: "ACME-HQ",
+            enrollment_key: "cek_secret",
+            machine_uid: "muid_abc",
+            hostname: "WS-01",
+            labels: EnrollLabels {
+                company: Some("Acme"),
+                site: Some("HQ"),
+                department: Some("IT"),
+                device_type: Some("workstation"),
+                tags: &tags,
+            },
+        };
+        let v: serde_json::Value = serde_json::to_value(&req).unwrap();
+        assert_eq!(v["site_code"], "ACME-HQ");
+        assert_eq!(v["enrollment_key"], "cek_secret");
+        assert_eq!(v["machine_uid"], "muid_abc");
+        assert_eq!(v["hostname"], "WS-01");
+        assert_eq!(v["labels"]["company"], "Acme");
+        assert_eq!(v["labels"]["site"], "HQ");
+        assert_eq!(v["labels"]["department"], "IT");
+        assert_eq!(v["labels"]["device_type"], "workstation");
+        assert_eq!(v["labels"]["tags"][0], "prod");
+    }
+
+    /// Empty optional labels are omitted (the server defaults them), and an empty
+    /// tag list is not serialized — keeping the body minimal for a thin installer.
+    #[test]
+    fn request_omits_empty_optional_labels() {
+        let tags: Vec<String> = Vec::new();
+        let req = EnrollRequest {
+            site_code: "S",
+            enrollment_key: "cek_x",
+            machine_uid: "muid_x",
+            hostname: "H",
+            labels: EnrollLabels {
+                company: None,
+                site: None,
+                department: None,
+                device_type: None,
+                tags: &tags,
+            },
+        };
+        let v: serde_json::Value = serde_json::to_value(&req).unwrap();
+        let labels = v["labels"].as_object().unwrap();
+        // Every optional label is checked, not just a sample of them.
+        assert!(!labels.contains_key("company"));
+        assert!(!labels.contains_key("site"));
+        assert!(!labels.contains_key("department"));
+        assert!(!labels.contains_key("device_type"));
+        assert!(!labels.contains_key("tags"));
+    }
+
+    /// The success response decoder must accept both a key-bearing active body and
+    /// a keyless pending body (mirrors `EnrollResponse` with `skip_serializing_if`).
+    #[test]
+    fn response_decodes_active_and_pending_shapes() {
+        let active: EnrollResponse = serde_json::from_str(
+            r#"{"machine_id":"m1","key":"cak_live","enrollment_state":"active","disposition":"new"}"#,
+        )
+        .unwrap();
+        assert_eq!(active.key.as_deref(), Some("cak_live"));
+        assert_eq!(active.enrollment_state, "active");
+
+        let pending: EnrollResponse = serde_json::from_str(
+            r#"{"machine_id":"m2","enrollment_state":"pending","disposition":"collision_pending"}"#,
+        )
+        .unwrap();
+        assert!(pending.key.is_none());
+        assert_eq!(pending.disposition, "collision_pending");
+    }
+}
diff --git a/agent/src/main.rs b/agent/src/main.rs
index a8fd11f..93aac0a 100644
--- a/agent/src/main.rs
+++ b/agent/src/main.rs
@@ -16,7 +16,10 @@ mod capture;
 mod chat;
 mod config;
 mod consent;
+#[cfg(windows)]
+mod credential_store;
 mod encoder;
+mod enroll;
 mod identity;
 mod input;
 mod install;
@@ -323,7 +326,89 @@ fn run_agent_mode(support_code: Option) -> Result<()> {
 
     // Run the agent
     let rt = tokio::runtime::Runtime::new()?;
-    rt.block_on(run_agent(config))
+    rt.block_on(async move {
+        // SPEC-016 Phase B: resolve the operating credential before connecting.
+        // Support sessions are unaffected — they authenticate by support code, not
+        // by a per-machine cak_, so we only resolve enrollment for a managed agent.
+        if config.support_code.is_none() {
+            resolve_agent_credential(&mut config).await?;
+        }
+        run_agent(config).await
+    })
+}
+
+/// Resolve the per-machine operating credential for a managed agent (SPEC-016
+/// Phase B, run-mode wiring).
+///
+/// Precedence:
+/// 1. A `cak_` already stored encrypted at rest -> load it and connect with it
+///    (the steady-state path; no network call, no re-enroll).
+/// 2. No stored `cak_` but a non-empty `enrollment_key` + `site_code` are present
+///    -> run first-run enrollment to obtain + persist a `cak_`, then connect.
+/// 3. Neither a stored `cak_` nor enrollment material, but a non-empty
+///    `api_key` is configured -> use it as the DEPRECATED shared/legacy key
+///    (transition compatibility only; logged at WARNING).
+/// 4. Nothing usable -> error; a managed agent cannot authenticate.
+async fn resolve_agent_credential(config: &mut config::Config) -> Result<()> {
+    // 1. Stored per-machine cak_ (steady state).
+    #[cfg(windows)]
+    {
+        match credential_store::load_cak() {
+            Ok(Some(cak)) => {
+                info!("Using stored per-machine credential (cak_)");
+                config.api_key = cak;
+                // Any leftover enrollment material is now moot.
+                config.enrollment_key = None;
+                return Ok(());
+            }
+            Ok(None) => {
+                info!("No stored per-machine credential; will enroll if configured");
+            }
+            Err(e) => {
+                // A present-but-undecryptable store is a real problem (tampered or
+                // copied from another machine); surface it rather than silently
+                // re-enrolling over it.
+                return Err(e.context("failed to read the credential store"));
+            }
+        }
+    }
+
+    // 2. First-run enrollment (the SPEC-016 zero-touch path). run_enrollment only
+    // returns once a cak_ is stored (it retries network/429/collision-pending
+    // internally); a returned Err is an unrecoverable local fault.
+    // Empty strings are treated as "not configured", matching the api_key
+    // emptiness check in step 3, so blank material falls through instead of
+    // starting an enrollment loop with nothing valid to present.
+    let has_enrollment_key = config
+        .enrollment_key
+        .as_deref()
+        .filter(|key| !key.is_empty())
+        .is_some();
+    let has_site_code = config
+        .site_code
+        .as_deref()
+        .filter(|code| !code.is_empty())
+        .is_some();
+    if has_enrollment_key && has_site_code {
+        info!("Enrollment material present; running first-run enrollment");
+        enroll::run_enrollment(config).await?;
+        return Ok(());
+    }
+
+    // 3. DEPRECATED shared/legacy api_key fallback (transition only).
+    if !config.api_key.is_empty() {
+        warn!(
+            "Connecting with a DEPRECATED shared/legacy api_key. Migrate this agent \
+             to a per-site enrollment (SPEC-016); the shared key path will be removed."
+        );
+        return Ok(());
+    }
+
+    // 4. Nothing usable.
+    Err(anyhow::anyhow!(
+        "no operating credential available: no stored cak_, no enrollment_key/site_code, \
+         and no legacy api_key — this managed agent cannot authenticate"
+    ))
 }
 
 /// Run in viewer mode (connect to remote session)