feat(agent): first-run enrollment client + run-mode wiring (SPEC-016 Phase B items 3,5)
New enroll module: on a managed agent with no stored cak_ but with
enrollment_key + site_code, POST machine_uid + hostname + labels to
<https-base>/api/enroll and persist the minted cak_. Handles every Phase A
status code distinctly:
- 201 new / 200 reuse -> persist cak_ (DPAPI store) and connect
- 202 collision_pending -> log "pending operator confirmation", slow
re-check loop (no key issued; cannot connect until confirmed)
- 401 ENROLL_REJECTED / 409 ENROLL_SITE_CONFLICT -> distinct actionable
errors, long backoff (won't fix without operator action, but recovers
automatically once it does) — no tight loop
- 429 -> honor Retry-After, short backoff
- network / 5xx / decode -> short backoff
The enrollment_key and cak_ are never logged. Uses the existing reqwest
client and the update path's TLS posture (rustls; dev-insecure only in
debug + opt-in). Wire-contract unit tests pin the request shape against
the server's EnrollRequest/EnrollLabels and decode active + pending bodies.
main.rs run-mode wiring: before a managed agent connects, resolve the
operating credential by precedence — stored cak_ (steady state, no
network) -> first-run enrollment -> DEPRECATED legacy api_key (transition
only, logged at WARNING) -> error. The relay already accepts the cak_ as
the api_key query param, so the persistent transport authenticates with it
unchanged. Attended/support-code and viewer paths are untouched.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
384
agent/src/enroll.rs
Normal file
384
agent/src/enroll.rs
Normal file
@@ -0,0 +1,384 @@
|
||||
//! First-run self-enrollment client (SPEC-016 Phase B, item 4).
|
||||
//!
|
||||
//! When the agent runs as a persistent (`PermanentAgent`) install with NO stored
|
||||
//! `cak_` but WITH an `enrollment_key` + `site_code`, it walks through the
|
||||
//! public, unauthenticated `POST /api/enroll` door: it presents its site
|
||||
//! credentials and its hardware-derived `machine_uid`, and — on success — the
|
||||
//! server mints and returns a per-machine `cak_` operating credential exactly
|
||||
//! once. The agent persists that `cak_` encrypted at rest
|
||||
//! ([`crate::credential_store`]) and connects with it; on every later run it uses
|
||||
//! the stored `cak_` directly and never re-enrolls.
|
||||
//!
|
||||
//! Server contract consumed (must match `server/src/api/enroll.rs`):
|
||||
//! - Request: `{ site_code, enrollment_key, machine_uid, hostname,
|
||||
//! labels:{company,site,department,device_type,tags} }`.
|
||||
//! - `201 Created` -> new enrollment; body has `key` (the `cak_`).
|
||||
//! - `200 OK` -> reuse (re-image / re-install); body has `key`.
|
||||
//! - `202 Accepted` -> `collision_pending`; NO key — operator must confirm in
|
||||
//! the dashboard before the endpoint can connect.
|
||||
//! - `401 Unauthorized` -> `ENROLL_REJECTED` (bad/rotated key or unknown site):
|
||||
//! terminal-ish config problem, back off long.
|
||||
//! - `409 Conflict` -> `ENROLL_SITE_CONFLICT` (machine bound to another site):
|
||||
//! terminal-ish, requires the operator reassignment flow; back off long.
|
||||
//! - `429 Too Many Requests` -> rate-limited; back off and retry.
|
||||
//!
|
||||
//! SECURITY: never log the `enrollment_key` or the minted `cak_`. Only states,
|
||||
//! dispositions, and the (non-secret) `machine_uid`/`site_code` are logged.
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::config::Config;
|
||||
|
||||
/// `POST /api/enroll` request body — mirrors `enroll::EnrollRequest`.
|
||||
#[derive(Debug, Serialize)]
|
||||
struct EnrollRequest<'a> {
|
||||
site_code: &'a str,
|
||||
enrollment_key: &'a str,
|
||||
machine_uid: &'a str,
|
||||
hostname: &'a str,
|
||||
labels: EnrollLabels<'a>,
|
||||
}
|
||||
|
||||
/// Labels carried at enrollment — mirrors `enroll::EnrollLabels`.
|
||||
#[derive(Debug, Serialize)]
|
||||
struct EnrollLabels<'a> {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
company: Option<&'a str>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
site: Option<&'a str>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
department: Option<&'a str>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
device_type: Option<&'a str>,
|
||||
#[serde(skip_serializing_if = "slice_is_empty")]
|
||||
tags: &'a [String],
|
||||
}
|
||||
|
||||
/// Predicate used by `skip_serializing_if` on the `tags` slice.
///
/// A slice-typed helper is needed because `Vec::is_empty` cannot bind the
/// `&&[String]` that the field is handed over as.
///
/// Returns `true` exactly when `s` has no elements.
fn slice_is_empty(s: &[String]) -> bool {
    matches!(s, [])
}
|
||||
|
||||
/// `POST /api/enroll` success body — mirrors `enroll::EnrollResponse`.
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct EnrollResponse {
|
||||
#[allow(dead_code)]
|
||||
machine_id: String,
|
||||
#[serde(default)]
|
||||
key: Option<String>,
|
||||
enrollment_state: String,
|
||||
disposition: String,
|
||||
}
|
||||
|
||||
/// Backoff after a retryable failure (429 / network / 5xx).
|
||||
const RETRYABLE_BACKOFF: Duration = Duration::from_secs(30);
|
||||
/// Backoff after a terminal-ish config failure (401 / 409) or collision-pending.
|
||||
/// These won't fix themselves without operator action, so retry slowly rather
|
||||
/// than hot-looping while still recovering automatically once it IS fixed.
|
||||
const TERMINAL_BACKOFF: Duration = Duration::from_secs(300);
|
||||
|
||||
/// Drive enrollment until a `cak_` is issued, persisting it into the credential
|
||||
/// store on success and loading it into `config.api_key`.
|
||||
///
|
||||
/// Loops with backoff across retryable failures (it must not give up — a managed
|
||||
/// machine left running should eventually enroll once the server/site is healthy)
|
||||
/// and across collision-pending (HTTP 202: it keeps re-checking on a slow cadence
|
||||
/// until an operator confirms the endpoint in the dashboard and the server begins
|
||||
/// issuing a key). Returns `Ok(())` only once a `cak_` is stored. The only `Err`
|
||||
/// returns are unrecoverable local faults (missing config, an un-persistable
|
||||
/// credential) — network/HTTP failures are retried, never propagated.
|
||||
pub async fn run_enrollment(config: &mut Config) -> Result<()> {
|
||||
let site_code = config
|
||||
.site_code
|
||||
.clone()
|
||||
.ok_or_else(|| anyhow!("enrollment requested but no site_code is configured"))?;
|
||||
let enrollment_key = config
|
||||
.enrollment_key
|
||||
.clone()
|
||||
.ok_or_else(|| anyhow!("enrollment requested but no enrollment_key is configured"))?;
|
||||
|
||||
let https_base = config.https_base()?;
|
||||
let machine_uid = crate::identity::machine_uid();
|
||||
let hostname = config.hostname();
|
||||
|
||||
tracing::info!(
|
||||
"[ENROLL] first-run enrollment: site_code={} machine_uid={} hostname={}",
|
||||
site_code,
|
||||
machine_uid,
|
||||
hostname
|
||||
);
|
||||
|
||||
loop {
|
||||
match attempt_enroll(
|
||||
&https_base,
|
||||
&site_code,
|
||||
&enrollment_key,
|
||||
&machine_uid,
|
||||
&hostname,
|
||||
config,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(AttemptResult::Issued(cak)) => {
|
||||
// Persist encrypted-at-rest, then load into the live config so the
|
||||
// transport authenticates with the new per-machine credential.
|
||||
#[cfg(windows)]
|
||||
crate::credential_store::store_cak(&cak)
|
||||
.context("failed to persist issued cak_ to the credential store")?;
|
||||
config.api_key = cak;
|
||||
// Enrollment material is single-use; drop it so it is not retained
|
||||
// in memory or accidentally reused.
|
||||
config.enrollment_key = None;
|
||||
tracing::info!("[ENROLL] enrollment complete; connecting with per-machine key");
|
||||
return Ok(());
|
||||
}
|
||||
Ok(AttemptResult::Pending) => {
|
||||
tracing::warn!(
|
||||
"[ENROLL] pending operator confirmation (machine_uid collision); \
|
||||
this machine cannot connect until confirmed in the dashboard. \
|
||||
Re-checking in {}s.",
|
||||
TERMINAL_BACKOFF.as_secs()
|
||||
);
|
||||
tokio::time::sleep(TERMINAL_BACKOFF).await;
|
||||
}
|
||||
Err(AttemptError::Terminal(msg)) => {
|
||||
tracing::error!(
|
||||
"[ENROLL] enrollment refused (operator action required): {msg}. \
|
||||
Retrying in {}s.",
|
||||
TERMINAL_BACKOFF.as_secs()
|
||||
);
|
||||
tokio::time::sleep(TERMINAL_BACKOFF).await;
|
||||
}
|
||||
Err(AttemptError::Retryable(msg)) => {
|
||||
tracing::warn!(
|
||||
"[ENROLL] transient enrollment failure: {msg}. Retrying in {}s.",
|
||||
RETRYABLE_BACKOFF.as_secs()
|
||||
);
|
||||
tokio::time::sleep(RETRYABLE_BACKOFF).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of one HTTP enrollment attempt.
|
||||
enum AttemptResult {
|
||||
/// A `cak_` was issued (201/200). Carries the plaintext (never logged).
|
||||
Issued(String),
|
||||
/// Collision-gated (202): no key issued.
|
||||
Pending,
|
||||
}
|
||||
|
||||
/// Failure classes that drive the backoff policy.
|
||||
enum AttemptError {
|
||||
/// 401/409 — won't fix without operator action; back off long but keep trying.
|
||||
Terminal(String),
|
||||
/// 429 / network / 5xx / decode — transient; short backoff.
|
||||
Retryable(String),
|
||||
}
|
||||
|
||||
/// Make one `POST /api/enroll` call and classify the response per the contract.
|
||||
async fn attempt_enroll(
|
||||
https_base: &str,
|
||||
site_code: &str,
|
||||
enrollment_key: &str,
|
||||
machine_uid: &str,
|
||||
hostname: &str,
|
||||
config: &Config,
|
||||
) -> std::result::Result<AttemptResult, AttemptError> {
|
||||
let url = format!("{}/api/enroll", https_base.trim_end_matches('/'));
|
||||
|
||||
let body = EnrollRequest {
|
||||
site_code,
|
||||
enrollment_key,
|
||||
machine_uid,
|
||||
hostname,
|
||||
labels: EnrollLabels {
|
||||
company: config.company.as_deref().filter(|s| !s.is_empty()),
|
||||
site: config.site.as_deref().filter(|s| !s.is_empty()),
|
||||
department: config.department.as_deref().filter(|s| !s.is_empty()),
|
||||
device_type: config.device_type.as_deref().filter(|s| !s.is_empty()),
|
||||
tags: &config.tags,
|
||||
},
|
||||
};
|
||||
|
||||
let client = build_client().map_err(|e| AttemptError::Retryable(e.to_string()))?;
|
||||
|
||||
let response = client
|
||||
.post(&url)
|
||||
.json(&body)
|
||||
.timeout(Duration::from_secs(30))
|
||||
.send()
|
||||
.await
|
||||
.map_err(|e| AttemptError::Retryable(format!("request to {url} failed: {e}")))?;
|
||||
|
||||
let status = response.status();
|
||||
match status.as_u16() {
|
||||
// New (201) or reuse (200): body carries the cak_.
|
||||
200 | 201 => {
|
||||
let parsed: EnrollResponse = response
|
||||
.json()
|
||||
.await
|
||||
.map_err(|e| AttemptError::Retryable(format!("malformed success body: {e}")))?;
|
||||
match parsed.key {
|
||||
Some(cak) if !cak.is_empty() => {
|
||||
tracing::info!(
|
||||
"[ENROLL] server accepted enrollment: state={} disposition={}",
|
||||
parsed.enrollment_state,
|
||||
parsed.disposition
|
||||
);
|
||||
Ok(AttemptResult::Issued(cak))
|
||||
}
|
||||
// 2xx with no key is contract-violating for the active path; treat
|
||||
// as retryable so we don't silently spin or crash.
|
||||
_ => Err(AttemptError::Retryable(format!(
|
||||
"server returned {} with no key (state={}, disposition={})",
|
||||
status, parsed.enrollment_state, parsed.disposition
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
// Collision-gated: pending operator confirmation, no key.
|
||||
202 => {
|
||||
// Body decode is best-effort here; the status alone is authoritative.
|
||||
Ok(AttemptResult::Pending)
|
||||
}
|
||||
|
||||
// Bad/rotated enrollment key or unknown site code.
|
||||
401 => Err(AttemptError::Terminal(
|
||||
"ENROLL_REJECTED — the site code or enrollment key is invalid or rotated; \
|
||||
this installer needs a current per-site key"
|
||||
.to_string(),
|
||||
)),
|
||||
|
||||
// Machine already enrolled at a different site.
|
||||
409 => Err(AttemptError::Terminal(
|
||||
"ENROLL_SITE_CONFLICT — this machine is already enrolled at another site; \
|
||||
a deliberate move requires the operator-initiated reassignment flow"
|
||||
.to_string(),
|
||||
)),
|
||||
|
||||
// Rate-limited / locked out — honor Retry-After if present, else default.
|
||||
429 => {
|
||||
let retry_after = response
|
||||
.headers()
|
||||
.get(reqwest::header::RETRY_AFTER)
|
||||
.and_then(|v| v.to_str().ok())
|
||||
.and_then(|s| s.parse::<u64>().ok());
|
||||
Err(AttemptError::Retryable(match retry_after {
|
||||
Some(secs) => format!("RATE_LIMITED (retry-after {secs}s)"),
|
||||
None => "RATE_LIMITED".to_string(),
|
||||
}))
|
||||
}
|
||||
|
||||
// 5xx or anything else — transient from the agent's perspective.
|
||||
_ => Err(AttemptError::Retryable(format!(
|
||||
"unexpected enrollment response: HTTP {status}"
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the HTTP client for enrollment, matching the update path's TLS posture
|
||||
/// (`rustls`, with an opt-in dev-insecure escape hatch in debug builds only).
|
||||
fn build_client() -> Result<reqwest::Client> {
|
||||
reqwest::Client::builder()
|
||||
.danger_accept_invalid_certs(dev_insecure_tls())
|
||||
.build()
|
||||
.context("failed to build enrollment HTTP client")
|
||||
}
|
||||
|
||||
/// Dev-only TLS bypass — identical policy to `update::dev_insecure_tls`: only in
|
||||
/// debug builds AND only when `GURUCONNECT_DEV_INSECURE_TLS` is set. NEVER active
|
||||
/// in a release build.
|
||||
fn dev_insecure_tls() -> bool {
|
||||
if cfg!(debug_assertions) && std::env::var("GURUCONNECT_DEV_INSECURE_TLS").is_ok() {
|
||||
tracing::warn!(
|
||||
"[ENROLL] TLS verification DISABLED (dev-insecure mode) — DO NOT use in production"
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// The request body must serialize to exactly the field names the Phase A
    /// server deserializes (`enroll::EnrollRequest` / `EnrollLabels`). A drift here
    /// is a silent enrollment failure, so pin the wire shape.
    #[test]
    fn request_serializes_to_the_server_contract() {
        let tags = vec!["prod".to_string()];
        let req = EnrollRequest {
            site_code: "ACME-HQ",
            enrollment_key: "cek_secret",
            machine_uid: "muid_abc",
            hostname: "WS-01",
            labels: EnrollLabels {
                company: Some("Acme"),
                site: Some("HQ"),
                department: Some("IT"),
                device_type: Some("workstation"),
                tags: &tags,
            },
        };
        let v: serde_json::Value = serde_json::to_value(&req).unwrap();
        assert_eq!(v["site_code"], "ACME-HQ");
        assert_eq!(v["enrollment_key"], "cek_secret");
        assert_eq!(v["machine_uid"], "muid_abc");
        assert_eq!(v["hostname"], "WS-01");
        assert_eq!(v["labels"]["company"], "Acme");
        assert_eq!(v["labels"]["site"], "HQ");
        assert_eq!(v["labels"]["department"], "IT");
        assert_eq!(v["labels"]["device_type"], "workstation");
        assert_eq!(v["labels"]["tags"][0], "prod");
    }

    /// Empty optional labels are omitted (the server defaults them), and an empty
    /// tag list is not serialized — keeping the body minimal for a thin installer.
    /// Every optional label is checked so that dropping a `skip_serializing_if`
    /// from any one of them is caught.
    #[test]
    fn request_omits_empty_optional_labels() {
        let tags: Vec<String> = Vec::new();
        let req = EnrollRequest {
            site_code: "S",
            enrollment_key: "cek_x",
            machine_uid: "muid_x",
            hostname: "H",
            labels: EnrollLabels {
                company: None,
                site: None,
                department: None,
                device_type: None,
                tags: &tags,
            },
        };
        let v: serde_json::Value = serde_json::to_value(&req).unwrap();
        let labels = v["labels"].as_object().unwrap();
        assert!(!labels.contains_key("company"));
        assert!(!labels.contains_key("site"));
        assert!(!labels.contains_key("department"));
        assert!(!labels.contains_key("device_type"));
        assert!(!labels.contains_key("tags"));
    }

    /// The success response decoder must accept both a key-bearing active body and
    /// a keyless pending body (mirrors `EnrollResponse` with `skip_serializing_if`).
    #[test]
    fn response_decodes_active_and_pending_shapes() {
        let active: EnrollResponse = serde_json::from_str(
            r#"{"machine_id":"m1","key":"cak_live","enrollment_state":"active","disposition":"new"}"#,
        )
        .unwrap();
        assert_eq!(active.key.as_deref(), Some("cak_live"));
        assert_eq!(active.enrollment_state, "active");

        let pending: EnrollResponse = serde_json::from_str(
            r#"{"machine_id":"m2","enrollment_state":"pending","disposition":"collision_pending"}"#,
        )
        .unwrap();
        assert!(pending.key.is_none());
        assert_eq!(pending.disposition, "collision_pending");
    }
}
|
||||
@@ -16,7 +16,10 @@ mod capture;
|
||||
mod chat;
|
||||
mod config;
|
||||
mod consent;
|
||||
#[cfg(windows)]
|
||||
mod credential_store;
|
||||
mod encoder;
|
||||
mod enroll;
|
||||
mod identity;
|
||||
mod input;
|
||||
mod install;
|
||||
@@ -323,7 +326,76 @@ fn run_agent_mode(support_code: Option<String>) -> Result<()> {
|
||||
|
||||
// Run the agent
|
||||
let rt = tokio::runtime::Runtime::new()?;
|
||||
rt.block_on(run_agent(config))
|
||||
rt.block_on(async move {
|
||||
// SPEC-016 Phase B: resolve the operating credential before connecting.
|
||||
// Support sessions are unaffected — they authenticate by support code, not
|
||||
// by a per-machine cak_, so we only resolve enrollment for a managed agent.
|
||||
if config.support_code.is_none() {
|
||||
resolve_agent_credential(&mut config).await?;
|
||||
}
|
||||
run_agent(config).await
|
||||
})
|
||||
}
|
||||
|
||||
/// Resolve the per-machine operating credential for a managed agent (SPEC-016
|
||||
/// Phase B, run-mode wiring).
|
||||
///
|
||||
/// Precedence:
|
||||
/// 1. A `cak_` already stored encrypted at rest -> load it and connect with it
|
||||
/// (the steady-state path; no network call, no re-enroll).
|
||||
/// 2. No stored `cak_` but an `enrollment_key` + `site_code` are present ->
|
||||
/// run first-run enrollment to obtain + persist a `cak_`, then connect.
|
||||
/// 3. Neither a stored `cak_` nor enrollment material, but a non-empty
|
||||
/// `api_key` is configured -> use it as the DEPRECATED shared/legacy key
|
||||
/// (transition compatibility only; logged at WARNING).
|
||||
/// 4. Nothing usable -> error; a managed agent cannot authenticate.
|
||||
async fn resolve_agent_credential(config: &mut config::Config) -> Result<()> {
|
||||
// 1. Stored per-machine cak_ (steady state).
|
||||
#[cfg(windows)]
|
||||
{
|
||||
match credential_store::load_cak() {
|
||||
Ok(Some(cak)) => {
|
||||
info!("Using stored per-machine credential (cak_)");
|
||||
config.api_key = cak;
|
||||
// Any leftover enrollment material is now moot.
|
||||
config.enrollment_key = None;
|
||||
return Ok(());
|
||||
}
|
||||
Ok(None) => {
|
||||
info!("No stored per-machine credential; will enroll if configured");
|
||||
}
|
||||
Err(e) => {
|
||||
// A present-but-undecryptable store is a real problem (tampered or
|
||||
// copied from another machine); surface it rather than silently
|
||||
// re-enrolling over it.
|
||||
return Err(e.context("failed to read the credential store"));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 2. First-run enrollment (the SPEC-016 zero-touch path). run_enrollment only
|
||||
// returns once a cak_ is stored (it retries network/429/collision-pending
|
||||
// internally); a returned Err is an unrecoverable local fault.
|
||||
if config.enrollment_key.is_some() && config.site_code.is_some() {
|
||||
info!("Enrollment material present; running first-run enrollment");
|
||||
enroll::run_enrollment(config).await?;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// 3. DEPRECATED shared/legacy api_key fallback (transition only).
|
||||
if !config.api_key.is_empty() {
|
||||
warn!(
|
||||
"Connecting with a DEPRECATED shared/legacy api_key. Migrate this agent \
|
||||
to a per-site enrollment (SPEC-016); the shared key path will be removed."
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// 4. Nothing usable.
|
||||
Err(anyhow::anyhow!(
|
||||
"no operating credential available: no stored cak_, no enrollment_key/site_code, \
|
||||
and no legacy api_key — this managed agent cannot authenticate"
|
||||
))
|
||||
}
|
||||
|
||||
/// Run in viewer mode (connect to remote session)
|
||||
|
||||
Reference in New Issue
Block a user