From 7602b4346a955db787c2cbe4900513bce3e9c9e2 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Tue, 2 Jun 2026 13:43:01 -0700 Subject: [PATCH 1/2] feat(agent): SPEC-018 Phase 1 managed-agent SYSTEM service host Run the managed/persistent GuruConnect agent as a LocalSystem Windows service so it is reachable at the login screen and across reboots, and so the SPEC-016 per-machine cak_ store (ACL-restricted to SYSTEM + Administrators) is finally readable in-context. Phase 1 scope (host + lifecycle only): - New agent/src/service/mod.rs: registers "GuruConnectAgent" with the SCM via the windows-service dispatcher, reports a correct lifecycle (StartPending -> Running -> StopPending -> Stopped), handles Stop/Shutdown via an AtomicBool the agent loop polls (graceful WS close), and provides install/uninstall/start (LocalSystem, AutoStart, sc-failure crash recovery). Idempotent install/uninstall. - main.rs: hidden `service-run` subcommand routes the SCM-launched process into the dispatcher; new run_managed_agent_service() runs the existing RunMode::PermanentAgent logic (resolve/enroll cak_, hold the relay) as SYSTEM. run_agent() now takes an optional SCM shutdown flag, skips the HKCU Run autostart and the tray when run as the service, and interrupts the reconnect backoff promptly on stop. An interactive launch of a managed binary now installs+starts the service and exits instead of double-running. - install.rs: a managed install (embedded config present) installs the LocalSystem service as the single autostart and removes the legacy HKCU Run entry; uninstall stops+deletes the service (idempotent). Attended/viewer installs are untouched. - Kept the SPEC-016 Phase B fail-fast guard as a harmless safety net for any non-SYSTEM invocation; updated its comment to name this service as the managed run context. Phase 2 NOT built (seams documented): session broker, per-session capture/input worker, CreateProcessAsUserW token handoff, service/worker IPC, and SERVICE_CONTROL_SESSIONCHANGE. Phase 1 enrolls/connects as SYSTEM but does not capture a desktop (a Session-0 process cannot). No service is installed/started on the dev host; that is a VM/admin integration step. fmt + clippy -D warnings + release build + 55 tests all pass. Co-Authored-By: Claude Opus 4.8 (1M context) --- agent/src/install.rs | 70 ++++++ agent/src/main.rs | 250 ++++++++++++++++++--- agent/src/service/mod.rs | 457 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 746 insertions(+), 31 deletions(-) create mode 100644 agent/src/service/mod.rs diff --git a/agent/src/install.rs b/agent/src/install.rs index 82284e6..77a8e29 100644 --- a/agent/src/install.rs +++ b/agent/src/install.rs @@ -290,6 +290,18 @@ pub fn install(force_user_install: bool) -> Result<()> { // Register protocol handler register_protocol_handler(elevated)?; + // SPEC-018: a MANAGED install (embedded config => persistent agent) installs + // the LocalSystem service as its single autostart and removes the per-user + // HKCU\…\Run entry. Attended (support-code) and viewer installs are untouched: + // they have no embedded config and continue to use the HKCU Run / protocol + // handler paths exactly as before. + #[cfg(windows)] + { + if crate::config::Config::has_embedded_config() { + install_managed_service(&exe_path)?; + } + } + info!("Installation complete!"); if elevated { info!("Installed system-wide to: {}", install_path.display()); @@ -300,6 +312,64 @@ pub fn install(force_user_install: bool) -> Result<()> { Ok(()) } +/// SPEC-018: install the managed agent as a LocalSystem service and swap out the +/// legacy per-user `HKCU\…\Run` autostart so the service is the single managed +/// autostart (no double-run). +/// +/// Installing a LocalSystem service requires Administrator. If the SCM rejects the +/// create (not elevated), we surface the error rather than silently leaving the +/// machine with no managed autostart — a managed deployment is expected to run the +/// install elevated. The HKCU Run entry is removed best-effort regardless. +#[cfg(windows)] +pub fn install_managed_service(exe_path: &std::path::Path) -> Result<()> { + info!("Managed install: registering LocalSystem service (SPEC-018)"); + + crate::service::install_service(exe_path) + .map_err(|e| anyhow!("failed to install the managed agent service: {e:#}"))?; + + // Start the service now so the agent comes up immediately on first install + // rather than only on the next boot. Best-effort: the service is auto-start, so + // a transient start failure still self-heals on reboot. + if let Err(e) = crate::service::start_service() { + warn!( + "managed service installed but did not start now ({e:#}); \ + it is auto-start and will run on next boot" + ); + } + + // Remove the legacy per-user autostart so the agent does not also launch in the + // user's session (which would double-run alongside the service). + if let Err(e) = crate::startup::remove_from_startup() { + warn!( + "managed service installed, but failed to remove the legacy HKCU Run \ + autostart (harmless if it was never present): {}", + e + ); + } else { + info!("removed legacy HKCU Run autostart (service is now the managed autostart)"); + } + + Ok(()) +} + +/// SPEC-018: remove the managed agent service and any legacy HKCU Run autostart. +/// Idempotent — succeeds if neither is present. +#[cfg(windows)] +pub fn uninstall_managed_service() -> Result<()> { + info!("Managed uninstall: removing LocalSystem service (SPEC-018)"); + + // Best-effort removal of the legacy autostart first (cheap, no SCM). + if let Err(e) = crate::startup::remove_from_startup() { + warn!( + "failed to remove legacy HKCU Run autostart during uninstall: {}", + e + ); + } + + crate::service::uninstall_service() + .map_err(|e| anyhow!("failed to uninstall the managed agent service: {e:#}")) +} + /// Check if the guruconnect:// protocol handler is registered #[cfg(windows)] pub fn is_protocol_handler_registered() -> bool { diff --git a/agent/src/main.rs b/agent/src/main.rs index 5c2fb55..d2d57b3 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -24,6 +24,8 @@ mod identity; mod input; mod install; mod sas_client; +#[cfg(windows)] +mod service; mod session; mod startup; mod transport; @@ -182,6 +184,12 @@ enum Commands { /// Show detailed version and build information #[command(name = "version-info")] VersionInfo, + + /// Internal: entry point invoked by the Windows Service Control Manager to run + /// the managed agent as a LocalSystem service (SPEC-018). Not for interactive + /// use — running it by hand fails because there is no controlling SCM. + #[command(name = "service-run", hide = true)] + ServiceRun, } fn main() -> Result<()> { @@ -236,6 +244,21 @@ fn main() -> Result<()> { println!("{}", build_info::full_version()); Ok(()) } + Some(Commands::ServiceRun) => { + // SPEC-018 Phase 1: SCM-invoked entry. Hand off to the service + // dispatcher, which calls back into the control loop and runs the + // managed-agent logic as SYSTEM. Blocks until the service stops. + #[cfg(windows)] + { + service::run_dispatcher() + } + #[cfg(not(windows))] + { + Err(anyhow::anyhow!( + "service-run is a Windows-only entry point (SPEC-018)" + )) + } + } None => { // No subcommand - detect mode from filename or embedded config // Legacy: if support_code arg provided, use that @@ -264,16 +287,31 @@ fn main() -> Result<()> { run_agent_mode(Some(code)) } RunMode::PermanentAgent => { - // Embedded config found - run as permanent agent + // Embedded config found - managed/persistent agent. info!("Permanent agent mode detected (embedded config)"); - if !install::is_protocol_handler_registered() { - // First run - install then run as agent - info!("First run - installing agent"); - if let Err(e) = install::install(false) { - warn!("Installation failed: {}", e); - } + + // SPEC-018: managed mode runs as the LocalSystem service, not as + // an interactive process. The service is the single autostart. + // - If the service is already installed, the service is (or + // will be) running the agent — this interactive invocation + // must NOT spawn a second agent. Exit quietly. + // - On first run, install (which installs + starts the service + // and removes the legacy HKCU Run entry), then exit and let + // the service carry the agent as SYSTEM. + #[cfg(windows)] + { + run_permanent_agent_managed() + } + #[cfg(not(windows))] + { + if !install::is_protocol_handler_registered() { + info!("First run - installing agent"); + if let Err(e) = install::install(false) { + warn!("Installation failed: {}", e); + } + } + run_agent_mode(None) } - run_agent_mode(None) } RunMode::Default => { // No special mode detected - use legacy logic @@ -333,10 +371,92 @@ fn run_agent_mode(support_code: Option) -> Result<()> { if config.support_code.is_none() { resolve_agent_credential(&mut config).await?; } - run_agent(config).await + run_agent(config, None).await }) } +/// SPEC-018 Phase 1: run the managed/persistent agent as the LocalSystem service. +/// +/// Invoked from the service control loop ([`service::run_service`]) once the +/// service has reported `Running`. This is the same persistent-agent logic as +/// [`run_agent_mode`] (load config, resolve/enroll the per-machine `cak_` per +/// SPEC-016, hold the relay connection) — but it runs **as SYSTEM**, so the +/// SYSTEM-ACL'd `cak_` store is finally readable in-context, and it observes the +/// SCM `shutdown` flag for a graceful stop. +/// +/// Returns `Ok(())` when the agent loop exits because a stop was requested, and +/// `Err` only on an unrecoverable *local* fault (e.g. no usable credential and no +/// enrollment material) — network errors are retried inside the loop and never +/// surface here. +/// +/// Phase 2 seam: this is where the session broker is wired in — the runtime +/// started here will own the broker that spawns the per-session capture/input +/// worker (`CreateProcessAsUserW`) and the IPC server. Phase 1 connects/enrolls +/// only; it does not capture a desktop (a Session-0 SYSTEM process cannot). +#[cfg(windows)] +pub fn run_managed_agent_service( + shutdown: std::sync::Arc, +) -> Result<()> { + info!("Loading managed-agent configuration (running as SYSTEM)"); + + let mut config = config::Config::load()?; + // The service ONLY ever runs the managed/persistent path. A support session is + // an interactive, user-launched flow and must never be carried by the service. + config.support_code = None; + + info!("Server: {}", config.server_url); + if let Some(ref company) = config.company { + info!("Company: {}", company); + } + if let Some(ref site) = config.site { + info!("Site: {}", site); + } + + let rt = tokio::runtime::Runtime::new()?; + rt.block_on(async move { + // SPEC-016 Phase B: resolve the operating credential before connecting. + // Running as SYSTEM, the SYSTEM+Administrators-ACL'd cak_ store is now + // readable in-context, so the Phase B fail-fast guard is not hit on this + // path (it remains as a safety net for any non-SYSTEM invocation). + resolve_agent_credential(&mut config).await?; + run_agent(config, Some(shutdown)).await + }) +} + +/// SPEC-018 Phase 1: handle an interactive launch of a MANAGED agent binary (one +/// carrying embedded config, detected as [`config::RunMode::PermanentAgent`]). +/// +/// Managed mode runs as the LocalSystem service, never as an interactive process: +/// - If the service is already installed, the service is (or will be) running +/// the agent as SYSTEM, so this interactive invocation must NOT spawn a second +/// agent — it exits quietly. +/// - On first run, install (which installs + starts the service and removes the +/// legacy `HKCU\…\Run` autostart), then exit and let the service carry the +/// agent. If the service install fails (e.g. not elevated), fall back to +/// running the agent in-process for this run so the machine is not left with no +/// agent at all. +#[cfg(windows)] +fn run_permanent_agent_managed() -> Result<()> { + if service::is_service_installed() { + info!( + "Managed service already installed; the service runs the agent as SYSTEM — \ + this interactive instance has nothing to do" + ); + return Ok(()); + } + + info!("First run - installing managed agent service"); + if let Err(e) = install::install(false) { + warn!( + "Managed service install failed ({e:#}); falling back to in-process agent for this run" + ); + return run_agent_mode(None); + } + + info!("Managed agent service installed; handing off to the service"); + Ok(()) +} + /// Resolve the per-machine operating credential for a managed agent (SPEC-016 /// Phase B, run-mode wiring). /// @@ -372,9 +492,13 @@ async fn resolve_agent_credential(config: &mut config::Config) -> Result<()> { // do NOT silently re-enroll. The SYSTEM+Administrators ACL is correct // for the target (Option A) and is deliberately kept. // - // NOTE: this guard is satisfied/removed once the GuruConnect SYSTEM - // service host lands (separate spec, SPEC-018) and the agent always - // runs as SYSTEM — at which point the store is always readable. + // SPEC-018 (this spec): the managed agent now runs as the GuruConnect + // SYSTEM service ([`run_managed_agent_service`]), so on the production + // managed path the store IS readable in-context and this branch is NOT + // hit. The guard is intentionally retained as a harmless safety net for + // any non-SYSTEM invocation (e.g. someone running the managed binary + // interactively): it still fails fast with an actionable message rather + // than bricking. Do NOT remove it in Phase 1. Err(LoadCakError::Io { permission_denied: true, source, @@ -484,7 +608,22 @@ fn run_install(force_user_install: bool) -> Result<()> { fn run_uninstall() -> Result<()> { info!("Uninstalling GuruConnect..."); - // Remove from startup + // SPEC-018: remove the managed LocalSystem service and the legacy HKCU Run + // autostart. Idempotent — no error if the service was never installed (an + // attended/viewer install has no service), so this is safe for every install + // shape. Requires Administrator to delete the service; a non-elevated uninstall + // still clears the per-user autostart below. + #[cfg(windows)] + { + if let Err(e) = install::uninstall_managed_service() { + warn!( + "Failed to remove managed service (may require Administrator): {}", + e + ); + } + } + + // Remove from startup (covers non-elevated / attended / viewer installs). if let Err(e) = startup::remove_from_startup() { warn!("Failed to remove from startup: {}", e); } @@ -582,31 +721,62 @@ fn cleanup_on_exit() { } } -/// Run the agent main loop -async fn run_agent(config: config::Config) -> Result<()> { +/// Run the agent main loop. +/// +/// `service_shutdown`, when present, is the SCM cooperative-stop flag (SPEC-018): +/// the managed-agent service passes it so the loop exits promptly on +/// `Stop`/`Shutdown`. It is `None` for the interactive/user-launched paths, which +/// stop via the tray exit / server control messages instead. +async fn run_agent( + config: config::Config, + service_shutdown: Option>, +) -> Result<()> { + use std::sync::atomic::Ordering; + let elevated = install::is_elevated(); + let running_as_service = service_shutdown.is_some(); let mut session = session::SessionManager::new(config.clone(), elevated); let is_support_session = config.support_code.is_some(); let hostname = config.hostname(); - // Add to startup - if let Err(e) = startup::add_to_startup() { + // Helper: has the SCM asked us to stop? + let stop_requested = |flag: &Option>| -> bool { + flag.as_ref() + .map(|f| f.load(Ordering::SeqCst)) + .unwrap_or(false) + }; + + // Autostart persistence: + // - As the SYSTEM service (SPEC-018), the SERVICE itself is the managed + // autostart — do NOT write the per-user HKCU\…\Run entry (that would be a + // second, redundant autostart, and writing it from SYSTEM lands in the + // wrong hive). The service install/uninstall owns lifecycle. + // - Interactive/user-launched runs keep the existing HKCU Run behavior. + if running_as_service { + info!("Running as the GuruConnect SYSTEM service; service is the autostart (skipping HKCU Run)"); + } else if let Err(e) = startup::add_to_startup() { warn!("Failed to add to startup: {}", e); } - // Create tray icon - let tray = match tray::TrayController::new( - &hostname, - config.support_code.as_deref(), - is_support_session, - ) { - Ok(t) => { - info!("Tray icon created"); - Some(t) - } - Err(e) => { - warn!("Failed to create tray icon: {}", e); - None + // A Session-0 SYSTEM service has no interactive desktop, so a tray icon is + // both impossible and meaningless there (SPEC-018 Phase 2 moves the user-facing + // surface into the per-session worker). Only create the tray off the service. + let tray = if running_as_service { + None + } else { + match tray::TrayController::new( + &hostname, + config.support_code.as_deref(), + is_support_session, + ) { + Ok(t) => { + info!("Tray icon created"); + Some(t) + } + Err(e) => { + warn!("Failed to create tray icon: {}", e); + None + } } }; @@ -615,6 +785,12 @@ async fn run_agent(config: config::Config) -> Result<()> { // Connect to server and run main loop loop { + // SPEC-018: honour an SCM stop request before (re)connecting. + if stop_requested(&service_shutdown) { + info!("Service stop requested; exiting agent loop"); + return Ok(()); + } + info!("Connecting to server..."); if is_support_session { @@ -713,6 +889,18 @@ async fn run_agent(config: config::Config) -> Result<()> { } info!("Reconnecting in 5 seconds..."); - tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; + // SPEC-018: poll the SCM stop flag during the backoff so a service stop is + // honoured within ~250ms instead of waiting the full reconnect delay. + if service_shutdown.is_some() { + for _ in 0..20 { + if stop_requested(&service_shutdown) { + info!("Service stop requested during reconnect backoff; exiting agent loop"); + return Ok(()); + } + tokio::time::sleep(tokio::time::Duration::from_millis(250)).await; + } + } else { + tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; + } } } diff --git a/agent/src/service/mod.rs b/agent/src/service/mod.rs new file mode 100644 index 0000000..1ed76f3 --- /dev/null +++ b/agent/src/service/mod.rs @@ -0,0 +1,457 @@ +//! Windows SYSTEM service host for the managed GuruConnect agent (SPEC-018). +//! +//! # Phase 1 scope (this module) +//! +//! Phase 1 proves the *managed/persistent* agent can run as **LocalSystem** in +//! the isolated Session 0 across reboots and at the login screen: +//! +//! 1. Register the agent with the Service Control Manager (SCM) and run, when +//! started, the **existing persistent-agent logic** (`RunMode::PermanentAgent` +//! path) *as SYSTEM* — i.e. resolve/enroll the per-machine `cak_` (SPEC-016, +//! now readable because the SYSTEM-ACL'd store is in-context) and hold the +//! relay WSS connection. +//! 2. Report a correct service lifecycle to the SCM (`StartPending` -> +//! `Running` -> `StopPending` -> `Stopped`) and handle `Stop`/`Shutdown` +//! gracefully (signal the agent loop to close the WS connection and exit). +//! 3. Provide install/uninstall of the service (LocalSystem, auto-start, crash +//! recovery) so managed mode uses the service as its single autostart +//! instead of the per-user `HKCU\…\Run` entry. +//! +//! # Phase 2 (deliberately NOT built here — see SPEC-018 §Scope) +//! +//! A SYSTEM service lives in Session 0 and **cannot** capture or inject the +//! interactive desktop directly. Phase 1 therefore enrolls and connects but does +//! **NOT** capture a desktop yet. The following are Phase 2 and are intentionally +//! absent; the seams where they attach are called out inline below: +//! +//! - the **session broker** (`WTSEnumerateSessionsW` / +//! `WTSGetActiveConsoleSessionId` / `WTSQueryUserToken`), +//! - the **per-session capture/input worker** spawned via `CreateProcessAsUserW` +//! into `winsta0\default`, +//! - **service <-> worker IPC** (the per-session ACL'd named pipe), and +//! - **`SERVICE_CONTROL_SESSIONCHANGE`** reaction (logon/logoff/console-connect +//! retarget). +//! +//! Phase 1 registers the control handler for `Stop`/`Shutdown`/`Interrogate` +//! only. When Phase 2 lands, the broker hangs off the same control handler +//! (adding `SESSIONCHANGE`) and off the same agent runtime started here. + +#![cfg(windows)] + +use std::ffi::OsString; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::{Context, Result}; +use tracing::{error, info, warn}; + +use windows_service::{ + define_windows_service, + service::{ + ServiceAccess, ServiceControl, ServiceControlAccept, ServiceErrorControl, ServiceExitCode, + ServiceInfo, ServiceStartType, ServiceState, ServiceStatus, ServiceType, + }, + service_control_handler::{self, ServiceControlHandlerResult}, + service_dispatcher, + service_manager::{ServiceManager, ServiceManagerAccess}, +}; + +/// Internal service name registered with the SCM (no spaces; used by `sc`, +/// `ServiceManager`, and the control handler). +pub const SERVICE_NAME: &str = "GuruConnectAgent"; + +/// Human-facing display name shown in `services.msc`. +pub const SERVICE_DISPLAY_NAME: &str = "GuruConnect Managed Agent"; + +/// Service description shown in `services.msc`. +pub const SERVICE_DESCRIPTION: &str = + "Runs the managed GuruConnect remote-support agent as LocalSystem so it is \ + reachable at the login screen and across reboots (SPEC-018)."; + +/// Hidden subcommand the SCM invokes to enter the service control loop. The +/// service is registered with this as its launch argument (see [`install_service`]), +/// and `main.rs` routes it into [`run_dispatcher`]. +pub const SERVICE_RUN_ARG: &str = "service-run"; + +/// Hint we give the SCM for how long start/stop transitions may take before it +/// should consider the service hung. +const TRANSITION_WAIT: Duration = Duration::from_secs(10); + +// The `windows-service` dispatcher requires a `extern "system"` entry point with +// a fixed ABI; this macro generates `ffi_service_main`, which trampolines into +// our safe `service_main`. +define_windows_service!(ffi_service_main, service_main); + +/// Enter the SCM dispatcher (called from `main.rs` for the `service-run` +/// subcommand). Blocks until the service stops. This must be invoked by the SCM, +/// not interactively — `service_dispatcher::start` fails with +/// `ERROR_FAILED_SERVICE_CONTROLLER_CONNECT` (1063) if there is no controlling +/// SCM, which is the expected outcome of running `guruconnect service-run` by hand. +pub fn run_dispatcher() -> Result<()> { + service_dispatcher::start(SERVICE_NAME, ffi_service_main) + .context("failed to connect to the service control dispatcher (must be started by the SCM)") +} + +/// SCM-invoked service body. Any error is logged; the function cannot return an +/// error to the SCM directly, so [`run_service`] reports a failed exit code on the +/// status handle before returning. +fn service_main(_arguments: Vec) { + if let Err(e) = run_service() { + error!("service exited with error: {e:#}"); + } +} + +/// Drive the full service lifecycle: register the control handler, report +/// `Running`, run the persistent agent until a stop is requested, then report +/// `Stopped`. +fn run_service() -> Result<()> { + info!("GuruConnect managed agent service starting (running as SYSTEM in session 0)"); + + // Cooperative shutdown flag flipped by the SCM control handler and observed by + // the agent runtime. `AtomicBool` keeps the handler closure trivially `Send` + // and avoids holding a lock inside an SCM callback. + let shutdown = Arc::new(AtomicBool::new(false)); + let shutdown_for_handler = shutdown.clone(); + + let event_handler = move |control_event| -> ServiceControlHandlerResult { + match control_event { + // SPEC-018 Phase 1: graceful stop. Phase 2 adds + // `ServiceControl::SessionChange(_)` here to drive the session broker + // (retarget the capture/input worker on logon/logoff/console-connect); + // we intentionally do not accept SESSIONCHANGE yet. + ServiceControl::Stop | ServiceControl::Shutdown => { + info!("received {control_event:?}; signalling agent to shut down"); + shutdown_for_handler.store(true, Ordering::SeqCst); + ServiceControlHandlerResult::NoError + } + ServiceControl::Interrogate => ServiceControlHandlerResult::NoError, + _ => ServiceControlHandlerResult::NotImplemented, + } + }; + + let status_handle = service_control_handler::register(SERVICE_NAME, event_handler) + .context("failed to register the service control handler")?; + + // Report StartPending while we spin up the runtime and connect. + set_status( + &status_handle, + ServiceState::StartPending, + ServiceControlAccept::empty(), + TRANSITION_WAIT, + ); + + // Report Running and accept Stop + Shutdown. We report Running before the + // first connect attempt completes because the agent loop reconnects forever; + // "the service is up and trying" is the correct steady state, and blocking the + // SCM on the first relay handshake would risk a start timeout on a slow boot. + set_status( + &status_handle, + ServiceState::Running, + ServiceControlAccept::STOP | ServiceControlAccept::SHUTDOWN, + Duration::default(), + ); + info!("service reported Running; entering managed-agent control loop"); + + // Run the existing persistent-agent logic as SYSTEM. This is the Phase 1 + // payload: resolve/enroll the cak_ (SPEC-016) and hold the relay connection. + let run_result = crate::run_managed_agent_service(shutdown.clone()); + + if let Err(e) = &run_result { + // The agent loop only returns Err on an unrecoverable LOCAL fault (e.g. no + // usable credential and nothing to enroll with). Network errors are + // retried inside the loop and never surface here. Report the failure to + // the SCM so recovery actions (restart) engage. + error!("managed-agent control loop terminated with error: {e:#}"); + } else { + info!("managed-agent control loop exited cleanly on stop request"); + } + + // Transition StopPending -> Stopped. + set_status( + &status_handle, + ServiceState::StopPending, + ServiceControlAccept::empty(), + TRANSITION_WAIT, + ); + + let exit_code = match run_result { + Ok(()) => ServiceExitCode::Win32(0), + // ERROR_SERVICE_SPECIFIC_ERROR-style: surface a non-zero service-specific + // code so the SCM treats the exit as a failure and applies recovery. + Err(_) => ServiceExitCode::ServiceSpecific(1), + }; + + set_status_with_exit( + &status_handle, + ServiceState::Stopped, + ServiceControlAccept::empty(), + Duration::default(), + exit_code, + ); + info!("service reported Stopped"); + + Ok(()) +} + +/// Report a status with a zero (success) exit code. +fn set_status( + handle: &service_control_handler::ServiceStatusHandle, + state: ServiceState, + accepted: ServiceControlAccept, + wait_hint: Duration, +) { + set_status_with_exit( + handle, + state, + accepted, + wait_hint, + ServiceExitCode::Win32(0), + ); +} + +/// Report a status to the SCM. A failure to report is logged (best-effort) — we +/// cannot do anything actionable about it and must not panic inside the service. +fn set_status_with_exit( + handle: &service_control_handler::ServiceStatusHandle, + state: ServiceState, + accepted: ServiceControlAccept, + wait_hint: Duration, + exit_code: ServiceExitCode, +) { + let status = ServiceStatus { + service_type: ServiceType::OWN_PROCESS, + current_state: state, + controls_accepted: accepted, + exit_code, + checkpoint: 0, + wait_hint, + process_id: None, + }; + if let Err(e) = handle.set_service_status(status) { + warn!("failed to report service status {state:?} to the SCM: {e}"); + } +} + +// --------------------------------------------------------------------------- +// Install / uninstall (used by install.rs for managed mode) +// --------------------------------------------------------------------------- + +/// Install (or reinstall) the managed agent as a LocalSystem auto-start service +/// pointing at `exe_path` with the [`SERVICE_RUN_ARG`] launch argument. +/// +/// Idempotent: if the service already exists it is stopped and deleted first, +/// then recreated, so an upgrade picks up a new binary path / config. Configures +/// crash recovery (restart on failure) via `sc failure`. +/// +/// Requires Administrator (SCM `CREATE_SERVICE`). Returns an error otherwise. +pub fn install_service(exe_path: &std::path::Path) -> Result<()> { + let manager = ServiceManager::local_computer( + None::<&str>, + ServiceManagerAccess::CONNECT | ServiceManagerAccess::CREATE_SERVICE, + ) + .context("failed to connect to the Service Control Manager (run as Administrator)")?; + + // Remove any prior installation so the binary path / args are refreshed. + if let Ok(existing) = manager.open_service( + SERVICE_NAME, + ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE, + ) { + info!("existing {SERVICE_NAME} service found; removing before reinstall"); + stop_if_running(&existing); + existing + .delete() + .context("failed to delete the existing service before reinstall")?; + drop(existing); + // The SCM marks a service for deletion but only removes it once all handles + // close; a brief settle avoids a CreateService "marked for deletion" race. + std::thread::sleep(Duration::from_secs(2)); + } + + let service_info = ServiceInfo { + name: OsString::from(SERVICE_NAME), + display_name: OsString::from(SERVICE_DISPLAY_NAME), + service_type: ServiceType::OWN_PROCESS, + start_type: ServiceStartType::AutoStart, + error_control: ServiceErrorControl::Normal, + executable_path: exe_path.to_path_buf(), + launch_arguments: vec![OsString::from(SERVICE_RUN_ARG)], + dependencies: vec![], + // account_name: None => LocalSystem (the SPEC-018 requirement). + account_name: None, + account_password: None, + }; + + let service = manager + .create_service(&service_info, ServiceAccess::CHANGE_CONFIG) + .context("failed to create the GuruConnect managed agent service")?; + + service + .set_description(SERVICE_DESCRIPTION) + .context("failed to set the service description")?; + + configure_recovery(); + + info!( + "installed {SERVICE_NAME} (LocalSystem, auto-start) -> {} {}", + exe_path.display(), + SERVICE_RUN_ARG + ); + Ok(()) +} + +/// Configure SCM crash-recovery so the service restarts on unexpected exit. +/// +/// `windows-service` 0.7 does not expose `ChangeServiceConfig2` recovery actions +/// in a stable, ergonomic form, so we mirror the established pattern used by the +/// SAS service binary and shell out to `sc failure`. `reset=86400` clears the +/// failure count after a day; three `restart/5000` actions retry after 5s each. +fn configure_recovery() { + use std::os::windows::process::CommandExt; + const CREATE_NO_WINDOW: u32 = 0x0800_0000; + + match std::process::Command::new("sc") + .args([ + "failure", + SERVICE_NAME, + "reset=86400", + "actions=restart/5000/restart/5000/restart/5000", + ]) + .creation_flags(CREATE_NO_WINDOW) + .output() + { + Ok(out) if out.status.success() => { + info!("configured crash-recovery (restart) for {SERVICE_NAME}"); + } + Ok(out) => { + warn!( + "could not configure crash-recovery for {SERVICE_NAME} (sc failure exit {:?}); \ + the service will still run but will not auto-restart on crash", + out.status.code() + ); + } + Err(e) => { + warn!("could not invoke `sc failure` to set crash-recovery for {SERVICE_NAME}: {e}"); + } + } +} + +/// Stop (if running) and delete the managed agent service. Idempotent: succeeds +/// quietly if the service is not installed. +pub fn uninstall_service() -> Result<()> { + let manager = ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT) + .context("failed to connect to the Service Control Manager (run as Administrator)")?; + + match manager.open_service( + SERVICE_NAME, + ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE, + ) { + Ok(service) => { + stop_if_running(&service); + service + .delete() + .context("failed to delete the managed agent service")?; + info!("uninstalled {SERVICE_NAME} service"); + Ok(()) + } + Err(_) => { + // Not installed — nothing to do (idempotent uninstall). + info!("{SERVICE_NAME} service is not installed; nothing to uninstall"); + Ok(()) + } + } +} + +/// Start the managed agent service now (used right after a first-run install so +/// the agent comes up without waiting for the next boot). Best-effort: logs and +/// returns the SCM error if the start fails, but a failure is not fatal to install +/// because the service is auto-start and will come up on the next boot regardless. +pub fn start_service() -> Result<()> { + let manager = ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT) + .context("failed to connect to the Service Control Manager")?; + let service = manager + .open_service( + SERVICE_NAME, + ServiceAccess::START | ServiceAccess::QUERY_STATUS, + ) + .context("failed to open the managed agent service to start it")?; + + // If it is already running (e.g. reinstall-over-running), there is nothing to do. + if let Ok(status) = service.query_status() { + if status.current_state == ServiceState::Running + || status.current_state == ServiceState::StartPending + { + info!("{SERVICE_NAME} is already running/starting"); + return Ok(()); + } + } + + service + .start::(&[]) + .context("failed to start the managed agent service")?; + info!("started {SERVICE_NAME}"); + Ok(()) +} + +/// Report whether the managed agent service is currently installed. +pub fn is_service_installed() -> bool { + match ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT) { + Ok(manager) => manager + .open_service(SERVICE_NAME, ServiceAccess::QUERY_STATUS) + .is_ok(), + Err(_) => false, + } +} + +/// Best-effort stop of a service, waiting briefly for it to leave the running +/// state so a subsequent `delete` does not race an in-flight stop. +fn stop_if_running(service: &windows_service::service::Service) { + if let Ok(status) = service.query_status() { + if status.current_state != ServiceState::Stopped { + info!("stopping {SERVICE_NAME} before delete"); + let _ = service.stop(); + for _ in 0..10 { + std::thread::sleep(Duration::from_millis(500)); + match service.query_status() { + Ok(s) if s.current_state == ServiceState::Stopped => break, + _ => continue, + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// The launch argument the service is registered with MUST equal the hidden + /// `service-run` subcommand `main.rs` dispatches into [`run_dispatcher`]; a + /// mismatch would register a service the SCM could start but that would fall + /// through to normal (non-service) mode and immediately exit. + #[test] + fn service_run_arg_matches_subcommand_name() { + assert_eq!(SERVICE_RUN_ARG, "service-run"); + } + + /// Service identifiers are non-empty and the internal name carries no spaces + /// (the SCM key / `sc` argument must be a single token). + #[test] + fn service_identifiers_are_well_formed() { + assert!(!SERVICE_NAME.is_empty()); + assert!( + !SERVICE_NAME.contains(char::is_whitespace), + "the SCM service name must be a single whitespace-free token" + ); + assert!(!SERVICE_DISPLAY_NAME.is_empty()); + assert!(!SERVICE_DESCRIPTION.is_empty()); + } + + /// `is_service_installed` must never panic regardless of elevation/SCM access; + /// on a dev workstation without the service installed it returns `false`. (We + /// do NOT install the service in tests — that is a VM/admin integration step.) + #[test] + fn is_service_installed_is_total() { + let _ = is_service_installed(); + } +} From a0e0d5f1e7f283590ba21ab46bc1c133cc016af2 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Tue, 2 Jun 2026 13:57:41 -0700 Subject: [PATCH 2/2] fix(agent): SPEC-018 Phase 1 review fixes (cancellable session loop, panic guard, service-create retry) H: thread the SCM cooperative-stop flag into the connected session loop (run_with_tray) via a new Option<&Arc> param. The flag was only observed by the outer run_agent reconnect loop, which never runs while a session is connected, so an SCM Stop/Shutdown left the service Running until force-kill. The inner loop now checks it each tick, closes the WS cleanly, and returns the SERVICE_STOP sentinel that the outer loop maps to a graceful stop. The new param is optional: attended/viewer/interactive callers pass None and behave exactly as before. M: wrap the managed-agent runtime block_on in catch_unwind(AssertUnwindSafe) so a panic in the agent future cannot unwind across the extern "system" service entry (UB/abort). A caught panic becomes an Err -> ServiceExitCode::ServiceSpecific(1) so SCM recovery engages cleanly. L1: replace the fixed 2s sleep after delete() on reinstall with a bounded retry on CreateService returning ERROR_SERVICE_MARKED_FOR_DELETE (1072), gated on having actually deleted a prior instance. L2: clarify the --elevated -> force_user_install mapping (comment only). N1: add a clap-metadata test pinning the service-run subcommand name to SERVICE_RUN_ARG, cross-linked from the existing literal test. N2: correct the service doc comments now that graceful stop interrupts the connected case too. Verified on Windows host: cargo fmt --check, clippy -D warnings, release build (x86_64-pc-windows-msvc), and cargo test (58 passed) all green. Co-Authored-By: Claude Opus 4.8 (1M context) --- agent/src/main.rs | 103 +++++++++++++++++++++++++++++++++++---- agent/src/service/mod.rs | 75 +++++++++++++++++++++++++--- agent/src/session/mod.rs | 97 +++++++++++++++++++++++++++++++++++- 3 files changed, 258 insertions(+), 17 deletions(-) diff --git a/agent/src/main.rs b/agent/src/main.rs index d2d57b3..9df5249 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -234,7 +234,24 @@ fn main() -> Result<()> { Some(Commands::Install { user_only, elevated, - }) => run_install(user_only || elevated), + }) => { + // `run_install`'s parameter is `force_user_install` — when true it + // skips the UAC re-elevation attempt and installs in-place with + // whatever rights this process already has. + // + // - `user_only`: the user explicitly asked for a per-user install; + // honour it directly. + // - `elevated`: this is the internal, already-elevated re-exec spawned + // by `try_elevate_and_install` ("install --elevated"). It must NOT + // attempt to elevate AGAIN (that would loop / re-prompt), so we pass + // force=true here too. This is correct even though it routes through + // the "user install" parameter, because the re-exec genuinely runs + // elevated: `is_elevated()` returns true inside `install()`, so the + // path resolves to Program Files and the LocalSystem service installs + // normally. The flag only suppresses re-elevation; it does not force a + // per-user (non-elevated) install when we are already elevated. + run_install(user_only || elevated) + } Some(Commands::Uninstall) => run_uninstall(), Some(Commands::Launch { url }) => run_launch(&url), Some(Commands::VersionInfo) => { @@ -413,14 +430,40 @@ pub fn run_managed_agent_service( } let rt = tokio::runtime::Runtime::new()?; - rt.block_on(async move { - // SPEC-016 Phase B: resolve the operating credential before connecting. - // Running as SYSTEM, the SYSTEM+Administrators-ACL'd cak_ store is now - // readable in-context, so the Phase B fail-fast guard is not hit on this - // path (it remains as a safety net for any non-SYSTEM invocation). - resolve_agent_credential(&mut config).await?; - run_agent(config, Some(shutdown)).await - }) + + // SPEC-018 (finding M): this future runs across the `extern "system"` service + // entry point (ffi_service_main -> service_main -> run_service -> here). A + // panic that unwound across that FFI boundary is undefined behaviour (the C + // ABI cannot carry a Rust unwind) and would abort the process instead of + // taking the intended ServiceSpecific(1) fault path. Catch it here and convert + // it into an `Err`, which `run_service` maps to ServiceExitCode::ServiceSpecific(1) + // so the SCM applies its configured recovery (restart) cleanly. `Running` is + // already reported before we get here, so a fault does not strand StartPending. + let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + rt.block_on(async move { + // SPEC-016 Phase B: resolve the operating credential before connecting. + // Running as SYSTEM, the SYSTEM+Administrators-ACL'd cak_ store is now + // readable in-context, so the Phase B fail-fast guard is not hit on this + // path (it remains as a safety net for any non-SYSTEM invocation). + resolve_agent_credential(&mut config).await?; + run_agent(config, Some(shutdown)).await + }) + })); + + match outcome { + Ok(result) => result, + Err(panic) => { + // Recover a human-readable message from the panic payload for the log; + // do not re-panic (that would unwind across the FFI boundary again). + let detail = panic + .downcast_ref::<&str>() + .map(|s| s.to_string()) + .or_else(|| panic.downcast_ref::().cloned()) + .unwrap_or_else(|| "non-string panic payload".to_string()); + error!("managed-agent runtime panicked: {detail}"); + Err(anyhow::anyhow!("managed-agent runtime panicked: {detail}")) + } + } } /// SPEC-018 Phase 1: handle an interactive launch of a MANAGED agent binary (one @@ -812,11 +855,22 @@ async fn run_agent( } if let Err(e) = session - .run_with_tray(tray.as_ref(), chat_ctrl.as_ref()) + .run_with_tray(tray.as_ref(), chat_ctrl.as_ref(), service_shutdown.as_ref()) .await { let error_msg = e.to_string(); + // SPEC-018 (finding H): the connected session loop broke + // because the SCM asked the service to stop. The loop already + // closed the WebSocket cleanly; treat this as a graceful stop + // (no reconnect) so the service transitions StopPending -> + // Stopped. Only the service path can produce this (it is the + // only caller that passes a shutdown flag). + if error_msg.contains(session::SERVICE_STOP_SENTINEL) { + info!("Service stop requested during session; exiting agent loop"); + return Ok(()); + } + if error_msg.contains("USER_EXIT") { info!("Session ended by user"); cleanup_on_exit(); @@ -904,3 +958,32 @@ async fn run_agent( } } } + +#[cfg(test)] +mod tests { + use super::*; + use clap::CommandFactory; + + /// SPEC-018 finding N1: pin the clap subcommand name to the constant the SCM + /// is registered with. The service is installed with `SERVICE_RUN_ARG` as its + /// launch argument; when the SCM starts it, clap must route that exact token + /// into [`Commands::ServiceRun`]. If the `#[command(name = "service-run")]` + /// attribute and the constant ever drift apart, the SCM would start the binary + /// but clap would fail to match the subcommand and the process would fall + /// through to default (non-service) mode and exit. Asserting against the live + /// clap metadata (not a second string literal) makes that drift impossible. + #[test] + #[cfg(windows)] + fn service_run_subcommand_matches_scm_launch_arg() { + let cmd = Cli::command(); + let has_matching_subcommand = cmd + .get_subcommands() + .any(|sc| sc.get_name() == service::SERVICE_RUN_ARG); + assert!( + has_matching_subcommand, + "no clap subcommand named '{}' (the SCM launch arg); the ServiceRun \ + #[command(name = ...)] attribute drifted from service::SERVICE_RUN_ARG", + service::SERVICE_RUN_ARG + ); + } +} diff --git a/agent/src/service/mod.rs b/agent/src/service/mod.rs index 1ed76f3..e3f99b2 100644 --- a/agent/src/service/mod.rs +++ b/agent/src/service/mod.rs @@ -12,7 +12,11 @@ //! relay WSS connection. //! 2. Report a correct service lifecycle to the SCM (`StartPending` -> //! `Running` -> `StopPending` -> `Stopped`) and handle `Stop`/`Shutdown` -//! gracefully (signal the agent loop to close the WS connection and exit). +//! gracefully. The control handler sets a shared shutdown flag; the agent +//! runtime observes it both between reconnect attempts AND inside the +//! connected session loop (SPEC-018 finding H), so a stop received while a +//! session is live breaks out promptly, closes the WS connection cleanly, +//! and exits — rather than waiting for the SCM to force-kill. //! 3. Provide install/uninstall of the service (LocalSystem, auto-start, crash //! recovery) so managed mode uses the service as its single autostart //! instead of the per-user `HKCU\…\Run` entry. @@ -122,6 +126,11 @@ fn run_service() -> Result<()> { // we intentionally do not accept SESSIONCHANGE yet. ServiceControl::Stop | ServiceControl::Shutdown => { info!("received {control_event:?}; signalling agent to shut down"); + // Set the cooperative-stop flag. The agent runtime observes it on + // every idle tick of the connected session loop and between + // reconnect attempts (SPEC-018 finding H), so it breaks out and + // closes the WebSocket cleanly within ~100ms even if a session is + // currently connected. shutdown_for_handler.store(true, Ordering::SeqCst); ServiceControlHandlerResult::NoError } @@ -253,6 +262,7 @@ pub fn install_service(exe_path: &std::path::Path) -> Result<()> { .context("failed to connect to the Service Control Manager (run as Administrator)")?; // Remove any prior installation so the binary path / args are refreshed. + let mut deleted_existing = false; if let Ok(existing) = manager.open_service( SERVICE_NAME, ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE, @@ -263,9 +273,7 @@ pub fn install_service(exe_path: &std::path::Path) -> Result<()> { .delete() .context("failed to delete the existing service before reinstall")?; drop(existing); - // The SCM marks a service for deletion but only removes it once all handles - // close; a brief settle avoids a CreateService "marked for deletion" race. - std::thread::sleep(Duration::from_secs(2)); + deleted_existing = true; } let service_info = ServiceInfo { @@ -282,8 +290,7 @@ pub fn install_service(exe_path: &std::path::Path) -> Result<()> { account_password: None, }; - let service = manager - .create_service(&service_info, ServiceAccess::CHANGE_CONFIG) + let service = create_service_with_retry(&manager, &service_info, deleted_existing) .context("failed to create the GuruConnect managed agent service")?; service @@ -300,6 +307,56 @@ pub fn install_service(exe_path: &std::path::Path) -> Result<()> { Ok(()) } +/// Create the service, retrying briefly if the SCM still has the prior instance +/// "marked for deletion" (SPEC-018 finding L1). +/// +/// When a service is deleted, the SCM only removes it from its database once every +/// open handle to it closes; until then a fresh `CreateService` fails with +/// `ERROR_SERVICE_MARKED_FOR_DELETE` (1072). The previous implementation papered +/// over this with a fixed 2s sleep after `delete()`, which is both slower than +/// necessary in the common case and still racy on a busy box. Instead we attempt +/// the create immediately and, only if we just deleted an existing instance and +/// hit 1072, retry a few times with short backoff — succeeding as soon as the SCM +/// finishes the removal, and giving up with the real error if it never does. +/// +/// The retry is gated on `deleted_existing`: on a clean first install there was no +/// prior instance, so a 1072 there is unexpected and is surfaced immediately +/// rather than masked by retries. +fn create_service_with_retry( + manager: &ServiceManager, + service_info: &ServiceInfo, + deleted_existing: bool, +) -> Result { + // ERROR_SERVICE_MARKED_FOR_DELETE (winerror.h). The service is gone from the + // caller's perspective but the SCM has not finished reaping it. + const ERROR_SERVICE_MARKED_FOR_DELETE: i32 = 1072; + // Bounded: ~5 attempts over ~2s total worst case (matches the old fixed sleep + // ceiling) but returns the instant the SCM is ready. + const MAX_ATTEMPTS: u32 = 5; + const BACKOFF: Duration = Duration::from_millis(400); + + let mut attempt = 0; + loop { + attempt += 1; + match manager.create_service(service_info, ServiceAccess::CHANGE_CONFIG) { + Ok(service) => return Ok(service), + Err(windows_service::Error::Winapi(ref io_err)) + if deleted_existing + && io_err.raw_os_error() == Some(ERROR_SERVICE_MARKED_FOR_DELETE) + && attempt < MAX_ATTEMPTS => + { + warn!( + "{SERVICE_NAME} still marked for deletion by the SCM \ + (attempt {attempt}/{MAX_ATTEMPTS}); retrying in {}ms", + BACKOFF.as_millis() + ); + std::thread::sleep(BACKOFF); + } + Err(e) => return Err(e), + } + } +} + /// Configure SCM crash-recovery so the service restarts on unexpected exit. /// /// `windows-service` 0.7 does not expose `ChangeServiceConfig2` recovery actions @@ -429,6 +486,12 @@ mod tests { /// `service-run` subcommand `main.rs` dispatches into [`run_dispatcher`]; a /// mismatch would register a service the SCM could start but that would fall /// through to normal (non-service) mode and immediately exit. + /// + /// This pins the value of the constant itself. The companion test + /// `tests::service_run_subcommand_matches_scm_launch_arg` in `main.rs` pins the + /// other half — that the clap `#[command(name = "service-run")]` attribute on + /// `Commands::ServiceRun` resolves to this same constant — so the two string + /// literals cannot silently drift apart. #[test] fn service_run_arg_matches_subcommand_name() { assert_eq!(SERVICE_RUN_ARG, "service-run"); diff --git a/agent/src/session/mod.rs b/agent/src/session/mod.rs index 5e40185..16cea1f 100644 --- a/agent/src/session/mod.rs +++ b/agent/src/session/mod.rs @@ -41,8 +41,18 @@ use crate::proto::{message, AgentStatus, ChatMessage, Heartbeat, HeartbeatAck, M use crate::transport::WebSocketTransport; use crate::tray::{TrayAction, TrayController}; use anyhow::Result; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; use std::time::{Duration, Instant}; +/// Sentinel error string returned by [`SessionManager::run_with_tray`] when the +/// loop breaks because the SCM asked the managed-agent service to stop (SPEC-018, +/// finding H). The outer `run_agent` loop matches on this to treat the exit as a +/// graceful service stop (clean WS close, no reconnect) rather than a session +/// error. Only the service path passes a shutdown flag, so only the service path +/// can ever produce this. +pub const SERVICE_STOP_SENTINEL: &str = "SERVICE_STOP"; + // Heartbeat interval (30 seconds) const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(30); // Status report interval (60 seconds) @@ -285,16 +295,34 @@ impl SessionManager { Ok(()) } - /// Run the session main loop with tray and chat event processing + /// Run the session main loop with tray and chat event processing. + /// + /// `service_shutdown` (SPEC-018 finding H) is the SCM cooperative-stop flag. + /// It is `Some(flag)` ONLY on the managed-agent service path; the + /// attended/viewer/interactive callers pass `None` and behave EXACTLY as + /// before. When present, the flag is polled on every idle tick (the natural + /// ~100ms seam below) so an SCM Stop/Shutdown received while CONNECTED breaks + /// this inner loop promptly — instead of only being observed by the outer + /// `run_agent` reconnect loop, which never runs while a session is connected. + /// On a set flag the loop closes the WebSocket cleanly (via the shared exit + /// path at the bottom) and returns the [`SERVICE_STOP_SENTINEL`] error, which + /// the outer loop maps to a graceful stop. pub async fn run_with_tray( &mut self, tray: Option<&TrayController>, chat: Option<&ChatController>, + service_shutdown: Option<&Arc>, ) -> Result<()> { if self.transport.is_none() { anyhow::bail!("Not connected"); } + // Helper: has the SCM asked the service to stop? Always false off the + // service path (where `service_shutdown` is `None`). + let stop_requested = |flag: Option<&Arc>| -> bool { + flag.is_some_and(|f| f.load(Ordering::SeqCst)) + }; + // Send initial status self.send_status().await?; @@ -307,6 +335,29 @@ impl SessionManager { // Main loop loop { + // SPEC-018 (finding H): honour an SCM stop request received while the + // session is CONNECTED. The outer `run_agent` loop only observes the + // flag between connection attempts, but a managed agent spends its + // entire connected life inside THIS loop — so without this check an + // SCM Stop while connected would not break out until the connection + // dropped on its own. Breaking here falls through to the shared exit + // path below, which closes the transport cleanly (clean WS close); + // the sentinel tells the outer loop this was a graceful stop. + if stop_requested(service_shutdown) { + tracing::info!("Service stop requested; ending connected session loop"); + self.release_streaming(); + self.state = SessionState::Disconnected; + if let Some(transport) = self.transport.as_mut() { + // Best-effort clean WebSocket close (sends a Close frame). A + // failure here just means the peer/socket is already gone; the + // service still stops cleanly. + if let Err(e) = transport.close().await { + tracing::warn!("error during clean WebSocket close on service stop: {}", e); + } + } + return Err(anyhow::anyhow!(SERVICE_STOP_SENTINEL)); + } + // Process tray events if let Some(t) = tray { if let Some(action) = t.process_events() { @@ -745,3 +796,47 @@ impl SessionManager { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + + /// SPEC-018 finding H: the connected-stop contract. When the SCM sets the + /// shutdown flag, `run_with_tray` returns an error whose message contains + /// [`SERVICE_STOP_SENTINEL`]; the outer `run_agent` loop recognises a graceful + /// stop with `error_msg.contains(SERVICE_STOP_SENTINEL)`. This pins that the + /// error the loop constructs on stop actually satisfies that match — so the + /// two halves (producer here, consumer in `main.rs`) cannot drift. + /// + /// A full end-to-end test of the in-loop interrupt would need a live connected + /// transport (a real or mocked server), which is an integration concern; this + /// unit test instead pins the wire contract the interrupt relies on. + #[test] + fn service_stop_sentinel_is_matched_by_outer_loop_check() { + let produced = anyhow::anyhow!(SERVICE_STOP_SENTINEL); + assert!( + produced.to_string().contains(SERVICE_STOP_SENTINEL), + "the stop error must contain the sentinel the outer loop matches on" + ); + assert!( + !SERVICE_STOP_SENTINEL.is_empty(), + "the sentinel must be a non-empty, distinctive token" + ); + } + + /// The shutdown-flag check is a no-op (always `false`) when no flag is passed, + /// i.e. on the attended/viewer/interactive paths — guaranteeing the new + /// parameter is a pure addition that cannot alter non-service behaviour + /// (SPEC-018 finding H: "no regression"). + #[test] + fn no_shutdown_flag_never_requests_stop() { + let none: Option<&Arc> = None; + let check = |flag: Option<&Arc>| flag.is_some_and(|f| f.load(Ordering::SeqCst)); + assert!(!check(none)); + + let set = Arc::new(AtomicBool::new(true)); + assert!(check(Some(&set))); + let unset = Arc::new(AtomicBool::new(false)); + assert!(!check(Some(&unset))); + } +}