diff --git a/agent/src/install.rs b/agent/src/install.rs index 82284e6..77a8e29 100644 --- a/agent/src/install.rs +++ b/agent/src/install.rs @@ -290,6 +290,18 @@ pub fn install(force_user_install: bool) -> Result<()> { // Register protocol handler register_protocol_handler(elevated)?; + // SPEC-018: a MANAGED install (embedded config => persistent agent) installs + // the LocalSystem service as its single autostart and removes the per-user + // HKCU\…\Run entry. Attended (support-code) and viewer installs are untouched: + // they have no embedded config and continue to use the HKCU Run / protocol + // handler paths exactly as before. + #[cfg(windows)] + { + if crate::config::Config::has_embedded_config() { + install_managed_service(&exe_path)?; + } + } + info!("Installation complete!"); if elevated { info!("Installed system-wide to: {}", install_path.display()); @@ -300,6 +312,64 @@ pub fn install(force_user_install: bool) -> Result<()> { Ok(()) } +/// SPEC-018: install the managed agent as a LocalSystem service and swap out the +/// legacy per-user `HKCU\…\Run` autostart so the service is the single managed +/// autostart (no double-run). +/// +/// Installing a LocalSystem service requires Administrator. If the SCM rejects the +/// create (not elevated), we surface the error rather than silently leaving the +/// machine with no managed autostart — a managed deployment is expected to run the +/// install elevated. The HKCU Run entry is removed best-effort regardless. +#[cfg(windows)] +pub fn install_managed_service(exe_path: &std::path::Path) -> Result<()> { + info!("Managed install: registering LocalSystem service (SPEC-018)"); + + crate::service::install_service(exe_path) + .map_err(|e| anyhow!("failed to install the managed agent service: {e:#}"))?; + + // Start the service now so the agent comes up immediately on first install + // rather than only on the next boot. Best-effort: the service is auto-start, so + // a transient start failure still self-heals on reboot. + if let Err(e) = crate::service::start_service() { + warn!( + "managed service installed but did not start now ({e:#}); \ + it is auto-start and will run on next boot" + ); + } + + // Remove the legacy per-user autostart so the agent does not also launch in the + // user's session (which would double-run alongside the service). + if let Err(e) = crate::startup::remove_from_startup() { + warn!( + "managed service installed, but failed to remove the legacy HKCU Run \ + autostart (harmless if it was never present): {}", + e + ); + } else { + info!("removed legacy HKCU Run autostart (service is now the managed autostart)"); + } + + Ok(()) +} + +/// SPEC-018: remove the managed agent service and any legacy HKCU Run autostart. +/// Idempotent — succeeds if neither is present. +#[cfg(windows)] +pub fn uninstall_managed_service() -> Result<()> { + info!("Managed uninstall: removing LocalSystem service (SPEC-018)"); + + // Best-effort removal of the legacy autostart first (cheap, no SCM). + if let Err(e) = crate::startup::remove_from_startup() { + warn!( + "failed to remove legacy HKCU Run autostart during uninstall: {}", + e + ); + } + + crate::service::uninstall_service() + .map_err(|e| anyhow!("failed to uninstall the managed agent service: {e:#}")) +} + /// Check if the guruconnect:// protocol handler is registered #[cfg(windows)] pub fn is_protocol_handler_registered() -> bool { diff --git a/agent/src/main.rs b/agent/src/main.rs index 5c2fb55..9df5249 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -24,6 +24,8 @@ mod identity; mod input; mod install; mod sas_client; +#[cfg(windows)] +mod service; mod session; mod startup; mod transport; @@ -182,6 +184,12 @@ enum Commands { /// Show detailed version and build information #[command(name = "version-info")] VersionInfo, + + /// Internal: entry point invoked by the Windows Service Control Manager to run + /// the managed agent as a LocalSystem service (SPEC-018). Not for interactive + /// use — running it by hand fails because there is no controlling SCM. + #[command(name = "service-run", hide = true)] + ServiceRun, } fn main() -> Result<()> { @@ -226,7 +234,24 @@ fn main() -> Result<()> { Some(Commands::Install { user_only, elevated, - }) => run_install(user_only || elevated), + }) => { + // `run_install`'s parameter is `force_user_install` — when true it + // skips the UAC re-elevation attempt and installs in-place with + // whatever rights this process already has. + // + // - `user_only`: the user explicitly asked for a per-user install; + // honour it directly. + // - `elevated`: this is the internal, already-elevated re-exec spawned + // by `try_elevate_and_install` ("install --elevated"). It must NOT + // attempt to elevate AGAIN (that would loop / re-prompt), so we pass + // force=true here too. This is correct even though it routes through + // the "user install" parameter, because the re-exec genuinely runs + // elevated: `is_elevated()` returns true inside `install()`, so the + // path resolves to Program Files and the LocalSystem service installs + // normally. The flag only suppresses re-elevation; it does not force a + // per-user (non-elevated) install when we are already elevated. + run_install(user_only || elevated) + } Some(Commands::Uninstall) => run_uninstall(), Some(Commands::Launch { url }) => run_launch(&url), Some(Commands::VersionInfo) => { @@ -236,6 +261,21 @@ fn main() -> Result<()> { println!("{}", build_info::full_version()); Ok(()) } + Some(Commands::ServiceRun) => { + // SPEC-018 Phase 1: SCM-invoked entry. Hand off to the service + // dispatcher, which calls back into the control loop and runs the + // managed-agent logic as SYSTEM. Blocks until the service stops. + #[cfg(windows)] + { + service::run_dispatcher() + } + #[cfg(not(windows))] + { + Err(anyhow::anyhow!( + "service-run is a Windows-only entry point (SPEC-018)" + )) + } + } None => { // No subcommand - detect mode from filename or embedded config // Legacy: if support_code arg provided, use that @@ -264,16 +304,31 @@ fn main() -> Result<()> { run_agent_mode(Some(code)) } RunMode::PermanentAgent => { - // Embedded config found - run as permanent agent + // Embedded config found - managed/persistent agent. info!("Permanent agent mode detected (embedded config)"); - if !install::is_protocol_handler_registered() { - // First run - install then run as agent - info!("First run - installing agent"); - if let Err(e) = install::install(false) { - warn!("Installation failed: {}", e); - } + + // SPEC-018: managed mode runs as the LocalSystem service, not as + // an interactive process. The service is the single autostart. + // - If the service is already installed, the service is (or + // will be) running the agent — this interactive invocation + // must NOT spawn a second agent. Exit quietly. + // - On first run, install (which installs + starts the service + // and removes the legacy HKCU Run entry), then exit and let + // the service carry the agent as SYSTEM. + #[cfg(windows)] + { + run_permanent_agent_managed() + } + #[cfg(not(windows))] + { + if !install::is_protocol_handler_registered() { + info!("First run - installing agent"); + if let Err(e) = install::install(false) { + warn!("Installation failed: {}", e); + } + } + run_agent_mode(None) } - run_agent_mode(None) } RunMode::Default => { // No special mode detected - use legacy logic @@ -333,10 +388,118 @@ fn run_agent_mode(support_code: Option) -> Result<()> { if config.support_code.is_none() { resolve_agent_credential(&mut config).await?; } - run_agent(config).await + run_agent(config, None).await }) } +/// SPEC-018 Phase 1: run the managed/persistent agent as the LocalSystem service. +/// +/// Invoked from the service control loop ([`service::run_service`]) once the +/// service has reported `Running`. This is the same persistent-agent logic as +/// [`run_agent_mode`] (load config, resolve/enroll the per-machine `cak_` per +/// SPEC-016, hold the relay connection) — but it runs **as SYSTEM**, so the +/// SYSTEM-ACL'd `cak_` store is finally readable in-context, and it observes the +/// SCM `shutdown` flag for a graceful stop. +/// +/// Returns `Ok(())` when the agent loop exits because a stop was requested, and +/// `Err` only on an unrecoverable *local* fault (e.g. no usable credential and no +/// enrollment material) — network errors are retried inside the loop and never +/// surface here. +/// +/// Phase 2 seam: this is where the session broker is wired in — the runtime +/// started here will own the broker that spawns the per-session capture/input +/// worker (`CreateProcessAsUserW`) and the IPC server. Phase 1 connects/enrolls +/// only; it does not capture a desktop (a Session-0 SYSTEM process cannot). +#[cfg(windows)] +pub fn run_managed_agent_service( + shutdown: std::sync::Arc, +) -> Result<()> { + info!("Loading managed-agent configuration (running as SYSTEM)"); + + let mut config = config::Config::load()?; + // The service ONLY ever runs the managed/persistent path. A support session is + // an interactive, user-launched flow and must never be carried by the service. + config.support_code = None; + + info!("Server: {}", config.server_url); + if let Some(ref company) = config.company { + info!("Company: {}", company); + } + if let Some(ref site) = config.site { + info!("Site: {}", site); + } + + let rt = tokio::runtime::Runtime::new()?; + + // SPEC-018 (finding M): this future runs across the `extern "system"` service + // entry point (ffi_service_main -> service_main -> run_service -> here). A + // panic that unwound across that FFI boundary is undefined behaviour (the C + // ABI cannot carry a Rust unwind) and would abort the process instead of + // taking the intended ServiceSpecific(1) fault path. Catch it here and convert + // it into an `Err`, which `run_service` maps to ServiceExitCode::ServiceSpecific(1) + // so the SCM applies its configured recovery (restart) cleanly. `Running` is + // already reported before we get here, so a fault does not strand StartPending. + let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + rt.block_on(async move { + // SPEC-016 Phase B: resolve the operating credential before connecting. + // Running as SYSTEM, the SYSTEM+Administrators-ACL'd cak_ store is now + // readable in-context, so the Phase B fail-fast guard is not hit on this + // path (it remains as a safety net for any non-SYSTEM invocation). + resolve_agent_credential(&mut config).await?; + run_agent(config, Some(shutdown)).await + }) + })); + + match outcome { + Ok(result) => result, + Err(panic) => { + // Recover a human-readable message from the panic payload for the log; + // do not re-panic (that would unwind across the FFI boundary again). + let detail = panic + .downcast_ref::<&str>() + .map(|s| s.to_string()) + .or_else(|| panic.downcast_ref::().cloned()) + .unwrap_or_else(|| "non-string panic payload".to_string()); + error!("managed-agent runtime panicked: {detail}"); + Err(anyhow::anyhow!("managed-agent runtime panicked: {detail}")) + } + } +} + +/// SPEC-018 Phase 1: handle an interactive launch of a MANAGED agent binary (one +/// carrying embedded config, detected as [`config::RunMode::PermanentAgent`]). +/// +/// Managed mode runs as the LocalSystem service, never as an interactive process: +/// - If the service is already installed, the service is (or will be) running +/// the agent as SYSTEM, so this interactive invocation must NOT spawn a second +/// agent — it exits quietly. +/// - On first run, install (which installs + starts the service and removes the +/// legacy `HKCU\…\Run` autostart), then exit and let the service carry the +/// agent. If the service install fails (e.g. not elevated), fall back to +/// running the agent in-process for this run so the machine is not left with no +/// agent at all. +#[cfg(windows)] +fn run_permanent_agent_managed() -> Result<()> { + if service::is_service_installed() { + info!( + "Managed service already installed; the service runs the agent as SYSTEM — \ + this interactive instance has nothing to do" + ); + return Ok(()); + } + + info!("First run - installing managed agent service"); + if let Err(e) = install::install(false) { + warn!( + "Managed service install failed ({e:#}); falling back to in-process agent for this run" + ); + return run_agent_mode(None); + } + + info!("Managed agent service installed; handing off to the service"); + Ok(()) +} + /// Resolve the per-machine operating credential for a managed agent (SPEC-016 /// Phase B, run-mode wiring). /// @@ -372,9 +535,13 @@ async fn resolve_agent_credential(config: &mut config::Config) -> Result<()> { // do NOT silently re-enroll. The SYSTEM+Administrators ACL is correct // for the target (Option A) and is deliberately kept. // - // NOTE: this guard is satisfied/removed once the GuruConnect SYSTEM - // service host lands (separate spec, SPEC-018) and the agent always - // runs as SYSTEM — at which point the store is always readable. + // SPEC-018 (this spec): the managed agent now runs as the GuruConnect + // SYSTEM service ([`run_managed_agent_service`]), so on the production + // managed path the store IS readable in-context and this branch is NOT + // hit. The guard is intentionally retained as a harmless safety net for + // any non-SYSTEM invocation (e.g. someone running the managed binary + // interactively): it still fails fast with an actionable message rather + // than bricking. Do NOT remove it in Phase 1. Err(LoadCakError::Io { permission_denied: true, source, @@ -484,7 +651,22 @@ fn run_install(force_user_install: bool) -> Result<()> { fn run_uninstall() -> Result<()> { info!("Uninstalling GuruConnect..."); - // Remove from startup + // SPEC-018: remove the managed LocalSystem service and the legacy HKCU Run + // autostart. Idempotent — no error if the service was never installed (an + // attended/viewer install has no service), so this is safe for every install + // shape. Requires Administrator to delete the service; a non-elevated uninstall + // still clears the per-user autostart below. + #[cfg(windows)] + { + if let Err(e) = install::uninstall_managed_service() { + warn!( + "Failed to remove managed service (may require Administrator): {}", + e + ); + } + } + + // Remove from startup (covers non-elevated / attended / viewer installs). if let Err(e) = startup::remove_from_startup() { warn!("Failed to remove from startup: {}", e); } @@ -582,31 +764,62 @@ fn cleanup_on_exit() { } } -/// Run the agent main loop -async fn run_agent(config: config::Config) -> Result<()> { +/// Run the agent main loop. +/// +/// `service_shutdown`, when present, is the SCM cooperative-stop flag (SPEC-018): +/// the managed-agent service passes it so the loop exits promptly on +/// `Stop`/`Shutdown`. It is `None` for the interactive/user-launched paths, which +/// stop via the tray exit / server control messages instead. +async fn run_agent( + config: config::Config, + service_shutdown: Option>, +) -> Result<()> { + use std::sync::atomic::Ordering; + let elevated = install::is_elevated(); + let running_as_service = service_shutdown.is_some(); let mut session = session::SessionManager::new(config.clone(), elevated); let is_support_session = config.support_code.is_some(); let hostname = config.hostname(); - // Add to startup - if let Err(e) = startup::add_to_startup() { + // Helper: has the SCM asked us to stop? + let stop_requested = |flag: &Option>| -> bool { + flag.as_ref() + .map(|f| f.load(Ordering::SeqCst)) + .unwrap_or(false) + }; + + // Autostart persistence: + // - As the SYSTEM service (SPEC-018), the SERVICE itself is the managed + // autostart — do NOT write the per-user HKCU\…\Run entry (that would be a + // second, redundant autostart, and writing it from SYSTEM lands in the + // wrong hive). The service install/uninstall owns lifecycle. + // - Interactive/user-launched runs keep the existing HKCU Run behavior. + if running_as_service { + info!("Running as the GuruConnect SYSTEM service; service is the autostart (skipping HKCU Run)"); + } else if let Err(e) = startup::add_to_startup() { warn!("Failed to add to startup: {}", e); } - // Create tray icon - let tray = match tray::TrayController::new( - &hostname, - config.support_code.as_deref(), - is_support_session, - ) { - Ok(t) => { - info!("Tray icon created"); - Some(t) - } - Err(e) => { - warn!("Failed to create tray icon: {}", e); - None + // A Session-0 SYSTEM service has no interactive desktop, so a tray icon is + // both impossible and meaningless there (SPEC-018 Phase 2 moves the user-facing + // surface into the per-session worker). Only create the tray off the service. + let tray = if running_as_service { + None + } else { + match tray::TrayController::new( + &hostname, + config.support_code.as_deref(), + is_support_session, + ) { + Ok(t) => { + info!("Tray icon created"); + Some(t) + } + Err(e) => { + warn!("Failed to create tray icon: {}", e); + None + } } }; @@ -615,6 +828,12 @@ async fn run_agent(config: config::Config) -> Result<()> { // Connect to server and run main loop loop { + // SPEC-018: honour an SCM stop request before (re)connecting. + if stop_requested(&service_shutdown) { + info!("Service stop requested; exiting agent loop"); + return Ok(()); + } + info!("Connecting to server..."); if is_support_session { @@ -636,11 +855,22 @@ async fn run_agent(config: config::Config) -> Result<()> { } if let Err(e) = session - .run_with_tray(tray.as_ref(), chat_ctrl.as_ref()) + .run_with_tray(tray.as_ref(), chat_ctrl.as_ref(), service_shutdown.as_ref()) .await { let error_msg = e.to_string(); + // SPEC-018 (finding H): the connected session loop broke + // because the SCM asked the service to stop. The loop already + // closed the WebSocket cleanly; treat this as a graceful stop + // (no reconnect) so the service transitions StopPending -> + // Stopped. Only the service path can produce this (it is the + // only caller that passes a shutdown flag). + if error_msg.contains(session::SERVICE_STOP_SENTINEL) { + info!("Service stop requested during session; exiting agent loop"); + return Ok(()); + } + if error_msg.contains("USER_EXIT") { info!("Session ended by user"); cleanup_on_exit(); @@ -713,6 +943,47 @@ async fn run_agent(config: config::Config) -> Result<()> { } info!("Reconnecting in 5 seconds..."); - tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; + // SPEC-018: poll the SCM stop flag during the backoff so a service stop is + // honoured within ~250ms instead of waiting the full reconnect delay. + if service_shutdown.is_some() { + for _ in 0..20 { + if stop_requested(&service_shutdown) { + info!("Service stop requested during reconnect backoff; exiting agent loop"); + return Ok(()); + } + tokio::time::sleep(tokio::time::Duration::from_millis(250)).await; + } + } else { + tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use clap::CommandFactory; + + /// SPEC-018 finding N1: pin the clap subcommand name to the constant the SCM + /// is registered with. The service is installed with `SERVICE_RUN_ARG` as its + /// launch argument; when the SCM starts it, clap must route that exact token + /// into [`Commands::ServiceRun`]. If the `#[command(name = "service-run")]` + /// attribute and the constant ever drift apart, the SCM would start the binary + /// but clap would fail to match the subcommand and the process would fall + /// through to default (non-service) mode and exit. Asserting against the live + /// clap metadata (not a second string literal) makes that drift impossible. + #[test] + #[cfg(windows)] + fn service_run_subcommand_matches_scm_launch_arg() { + let cmd = Cli::command(); + let has_matching_subcommand = cmd + .get_subcommands() + .any(|sc| sc.get_name() == service::SERVICE_RUN_ARG); + assert!( + has_matching_subcommand, + "no clap subcommand named '{}' (the SCM launch arg); the ServiceRun \ + #[command(name = ...)] attribute drifted from service::SERVICE_RUN_ARG", + service::SERVICE_RUN_ARG + ); } } diff --git a/agent/src/service/mod.rs b/agent/src/service/mod.rs new file mode 100644 index 0000000..e3f99b2 --- /dev/null +++ b/agent/src/service/mod.rs @@ -0,0 +1,520 @@ +//! Windows SYSTEM service host for the managed GuruConnect agent (SPEC-018). +//! +//! # Phase 1 scope (this module) +//! +//! Phase 1 proves the *managed/persistent* agent can run as **LocalSystem** in +//! the isolated Session 0 across reboots and at the login screen: +//! +//! 1. Register the agent with the Service Control Manager (SCM) and run, when +//! started, the **existing persistent-agent logic** (`RunMode::PermanentAgent` +//! path) *as SYSTEM* — i.e. resolve/enroll the per-machine `cak_` (SPEC-016, +//! now readable because the SYSTEM-ACL'd store is in-context) and hold the +//! relay WSS connection. +//! 2. Report a correct service lifecycle to the SCM (`StartPending` -> +//! `Running` -> `StopPending` -> `Stopped`) and handle `Stop`/`Shutdown` +//! gracefully. The control handler sets a shared shutdown flag; the agent +//! runtime observes it both between reconnect attempts AND inside the +//! connected session loop (SPEC-018 finding H), so a stop received while a +//! session is live breaks out promptly, closes the WS connection cleanly, +//! and exits — rather than waiting for the SCM to force-kill. +//! 3. Provide install/uninstall of the service (LocalSystem, auto-start, crash +//! recovery) so managed mode uses the service as its single autostart +//! instead of the per-user `HKCU\…\Run` entry. +//! +//! # Phase 2 (deliberately NOT built here — see SPEC-018 §Scope) +//! +//! A SYSTEM service lives in Session 0 and **cannot** capture or inject the +//! interactive desktop directly. Phase 1 therefore enrolls and connects but does +//! **NOT** capture a desktop yet. The following are Phase 2 and are intentionally +//! absent; the seams where they attach are called out inline below: +//! +//! - the **session broker** (`WTSEnumerateSessionsW` / +//! `WTSGetActiveConsoleSessionId` / `WTSQueryUserToken`), +//! - the **per-session capture/input worker** spawned via `CreateProcessAsUserW` +//! into `winsta0\default`, +//! - **service <-> worker IPC** (the per-session ACL'd named pipe), and +//! - **`SERVICE_CONTROL_SESSIONCHANGE`** reaction (logon/logoff/console-connect +//! retarget). +//! +//! Phase 1 registers the control handler for `Stop`/`Shutdown`/`Interrogate` +//! only. When Phase 2 lands, the broker hangs off the same control handler +//! (adding `SESSIONCHANGE`) and off the same agent runtime started here. + +#![cfg(windows)] + +use std::ffi::OsString; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +use anyhow::{Context, Result}; +use tracing::{error, info, warn}; + +use windows_service::{ + define_windows_service, + service::{ + ServiceAccess, ServiceControl, ServiceControlAccept, ServiceErrorControl, ServiceExitCode, + ServiceInfo, ServiceStartType, ServiceState, ServiceStatus, ServiceType, + }, + service_control_handler::{self, ServiceControlHandlerResult}, + service_dispatcher, + service_manager::{ServiceManager, ServiceManagerAccess}, +}; + +/// Internal service name registered with the SCM (no spaces; used by `sc`, +/// `ServiceManager`, and the control handler). +pub const SERVICE_NAME: &str = "GuruConnectAgent"; + +/// Human-facing display name shown in `services.msc`. +pub const SERVICE_DISPLAY_NAME: &str = "GuruConnect Managed Agent"; + +/// Service description shown in `services.msc`. +pub const SERVICE_DESCRIPTION: &str = + "Runs the managed GuruConnect remote-support agent as LocalSystem so it is \ + reachable at the login screen and across reboots (SPEC-018)."; + +/// Hidden subcommand the SCM invokes to enter the service control loop. The +/// service is registered with this as its launch argument (see [`install_service`]), +/// and `main.rs` routes it into [`run_dispatcher`]. +pub const SERVICE_RUN_ARG: &str = "service-run"; + +/// Hint we give the SCM for how long start/stop transitions may take before it +/// should consider the service hung. +const TRANSITION_WAIT: Duration = Duration::from_secs(10); + +// The `windows-service` dispatcher requires a `extern "system"` entry point with +// a fixed ABI; this macro generates `ffi_service_main`, which trampolines into +// our safe `service_main`. +define_windows_service!(ffi_service_main, service_main); + +/// Enter the SCM dispatcher (called from `main.rs` for the `service-run` +/// subcommand). Blocks until the service stops. This must be invoked by the SCM, +/// not interactively — `service_dispatcher::start` fails with +/// `ERROR_FAILED_SERVICE_CONTROLLER_CONNECT` (1063) if there is no controlling +/// SCM, which is the expected outcome of running `guruconnect service-run` by hand. +pub fn run_dispatcher() -> Result<()> { + service_dispatcher::start(SERVICE_NAME, ffi_service_main) + .context("failed to connect to the service control dispatcher (must be started by the SCM)") +} + +/// SCM-invoked service body. Any error is logged; the function cannot return an +/// error to the SCM directly, so [`run_service`] reports a failed exit code on the +/// status handle before returning. +fn service_main(_arguments: Vec) { + if let Err(e) = run_service() { + error!("service exited with error: {e:#}"); + } +} + +/// Drive the full service lifecycle: register the control handler, report +/// `Running`, run the persistent agent until a stop is requested, then report +/// `Stopped`. +fn run_service() -> Result<()> { + info!("GuruConnect managed agent service starting (running as SYSTEM in session 0)"); + + // Cooperative shutdown flag flipped by the SCM control handler and observed by + // the agent runtime. `AtomicBool` keeps the handler closure trivially `Send` + // and avoids holding a lock inside an SCM callback. + let shutdown = Arc::new(AtomicBool::new(false)); + let shutdown_for_handler = shutdown.clone(); + + let event_handler = move |control_event| -> ServiceControlHandlerResult { + match control_event { + // SPEC-018 Phase 1: graceful stop. Phase 2 adds + // `ServiceControl::SessionChange(_)` here to drive the session broker + // (retarget the capture/input worker on logon/logoff/console-connect); + // we intentionally do not accept SESSIONCHANGE yet. + ServiceControl::Stop | ServiceControl::Shutdown => { + info!("received {control_event:?}; signalling agent to shut down"); + // Set the cooperative-stop flag. The agent runtime observes it on + // every idle tick of the connected session loop and between + // reconnect attempts (SPEC-018 finding H), so it breaks out and + // closes the WebSocket cleanly within ~100ms even if a session is + // currently connected. + shutdown_for_handler.store(true, Ordering::SeqCst); + ServiceControlHandlerResult::NoError + } + ServiceControl::Interrogate => ServiceControlHandlerResult::NoError, + _ => ServiceControlHandlerResult::NotImplemented, + } + }; + + let status_handle = service_control_handler::register(SERVICE_NAME, event_handler) + .context("failed to register the service control handler")?; + + // Report StartPending while we spin up the runtime and connect. + set_status( + &status_handle, + ServiceState::StartPending, + ServiceControlAccept::empty(), + TRANSITION_WAIT, + ); + + // Report Running and accept Stop + Shutdown. We report Running before the + // first connect attempt completes because the agent loop reconnects forever; + // "the service is up and trying" is the correct steady state, and blocking the + // SCM on the first relay handshake would risk a start timeout on a slow boot. + set_status( + &status_handle, + ServiceState::Running, + ServiceControlAccept::STOP | ServiceControlAccept::SHUTDOWN, + Duration::default(), + ); + info!("service reported Running; entering managed-agent control loop"); + + // Run the existing persistent-agent logic as SYSTEM. This is the Phase 1 + // payload: resolve/enroll the cak_ (SPEC-016) and hold the relay connection. + let run_result = crate::run_managed_agent_service(shutdown.clone()); + + if let Err(e) = &run_result { + // The agent loop only returns Err on an unrecoverable LOCAL fault (e.g. no + // usable credential and nothing to enroll with). Network errors are + // retried inside the loop and never surface here. Report the failure to + // the SCM so recovery actions (restart) engage. + error!("managed-agent control loop terminated with error: {e:#}"); + } else { + info!("managed-agent control loop exited cleanly on stop request"); + } + + // Transition StopPending -> Stopped. + set_status( + &status_handle, + ServiceState::StopPending, + ServiceControlAccept::empty(), + TRANSITION_WAIT, + ); + + let exit_code = match run_result { + Ok(()) => ServiceExitCode::Win32(0), + // ERROR_SERVICE_SPECIFIC_ERROR-style: surface a non-zero service-specific + // code so the SCM treats the exit as a failure and applies recovery. + Err(_) => ServiceExitCode::ServiceSpecific(1), + }; + + set_status_with_exit( + &status_handle, + ServiceState::Stopped, + ServiceControlAccept::empty(), + Duration::default(), + exit_code, + ); + info!("service reported Stopped"); + + Ok(()) +} + +/// Report a status with a zero (success) exit code. +fn set_status( + handle: &service_control_handler::ServiceStatusHandle, + state: ServiceState, + accepted: ServiceControlAccept, + wait_hint: Duration, +) { + set_status_with_exit( + handle, + state, + accepted, + wait_hint, + ServiceExitCode::Win32(0), + ); +} + +/// Report a status to the SCM. A failure to report is logged (best-effort) — we +/// cannot do anything actionable about it and must not panic inside the service. +fn set_status_with_exit( + handle: &service_control_handler::ServiceStatusHandle, + state: ServiceState, + accepted: ServiceControlAccept, + wait_hint: Duration, + exit_code: ServiceExitCode, +) { + let status = ServiceStatus { + service_type: ServiceType::OWN_PROCESS, + current_state: state, + controls_accepted: accepted, + exit_code, + checkpoint: 0, + wait_hint, + process_id: None, + }; + if let Err(e) = handle.set_service_status(status) { + warn!("failed to report service status {state:?} to the SCM: {e}"); + } +} + +// --------------------------------------------------------------------------- +// Install / uninstall (used by install.rs for managed mode) +// --------------------------------------------------------------------------- + +/// Install (or reinstall) the managed agent as a LocalSystem auto-start service +/// pointing at `exe_path` with the [`SERVICE_RUN_ARG`] launch argument. +/// +/// Idempotent: if the service already exists it is stopped and deleted first, +/// then recreated, so an upgrade picks up a new binary path / config. Configures +/// crash recovery (restart on failure) via `sc failure`. +/// +/// Requires Administrator (SCM `CREATE_SERVICE`). Returns an error otherwise. +pub fn install_service(exe_path: &std::path::Path) -> Result<()> { + let manager = ServiceManager::local_computer( + None::<&str>, + ServiceManagerAccess::CONNECT | ServiceManagerAccess::CREATE_SERVICE, + ) + .context("failed to connect to the Service Control Manager (run as Administrator)")?; + + // Remove any prior installation so the binary path / args are refreshed. + let mut deleted_existing = false; + if let Ok(existing) = manager.open_service( + SERVICE_NAME, + ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE, + ) { + info!("existing {SERVICE_NAME} service found; removing before reinstall"); + stop_if_running(&existing); + existing + .delete() + .context("failed to delete the existing service before reinstall")?; + drop(existing); + deleted_existing = true; + } + + let service_info = ServiceInfo { + name: OsString::from(SERVICE_NAME), + display_name: OsString::from(SERVICE_DISPLAY_NAME), + service_type: ServiceType::OWN_PROCESS, + start_type: ServiceStartType::AutoStart, + error_control: ServiceErrorControl::Normal, + executable_path: exe_path.to_path_buf(), + launch_arguments: vec![OsString::from(SERVICE_RUN_ARG)], + dependencies: vec![], + // account_name: None => LocalSystem (the SPEC-018 requirement). + account_name: None, + account_password: None, + }; + + let service = create_service_with_retry(&manager, &service_info, deleted_existing) + .context("failed to create the GuruConnect managed agent service")?; + + service + .set_description(SERVICE_DESCRIPTION) + .context("failed to set the service description")?; + + configure_recovery(); + + info!( + "installed {SERVICE_NAME} (LocalSystem, auto-start) -> {} {}", + exe_path.display(), + SERVICE_RUN_ARG + ); + Ok(()) +} + +/// Create the service, retrying briefly if the SCM still has the prior instance +/// "marked for deletion" (SPEC-018 finding L1). +/// +/// When a service is deleted, the SCM only removes it from its database once every +/// open handle to it closes; until then a fresh `CreateService` fails with +/// `ERROR_SERVICE_MARKED_FOR_DELETE` (1072). The previous implementation papered +/// over this with a fixed 2s sleep after `delete()`, which is both slower than +/// necessary in the common case and still racy on a busy box. Instead we attempt +/// the create immediately and, only if we just deleted an existing instance and +/// hit 1072, retry a few times with short backoff — succeeding as soon as the SCM +/// finishes the removal, and giving up with the real error if it never does. +/// +/// The retry is gated on `deleted_existing`: on a clean first install there was no +/// prior instance, so a 1072 there is unexpected and is surfaced immediately +/// rather than masked by retries. +fn create_service_with_retry( + manager: &ServiceManager, + service_info: &ServiceInfo, + deleted_existing: bool, +) -> Result { + // ERROR_SERVICE_MARKED_FOR_DELETE (winerror.h). The service is gone from the + // caller's perspective but the SCM has not finished reaping it. + const ERROR_SERVICE_MARKED_FOR_DELETE: i32 = 1072; + // Bounded: ~5 attempts over ~2s total worst case (matches the old fixed sleep + // ceiling) but returns the instant the SCM is ready. + const MAX_ATTEMPTS: u32 = 5; + const BACKOFF: Duration = Duration::from_millis(400); + + let mut attempt = 0; + loop { + attempt += 1; + match manager.create_service(service_info, ServiceAccess::CHANGE_CONFIG) { + Ok(service) => return Ok(service), + Err(windows_service::Error::Winapi(ref io_err)) + if deleted_existing + && io_err.raw_os_error() == Some(ERROR_SERVICE_MARKED_FOR_DELETE) + && attempt < MAX_ATTEMPTS => + { + warn!( + "{SERVICE_NAME} still marked for deletion by the SCM \ + (attempt {attempt}/{MAX_ATTEMPTS}); retrying in {}ms", + BACKOFF.as_millis() + ); + std::thread::sleep(BACKOFF); + } + Err(e) => return Err(e), + } + } +} + +/// Configure SCM crash-recovery so the service restarts on unexpected exit. +/// +/// `windows-service` 0.7 does not expose `ChangeServiceConfig2` recovery actions +/// in a stable, ergonomic form, so we mirror the established pattern used by the +/// SAS service binary and shell out to `sc failure`. `reset=86400` clears the +/// failure count after a day; three `restart/5000` actions retry after 5s each. +fn configure_recovery() { + use std::os::windows::process::CommandExt; + const CREATE_NO_WINDOW: u32 = 0x0800_0000; + + match std::process::Command::new("sc") + .args([ + "failure", + SERVICE_NAME, + "reset=86400", + "actions=restart/5000/restart/5000/restart/5000", + ]) + .creation_flags(CREATE_NO_WINDOW) + .output() + { + Ok(out) if out.status.success() => { + info!("configured crash-recovery (restart) for {SERVICE_NAME}"); + } + Ok(out) => { + warn!( + "could not configure crash-recovery for {SERVICE_NAME} (sc failure exit {:?}); \ + the service will still run but will not auto-restart on crash", + out.status.code() + ); + } + Err(e) => { + warn!("could not invoke `sc failure` to set crash-recovery for {SERVICE_NAME}: {e}"); + } + } +} + +/// Stop (if running) and delete the managed agent service. Idempotent: succeeds +/// quietly if the service is not installed. +pub fn uninstall_service() -> Result<()> { + let manager = ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT) + .context("failed to connect to the Service Control Manager (run as Administrator)")?; + + match manager.open_service( + SERVICE_NAME, + ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE, + ) { + Ok(service) => { + stop_if_running(&service); + service + .delete() + .context("failed to delete the managed agent service")?; + info!("uninstalled {SERVICE_NAME} service"); + Ok(()) + } + Err(_) => { + // Not installed — nothing to do (idempotent uninstall). + info!("{SERVICE_NAME} service is not installed; nothing to uninstall"); + Ok(()) + } + } +} + +/// Start the managed agent service now (used right after a first-run install so +/// the agent comes up without waiting for the next boot). Best-effort: logs and +/// returns the SCM error if the start fails, but a failure is not fatal to install +/// because the service is auto-start and will come up on the next boot regardless. +pub fn start_service() -> Result<()> { + let manager = ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT) + .context("failed to connect to the Service Control Manager")?; + let service = manager + .open_service( + SERVICE_NAME, + ServiceAccess::START | ServiceAccess::QUERY_STATUS, + ) + .context("failed to open the managed agent service to start it")?; + + // If it is already running (e.g. reinstall-over-running), there is nothing to do. + if let Ok(status) = service.query_status() { + if status.current_state == ServiceState::Running + || status.current_state == ServiceState::StartPending + { + info!("{SERVICE_NAME} is already running/starting"); + return Ok(()); + } + } + + service + .start::(&[]) + .context("failed to start the managed agent service")?; + info!("started {SERVICE_NAME}"); + Ok(()) +} + +/// Report whether the managed agent service is currently installed. +pub fn is_service_installed() -> bool { + match ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT) { + Ok(manager) => manager + .open_service(SERVICE_NAME, ServiceAccess::QUERY_STATUS) + .is_ok(), + Err(_) => false, + } +} + +/// Best-effort stop of a service, waiting briefly for it to leave the running +/// state so a subsequent `delete` does not race an in-flight stop. +fn stop_if_running(service: &windows_service::service::Service) { + if let Ok(status) = service.query_status() { + if status.current_state != ServiceState::Stopped { + info!("stopping {SERVICE_NAME} before delete"); + let _ = service.stop(); + for _ in 0..10 { + std::thread::sleep(Duration::from_millis(500)); + match service.query_status() { + Ok(s) if s.current_state == ServiceState::Stopped => break, + _ => continue, + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// The launch argument the service is registered with MUST equal the hidden + /// `service-run` subcommand `main.rs` dispatches into [`run_dispatcher`]; a + /// mismatch would register a service the SCM could start but that would fall + /// through to normal (non-service) mode and immediately exit. + /// + /// This pins the value of the constant itself. The companion test + /// `tests::service_run_subcommand_matches_scm_launch_arg` in `main.rs` pins the + /// other half — that the clap `#[command(name = "service-run")]` attribute on + /// `Commands::ServiceRun` resolves to this same constant — so the two string + /// literals cannot silently drift apart. + #[test] + fn service_run_arg_matches_subcommand_name() { + assert_eq!(SERVICE_RUN_ARG, "service-run"); + } + + /// Service identifiers are non-empty and the internal name carries no spaces + /// (the SCM key / `sc` argument must be a single token). + #[test] + fn service_identifiers_are_well_formed() { + assert!(!SERVICE_NAME.is_empty()); + assert!( + !SERVICE_NAME.contains(char::is_whitespace), + "the SCM service name must be a single whitespace-free token" + ); + assert!(!SERVICE_DISPLAY_NAME.is_empty()); + assert!(!SERVICE_DESCRIPTION.is_empty()); + } + + /// `is_service_installed` must never panic regardless of elevation/SCM access; + /// on a dev workstation without the service installed it returns `false`. (We + /// do NOT install the service in tests — that is a VM/admin integration step.) + #[test] + fn is_service_installed_is_total() { + let _ = is_service_installed(); + } +} diff --git a/agent/src/session/mod.rs b/agent/src/session/mod.rs index 5e40185..16cea1f 100644 --- a/agent/src/session/mod.rs +++ b/agent/src/session/mod.rs @@ -41,8 +41,18 @@ use crate::proto::{message, AgentStatus, ChatMessage, Heartbeat, HeartbeatAck, M use crate::transport::WebSocketTransport; use crate::tray::{TrayAction, TrayController}; use anyhow::Result; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; use std::time::{Duration, Instant}; +/// Sentinel error string returned by [`SessionManager::run_with_tray`] when the +/// loop breaks because the SCM asked the managed-agent service to stop (SPEC-018, +/// finding H). The outer `run_agent` loop matches on this to treat the exit as a +/// graceful service stop (clean WS close, no reconnect) rather than a session +/// error. Only the service path passes a shutdown flag, so only the service path +/// can ever produce this. +pub const SERVICE_STOP_SENTINEL: &str = "SERVICE_STOP"; + // Heartbeat interval (30 seconds) const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(30); // Status report interval (60 seconds) @@ -285,16 +295,34 @@ impl SessionManager { Ok(()) } - /// Run the session main loop with tray and chat event processing + /// Run the session main loop with tray and chat event processing. + /// + /// `service_shutdown` (SPEC-018 finding H) is the SCM cooperative-stop flag. + /// It is `Some(flag)` ONLY on the managed-agent service path; the + /// attended/viewer/interactive callers pass `None` and behave EXACTLY as + /// before. When present, the flag is polled on every idle tick (the natural + /// ~100ms seam below) so an SCM Stop/Shutdown received while CONNECTED breaks + /// this inner loop promptly — instead of only being observed by the outer + /// `run_agent` reconnect loop, which never runs while a session is connected. + /// On a set flag the loop closes the WebSocket cleanly (via the shared exit + /// path at the bottom) and returns the [`SERVICE_STOP_SENTINEL`] error, which + /// the outer loop maps to a graceful stop. pub async fn run_with_tray( &mut self, tray: Option<&TrayController>, chat: Option<&ChatController>, + service_shutdown: Option<&Arc>, ) -> Result<()> { if self.transport.is_none() { anyhow::bail!("Not connected"); } + // Helper: has the SCM asked the service to stop? Always false off the + // service path (where `service_shutdown` is `None`). + let stop_requested = |flag: Option<&Arc>| -> bool { + flag.is_some_and(|f| f.load(Ordering::SeqCst)) + }; + // Send initial status self.send_status().await?; @@ -307,6 +335,29 @@ impl SessionManager { // Main loop loop { + // SPEC-018 (finding H): honour an SCM stop request received while the + // session is CONNECTED. The outer `run_agent` loop only observes the + // flag between connection attempts, but a managed agent spends its + // entire connected life inside THIS loop — so without this check an + // SCM Stop while connected would not break out until the connection + // dropped on its own. Breaking here falls through to the shared exit + // path below, which closes the transport cleanly (clean WS close); + // the sentinel tells the outer loop this was a graceful stop. + if stop_requested(service_shutdown) { + tracing::info!("Service stop requested; ending connected session loop"); + self.release_streaming(); + self.state = SessionState::Disconnected; + if let Some(transport) = self.transport.as_mut() { + // Best-effort clean WebSocket close (sends a Close frame). A + // failure here just means the peer/socket is already gone; the + // service still stops cleanly. + if let Err(e) = transport.close().await { + tracing::warn!("error during clean WebSocket close on service stop: {}", e); + } + } + return Err(anyhow::anyhow!(SERVICE_STOP_SENTINEL)); + } + // Process tray events if let Some(t) = tray { if let Some(action) = t.process_events() { @@ -745,3 +796,47 @@ impl SessionManager { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + + /// SPEC-018 finding H: the connected-stop contract. When the SCM sets the + /// shutdown flag, `run_with_tray` returns an error whose message contains + /// [`SERVICE_STOP_SENTINEL`]; the outer `run_agent` loop recognises a graceful + /// stop with `error_msg.contains(SERVICE_STOP_SENTINEL)`. This pins that the + /// error the loop constructs on stop actually satisfies that match — so the + /// two halves (producer here, consumer in `main.rs`) cannot drift. + /// + /// A full end-to-end test of the in-loop interrupt would need a live connected + /// transport (a real or mocked server), which is an integration concern; this + /// unit test instead pins the wire contract the interrupt relies on. + #[test] + fn service_stop_sentinel_is_matched_by_outer_loop_check() { + let produced = anyhow::anyhow!(SERVICE_STOP_SENTINEL); + assert!( + produced.to_string().contains(SERVICE_STOP_SENTINEL), + "the stop error must contain the sentinel the outer loop matches on" + ); + assert!( + !SERVICE_STOP_SENTINEL.is_empty(), + "the sentinel must be a non-empty, distinctive token" + ); + } + + /// The shutdown-flag check is a no-op (always `false`) when no flag is passed, + /// i.e. on the attended/viewer/interactive paths — guaranteeing the new + /// parameter is a pure addition that cannot alter non-service behaviour + /// (SPEC-018 finding H: "no regression"). + #[test] + fn no_shutdown_flag_never_requests_stop() { + let none: Option<&Arc> = None; + let check = |flag: Option<&Arc>| flag.is_some_and(|f| f.load(Ordering::SeqCst)); + assert!(!check(none)); + + let set = Arc::new(AtomicBool::new(true)); + assert!(check(Some(&set))); + let unset = Arc::new(AtomicBool::new(false)); + assert!(!check(Some(&unset))); + } +}