//! Windows SYSTEM service host for the managed GuruConnect agent (SPEC-018). //! //! # Phase 1 scope (this module) //! //! Phase 1 proves the *managed/persistent* agent can run as **LocalSystem** in //! the isolated Session 0 across reboots and at the login screen: //! //! 1. Register the agent with the Service Control Manager (SCM) and run, when //! started, the **existing persistent-agent logic** (`RunMode::PermanentAgent` //! path) *as SYSTEM* — i.e. resolve/enroll the per-machine `cak_` (SPEC-016, //! now readable because the SYSTEM-ACL'd store is in-context) and hold the //! relay WSS connection. //! 2. Report a correct service lifecycle to the SCM (`StartPending` -> //! `Running` -> `StopPending` -> `Stopped`) and handle `Stop`/`Shutdown` //! gracefully. The control handler sets a shared shutdown flag; the agent //! runtime observes it both between reconnect attempts AND inside the //! connected session loop (SPEC-018 finding H), so a stop received while a //! session is live breaks out promptly, closes the WS connection cleanly, //! and exits — rather than waiting for the SCM to force-kill. //! 3. Provide install/uninstall of the service (LocalSystem, auto-start, crash //! recovery) so managed mode uses the service as its single autostart //! instead of the per-user `HKCU\…\Run` entry. //! //! # Phase 2 (deliberately NOT built here — see SPEC-018 §Scope) //! //! A SYSTEM service lives in Session 0 and **cannot** capture or inject the //! interactive desktop directly. Phase 1 therefore enrolls and connects but does //! **NOT** capture a desktop yet. The following are Phase 2 and are intentionally //! absent; the seams where they attach are called out inline below: //! //! - the **session broker** (`WTSEnumerateSessionsW` / //! `WTSGetActiveConsoleSessionId` / `WTSQueryUserToken`), //! - the **per-session capture/input worker** spawned via `CreateProcessAsUserW` //! into `winsta0\default`, //! - **service <-> worker IPC** (the per-session ACL'd named pipe), and //! - **`SERVICE_CONTROL_SESSIONCHANGE`** reaction (logon/logoff/console-connect //! retarget). //! //! Phase 1 registers the control handler for `Stop`/`Shutdown`/`Interrogate` //! only. When Phase 2 lands, the broker hangs off the same control handler //! (adding `SESSIONCHANGE`) and off the same agent runtime started here. #![cfg(windows)] use std::ffi::OsString; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; use tracing::{error, info, warn}; use windows_service::{ define_windows_service, service::{ ServiceAccess, ServiceControl, ServiceControlAccept, ServiceErrorControl, ServiceExitCode, ServiceInfo, ServiceStartType, ServiceState, ServiceStatus, ServiceType, }, service_control_handler::{self, ServiceControlHandlerResult}, service_dispatcher, service_manager::{ServiceManager, ServiceManagerAccess}, }; /// Internal service name registered with the SCM (no spaces; used by `sc`, /// `ServiceManager`, and the control handler). pub const SERVICE_NAME: &str = "GuruConnectAgent"; /// Human-facing display name shown in `services.msc`. pub const SERVICE_DISPLAY_NAME: &str = "GuruConnect Managed Agent"; /// Service description shown in `services.msc`. pub const SERVICE_DESCRIPTION: &str = "Runs the managed GuruConnect remote-support agent as LocalSystem so it is \ reachable at the login screen and across reboots (SPEC-018)."; /// Hidden subcommand the SCM invokes to enter the service control loop. The /// service is registered with this as its launch argument (see [`install_service`]), /// and `main.rs` routes it into [`run_dispatcher`]. pub const SERVICE_RUN_ARG: &str = "service-run"; /// Hint we give the SCM for how long start/stop transitions may take before it /// should consider the service hung. const TRANSITION_WAIT: Duration = Duration::from_secs(10); // The `windows-service` dispatcher requires a `extern "system"` entry point with // a fixed ABI; this macro generates `ffi_service_main`, which trampolines into // our safe `service_main`. define_windows_service!(ffi_service_main, service_main); /// Enter the SCM dispatcher (called from `main.rs` for the `service-run` /// subcommand). Blocks until the service stops. This must be invoked by the SCM, /// not interactively — `service_dispatcher::start` fails with /// `ERROR_FAILED_SERVICE_CONTROLLER_CONNECT` (1063) if there is no controlling /// SCM, which is the expected outcome of running `guruconnect service-run` by hand. pub fn run_dispatcher() -> Result<()> { service_dispatcher::start(SERVICE_NAME, ffi_service_main) .context("failed to connect to the service control dispatcher (must be started by the SCM)") } /// SCM-invoked service body. Any error is logged; the function cannot return an /// error to the SCM directly, so [`run_service`] reports a failed exit code on the /// status handle before returning. fn service_main(_arguments: Vec) { if let Err(e) = run_service() { error!("service exited with error: {e:#}"); } } /// Drive the full service lifecycle: register the control handler, report /// `Running`, run the persistent agent until a stop is requested, then report /// `Stopped`. fn run_service() -> Result<()> { info!("GuruConnect managed agent service starting (running as SYSTEM in session 0)"); // Cooperative shutdown flag flipped by the SCM control handler and observed by // the agent runtime. `AtomicBool` keeps the handler closure trivially `Send` // and avoids holding a lock inside an SCM callback. let shutdown = Arc::new(AtomicBool::new(false)); let shutdown_for_handler = shutdown.clone(); let event_handler = move |control_event| -> ServiceControlHandlerResult { match control_event { // SPEC-018 Phase 1: graceful stop. Phase 2 adds // `ServiceControl::SessionChange(_)` here to drive the session broker // (retarget the capture/input worker on logon/logoff/console-connect); // we intentionally do not accept SESSIONCHANGE yet. ServiceControl::Stop | ServiceControl::Shutdown => { info!("received {control_event:?}; signalling agent to shut down"); // Set the cooperative-stop flag. The agent runtime observes it on // every idle tick of the connected session loop and between // reconnect attempts (SPEC-018 finding H), so it breaks out and // closes the WebSocket cleanly within ~100ms even if a session is // currently connected. shutdown_for_handler.store(true, Ordering::SeqCst); ServiceControlHandlerResult::NoError } ServiceControl::Interrogate => ServiceControlHandlerResult::NoError, _ => ServiceControlHandlerResult::NotImplemented, } }; let status_handle = service_control_handler::register(SERVICE_NAME, event_handler) .context("failed to register the service control handler")?; // Report StartPending while we spin up the runtime and connect. set_status( &status_handle, ServiceState::StartPending, ServiceControlAccept::empty(), TRANSITION_WAIT, ); // Report Running and accept Stop + Shutdown. We report Running before the // first connect attempt completes because the agent loop reconnects forever; // "the service is up and trying" is the correct steady state, and blocking the // SCM on the first relay handshake would risk a start timeout on a slow boot. set_status( &status_handle, ServiceState::Running, ServiceControlAccept::STOP | ServiceControlAccept::SHUTDOWN, Duration::default(), ); info!("service reported Running; entering managed-agent control loop"); // Run the existing persistent-agent logic as SYSTEM. This is the Phase 1 // payload: resolve/enroll the cak_ (SPEC-016) and hold the relay connection. let run_result = crate::run_managed_agent_service(shutdown.clone()); if let Err(e) = &run_result { // The agent loop only returns Err on an unrecoverable LOCAL fault (e.g. no // usable credential and nothing to enroll with). Network errors are // retried inside the loop and never surface here. Report the failure to // the SCM so recovery actions (restart) engage. error!("managed-agent control loop terminated with error: {e:#}"); } else { info!("managed-agent control loop exited cleanly on stop request"); } // Transition StopPending -> Stopped. set_status( &status_handle, ServiceState::StopPending, ServiceControlAccept::empty(), TRANSITION_WAIT, ); let exit_code = match run_result { Ok(()) => ServiceExitCode::Win32(0), // ERROR_SERVICE_SPECIFIC_ERROR-style: surface a non-zero service-specific // code so the SCM treats the exit as a failure and applies recovery. Err(_) => ServiceExitCode::ServiceSpecific(1), }; set_status_with_exit( &status_handle, ServiceState::Stopped, ServiceControlAccept::empty(), Duration::default(), exit_code, ); info!("service reported Stopped"); Ok(()) } /// Report a status with a zero (success) exit code. fn set_status( handle: &service_control_handler::ServiceStatusHandle, state: ServiceState, accepted: ServiceControlAccept, wait_hint: Duration, ) { set_status_with_exit( handle, state, accepted, wait_hint, ServiceExitCode::Win32(0), ); } /// Report a status to the SCM. A failure to report is logged (best-effort) — we /// cannot do anything actionable about it and must not panic inside the service. fn set_status_with_exit( handle: &service_control_handler::ServiceStatusHandle, state: ServiceState, accepted: ServiceControlAccept, wait_hint: Duration, exit_code: ServiceExitCode, ) { let status = ServiceStatus { service_type: ServiceType::OWN_PROCESS, current_state: state, controls_accepted: accepted, exit_code, checkpoint: 0, wait_hint, process_id: None, }; if let Err(e) = handle.set_service_status(status) { warn!("failed to report service status {state:?} to the SCM: {e}"); } } // --------------------------------------------------------------------------- // Install / uninstall (used by install.rs for managed mode) // --------------------------------------------------------------------------- /// Install (or reinstall) the managed agent as a LocalSystem auto-start service /// pointing at `exe_path` with the [`SERVICE_RUN_ARG`] launch argument. /// /// Idempotent: if the service already exists it is stopped and deleted first, /// then recreated, so an upgrade picks up a new binary path / config. Configures /// crash recovery (restart on failure) via `sc failure`. /// /// Requires Administrator (SCM `CREATE_SERVICE`). Returns an error otherwise. pub fn install_service(exe_path: &std::path::Path) -> Result<()> { let manager = ServiceManager::local_computer( None::<&str>, ServiceManagerAccess::CONNECT | ServiceManagerAccess::CREATE_SERVICE, ) .context("failed to connect to the Service Control Manager (run as Administrator)")?; // Remove any prior installation so the binary path / args are refreshed. let mut deleted_existing = false; if let Ok(existing) = manager.open_service( SERVICE_NAME, ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE, ) { info!("existing {SERVICE_NAME} service found; removing before reinstall"); stop_if_running(&existing); existing .delete() .context("failed to delete the existing service before reinstall")?; drop(existing); deleted_existing = true; } let service_info = ServiceInfo { name: OsString::from(SERVICE_NAME), display_name: OsString::from(SERVICE_DISPLAY_NAME), service_type: ServiceType::OWN_PROCESS, start_type: ServiceStartType::AutoStart, error_control: ServiceErrorControl::Normal, executable_path: exe_path.to_path_buf(), launch_arguments: vec![OsString::from(SERVICE_RUN_ARG)], dependencies: vec![], // account_name: None => LocalSystem (the SPEC-018 requirement). account_name: None, account_password: None, }; let service = create_service_with_retry(&manager, &service_info, deleted_existing) .context("failed to create the GuruConnect managed agent service")?; service .set_description(SERVICE_DESCRIPTION) .context("failed to set the service description")?; configure_recovery(); info!( "installed {SERVICE_NAME} (LocalSystem, auto-start) -> {} {}", exe_path.display(), SERVICE_RUN_ARG ); Ok(()) } /// Create the service, retrying briefly if the SCM still has the prior instance /// "marked for deletion" (SPEC-018 finding L1). /// /// When a service is deleted, the SCM only removes it from its database once every /// open handle to it closes; until then a fresh `CreateService` fails with /// `ERROR_SERVICE_MARKED_FOR_DELETE` (1072). The previous implementation papered /// over this with a fixed 2s sleep after `delete()`, which is both slower than /// necessary in the common case and still racy on a busy box. Instead we attempt /// the create immediately and, only if we just deleted an existing instance and /// hit 1072, retry a few times with short backoff — succeeding as soon as the SCM /// finishes the removal, and giving up with the real error if it never does. /// /// The retry is gated on `deleted_existing`: on a clean first install there was no /// prior instance, so a 1072 there is unexpected and is surfaced immediately /// rather than masked by retries. fn create_service_with_retry( manager: &ServiceManager, service_info: &ServiceInfo, deleted_existing: bool, ) -> Result { // ERROR_SERVICE_MARKED_FOR_DELETE (winerror.h). The service is gone from the // caller's perspective but the SCM has not finished reaping it. const ERROR_SERVICE_MARKED_FOR_DELETE: i32 = 1072; // Bounded: ~5 attempts over ~2s total worst case (matches the old fixed sleep // ceiling) but returns the instant the SCM is ready. const MAX_ATTEMPTS: u32 = 5; const BACKOFF: Duration = Duration::from_millis(400); let mut attempt = 0; loop { attempt += 1; match manager.create_service(service_info, ServiceAccess::CHANGE_CONFIG) { Ok(service) => return Ok(service), Err(windows_service::Error::Winapi(ref io_err)) if deleted_existing && io_err.raw_os_error() == Some(ERROR_SERVICE_MARKED_FOR_DELETE) && attempt < MAX_ATTEMPTS => { warn!( "{SERVICE_NAME} still marked for deletion by the SCM \ (attempt {attempt}/{MAX_ATTEMPTS}); retrying in {}ms", BACKOFF.as_millis() ); std::thread::sleep(BACKOFF); } Err(e) => return Err(e), } } } /// Configure SCM crash-recovery so the service restarts on unexpected exit. /// /// `windows-service` 0.7 does not expose `ChangeServiceConfig2` recovery actions /// in a stable, ergonomic form, so we mirror the established pattern used by the /// SAS service binary and shell out to `sc failure`. `reset=86400` clears the /// failure count after a day; three `restart/5000` actions retry after 5s each. fn configure_recovery() { use std::os::windows::process::CommandExt; const CREATE_NO_WINDOW: u32 = 0x0800_0000; match std::process::Command::new("sc") .args([ "failure", SERVICE_NAME, "reset=86400", "actions=restart/5000/restart/5000/restart/5000", ]) .creation_flags(CREATE_NO_WINDOW) .output() { Ok(out) if out.status.success() => { info!("configured crash-recovery (restart) for {SERVICE_NAME}"); } Ok(out) => { warn!( "could not configure crash-recovery for {SERVICE_NAME} (sc failure exit {:?}); \ the service will still run but will not auto-restart on crash", out.status.code() ); } Err(e) => { warn!("could not invoke `sc failure` to set crash-recovery for {SERVICE_NAME}: {e}"); } } } /// Stop (if running) and delete the managed agent service. Idempotent: succeeds /// quietly if the service is not installed. pub fn uninstall_service() -> Result<()> { let manager = ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT) .context("failed to connect to the Service Control Manager (run as Administrator)")?; match manager.open_service( SERVICE_NAME, ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE, ) { Ok(service) => { stop_if_running(&service); service .delete() .context("failed to delete the managed agent service")?; info!("uninstalled {SERVICE_NAME} service"); Ok(()) } Err(_) => { // Not installed — nothing to do (idempotent uninstall). info!("{SERVICE_NAME} service is not installed; nothing to uninstall"); Ok(()) } } } /// Start the managed agent service now (used right after a first-run install so /// the agent comes up without waiting for the next boot). Best-effort: logs and /// returns the SCM error if the start fails, but a failure is not fatal to install /// because the service is auto-start and will come up on the next boot regardless. pub fn start_service() -> Result<()> { let manager = ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT) .context("failed to connect to the Service Control Manager")?; let service = manager .open_service( SERVICE_NAME, ServiceAccess::START | ServiceAccess::QUERY_STATUS, ) .context("failed to open the managed agent service to start it")?; // If it is already running (e.g. reinstall-over-running), there is nothing to do. if let Ok(status) = service.query_status() { if status.current_state == ServiceState::Running || status.current_state == ServiceState::StartPending { info!("{SERVICE_NAME} is already running/starting"); return Ok(()); } } service .start::(&[]) .context("failed to start the managed agent service")?; info!("started {SERVICE_NAME}"); Ok(()) } /// Report whether the managed agent service is currently installed. pub fn is_service_installed() -> bool { match ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT) { Ok(manager) => manager .open_service(SERVICE_NAME, ServiceAccess::QUERY_STATUS) .is_ok(), Err(_) => false, } } /// Best-effort stop of a service, waiting briefly for it to leave the running /// state so a subsequent `delete` does not race an in-flight stop. fn stop_if_running(service: &windows_service::service::Service) { if let Ok(status) = service.query_status() { if status.current_state != ServiceState::Stopped { info!("stopping {SERVICE_NAME} before delete"); let _ = service.stop(); for _ in 0..10 { std::thread::sleep(Duration::from_millis(500)); match service.query_status() { Ok(s) if s.current_state == ServiceState::Stopped => break, _ => continue, } } } } } #[cfg(test)] mod tests { use super::*; /// The launch argument the service is registered with MUST equal the hidden /// `service-run` subcommand `main.rs` dispatches into [`run_dispatcher`]; a /// mismatch would register a service the SCM could start but that would fall /// through to normal (non-service) mode and immediately exit. /// /// This pins the value of the constant itself. The companion test /// `tests::service_run_subcommand_matches_scm_launch_arg` in `main.rs` pins the /// other half — that the clap `#[command(name = "service-run")]` attribute on /// `Commands::ServiceRun` resolves to this same constant — so the two string /// literals cannot silently drift apart. #[test] fn service_run_arg_matches_subcommand_name() { assert_eq!(SERVICE_RUN_ARG, "service-run"); } /// Service identifiers are non-empty and the internal name carries no spaces /// (the SCM key / `sc` argument must be a single token). #[test] fn service_identifiers_are_well_formed() { assert!(!SERVICE_NAME.is_empty()); assert!( !SERVICE_NAME.contains(char::is_whitespace), "the SCM service name must be a single whitespace-free token" ); assert!(!SERVICE_DISPLAY_NAME.is_empty()); assert!(!SERVICE_DESCRIPTION.is_empty()); } /// `is_service_installed` must never panic regardless of elevation/SCM access; /// on a dev workstation without the service installed it returns `false`. (We /// do NOT install the service in tests — that is a VM/admin integration step.) #[test] fn is_service_installed_is_total() { let _ = is_service_installed(); } }