diff --git a/agent/src/main.rs b/agent/src/main.rs index d2d57b3..9df5249 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -234,7 +234,24 @@ fn main() -> Result<()> { Some(Commands::Install { user_only, elevated, - }) => run_install(user_only || elevated), + }) => { + // `run_install`'s parameter is `force_user_install` — when true it + // skips the UAC re-elevation attempt and installs in-place with + // whatever rights this process already has. + // + // - `user_only`: the user explicitly asked for a per-user install; + // honour it directly. + // - `elevated`: this is the internal, already-elevated re-exec spawned + // by `try_elevate_and_install` ("install --elevated"). It must NOT + // attempt to elevate AGAIN (that would loop / re-prompt), so we pass + // force=true here too. This is correct even though it routes through + // the "user install" parameter, because the re-exec genuinely runs + // elevated: `is_elevated()` returns true inside `install()`, so the + // path resolves to Program Files and the LocalSystem service installs + // normally. The flag only suppresses re-elevation; it does not force a + // per-user (non-elevated) install when we are already elevated. + run_install(user_only || elevated) + } Some(Commands::Uninstall) => run_uninstall(), Some(Commands::Launch { url }) => run_launch(&url), Some(Commands::VersionInfo) => { @@ -413,14 +430,40 @@ pub fn run_managed_agent_service( } let rt = tokio::runtime::Runtime::new()?; - rt.block_on(async move { - // SPEC-016 Phase B: resolve the operating credential before connecting. - // Running as SYSTEM, the SYSTEM+Administrators-ACL'd cak_ store is now - // readable in-context, so the Phase B fail-fast guard is not hit on this - // path (it remains as a safety net for any non-SYSTEM invocation). - resolve_agent_credential(&mut config).await?; - run_agent(config, Some(shutdown)).await - }) + + // SPEC-018 (finding M): this future runs across the `extern "system"` service + // entry point (ffi_service_main -> service_main -> run_service -> here). A + // panic that unwound across that FFI boundary is undefined behaviour (the C + // ABI cannot carry a Rust unwind) and would abort the process instead of + // taking the intended ServiceSpecific(1) fault path. Catch it here and convert + // it into an `Err`, which `run_service` maps to ServiceExitCode::ServiceSpecific(1) + // so the SCM applies its configured recovery (restart) cleanly. `Running` is + // already reported before we get here, so a fault does not strand StartPending. + let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + rt.block_on(async move { + // SPEC-016 Phase B: resolve the operating credential before connecting. + // Running as SYSTEM, the SYSTEM+Administrators-ACL'd cak_ store is now + // readable in-context, so the Phase B fail-fast guard is not hit on this + // path (it remains as a safety net for any non-SYSTEM invocation). + resolve_agent_credential(&mut config).await?; + run_agent(config, Some(shutdown)).await + }) + })); + + match outcome { + Ok(result) => result, + Err(panic) => { + // Recover a human-readable message from the panic payload for the log; + // do not re-panic (that would unwind across the FFI boundary again). + let detail = panic + .downcast_ref::<&str>() + .map(|s| s.to_string()) + .or_else(|| panic.downcast_ref::().cloned()) + .unwrap_or_else(|| "non-string panic payload".to_string()); + error!("managed-agent runtime panicked: {detail}"); + Err(anyhow::anyhow!("managed-agent runtime panicked: {detail}")) + } + } } /// SPEC-018 Phase 1: handle an interactive launch of a MANAGED agent binary (one @@ -812,11 +855,22 @@ async fn run_agent( } if let Err(e) = session - .run_with_tray(tray.as_ref(), chat_ctrl.as_ref()) + .run_with_tray(tray.as_ref(), chat_ctrl.as_ref(), service_shutdown.as_ref()) .await { let error_msg = e.to_string(); + // SPEC-018 (finding H): the connected session loop broke + // because the SCM asked the service to stop. The loop already + // closed the WebSocket cleanly; treat this as a graceful stop + // (no reconnect) so the service transitions StopPending -> + // Stopped. Only the service path can produce this (it is the + // only caller that passes a shutdown flag). + if error_msg.contains(session::SERVICE_STOP_SENTINEL) { + info!("Service stop requested during session; exiting agent loop"); + return Ok(()); + } + if error_msg.contains("USER_EXIT") { info!("Session ended by user"); cleanup_on_exit(); @@ -904,3 +958,32 @@ async fn run_agent( } } } + +#[cfg(test)] +mod tests { + use super::*; + use clap::CommandFactory; + + /// SPEC-018 finding N1: pin the clap subcommand name to the constant the SCM + /// is registered with. The service is installed with `SERVICE_RUN_ARG` as its + /// launch argument; when the SCM starts it, clap must route that exact token + /// into [`Commands::ServiceRun`]. If the `#[command(name = "service-run")]` + /// attribute and the constant ever drift apart, the SCM would start the binary + /// but clap would fail to match the subcommand and the process would fall + /// through to default (non-service) mode and exit. Asserting against the live + /// clap metadata (not a second string literal) makes that drift impossible. + #[test] + #[cfg(windows)] + fn service_run_subcommand_matches_scm_launch_arg() { + let cmd = Cli::command(); + let has_matching_subcommand = cmd + .get_subcommands() + .any(|sc| sc.get_name() == service::SERVICE_RUN_ARG); + assert!( + has_matching_subcommand, + "no clap subcommand named '{}' (the SCM launch arg); the ServiceRun \ + #[command(name = ...)] attribute drifted from service::SERVICE_RUN_ARG", + service::SERVICE_RUN_ARG + ); + } +} diff --git a/agent/src/service/mod.rs b/agent/src/service/mod.rs index 1ed76f3..e3f99b2 100644 --- a/agent/src/service/mod.rs +++ b/agent/src/service/mod.rs @@ -12,7 +12,11 @@ //! relay WSS connection. //! 2. Report a correct service lifecycle to the SCM (`StartPending` -> //! `Running` -> `StopPending` -> `Stopped`) and handle `Stop`/`Shutdown` -//! gracefully (signal the agent loop to close the WS connection and exit). +//! gracefully. The control handler sets a shared shutdown flag; the agent +//! runtime observes it both between reconnect attempts AND inside the +//! connected session loop (SPEC-018 finding H), so a stop received while a +//! session is live breaks out promptly, closes the WS connection cleanly, +//! and exits — rather than waiting for the SCM to force-kill. //! 3. Provide install/uninstall of the service (LocalSystem, auto-start, crash //! recovery) so managed mode uses the service as its single autostart //! instead of the per-user `HKCU\…\Run` entry. @@ -122,6 +126,11 @@ fn run_service() -> Result<()> { // we intentionally do not accept SESSIONCHANGE yet. ServiceControl::Stop | ServiceControl::Shutdown => { info!("received {control_event:?}; signalling agent to shut down"); + // Set the cooperative-stop flag. The agent runtime observes it on + // every idle tick of the connected session loop and between + // reconnect attempts (SPEC-018 finding H), so it breaks out and + // closes the WebSocket cleanly within ~100ms even if a session is + // currently connected. shutdown_for_handler.store(true, Ordering::SeqCst); ServiceControlHandlerResult::NoError } @@ -253,6 +262,7 @@ pub fn install_service(exe_path: &std::path::Path) -> Result<()> { .context("failed to connect to the Service Control Manager (run as Administrator)")?; // Remove any prior installation so the binary path / args are refreshed. + let mut deleted_existing = false; if let Ok(existing) = manager.open_service( SERVICE_NAME, ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE, @@ -263,9 +273,7 @@ pub fn install_service(exe_path: &std::path::Path) -> Result<()> { .delete() .context("failed to delete the existing service before reinstall")?; drop(existing); - // The SCM marks a service for deletion but only removes it once all handles - // close; a brief settle avoids a CreateService "marked for deletion" race. - std::thread::sleep(Duration::from_secs(2)); + deleted_existing = true; } let service_info = ServiceInfo { @@ -282,8 +290,7 @@ pub fn install_service(exe_path: &std::path::Path) -> Result<()> { account_password: None, }; - let service = manager - .create_service(&service_info, ServiceAccess::CHANGE_CONFIG) + let service = create_service_with_retry(&manager, &service_info, deleted_existing) .context("failed to create the GuruConnect managed agent service")?; service @@ -300,6 +307,56 @@ pub fn install_service(exe_path: &std::path::Path) -> Result<()> { Ok(()) } +/// Create the service, retrying briefly if the SCM still has the prior instance +/// "marked for deletion" (SPEC-018 finding L1). +/// +/// When a service is deleted, the SCM only removes it from its database once every +/// open handle to it closes; until then a fresh `CreateService` fails with +/// `ERROR_SERVICE_MARKED_FOR_DELETE` (1072). The previous implementation papered +/// over this with a fixed 2s sleep after `delete()`, which is both slower than +/// necessary in the common case and still racy on a busy box. Instead we attempt +/// the create immediately and, only if we just deleted an existing instance and +/// hit 1072, retry a few times with short backoff — succeeding as soon as the SCM +/// finishes the removal, and giving up with the real error if it never does. +/// +/// The retry is gated on `deleted_existing`: on a clean first install there was no +/// prior instance, so a 1072 there is unexpected and is surfaced immediately +/// rather than masked by retries. +fn create_service_with_retry( + manager: &ServiceManager, + service_info: &ServiceInfo, + deleted_existing: bool, +) -> Result { + // ERROR_SERVICE_MARKED_FOR_DELETE (winerror.h). The service is gone from the + // caller's perspective but the SCM has not finished reaping it. + const ERROR_SERVICE_MARKED_FOR_DELETE: i32 = 1072; + // Bounded: ~5 attempts over ~2s total worst case (matches the old fixed sleep + // ceiling) but returns the instant the SCM is ready. + const MAX_ATTEMPTS: u32 = 5; + const BACKOFF: Duration = Duration::from_millis(400); + + let mut attempt = 0; + loop { + attempt += 1; + match manager.create_service(service_info, ServiceAccess::CHANGE_CONFIG) { + Ok(service) => return Ok(service), + Err(windows_service::Error::Winapi(ref io_err)) + if deleted_existing + && io_err.raw_os_error() == Some(ERROR_SERVICE_MARKED_FOR_DELETE) + && attempt < MAX_ATTEMPTS => + { + warn!( + "{SERVICE_NAME} still marked for deletion by the SCM \ + (attempt {attempt}/{MAX_ATTEMPTS}); retrying in {}ms", + BACKOFF.as_millis() + ); + std::thread::sleep(BACKOFF); + } + Err(e) => return Err(e), + } + } +} + /// Configure SCM crash-recovery so the service restarts on unexpected exit. /// /// `windows-service` 0.7 does not expose `ChangeServiceConfig2` recovery actions @@ -429,6 +486,12 @@ mod tests { /// `service-run` subcommand `main.rs` dispatches into [`run_dispatcher`]; a /// mismatch would register a service the SCM could start but that would fall /// through to normal (non-service) mode and immediately exit. + /// + /// This pins the value of the constant itself. The companion test + /// `tests::service_run_subcommand_matches_scm_launch_arg` in `main.rs` pins the + /// other half — that the clap `#[command(name = "service-run")]` attribute on + /// `Commands::ServiceRun` resolves to this same constant — so the two string + /// literals cannot silently drift apart. #[test] fn service_run_arg_matches_subcommand_name() { assert_eq!(SERVICE_RUN_ARG, "service-run"); diff --git a/agent/src/session/mod.rs b/agent/src/session/mod.rs index 5e40185..16cea1f 100644 --- a/agent/src/session/mod.rs +++ b/agent/src/session/mod.rs @@ -41,8 +41,18 @@ use crate::proto::{message, AgentStatus, ChatMessage, Heartbeat, HeartbeatAck, M use crate::transport::WebSocketTransport; use crate::tray::{TrayAction, TrayController}; use anyhow::Result; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; use std::time::{Duration, Instant}; +/// Sentinel error string returned by [`SessionManager::run_with_tray`] when the +/// loop breaks because the SCM asked the managed-agent service to stop (SPEC-018, +/// finding H). The outer `run_agent` loop matches on this to treat the exit as a +/// graceful service stop (clean WS close, no reconnect) rather than a session +/// error. Only the service path passes a shutdown flag, so only the service path +/// can ever produce this. +pub const SERVICE_STOP_SENTINEL: &str = "SERVICE_STOP"; + // Heartbeat interval (30 seconds) const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(30); // Status report interval (60 seconds) @@ -285,16 +295,34 @@ impl SessionManager { Ok(()) } - /// Run the session main loop with tray and chat event processing + /// Run the session main loop with tray and chat event processing. + /// + /// `service_shutdown` (SPEC-018 finding H) is the SCM cooperative-stop flag. + /// It is `Some(flag)` ONLY on the managed-agent service path; the + /// attended/viewer/interactive callers pass `None` and behave EXACTLY as + /// before. When present, the flag is polled on every idle tick (the natural + /// ~100ms seam below) so an SCM Stop/Shutdown received while CONNECTED breaks + /// this inner loop promptly — instead of only being observed by the outer + /// `run_agent` reconnect loop, which never runs while a session is connected. + /// On a set flag the loop closes the WebSocket cleanly (via the shared exit + /// path at the bottom) and returns the [`SERVICE_STOP_SENTINEL`] error, which + /// the outer loop maps to a graceful stop. pub async fn run_with_tray( &mut self, tray: Option<&TrayController>, chat: Option<&ChatController>, + service_shutdown: Option<&Arc>, ) -> Result<()> { if self.transport.is_none() { anyhow::bail!("Not connected"); } + // Helper: has the SCM asked the service to stop? Always false off the + // service path (where `service_shutdown` is `None`). + let stop_requested = |flag: Option<&Arc>| -> bool { + flag.is_some_and(|f| f.load(Ordering::SeqCst)) + }; + // Send initial status self.send_status().await?; @@ -307,6 +335,29 @@ impl SessionManager { // Main loop loop { + // SPEC-018 (finding H): honour an SCM stop request received while the + // session is CONNECTED. The outer `run_agent` loop only observes the + // flag between connection attempts, but a managed agent spends its + // entire connected life inside THIS loop — so without this check an + // SCM Stop while connected would not break out until the connection + // dropped on its own. Breaking here falls through to the shared exit + // path below, which closes the transport cleanly (clean WS close); + // the sentinel tells the outer loop this was a graceful stop. + if stop_requested(service_shutdown) { + tracing::info!("Service stop requested; ending connected session loop"); + self.release_streaming(); + self.state = SessionState::Disconnected; + if let Some(transport) = self.transport.as_mut() { + // Best-effort clean WebSocket close (sends a Close frame). A + // failure here just means the peer/socket is already gone; the + // service still stops cleanly. + if let Err(e) = transport.close().await { + tracing::warn!("error during clean WebSocket close on service stop: {}", e); + } + } + return Err(anyhow::anyhow!(SERVICE_STOP_SENTINEL)); + } + // Process tray events if let Some(t) = tray { if let Some(action) = t.process_events() { @@ -745,3 +796,47 @@ impl SessionManager { Ok(()) } } + +#[cfg(test)] +mod tests { + use super::*; + + /// SPEC-018 finding H: the connected-stop contract. When the SCM sets the + /// shutdown flag, `run_with_tray` returns an error whose message contains + /// [`SERVICE_STOP_SENTINEL`]; the outer `run_agent` loop recognises a graceful + /// stop with `error_msg.contains(SERVICE_STOP_SENTINEL)`. This pins that the + /// error the loop constructs on stop actually satisfies that match — so the + /// two halves (producer here, consumer in `main.rs`) cannot drift. + /// + /// A full end-to-end test of the in-loop interrupt would need a live connected + /// transport (a real or mocked server), which is an integration concern; this + /// unit test instead pins the wire contract the interrupt relies on. + #[test] + fn service_stop_sentinel_is_matched_by_outer_loop_check() { + let produced = anyhow::anyhow!(SERVICE_STOP_SENTINEL); + assert!( + produced.to_string().contains(SERVICE_STOP_SENTINEL), + "the stop error must contain the sentinel the outer loop matches on" + ); + assert!( + !SERVICE_STOP_SENTINEL.is_empty(), + "the sentinel must be a non-empty, distinctive token" + ); + } + + /// The shutdown-flag check is a no-op (always `false`) when no flag is passed, + /// i.e. on the attended/viewer/interactive paths — guaranteeing the new + /// parameter is a pure addition that cannot alter non-service behaviour + /// (SPEC-018 finding H: "no regression"). + #[test] + fn no_shutdown_flag_never_requests_stop() { + let none: Option<&Arc> = None; + let check = |flag: Option<&Arc>| flag.is_some_and(|f| f.load(Ordering::SeqCst)); + assert!(!check(none)); + + let set = Arc::new(AtomicBool::new(true)); + assert!(check(Some(&set))); + let unset = Arc::new(AtomicBool::new(false)); + assert!(!check(Some(&unset))); + } +}