Merge pull request 'SPEC-018 Phase 1: managed agent as LocalSystem service host' (#7) from feat/spec-018-service-host into main
This commit was merged in pull request #7.
This commit is contained in:
@@ -290,6 +290,18 @@ pub fn install(force_user_install: bool) -> Result<()> {
|
|||||||
// Register protocol handler
|
// Register protocol handler
|
||||||
register_protocol_handler(elevated)?;
|
register_protocol_handler(elevated)?;
|
||||||
|
|
||||||
|
// SPEC-018: a MANAGED install (embedded config => persistent agent) installs
|
||||||
|
// the LocalSystem service as its single autostart and removes the per-user
|
||||||
|
// HKCU\…\Run entry. Attended (support-code) and viewer installs are untouched:
|
||||||
|
// they have no embedded config and continue to use the HKCU Run / protocol
|
||||||
|
// handler paths exactly as before.
|
||||||
|
#[cfg(windows)]
|
||||||
|
{
|
||||||
|
if crate::config::Config::has_embedded_config() {
|
||||||
|
install_managed_service(&exe_path)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
info!("Installation complete!");
|
info!("Installation complete!");
|
||||||
if elevated {
|
if elevated {
|
||||||
info!("Installed system-wide to: {}", install_path.display());
|
info!("Installed system-wide to: {}", install_path.display());
|
||||||
@@ -300,6 +312,64 @@ pub fn install(force_user_install: bool) -> Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// SPEC-018: install the managed agent as a LocalSystem service and swap out the
|
||||||
|
/// legacy per-user `HKCU\…\Run` autostart so the service is the single managed
|
||||||
|
/// autostart (no double-run).
|
||||||
|
///
|
||||||
|
/// Installing a LocalSystem service requires Administrator. If the SCM rejects the
|
||||||
|
/// create (not elevated), we surface the error rather than silently leaving the
|
||||||
|
/// machine with no managed autostart — a managed deployment is expected to run the
|
||||||
|
/// install elevated. The HKCU Run entry is removed best-effort regardless.
|
||||||
|
#[cfg(windows)]
|
||||||
|
pub fn install_managed_service(exe_path: &std::path::Path) -> Result<()> {
|
||||||
|
info!("Managed install: registering LocalSystem service (SPEC-018)");
|
||||||
|
|
||||||
|
crate::service::install_service(exe_path)
|
||||||
|
.map_err(|e| anyhow!("failed to install the managed agent service: {e:#}"))?;
|
||||||
|
|
||||||
|
// Start the service now so the agent comes up immediately on first install
|
||||||
|
// rather than only on the next boot. Best-effort: the service is auto-start, so
|
||||||
|
// a transient start failure still self-heals on reboot.
|
||||||
|
if let Err(e) = crate::service::start_service() {
|
||||||
|
warn!(
|
||||||
|
"managed service installed but did not start now ({e:#}); \
|
||||||
|
it is auto-start and will run on next boot"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove the legacy per-user autostart so the agent does not also launch in the
|
||||||
|
// user's session (which would double-run alongside the service).
|
||||||
|
if let Err(e) = crate::startup::remove_from_startup() {
|
||||||
|
warn!(
|
||||||
|
"managed service installed, but failed to remove the legacy HKCU Run \
|
||||||
|
autostart (harmless if it was never present): {}",
|
||||||
|
e
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
info!("removed legacy HKCU Run autostart (service is now the managed autostart)");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// SPEC-018: remove the managed agent service and any legacy HKCU Run autostart.
|
||||||
|
/// Idempotent — succeeds if neither is present.
|
||||||
|
#[cfg(windows)]
|
||||||
|
pub fn uninstall_managed_service() -> Result<()> {
|
||||||
|
info!("Managed uninstall: removing LocalSystem service (SPEC-018)");
|
||||||
|
|
||||||
|
// Best-effort removal of the legacy autostart first (cheap, no SCM).
|
||||||
|
if let Err(e) = crate::startup::remove_from_startup() {
|
||||||
|
warn!(
|
||||||
|
"failed to remove legacy HKCU Run autostart during uninstall: {}",
|
||||||
|
e
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
crate::service::uninstall_service()
|
||||||
|
.map_err(|e| anyhow!("failed to uninstall the managed agent service: {e:#}"))
|
||||||
|
}
|
||||||
|
|
||||||
/// Check if the guruconnect:// protocol handler is registered
|
/// Check if the guruconnect:// protocol handler is registered
|
||||||
#[cfg(windows)]
|
#[cfg(windows)]
|
||||||
pub fn is_protocol_handler_registered() -> bool {
|
pub fn is_protocol_handler_registered() -> bool {
|
||||||
|
|||||||
@@ -24,6 +24,8 @@ mod identity;
|
|||||||
mod input;
|
mod input;
|
||||||
mod install;
|
mod install;
|
||||||
mod sas_client;
|
mod sas_client;
|
||||||
|
#[cfg(windows)]
|
||||||
|
mod service;
|
||||||
mod session;
|
mod session;
|
||||||
mod startup;
|
mod startup;
|
||||||
mod transport;
|
mod transport;
|
||||||
@@ -182,6 +184,12 @@ enum Commands {
|
|||||||
/// Show detailed version and build information
|
/// Show detailed version and build information
|
||||||
#[command(name = "version-info")]
|
#[command(name = "version-info")]
|
||||||
VersionInfo,
|
VersionInfo,
|
||||||
|
|
||||||
|
/// Internal: entry point invoked by the Windows Service Control Manager to run
|
||||||
|
/// the managed agent as a LocalSystem service (SPEC-018). Not for interactive
|
||||||
|
/// use — running it by hand fails because there is no controlling SCM.
|
||||||
|
#[command(name = "service-run", hide = true)]
|
||||||
|
ServiceRun,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
@@ -226,7 +234,24 @@ fn main() -> Result<()> {
|
|||||||
Some(Commands::Install {
|
Some(Commands::Install {
|
||||||
user_only,
|
user_only,
|
||||||
elevated,
|
elevated,
|
||||||
}) => run_install(user_only || elevated),
|
}) => {
|
||||||
|
// `run_install`'s parameter is `force_user_install` — when true it
|
||||||
|
// skips the UAC re-elevation attempt and installs in-place with
|
||||||
|
// whatever rights this process already has.
|
||||||
|
//
|
||||||
|
// - `user_only`: the user explicitly asked for a per-user install;
|
||||||
|
// honour it directly.
|
||||||
|
// - `elevated`: this is the internal, already-elevated re-exec spawned
|
||||||
|
// by `try_elevate_and_install` ("install --elevated"). It must NOT
|
||||||
|
// attempt to elevate AGAIN (that would loop / re-prompt), so we pass
|
||||||
|
// force=true here too. This is correct even though it routes through
|
||||||
|
// the "user install" parameter, because the re-exec genuinely runs
|
||||||
|
// elevated: `is_elevated()` returns true inside `install()`, so the
|
||||||
|
// path resolves to Program Files and the LocalSystem service installs
|
||||||
|
// normally. The flag only suppresses re-elevation; it does not force a
|
||||||
|
// per-user (non-elevated) install when we are already elevated.
|
||||||
|
run_install(user_only || elevated)
|
||||||
|
}
|
||||||
Some(Commands::Uninstall) => run_uninstall(),
|
Some(Commands::Uninstall) => run_uninstall(),
|
||||||
Some(Commands::Launch { url }) => run_launch(&url),
|
Some(Commands::Launch { url }) => run_launch(&url),
|
||||||
Some(Commands::VersionInfo) => {
|
Some(Commands::VersionInfo) => {
|
||||||
@@ -236,6 +261,21 @@ fn main() -> Result<()> {
|
|||||||
println!("{}", build_info::full_version());
|
println!("{}", build_info::full_version());
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
Some(Commands::ServiceRun) => {
|
||||||
|
// SPEC-018 Phase 1: SCM-invoked entry. Hand off to the service
|
||||||
|
// dispatcher, which calls back into the control loop and runs the
|
||||||
|
// managed-agent logic as SYSTEM. Blocks until the service stops.
|
||||||
|
#[cfg(windows)]
|
||||||
|
{
|
||||||
|
service::run_dispatcher()
|
||||||
|
}
|
||||||
|
#[cfg(not(windows))]
|
||||||
|
{
|
||||||
|
Err(anyhow::anyhow!(
|
||||||
|
"service-run is a Windows-only entry point (SPEC-018)"
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
None => {
|
None => {
|
||||||
// No subcommand - detect mode from filename or embedded config
|
// No subcommand - detect mode from filename or embedded config
|
||||||
// Legacy: if support_code arg provided, use that
|
// Legacy: if support_code arg provided, use that
|
||||||
@@ -264,10 +304,24 @@ fn main() -> Result<()> {
|
|||||||
run_agent_mode(Some(code))
|
run_agent_mode(Some(code))
|
||||||
}
|
}
|
||||||
RunMode::PermanentAgent => {
|
RunMode::PermanentAgent => {
|
||||||
// Embedded config found - run as permanent agent
|
// Embedded config found - managed/persistent agent.
|
||||||
info!("Permanent agent mode detected (embedded config)");
|
info!("Permanent agent mode detected (embedded config)");
|
||||||
|
|
||||||
|
// SPEC-018: managed mode runs as the LocalSystem service, not as
|
||||||
|
// an interactive process. The service is the single autostart.
|
||||||
|
// - If the service is already installed, the service is (or
|
||||||
|
// will be) running the agent — this interactive invocation
|
||||||
|
// must NOT spawn a second agent. Exit quietly.
|
||||||
|
// - On first run, install (which installs + starts the service
|
||||||
|
// and removes the legacy HKCU Run entry), then exit and let
|
||||||
|
// the service carry the agent as SYSTEM.
|
||||||
|
#[cfg(windows)]
|
||||||
|
{
|
||||||
|
run_permanent_agent_managed()
|
||||||
|
}
|
||||||
|
#[cfg(not(windows))]
|
||||||
|
{
|
||||||
if !install::is_protocol_handler_registered() {
|
if !install::is_protocol_handler_registered() {
|
||||||
// First run - install then run as agent
|
|
||||||
info!("First run - installing agent");
|
info!("First run - installing agent");
|
||||||
if let Err(e) = install::install(false) {
|
if let Err(e) = install::install(false) {
|
||||||
warn!("Installation failed: {}", e);
|
warn!("Installation failed: {}", e);
|
||||||
@@ -275,6 +329,7 @@ fn main() -> Result<()> {
|
|||||||
}
|
}
|
||||||
run_agent_mode(None)
|
run_agent_mode(None)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
RunMode::Default => {
|
RunMode::Default => {
|
||||||
// No special mode detected - use legacy logic
|
// No special mode detected - use legacy logic
|
||||||
if !install::is_protocol_handler_registered() {
|
if !install::is_protocol_handler_registered() {
|
||||||
@@ -333,10 +388,118 @@ fn run_agent_mode(support_code: Option<String>) -> Result<()> {
|
|||||||
if config.support_code.is_none() {
|
if config.support_code.is_none() {
|
||||||
resolve_agent_credential(&mut config).await?;
|
resolve_agent_credential(&mut config).await?;
|
||||||
}
|
}
|
||||||
run_agent(config).await
|
run_agent(config, None).await
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// SPEC-018 Phase 1: run the managed/persistent agent as the LocalSystem service.
|
||||||
|
///
|
||||||
|
/// Invoked from the service control loop ([`service::run_service`]) once the
|
||||||
|
/// service has reported `Running`. This is the same persistent-agent logic as
|
||||||
|
/// [`run_agent_mode`] (load config, resolve/enroll the per-machine `cak_` per
|
||||||
|
/// SPEC-016, hold the relay connection) — but it runs **as SYSTEM**, so the
|
||||||
|
/// SYSTEM-ACL'd `cak_` store is finally readable in-context, and it observes the
|
||||||
|
/// SCM `shutdown` flag for a graceful stop.
|
||||||
|
///
|
||||||
|
/// Returns `Ok(())` when the agent loop exits because a stop was requested, and
|
||||||
|
/// `Err` only on an unrecoverable *local* fault (e.g. no usable credential and no
|
||||||
|
/// enrollment material) — network errors are retried inside the loop and never
|
||||||
|
/// surface here.
|
||||||
|
///
|
||||||
|
/// Phase 2 seam: this is where the session broker is wired in — the runtime
|
||||||
|
/// started here will own the broker that spawns the per-session capture/input
|
||||||
|
/// worker (`CreateProcessAsUserW`) and the IPC server. Phase 1 connects/enrolls
|
||||||
|
/// only; it does not capture a desktop (a Session-0 SYSTEM process cannot).
|
||||||
|
#[cfg(windows)]
|
||||||
|
pub fn run_managed_agent_service(
|
||||||
|
shutdown: std::sync::Arc<std::sync::atomic::AtomicBool>,
|
||||||
|
) -> Result<()> {
|
||||||
|
info!("Loading managed-agent configuration (running as SYSTEM)");
|
||||||
|
|
||||||
|
let mut config = config::Config::load()?;
|
||||||
|
// The service ONLY ever runs the managed/persistent path. A support session is
|
||||||
|
// an interactive, user-launched flow and must never be carried by the service.
|
||||||
|
config.support_code = None;
|
||||||
|
|
||||||
|
info!("Server: {}", config.server_url);
|
||||||
|
if let Some(ref company) = config.company {
|
||||||
|
info!("Company: {}", company);
|
||||||
|
}
|
||||||
|
if let Some(ref site) = config.site {
|
||||||
|
info!("Site: {}", site);
|
||||||
|
}
|
||||||
|
|
||||||
|
let rt = tokio::runtime::Runtime::new()?;
|
||||||
|
|
||||||
|
// SPEC-018 (finding M): this future runs across the `extern "system"` service
|
||||||
|
// entry point (ffi_service_main -> service_main -> run_service -> here). A
|
||||||
|
// panic that unwound across that FFI boundary is undefined behaviour (the C
|
||||||
|
// ABI cannot carry a Rust unwind) and would abort the process instead of
|
||||||
|
// taking the intended ServiceSpecific(1) fault path. Catch it here and convert
|
||||||
|
// it into an `Err`, which `run_service` maps to ServiceExitCode::ServiceSpecific(1)
|
||||||
|
// so the SCM applies its configured recovery (restart) cleanly. `Running` is
|
||||||
|
// already reported before we get here, so a fault does not strand StartPending.
|
||||||
|
let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
|
||||||
|
rt.block_on(async move {
|
||||||
|
// SPEC-016 Phase B: resolve the operating credential before connecting.
|
||||||
|
// Running as SYSTEM, the SYSTEM+Administrators-ACL'd cak_ store is now
|
||||||
|
// readable in-context, so the Phase B fail-fast guard is not hit on this
|
||||||
|
// path (it remains as a safety net for any non-SYSTEM invocation).
|
||||||
|
resolve_agent_credential(&mut config).await?;
|
||||||
|
run_agent(config, Some(shutdown)).await
|
||||||
|
})
|
||||||
|
}));
|
||||||
|
|
||||||
|
match outcome {
|
||||||
|
Ok(result) => result,
|
||||||
|
Err(panic) => {
|
||||||
|
// Recover a human-readable message from the panic payload for the log;
|
||||||
|
// do not re-panic (that would unwind across the FFI boundary again).
|
||||||
|
let detail = panic
|
||||||
|
.downcast_ref::<&str>()
|
||||||
|
.map(|s| s.to_string())
|
||||||
|
.or_else(|| panic.downcast_ref::<String>().cloned())
|
||||||
|
.unwrap_or_else(|| "non-string panic payload".to_string());
|
||||||
|
error!("managed-agent runtime panicked: {detail}");
|
||||||
|
Err(anyhow::anyhow!("managed-agent runtime panicked: {detail}"))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// SPEC-018 Phase 1: handle an interactive launch of a MANAGED agent binary (one
|
||||||
|
/// carrying embedded config, detected as [`config::RunMode::PermanentAgent`]).
|
||||||
|
///
|
||||||
|
/// Managed mode runs as the LocalSystem service, never as an interactive process:
|
||||||
|
/// - If the service is already installed, the service is (or will be) running
|
||||||
|
/// the agent as SYSTEM, so this interactive invocation must NOT spawn a second
|
||||||
|
/// agent — it exits quietly.
|
||||||
|
/// - On first run, install (which installs + starts the service and removes the
|
||||||
|
/// legacy `HKCU\…\Run` autostart), then exit and let the service carry the
|
||||||
|
/// agent. If the service install fails (e.g. not elevated), fall back to
|
||||||
|
/// running the agent in-process for this run so the machine is not left with no
|
||||||
|
/// agent at all.
|
||||||
|
#[cfg(windows)]
|
||||||
|
fn run_permanent_agent_managed() -> Result<()> {
|
||||||
|
if service::is_service_installed() {
|
||||||
|
info!(
|
||||||
|
"Managed service already installed; the service runs the agent as SYSTEM — \
|
||||||
|
this interactive instance has nothing to do"
|
||||||
|
);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("First run - installing managed agent service");
|
||||||
|
if let Err(e) = install::install(false) {
|
||||||
|
warn!(
|
||||||
|
"Managed service install failed ({e:#}); falling back to in-process agent for this run"
|
||||||
|
);
|
||||||
|
return run_agent_mode(None);
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("Managed agent service installed; handing off to the service");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Resolve the per-machine operating credential for a managed agent (SPEC-016
|
/// Resolve the per-machine operating credential for a managed agent (SPEC-016
|
||||||
/// Phase B, run-mode wiring).
|
/// Phase B, run-mode wiring).
|
||||||
///
|
///
|
||||||
@@ -372,9 +535,13 @@ async fn resolve_agent_credential(config: &mut config::Config) -> Result<()> {
|
|||||||
// do NOT silently re-enroll. The SYSTEM+Administrators ACL is correct
|
// do NOT silently re-enroll. The SYSTEM+Administrators ACL is correct
|
||||||
// for the target (Option A) and is deliberately kept.
|
// for the target (Option A) and is deliberately kept.
|
||||||
//
|
//
|
||||||
// NOTE: this guard is satisfied/removed once the GuruConnect SYSTEM
|
// SPEC-018 (this spec): the managed agent now runs as the GuruConnect
|
||||||
// service host lands (separate spec, SPEC-018) and the agent always
|
// SYSTEM service ([`run_managed_agent_service`]), so on the production
|
||||||
// runs as SYSTEM — at which point the store is always readable.
|
// managed path the store IS readable in-context and this branch is NOT
|
||||||
|
// hit. The guard is intentionally retained as a harmless safety net for
|
||||||
|
// any non-SYSTEM invocation (e.g. someone running the managed binary
|
||||||
|
// interactively): it still fails fast with an actionable message rather
|
||||||
|
// than bricking. Do NOT remove it in Phase 1.
|
||||||
Err(LoadCakError::Io {
|
Err(LoadCakError::Io {
|
||||||
permission_denied: true,
|
permission_denied: true,
|
||||||
source,
|
source,
|
||||||
@@ -484,7 +651,22 @@ fn run_install(force_user_install: bool) -> Result<()> {
|
|||||||
fn run_uninstall() -> Result<()> {
|
fn run_uninstall() -> Result<()> {
|
||||||
info!("Uninstalling GuruConnect...");
|
info!("Uninstalling GuruConnect...");
|
||||||
|
|
||||||
// Remove from startup
|
// SPEC-018: remove the managed LocalSystem service and the legacy HKCU Run
|
||||||
|
// autostart. Idempotent — no error if the service was never installed (an
|
||||||
|
// attended/viewer install has no service), so this is safe for every install
|
||||||
|
// shape. Requires Administrator to delete the service; a non-elevated uninstall
|
||||||
|
// still clears the per-user autostart below.
|
||||||
|
#[cfg(windows)]
|
||||||
|
{
|
||||||
|
if let Err(e) = install::uninstall_managed_service() {
|
||||||
|
warn!(
|
||||||
|
"Failed to remove managed service (may require Administrator): {}",
|
||||||
|
e
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove from startup (covers non-elevated / attended / viewer installs).
|
||||||
if let Err(e) = startup::remove_from_startup() {
|
if let Err(e) = startup::remove_from_startup() {
|
||||||
warn!("Failed to remove from startup: {}", e);
|
warn!("Failed to remove from startup: {}", e);
|
||||||
}
|
}
|
||||||
@@ -582,20 +764,50 @@ fn cleanup_on_exit() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run the agent main loop
|
/// Run the agent main loop.
|
||||||
async fn run_agent(config: config::Config) -> Result<()> {
|
///
|
||||||
|
/// `service_shutdown`, when present, is the SCM cooperative-stop flag (SPEC-018):
|
||||||
|
/// the managed-agent service passes it so the loop exits promptly on
|
||||||
|
/// `Stop`/`Shutdown`. It is `None` for the interactive/user-launched paths, which
|
||||||
|
/// stop via the tray exit / server control messages instead.
|
||||||
|
async fn run_agent(
|
||||||
|
config: config::Config,
|
||||||
|
service_shutdown: Option<std::sync::Arc<std::sync::atomic::AtomicBool>>,
|
||||||
|
) -> Result<()> {
|
||||||
|
use std::sync::atomic::Ordering;
|
||||||
|
|
||||||
let elevated = install::is_elevated();
|
let elevated = install::is_elevated();
|
||||||
|
let running_as_service = service_shutdown.is_some();
|
||||||
let mut session = session::SessionManager::new(config.clone(), elevated);
|
let mut session = session::SessionManager::new(config.clone(), elevated);
|
||||||
let is_support_session = config.support_code.is_some();
|
let is_support_session = config.support_code.is_some();
|
||||||
let hostname = config.hostname();
|
let hostname = config.hostname();
|
||||||
|
|
||||||
// Add to startup
|
// Helper: has the SCM asked us to stop?
|
||||||
if let Err(e) = startup::add_to_startup() {
|
let stop_requested = |flag: &Option<std::sync::Arc<std::sync::atomic::AtomicBool>>| -> bool {
|
||||||
|
flag.as_ref()
|
||||||
|
.map(|f| f.load(Ordering::SeqCst))
|
||||||
|
.unwrap_or(false)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Autostart persistence:
|
||||||
|
// - As the SYSTEM service (SPEC-018), the SERVICE itself is the managed
|
||||||
|
// autostart — do NOT write the per-user HKCU\…\Run entry (that would be a
|
||||||
|
// second, redundant autostart, and writing it from SYSTEM lands in the
|
||||||
|
// wrong hive). The service install/uninstall owns lifecycle.
|
||||||
|
// - Interactive/user-launched runs keep the existing HKCU Run behavior.
|
||||||
|
if running_as_service {
|
||||||
|
info!("Running as the GuruConnect SYSTEM service; service is the autostart (skipping HKCU Run)");
|
||||||
|
} else if let Err(e) = startup::add_to_startup() {
|
||||||
warn!("Failed to add to startup: {}", e);
|
warn!("Failed to add to startup: {}", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create tray icon
|
// A Session-0 SYSTEM service has no interactive desktop, so a tray icon is
|
||||||
let tray = match tray::TrayController::new(
|
// both impossible and meaningless there (SPEC-018 Phase 2 moves the user-facing
|
||||||
|
// surface into the per-session worker). Only create the tray off the service.
|
||||||
|
let tray = if running_as_service {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
match tray::TrayController::new(
|
||||||
&hostname,
|
&hostname,
|
||||||
config.support_code.as_deref(),
|
config.support_code.as_deref(),
|
||||||
is_support_session,
|
is_support_session,
|
||||||
@@ -608,6 +820,7 @@ async fn run_agent(config: config::Config) -> Result<()> {
|
|||||||
warn!("Failed to create tray icon: {}", e);
|
warn!("Failed to create tray icon: {}", e);
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Create chat controller
|
// Create chat controller
|
||||||
@@ -615,6 +828,12 @@ async fn run_agent(config: config::Config) -> Result<()> {
|
|||||||
|
|
||||||
// Connect to server and run main loop
|
// Connect to server and run main loop
|
||||||
loop {
|
loop {
|
||||||
|
// SPEC-018: honour an SCM stop request before (re)connecting.
|
||||||
|
if stop_requested(&service_shutdown) {
|
||||||
|
info!("Service stop requested; exiting agent loop");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
info!("Connecting to server...");
|
info!("Connecting to server...");
|
||||||
|
|
||||||
if is_support_session {
|
if is_support_session {
|
||||||
@@ -636,11 +855,22 @@ async fn run_agent(config: config::Config) -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if let Err(e) = session
|
if let Err(e) = session
|
||||||
.run_with_tray(tray.as_ref(), chat_ctrl.as_ref())
|
.run_with_tray(tray.as_ref(), chat_ctrl.as_ref(), service_shutdown.as_ref())
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
let error_msg = e.to_string();
|
let error_msg = e.to_string();
|
||||||
|
|
||||||
|
// SPEC-018 (finding H): the connected session loop broke
|
||||||
|
// because the SCM asked the service to stop. The loop already
|
||||||
|
// closed the WebSocket cleanly; treat this as a graceful stop
|
||||||
|
// (no reconnect) so the service transitions StopPending ->
|
||||||
|
// Stopped. Only the service path can produce this (it is the
|
||||||
|
// only caller that passes a shutdown flag).
|
||||||
|
if error_msg.contains(session::SERVICE_STOP_SENTINEL) {
|
||||||
|
info!("Service stop requested during session; exiting agent loop");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
if error_msg.contains("USER_EXIT") {
|
if error_msg.contains("USER_EXIT") {
|
||||||
info!("Session ended by user");
|
info!("Session ended by user");
|
||||||
cleanup_on_exit();
|
cleanup_on_exit();
|
||||||
@@ -713,6 +943,47 @@ async fn run_agent(config: config::Config) -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
info!("Reconnecting in 5 seconds...");
|
info!("Reconnecting in 5 seconds...");
|
||||||
|
// SPEC-018: poll the SCM stop flag during the backoff so a service stop is
|
||||||
|
// honoured within ~250ms instead of waiting the full reconnect delay.
|
||||||
|
if service_shutdown.is_some() {
|
||||||
|
for _ in 0..20 {
|
||||||
|
if stop_requested(&service_shutdown) {
|
||||||
|
info!("Service stop requested during reconnect backoff; exiting agent loop");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
tokio::time::sleep(tokio::time::Duration::from_millis(250)).await;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;
|
tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use clap::CommandFactory;
|
||||||
|
|
||||||
|
/// SPEC-018 finding N1: pin the clap subcommand name to the constant the SCM
|
||||||
|
/// is registered with. The service is installed with `SERVICE_RUN_ARG` as its
|
||||||
|
/// launch argument; when the SCM starts it, clap must route that exact token
|
||||||
|
/// into [`Commands::ServiceRun`]. If the `#[command(name = "service-run")]`
|
||||||
|
/// attribute and the constant ever drift apart, the SCM would start the binary
|
||||||
|
/// but clap would fail to match the subcommand and the process would fall
|
||||||
|
/// through to default (non-service) mode and exit. Asserting against the live
|
||||||
|
/// clap metadata (not a second string literal) makes that drift impossible.
|
||||||
|
#[test]
|
||||||
|
#[cfg(windows)]
|
||||||
|
fn service_run_subcommand_matches_scm_launch_arg() {
|
||||||
|
let cmd = Cli::command();
|
||||||
|
let has_matching_subcommand = cmd
|
||||||
|
.get_subcommands()
|
||||||
|
.any(|sc| sc.get_name() == service::SERVICE_RUN_ARG);
|
||||||
|
assert!(
|
||||||
|
has_matching_subcommand,
|
||||||
|
"no clap subcommand named '{}' (the SCM launch arg); the ServiceRun \
|
||||||
|
#[command(name = ...)] attribute drifted from service::SERVICE_RUN_ARG",
|
||||||
|
service::SERVICE_RUN_ARG
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
520
agent/src/service/mod.rs
Normal file
520
agent/src/service/mod.rs
Normal file
@@ -0,0 +1,520 @@
|
|||||||
|
//! Windows SYSTEM service host for the managed GuruConnect agent (SPEC-018).
|
||||||
|
//!
|
||||||
|
//! # Phase 1 scope (this module)
|
||||||
|
//!
|
||||||
|
//! Phase 1 proves the *managed/persistent* agent can run as **LocalSystem** in
|
||||||
|
//! the isolated Session 0 across reboots and at the login screen:
|
||||||
|
//!
|
||||||
|
//! 1. Register the agent with the Service Control Manager (SCM) and run, when
|
||||||
|
//! started, the **existing persistent-agent logic** (`RunMode::PermanentAgent`
|
||||||
|
//! path) *as SYSTEM* — i.e. resolve/enroll the per-machine `cak_` (SPEC-016,
|
||||||
|
//! now readable because the SYSTEM-ACL'd store is in-context) and hold the
|
||||||
|
//! relay WSS connection.
|
||||||
|
//! 2. Report a correct service lifecycle to the SCM (`StartPending` ->
|
||||||
|
//! `Running` -> `StopPending` -> `Stopped`) and handle `Stop`/`Shutdown`
|
||||||
|
//! gracefully. The control handler sets a shared shutdown flag; the agent
|
||||||
|
//! runtime observes it both between reconnect attempts AND inside the
|
||||||
|
//! connected session loop (SPEC-018 finding H), so a stop received while a
|
||||||
|
//! session is live breaks out promptly, closes the WS connection cleanly,
|
||||||
|
//! and exits — rather than waiting for the SCM to force-kill.
|
||||||
|
//! 3. Provide install/uninstall of the service (LocalSystem, auto-start, crash
|
||||||
|
//! recovery) so managed mode uses the service as its single autostart
|
||||||
|
//! instead of the per-user `HKCU\…\Run` entry.
|
||||||
|
//!
|
||||||
|
//! # Phase 2 (deliberately NOT built here — see SPEC-018 §Scope)
|
||||||
|
//!
|
||||||
|
//! A SYSTEM service lives in Session 0 and **cannot** capture or inject the
|
||||||
|
//! interactive desktop directly. Phase 1 therefore enrolls and connects but does
|
||||||
|
//! **NOT** capture a desktop yet. The following are Phase 2 and are intentionally
|
||||||
|
//! absent; the seams where they attach are called out inline below:
|
||||||
|
//!
|
||||||
|
//! - the **session broker** (`WTSEnumerateSessionsW` /
|
||||||
|
//! `WTSGetActiveConsoleSessionId` / `WTSQueryUserToken`),
|
||||||
|
//! - the **per-session capture/input worker** spawned via `CreateProcessAsUserW`
|
||||||
|
//! into `winsta0\default`,
|
||||||
|
//! - **service <-> worker IPC** (the per-session ACL'd named pipe), and
|
||||||
|
//! - **`SERVICE_CONTROL_SESSIONCHANGE`** reaction (logon/logoff/console-connect
|
||||||
|
//! retarget).
|
||||||
|
//!
|
||||||
|
//! Phase 1 registers the control handler for `Stop`/`Shutdown`/`Interrogate`
|
||||||
|
//! only. When Phase 2 lands, the broker hangs off the same control handler
|
||||||
|
//! (adding `SESSIONCHANGE`) and off the same agent runtime started here.
|
||||||
|
|
||||||
|
#![cfg(windows)]
|
||||||
|
|
||||||
|
use std::ffi::OsString;
|
||||||
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
use tracing::{error, info, warn};
|
||||||
|
|
||||||
|
use windows_service::{
|
||||||
|
define_windows_service,
|
||||||
|
service::{
|
||||||
|
ServiceAccess, ServiceControl, ServiceControlAccept, ServiceErrorControl, ServiceExitCode,
|
||||||
|
ServiceInfo, ServiceStartType, ServiceState, ServiceStatus, ServiceType,
|
||||||
|
},
|
||||||
|
service_control_handler::{self, ServiceControlHandlerResult},
|
||||||
|
service_dispatcher,
|
||||||
|
service_manager::{ServiceManager, ServiceManagerAccess},
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Internal service name registered with the SCM (no spaces; used by `sc`,
|
||||||
|
/// `ServiceManager`, and the control handler).
|
||||||
|
pub const SERVICE_NAME: &str = "GuruConnectAgent";
|
||||||
|
|
||||||
|
/// Human-facing display name shown in `services.msc`.
|
||||||
|
pub const SERVICE_DISPLAY_NAME: &str = "GuruConnect Managed Agent";
|
||||||
|
|
||||||
|
/// Service description shown in `services.msc`.
|
||||||
|
pub const SERVICE_DESCRIPTION: &str =
|
||||||
|
"Runs the managed GuruConnect remote-support agent as LocalSystem so it is \
|
||||||
|
reachable at the login screen and across reboots (SPEC-018).";
|
||||||
|
|
||||||
|
/// Hidden subcommand the SCM invokes to enter the service control loop. The
|
||||||
|
/// service is registered with this as its launch argument (see [`install_service`]),
|
||||||
|
/// and `main.rs` routes it into [`run_dispatcher`].
|
||||||
|
pub const SERVICE_RUN_ARG: &str = "service-run";
|
||||||
|
|
||||||
|
/// Hint we give the SCM for how long start/stop transitions may take before it
|
||||||
|
/// should consider the service hung.
|
||||||
|
const TRANSITION_WAIT: Duration = Duration::from_secs(10);
|
||||||
|
|
||||||
|
// The `windows-service` dispatcher requires a `extern "system"` entry point with
|
||||||
|
// a fixed ABI; this macro generates `ffi_service_main`, which trampolines into
|
||||||
|
// our safe `service_main`.
|
||||||
|
define_windows_service!(ffi_service_main, service_main);
|
||||||
|
|
||||||
|
/// Enter the SCM dispatcher (called from `main.rs` for the `service-run`
|
||||||
|
/// subcommand). Blocks until the service stops. This must be invoked by the SCM,
|
||||||
|
/// not interactively — `service_dispatcher::start` fails with
|
||||||
|
/// `ERROR_FAILED_SERVICE_CONTROLLER_CONNECT` (1063) if there is no controlling
|
||||||
|
/// SCM, which is the expected outcome of running `guruconnect service-run` by hand.
|
||||||
|
pub fn run_dispatcher() -> Result<()> {
|
||||||
|
service_dispatcher::start(SERVICE_NAME, ffi_service_main)
|
||||||
|
.context("failed to connect to the service control dispatcher (must be started by the SCM)")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// SCM-invoked service body. Any error is logged; the function cannot return an
|
||||||
|
/// error to the SCM directly, so [`run_service`] reports a failed exit code on the
|
||||||
|
/// status handle before returning.
|
||||||
|
fn service_main(_arguments: Vec<OsString>) {
|
||||||
|
if let Err(e) = run_service() {
|
||||||
|
error!("service exited with error: {e:#}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Drive the full service lifecycle: register the control handler, report
|
||||||
|
/// `Running`, run the persistent agent until a stop is requested, then report
|
||||||
|
/// `Stopped`.
|
||||||
|
fn run_service() -> Result<()> {
|
||||||
|
info!("GuruConnect managed agent service starting (running as SYSTEM in session 0)");
|
||||||
|
|
||||||
|
// Cooperative shutdown flag flipped by the SCM control handler and observed by
|
||||||
|
// the agent runtime. `AtomicBool` keeps the handler closure trivially `Send`
|
||||||
|
// and avoids holding a lock inside an SCM callback.
|
||||||
|
let shutdown = Arc::new(AtomicBool::new(false));
|
||||||
|
let shutdown_for_handler = shutdown.clone();
|
||||||
|
|
||||||
|
let event_handler = move |control_event| -> ServiceControlHandlerResult {
|
||||||
|
match control_event {
|
||||||
|
// SPEC-018 Phase 1: graceful stop. Phase 2 adds
|
||||||
|
// `ServiceControl::SessionChange(_)` here to drive the session broker
|
||||||
|
// (retarget the capture/input worker on logon/logoff/console-connect);
|
||||||
|
// we intentionally do not accept SESSIONCHANGE yet.
|
||||||
|
ServiceControl::Stop | ServiceControl::Shutdown => {
|
||||||
|
info!("received {control_event:?}; signalling agent to shut down");
|
||||||
|
// Set the cooperative-stop flag. The agent runtime observes it on
|
||||||
|
// every idle tick of the connected session loop and between
|
||||||
|
// reconnect attempts (SPEC-018 finding H), so it breaks out and
|
||||||
|
// closes the WebSocket cleanly within ~100ms even if a session is
|
||||||
|
// currently connected.
|
||||||
|
shutdown_for_handler.store(true, Ordering::SeqCst);
|
||||||
|
ServiceControlHandlerResult::NoError
|
||||||
|
}
|
||||||
|
ServiceControl::Interrogate => ServiceControlHandlerResult::NoError,
|
||||||
|
_ => ServiceControlHandlerResult::NotImplemented,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let status_handle = service_control_handler::register(SERVICE_NAME, event_handler)
|
||||||
|
.context("failed to register the service control handler")?;
|
||||||
|
|
||||||
|
// Report StartPending while we spin up the runtime and connect.
|
||||||
|
set_status(
|
||||||
|
&status_handle,
|
||||||
|
ServiceState::StartPending,
|
||||||
|
ServiceControlAccept::empty(),
|
||||||
|
TRANSITION_WAIT,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Report Running and accept Stop + Shutdown. We report Running before the
|
||||||
|
// first connect attempt completes because the agent loop reconnects forever;
|
||||||
|
// "the service is up and trying" is the correct steady state, and blocking the
|
||||||
|
// SCM on the first relay handshake would risk a start timeout on a slow boot.
|
||||||
|
set_status(
|
||||||
|
&status_handle,
|
||||||
|
ServiceState::Running,
|
||||||
|
ServiceControlAccept::STOP | ServiceControlAccept::SHUTDOWN,
|
||||||
|
Duration::default(),
|
||||||
|
);
|
||||||
|
info!("service reported Running; entering managed-agent control loop");
|
||||||
|
|
||||||
|
// Run the existing persistent-agent logic as SYSTEM. This is the Phase 1
|
||||||
|
// payload: resolve/enroll the cak_ (SPEC-016) and hold the relay connection.
|
||||||
|
let run_result = crate::run_managed_agent_service(shutdown.clone());
|
||||||
|
|
||||||
|
if let Err(e) = &run_result {
|
||||||
|
// The agent loop only returns Err on an unrecoverable LOCAL fault (e.g. no
|
||||||
|
// usable credential and nothing to enroll with). Network errors are
|
||||||
|
// retried inside the loop and never surface here. Report the failure to
|
||||||
|
// the SCM so recovery actions (restart) engage.
|
||||||
|
error!("managed-agent control loop terminated with error: {e:#}");
|
||||||
|
} else {
|
||||||
|
info!("managed-agent control loop exited cleanly on stop request");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transition StopPending -> Stopped.
|
||||||
|
set_status(
|
||||||
|
&status_handle,
|
||||||
|
ServiceState::StopPending,
|
||||||
|
ServiceControlAccept::empty(),
|
||||||
|
TRANSITION_WAIT,
|
||||||
|
);
|
||||||
|
|
||||||
|
let exit_code = match run_result {
|
||||||
|
Ok(()) => ServiceExitCode::Win32(0),
|
||||||
|
// ERROR_SERVICE_SPECIFIC_ERROR-style: surface a non-zero service-specific
|
||||||
|
// code so the SCM treats the exit as a failure and applies recovery.
|
||||||
|
Err(_) => ServiceExitCode::ServiceSpecific(1),
|
||||||
|
};
|
||||||
|
|
||||||
|
set_status_with_exit(
|
||||||
|
&status_handle,
|
||||||
|
ServiceState::Stopped,
|
||||||
|
ServiceControlAccept::empty(),
|
||||||
|
Duration::default(),
|
||||||
|
exit_code,
|
||||||
|
);
|
||||||
|
info!("service reported Stopped");
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Report a status with a zero (success) exit code.
|
||||||
|
fn set_status(
|
||||||
|
handle: &service_control_handler::ServiceStatusHandle,
|
||||||
|
state: ServiceState,
|
||||||
|
accepted: ServiceControlAccept,
|
||||||
|
wait_hint: Duration,
|
||||||
|
) {
|
||||||
|
set_status_with_exit(
|
||||||
|
handle,
|
||||||
|
state,
|
||||||
|
accepted,
|
||||||
|
wait_hint,
|
||||||
|
ServiceExitCode::Win32(0),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Report a status to the SCM. A failure to report is logged (best-effort) — we
|
||||||
|
/// cannot do anything actionable about it and must not panic inside the service.
|
||||||
|
fn set_status_with_exit(
|
||||||
|
handle: &service_control_handler::ServiceStatusHandle,
|
||||||
|
state: ServiceState,
|
||||||
|
accepted: ServiceControlAccept,
|
||||||
|
wait_hint: Duration,
|
||||||
|
exit_code: ServiceExitCode,
|
||||||
|
) {
|
||||||
|
let status = ServiceStatus {
|
||||||
|
service_type: ServiceType::OWN_PROCESS,
|
||||||
|
current_state: state,
|
||||||
|
controls_accepted: accepted,
|
||||||
|
exit_code,
|
||||||
|
checkpoint: 0,
|
||||||
|
wait_hint,
|
||||||
|
process_id: None,
|
||||||
|
};
|
||||||
|
if let Err(e) = handle.set_service_status(status) {
|
||||||
|
warn!("failed to report service status {state:?} to the SCM: {e}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Install / uninstall (used by install.rs for managed mode)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Install (or reinstall) the managed agent as a LocalSystem auto-start service
|
||||||
|
/// pointing at `exe_path` with the [`SERVICE_RUN_ARG`] launch argument.
|
||||||
|
///
|
||||||
|
/// Idempotent: if the service already exists it is stopped and deleted first,
|
||||||
|
/// then recreated, so an upgrade picks up a new binary path / config. Configures
|
||||||
|
/// crash recovery (restart on failure) via `sc failure`.
|
||||||
|
///
|
||||||
|
/// Requires Administrator (SCM `CREATE_SERVICE`). Returns an error otherwise.
|
||||||
|
pub fn install_service(exe_path: &std::path::Path) -> Result<()> {
|
||||||
|
let manager = ServiceManager::local_computer(
|
||||||
|
None::<&str>,
|
||||||
|
ServiceManagerAccess::CONNECT | ServiceManagerAccess::CREATE_SERVICE,
|
||||||
|
)
|
||||||
|
.context("failed to connect to the Service Control Manager (run as Administrator)")?;
|
||||||
|
|
||||||
|
// Remove any prior installation so the binary path / args are refreshed.
|
||||||
|
let mut deleted_existing = false;
|
||||||
|
if let Ok(existing) = manager.open_service(
|
||||||
|
SERVICE_NAME,
|
||||||
|
ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE,
|
||||||
|
) {
|
||||||
|
info!("existing {SERVICE_NAME} service found; removing before reinstall");
|
||||||
|
stop_if_running(&existing);
|
||||||
|
existing
|
||||||
|
.delete()
|
||||||
|
.context("failed to delete the existing service before reinstall")?;
|
||||||
|
drop(existing);
|
||||||
|
deleted_existing = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
let service_info = ServiceInfo {
|
||||||
|
name: OsString::from(SERVICE_NAME),
|
||||||
|
display_name: OsString::from(SERVICE_DISPLAY_NAME),
|
||||||
|
service_type: ServiceType::OWN_PROCESS,
|
||||||
|
start_type: ServiceStartType::AutoStart,
|
||||||
|
error_control: ServiceErrorControl::Normal,
|
||||||
|
executable_path: exe_path.to_path_buf(),
|
||||||
|
launch_arguments: vec![OsString::from(SERVICE_RUN_ARG)],
|
||||||
|
dependencies: vec![],
|
||||||
|
// account_name: None => LocalSystem (the SPEC-018 requirement).
|
||||||
|
account_name: None,
|
||||||
|
account_password: None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let service = create_service_with_retry(&manager, &service_info, deleted_existing)
|
||||||
|
.context("failed to create the GuruConnect managed agent service")?;
|
||||||
|
|
||||||
|
service
|
||||||
|
.set_description(SERVICE_DESCRIPTION)
|
||||||
|
.context("failed to set the service description")?;
|
||||||
|
|
||||||
|
configure_recovery();
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"installed {SERVICE_NAME} (LocalSystem, auto-start) -> {} {}",
|
||||||
|
exe_path.display(),
|
||||||
|
SERVICE_RUN_ARG
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create the service, retrying briefly if the SCM still has the prior instance
|
||||||
|
/// "marked for deletion" (SPEC-018 finding L1).
|
||||||
|
///
|
||||||
|
/// When a service is deleted, the SCM only removes it from its database once every
|
||||||
|
/// open handle to it closes; until then a fresh `CreateService` fails with
|
||||||
|
/// `ERROR_SERVICE_MARKED_FOR_DELETE` (1072). The previous implementation papered
|
||||||
|
/// over this with a fixed 2s sleep after `delete()`, which is both slower than
|
||||||
|
/// necessary in the common case and still racy on a busy box. Instead we attempt
|
||||||
|
/// the create immediately and, only if we just deleted an existing instance and
|
||||||
|
/// hit 1072, retry a few times with short backoff — succeeding as soon as the SCM
|
||||||
|
/// finishes the removal, and giving up with the real error if it never does.
|
||||||
|
///
|
||||||
|
/// The retry is gated on `deleted_existing`: on a clean first install there was no
|
||||||
|
/// prior instance, so a 1072 there is unexpected and is surfaced immediately
|
||||||
|
/// rather than masked by retries.
|
||||||
|
fn create_service_with_retry(
|
||||||
|
manager: &ServiceManager,
|
||||||
|
service_info: &ServiceInfo,
|
||||||
|
deleted_existing: bool,
|
||||||
|
) -> Result<windows_service::service::Service, windows_service::Error> {
|
||||||
|
// ERROR_SERVICE_MARKED_FOR_DELETE (winerror.h). The service is gone from the
|
||||||
|
// caller's perspective but the SCM has not finished reaping it.
|
||||||
|
const ERROR_SERVICE_MARKED_FOR_DELETE: i32 = 1072;
|
||||||
|
// Bounded: ~5 attempts over ~2s total worst case (matches the old fixed sleep
|
||||||
|
// ceiling) but returns the instant the SCM is ready.
|
||||||
|
const MAX_ATTEMPTS: u32 = 5;
|
||||||
|
const BACKOFF: Duration = Duration::from_millis(400);
|
||||||
|
|
||||||
|
let mut attempt = 0;
|
||||||
|
loop {
|
||||||
|
attempt += 1;
|
||||||
|
match manager.create_service(service_info, ServiceAccess::CHANGE_CONFIG) {
|
||||||
|
Ok(service) => return Ok(service),
|
||||||
|
Err(windows_service::Error::Winapi(ref io_err))
|
||||||
|
if deleted_existing
|
||||||
|
&& io_err.raw_os_error() == Some(ERROR_SERVICE_MARKED_FOR_DELETE)
|
||||||
|
&& attempt < MAX_ATTEMPTS =>
|
||||||
|
{
|
||||||
|
warn!(
|
||||||
|
"{SERVICE_NAME} still marked for deletion by the SCM \
|
||||||
|
(attempt {attempt}/{MAX_ATTEMPTS}); retrying in {}ms",
|
||||||
|
BACKOFF.as_millis()
|
||||||
|
);
|
||||||
|
std::thread::sleep(BACKOFF);
|
||||||
|
}
|
||||||
|
Err(e) => return Err(e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Configure SCM crash-recovery so the service restarts on unexpected exit.
|
||||||
|
///
|
||||||
|
/// `windows-service` 0.7 does not expose `ChangeServiceConfig2` recovery actions
|
||||||
|
/// in a stable, ergonomic form, so we mirror the established pattern used by the
|
||||||
|
/// SAS service binary and shell out to `sc failure`. `reset=86400` clears the
|
||||||
|
/// failure count after a day; three `restart/5000` actions retry after 5s each.
|
||||||
|
fn configure_recovery() {
|
||||||
|
use std::os::windows::process::CommandExt;
|
||||||
|
const CREATE_NO_WINDOW: u32 = 0x0800_0000;
|
||||||
|
|
||||||
|
match std::process::Command::new("sc")
|
||||||
|
.args([
|
||||||
|
"failure",
|
||||||
|
SERVICE_NAME,
|
||||||
|
"reset=86400",
|
||||||
|
"actions=restart/5000/restart/5000/restart/5000",
|
||||||
|
])
|
||||||
|
.creation_flags(CREATE_NO_WINDOW)
|
||||||
|
.output()
|
||||||
|
{
|
||||||
|
Ok(out) if out.status.success() => {
|
||||||
|
info!("configured crash-recovery (restart) for {SERVICE_NAME}");
|
||||||
|
}
|
||||||
|
Ok(out) => {
|
||||||
|
warn!(
|
||||||
|
"could not configure crash-recovery for {SERVICE_NAME} (sc failure exit {:?}); \
|
||||||
|
the service will still run but will not auto-restart on crash",
|
||||||
|
out.status.code()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
warn!("could not invoke `sc failure` to set crash-recovery for {SERVICE_NAME}: {e}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stop (if running) and delete the managed agent service. Idempotent: succeeds
|
||||||
|
/// quietly if the service is not installed.
|
||||||
|
pub fn uninstall_service() -> Result<()> {
|
||||||
|
let manager = ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT)
|
||||||
|
.context("failed to connect to the Service Control Manager (run as Administrator)")?;
|
||||||
|
|
||||||
|
match manager.open_service(
|
||||||
|
SERVICE_NAME,
|
||||||
|
ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE,
|
||||||
|
) {
|
||||||
|
Ok(service) => {
|
||||||
|
stop_if_running(&service);
|
||||||
|
service
|
||||||
|
.delete()
|
||||||
|
.context("failed to delete the managed agent service")?;
|
||||||
|
info!("uninstalled {SERVICE_NAME} service");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
// Not installed — nothing to do (idempotent uninstall).
|
||||||
|
info!("{SERVICE_NAME} service is not installed; nothing to uninstall");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Start the managed agent service now (used right after a first-run install so
|
||||||
|
/// the agent comes up without waiting for the next boot). Best-effort: logs and
|
||||||
|
/// returns the SCM error if the start fails, but a failure is not fatal to install
|
||||||
|
/// because the service is auto-start and will come up on the next boot regardless.
|
||||||
|
pub fn start_service() -> Result<()> {
|
||||||
|
let manager = ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT)
|
||||||
|
.context("failed to connect to the Service Control Manager")?;
|
||||||
|
let service = manager
|
||||||
|
.open_service(
|
||||||
|
SERVICE_NAME,
|
||||||
|
ServiceAccess::START | ServiceAccess::QUERY_STATUS,
|
||||||
|
)
|
||||||
|
.context("failed to open the managed agent service to start it")?;
|
||||||
|
|
||||||
|
// If it is already running (e.g. reinstall-over-running), there is nothing to do.
|
||||||
|
if let Ok(status) = service.query_status() {
|
||||||
|
if status.current_state == ServiceState::Running
|
||||||
|
|| status.current_state == ServiceState::StartPending
|
||||||
|
{
|
||||||
|
info!("{SERVICE_NAME} is already running/starting");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
service
|
||||||
|
.start::<String>(&[])
|
||||||
|
.context("failed to start the managed agent service")?;
|
||||||
|
info!("started {SERVICE_NAME}");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Report whether the managed agent service is currently installed.
|
||||||
|
pub fn is_service_installed() -> bool {
|
||||||
|
match ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT) {
|
||||||
|
Ok(manager) => manager
|
||||||
|
.open_service(SERVICE_NAME, ServiceAccess::QUERY_STATUS)
|
||||||
|
.is_ok(),
|
||||||
|
Err(_) => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Best-effort stop of a service, waiting briefly for it to leave the running
|
||||||
|
/// state so a subsequent `delete` does not race an in-flight stop.
|
||||||
|
fn stop_if_running(service: &windows_service::service::Service) {
|
||||||
|
if let Ok(status) = service.query_status() {
|
||||||
|
if status.current_state != ServiceState::Stopped {
|
||||||
|
info!("stopping {SERVICE_NAME} before delete");
|
||||||
|
let _ = service.stop();
|
||||||
|
for _ in 0..10 {
|
||||||
|
std::thread::sleep(Duration::from_millis(500));
|
||||||
|
match service.query_status() {
|
||||||
|
Ok(s) if s.current_state == ServiceState::Stopped => break,
|
||||||
|
_ => continue,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
/// The launch argument the service is registered with MUST equal the hidden
|
||||||
|
/// `service-run` subcommand `main.rs` dispatches into [`run_dispatcher`]; a
|
||||||
|
/// mismatch would register a service the SCM could start but that would fall
|
||||||
|
/// through to normal (non-service) mode and immediately exit.
|
||||||
|
///
|
||||||
|
/// This pins the value of the constant itself. The companion test
|
||||||
|
/// `tests::service_run_subcommand_matches_scm_launch_arg` in `main.rs` pins the
|
||||||
|
/// other half — that the clap `#[command(name = "service-run")]` attribute on
|
||||||
|
/// `Commands::ServiceRun` resolves to this same constant — so the two string
|
||||||
|
/// literals cannot silently drift apart.
|
||||||
|
#[test]
|
||||||
|
fn service_run_arg_matches_subcommand_name() {
|
||||||
|
assert_eq!(SERVICE_RUN_ARG, "service-run");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Service identifiers are non-empty and the internal name carries no spaces
|
||||||
|
/// (the SCM key / `sc` argument must be a single token).
|
||||||
|
#[test]
|
||||||
|
fn service_identifiers_are_well_formed() {
|
||||||
|
assert!(!SERVICE_NAME.is_empty());
|
||||||
|
assert!(
|
||||||
|
!SERVICE_NAME.contains(char::is_whitespace),
|
||||||
|
"the SCM service name must be a single whitespace-free token"
|
||||||
|
);
|
||||||
|
assert!(!SERVICE_DISPLAY_NAME.is_empty());
|
||||||
|
assert!(!SERVICE_DESCRIPTION.is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `is_service_installed` must never panic regardless of elevation/SCM access;
|
||||||
|
/// on a dev workstation without the service installed it returns `false`. (We
|
||||||
|
/// do NOT install the service in tests — that is a VM/admin integration step.)
|
||||||
|
#[test]
|
||||||
|
fn is_service_installed_is_total() {
|
||||||
|
let _ = is_service_installed();
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -41,8 +41,18 @@ use crate::proto::{message, AgentStatus, ChatMessage, Heartbeat, HeartbeatAck, M
|
|||||||
use crate::transport::WebSocketTransport;
|
use crate::transport::WebSocketTransport;
|
||||||
use crate::tray::{TrayAction, TrayController};
|
use crate::tray::{TrayAction, TrayController};
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
|
/// Sentinel error string returned by [`SessionManager::run_with_tray`] when the
|
||||||
|
/// loop breaks because the SCM asked the managed-agent service to stop (SPEC-018,
|
||||||
|
/// finding H). The outer `run_agent` loop matches on this to treat the exit as a
|
||||||
|
/// graceful service stop (clean WS close, no reconnect) rather than a session
|
||||||
|
/// error. Only the service path passes a shutdown flag, so only the service path
|
||||||
|
/// can ever produce this.
|
||||||
|
pub const SERVICE_STOP_SENTINEL: &str = "SERVICE_STOP";
|
||||||
|
|
||||||
// Heartbeat interval (30 seconds)
|
// Heartbeat interval (30 seconds)
|
||||||
const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(30);
|
const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(30);
|
||||||
// Status report interval (60 seconds)
|
// Status report interval (60 seconds)
|
||||||
@@ -285,16 +295,34 @@ impl SessionManager {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run the session main loop with tray and chat event processing
|
/// Run the session main loop with tray and chat event processing.
|
||||||
|
///
|
||||||
|
/// `service_shutdown` (SPEC-018 finding H) is the SCM cooperative-stop flag.
|
||||||
|
/// It is `Some(flag)` ONLY on the managed-agent service path; the
|
||||||
|
/// attended/viewer/interactive callers pass `None` and behave EXACTLY as
|
||||||
|
/// before. When present, the flag is polled on every idle tick (the natural
|
||||||
|
/// ~100ms seam below) so an SCM Stop/Shutdown received while CONNECTED breaks
|
||||||
|
/// this inner loop promptly — instead of only being observed by the outer
|
||||||
|
/// `run_agent` reconnect loop, which never runs while a session is connected.
|
||||||
|
/// On a set flag the loop closes the WebSocket cleanly (via the shared exit
|
||||||
|
/// path at the bottom) and returns the [`SERVICE_STOP_SENTINEL`] error, which
|
||||||
|
/// the outer loop maps to a graceful stop.
|
||||||
pub async fn run_with_tray(
|
pub async fn run_with_tray(
|
||||||
&mut self,
|
&mut self,
|
||||||
tray: Option<&TrayController>,
|
tray: Option<&TrayController>,
|
||||||
chat: Option<&ChatController>,
|
chat: Option<&ChatController>,
|
||||||
|
service_shutdown: Option<&Arc<AtomicBool>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
if self.transport.is_none() {
|
if self.transport.is_none() {
|
||||||
anyhow::bail!("Not connected");
|
anyhow::bail!("Not connected");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Helper: has the SCM asked the service to stop? Always false off the
|
||||||
|
// service path (where `service_shutdown` is `None`).
|
||||||
|
let stop_requested = |flag: Option<&Arc<AtomicBool>>| -> bool {
|
||||||
|
flag.is_some_and(|f| f.load(Ordering::SeqCst))
|
||||||
|
};
|
||||||
|
|
||||||
// Send initial status
|
// Send initial status
|
||||||
self.send_status().await?;
|
self.send_status().await?;
|
||||||
|
|
||||||
@@ -307,6 +335,29 @@ impl SessionManager {
|
|||||||
|
|
||||||
// Main loop
|
// Main loop
|
||||||
loop {
|
loop {
|
||||||
|
// SPEC-018 (finding H): honour an SCM stop request received while the
|
||||||
|
// session is CONNECTED. The outer `run_agent` loop only observes the
|
||||||
|
// flag between connection attempts, but a managed agent spends its
|
||||||
|
// entire connected life inside THIS loop — so without this check an
|
||||||
|
// SCM Stop while connected would not break out until the connection
|
||||||
|
// dropped on its own. Breaking here falls through to the shared exit
|
||||||
|
// path below, which closes the transport cleanly (clean WS close);
|
||||||
|
// the sentinel tells the outer loop this was a graceful stop.
|
||||||
|
if stop_requested(service_shutdown) {
|
||||||
|
tracing::info!("Service stop requested; ending connected session loop");
|
||||||
|
self.release_streaming();
|
||||||
|
self.state = SessionState::Disconnected;
|
||||||
|
if let Some(transport) = self.transport.as_mut() {
|
||||||
|
// Best-effort clean WebSocket close (sends a Close frame). A
|
||||||
|
// failure here just means the peer/socket is already gone; the
|
||||||
|
// service still stops cleanly.
|
||||||
|
if let Err(e) = transport.close().await {
|
||||||
|
tracing::warn!("error during clean WebSocket close on service stop: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Err(anyhow::anyhow!(SERVICE_STOP_SENTINEL));
|
||||||
|
}
|
||||||
|
|
||||||
// Process tray events
|
// Process tray events
|
||||||
if let Some(t) = tray {
|
if let Some(t) = tray {
|
||||||
if let Some(action) = t.process_events() {
|
if let Some(action) = t.process_events() {
|
||||||
@@ -745,3 +796,47 @@ impl SessionManager {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
/// SPEC-018 finding H: the connected-stop contract. When the SCM sets the
|
||||||
|
/// shutdown flag, `run_with_tray` returns an error whose message contains
|
||||||
|
/// [`SERVICE_STOP_SENTINEL`]; the outer `run_agent` loop recognises a graceful
|
||||||
|
/// stop with `error_msg.contains(SERVICE_STOP_SENTINEL)`. This pins that the
|
||||||
|
/// error the loop constructs on stop actually satisfies that match — so the
|
||||||
|
/// two halves (producer here, consumer in `main.rs`) cannot drift.
|
||||||
|
///
|
||||||
|
/// A full end-to-end test of the in-loop interrupt would need a live connected
|
||||||
|
/// transport (a real or mocked server), which is an integration concern; this
|
||||||
|
/// unit test instead pins the wire contract the interrupt relies on.
|
||||||
|
#[test]
|
||||||
|
fn service_stop_sentinel_is_matched_by_outer_loop_check() {
|
||||||
|
let produced = anyhow::anyhow!(SERVICE_STOP_SENTINEL);
|
||||||
|
assert!(
|
||||||
|
produced.to_string().contains(SERVICE_STOP_SENTINEL),
|
||||||
|
"the stop error must contain the sentinel the outer loop matches on"
|
||||||
|
);
|
||||||
|
assert!(
|
||||||
|
!SERVICE_STOP_SENTINEL.is_empty(),
|
||||||
|
"the sentinel must be a non-empty, distinctive token"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The shutdown-flag check is a no-op (always `false`) when no flag is passed,
|
||||||
|
/// i.e. on the attended/viewer/interactive paths — guaranteeing the new
|
||||||
|
/// parameter is a pure addition that cannot alter non-service behaviour
|
||||||
|
/// (SPEC-018 finding H: "no regression").
|
||||||
|
#[test]
|
||||||
|
fn no_shutdown_flag_never_requests_stop() {
|
||||||
|
let none: Option<&Arc<AtomicBool>> = None;
|
||||||
|
let check = |flag: Option<&Arc<AtomicBool>>| flag.is_some_and(|f| f.load(Ordering::SeqCst));
|
||||||
|
assert!(!check(none));
|
||||||
|
|
||||||
|
let set = Arc::new(AtomicBool::new(true));
|
||||||
|
assert!(check(Some(&set)));
|
||||||
|
let unset = Arc::new(AtomicBool::new(false));
|
||||||
|
assert!(!check(Some(&unset)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user