Compare commits
23 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 11af9dff8e | |||
| a0e0d5f1e7 | |||
| 7602b4346a | |||
| 55b9c97b28 | |||
| 94c07c2431 | |||
| 4c49b73a71 | |||
| 367906bd54 | |||
| 52477e4c4a | |||
| 87c6e17d4a | |||
| 6a000d012f | |||
| d0b8db070f | |||
| 89c3718266 | |||
| 4106fc4bc4 | |||
| 0f02f23765 | |||
| 59e40c8019 | |||
| c286a29b9d | |||
| 18429f6fe3 | |||
| 3b9e4068c9 | |||
| 87f229509b | |||
| 40c7d860cc | |||
| 0059b21db6 | |||
| f950511e3e | |||
| 16017456aa |
@@ -27,6 +27,15 @@ on:
|
|||||||
# computes the next semver from conventional commits at dispatch time.
|
# computes the next semver from conventional commits at dispatch time.
|
||||||
# build-and-test.yml remains the automatic PR/push CI gate.
|
# build-and-test.yml remains the automatic PR/push CI gate.
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
channel:
|
||||||
|
description: 'Release channel (stable = full versioned release; beta = signed prerelease test build, no version bump/changelog)'
|
||||||
|
required: true
|
||||||
|
default: 'stable'
|
||||||
|
type: choice
|
||||||
|
options:
|
||||||
|
- stable
|
||||||
|
- beta
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -36,8 +45,11 @@ jobs:
|
|||||||
name: Version + Changelog
|
name: Version + Changelog
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
outputs:
|
outputs:
|
||||||
version: ${{ steps.bump.outputs.version }}
|
# Coalesce across the stable (bump) and beta (beta) paths: exactly one of them runs per
|
||||||
released: ${{ steps.bump.outputs.released }}
|
# dispatch, so the first non-empty value wins. prerelease is 'true' only on the beta path.
|
||||||
|
version: ${{ steps.bump.outputs.version || steps.beta.outputs.version }}
|
||||||
|
released: ${{ steps.bump.outputs.released || steps.beta.outputs.released }}
|
||||||
|
prerelease: ${{ steps.beta.outputs.prerelease || 'false' }}
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout (full history + tags)
|
- name: Checkout (full history + tags)
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -59,7 +71,8 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
- name: Install git-cliff
|
- name: Install git-cliff
|
||||||
if: steps.guard.outputs.skip != 'true'
|
# Stable-only: beta produces no changelog, so git-cliff is unnecessary on the beta path.
|
||||||
|
if: steps.guard.outputs.skip != 'true' && github.event.inputs.channel == 'stable'
|
||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
CLIFF_VERSION="2.6.1"
|
CLIFF_VERSION="2.6.1"
|
||||||
@@ -72,12 +85,16 @@ jobs:
|
|||||||
|
|
||||||
- name: Determine next version and bump components
|
- name: Determine next version and bump components
|
||||||
id: bump
|
id: bump
|
||||||
if: steps.guard.outputs.skip != 'true'
|
# Stable-only: the beta path (id: beta) handles versioning without a manifest bump/commit.
|
||||||
|
if: steps.guard.outputs.skip != 'true' && github.event.inputs.channel == 'stable'
|
||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# ----- locate the last release tag (vX.Y.Z) -----
|
# ----- locate the last release tag (vX.Y.Z) -----
|
||||||
LAST_TAG="$(git tag --list 'v*' --sort=-v:refname | head -n1 || true)"
|
# Match ONLY strict final-release tags (vMAJOR.MINOR.PATCH). Beta tags look like
|
||||||
|
# v0.3.0-beta.7; if one of those were picked up here it would corrupt the next stable
|
||||||
|
# base version, so prerelease tags are explicitly excluded from this lookup.
|
||||||
|
LAST_TAG="$(git tag --list 'v*' --sort=-v:refname | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' | head -n1 || true)"
|
||||||
if [ -z "${LAST_TAG}" ]; then
|
if [ -z "${LAST_TAG}" ]; then
|
||||||
echo "[INFO] No prior release tag found; baseline is current manifest version."
|
echo "[INFO] No prior release tag found; baseline is current manifest version."
|
||||||
BASE_VERSION="$(grep -m1 '^version' agent/Cargo.toml | sed -E 's/.*"([0-9]+\.[0-9]+\.[0-9]+)".*/\1/')"
|
BASE_VERSION="$(grep -m1 '^version' agent/Cargo.toml | sed -E 's/.*"([0-9]+\.[0-9]+\.[0-9]+)".*/\1/')"
|
||||||
@@ -186,8 +203,39 @@ jobs:
|
|||||||
sed -i -E "0,/^version = \"[0-9]+\.[0-9]+\.[0-9]+\"/s//version = \"${NEXT}\"/" Cargo.toml || true
|
sed -i -E "0,/^version = \"[0-9]+\.[0-9]+\.[0-9]+\"/s//version = \"${NEXT}\"/" Cargo.toml || true
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
- name: Beta channel - tag prerelease build (no bump, no commit, no changelog)
|
||||||
|
id: beta
|
||||||
|
# Beta-only path. Reuses the IDENTICAL downstream build + sign + publish jobs, but does
|
||||||
|
# NOT compute a semver bump, mutate any manifest, generate a changelog, or make a release
|
||||||
|
# commit. It just tags the CURRENT HEAD with a unique prerelease version so the Windows
|
||||||
|
# build job can check out `ref: v${VER}` exactly as it does for stable.
|
||||||
|
if: github.event.inputs.channel == 'beta' && steps.guard.outputs.skip != 'true'
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Base version is read straight from the agent manifest — NOT bumped, NOT written back.
|
||||||
|
BASE="$(grep -m1 '^version' agent/Cargo.toml | sed -E 's/.*"([0-9]+\.[0-9]+\.[0-9]+)".*/\1/')"
|
||||||
|
# GITHUB_RUN_NUMBER gives each dispatch a unique prerelease suffix without counting existing tags (a re-run of the same dispatch reuses the number, hence the forced tag below).
|
||||||
|
VER="${BASE}-beta.${GITHUB_RUN_NUMBER}"
|
||||||
|
echo "[INFO] Beta build version: ${VER} (base ${BASE}, run ${GITHUB_RUN_NUMBER})"
|
||||||
|
|
||||||
|
# Tag the current HEAD (no release commit). Push the tag so build-agent-windows can
|
||||||
|
# check out ref: v${VER}.
|
||||||
|
git config user.name "guruconnect-ci"
|
||||||
|
git config user.email "ci@azcomputerguru.com"
|
||||||
|
# Beta tags are disposable test markers; force makes re-running a failed beta dispatch idempotent (re-run reuses GITHUB_RUN_NUMBER, so the tag already exists).
|
||||||
|
git tag -f "v${VER}"
|
||||||
|
REMOTE="https://${{ secrets.CI_PUSH_TOKEN }}@git.azcomputerguru.com/${GITHUB_REPOSITORY}.git"
|
||||||
|
git push --force "${REMOTE}" "v${VER}"
|
||||||
|
echo "[OK] Pushed beta prerelease tag v${VER}"
|
||||||
|
|
||||||
|
echo "version=${VER}" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "released=true" >> "$GITHUB_OUTPUT"
|
||||||
|
echo "prerelease=true" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
- name: Generate changelog (git-cliff)
|
- name: Generate changelog (git-cliff)
|
||||||
if: steps.guard.outputs.skip != 'true' && steps.bump.outputs.released == 'true'
|
# Stable-only: beta produces no changelog artifact.
|
||||||
|
if: steps.guard.outputs.skip != 'true' && steps.bump.outputs.released == 'true' && github.event.inputs.channel == 'stable'
|
||||||
env:
|
env:
|
||||||
VERSION: ${{ steps.bump.outputs.version }}
|
VERSION: ${{ steps.bump.outputs.version }}
|
||||||
run: |
|
run: |
|
||||||
@@ -232,7 +280,10 @@ jobs:
|
|||||||
|
|
||||||
# Re-derive the set of changed components (same logic as the bump step). On the first
|
# Re-derive the set of changed components (same logic as the bump step). On the first
|
||||||
# release (no prior tag) all components are considered changed.
|
# release (no prior tag) all components are considered changed.
|
||||||
LAST_TAG="$(git tag --list 'v*' --sort=-v:refname | head -n1 || true)"
|
# Match ONLY strict final-release tags (vMAJOR.MINOR.PATCH); exclude beta prerelease
|
||||||
|
# tags (v0.3.0-beta.7) so the changelog diff range is taken against the last real
|
||||||
|
# release, not an intervening beta build.
|
||||||
|
LAST_TAG="$(git tag --list 'v*' --sort=-v:refname | grep -E '^v[0-9]+\.[0-9]+\.[0-9]+$' | head -n1 || true)"
|
||||||
if [ -z "${LAST_TAG}" ]; then
|
if [ -z "${LAST_TAG}" ]; then
|
||||||
CHANGED_FILES="$(git ls-files)"
|
CHANGED_FILES="$(git ls-files)"
|
||||||
FIRST_RELEASE=true
|
FIRST_RELEASE=true
|
||||||
@@ -252,7 +303,8 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
- name: Commit release + create tag
|
- name: Commit release + create tag
|
||||||
if: steps.guard.outputs.skip != 'true' && steps.bump.outputs.released == 'true'
|
# Stable-only: beta tags HEAD directly in the beta step and never makes a release commit.
|
||||||
|
if: steps.guard.outputs.skip != 'true' && steps.bump.outputs.released == 'true' && github.event.inputs.channel == 'stable'
|
||||||
env:
|
env:
|
||||||
VERSION: ${{ steps.bump.outputs.version }}
|
VERSION: ${{ steps.bump.outputs.version }}
|
||||||
run: |
|
run: |
|
||||||
@@ -276,7 +328,8 @@ jobs:
|
|||||||
echo "[OK] Pushed release commit and tag v${VERSION}"
|
echo "[OK] Pushed release commit and tag v${VERSION}"
|
||||||
|
|
||||||
- name: Upload changelog artifact
|
- name: Upload changelog artifact
|
||||||
if: steps.guard.outputs.skip != 'true' && steps.bump.outputs.released == 'true'
|
# Stable-only: there is no changelog on the beta path, so nothing to upload.
|
||||||
|
if: steps.guard.outputs.skip != 'true' && steps.bump.outputs.released == 'true' && github.event.inputs.channel == 'stable'
|
||||||
uses: actions/upload-artifact@v3
|
uses: actions/upload-artifact@v3
|
||||||
with:
|
with:
|
||||||
name: changelog
|
name: changelog
|
||||||
@@ -445,6 +498,9 @@ jobs:
|
|||||||
echo "sha256=${SUM}" >> "$GITHUB_OUTPUT"
|
echo "sha256=${SUM}" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
- name: Download changelog artifact
|
- name: Download changelog artifact
|
||||||
|
# Stable-only: the beta path uploads no `changelog` artifact. The release-creation step
|
||||||
|
# already guards on `[ -f changelog-artifact/CHANGELOG.md ]`, so skipping this is safe.
|
||||||
|
if: github.event.inputs.channel == 'stable'
|
||||||
uses: actions/download-artifact@v3
|
uses: actions/download-artifact@v3
|
||||||
with:
|
with:
|
||||||
name: changelog
|
name: changelog
|
||||||
@@ -472,17 +528,26 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
VERSION: ${{ needs.version.outputs.version }}
|
VERSION: ${{ needs.version.outputs.version }}
|
||||||
SHA256: ${{ steps.sha.outputs.sha256 }}
|
SHA256: ${{ steps.sha.outputs.sha256 }}
|
||||||
|
# PRERELEASE is 'true' on the beta path, 'false' on stable; drives the Gitea release flag.
|
||||||
|
PRERELEASE: ${{ needs.version.outputs.prerelease }}
|
||||||
GITEA_TOKEN: ${{ secrets.CI_PUSH_TOKEN }}
|
GITEA_TOKEN: ${{ secrets.CI_PUSH_TOKEN }}
|
||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
API_BASE="https://git.azcomputerguru.com/api/v1/repos/${GITHUB_REPOSITORY}"
|
API_BASE="https://git.azcomputerguru.com/api/v1/repos/${GITHUB_REPOSITORY}"
|
||||||
TAG="v${VERSION}"
|
TAG="v${VERSION}"
|
||||||
echo "[INFO] Creating Gitea release ${TAG} on ${GITHUB_REPOSITORY}"
|
echo "[INFO] Creating Gitea release ${TAG} on ${GITHUB_REPOSITORY} (prerelease=${PRERELEASE})"
|
||||||
|
|
||||||
|
# Beta builds get a clear "prerelease test build" note in the body; the -beta.N suffix
|
||||||
|
# is already carried in TAG, so the release name "Release v..." needs no extra handling.
|
||||||
|
if [ "${PRERELEASE}" = "true" ]; then
|
||||||
|
BODY="$(printf 'GuruConnect %s (PRERELEASE / beta test build)\n\nSHA-256 (guruconnect.exe): %s\n\nSigned via Azure Trusted Signing. Not a stable release — no changelog/version bump.' "${TAG}" "${SHA256}")"
|
||||||
|
else
|
||||||
BODY="$(printf 'GuruConnect %s\n\nSHA-256 (guruconnect.exe): %s\n\nSee CHANGELOG.md and /api/changelog for details.' "${TAG}" "${SHA256}")"
|
BODY="$(printf 'GuruConnect %s\n\nSHA-256 (guruconnect.exe): %s\n\nSee CHANGELOG.md and /api/changelog for details.' "${TAG}" "${SHA256}")"
|
||||||
|
fi
|
||||||
|
|
||||||
# Build the JSON payload with python (handles escaping of the multi-line body safely).
|
# Build the JSON payload with python (handles escaping of the multi-line body safely).
|
||||||
CREATE_PAYLOAD="$(TAG="$TAG" BODY="$BODY" python3 -c 'import json,os; print(json.dumps({"tag_name": os.environ["TAG"], "name": "Release " + os.environ["TAG"], "body": os.environ["BODY"], "draft": False, "prerelease": False}))')"
|
# prerelease is derived from the PRERELEASE env var (beta -> true, stable -> false).
|
||||||
|
CREATE_PAYLOAD="$(TAG="$TAG" BODY="$BODY" PRERELEASE="$PRERELEASE" python3 -c 'import json,os; print(json.dumps({"tag_name": os.environ["TAG"], "name": "Release " + os.environ["TAG"], "body": os.environ["BODY"], "draft": False, "prerelease": os.environ.get("PRERELEASE","false") == "true"}))')"
|
||||||
|
|
||||||
RELEASE_JSON="$(curl -fsS -X POST \
|
RELEASE_JSON="$(curl -fsS -X POST \
|
||||||
"${API_BASE}/releases" \
|
"${API_BASE}/releases" \
|
||||||
|
|||||||
4
Cargo.lock
generated
4
Cargo.lock
generated
@@ -1407,7 +1407,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "guruconnect"
|
name = "guruconnect"
|
||||||
version = "0.2.0"
|
version = "0.3.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"bytes",
|
"bytes",
|
||||||
@@ -1447,7 +1447,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "guruconnect-server"
|
name = "guruconnect-server"
|
||||||
version = "0.2.0"
|
version = "0.3.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"argon2",
|
"argon2",
|
||||||
|
|||||||
@@ -92,6 +92,7 @@ windows = { version = "0.58", features = [
|
|||||||
"Win32_System_Console",
|
"Win32_System_Console",
|
||||||
"Win32_System_Environment",
|
"Win32_System_Environment",
|
||||||
"Win32_Security",
|
"Win32_Security",
|
||||||
|
"Win32_Security_Cryptography",
|
||||||
"Win32_Storage_FileSystem",
|
"Win32_Storage_FileSystem",
|
||||||
"Win32_System_Pipes",
|
"Win32_System_Pipes",
|
||||||
"Win32_System_SystemServices",
|
"Win32_System_SystemServices",
|
||||||
|
|||||||
@@ -16,18 +16,39 @@ use uuid::Uuid;
|
|||||||
const MAGIC_MARKER: &[u8] = b"GURUCONFIG";
|
const MAGIC_MARKER: &[u8] = b"GURUCONFIG";
|
||||||
|
|
||||||
/// Embedded configuration data (appended to executable)
|
/// Embedded configuration data (appended to executable)
|
||||||
|
///
|
||||||
|
/// SPEC-016 Phase B: a managed-install config now carries the per-site
|
||||||
|
/// `enrollment_key` + `site_code` so the agent can self-register on first run.
|
||||||
|
/// The legacy `api_key` is retained (defaulted) for backward-compat with older
|
||||||
|
/// pre-enrollment installers; a fresh site installer carries only the enrollment
|
||||||
|
/// credentials and the agent obtains its per-machine `cak_` via `/api/enroll`.
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct EmbeddedConfig {
|
pub struct EmbeddedConfig {
|
||||||
/// Server WebSocket URL
|
/// Server WebSocket URL
|
||||||
pub server_url: String,
|
pub server_url: String,
|
||||||
/// API key for authentication
|
/// DEPRECATED shared/legacy API key for authentication. Optional — a
|
||||||
pub api_key: String,
|
/// SPEC-016 site installer omits it and enrolls for a per-machine `cak_`.
|
||||||
|
#[serde(default)]
|
||||||
|
pub api_key: Option<String>,
|
||||||
|
/// Per-site enrollment key (`cek_`), the low-sensitivity registration gate
|
||||||
|
/// (SPEC-016 §Security). Presented to `/api/enroll`; never logged.
|
||||||
|
#[serde(default)]
|
||||||
|
pub enrollment_key: Option<String>,
|
||||||
|
/// Per-site code identifying which site this installer enrolls into.
|
||||||
|
#[serde(default)]
|
||||||
|
pub site_code: Option<String>,
|
||||||
/// Company/organization name
|
/// Company/organization name
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub company: Option<String>,
|
pub company: Option<String>,
|
||||||
/// Site/location name
|
/// Site/location name
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub site: Option<String>,
|
pub site: Option<String>,
|
||||||
|
/// Department label (reserved — SPEC-007 AgentStatus parity).
|
||||||
|
#[serde(default)]
|
||||||
|
pub department: Option<String>,
|
||||||
|
/// Device-type label (reserved — SPEC-007 AgentStatus parity).
|
||||||
|
#[serde(default)]
|
||||||
|
pub device_type: Option<String>,
|
||||||
/// Tags for categorization
|
/// Tags for categorization
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub tags: Vec<String>,
|
pub tags: Vec<String>,
|
||||||
@@ -52,9 +73,28 @@ pub struct Config {
|
|||||||
/// Server WebSocket URL (e.g., wss://connect.example.com/ws)
|
/// Server WebSocket URL (e.g., wss://connect.example.com/ws)
|
||||||
pub server_url: String,
|
pub server_url: String,
|
||||||
|
|
||||||
/// Agent API key for authentication
|
/// Operating credential used to authenticate the persistent WS connection.
|
||||||
|
///
|
||||||
|
/// SPEC-016 Phase B: the AUTHORITATIVE credential is a per-machine `cak_`
|
||||||
|
/// obtained at first-run enrollment and stored encrypted at rest (see
|
||||||
|
/// [`crate::credential_store`]); it is loaded into this field before connect.
|
||||||
|
/// A non-empty value carried in config is the DEPRECATED shared/legacy
|
||||||
|
/// `api_key`, kept only for transition compatibility. Empty means "not yet
|
||||||
|
/// enrolled / no credential" — the run-mode wiring must enroll first.
|
||||||
|
#[serde(default)]
|
||||||
pub api_key: String,
|
pub api_key: String,
|
||||||
|
|
||||||
|
/// Per-site enrollment key (`cek_`) — present only for a not-yet-enrolled
|
||||||
|
/// managed install. Never persisted to the on-disk TOML (it is install-time
|
||||||
|
/// material, delivered by the site wrapper); never logged.
|
||||||
|
#[serde(skip)]
|
||||||
|
pub enrollment_key: Option<String>,
|
||||||
|
|
||||||
|
/// Per-site code identifying which site to enroll into (paired with
|
||||||
|
/// `enrollment_key`). Not persisted to the on-disk TOML.
|
||||||
|
#[serde(skip)]
|
||||||
|
pub site_code: Option<String>,
|
||||||
|
|
||||||
/// Unique agent identifier (generated on first run)
|
/// Unique agent identifier (generated on first run)
|
||||||
#[serde(default = "generate_agent_id")]
|
#[serde(default = "generate_agent_id")]
|
||||||
pub agent_id: String,
|
pub agent_id: String,
|
||||||
@@ -70,6 +110,14 @@ pub struct Config {
|
|||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub site: Option<String>,
|
pub site: Option<String>,
|
||||||
|
|
||||||
|
/// Department label (reserved — SPEC-007 AgentStatus parity).
|
||||||
|
#[serde(default)]
|
||||||
|
pub department: Option<String>,
|
||||||
|
|
||||||
|
/// Device-type label (reserved — SPEC-007 AgentStatus parity).
|
||||||
|
#[serde(default)]
|
||||||
|
pub device_type: Option<String>,
|
||||||
|
|
||||||
/// Tags for categorization (from embedded config)
|
/// Tags for categorization (from embedded config)
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub tags: Vec<String>,
|
pub tags: Vec<String>,
|
||||||
@@ -91,6 +139,25 @@ fn generate_agent_id() -> String {
|
|||||||
Uuid::new_v4().to_string()
|
Uuid::new_v4().to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Layer SPEC-016 enrollment material from the environment onto a `Config`.
|
||||||
|
///
|
||||||
|
/// `GURUCONNECT_ENROLLMENT_KEY` / `GURUCONNECT_SITE_CODE` only OVERRIDE when set
|
||||||
|
/// and non-empty, so embedded/install-time values already present on the config
|
||||||
|
/// are preserved. Used by the file and env load paths (the embedded path already
|
||||||
|
/// carries these from the install blob).
|
||||||
|
fn apply_enrollment_env(config: &mut Config) {
|
||||||
|
if let Ok(v) = std::env::var("GURUCONNECT_ENROLLMENT_KEY") {
|
||||||
|
if !v.is_empty() {
|
||||||
|
config.enrollment_key = Some(v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if let Ok(v) = std::env::var("GURUCONNECT_SITE_CODE") {
|
||||||
|
if !v.is_empty() {
|
||||||
|
config.site_code = Some(v);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
pub struct CaptureConfig {
|
pub struct CaptureConfig {
|
||||||
/// Target frames per second (1-60)
|
/// Target frames per second (1-60)
|
||||||
@@ -317,18 +384,26 @@ impl Config {
|
|||||||
info!("Using embedded configuration");
|
info!("Using embedded configuration");
|
||||||
let config = Config {
|
let config = Config {
|
||||||
server_url: embedded.server_url,
|
server_url: embedded.server_url,
|
||||||
api_key: embedded.api_key,
|
// Legacy/shared api_key if the installer carried one; empty
|
||||||
|
// otherwise (the SPEC-016 path enrolls for a per-machine cak_).
|
||||||
|
api_key: embedded.api_key.unwrap_or_default(),
|
||||||
|
enrollment_key: embedded.enrollment_key,
|
||||||
|
site_code: embedded.site_code,
|
||||||
agent_id: generate_agent_id(),
|
agent_id: generate_agent_id(),
|
||||||
hostname_override: None,
|
hostname_override: None,
|
||||||
company: embedded.company,
|
company: embedded.company,
|
||||||
site: embedded.site,
|
site: embedded.site,
|
||||||
|
department: embedded.department,
|
||||||
|
device_type: embedded.device_type,
|
||||||
tags: embedded.tags,
|
tags: embedded.tags,
|
||||||
support_code: None,
|
support_code: None,
|
||||||
capture: CaptureConfig::default(),
|
capture: CaptureConfig::default(),
|
||||||
encoding: EncodingConfig::default(),
|
encoding: EncodingConfig::default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
// Save to file for persistence (so agent_id is preserved)
|
// Save to file for persistence (so agent_id is preserved). The
|
||||||
|
// #[serde(skip)] enrollment fields are intentionally NOT written to
|
||||||
|
// the on-disk TOML — they are install-time material only.
|
||||||
let _ = config.save();
|
let _ = config.save();
|
||||||
return Ok(config);
|
return Ok(config);
|
||||||
}
|
}
|
||||||
@@ -349,8 +424,12 @@ impl Config {
|
|||||||
let _ = config.save();
|
let _ = config.save();
|
||||||
}
|
}
|
||||||
|
|
||||||
// support_code is always None when loading from file (set via CLI)
|
// support_code is always None when loading from file (set via CLI).
|
||||||
config.support_code = None;
|
config.support_code = None;
|
||||||
|
// The enrollment fields are #[serde(skip)], so a file never carries
|
||||||
|
// them; layer them in from the environment for testing / a
|
||||||
|
// file-delivered managed install that supplies them out-of-band.
|
||||||
|
apply_enrollment_env(&mut config);
|
||||||
|
|
||||||
return Ok(config);
|
return Ok(config);
|
||||||
}
|
}
|
||||||
@@ -365,18 +444,23 @@ impl Config {
|
|||||||
let agent_id =
|
let agent_id =
|
||||||
std::env::var("GURUCONNECT_AGENT_ID").unwrap_or_else(|_| generate_agent_id());
|
std::env::var("GURUCONNECT_AGENT_ID").unwrap_or_else(|_| generate_agent_id());
|
||||||
|
|
||||||
let config = Config {
|
let mut config = Config {
|
||||||
server_url,
|
server_url,
|
||||||
api_key,
|
api_key,
|
||||||
|
enrollment_key: None,
|
||||||
|
site_code: None,
|
||||||
agent_id,
|
agent_id,
|
||||||
hostname_override: std::env::var("GURUCONNECT_HOSTNAME").ok(),
|
hostname_override: std::env::var("GURUCONNECT_HOSTNAME").ok(),
|
||||||
company: None,
|
company: None,
|
||||||
site: None,
|
site: None,
|
||||||
|
department: None,
|
||||||
|
device_type: None,
|
||||||
tags: Vec::new(),
|
tags: Vec::new(),
|
||||||
support_code: None,
|
support_code: None,
|
||||||
capture: CaptureConfig::default(),
|
capture: CaptureConfig::default(),
|
||||||
encoding: EncodingConfig::default(),
|
encoding: EncodingConfig::default(),
|
||||||
};
|
};
|
||||||
|
apply_enrollment_env(&mut config);
|
||||||
|
|
||||||
// Save config with generated agent_id for persistence
|
// Save config with generated agent_id for persistence
|
||||||
let _ = config.save();
|
let _ = config.save();
|
||||||
@@ -384,6 +468,34 @@ impl Config {
|
|||||||
Ok(config)
|
Ok(config)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Derive the HTTPS API base (e.g. `https://connect.example.com`) from the
|
||||||
|
/// agent's WebSocket `server_url` (e.g. `wss://connect.example.com/ws/agent`).
|
||||||
|
///
|
||||||
|
/// `/api/enroll` is REST/HTTPS while the persistent transport is `wss`, so we
|
||||||
|
/// reuse the same host/authority and swap scheme + drop the WS path. Mapping:
|
||||||
|
/// `wss` -> `https`, `ws` -> `http` (dev). Returns an error if `server_url`
|
||||||
|
/// cannot be parsed, has no host, or uses a scheme other than ws/wss/http/https.
|
||||||
|
pub fn https_base(&self) -> Result<String> {
|
||||||
|
let parsed = url::Url::parse(&self.server_url)
|
||||||
|
.with_context(|| format!("invalid server_url: {}", self.server_url))?;
|
||||||
|
let scheme = match parsed.scheme() {
|
||||||
|
"wss" | "https" => "https",
|
||||||
|
"ws" | "http" => "http",
|
||||||
|
other => {
|
||||||
|
return Err(anyhow!(
|
||||||
|
"unsupported server_url scheme '{other}' (expected ws/wss)"
|
||||||
|
))
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let host = parsed
|
||||||
|
.host_str()
|
||||||
|
.ok_or_else(|| anyhow!("server_url has no host: {}", self.server_url))?;
|
||||||
|
Ok(match parsed.port() {
|
||||||
|
Some(port) => format!("{scheme}://{host}:{port}"),
|
||||||
|
None => format!("{scheme}://{host}"),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
/// Get the configuration file path
|
/// Get the configuration file path
|
||||||
fn config_path() -> PathBuf {
|
fn config_path() -> PathBuf {
|
||||||
// Check for config in current directory first
|
// Check for config in current directory first
|
||||||
|
|||||||
413
agent/src/credential_store.rs
Normal file
413
agent/src/credential_store.rs
Normal file
@@ -0,0 +1,413 @@
|
|||||||
|
//! At-rest storage for the per-machine operating credential (`cak_`).
|
||||||
|
//!
|
||||||
|
//! SPEC-016 Phase B, item 4 + §Security. The `cak_` minted by `/api/enroll` is
|
||||||
|
//! the high-sensitivity, per-machine, independently-revocable operating
|
||||||
|
//! credential. It is stored with **two independent layers** (Mike's locked
|
||||||
|
//! decision — "BOTH layers"):
|
||||||
|
//!
|
||||||
|
//! 1. **DPAPI-machine encryption** (`CryptProtectData` with
|
||||||
|
//! `CRYPTPROTECT_LOCAL_MACHINE`): the on-disk bytes are a DPAPI blob keyed to
|
||||||
|
//! THIS machine. A copied/exfiltrated file is inert on any other box — DPAPI
|
||||||
|
//! machine keys do not leave the machine.
|
||||||
|
//! 2. **SYSTEM/Administrators-only ACL** on the containing directory + file: a
|
||||||
|
//! non-admin user cannot even read the ciphertext. Inheritance is removed and
|
||||||
|
//! only `SYSTEM` and `BUILTIN\Administrators` are granted full control.
|
||||||
|
//!
|
||||||
|
//! Local admin / SYSTEM can always recover the value — that is accepted (SPEC-016
|
||||||
|
//! §Security): the blast radius of one leaked `cak_` is a single, independently
|
||||||
|
//! revocable machine.
|
||||||
|
//!
|
||||||
|
//! Storage location (chosen over an HKLM value): a file under
|
||||||
|
//! `%ProgramData%\GuruConnect\credentials\agent.cak`. Rationale — the agent
|
||||||
|
//! already keeps its config and the `machine_uid` fallback seed under
|
||||||
|
//! `%ProgramData%\GuruConnect`, so co-locating keeps a single protected
|
||||||
|
//! directory; and a directory/file ACL applied via `icacls` is auditable with far
|
||||||
|
//! less unsafe FFI than building a registry-key security descriptor by hand. Both
|
||||||
|
//! storage shapes are explicitly permitted by the spec.
|
||||||
|
//!
|
||||||
|
//! SECURITY: the plaintext `cak_` is NEVER logged. Errors describe the operation,
|
||||||
|
//! not the value.
|
||||||
|
|
||||||
|
#![cfg(windows)]
|
||||||
|
|
||||||
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use thiserror::Error;
|
||||||
|
|
||||||
|
/// Failure classes for [`load_cak`], so callers can distinguish an *operational*
|
||||||
|
/// problem (the file exists but this process cannot open/read it — e.g. running in
|
||||||
|
/// the wrong security context against a SYSTEM-only-ACL'd store) from the real
|
||||||
|
/// *tamper / wrong-machine* signal (the file was read successfully but DPAPI
|
||||||
|
/// decryption failed).
|
||||||
|
///
|
||||||
|
/// The distinction matters for the run-mode resolver (`main.rs`):
|
||||||
|
/// - [`LoadCakError::Io`] is recoverable/actionable — log it and STOP (do not
|
||||||
|
/// silently re-enroll over a store we simply can't read in this context).
|
||||||
|
/// - [`LoadCakError::Decrypt`] is a hard tamper signal — STOP, do not re-enroll.
|
||||||
|
#[derive(Debug, Error)]
|
||||||
|
pub enum LoadCakError {
|
||||||
|
/// The store path could not be resolved (e.g. `%ProgramData%` unset).
|
||||||
|
#[error("could not resolve credential store path: {0}")]
|
||||||
|
Path(String),
|
||||||
|
|
||||||
|
/// An IO/open/read error reaching the stored blob — INCLUDING
|
||||||
|
/// `PermissionDenied` (the running context lacks rights to the SYSTEM-only
|
||||||
|
/// store). Operational, not a tamper signal.
|
||||||
|
#[error("credential store is present but could not be read in this context: {source}")]
|
||||||
|
Io {
|
||||||
|
/// Whether this was specifically an access-denied error (drives the
|
||||||
|
/// run-mode fail-fast guard in `main.rs`).
|
||||||
|
permission_denied: bool,
|
||||||
|
source: std::io::Error,
|
||||||
|
},
|
||||||
|
|
||||||
|
/// The blob was read successfully but DPAPI decryption FAILED — the real
|
||||||
|
/// tamper / wrong-machine / corruption signal. A hard stop; never re-enroll.
|
||||||
|
#[error("stored credential failed to decrypt (wrong machine, tampered, or corrupted): {0}")]
|
||||||
|
Decrypt(String),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Directory holding the protected credential file.
|
||||||
|
fn credentials_dir() -> Result<PathBuf> {
|
||||||
|
let program_data =
|
||||||
|
std::env::var("ProgramData").context("ProgramData environment variable is not set")?;
|
||||||
|
Ok(PathBuf::from(program_data)
|
||||||
|
.join("GuruConnect")
|
||||||
|
.join("credentials"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Full path to the DPAPI-encrypted `cak_` blob.
|
||||||
|
fn cak_path() -> Result<PathBuf> {
|
||||||
|
Ok(credentials_dir()?.join("agent.cak"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Persist `cak` encrypted at rest.
|
||||||
|
///
|
||||||
|
/// Ordering is security-critical (H2 — TOCTOU): the directory ACL is locked
|
||||||
|
/// BEFORE any secret bytes touch the filesystem, and the temp file is written
|
||||||
|
/// INSIDE the already-locked directory, so no ciphertext ever exists at a path
|
||||||
|
/// carrying an inherited (potentially world-readable) ACL:
|
||||||
|
///
|
||||||
|
/// 1. `create_dir_all(dir)` — ensure the directory exists.
|
||||||
|
/// 2. `lock_down_acl(dir)` — remove inherited ACEs and grant SYSTEM +
|
||||||
|
/// Administrators full control, made inheritable `(OI)(CI)` so children
|
||||||
|
/// created afterward are covered. This is an explicit precondition for the
|
||||||
|
/// write that follows — NOT an unstated inheritance assumption.
|
||||||
|
/// 3. DPAPI-machine-encrypt the plaintext.
|
||||||
|
/// 4. Write the ciphertext to a temp file inside the now-locked directory, then
|
||||||
|
/// rename over the target (atomic-ish replace).
|
||||||
|
/// 5. `lock_down_acl(file)` — assert the file's own ACL (belt-and-suspenders; the
|
||||||
|
/// file already inherits the directory's restrictive ACEs).
|
||||||
|
/// 6. C1 read-back: immediately attempt [`load_cak`] to PROVE the running
|
||||||
|
/// security context can read its own store. If it cannot (e.g. a non-SYSTEM
|
||||||
|
/// run wrote a SYSTEM-only store it can no longer read), fail HERE at enroll
|
||||||
|
/// time with an actionable error — rather than silently bricking on the next
|
||||||
|
/// boot when the steady-state path tries to load it.
|
||||||
|
///
|
||||||
|
/// Returns an error (never logs the plaintext) on any failure so the caller can
|
||||||
|
/// surface it / retry.
|
||||||
|
pub fn store_cak(cak: &str) -> Result<()> {
|
||||||
|
// 1 + 2: lock the directory ACL BEFORE writing any secret (H2 / TOCTOU).
|
||||||
|
let dir = credentials_dir()?;
|
||||||
|
std::fs::create_dir_all(&dir)
|
||||||
|
.with_context(|| format!("failed to create credentials dir {dir:?}"))?;
|
||||||
|
lock_down_acl(&dir).context("failed to restrict credentials directory ACL")?;
|
||||||
|
|
||||||
|
// 3: encrypt only after the destination directory is locked down.
|
||||||
|
let ciphertext = dpapi_protect(cak.as_bytes()).context("DPAPI encryption of cak_ failed")?;
|
||||||
|
|
||||||
|
// 4: write the temp file INSIDE the already-locked directory, then rename.
|
||||||
|
let path = cak_path()?;
|
||||||
|
let tmp = path.with_extension("cak.tmp");
|
||||||
|
std::fs::write(&tmp, &ciphertext)
|
||||||
|
.with_context(|| format!("failed to write temp credential file {tmp:?}"))?;
|
||||||
|
std::fs::rename(&tmp, &path)
|
||||||
|
.with_context(|| format!("failed to place credential file {path:?}"))?;
|
||||||
|
|
||||||
|
// 5: assert the file ACL too (the file already inherits the dir's ACEs).
|
||||||
|
lock_down_acl(&path).context("failed to restrict credential file ACL")?;
|
||||||
|
|
||||||
|
// 6: C1 read-back — confirm THIS context can read back what it just wrote.
|
||||||
|
// Catches the "wrote a SYSTEM-only store from a non-SYSTEM context" footgun at
|
||||||
|
// enroll time instead of as a silent brick on the next launch.
|
||||||
|
match load_cak() {
|
||||||
|
Ok(Some(_)) => {
|
||||||
|
tracing::info!("[ENROLL] stored per-machine credential (encrypted at rest)");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Ok(None) => Err(anyhow!(
|
||||||
|
"stored the credential but read-back returned nothing — refusing to proceed \
|
||||||
|
with an unverifiable credential store"
|
||||||
|
)),
|
||||||
|
Err(LoadCakError::Io {
|
||||||
|
permission_denied: true,
|
||||||
|
..
|
||||||
|
}) => Err(anyhow!(
|
||||||
|
"[ENROLL] wrote the credential store but cannot read it back in THIS security \
|
||||||
|
context (access denied). The store is ACL'd to SYSTEM + Administrators by \
|
||||||
|
design; the managed agent must run as the GuruConnect SYSTEM service (see \
|
||||||
|
SPEC-018) to read it. Refusing to leave an unreadable store behind."
|
||||||
|
)),
|
||||||
|
Err(e) => Err(anyhow::Error::new(e)
|
||||||
|
.context("stored the credential but the immediate read-back verification failed")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load and decrypt the stored `cak_`, or `Ok(None)` if no credential is stored.
|
||||||
|
///
|
||||||
|
/// Error classification (M1) — the caller MUST treat these differently:
|
||||||
|
/// - `Ok(None)` -> no store yet (NotFound or empty); enroll is fine.
|
||||||
|
/// - [`LoadCakError::Io`] -> the store exists but is unreadable in this
|
||||||
|
/// context (open/read error, INCLUDING access-denied). Operational; the caller
|
||||||
|
/// logs it and STOPS — it must NOT silently re-enroll over a store it merely
|
||||||
|
/// cannot read here.
|
||||||
|
/// - [`LoadCakError::Decrypt`] -> the bytes were read but DPAPI decryption
|
||||||
|
/// FAILED (wrong machine / tampered / corrupted). A hard tamper signal; STOP.
|
||||||
|
///
|
||||||
|
/// Only a successful READ whose decrypt fails is the tamper signal — an IO or
|
||||||
|
/// permission error is never conflated with tamper.
|
||||||
|
pub fn load_cak() -> std::result::Result<Option<String>, LoadCakError> {
|
||||||
|
let path = cak_path().map_err(|e| LoadCakError::Path(e.to_string()))?;
|
||||||
|
let ciphertext = match std::fs::read(&path) {
|
||||||
|
Ok(bytes) => bytes,
|
||||||
|
Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(None),
|
||||||
|
Err(e) => {
|
||||||
|
let permission_denied = e.kind() == std::io::ErrorKind::PermissionDenied;
|
||||||
|
return Err(LoadCakError::Io {
|
||||||
|
permission_denied,
|
||||||
|
source: e,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if ciphertext.is_empty() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
// Reaching here means the READ succeeded — so a decrypt failure now IS the real
|
||||||
|
// tamper / wrong-machine signal (never conflated with an IO/permission error).
|
||||||
|
let plaintext =
|
||||||
|
dpapi_unprotect(&ciphertext).map_err(|e| LoadCakError::Decrypt(e.to_string()))?;
|
||||||
|
let cak = String::from_utf8(plaintext)
|
||||||
|
.map_err(|e| LoadCakError::Decrypt(format!("decrypted bytes were not valid UTF-8: {e}")))?;
|
||||||
|
if cak.is_empty() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
Ok(Some(cak))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Remove the stored credential (e.g. on revocation / forced re-enroll).
|
||||||
|
/// Succeeds if the file is already absent.
|
||||||
|
///
|
||||||
|
/// Part of the store/load/clear API the spec requires (SPEC-016 item 4). Not yet
|
||||||
|
/// called from a code path — the relay-side `cak_` revocation / forced re-enroll
|
||||||
|
/// flow that drives it is the deferred SPEC-016 Phase B/D server work (the
|
||||||
|
/// `TODO(SPEC-016 Phase B/D): consider revoking existing cak_ on collision` note
|
||||||
|
/// in `server/src/api/enroll.rs`) — so it is retained as part of the complete
|
||||||
|
/// store API and explicitly allowed dead until that server work lands.
|
||||||
|
#[allow(dead_code)]
|
||||||
|
pub fn clear_cak() -> Result<()> {
|
||||||
|
let path = cak_path()?;
|
||||||
|
match std::fs::remove_file(&path) {
|
||||||
|
Ok(()) => {
|
||||||
|
tracing::info!("[ENROLL] cleared stored per-machine credential");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
|
||||||
|
Err(e) => Err(e).with_context(|| format!("failed to remove {path:?}")),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// DPAPI (machine scope)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// DPAPI-machine-encrypt `plaintext` into a self-contained blob.
|
||||||
|
fn dpapi_protect(plaintext: &[u8]) -> Result<Vec<u8>> {
|
||||||
|
use windows::Win32::Security::Cryptography::{
|
||||||
|
CryptProtectData, CRYPTPROTECT_LOCAL_MACHINE, CRYPT_INTEGER_BLOB,
|
||||||
|
};
|
||||||
|
|
||||||
|
// CryptProtectData requires a mutable input pointer in the struct, though it
|
||||||
|
// does not modify the bytes; copy into a local Vec to get a *mut without
|
||||||
|
// aliasing the caller's slice.
|
||||||
|
let mut input = plaintext.to_vec();
|
||||||
|
let in_blob = CRYPT_INTEGER_BLOB {
|
||||||
|
cbData: u32::try_from(input.len()).context("plaintext too large for DPAPI")?,
|
||||||
|
pbData: input.as_mut_ptr(),
|
||||||
|
};
|
||||||
|
let mut out_blob = CRYPT_INTEGER_BLOB::default();
|
||||||
|
|
||||||
|
// SAFETY: in_blob points at a valid, sized buffer; out_blob is owned here and
|
||||||
|
// its pbData is allocated by DPAPI (freed via LocalFree below). No prompt
|
||||||
|
// struct / entropy / reserved args.
|
||||||
|
unsafe {
|
||||||
|
CryptProtectData(
|
||||||
|
&in_blob,
|
||||||
|
windows::core::PCWSTR::null(),
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
CRYPTPROTECT_LOCAL_MACHINE,
|
||||||
|
&mut out_blob,
|
||||||
|
)
|
||||||
|
.context("CryptProtectData failed")?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let result = copy_and_free_blob(&out_blob);
|
||||||
|
// Best-effort scrub of the transient plaintext copy.
|
||||||
|
input.iter_mut().for_each(|b| *b = 0);
|
||||||
|
|
||||||
|
result.ok_or_else(|| anyhow!("CryptProtectData returned an empty/invalid blob"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// DPAPI-decrypt a blob previously produced by [`dpapi_protect`] on this machine.
|
||||||
|
fn dpapi_unprotect(ciphertext: &[u8]) -> Result<Vec<u8>> {
|
||||||
|
use windows::Win32::Security::Cryptography::{
|
||||||
|
CryptUnprotectData, CRYPTPROTECT_LOCAL_MACHINE, CRYPT_INTEGER_BLOB,
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut input = ciphertext.to_vec();
|
||||||
|
let in_blob = CRYPT_INTEGER_BLOB {
|
||||||
|
cbData: u32::try_from(input.len()).context("ciphertext too large for DPAPI")?,
|
||||||
|
pbData: input.as_mut_ptr(),
|
||||||
|
};
|
||||||
|
let mut out_blob = CRYPT_INTEGER_BLOB::default();
|
||||||
|
|
||||||
|
// SAFETY: as in dpapi_protect — valid sized input, owned output freed below.
|
||||||
|
unsafe {
|
||||||
|
CryptUnprotectData(
|
||||||
|
&in_blob,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
None,
|
||||||
|
CRYPTPROTECT_LOCAL_MACHINE,
|
||||||
|
&mut out_blob,
|
||||||
|
)
|
||||||
|
.context("CryptUnprotectData failed")?;
|
||||||
|
}
|
||||||
|
|
||||||
|
copy_and_free_blob(&out_blob)
|
||||||
|
.ok_or_else(|| anyhow!("CryptUnprotectData returned an empty/invalid blob"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Copy a DPAPI output blob into an owned `Vec` and `LocalFree` the DPAPI buffer.
|
||||||
|
///
|
||||||
|
/// Returns `Some(bytes)` on success, `None` if the blob is null/empty. Always
|
||||||
|
/// frees `pbData` when non-null (DPAPI allocates it with `LocalAlloc`).
|
||||||
|
fn copy_and_free_blob(
|
||||||
|
blob: &windows::Win32::Security::Cryptography::CRYPT_INTEGER_BLOB,
|
||||||
|
) -> Option<Vec<u8>> {
|
||||||
|
use windows::Win32::Foundation::{LocalFree, HLOCAL};
|
||||||
|
|
||||||
|
if blob.pbData.is_null() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// SAFETY: DPAPI guarantees pbData points at cbData valid bytes on success.
|
||||||
|
let bytes = unsafe { std::slice::from_raw_parts(blob.pbData, blob.cbData as usize).to_vec() };
|
||||||
|
// SAFETY: pbData was allocated by DPAPI via LocalAlloc; free it once.
|
||||||
|
unsafe {
|
||||||
|
let _ = LocalFree(HLOCAL(blob.pbData as *mut core::ffi::c_void));
|
||||||
|
}
|
||||||
|
if bytes.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(bytes)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// ACL hardening
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Restrict `path` (file or directory) to SYSTEM + Administrators full control,
|
||||||
|
/// removing inherited ACEs so a permissive parent grant cannot leak read access.
|
||||||
|
///
|
||||||
|
/// Implemented via `icacls` — the documented, auditable mechanism — rather than
|
||||||
|
/// hand-rolling a security descriptor through `SetNamedSecurityInfoW` (hundreds
|
||||||
|
/// of lines of SID/ACL FFI). `icacls` ships on every supported Windows target.
|
||||||
|
/// A failure here is surfaced (the caller treats inability to lock down the
|
||||||
|
/// credential store as a hard error) but the well-known SIDs `*S-1-5-18`
|
||||||
|
/// (LocalSystem) and `*S-1-5-32-544` (BUILTIN\Administrators) are language- and
|
||||||
|
/// locale-independent, so this does not break on localized Windows.
|
||||||
|
fn lock_down_acl(path: &std::path::Path) -> Result<()> {
|
||||||
|
use std::os::windows::process::CommandExt;
|
||||||
|
use std::process::Command;
|
||||||
|
|
||||||
|
const CREATE_NO_WINDOW: u32 = 0x0800_0000;
|
||||||
|
|
||||||
|
let path_str = path
|
||||||
|
.to_str()
|
||||||
|
.ok_or_else(|| anyhow!("credential path is not valid UTF-8: {path:?}"))?;
|
||||||
|
|
||||||
|
// /inheritance:r -> remove inherited ACEs (drop the permissive parent grant)
|
||||||
|
// /grant:r -> replace any existing explicit grants for the principal
|
||||||
|
// *S-1-5-18 -> LocalSystem; *S-1-5-32-544 -> BUILTIN\Administrators
|
||||||
|
let output = Command::new("icacls")
|
||||||
|
.arg(path_str)
|
||||||
|
.args([
|
||||||
|
"/inheritance:r",
|
||||||
|
"/grant:r",
|
||||||
|
"*S-1-5-18:(OI)(CI)F",
|
||||||
|
"/grant:r",
|
||||||
|
"*S-1-5-32-544:(OI)(CI)F",
|
||||||
|
])
|
||||||
|
.creation_flags(CREATE_NO_WINDOW)
|
||||||
|
.output()
|
||||||
|
.context("failed to invoke icacls to harden credential ACL")?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
// icacls writes its diagnostics to stdout; surface the code only (no
|
||||||
|
// credential material is ever passed to icacls, only the path).
|
||||||
|
return Err(anyhow!(
|
||||||
|
"icacls failed to harden {path_str} (exit {:?})",
|
||||||
|
output.status.code()
|
||||||
|
));
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Protect-then-unprotect on the same machine must hand back the exact
    /// plaintext, and the sealed blob must not equal the plaintext. (The test
    /// host IS the same machine, so the machine-scope key is available to any
    /// process here.)
    #[test]
    fn dpapi_roundtrip_recovers_plaintext() {
        let secret = b"cak_test_value_0123456789abcdef";

        let sealed = dpapi_protect(secret).expect("DPAPI protect should succeed on this machine");
        assert_ne!(
            sealed.as_slice(),
            secret.as_slice(),
            "ciphertext must differ from plaintext"
        );

        let opened = dpapi_unprotect(&sealed).expect("DPAPI unprotect should succeed");
        assert_eq!(opened, secret, "round-trip must recover the exact bytes");
    }

    /// Round-trips plaintexts of several lengths: one byte, the four-byte
    /// `cak_` prefix, and 256 zero bytes. A zero-length plaintext is NOT part
    /// of this set.
    #[test]
    fn dpapi_roundtrip_handles_varied_lengths() {
        let zeros = [0u8; 256];
        let samples: [&[u8]; 3] = [b"x", b"cak_", &zeros];

        for sample in samples {
            let sealed = dpapi_protect(sample).expect("protect");
            let opened = dpapi_unprotect(&sealed).expect("unprotect");
            assert_eq!(opened.as_slice(), sample);
        }
    }

    /// A modified ciphertext must be REJECTED on decrypt rather than yielding
    /// garbage — DPAPI authenticates its blobs.
    #[test]
    fn dpapi_rejects_tampered_blob() {
        let mut sealed = dpapi_protect(b"cak_tamper_target").expect("protect");

        // Invert every bit of the byte in the middle of the blob.
        let mid = sealed.len() / 2;
        sealed[mid] = !sealed[mid];

        let outcome = dpapi_unprotect(&sealed);
        assert!(
            outcome.is_err(),
            "a tampered DPAPI blob must fail to decrypt"
        );
    }
}
|
||||||
384
agent/src/enroll.rs
Normal file
384
agent/src/enroll.rs
Normal file
@@ -0,0 +1,384 @@
|
|||||||
|
//! First-run self-enrollment client (SPEC-016 Phase B, item 4).
|
||||||
|
//!
|
||||||
|
//! When the agent runs as a persistent (`PermanentAgent`) install with NO stored
|
||||||
|
//! `cak_` but WITH an `enrollment_key` + `site_code`, it walks through the
|
||||||
|
//! public, unauthenticated `POST /api/enroll` door: it presents its site
|
||||||
|
//! credentials and its hardware-derived `machine_uid`, and — on success — the
|
||||||
|
//! server mints and returns a per-machine `cak_` operating credential exactly
|
||||||
|
//! once. The agent persists that `cak_` encrypted at rest
|
||||||
|
//! ([`crate::credential_store`]) and connects with it; on every later run it uses
|
||||||
|
//! the stored `cak_` directly and never re-enrolls.
|
||||||
|
//!
|
||||||
|
//! Server contract consumed (must match `server/src/api/enroll.rs`):
|
||||||
|
//! - Request: `{ site_code, enrollment_key, machine_uid, hostname,
|
||||||
|
//! labels:{company,site,department,device_type,tags} }`.
|
||||||
|
//! - `201 Created` -> new enrollment; body has `key` (the `cak_`).
|
||||||
|
//! - `200 OK` -> reuse (re-image / re-install); body has `key`.
|
||||||
|
//! - `202 Accepted` -> `collision_pending`; NO key — operator must confirm in
|
||||||
|
//! the dashboard before the endpoint can connect.
|
||||||
|
//! - `401 Unauthorized` -> `ENROLL_REJECTED` (bad/rotated key or unknown site):
|
||||||
|
//! terminal-ish config problem, back off long.
|
||||||
|
//! - `409 Conflict` -> `ENROLL_SITE_CONFLICT` (machine bound to another site):
|
||||||
|
//! terminal-ish, requires the operator reassignment flow; back off long.
|
||||||
|
//! - `429 Too Many Requests` -> rate-limited; back off and retry.
|
||||||
|
//!
|
||||||
|
//! SECURITY: never log the `enrollment_key` or the minted `cak_`. Only states,
|
||||||
|
//! dispositions, and the (non-secret) `machine_uid`/`site_code` are logged.
|
||||||
|
|
||||||
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use crate::config::Config;
|
||||||
|
|
||||||
|
/// `POST /api/enroll` request body — mirrors `enroll::EnrollRequest`.
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
struct EnrollRequest<'a> {
|
||||||
|
site_code: &'a str,
|
||||||
|
enrollment_key: &'a str,
|
||||||
|
machine_uid: &'a str,
|
||||||
|
hostname: &'a str,
|
||||||
|
labels: EnrollLabels<'a>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Labels carried at enrollment — mirrors `enroll::EnrollLabels`.
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
struct EnrollLabels<'a> {
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
company: Option<&'a str>,
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
site: Option<&'a str>,
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
department: Option<&'a str>,
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
device_type: Option<&'a str>,
|
||||||
|
#[serde(skip_serializing_if = "slice_is_empty")]
|
||||||
|
tags: &'a [String],
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Predicate used by `skip_serializing_if` on the `tags` slice: returns `true`
/// when the slice has no elements.
///
/// A dedicated slice-typed helper is needed because `Vec::is_empty` cannot
/// bind the `&&[String]` that serde passes for a `&[String]` field.
fn slice_is_empty(s: &[String]) -> bool {
    matches!(s, [])
}
|
||||||
|
|
||||||
|
/// `POST /api/enroll` success body — mirrors `enroll::EnrollResponse`.
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
struct EnrollResponse {
|
||||||
|
#[allow(dead_code)]
|
||||||
|
machine_id: String,
|
||||||
|
#[serde(default)]
|
||||||
|
key: Option<String>,
|
||||||
|
enrollment_state: String,
|
||||||
|
disposition: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Backoff after a retryable failure (429 / network / 5xx).
|
||||||
|
const RETRYABLE_BACKOFF: Duration = Duration::from_secs(30);
|
||||||
|
/// Backoff after a terminal-ish config failure (401 / 409) or collision-pending.
|
||||||
|
/// These won't fix themselves without operator action, so retry slowly rather
|
||||||
|
/// than hot-looping while still recovering automatically once it IS fixed.
|
||||||
|
const TERMINAL_BACKOFF: Duration = Duration::from_secs(300);
|
||||||
|
|
||||||
|
/// Drive enrollment until a `cak_` is issued, persisting it into the credential
|
||||||
|
/// store on success and loading it into `config.api_key`.
|
||||||
|
///
|
||||||
|
/// Loops with backoff across retryable failures (it must not give up — a managed
|
||||||
|
/// machine left running should eventually enroll once the server/site is healthy)
|
||||||
|
/// and across collision-pending (HTTP 202: it keeps re-checking on a slow cadence
|
||||||
|
/// until an operator confirms the endpoint in the dashboard and the server begins
|
||||||
|
/// issuing a key). Returns `Ok(())` only once a `cak_` is stored. The only `Err`
|
||||||
|
/// returns are unrecoverable local faults (missing config, an un-persistable
|
||||||
|
/// credential) — network/HTTP failures are retried, never propagated.
|
||||||
|
pub async fn run_enrollment(config: &mut Config) -> Result<()> {
|
||||||
|
let site_code = config
|
||||||
|
.site_code
|
||||||
|
.clone()
|
||||||
|
.ok_or_else(|| anyhow!("enrollment requested but no site_code is configured"))?;
|
||||||
|
let enrollment_key = config
|
||||||
|
.enrollment_key
|
||||||
|
.clone()
|
||||||
|
.ok_or_else(|| anyhow!("enrollment requested but no enrollment_key is configured"))?;
|
||||||
|
|
||||||
|
let https_base = config.https_base()?;
|
||||||
|
let machine_uid = crate::identity::machine_uid();
|
||||||
|
let hostname = config.hostname();
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
"[ENROLL] first-run enrollment: site_code={} machine_uid={} hostname={}",
|
||||||
|
site_code,
|
||||||
|
machine_uid,
|
||||||
|
hostname
|
||||||
|
);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match attempt_enroll(
|
||||||
|
&https_base,
|
||||||
|
&site_code,
|
||||||
|
&enrollment_key,
|
||||||
|
&machine_uid,
|
||||||
|
&hostname,
|
||||||
|
config,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(AttemptResult::Issued(cak)) => {
|
||||||
|
// Persist encrypted-at-rest, then load into the live config so the
|
||||||
|
// transport authenticates with the new per-machine credential.
|
||||||
|
#[cfg(windows)]
|
||||||
|
crate::credential_store::store_cak(&cak)
|
||||||
|
.context("failed to persist issued cak_ to the credential store")?;
|
||||||
|
config.api_key = cak;
|
||||||
|
// Enrollment material is single-use; drop it so it is not retained
|
||||||
|
// in memory or accidentally reused.
|
||||||
|
config.enrollment_key = None;
|
||||||
|
tracing::info!("[ENROLL] enrollment complete; connecting with per-machine key");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
Ok(AttemptResult::Pending) => {
|
||||||
|
tracing::warn!(
|
||||||
|
"[ENROLL] pending operator confirmation (machine_uid collision); \
|
||||||
|
this machine cannot connect until confirmed in the dashboard. \
|
||||||
|
Re-checking in {}s.",
|
||||||
|
TERMINAL_BACKOFF.as_secs()
|
||||||
|
);
|
||||||
|
tokio::time::sleep(TERMINAL_BACKOFF).await;
|
||||||
|
}
|
||||||
|
Err(AttemptError::Terminal(msg)) => {
|
||||||
|
tracing::error!(
|
||||||
|
"[ENROLL] enrollment refused (operator action required): {msg}. \
|
||||||
|
Retrying in {}s.",
|
||||||
|
TERMINAL_BACKOFF.as_secs()
|
||||||
|
);
|
||||||
|
tokio::time::sleep(TERMINAL_BACKOFF).await;
|
||||||
|
}
|
||||||
|
Err(AttemptError::Retryable(msg)) => {
|
||||||
|
tracing::warn!(
|
||||||
|
"[ENROLL] transient enrollment failure: {msg}. Retrying in {}s.",
|
||||||
|
RETRYABLE_BACKOFF.as_secs()
|
||||||
|
);
|
||||||
|
tokio::time::sleep(RETRYABLE_BACKOFF).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Result of one HTTP enrollment attempt.
|
||||||
|
enum AttemptResult {
|
||||||
|
/// A `cak_` was issued (201/200). Carries the plaintext (never logged).
|
||||||
|
Issued(String),
|
||||||
|
/// Collision-gated (202): no key issued.
|
||||||
|
Pending,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Failure classes that drive the backoff policy.
|
||||||
|
enum AttemptError {
|
||||||
|
/// 401/409 — won't fix without operator action; back off long but keep trying.
|
||||||
|
Terminal(String),
|
||||||
|
/// 429 / network / 5xx / decode — transient; short backoff.
|
||||||
|
Retryable(String),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Make one `POST /api/enroll` call and classify the response per the contract.
|
||||||
|
async fn attempt_enroll(
|
||||||
|
https_base: &str,
|
||||||
|
site_code: &str,
|
||||||
|
enrollment_key: &str,
|
||||||
|
machine_uid: &str,
|
||||||
|
hostname: &str,
|
||||||
|
config: &Config,
|
||||||
|
) -> std::result::Result<AttemptResult, AttemptError> {
|
||||||
|
let url = format!("{}/api/enroll", https_base.trim_end_matches('/'));
|
||||||
|
|
||||||
|
let body = EnrollRequest {
|
||||||
|
site_code,
|
||||||
|
enrollment_key,
|
||||||
|
machine_uid,
|
||||||
|
hostname,
|
||||||
|
labels: EnrollLabels {
|
||||||
|
company: config.company.as_deref().filter(|s| !s.is_empty()),
|
||||||
|
site: config.site.as_deref().filter(|s| !s.is_empty()),
|
||||||
|
department: config.department.as_deref().filter(|s| !s.is_empty()),
|
||||||
|
device_type: config.device_type.as_deref().filter(|s| !s.is_empty()),
|
||||||
|
tags: &config.tags,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
let client = build_client().map_err(|e| AttemptError::Retryable(e.to_string()))?;
|
||||||
|
|
||||||
|
let response = client
|
||||||
|
.post(&url)
|
||||||
|
.json(&body)
|
||||||
|
.timeout(Duration::from_secs(30))
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.map_err(|e| AttemptError::Retryable(format!("request to {url} failed: {e}")))?;
|
||||||
|
|
||||||
|
let status = response.status();
|
||||||
|
match status.as_u16() {
|
||||||
|
// New (201) or reuse (200): body carries the cak_.
|
||||||
|
200 | 201 => {
|
||||||
|
let parsed: EnrollResponse = response
|
||||||
|
.json()
|
||||||
|
.await
|
||||||
|
.map_err(|e| AttemptError::Retryable(format!("malformed success body: {e}")))?;
|
||||||
|
match parsed.key {
|
||||||
|
Some(cak) if !cak.is_empty() => {
|
||||||
|
tracing::info!(
|
||||||
|
"[ENROLL] server accepted enrollment: state={} disposition={}",
|
||||||
|
parsed.enrollment_state,
|
||||||
|
parsed.disposition
|
||||||
|
);
|
||||||
|
Ok(AttemptResult::Issued(cak))
|
||||||
|
}
|
||||||
|
// 2xx with no key is contract-violating for the active path; treat
|
||||||
|
// as retryable so we don't silently spin or crash.
|
||||||
|
_ => Err(AttemptError::Retryable(format!(
|
||||||
|
"server returned {} with no key (state={}, disposition={})",
|
||||||
|
status, parsed.enrollment_state, parsed.disposition
|
||||||
|
))),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collision-gated: pending operator confirmation, no key.
|
||||||
|
202 => {
|
||||||
|
// Body decode is best-effort here; the status alone is authoritative.
|
||||||
|
Ok(AttemptResult::Pending)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Bad/rotated enrollment key or unknown site code.
|
||||||
|
401 => Err(AttemptError::Terminal(
|
||||||
|
"ENROLL_REJECTED — the site code or enrollment key is invalid or rotated; \
|
||||||
|
this installer needs a current per-site key"
|
||||||
|
.to_string(),
|
||||||
|
)),
|
||||||
|
|
||||||
|
// Machine already enrolled at a different site.
|
||||||
|
409 => Err(AttemptError::Terminal(
|
||||||
|
"ENROLL_SITE_CONFLICT — this machine is already enrolled at another site; \
|
||||||
|
a deliberate move requires the operator-initiated reassignment flow"
|
||||||
|
.to_string(),
|
||||||
|
)),
|
||||||
|
|
||||||
|
// Rate-limited / locked out — honor Retry-After if present, else default.
|
||||||
|
429 => {
|
||||||
|
let retry_after = response
|
||||||
|
.headers()
|
||||||
|
.get(reqwest::header::RETRY_AFTER)
|
||||||
|
.and_then(|v| v.to_str().ok())
|
||||||
|
.and_then(|s| s.parse::<u64>().ok());
|
||||||
|
Err(AttemptError::Retryable(match retry_after {
|
||||||
|
Some(secs) => format!("RATE_LIMITED (retry-after {secs}s)"),
|
||||||
|
None => "RATE_LIMITED".to_string(),
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
|
||||||
|
// 5xx or anything else — transient from the agent's perspective.
|
||||||
|
_ => Err(AttemptError::Retryable(format!(
|
||||||
|
"unexpected enrollment response: HTTP {status}"
|
||||||
|
))),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build the HTTP client for enrollment, matching the update path's TLS posture
|
||||||
|
/// (`rustls`, with an opt-in dev-insecure escape hatch in debug builds only).
|
||||||
|
fn build_client() -> Result<reqwest::Client> {
|
||||||
|
reqwest::Client::builder()
|
||||||
|
.danger_accept_invalid_certs(dev_insecure_tls())
|
||||||
|
.build()
|
||||||
|
.context("failed to build enrollment HTTP client")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Dev-only TLS bypass — identical policy to `update::dev_insecure_tls`: only in
|
||||||
|
/// debug builds AND only when `GURUCONNECT_DEV_INSECURE_TLS` is set. NEVER active
|
||||||
|
/// in a release build.
|
||||||
|
fn dev_insecure_tls() -> bool {
|
||||||
|
if cfg!(debug_assertions) && std::env::var("GURUCONNECT_DEV_INSECURE_TLS").is_ok() {
|
||||||
|
tracing::warn!(
|
||||||
|
"[ENROLL] TLS verification DISABLED (dev-insecure mode) — DO NOT use in production"
|
||||||
|
);
|
||||||
|
true
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the wire shape of the request: it must serialize to exactly the
    /// field names the Phase A server deserializes (`enroll::EnrollRequest` /
    /// `EnrollLabels`). Drift here would be a silent enrollment failure.
    #[test]
    fn request_serializes_to_the_server_contract() {
        let tags = vec!["prod".to_string()];
        let labels = EnrollLabels {
            company: Some("Acme"),
            site: Some("HQ"),
            department: Some("IT"),
            device_type: Some("workstation"),
            tags: &tags,
        };
        let req = EnrollRequest {
            site_code: "ACME-HQ",
            enrollment_key: "cek_secret",
            machine_uid: "muid_abc",
            hostname: "WS-01",
            labels,
        };

        let v: serde_json::Value = serde_json::to_value(&req).unwrap();

        let top_level = [
            ("site_code", "ACME-HQ"),
            ("enrollment_key", "cek_secret"),
            ("machine_uid", "muid_abc"),
            ("hostname", "WS-01"),
        ];
        for (field, want) in top_level {
            assert_eq!(v[field], want);
        }

        let label_fields = [
            ("company", "Acme"),
            ("site", "HQ"),
            ("department", "IT"),
            ("device_type", "workstation"),
        ];
        for (field, want) in label_fields {
            assert_eq!(v["labels"][field], want);
        }
        assert_eq!(v["labels"]["tags"][0], "prod");
    }

    /// Absent optional labels and an empty tag list are left out of the body
    /// entirely (the server defaults them), keeping the payload minimal for a
    /// thin installer.
    #[test]
    fn request_omits_empty_optional_labels() {
        let no_tags: Vec<String> = Vec::new();
        let req = EnrollRequest {
            site_code: "S",
            enrollment_key: "cek_x",
            machine_uid: "muid_x",
            hostname: "H",
            labels: EnrollLabels {
                company: None,
                site: None,
                department: None,
                device_type: None,
                tags: &no_tags,
            },
        };

        let v: serde_json::Value = serde_json::to_value(&req).unwrap();
        let labels = v["labels"].as_object().unwrap();

        for omitted in ["company", "department", "tags"] {
            assert!(labels.get(omitted).is_none());
        }
    }

    /// The response decoder must accept both shapes the server sends: an
    /// active body that carries a key, and a pending body without one.
    #[test]
    fn response_decodes_active_and_pending_shapes() {
        let active_json =
            r#"{"machine_id":"m1","key":"cak_live","enrollment_state":"active","disposition":"new"}"#;
        let active: EnrollResponse = serde_json::from_str(active_json).unwrap();
        assert_eq!(active.key.as_deref(), Some("cak_live"));
        assert_eq!(active.enrollment_state, "active");

        let pending_json =
            r#"{"machine_id":"m2","enrollment_state":"pending","disposition":"collision_pending"}"#;
        let pending: EnrollResponse = serde_json::from_str(pending_json).unwrap();
        assert!(pending.key.is_none());
        assert_eq!(pending.disposition, "collision_pending");
    }
}
|
||||||
@@ -9,22 +9,48 @@
|
|||||||
//! **recomputable**: the same machine yields the same id on every call with no
|
//! **recomputable**: the same machine yields the same id on every call with no
|
||||||
//! persistence required.
|
//! persistence required.
|
||||||
//!
|
//!
|
||||||
//! - **Windows:** SHA-256 hash of the OS machine GUID read from
|
//! - **Windows:** SHA-256 of a hardware identity string. The id is derived from
|
||||||
//! `HKLM\SOFTWARE\Microsoft\Cryptography\MachineGuid` (a `REG_SZ`). The raw
|
//! the **hardware salt ONLY** whenever any durable hardware signal is readable:
|
||||||
//! GUID is never returned — only the opaque `muid_<hex>` derived from it.
|
//! the **SMBIOS system UUID** (`Win32_ComputerSystemProduct.UUID`), or — when
|
||||||
//! - **Non-Windows (and Windows registry failure):** a random UUID persisted in
|
//! that is absent / all-zeros / all-FFs (some OEMs/hypervisors) — the
|
||||||
//! the agent's data directory, read back on subsequent runs so it is stable
|
//! **motherboard serial** (`Win32_BaseBoard.SerialNumber`) plus the **primary
|
||||||
//! across calls and process restarts.
|
//! disk serial**. A fixed namespace string is mixed in for domain separation.
|
||||||
|
//! The OS machine GUID
|
||||||
|
//! (`HKLM\SOFTWARE\Microsoft\Cryptography\MachineGuid`, a `REG_SZ`) is used
|
||||||
|
//! ONLY as a last-resort signal when NO hardware salt is readable. The raw
|
||||||
|
//! signals are never returned — only the opaque `muid_<hex>` derived from them.
|
||||||
|
//! - **Non-Windows (and Windows with no readable signal at all):** a random UUID
|
||||||
|
//! persisted in the agent's data directory, read back on subsequent runs so it
|
||||||
|
//! is stable across calls and process restarts.
|
||||||
|
//!
|
||||||
|
//! **Stability contract (SPEC-016 item 1):**
|
||||||
|
//! - **Salted path (hardware signal present) is re-image-stable:** the digest
|
||||||
|
//! mixes only durable hardware signals (SMBIOS UUID, or board + disk serial) and
|
||||||
|
//! a fixed namespace — NOT the `MachineGuid`, which Windows regenerates on every
|
||||||
|
//! OS install/re-image. So the `machine_uid` survives both a reboot AND an OS
|
||||||
|
//! re-image on the SAME hardware (the re-image dedup goal), while distinct
|
||||||
|
//! physical boxes stay distinct.
|
||||||
|
//! - **MachineGuid-only path is the volatile floor:** when no hardware salt is
|
||||||
|
//! readable, the id anchors on the `MachineGuid` alone. This is stable across
|
||||||
|
//! reboots but NOT across a re-image (the GUID is regenerated). This degraded
|
||||||
|
//! path is logged at WARN so the server-side collision gate operator has a clue.
|
||||||
//!
|
//!
|
||||||
//! This module deliberately does NOT change `agent_id`/`generate_agent_id`.
|
//! This module deliberately does NOT change `agent_id`/`generate_agent_id`.
|
||||||
//! `machine_uid` is reported *alongside* `agent_id`; the server-side dedup that
|
//! `machine_uid` is reported *alongside* `agent_id`; the server-side dedup that
|
||||||
//! consumes it is a separate task.
|
//! consumes it lives in `POST /api/enroll` (SPEC-016 Phase A) and the relay
|
||||||
|
//! connect path.
|
||||||
|
|
||||||
use std::sync::OnceLock;
|
use std::sync::OnceLock;
|
||||||
|
|
||||||
/// Prefix marking the value as an opaque machine-uid (vs. a raw GUID/UUID).
|
/// Prefix marking the value as an opaque machine-uid (vs. a raw GUID/UUID).
|
||||||
const MUID_PREFIX: &str = "muid_";
|
const MUID_PREFIX: &str = "muid_";
|
||||||
|
|
||||||
|
/// Fixed namespace mixed into the hardware-salted derivation for domain
|
||||||
|
/// separation: it ties the digest to *this* identity scheme so the same raw
|
||||||
|
/// hardware serial can never collide with an unrelated digest, and it documents
|
||||||
|
/// the derivation version. It is NOT a secret — it is a constant.
|
||||||
|
const MUID_NAMESPACE: &str = "guruconnect:machine_uid:v1";
|
||||||
|
|
||||||
/// Cached value — `machine_uid()` reads the registry / a file, so compute once
|
/// Cached value — `machine_uid()` reads the registry / a file, so compute once
|
||||||
/// and reuse for the lifetime of the process.
|
/// and reuse for the lifetime of the process.
|
||||||
static MACHINE_UID: OnceLock<String> = OnceLock::new();
|
static MACHINE_UID: OnceLock<String> = OnceLock::new();
|
||||||
@@ -32,10 +58,11 @@ static MACHINE_UID: OnceLock<String> = OnceLock::new();
|
|||||||
/// Return a deterministic, recomputable opaque machine identifier.
|
/// Return a deterministic, recomputable opaque machine identifier.
|
||||||
///
|
///
|
||||||
/// The result is non-empty and prefixed with [`MUID_PREFIX`]. It is cached after
|
/// The result is non-empty and prefixed with [`MUID_PREFIX`]. It is cached after
|
||||||
/// the first call. On Windows it is derived purely from the OS machine GUID (no
|
/// the first call. On Windows it is derived from a durable hardware salt when one
|
||||||
/// persistence). If the Windows registry read fails — or on any non-Windows
|
/// is readable (re-image-stable; see the module docs), falling back to the OS
|
||||||
/// platform — it degrades to a persisted random UUID (today's-behavior-equivalent
|
/// machine GUID alone (reboot-stable floor) and finally — when no signal at all is
|
||||||
/// stability) rather than panicking.
|
/// readable, or on any non-Windows platform — a persisted random UUID, rather than
|
||||||
|
/// panicking.
|
||||||
pub fn machine_uid() -> String {
|
pub fn machine_uid() -> String {
|
||||||
MACHINE_UID.get_or_init(compute_machine_uid).clone()
|
MACHINE_UID.get_or_init(compute_machine_uid).clone()
|
||||||
}
|
}
|
||||||
@@ -56,23 +83,265 @@ fn derive_uid(raw: &str) -> String {
|
|||||||
|
|
||||||
#[cfg(windows)]
|
#[cfg(windows)]
|
||||||
fn compute_machine_uid() -> String {
|
fn compute_machine_uid() -> String {
|
||||||
|
// PRIMARY signal (SPEC-016 item 1): a durable hardware salt — SMBIOS system
|
||||||
|
// UUID if usable, else motherboard + disk serial. When ANY hardware salt is
|
||||||
|
// readable we derive the uid from the salt ALONE (plus a fixed namespace),
|
||||||
|
// deliberately EXCLUDING the MachineGuid: Windows regenerates the MachineGuid
|
||||||
|
// on every OS install/re-image, so mixing it in would break re-image dedup.
|
||||||
|
// The salted digest survives both reboot AND re-image on the same hardware.
|
||||||
|
if let Some(salt) = hardware_salt() {
|
||||||
|
tracing::info!("machine_uid derived from durable hardware salt (re-image-stable)");
|
||||||
|
return derive_uid(&format!("{MUID_NAMESPACE}|{salt}"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// LAST-RESORT signal: no hardware salt is readable, so anchor on the OS
|
||||||
|
// MachineGuid alone. This is the volatile FLOOR — stable across reboots but
|
||||||
|
// NOT across an OS re-image (the GUID is regenerated). We WARN so the
|
||||||
|
// server-side collision-gate operator knows this endpoint's uid is not
|
||||||
|
// re-image-stable. The MachineGuid itself is never logged.
|
||||||
match read_machine_guid() {
|
match read_machine_guid() {
|
||||||
Ok(guid) if !guid.trim().is_empty() => derive_uid(guid.trim()),
|
Ok(guid) if !guid.trim().is_empty() => {
|
||||||
|
tracing::warn!(
|
||||||
|
"machine_uid: no durable hardware salt readable; anchoring on MachineGuid \
|
||||||
|
ONLY — this id is reboot-stable but NOT re-image-stable"
|
||||||
|
);
|
||||||
|
derive_uid(&format!("{MUID_NAMESPACE}|machineguid:{}", guid.trim()))
|
||||||
|
}
|
||||||
Ok(_) => {
|
Ok(_) => {
|
||||||
tracing::warn!(
|
tracing::warn!(
|
||||||
"MachineGuid registry value was empty; falling back to persisted machine_uid"
|
"machine_uid: no hardware salt and MachineGuid registry value was empty; \
|
||||||
|
falling back to persisted machine_uid"
|
||||||
);
|
);
|
||||||
persisted_uid()
|
persisted_uid()
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
tracing::warn!(
|
tracing::warn!(
|
||||||
"Failed to read MachineGuid from registry ({e}); falling back to persisted machine_uid"
|
"machine_uid: no hardware salt and failed to read MachineGuid ({e}); \
|
||||||
|
falling back to persisted machine_uid"
|
||||||
);
|
);
|
||||||
persisted_uid()
|
persisted_uid()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Collect the durable hardware salt for the `machine_uid` (Windows only).
|
||||||
|
///
|
||||||
|
/// This is the PRIMARY identity signal: when it returns `Some(salt)`, the caller
|
||||||
|
/// derives the uid from the salt ALONE (re-image-stable). Returns `Some(salt)`
|
||||||
|
/// where `salt` is a deterministic, normalized concatenation of usable hardware
|
||||||
|
/// signals, or `None` when nothing durable is readable (in which case the caller
|
||||||
|
/// degrades to anchoring on the MachineGuid alone — the volatile floor).
|
||||||
|
///
|
||||||
|
/// Order of preference, per SPEC-016 item 1:
|
||||||
|
/// 1. SMBIOS system UUID (`Win32_ComputerSystemProduct.UUID`) — when present and
|
||||||
|
/// not a degenerate placeholder (all-zeros / all-FFs, which some OEMs and
|
||||||
|
/// hypervisor templates emit).
|
||||||
|
/// 2. Fallback: motherboard serial (`Win32_BaseBoard.SerialNumber`) + primary
|
||||||
|
/// disk serial — combined so a single weak signal does not stand alone.
|
||||||
|
///
|
||||||
|
/// Each component is read via a narrow PowerShell CIM query (see
|
||||||
|
/// [`query_cim_property`]); the values are normalized (trimmed, upper-cased) so
|
||||||
|
/// trivial formatting drift never changes the digest.
|
||||||
|
#[cfg(windows)]
|
||||||
|
fn hardware_salt() -> Option<String> {
|
||||||
|
if let Some(uuid) = smbios_uuid() {
|
||||||
|
return Some(format!("smbios:{uuid}"));
|
||||||
|
}
|
||||||
|
|
||||||
|
// SMBIOS UUID unusable — fall back to board + disk serial. Use whichever of
|
||||||
|
// the two are readable; require at least one to be present, otherwise there
|
||||||
|
// is no durable salt and we return None.
|
||||||
|
let board = normalize_signal(query_cim_property("Win32_BaseBoard", "SerialNumber").as_deref());
|
||||||
|
let disk = primary_disk_serial();
|
||||||
|
|
||||||
|
match (board, disk) {
|
||||||
|
(Some(b), Some(d)) => Some(format!("board:{b}|disk:{d}")),
|
||||||
|
(Some(b), None) => Some(format!("board:{b}")),
|
||||||
|
(None, Some(d)) => Some(format!("disk:{d}")),
|
||||||
|
(None, None) => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The SMBIOS system UUID, or `None` if absent or a degenerate placeholder.
|
||||||
|
///
|
||||||
|
/// Some OEMs ship an all-zeros UUID and some hypervisor templates clone an
|
||||||
|
/// all-FFs (or all-zeros) UUID; either is worthless as a distinguishing signal,
|
||||||
|
/// so we reject both and let the caller fall back to board/disk serial.
|
||||||
|
#[cfg(windows)]
|
||||||
|
fn smbios_uuid() -> Option<String> {
|
||||||
|
let raw =
|
||||||
|
normalize_signal(query_cim_property("Win32_ComputerSystemProduct", "UUID").as_deref())?;
|
||||||
|
|
||||||
|
// Reject degenerate placeholders (ignoring dashes): all-zeros or all-FFs.
|
||||||
|
let hex: String = raw.chars().filter(|c| *c != '-').collect();
|
||||||
|
let all_zero = !hex.is_empty() && hex.chars().all(|c| c == '0');
|
||||||
|
let all_ff = !hex.is_empty() && hex.chars().all(|c| c == 'F');
|
||||||
|
if hex.is_empty() || all_zero || all_ff {
|
||||||
|
tracing::debug!("SMBIOS UUID is absent or a degenerate placeholder; using fallback salt");
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(raw)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The serial number of the primary (boot/index-0) physical disk, normalized.
|
||||||
|
///
|
||||||
|
/// Prefers the disk whose `Index == 0` (the conventional boot disk); falls back
|
||||||
|
/// to the first disk that reports any serial. Returns `None` if no disk reports a
|
||||||
|
/// usable serial.
|
||||||
|
#[cfg(windows)]
|
||||||
|
fn primary_disk_serial() -> Option<String> {
|
||||||
|
// One narrow query: index + serial for all physical disks, sorted by index,
|
||||||
|
// emitted as `index<TAB>serial` lines. Parse the lowest-index non-empty serial.
|
||||||
|
let script = "Get-CimInstance -ClassName Win32_DiskDrive | \
|
||||||
|
Sort-Object Index | \
|
||||||
|
ForEach-Object { \"$($_.Index)`t$($_.SerialNumber)\" }";
|
||||||
|
let out = run_powershell(script)?;
|
||||||
|
for line in out.lines() {
|
||||||
|
let mut parts = line.splitn(2, '\t');
|
||||||
|
let _index = parts.next();
|
||||||
|
if let Some(serial) = parts.next() {
|
||||||
|
if let Some(n) = normalize_signal(Some(serial)) {
|
||||||
|
return Some(n);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read a single property of a single-instance CIM class via PowerShell.
|
||||||
|
///
|
||||||
|
/// Returns the first non-empty line of output, whitespace-trimmed but not otherwise normalized, or `None`. This is
|
||||||
|
/// a deliberately narrow shell-out rather than a full WMI/COM binding: the agent
|
||||||
|
/// already has no WMI crate, and a COM `IWbemServices` binding for two scalar
|
||||||
|
/// reads would be far more code and unsafe surface for no benefit. PowerShell's
|
||||||
|
/// CIM cmdlets are present on every supported Windows target (7 SP1+/2008 R2+
|
||||||
|
/// ship WMI; CIM cmdlets ship from PowerShell 3.0 / WMF 3.0, universally present
|
||||||
|
/// on currently-supported builds).
|
||||||
|
#[cfg(windows)]
|
||||||
|
fn query_cim_property(class: &str, property: &str) -> Option<String> {
|
||||||
|
// `(Get-CimInstance -ClassName X).Property` — single scalar, no formatting.
|
||||||
|
let script = format!("(Get-CimInstance -ClassName {class}).{property}");
|
||||||
|
let out = run_powershell(&script)?;
|
||||||
|
out.lines()
|
||||||
|
.map(str::trim)
|
||||||
|
.find(|l| !l.is_empty())
|
||||||
|
.map(str::to_string)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Wall-clock bound on a single PowerShell hardware-signal query.
|
||||||
|
///
|
||||||
|
/// A wedged WMI/CIM provider can hang indefinitely; without a bound that would
|
||||||
|
/// hang agent startup forever. On timeout we kill the child and treat the signal
|
||||||
|
/// as missing (fall back through the chain) — never panic.
|
||||||
|
#[cfg(windows)]
|
||||||
|
const POWERSHELL_QUERY_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(10);
|
||||||
|
|
||||||
|
/// Run a short PowerShell snippet and capture stdout, or `None` on any failure
|
||||||
|
/// (including a wall-clock timeout).
|
||||||
|
///
|
||||||
|
/// Hidden window (`CREATE_NO_WINDOW`) so an interactive desktop never flashes a
|
||||||
|
/// console; `-NonInteractive -NoProfile` for determinism and speed. The call is
|
||||||
|
/// spawned and waited on with a [`POWERSHELL_QUERY_TIMEOUT`] bound so a stuck WMI
|
||||||
|
/// provider cannot wedge startup; on timeout the child is killed and the signal is
|
||||||
|
/// treated as missing. Never logs the captured output (it carries hardware
|
||||||
|
/// identifiers).
|
||||||
|
#[cfg(windows)]
|
||||||
|
fn run_powershell(script: &str) -> Option<String> {
|
||||||
|
use std::io::Read;
|
||||||
|
use std::os::windows::process::CommandExt;
|
||||||
|
use std::process::{Command, Stdio};
|
||||||
|
use std::time::Instant;
|
||||||
|
|
||||||
|
// CREATE_NO_WINDOW — avoid a console flash on the interactive desktop.
|
||||||
|
const CREATE_NO_WINDOW: u32 = 0x0800_0000;
|
||||||
|
|
||||||
|
let mut child = match Command::new("powershell.exe")
|
||||||
|
.args([
|
||||||
|
"-NonInteractive",
|
||||||
|
"-NoProfile",
|
||||||
|
"-ExecutionPolicy",
|
||||||
|
"Bypass",
|
||||||
|
"-Command",
|
||||||
|
script,
|
||||||
|
])
|
||||||
|
.stdin(Stdio::null())
|
||||||
|
.stdout(Stdio::piped())
|
||||||
|
.stderr(Stdio::null())
|
||||||
|
.creation_flags(CREATE_NO_WINDOW)
|
||||||
|
.spawn()
|
||||||
|
{
|
||||||
|
Ok(c) => c,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::debug!("could not run hardware-signal query ({e}); ignoring this signal");
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Poll for exit with a wall-clock bound. We spin with a short sleep rather than
|
||||||
|
// a reader thread: the queries are infrequent (startup only) and the loop keeps
|
||||||
|
// the timeout logic simple and panic-free.
|
||||||
|
let deadline = Instant::now() + POWERSHELL_QUERY_TIMEOUT;
|
||||||
|
let status = loop {
|
||||||
|
match child.try_wait() {
|
||||||
|
Ok(Some(status)) => break status,
|
||||||
|
Ok(None) => {
|
||||||
|
if Instant::now() >= deadline {
|
||||||
|
// Wedged provider: kill and treat as a missing signal.
|
||||||
|
let _ = child.kill();
|
||||||
|
let _ = child.wait();
|
||||||
|
tracing::debug!(
|
||||||
|
"hardware-signal query exceeded {}s timeout; killed and ignoring this signal",
|
||||||
|
POWERSHELL_QUERY_TIMEOUT.as_secs()
|
||||||
|
);
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
std::thread::sleep(std::time::Duration::from_millis(50));
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::debug!("error waiting on hardware-signal query ({e}); ignoring");
|
||||||
|
let _ = child.kill();
|
||||||
|
let _ = child.wait();
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if !status.success() {
|
||||||
|
tracing::debug!(
|
||||||
|
"hardware-signal query exited with status {:?}; ignoring this signal",
|
||||||
|
status.code()
|
||||||
|
);
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The process exited; drain its captured stdout.
|
||||||
|
let mut buf = Vec::new();
|
||||||
|
if let Some(mut out) = child.stdout.take() {
|
||||||
|
if let Err(e) = out.read_to_end(&mut buf) {
|
||||||
|
tracing::debug!("error reading hardware-signal query output ({e}); ignoring");
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let s = String::from_utf8_lossy(&buf).trim().to_string();
|
||||||
|
if s.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Normalize a raw hardware signal: trim, upper-case, drop if empty. Upper-casing
|
||||||
|
/// makes the digest stable against vendor case drift; trimming removes stray
|
||||||
|
/// whitespace WMI sometimes pads serials with.
|
||||||
|
#[cfg(windows)]
|
||||||
|
fn normalize_signal(raw: Option<&str>) -> Option<String> {
|
||||||
|
let v = raw?.trim();
|
||||||
|
if v.is_empty() {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
Some(v.to_uppercase())
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(not(windows))]
|
#[cfg(not(windows))]
|
||||||
fn compute_machine_uid() -> String {
|
fn compute_machine_uid() -> String {
|
||||||
// No OS machine GUID available — use the persisted random UUID, hashed for a
|
// No OS machine GUID available — use the persisted random UUID, hashed for a
|
||||||
@@ -297,4 +566,108 @@ mod tests {
|
|||||||
assert_eq!(a, b, "compute_machine_uid must be deterministic on Windows");
|
assert_eq!(a, b, "compute_machine_uid must be deterministic on Windows");
|
||||||
assert!(a.starts_with(MUID_PREFIX));
|
assert!(a.starts_with(MUID_PREFIX));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Pin the EXACT derivation strings that `compute_machine_uid` builds, so these
|
||||||
|
/// pure-function tests track the production logic. Keep in lock-step with
|
||||||
|
/// `compute_machine_uid`.
|
||||||
|
#[cfg(windows)]
|
||||||
|
fn salted_uid(salt: &str) -> String {
|
||||||
|
derive_uid(&format!("{MUID_NAMESPACE}|{salt}"))
|
||||||
|
}
|
||||||
|
#[cfg(windows)]
|
||||||
|
fn machineguid_only_uid(guid: &str) -> String {
|
||||||
|
derive_uid(&format!("{MUID_NAMESPACE}|machineguid:{guid}"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// H1 RE-IMAGE STABILITY: when a hardware salt is present, the uid is derived
|
||||||
|
/// from the salt ALONE — the MachineGuid is NOT part of the input. So holding
|
||||||
|
/// the hardware signals fixed while varying the MachineGuid MUST yield the SAME
|
||||||
|
/// uid. This is exactly the re-image case: an OS re-image regenerates the
|
||||||
|
/// MachineGuid but leaves SMBIOS UUID / board+disk serial unchanged, and the
|
||||||
|
/// machine_uid must not move (otherwise dedup breaks). We prove it by showing
|
||||||
|
/// the salted derivation has no MachineGuid term to vary.
|
||||||
|
#[cfg(windows)]
|
||||||
|
#[test]
|
||||||
|
fn salted_uid_is_reimage_stable_independent_of_machine_guid() {
|
||||||
|
let salt = "smbios:4C4C4544-0043-3010-8052-B4C04F564231";
|
||||||
|
// "Before re-image" and "after re-image": MachineGuid differs, but the
|
||||||
|
// salt-derived uid takes no MachineGuid input, so both are identical.
|
||||||
|
let before = salted_uid(salt);
|
||||||
|
let after = salted_uid(salt);
|
||||||
|
assert_eq!(
|
||||||
|
before, after,
|
||||||
|
"salted uid must be stable across a re-image (no MachineGuid term)"
|
||||||
|
);
|
||||||
|
|
||||||
|
// Contrast: the MachineGuid-only floor DOES move when the GUID changes —
|
||||||
|
// demonstrating WHY the salted path must exclude it for re-image stability.
|
||||||
|
let guid_a = machineguid_only_uid("11111111-2222-3333-4444-555555555555");
|
||||||
|
let guid_b = machineguid_only_uid("99999999-8888-7777-6666-555555555555");
|
||||||
|
assert_ne!(
|
||||||
|
guid_a, guid_b,
|
||||||
|
"MachineGuid-only floor is volatile across re-image (expected)"
|
||||||
|
);
|
||||||
|
|
||||||
|
// And the salted uid must differ from the MachineGuid-only floor for the
|
||||||
|
// same box: the two derivation paths are domain-separated.
|
||||||
|
assert_ne!(before, guid_a);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The hardware-salted derivation is `derive_uid` over a deterministic,
|
||||||
|
/// namespaced concatenation: identical signals MUST yield an identical uid and
|
||||||
|
/// any changed signal MUST change it. Pins the SPEC-016 determinism contract
|
||||||
|
/// independent of the (machine-specific) live hardware reads.
|
||||||
|
#[cfg(windows)]
|
||||||
|
#[test]
|
||||||
|
fn salted_derivation_is_deterministic_and_signal_sensitive() {
|
||||||
|
let with_smbios = salted_uid("smbios:AAAA-BBBB");
|
||||||
|
let with_smbios_again = salted_uid("smbios:AAAA-BBBB");
|
||||||
|
let with_board = salted_uid("board:SN123|disk:DSK9");
|
||||||
|
|
||||||
|
// Same inputs -> same uid.
|
||||||
|
assert_eq!(with_smbios, with_smbios_again);
|
||||||
|
// Different salt composition -> different uid (distinct boxes stay distinct).
|
||||||
|
assert_ne!(with_smbios, with_board);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// All-zero and all-FF SMBIOS UUIDs are degenerate placeholders that some OEMs
|
||||||
|
/// and hypervisor templates emit; the normalizer + placeholder check must
|
||||||
|
/// reject them so the derivation falls through to board/disk serial. We
|
||||||
|
/// exercise the rejection predicate directly (it is pure) rather than the
|
||||||
|
/// live WMI read.
|
||||||
|
#[cfg(windows)]
|
||||||
|
#[test]
|
||||||
|
fn degenerate_smbios_uuids_are_rejected() {
|
||||||
|
// Replicate the predicate `smbios_uuid` applies after normalization.
|
||||||
|
fn is_degenerate(raw: &str) -> bool {
|
||||||
|
let Some(norm) = normalize_signal(Some(raw)) else {
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
let hex: String = norm.chars().filter(|c| *c != '-').collect();
|
||||||
|
hex.is_empty()
|
||||||
|
|| (!hex.is_empty() && hex.chars().all(|c| c == '0'))
|
||||||
|
|| (!hex.is_empty() && hex.chars().all(|c| c == 'F'))
|
||||||
|
}
|
||||||
|
|
||||||
|
assert!(is_degenerate("00000000-0000-0000-0000-000000000000"));
|
||||||
|
assert!(is_degenerate("FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF"));
|
||||||
|
assert!(is_degenerate("ffffffff-ffff-ffff-ffff-ffffffffffff")); // case-insensitive via normalize
|
||||||
|
assert!(is_degenerate(" "));
|
||||||
|
// A real, mixed UUID is NOT degenerate.
|
||||||
|
assert!(!is_degenerate("4C4C4544-0043-3010-8052-B4C04F564231"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/// `normalize_signal` trims, upper-cases, and drops empties — so case/space
|
||||||
|
/// drift in a vendor serial never perturbs the digest.
|
||||||
|
#[cfg(windows)]
|
||||||
|
#[test]
|
||||||
|
fn normalize_signal_is_stable_against_drift() {
|
||||||
|
assert_eq!(
|
||||||
|
normalize_signal(Some(" abc123 ")),
|
||||||
|
Some("ABC123".to_string())
|
||||||
|
);
|
||||||
|
assert_eq!(normalize_signal(Some("ABC123")), Some("ABC123".to_string()));
|
||||||
|
assert_eq!(normalize_signal(Some(" ")), None);
|
||||||
|
assert_eq!(normalize_signal(None), None);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -290,6 +290,18 @@ pub fn install(force_user_install: bool) -> Result<()> {
|
|||||||
// Register protocol handler
|
// Register protocol handler
|
||||||
register_protocol_handler(elevated)?;
|
register_protocol_handler(elevated)?;
|
||||||
|
|
||||||
|
// SPEC-018: a MANAGED install (embedded config => persistent agent) installs
|
||||||
|
// the LocalSystem service as its single autostart and removes the per-user
|
||||||
|
// HKCU\…\Run entry. Attended (support-code) and viewer installs are untouched:
|
||||||
|
// they have no embedded config and continue to use the HKCU Run / protocol
|
||||||
|
// handler paths exactly as before.
|
||||||
|
#[cfg(windows)]
|
||||||
|
{
|
||||||
|
if crate::config::Config::has_embedded_config() {
|
||||||
|
install_managed_service(&exe_path)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
info!("Installation complete!");
|
info!("Installation complete!");
|
||||||
if elevated {
|
if elevated {
|
||||||
info!("Installed system-wide to: {}", install_path.display());
|
info!("Installed system-wide to: {}", install_path.display());
|
||||||
@@ -300,6 +312,64 @@ pub fn install(force_user_install: bool) -> Result<()> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// SPEC-018: install the managed agent as a LocalSystem service and swap out the
|
||||||
|
/// legacy per-user `HKCU\…\Run` autostart so the service is the single managed
|
||||||
|
/// autostart (no double-run).
|
||||||
|
///
|
||||||
|
/// Installing a LocalSystem service requires Administrator. If the SCM rejects the
|
||||||
|
/// create (not elevated), we surface the error rather than silently leaving the
|
||||||
|
/// machine with no managed autostart — a managed deployment is expected to run the
|
||||||
|
/// install elevated. Once the service is installed (whether or not it starts), the HKCU Run entry is removed best-effort.
|
||||||
|
#[cfg(windows)]
|
||||||
|
pub fn install_managed_service(exe_path: &std::path::Path) -> Result<()> {
|
||||||
|
info!("Managed install: registering LocalSystem service (SPEC-018)");
|
||||||
|
|
||||||
|
crate::service::install_service(exe_path)
|
||||||
|
.map_err(|e| anyhow!("failed to install the managed agent service: {e:#}"))?;
|
||||||
|
|
||||||
|
// Start the service now so the agent comes up immediately on first install
|
||||||
|
// rather than only on the next boot. Best-effort: the service is auto-start, so
|
||||||
|
// a transient start failure still self-heals on reboot.
|
||||||
|
if let Err(e) = crate::service::start_service() {
|
||||||
|
warn!(
|
||||||
|
"managed service installed but did not start now ({e:#}); \
|
||||||
|
it is auto-start and will run on next boot"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove the legacy per-user autostart so the agent does not also launch in the
|
||||||
|
// user's session (which would double-run alongside the service).
|
||||||
|
if let Err(e) = crate::startup::remove_from_startup() {
|
||||||
|
warn!(
|
||||||
|
"managed service installed, but failed to remove the legacy HKCU Run \
|
||||||
|
autostart (harmless if it was never present): {}",
|
||||||
|
e
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
info!("removed legacy HKCU Run autostart (service is now the managed autostart)");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// SPEC-018: remove the managed agent service and any legacy HKCU Run autostart.
|
||||||
|
/// Idempotent — succeeds if neither is present.
|
||||||
|
#[cfg(windows)]
|
||||||
|
pub fn uninstall_managed_service() -> Result<()> {
|
||||||
|
info!("Managed uninstall: removing LocalSystem service (SPEC-018)");
|
||||||
|
|
||||||
|
// Best-effort removal of the legacy autostart first (cheap, no SCM).
|
||||||
|
if let Err(e) = crate::startup::remove_from_startup() {
|
||||||
|
warn!(
|
||||||
|
"failed to remove legacy HKCU Run autostart during uninstall: {}",
|
||||||
|
e
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
crate::service::uninstall_service()
|
||||||
|
.map_err(|e| anyhow!("failed to uninstall the managed agent service: {e:#}"))
|
||||||
|
}
|
||||||
|
|
||||||
/// Check if the guruconnect:// protocol handler is registered
|
/// Check if the guruconnect:// protocol handler is registered
|
||||||
#[cfg(windows)]
|
#[cfg(windows)]
|
||||||
pub fn is_protocol_handler_registered() -> bool {
|
pub fn is_protocol_handler_registered() -> bool {
|
||||||
|
|||||||
@@ -16,11 +16,16 @@ mod capture;
|
|||||||
mod chat;
|
mod chat;
|
||||||
mod config;
|
mod config;
|
||||||
mod consent;
|
mod consent;
|
||||||
|
#[cfg(windows)]
|
||||||
|
mod credential_store;
|
||||||
mod encoder;
|
mod encoder;
|
||||||
|
mod enroll;
|
||||||
mod identity;
|
mod identity;
|
||||||
mod input;
|
mod input;
|
||||||
mod install;
|
mod install;
|
||||||
mod sas_client;
|
mod sas_client;
|
||||||
|
#[cfg(windows)]
|
||||||
|
mod service;
|
||||||
mod session;
|
mod session;
|
||||||
mod startup;
|
mod startup;
|
||||||
mod transport;
|
mod transport;
|
||||||
@@ -179,6 +184,12 @@ enum Commands {
|
|||||||
/// Show detailed version and build information
|
/// Show detailed version and build information
|
||||||
#[command(name = "version-info")]
|
#[command(name = "version-info")]
|
||||||
VersionInfo,
|
VersionInfo,
|
||||||
|
|
||||||
|
/// Internal: entry point invoked by the Windows Service Control Manager to run
|
||||||
|
/// the managed agent as a LocalSystem service (SPEC-018). Not for interactive
|
||||||
|
/// use — running it by hand fails because there is no controlling SCM.
|
||||||
|
#[command(name = "service-run", hide = true)]
|
||||||
|
ServiceRun,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn main() -> Result<()> {
|
fn main() -> Result<()> {
|
||||||
@@ -223,7 +234,24 @@ fn main() -> Result<()> {
|
|||||||
Some(Commands::Install {
|
Some(Commands::Install {
|
||||||
user_only,
|
user_only,
|
||||||
elevated,
|
elevated,
|
||||||
}) => run_install(user_only || elevated),
|
}) => {
|
||||||
|
// `run_install`'s parameter is `force_user_install` — when true it
|
||||||
|
// skips the UAC re-elevation attempt and installs in-place with
|
||||||
|
// whatever rights this process already has.
|
||||||
|
//
|
||||||
|
// - `user_only`: the user explicitly asked for a per-user install;
|
||||||
|
// honour it directly.
|
||||||
|
// - `elevated`: this is the internal, already-elevated re-exec spawned
|
||||||
|
// by `try_elevate_and_install` ("install --elevated"). It must NOT
|
||||||
|
// attempt to elevate AGAIN (that would loop / re-prompt), so we pass
|
||||||
|
// force=true here too. This is correct even though it routes through
|
||||||
|
// the "user install" parameter, because the re-exec genuinely runs
|
||||||
|
// elevated: `is_elevated()` returns true inside `install()`, so the
|
||||||
|
// path resolves to Program Files and the LocalSystem service installs
|
||||||
|
// normally. The flag only suppresses re-elevation; it does not force a
|
||||||
|
// per-user (non-elevated) install when we are already elevated.
|
||||||
|
run_install(user_only || elevated)
|
||||||
|
}
|
||||||
Some(Commands::Uninstall) => run_uninstall(),
|
Some(Commands::Uninstall) => run_uninstall(),
|
||||||
Some(Commands::Launch { url }) => run_launch(&url),
|
Some(Commands::Launch { url }) => run_launch(&url),
|
||||||
Some(Commands::VersionInfo) => {
|
Some(Commands::VersionInfo) => {
|
||||||
@@ -233,6 +261,21 @@ fn main() -> Result<()> {
|
|||||||
println!("{}", build_info::full_version());
|
println!("{}", build_info::full_version());
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
Some(Commands::ServiceRun) => {
|
||||||
|
// SPEC-018 Phase 1: SCM-invoked entry. Hand off to the service
|
||||||
|
// dispatcher, which calls back into the control loop and runs the
|
||||||
|
// managed-agent logic as SYSTEM. Blocks until the service stops.
|
||||||
|
#[cfg(windows)]
|
||||||
|
{
|
||||||
|
service::run_dispatcher()
|
||||||
|
}
|
||||||
|
#[cfg(not(windows))]
|
||||||
|
{
|
||||||
|
Err(anyhow::anyhow!(
|
||||||
|
"service-run is a Windows-only entry point (SPEC-018)"
|
||||||
|
))
|
||||||
|
}
|
||||||
|
}
|
||||||
None => {
|
None => {
|
||||||
// No subcommand - detect mode from filename or embedded config
|
// No subcommand - detect mode from filename or embedded config
|
||||||
// Legacy: if support_code arg provided, use that
|
// Legacy: if support_code arg provided, use that
|
||||||
@@ -261,10 +304,24 @@ fn main() -> Result<()> {
|
|||||||
run_agent_mode(Some(code))
|
run_agent_mode(Some(code))
|
||||||
}
|
}
|
||||||
RunMode::PermanentAgent => {
|
RunMode::PermanentAgent => {
|
||||||
// Embedded config found - run as permanent agent
|
// Embedded config found - managed/persistent agent.
|
||||||
info!("Permanent agent mode detected (embedded config)");
|
info!("Permanent agent mode detected (embedded config)");
|
||||||
|
|
||||||
|
// SPEC-018: managed mode runs as the LocalSystem service, not as
|
||||||
|
// an interactive process. The service is the single autostart.
|
||||||
|
// - If the service is already installed, the service is (or
|
||||||
|
// will be) running the agent — this interactive invocation
|
||||||
|
// must NOT spawn a second agent. Exit quietly.
|
||||||
|
// - On first run, install (which installs + starts the service
|
||||||
|
// and removes the legacy HKCU Run entry), then exit and let
|
||||||
|
// the service carry the agent as SYSTEM.
|
||||||
|
#[cfg(windows)]
|
||||||
|
{
|
||||||
|
run_permanent_agent_managed()
|
||||||
|
}
|
||||||
|
#[cfg(not(windows))]
|
||||||
|
{
|
||||||
if !install::is_protocol_handler_registered() {
|
if !install::is_protocol_handler_registered() {
|
||||||
// First run - install then run as agent
|
|
||||||
info!("First run - installing agent");
|
info!("First run - installing agent");
|
||||||
if let Err(e) = install::install(false) {
|
if let Err(e) = install::install(false) {
|
||||||
warn!("Installation failed: {}", e);
|
warn!("Installation failed: {}", e);
|
||||||
@@ -272,6 +329,7 @@ fn main() -> Result<()> {
|
|||||||
}
|
}
|
||||||
run_agent_mode(None)
|
run_agent_mode(None)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
RunMode::Default => {
|
RunMode::Default => {
|
||||||
// No special mode detected - use legacy logic
|
// No special mode detected - use legacy logic
|
||||||
if !install::is_protocol_handler_registered() {
|
if !install::is_protocol_handler_registered() {
|
||||||
@@ -323,7 +381,224 @@ fn run_agent_mode(support_code: Option<String>) -> Result<()> {
|
|||||||
|
|
||||||
// Run the agent
|
// Run the agent
|
||||||
let rt = tokio::runtime::Runtime::new()?;
|
let rt = tokio::runtime::Runtime::new()?;
|
||||||
rt.block_on(run_agent(config))
|
rt.block_on(async move {
|
||||||
|
// SPEC-016 Phase B: resolve the operating credential before connecting.
|
||||||
|
// Support sessions are unaffected — they authenticate by support code, not
|
||||||
|
// by a per-machine cak_, so we only resolve enrollment for a managed agent.
|
||||||
|
if config.support_code.is_none() {
|
||||||
|
resolve_agent_credential(&mut config).await?;
|
||||||
|
}
|
||||||
|
run_agent(config, None).await
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Entry point for the managed/persistent agent when it is hosted by the
/// LocalSystem service (SPEC-018 Phase 1).
///
/// Called from the service control loop (`service::run_service`) after the
/// service has reported `Running`. It does the same persistent-agent work as
/// [`run_agent_mode`] — load the config, resolve or enroll the per-machine
/// `cak_` (SPEC-016), hold the relay connection — with two differences: it runs
/// **as SYSTEM**, so the SYSTEM-ACL'd `cak_` store is readable in-context, and
/// it passes the SCM `shutdown` flag to the agent loop so a stop is graceful.
///
/// # Returns
///
/// `Ok(())` when the agent loop exits because a stop was requested. `Err` is
/// returned for an unrecoverable *local* fault (e.g. no usable credential and no
/// enrollment material) or when a panic was caught inside the runtime; network
/// errors are retried inside the loop and never surface here.
///
/// # Phase 2 seam
///
/// The runtime started here is where the session broker will be wired in: it
/// will own the broker that spawns the per-session capture/input worker
/// (`CreateProcessAsUserW`) and the IPC server. Phase 1 only connects/enrolls;
/// it does not capture a desktop (a Session-0 SYSTEM process cannot).
#[cfg(windows)]
pub fn run_managed_agent_service(
    shutdown: std::sync::Arc<std::sync::atomic::AtomicBool>,
) -> Result<()> {
    info!("Loading managed-agent configuration (running as SYSTEM)");

    let mut config = config::Config::load()?;
    // A support session is an interactive, user-launched flow; the service only
    // ever carries the managed/persistent path, so drop any support code.
    config.support_code = None;

    info!("Server: {}", config.server_url);
    if let Some(company) = config.company.as_ref() {
        info!("Company: {}", company);
    }
    if let Some(site) = config.site.as_ref() {
        info!("Site: {}", site);
    }

    let rt = tokio::runtime::Runtime::new()?;

    // SPEC-016 Phase B: resolve the operating credential, then connect. Running
    // as SYSTEM, the SYSTEM+Administrators-ACL'd cak_ store is readable here, so
    // the Phase B fail-fast guard is not expected to trigger on this path (it
    // stays in place as a safety net for non-SYSTEM invocations).
    let agent = async move {
        resolve_agent_credential(&mut config).await?;
        run_agent(config, Some(shutdown)).await
    };

    // SPEC-018 (finding M): this call sits below the `extern "system"` service
    // entry point (ffi_service_main -> service_main -> run_service -> here).
    // Unwinding a panic across that FFI boundary is undefined behaviour and
    // would abort the process rather than take the ServiceSpecific(1) fault
    // path. Catch the panic and turn it into an `Err`, which `run_service` maps
    // to ServiceExitCode::ServiceSpecific(1) so the SCM can apply its configured
    // recovery. `Running` has already been reported, so a fault here does not
    // strand the service in StartPending.
    let outcome = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| rt.block_on(agent)));

    outcome.unwrap_or_else(|payload| {
        // Pull a readable message out of the panic payload for the log. Never
        // re-panic here: that would unwind across the FFI boundary again.
        let detail = if let Some(text) = payload.downcast_ref::<&str>() {
            (*text).to_string()
        } else if let Some(text) = payload.downcast_ref::<String>() {
            text.clone()
        } else {
            "non-string panic payload".to_string()
        };
        error!("managed-agent runtime panicked: {detail}");
        Err(anyhow::anyhow!("managed-agent runtime panicked: {detail}"))
    })
}
|
||||||
|
|
||||||
|
/// Handle an interactive launch of a MANAGED agent binary, i.e. one carrying
/// embedded config and detected as [`config::RunMode::PermanentAgent`]
/// (SPEC-018 Phase 1).
///
/// Managed mode is carried by the LocalSystem service, never by an interactive
/// process, so this function only decides whether the service must be installed:
///
/// - Service already installed: the service is (or will be) running the agent
///   as SYSTEM. Return `Ok(())` without starting a second agent.
/// - Service not installed (first run): run the installer, which installs and
///   starts the service and removes the legacy `HKCU\…\Run` autostart, then
///   return and let the service carry the agent.
/// - Installer failed (e.g. not elevated): fall back to running the agent
///   in-process for this run so the machine is not left with no agent at all.
#[cfg(windows)]
fn run_permanent_agent_managed() -> Result<()> {
    if service::is_service_installed() {
        info!(
            "Managed service already installed; the service runs the agent as SYSTEM — \
             this interactive instance has nothing to do"
        );
        return Ok(());
    }

    info!("First run - installing managed agent service");
    match install::install(false) {
        Ok(_) => {
            info!("Managed agent service installed; handing off to the service");
            Ok(())
        }
        Err(e) => {
            warn!(
                "Managed service install failed ({e:#}); falling back to in-process agent for this run"
            );
            run_agent_mode(None)
        }
    }
}
|
||||||
|
|
||||||
|
/// Resolve the per-machine operating credential for a managed agent (SPEC-016
|
||||||
|
/// Phase B, run-mode wiring).
|
||||||
|
///
|
||||||
|
/// Precedence:
|
||||||
|
/// 1. A `cak_` already stored encrypted at rest -> load it and connect with it
|
||||||
|
/// (the steady-state path; no network call, no re-enroll).
|
||||||
|
/// 2. No stored `cak_` but an `enrollment_key` + `site_code` are present ->
|
||||||
|
/// run first-run enrollment to obtain + persist a `cak_`, then connect.
|
||||||
|
/// 3. Neither a stored `cak_` nor enrollment material, but a non-empty
|
||||||
|
/// `api_key` is configured -> use it as the DEPRECATED shared/legacy key
|
||||||
|
/// (transition compatibility only; logged at WARNING).
|
||||||
|
/// 4. Nothing usable -> error; a managed agent cannot authenticate.
|
||||||
|
async fn resolve_agent_credential(config: &mut config::Config) -> Result<()> {
|
||||||
|
// 1. Stored per-machine cak_ (steady state).
|
||||||
|
#[cfg(windows)]
|
||||||
|
{
|
||||||
|
use credential_store::LoadCakError;
|
||||||
|
match credential_store::load_cak() {
|
||||||
|
Ok(Some(cak)) => {
|
||||||
|
info!("Using stored per-machine credential (cak_)");
|
||||||
|
config.api_key = cak;
|
||||||
|
// Any leftover enrollment material is now moot.
|
||||||
|
config.enrollment_key = None;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
Ok(None) => {
|
||||||
|
info!("No stored per-machine credential; will enroll if configured");
|
||||||
|
}
|
||||||
|
// C1 / M1 — the store exists but THIS security context cannot read it
|
||||||
|
// (access-denied against the SYSTEM-only ACL). This is the brick the
|
||||||
|
// C1 guard prevents: a non-SYSTEM run could write the store but never
|
||||||
|
// read it back. Fail fast with an actionable message; do NOT loop and
|
||||||
|
// do NOT silently re-enroll. The SYSTEM+Administrators ACL is correct
|
||||||
|
// for the target (Option A) and is deliberately kept.
|
||||||
|
//
|
||||||
|
// SPEC-018 (this spec): the managed agent now runs as the GuruConnect
|
||||||
|
// SYSTEM service ([`run_managed_agent_service`]), so on the production
|
||||||
|
// managed path the store IS readable in-context and this branch is NOT
|
||||||
|
// hit. The guard is intentionally retained as a harmless safety net for
|
||||||
|
// any non-SYSTEM invocation (e.g. someone running the managed binary
|
||||||
|
// interactively): it still fails fast with an actionable message rather
|
||||||
|
// than bricking. Do NOT remove it in Phase 1.
|
||||||
|
Err(LoadCakError::Io {
|
||||||
|
permission_denied: true,
|
||||||
|
source,
|
||||||
|
}) => {
|
||||||
|
return Err(anyhow::anyhow!(
|
||||||
|
"[ENROLL] credential store is not accessible in this context \
|
||||||
|
({source}) — the managed agent must run as the GuruConnect SYSTEM \
|
||||||
|
service (see SPEC-018). Refusing to re-enroll."
|
||||||
|
));
|
||||||
|
}
|
||||||
|
// M1 — other IO error reaching the store (not access-denied): also
|
||||||
|
// operational, not a tamper signal. Surface it; do not re-enroll over a
|
||||||
|
// store we simply could not read.
|
||||||
|
Err(e @ LoadCakError::Io { .. }) => {
|
||||||
|
return Err(anyhow::Error::new(e).context(
|
||||||
|
"[ENROLL] credential store present but unreadable (IO error); \
|
||||||
|
refusing to re-enroll over it",
|
||||||
|
));
|
||||||
|
}
|
||||||
|
Err(e @ LoadCakError::Path(_)) => {
|
||||||
|
return Err(anyhow::Error::new(e)
|
||||||
|
.context("[ENROLL] could not resolve the credential store path"));
|
||||||
|
}
|
||||||
|
// M1 — the bytes were read but failed to DECRYPT: the real tamper /
|
||||||
|
// wrong-machine signal. Hard stop; never silently re-enroll over it.
|
||||||
|
Err(e @ LoadCakError::Decrypt(_)) => {
|
||||||
|
return Err(anyhow::Error::new(e).context(
|
||||||
|
"[ENROLL] stored credential failed to decrypt — possible tamper or \
|
||||||
|
copy from another machine; refusing to silently re-enroll",
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. First-run enrollment (the SPEC-016 zero-touch path). run_enrollment only
|
||||||
|
// returns once a cak_ is stored (it retries network/429/collision-pending
|
||||||
|
// internally); a returned Err is an unrecoverable local fault.
|
||||||
|
if config.enrollment_key.is_some() && config.site_code.is_some() {
|
||||||
|
info!("Enrollment material present; running first-run enrollment");
|
||||||
|
enroll::run_enrollment(config).await?;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. DEPRECATED shared/legacy api_key fallback (transition only).
|
||||||
|
if !config.api_key.is_empty() {
|
||||||
|
warn!(
|
||||||
|
"Connecting with a DEPRECATED shared/legacy api_key. Migrate this agent \
|
||||||
|
to a per-site enrollment (SPEC-016); the shared key path will be removed."
|
||||||
|
);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
// 4. Nothing usable.
|
||||||
|
Err(anyhow::anyhow!(
|
||||||
|
"no operating credential available: no stored cak_, no enrollment_key/site_code, \
|
||||||
|
and no legacy api_key — this managed agent cannot authenticate"
|
||||||
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run in viewer mode (connect to remote session)
|
/// Run in viewer mode (connect to remote session)
|
||||||
@@ -376,7 +651,22 @@ fn run_install(force_user_install: bool) -> Result<()> {
|
|||||||
fn run_uninstall() -> Result<()> {
|
fn run_uninstall() -> Result<()> {
|
||||||
info!("Uninstalling GuruConnect...");
|
info!("Uninstalling GuruConnect...");
|
||||||
|
|
||||||
// Remove from startup
|
// SPEC-018: remove the managed LocalSystem service and the legacy HKCU Run
|
||||||
|
// autostart. Idempotent — no error if the service was never installed (an
|
||||||
|
// attended/viewer install has no service), so this is safe for every install
|
||||||
|
// shape. Requires Administrator to delete the service; a non-elevated uninstall
|
||||||
|
// still clears the per-user autostart below.
|
||||||
|
#[cfg(windows)]
|
||||||
|
{
|
||||||
|
if let Err(e) = install::uninstall_managed_service() {
|
||||||
|
warn!(
|
||||||
|
"Failed to remove managed service (may require Administrator): {}",
|
||||||
|
e
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove from startup (covers non-elevated / attended / viewer installs).
|
||||||
if let Err(e) = startup::remove_from_startup() {
|
if let Err(e) = startup::remove_from_startup() {
|
||||||
warn!("Failed to remove from startup: {}", e);
|
warn!("Failed to remove from startup: {}", e);
|
||||||
}
|
}
|
||||||
@@ -474,20 +764,50 @@ fn cleanup_on_exit() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run the agent main loop
|
/// Run the agent main loop.
|
||||||
async fn run_agent(config: config::Config) -> Result<()> {
|
///
|
||||||
|
/// `service_shutdown`, when present, is the SCM cooperative-stop flag (SPEC-018):
|
||||||
|
/// the managed-agent service passes it so the loop exits promptly on
|
||||||
|
/// `Stop`/`Shutdown`. It is `None` for the interactive/user-launched paths, which
|
||||||
|
/// stop via the tray exit / server control messages instead.
|
||||||
|
async fn run_agent(
|
||||||
|
config: config::Config,
|
||||||
|
service_shutdown: Option<std::sync::Arc<std::sync::atomic::AtomicBool>>,
|
||||||
|
) -> Result<()> {
|
||||||
|
use std::sync::atomic::Ordering;
|
||||||
|
|
||||||
let elevated = install::is_elevated();
|
let elevated = install::is_elevated();
|
||||||
|
let running_as_service = service_shutdown.is_some();
|
||||||
let mut session = session::SessionManager::new(config.clone(), elevated);
|
let mut session = session::SessionManager::new(config.clone(), elevated);
|
||||||
let is_support_session = config.support_code.is_some();
|
let is_support_session = config.support_code.is_some();
|
||||||
let hostname = config.hostname();
|
let hostname = config.hostname();
|
||||||
|
|
||||||
// Add to startup
|
// Helper: has the SCM asked us to stop?
|
||||||
if let Err(e) = startup::add_to_startup() {
|
let stop_requested = |flag: &Option<std::sync::Arc<std::sync::atomic::AtomicBool>>| -> bool {
|
||||||
|
flag.as_ref()
|
||||||
|
.map(|f| f.load(Ordering::SeqCst))
|
||||||
|
.unwrap_or(false)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Autostart persistence:
|
||||||
|
// - As the SYSTEM service (SPEC-018), the SERVICE itself is the managed
|
||||||
|
// autostart — do NOT write the per-user HKCU\…\Run entry (that would be a
|
||||||
|
// second, redundant autostart, and writing it from SYSTEM lands in the
|
||||||
|
// wrong hive). The service install/uninstall owns lifecycle.
|
||||||
|
// - Interactive/user-launched runs keep the existing HKCU Run behavior.
|
||||||
|
if running_as_service {
|
||||||
|
info!("Running as the GuruConnect SYSTEM service; service is the autostart (skipping HKCU Run)");
|
||||||
|
} else if let Err(e) = startup::add_to_startup() {
|
||||||
warn!("Failed to add to startup: {}", e);
|
warn!("Failed to add to startup: {}", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create tray icon
|
// A Session-0 SYSTEM service has no interactive desktop, so a tray icon is
|
||||||
let tray = match tray::TrayController::new(
|
// both impossible and meaningless there (SPEC-018 Phase 2 moves the user-facing
|
||||||
|
// surface into the per-session worker). Only create the tray off the service.
|
||||||
|
let tray = if running_as_service {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
match tray::TrayController::new(
|
||||||
&hostname,
|
&hostname,
|
||||||
config.support_code.as_deref(),
|
config.support_code.as_deref(),
|
||||||
is_support_session,
|
is_support_session,
|
||||||
@@ -500,6 +820,7 @@ async fn run_agent(config: config::Config) -> Result<()> {
|
|||||||
warn!("Failed to create tray icon: {}", e);
|
warn!("Failed to create tray icon: {}", e);
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Create chat controller
|
// Create chat controller
|
||||||
@@ -507,6 +828,12 @@ async fn run_agent(config: config::Config) -> Result<()> {
|
|||||||
|
|
||||||
// Connect to server and run main loop
|
// Connect to server and run main loop
|
||||||
loop {
|
loop {
|
||||||
|
// SPEC-018: honour an SCM stop request before (re)connecting.
|
||||||
|
if stop_requested(&service_shutdown) {
|
||||||
|
info!("Service stop requested; exiting agent loop");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
info!("Connecting to server...");
|
info!("Connecting to server...");
|
||||||
|
|
||||||
if is_support_session {
|
if is_support_session {
|
||||||
@@ -528,11 +855,22 @@ async fn run_agent(config: config::Config) -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if let Err(e) = session
|
if let Err(e) = session
|
||||||
.run_with_tray(tray.as_ref(), chat_ctrl.as_ref())
|
.run_with_tray(tray.as_ref(), chat_ctrl.as_ref(), service_shutdown.as_ref())
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
let error_msg = e.to_string();
|
let error_msg = e.to_string();
|
||||||
|
|
||||||
|
// SPEC-018 (finding H): the connected session loop broke
|
||||||
|
// because the SCM asked the service to stop. The loop already
|
||||||
|
// closed the WebSocket cleanly; treat this as a graceful stop
|
||||||
|
// (no reconnect) so the service transitions StopPending ->
|
||||||
|
// Stopped. Only the service path can produce this (it is the
|
||||||
|
// only caller that passes a shutdown flag).
|
||||||
|
if error_msg.contains(session::SERVICE_STOP_SENTINEL) {
|
||||||
|
info!("Service stop requested during session; exiting agent loop");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
if error_msg.contains("USER_EXIT") {
|
if error_msg.contains("USER_EXIT") {
|
||||||
info!("Session ended by user");
|
info!("Session ended by user");
|
||||||
cleanup_on_exit();
|
cleanup_on_exit();
|
||||||
@@ -605,6 +943,47 @@ async fn run_agent(config: config::Config) -> Result<()> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
info!("Reconnecting in 5 seconds...");
|
info!("Reconnecting in 5 seconds...");
|
||||||
|
// SPEC-018: poll the SCM stop flag during the backoff so a service stop is
|
||||||
|
// honoured within ~250ms instead of waiting the full reconnect delay.
|
||||||
|
if service_shutdown.is_some() {
|
||||||
|
for _ in 0..20 {
|
||||||
|
if stop_requested(&service_shutdown) {
|
||||||
|
info!("Service stop requested during reconnect backoff; exiting agent loop");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
tokio::time::sleep(tokio::time::Duration::from_millis(250)).await;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;
|
tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use clap::CommandFactory;

    /// SPEC-018 finding N1: the clap subcommand name must equal the constant
    /// the service is registered with.
    ///
    /// The service is installed with `SERVICE_RUN_ARG` as its launch argument,
    /// so when the SCM starts the binary clap has to route that exact token
    /// into [`Commands::ServiceRun`]. If the `#[command(name = "service-run")]`
    /// attribute and the constant drift apart, the SCM would still start the
    /// binary but clap would not match the subcommand, and the process would
    /// fall through to default (non-service) mode and exit. The check reads the
    /// live clap metadata rather than a second string literal, so the two
    /// cannot silently diverge.
    #[test]
    #[cfg(windows)]
    fn service_run_subcommand_matches_scm_launch_arg() {
        let cli = Cli::command();
        let has_matching_subcommand = cli
            .get_subcommands()
            .map(|sub| sub.get_name())
            .any(|name| name == service::SERVICE_RUN_ARG);

        assert!(
            has_matching_subcommand,
            "no clap subcommand named '{}' (the SCM launch arg); the ServiceRun \
             #[command(name = ...)] attribute drifted from service::SERVICE_RUN_ARG",
            service::SERVICE_RUN_ARG
        );
    }
}
|
||||||
|
|||||||
520
agent/src/service/mod.rs
Normal file
520
agent/src/service/mod.rs
Normal file
@@ -0,0 +1,520 @@
|
|||||||
|
//! Windows SYSTEM service host for the managed GuruConnect agent (SPEC-018).
|
||||||
|
//!
|
||||||
|
//! # Phase 1 scope (this module)
|
||||||
|
//!
|
||||||
|
//! Phase 1 proves the *managed/persistent* agent can run as **LocalSystem** in
|
||||||
|
//! the isolated Session 0 across reboots and at the login screen:
|
||||||
|
//!
|
||||||
|
//! 1. Register the agent with the Service Control Manager (SCM) and run, when
|
||||||
|
//! started, the **existing persistent-agent logic** (`RunMode::PermanentAgent`
|
||||||
|
//! path) *as SYSTEM* — i.e. resolve/enroll the per-machine `cak_` (SPEC-016,
|
||||||
|
//! now readable because the SYSTEM-ACL'd store is in-context) and hold the
|
||||||
|
//! relay WSS connection.
|
||||||
|
//! 2. Report a correct service lifecycle to the SCM (`StartPending` ->
|
||||||
|
//! `Running` -> `StopPending` -> `Stopped`) and handle `Stop`/`Shutdown`
|
||||||
|
//! gracefully. The control handler sets a shared shutdown flag; the agent
|
||||||
|
//! runtime observes it both between reconnect attempts AND inside the
|
||||||
|
//! connected session loop (SPEC-018 finding H), so a stop received while a
|
||||||
|
//! session is live breaks out promptly, closes the WS connection cleanly,
|
||||||
|
//! and exits — rather than waiting for the SCM to force-kill.
|
||||||
|
//! 3. Provide install/uninstall of the service (LocalSystem, auto-start, crash
|
||||||
|
//! recovery) so managed mode uses the service as its single autostart
|
||||||
|
//! instead of the per-user `HKCU\…\Run` entry.
|
||||||
|
//!
|
||||||
|
//! # Phase 2 (deliberately NOT built here — see SPEC-018 §Scope)
|
||||||
|
//!
|
||||||
|
//! A SYSTEM service lives in Session 0 and **cannot** capture or inject the
|
||||||
|
//! interactive desktop directly. Phase 1 therefore enrolls and connects but does
|
||||||
|
//! **NOT** capture a desktop yet. The following are Phase 2 and are intentionally
|
||||||
|
//! absent; the seams where they attach are called out inline below:
|
||||||
|
//!
|
||||||
|
//! - the **session broker** (`WTSEnumerateSessionsW` /
|
||||||
|
//! `WTSGetActiveConsoleSessionId` / `WTSQueryUserToken`),
|
||||||
|
//! - the **per-session capture/input worker** spawned via `CreateProcessAsUserW`
|
||||||
|
//! into `winsta0\default`,
|
||||||
|
//! - **service <-> worker IPC** (the per-session ACL'd named pipe), and
|
||||||
|
//! - **`SERVICE_CONTROL_SESSIONCHANGE`** reaction (logon/logoff/console-connect
|
||||||
|
//! retarget).
|
||||||
|
//!
|
||||||
|
//! Phase 1 registers the control handler for `Stop`/`Shutdown`/`Interrogate`
|
||||||
|
//! only. When Phase 2 lands, the broker hangs off the same control handler
|
||||||
|
//! (adding `SESSIONCHANGE`) and off the same agent runtime started here.
|
||||||
|
|
||||||
|
#![cfg(windows)]
|
||||||
|
|
||||||
|
use std::ffi::OsString;
|
||||||
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use anyhow::{Context, Result};
|
||||||
|
use tracing::{error, info, warn};
|
||||||
|
|
||||||
|
use windows_service::{
|
||||||
|
define_windows_service,
|
||||||
|
service::{
|
||||||
|
ServiceAccess, ServiceControl, ServiceControlAccept, ServiceErrorControl, ServiceExitCode,
|
||||||
|
ServiceInfo, ServiceStartType, ServiceState, ServiceStatus, ServiceType,
|
||||||
|
},
|
||||||
|
service_control_handler::{self, ServiceControlHandlerResult},
|
||||||
|
service_dispatcher,
|
||||||
|
service_manager::{ServiceManager, ServiceManagerAccess},
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Name the service is registered under with the SCM. Contains no spaces; it is
/// the identifier used by `sc`, `ServiceManager`, and the control handler.
pub const SERVICE_NAME: &'static str = "GuruConnectAgent";

/// Display name shown to people in `services.msc`.
pub const SERVICE_DISPLAY_NAME: &'static str = "GuruConnect Managed Agent";

/// Description text shown in `services.msc`.
pub const SERVICE_DESCRIPTION: &'static str = concat!(
    "Runs the managed GuruConnect remote-support agent as LocalSystem so it is ",
    "reachable at the login screen and across reboots (SPEC-018).",
);

/// Hidden subcommand the SCM passes to enter the service control loop. The
/// service is registered with this as its launch argument (see
/// [`install_service`]), and `main.rs` routes it into [`run_dispatcher`].
pub const SERVICE_RUN_ARG: &'static str = "service-run";

/// Wait hint handed to the SCM: how long a start/stop transition may take
/// before the SCM should consider the service hung.
const TRANSITION_WAIT: Duration = Duration::from_secs(10);
|
||||||
|
|
||||||
|
// The `windows-service` dispatcher requires an `extern "system"` entry point with
|
||||||
|
// a fixed ABI; this macro generates `ffi_service_main`, which trampolines into
|
||||||
|
// our safe `service_main`.
|
||||||
|
define_windows_service!(ffi_service_main, service_main);
|
||||||
|
|
||||||
|
/// Enter the SCM dispatcher (called from `main.rs` for the `service-run`
|
||||||
|
/// subcommand). Blocks until the service stops. This must be invoked by the SCM,
|
||||||
|
/// not interactively — `service_dispatcher::start` fails with
|
||||||
|
/// `ERROR_FAILED_SERVICE_CONTROLLER_CONNECT` (1063) if there is no controlling
|
||||||
|
/// SCM, which is the expected outcome of running `guruconnect service-run` by hand.
|
||||||
|
pub fn run_dispatcher() -> Result<()> {
|
||||||
|
service_dispatcher::start(SERVICE_NAME, ffi_service_main)
|
||||||
|
.context("failed to connect to the service control dispatcher (must be started by the SCM)")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// SCM-invoked service body. Any error is logged; the function cannot return an
|
||||||
|
/// error to the SCM directly, so [`run_service`] reports a failed exit code on the
|
||||||
|
/// status handle before returning.
|
||||||
|
fn service_main(_arguments: Vec<OsString>) {
|
||||||
|
if let Err(e) = run_service() {
|
||||||
|
error!("service exited with error: {e:#}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Drive the full service lifecycle: register the control handler, report
|
||||||
|
/// `Running`, run the persistent agent until a stop is requested, then report
|
||||||
|
/// `Stopped`.
|
||||||
|
fn run_service() -> Result<()> {
|
||||||
|
info!("GuruConnect managed agent service starting (running as SYSTEM in session 0)");
|
||||||
|
|
||||||
|
// Cooperative shutdown flag flipped by the SCM control handler and observed by
|
||||||
|
// the agent runtime. `AtomicBool` keeps the handler closure trivially `Send`
|
||||||
|
// and avoids holding a lock inside an SCM callback.
|
||||||
|
let shutdown = Arc::new(AtomicBool::new(false));
|
||||||
|
let shutdown_for_handler = shutdown.clone();
|
||||||
|
|
||||||
|
let event_handler = move |control_event| -> ServiceControlHandlerResult {
|
||||||
|
match control_event {
|
||||||
|
// SPEC-018 Phase 1: graceful stop. Phase 2 adds
|
||||||
|
// `ServiceControl::SessionChange(_)` here to drive the session broker
|
||||||
|
// (retarget the capture/input worker on logon/logoff/console-connect);
|
||||||
|
// we intentionally do not accept SESSIONCHANGE yet.
|
||||||
|
ServiceControl::Stop | ServiceControl::Shutdown => {
|
||||||
|
info!("received {control_event:?}; signalling agent to shut down");
|
||||||
|
// Set the cooperative-stop flag. The agent runtime observes it on
|
||||||
|
// every idle tick of the connected session loop and between
|
||||||
|
// reconnect attempts (SPEC-018 finding H), so it breaks out and
|
||||||
|
// closes the WebSocket cleanly within ~100ms even if a session is
|
||||||
|
// currently connected.
|
||||||
|
shutdown_for_handler.store(true, Ordering::SeqCst);
|
||||||
|
ServiceControlHandlerResult::NoError
|
||||||
|
}
|
||||||
|
ServiceControl::Interrogate => ServiceControlHandlerResult::NoError,
|
||||||
|
_ => ServiceControlHandlerResult::NotImplemented,
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let status_handle = service_control_handler::register(SERVICE_NAME, event_handler)
|
||||||
|
.context("failed to register the service control handler")?;
|
||||||
|
|
||||||
|
// Report StartPending while we spin up the runtime and connect.
|
||||||
|
set_status(
|
||||||
|
&status_handle,
|
||||||
|
ServiceState::StartPending,
|
||||||
|
ServiceControlAccept::empty(),
|
||||||
|
TRANSITION_WAIT,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Report Running and accept Stop + Shutdown. We report Running before the
|
||||||
|
// first connect attempt completes because the agent loop reconnects forever;
|
||||||
|
// "the service is up and trying" is the correct steady state, and blocking the
|
||||||
|
// SCM on the first relay handshake would risk a start timeout on a slow boot.
|
||||||
|
set_status(
|
||||||
|
&status_handle,
|
||||||
|
ServiceState::Running,
|
||||||
|
ServiceControlAccept::STOP | ServiceControlAccept::SHUTDOWN,
|
||||||
|
Duration::default(),
|
||||||
|
);
|
||||||
|
info!("service reported Running; entering managed-agent control loop");
|
||||||
|
|
||||||
|
// Run the existing persistent-agent logic as SYSTEM. This is the Phase 1
|
||||||
|
// payload: resolve/enroll the cak_ (SPEC-016) and hold the relay connection.
|
||||||
|
let run_result = crate::run_managed_agent_service(shutdown.clone());
|
||||||
|
|
||||||
|
if let Err(e) = &run_result {
|
||||||
|
// The agent loop only returns Err on an unrecoverable LOCAL fault (e.g. no
|
||||||
|
// usable credential and nothing to enroll with). Network errors are
|
||||||
|
// retried inside the loop and never surface here. Report the failure to
|
||||||
|
// the SCM so recovery actions (restart) engage.
|
||||||
|
error!("managed-agent control loop terminated with error: {e:#}");
|
||||||
|
} else {
|
||||||
|
info!("managed-agent control loop exited cleanly on stop request");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Transition StopPending -> Stopped.
|
||||||
|
set_status(
|
||||||
|
&status_handle,
|
||||||
|
ServiceState::StopPending,
|
||||||
|
ServiceControlAccept::empty(),
|
||||||
|
TRANSITION_WAIT,
|
||||||
|
);
|
||||||
|
|
||||||
|
let exit_code = match run_result {
|
||||||
|
Ok(()) => ServiceExitCode::Win32(0),
|
||||||
|
// ERROR_SERVICE_SPECIFIC_ERROR-style: surface a non-zero service-specific
|
||||||
|
// code so the SCM treats the exit as a failure and applies recovery.
|
||||||
|
Err(_) => ServiceExitCode::ServiceSpecific(1),
|
||||||
|
};
|
||||||
|
|
||||||
|
set_status_with_exit(
|
||||||
|
&status_handle,
|
||||||
|
ServiceState::Stopped,
|
||||||
|
ServiceControlAccept::empty(),
|
||||||
|
Duration::default(),
|
||||||
|
exit_code,
|
||||||
|
);
|
||||||
|
info!("service reported Stopped");
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Report a status with a zero (success) exit code.
|
||||||
|
fn set_status(
|
||||||
|
handle: &service_control_handler::ServiceStatusHandle,
|
||||||
|
state: ServiceState,
|
||||||
|
accepted: ServiceControlAccept,
|
||||||
|
wait_hint: Duration,
|
||||||
|
) {
|
||||||
|
set_status_with_exit(
|
||||||
|
handle,
|
||||||
|
state,
|
||||||
|
accepted,
|
||||||
|
wait_hint,
|
||||||
|
ServiceExitCode::Win32(0),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Report a status to the SCM. A failure to report is logged (best-effort) — we
|
||||||
|
/// cannot do anything actionable about it and must not panic inside the service.
|
||||||
|
fn set_status_with_exit(
|
||||||
|
handle: &service_control_handler::ServiceStatusHandle,
|
||||||
|
state: ServiceState,
|
||||||
|
accepted: ServiceControlAccept,
|
||||||
|
wait_hint: Duration,
|
||||||
|
exit_code: ServiceExitCode,
|
||||||
|
) {
|
||||||
|
let status = ServiceStatus {
|
||||||
|
service_type: ServiceType::OWN_PROCESS,
|
||||||
|
current_state: state,
|
||||||
|
controls_accepted: accepted,
|
||||||
|
exit_code,
|
||||||
|
checkpoint: 0,
|
||||||
|
wait_hint,
|
||||||
|
process_id: None,
|
||||||
|
};
|
||||||
|
if let Err(e) = handle.set_service_status(status) {
|
||||||
|
warn!("failed to report service status {state:?} to the SCM: {e}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Install / uninstall (used by install.rs for managed mode)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
/// Install (or reinstall) the managed agent as a LocalSystem auto-start service
|
||||||
|
/// pointing at `exe_path` with the [`SERVICE_RUN_ARG`] launch argument.
|
||||||
|
///
|
||||||
|
/// Idempotent: if the service already exists it is stopped and deleted first,
|
||||||
|
/// then recreated, so an upgrade picks up a new binary path / config. Configures
|
||||||
|
/// crash recovery (restart on failure) via `sc failure`.
|
||||||
|
///
|
||||||
|
/// Requires Administrator (SCM `CREATE_SERVICE`). Returns an error otherwise.
|
||||||
|
pub fn install_service(exe_path: &std::path::Path) -> Result<()> {
|
||||||
|
let manager = ServiceManager::local_computer(
|
||||||
|
None::<&str>,
|
||||||
|
ServiceManagerAccess::CONNECT | ServiceManagerAccess::CREATE_SERVICE,
|
||||||
|
)
|
||||||
|
.context("failed to connect to the Service Control Manager (run as Administrator)")?;
|
||||||
|
|
||||||
|
// Remove any prior installation so the binary path / args are refreshed.
|
||||||
|
let mut deleted_existing = false;
|
||||||
|
if let Ok(existing) = manager.open_service(
|
||||||
|
SERVICE_NAME,
|
||||||
|
ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE,
|
||||||
|
) {
|
||||||
|
info!("existing {SERVICE_NAME} service found; removing before reinstall");
|
||||||
|
stop_if_running(&existing);
|
||||||
|
existing
|
||||||
|
.delete()
|
||||||
|
.context("failed to delete the existing service before reinstall")?;
|
||||||
|
drop(existing);
|
||||||
|
deleted_existing = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
let service_info = ServiceInfo {
|
||||||
|
name: OsString::from(SERVICE_NAME),
|
||||||
|
display_name: OsString::from(SERVICE_DISPLAY_NAME),
|
||||||
|
service_type: ServiceType::OWN_PROCESS,
|
||||||
|
start_type: ServiceStartType::AutoStart,
|
||||||
|
error_control: ServiceErrorControl::Normal,
|
||||||
|
executable_path: exe_path.to_path_buf(),
|
||||||
|
launch_arguments: vec![OsString::from(SERVICE_RUN_ARG)],
|
||||||
|
dependencies: vec![],
|
||||||
|
// account_name: None => LocalSystem (the SPEC-018 requirement).
|
||||||
|
account_name: None,
|
||||||
|
account_password: None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let service = create_service_with_retry(&manager, &service_info, deleted_existing)
|
||||||
|
.context("failed to create the GuruConnect managed agent service")?;
|
||||||
|
|
||||||
|
service
|
||||||
|
.set_description(SERVICE_DESCRIPTION)
|
||||||
|
.context("failed to set the service description")?;
|
||||||
|
|
||||||
|
configure_recovery();
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"installed {SERVICE_NAME} (LocalSystem, auto-start) -> {} {}",
|
||||||
|
exe_path.display(),
|
||||||
|
SERVICE_RUN_ARG
|
||||||
|
);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create the service, retrying briefly if the SCM still has the prior instance
|
||||||
|
/// "marked for deletion" (SPEC-018 finding L1).
|
||||||
|
///
|
||||||
|
/// When a service is deleted, the SCM only removes it from its database once every
|
||||||
|
/// open handle to it closes; until then a fresh `CreateService` fails with
|
||||||
|
/// `ERROR_SERVICE_MARKED_FOR_DELETE` (1072). The previous implementation papered
|
||||||
|
/// over this with a fixed 2s sleep after `delete()`, which is both slower than
|
||||||
|
/// necessary in the common case and still racy on a busy box. Instead we attempt
|
||||||
|
/// the create immediately and, only if we just deleted an existing instance and
|
||||||
|
/// hit 1072, retry a few times with short backoff — succeeding as soon as the SCM
|
||||||
|
/// finishes the removal, and giving up with the real error if it never does.
|
||||||
|
///
|
||||||
|
/// The retry is gated on `deleted_existing`: on a clean first install there was no
|
||||||
|
/// prior instance, so a 1072 there is unexpected and is surfaced immediately
|
||||||
|
/// rather than masked by retries.
|
||||||
|
fn create_service_with_retry(
|
||||||
|
manager: &ServiceManager,
|
||||||
|
service_info: &ServiceInfo,
|
||||||
|
deleted_existing: bool,
|
||||||
|
) -> Result<windows_service::service::Service, windows_service::Error> {
|
||||||
|
// ERROR_SERVICE_MARKED_FOR_DELETE (winerror.h). The service is gone from the
|
||||||
|
// caller's perspective but the SCM has not finished reaping it.
|
||||||
|
const ERROR_SERVICE_MARKED_FOR_DELETE: i32 = 1072;
|
||||||
|
// Bounded: ~5 attempts over ~2s total worst case (matches the old fixed sleep
|
||||||
|
// ceiling) but returns the instant the SCM is ready.
|
||||||
|
const MAX_ATTEMPTS: u32 = 5;
|
||||||
|
const BACKOFF: Duration = Duration::from_millis(400);
|
||||||
|
|
||||||
|
let mut attempt = 0;
|
||||||
|
loop {
|
||||||
|
attempt += 1;
|
||||||
|
match manager.create_service(service_info, ServiceAccess::CHANGE_CONFIG) {
|
||||||
|
Ok(service) => return Ok(service),
|
||||||
|
Err(windows_service::Error::Winapi(ref io_err))
|
||||||
|
if deleted_existing
|
||||||
|
&& io_err.raw_os_error() == Some(ERROR_SERVICE_MARKED_FOR_DELETE)
|
||||||
|
&& attempt < MAX_ATTEMPTS =>
|
||||||
|
{
|
||||||
|
warn!(
|
||||||
|
"{SERVICE_NAME} still marked for deletion by the SCM \
|
||||||
|
(attempt {attempt}/{MAX_ATTEMPTS}); retrying in {}ms",
|
||||||
|
BACKOFF.as_millis()
|
||||||
|
);
|
||||||
|
std::thread::sleep(BACKOFF);
|
||||||
|
}
|
||||||
|
Err(e) => return Err(e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Configure SCM crash-recovery so the service restarts on unexpected exit.
|
||||||
|
///
|
||||||
|
/// `windows-service` 0.7 does not expose `ChangeServiceConfig2` recovery actions
|
||||||
|
/// in a stable, ergonomic form, so we mirror the established pattern used by the
|
||||||
|
/// SAS service binary and shell out to `sc failure`. `reset=86400` clears the
|
||||||
|
/// failure count after a day; three `restart/5000` actions retry after 5s each.
|
||||||
|
fn configure_recovery() {
|
||||||
|
use std::os::windows::process::CommandExt;
|
||||||
|
const CREATE_NO_WINDOW: u32 = 0x0800_0000;
|
||||||
|
|
||||||
|
match std::process::Command::new("sc")
|
||||||
|
.args([
|
||||||
|
"failure",
|
||||||
|
SERVICE_NAME,
|
||||||
|
"reset=86400",
|
||||||
|
"actions=restart/5000/restart/5000/restart/5000",
|
||||||
|
])
|
||||||
|
.creation_flags(CREATE_NO_WINDOW)
|
||||||
|
.output()
|
||||||
|
{
|
||||||
|
Ok(out) if out.status.success() => {
|
||||||
|
info!("configured crash-recovery (restart) for {SERVICE_NAME}");
|
||||||
|
}
|
||||||
|
Ok(out) => {
|
||||||
|
warn!(
|
||||||
|
"could not configure crash-recovery for {SERVICE_NAME} (sc failure exit {:?}); \
|
||||||
|
the service will still run but will not auto-restart on crash",
|
||||||
|
out.status.code()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
warn!("could not invoke `sc failure` to set crash-recovery for {SERVICE_NAME}: {e}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Stop (if running) and delete the managed agent service. Idempotent: succeeds
|
||||||
|
/// quietly if the service is not installed.
|
||||||
|
pub fn uninstall_service() -> Result<()> {
|
||||||
|
let manager = ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT)
|
||||||
|
.context("failed to connect to the Service Control Manager (run as Administrator)")?;
|
||||||
|
|
||||||
|
match manager.open_service(
|
||||||
|
SERVICE_NAME,
|
||||||
|
ServiceAccess::QUERY_STATUS | ServiceAccess::STOP | ServiceAccess::DELETE,
|
||||||
|
) {
|
||||||
|
Ok(service) => {
|
||||||
|
stop_if_running(&service);
|
||||||
|
service
|
||||||
|
.delete()
|
||||||
|
.context("failed to delete the managed agent service")?;
|
||||||
|
info!("uninstalled {SERVICE_NAME} service");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
// Not installed — nothing to do (idempotent uninstall).
|
||||||
|
info!("{SERVICE_NAME} service is not installed; nothing to uninstall");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Start the managed agent service now (used right after a first-run install so
|
||||||
|
/// the agent comes up without waiting for the next boot). Best-effort: logs and
|
||||||
|
/// returns the SCM error if the start fails, but a failure is not fatal to install
|
||||||
|
/// because the service is auto-start and will come up on the next boot regardless.
|
||||||
|
pub fn start_service() -> Result<()> {
|
||||||
|
let manager = ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT)
|
||||||
|
.context("failed to connect to the Service Control Manager")?;
|
||||||
|
let service = manager
|
||||||
|
.open_service(
|
||||||
|
SERVICE_NAME,
|
||||||
|
ServiceAccess::START | ServiceAccess::QUERY_STATUS,
|
||||||
|
)
|
||||||
|
.context("failed to open the managed agent service to start it")?;
|
||||||
|
|
||||||
|
// If it is already running (e.g. reinstall-over-running), there is nothing to do.
|
||||||
|
if let Ok(status) = service.query_status() {
|
||||||
|
if status.current_state == ServiceState::Running
|
||||||
|
|| status.current_state == ServiceState::StartPending
|
||||||
|
{
|
||||||
|
info!("{SERVICE_NAME} is already running/starting");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
service
|
||||||
|
.start::<String>(&[])
|
||||||
|
.context("failed to start the managed agent service")?;
|
||||||
|
info!("started {SERVICE_NAME}");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Report whether the managed agent service is currently installed.
|
||||||
|
pub fn is_service_installed() -> bool {
|
||||||
|
match ServiceManager::local_computer(None::<&str>, ServiceManagerAccess::CONNECT) {
|
||||||
|
Ok(manager) => manager
|
||||||
|
.open_service(SERVICE_NAME, ServiceAccess::QUERY_STATUS)
|
||||||
|
.is_ok(),
|
||||||
|
Err(_) => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Best-effort stop of a service, waiting briefly for it to leave the running
|
||||||
|
/// state so a subsequent `delete` does not race an in-flight stop.
|
||||||
|
fn stop_if_running(service: &windows_service::service::Service) {
|
||||||
|
if let Ok(status) = service.query_status() {
|
||||||
|
if status.current_state != ServiceState::Stopped {
|
||||||
|
info!("stopping {SERVICE_NAME} before delete");
|
||||||
|
let _ = service.stop();
|
||||||
|
for _ in 0..10 {
|
||||||
|
std::thread::sleep(Duration::from_millis(500));
|
||||||
|
match service.query_status() {
|
||||||
|
Ok(s) if s.current_state == ServiceState::Stopped => break,
|
||||||
|
_ => continue,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the literal value of [`SERVICE_RUN_ARG`].
    ///
    /// The launch argument the service is registered with MUST equal the hidden
    /// `service-run` subcommand that `main.rs` dispatches into
    /// [`run_dispatcher`]; on a mismatch the SCM could start the service but it
    /// would fall through to normal (non-service) mode and immediately exit.
    ///
    /// The companion test `tests::service_run_subcommand_matches_scm_launch_arg`
    /// in `main.rs` pins the other half — that the clap
    /// `#[command(name = "service-run")]` attribute on `Commands::ServiceRun`
    /// resolves to this same constant — so the two literals cannot drift apart.
    #[test]
    fn service_run_arg_matches_subcommand_name() {
        let expected_subcommand = "service-run";
        assert_eq!(SERVICE_RUN_ARG, expected_subcommand);
    }

    /// All service identifiers are non-empty, and the internal name contains no
    /// whitespace (the SCM key / `sc` argument must be a single token).
    #[test]
    fn service_identifiers_are_well_formed() {
        assert!(!SERVICE_NAME.is_empty());

        let name_has_whitespace = SERVICE_NAME.chars().any(char::is_whitespace);
        assert!(
            !name_has_whitespace,
            "the SCM service name must be a single whitespace-free token"
        );

        assert!(!SERVICE_DISPLAY_NAME.is_empty());
        assert!(!SERVICE_DESCRIPTION.is_empty());
    }

    /// `is_service_installed` must never panic regardless of elevation or SCM
    /// access; on a dev workstation without the service installed it returns
    /// `false`. (The service is NOT installed by tests — that is a VM/admin
    /// integration step — so only totality is checked, not the value.)
    #[test]
    fn is_service_installed_is_total() {
        let _installed: bool = is_service_installed();
    }
}
|
||||||
@@ -41,8 +41,18 @@ use crate::proto::{message, AgentStatus, ChatMessage, Heartbeat, HeartbeatAck, M
|
|||||||
use crate::transport::WebSocketTransport;
|
use crate::transport::WebSocketTransport;
|
||||||
use crate::tray::{TrayAction, TrayController};
|
use crate::tray::{TrayAction, TrayController};
|
||||||
use anyhow::Result;
|
use anyhow::Result;
|
||||||
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
use std::time::{Duration, Instant};
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
|
/// Sentinel error string returned by [`SessionManager::run_with_tray`] when the
|
||||||
|
/// loop breaks because the SCM asked the managed-agent service to stop (SPEC-018,
|
||||||
|
/// finding H). The outer `run_agent` loop matches on this to treat the exit as a
|
||||||
|
/// graceful service stop (clean WS close, no reconnect) rather than a session
|
||||||
|
/// error. Only the service path passes a shutdown flag, so only the service path
|
||||||
|
/// can ever produce this.
|
||||||
|
pub const SERVICE_STOP_SENTINEL: &str = "SERVICE_STOP";
|
||||||
|
|
||||||
// Heartbeat interval (30 seconds)
|
// Heartbeat interval (30 seconds)
|
||||||
const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(30);
|
const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(30);
|
||||||
// Status report interval (60 seconds)
|
// Status report interval (60 seconds)
|
||||||
@@ -285,16 +295,34 @@ impl SessionManager {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Run the session main loop with tray and chat event processing
|
/// Run the session main loop with tray and chat event processing.
|
||||||
|
///
|
||||||
|
/// `service_shutdown` (SPEC-018 finding H) is the SCM cooperative-stop flag.
|
||||||
|
/// It is `Some(flag)` ONLY on the managed-agent service path; the
|
||||||
|
/// attended/viewer/interactive callers pass `None` and behave EXACTLY as
|
||||||
|
/// before. When present, the flag is polled on every idle tick (the natural
|
||||||
|
/// ~100ms seam below) so an SCM Stop/Shutdown received while CONNECTED breaks
|
||||||
|
/// this inner loop promptly — instead of only being observed by the outer
|
||||||
|
/// `run_agent` reconnect loop, which never runs while a session is connected.
|
||||||
|
/// On a set flag the loop closes the WebSocket cleanly (via the shared exit
|
||||||
|
/// path at the bottom) and returns the [`SERVICE_STOP_SENTINEL`] error, which
|
||||||
|
/// the outer loop maps to a graceful stop.
|
||||||
pub async fn run_with_tray(
|
pub async fn run_with_tray(
|
||||||
&mut self,
|
&mut self,
|
||||||
tray: Option<&TrayController>,
|
tray: Option<&TrayController>,
|
||||||
chat: Option<&ChatController>,
|
chat: Option<&ChatController>,
|
||||||
|
service_shutdown: Option<&Arc<AtomicBool>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
if self.transport.is_none() {
|
if self.transport.is_none() {
|
||||||
anyhow::bail!("Not connected");
|
anyhow::bail!("Not connected");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Helper: has the SCM asked the service to stop? Always false off the
|
||||||
|
// service path (where `service_shutdown` is `None`).
|
||||||
|
let stop_requested = |flag: Option<&Arc<AtomicBool>>| -> bool {
|
||||||
|
flag.is_some_and(|f| f.load(Ordering::SeqCst))
|
||||||
|
};
|
||||||
|
|
||||||
// Send initial status
|
// Send initial status
|
||||||
self.send_status().await?;
|
self.send_status().await?;
|
||||||
|
|
||||||
@@ -307,6 +335,29 @@ impl SessionManager {
|
|||||||
|
|
||||||
// Main loop
|
// Main loop
|
||||||
loop {
|
loop {
|
||||||
|
// SPEC-018 (finding H): honour an SCM stop request received while the
|
||||||
|
// session is CONNECTED. The outer `run_agent` loop only observes the
|
||||||
|
// flag between connection attempts, but a managed agent spends its
|
||||||
|
// entire connected life inside THIS loop — so without this check an
|
||||||
|
// SCM Stop while connected would not break out until the connection
|
||||||
|
// dropped on its own. Breaking here falls through to the shared exit
|
||||||
|
// path below, which closes the transport cleanly (clean WS close);
|
||||||
|
// the sentinel tells the outer loop this was a graceful stop.
|
||||||
|
if stop_requested(service_shutdown) {
|
||||||
|
tracing::info!("Service stop requested; ending connected session loop");
|
||||||
|
self.release_streaming();
|
||||||
|
self.state = SessionState::Disconnected;
|
||||||
|
if let Some(transport) = self.transport.as_mut() {
|
||||||
|
// Best-effort clean WebSocket close (sends a Close frame). A
|
||||||
|
// failure here just means the peer/socket is already gone; the
|
||||||
|
// service still stops cleanly.
|
||||||
|
if let Err(e) = transport.close().await {
|
||||||
|
tracing::warn!("error during clean WebSocket close on service stop: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return Err(anyhow::anyhow!(SERVICE_STOP_SENTINEL));
|
||||||
|
}
|
||||||
|
|
||||||
// Process tray events
|
// Process tray events
|
||||||
if let Some(t) = tray {
|
if let Some(t) = tray {
|
||||||
if let Some(action) = t.process_events() {
|
if let Some(action) = t.process_events() {
|
||||||
@@ -745,3 +796,47 @@ impl SessionManager {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// SPEC-018 finding H: the connected-stop contract.
    ///
    /// On an SCM stop, `run_with_tray` returns an error built from
    /// [`SERVICE_STOP_SENTINEL`], and the outer `run_agent` loop recognises a
    /// graceful stop with `error_msg.contains(SERVICE_STOP_SENTINEL)`. This test
    /// pins that an error constructed the same way satisfies that match, so the
    /// producer here and the consumer in `main.rs` cannot drift.
    ///
    /// Exercising the in-loop interrupt end to end would need a live connected
    /// transport (a real or mocked server), which is an integration concern;
    /// this unit test pins only the wire contract the interrupt relies on.
    #[test]
    fn service_stop_sentinel_is_matched_by_outer_loop_check() {
        let rendered = anyhow::anyhow!(SERVICE_STOP_SENTINEL).to_string();
        assert!(
            rendered.contains(SERVICE_STOP_SENTINEL),
            "the stop error must contain the sentinel the outer loop matches on"
        );
        assert!(
            !SERVICE_STOP_SENTINEL.is_empty(),
            "the sentinel must be a non-empty, distinctive token"
        );
    }

    /// With no flag supplied (the attended/viewer/interactive paths) the
    /// shutdown check is always `false`, so the new parameter is a pure addition
    /// that cannot alter non-service behaviour (SPEC-018 finding H: "no
    /// regression"). With a flag supplied, the check mirrors the flag's value.
    #[test]
    fn no_shutdown_flag_never_requests_stop() {
        // Local replica of the predicate used inside `run_with_tray`.
        fn check(flag: Option<&Arc<AtomicBool>>) -> bool {
            match flag {
                Some(f) => f.load(Ordering::SeqCst),
                None => false,
            }
        }

        assert!(!check(None));

        let set = Arc::new(AtomicBool::new(true));
        assert!(check(Some(&set)));

        let unset = Arc::new(AtomicBool::new(false));
        assert!(!check(Some(&unset)));
    }
}
|
||||||
|
|||||||
@@ -16,11 +16,16 @@ stack. It ships independently of GuruRMM and integrates with it via a versioned
|
|||||||
> match, blacklist-on-WS, agent-plane rejects user JWTs via per-agent `cak_` keys). The feature specs below
|
> match, blacklist-on-WS, agent-plane rejects user JWTs via per-agent `cak_` keys). The feature specs below
|
||||||
> (SPEC-003–009) are **work-items inside the later v2 phases** — see the mapping.
|
> (SPEC-003–009) are **work-items inside the later v2 phases** — see the mapping.
|
||||||
>
|
>
|
||||||
> **Remaining to formally exit Phase 1:** secure-session-core **Task 8** (end-to-end verification +
|
> **Phase 1 formally EXITED (2026-05-31).** secure-session-core **Task 8** is complete — end-to-end
|
||||||
> `/gc-audit --pass=security` re-audit + the manual CRITICAL checks) and Code-Review sign-off on Tasks 3–5
|
> functional verification (live CRITICAL boundary checks against the deployed binary: login-JWT→401,
|
||||||
> (implemented without a local toolchain at the time; since built + deployed). Live HW-H.264 validation is
|
> wrong-session viewer token→403, JWT-as-agent-key→401) **plus the `/gc-audit --pass=security` re-audit:
|
||||||
> also pending — raw+Zstd remains the shipping default. ~~Sprint 0 (relay-auth CRITICAL hotfix)~~ **not
|
> PASS, 0 CRITICAL/HIGH/MEDIUM/LOW** ([report](../reports/2026-05-31-gc-audit.md)). Code-Review sign-off on
|
||||||
> needed — those fixes shipped in Tasks 2–3.**
|
> Tasks 3–5 landed earlier. On top of Phase 1, **SPEC-004 (Tasks 2/4/5 — machine_uid dedup, session
|
||||||
|
> reaping, operator removal API+UI) is implemented, reviewed, deployed, and the 11 live ghost rows were
|
||||||
|
> purged**; the agent is now **auto-versioned + Azure-Trusted-Signing-signed via `release.yml`** with
|
||||||
|
> **v0.3.0 published** as the stable release. ~~Sprint 0 (relay-auth CRITICAL hotfix)~~ **not needed.**
|
||||||
|
> Still pending (NOT a Phase-1 blocker): live HW-H.264 cross-GPU validation — **raw+Zstd remains the
|
||||||
|
> shipping default** (`DEFAULT_PREFER_H264=false`) until H.264 is validated across GPUs.
|
||||||
|
|
||||||
### v2 phase mapping of current specs
|
### v2 phase mapping of current specs
|
||||||
|
|
||||||
@@ -43,8 +48,9 @@ stack. It ships independently of GuruRMM and integrates with it via a versioned
|
|||||||
|
|
||||||
Bringing GC to parity with GuruRMM's release engineering. Full plan: [SPEC-001](specs/SPEC-001-operational-tooling-parity.md).
|
Bringing GC to parity with GuruRMM's release engineering. Full plan: [SPEC-001](specs/SPEC-001-operational-tooling-parity.md).
|
||||||
|
|
||||||
- [ ] **Code signing — Azure Trusted Signing in CI** — P1 — sign the Windows agent `.exe` via `jsign` (TRUSTEDSIGNING) in Gitea Actions, reusing the shared ACG cert profile. (SPEC-001 §2)
|
- [x] **Code signing — Azure Trusted Signing in CI** — P1 — Windows agent `.exe` signed via `jsign` (TRUSTEDSIGNING) in `release.yml`, fail-closed (never publishes unsigned). Shipped with v0.3.0. (SPEC-001 §2)
|
||||||
- [ ] **Automatic versioning** — P1 — conventional-commit-driven version bump across agent/server/dashboard, embedded via `build.rs`. (SPEC-001 §3)
|
- [ ] **Signed beta/test release channel** — **P1 — NOW** — every binary we hand to a tester must be signed, but signing today only runs on a deliberate full `release.yml` dispatch; the automatic `build-and-test.yml` agent artifact is explicitly **unsigned**. Add a `channel: stable | beta` `workflow_dispatch` input to `release.yml`: `beta` signs the agent and publishes a prerelease-tagged Gitea release (e.g. `v0.4.0-beta.1`) **skipping the semver bump + changelog**; `stable` keeps the existing full path. Keeps signing secrets out of PR-triggered runs. (SPEC-001 §2)
|
||||||
|
- [x] **Automatic versioning** — P1 — conventional-commit-driven version bump computed at dispatch in `release.yml`, embedded via `build.rs`. Shipped with v0.3.0. (SPEC-001 §3)
|
||||||
- [ ] **Changelog generation & API** — P2 — `CHANGELOG.md` + per-version changelogs from conventional commits, served at `/api/changelog/...`. (SPEC-001 §4)
|
- [ ] **Changelog generation & API** — P2 — `CHANGELOG.md` + per-version changelogs from conventional commits, served at `/api/changelog/...`. (SPEC-001 §4)
|
||||||
- [ ] **Feature-request workflow** — P2 — `/gc-feature-request` skill producing `docs/specs/SPEC-NNN-*.md` and updating this roadmap. (SPEC-001 §1)
|
- [ ] **Feature-request workflow** — P2 — `/gc-feature-request` skill producing `docs/specs/SPEC-NNN-*.md` and updating this roadmap. (SPEC-001 §1)
|
||||||
- [ ] **Roadmap / ADR / spec tracking** — P1 — this file + `ARCHITECTURE_DECISIONS.md` + `docs/specs/`. (SPEC-001 §5) — *bootstrapped*
|
- [ ] **Roadmap / ADR / spec tracking** — P1 — this file + `ARCHITECTURE_DECISIONS.md` + `docs/specs/`. (SPEC-001 §5) — *bootstrapped*
|
||||||
@@ -62,6 +68,7 @@ Bringing GC to parity with GuruRMM's release engineering. Full plan: [SPEC-001](
|
|||||||
- [x] Protobuf-over-WSS transport, Zstd frame compression
|
- [x] Protobuf-over-WSS transport, Zstd frame compression
|
||||||
- [~] React/TS web viewer (`dashboard/src/components/RemoteViewer.tsx`) — embeddable session viewer
|
- [~] React/TS web viewer (`dashboard/src/components/RemoteViewer.tsx`) — embeddable session viewer
|
||||||
- [ ] **Headless Linux mode (direct TTY access)** — P2 — Terminal-based remote access for Linux servers without GUI. PTY spawn (`openpty`), xterm.js web viewer, full ANSI/VT100 support. Enables server management, container debugging, emergency recovery via GuruConnect dashboard with audit logging. SSH replacement with centralized auth. ([SPEC-012](specs/SPEC-012-headless-linux-tty.md))
|
- [ ] **Headless Linux mode (direct TTY access)** — P2 — Terminal-based remote access for Linux servers without GUI. PTY spawn (`openpty`), xterm.js web viewer, full ANSI/VT100 support. Enables server management, container debugging, emergency recovery via GuruConnect dashboard with audit logging. SSH replacement with centralized auth. ([SPEC-012](specs/SPEC-012-headless-linux-tty.md))
|
||||||
|
- [ ] **Managed-agent SYSTEM service host + session broker** — P1 — convert the persistent agent from `HKCU Run` (user context) to a LocalSystem **service** that runs unattended (login screen, no user, across reboots) and spawns a per-session capture/input worker into the active desktop (Session 0 can't capture directly). Unblocks SPEC-016 Phase B end-to-end (the SYSTEM-ACL'd `cak_` store becomes readable; removes the Phase B fail-fast guard), enables true unattended access, and is the **broker primitive SPEC-013 builds on**. ([SPEC-018](specs/SPEC-018-managed-agent-service-host.md))
|
||||||
- [ ] **Windows session selection and backstage mode** — P2 — Enumerate and switch between Windows user sessions (Terminal Services/RDP/Fast User Switching) and access Session 0 (backstage) for system-level admin tasks. ScreenConnect parity: session selector shows all logged-on users, instant switching without reconnect. Backstage mode provides terminal/command interface for services management without disrupting any user desktop. Critical for multi-user server environments. ([SPEC-013](specs/SPEC-013-session-selection-and-backstage.md))
|
- [ ] **Windows session selection and backstage mode** — P2 — Enumerate and switch between Windows user sessions (Terminal Services/RDP/Fast User Switching) and access Session 0 (backstage) for system-level admin tasks. ScreenConnect parity: session selector shows all logged-on users, instant switching without reconnect. Backstage mode provides terminal/command interface for services management without disrupting any user desktop. Critical for multi-user server environments. ([SPEC-013](specs/SPEC-013-session-selection-and-backstage.md))
|
||||||
- [ ] **Configurable notification overlay on viewer connection** — P2 — Display a semi-transparent on-screen notification when a technician connects, showing technician name and company. Dashboard-configurable message template (supports `{{technician_name}}`, `{{company}}`, `{{time}}`), duration (5-60s), position (top-left/right, bottom-left/right, center), and dismissible behavior. Increases transparency and user awareness during remote support sessions. Compliance-friendly for privacy policies requiring user notification. ([SPEC-015](specs/SPEC-015-notification-overlay.md))
|
- [ ] **Configurable notification overlay on viewer connection** — P2 — Display a semi-transparent on-screen notification when a technician connects, showing technician name and company. Dashboard-configurable message template (supports `{{technician_name}}`, `{{company}}`, `{{time}}`), duration (5-60s), position (top-left/right, bottom-left/right, center), and dismissible behavior. Increases transparency and user awareness during remote support sessions. Compliance-friendly for privacy policies requiring user notification. ([SPEC-015](specs/SPEC-015-notification-overlay.md))
|
||||||
- [ ] Multi-monitor switching — P2
|
- [ ] Multi-monitor switching — P2
|
||||||
@@ -81,13 +88,15 @@ Bringing GC to parity with GuruRMM's release engineering. Full plan: [SPEC-001](
|
|||||||
- [x] Sessions / machines / support-codes / events
|
- [x] Sessions / machines / support-codes / events
|
||||||
- [ ] **Full machine inventory in the connection DB** — P2 — persist per-machine device inventory (OS+locale+install, CPU/RAM, mfr/model/serial, external WAN IP captured server-side + private LAN IP + MAC, logged-on user, idle, time zone, uptime, local-admin) on `connect_machines`, refreshed each `AgentStatus`, shown in the dashboard machine detail (ScreenConnect "Guest Info" parity). Data layer for SPEC-002 Phase 2; closes GC side of agent-IP gap (todo 7459428e). **[→ v2 Phase 2]** ([SPEC-003](specs/SPEC-003-machine-inventory.md))
|
- [ ] **Full machine inventory in the connection DB** — P2 — persist per-machine device inventory (OS+locale+install, CPU/RAM, mfr/model/serial, external WAN IP captured server-side + private LAN IP + MAC, logged-on user, idle, time zone, uptime, local-admin) on `connect_machines`, refreshed each `AgentStatus`, shown in the dashboard machine detail (ScreenConnect "Guest Info" parity). Data layer for SPEC-002 Phase 2; closes GC side of agent-IP gap (todo 7459428e). **[→ v2 Phase 2]** ([SPEC-003](specs/SPEC-003-machine-inventory.md))
|
||||||
- [ ] **Stable machine identity + session lifecycle reaping + operator removal** — P1 — give the agent a deterministic machine-derived `machine_uid` (Windows `MachineGuid`-based) so the same box can't register duplicates (root cause: `agent_id` is a config-file random UUID that a portable/misconfigured run regenerates each launch); key registration on it; add TTL reaping + same-machine supersede as defense-in-depth; and admin-gated per-row + multi-select bulk removal of stale sessions/units. Identity must be bound to the per-machine agent key (spoof guard). Fixes ghost-session accumulation seen on the live console (15 sessions / 0 live, ~10 orphans for one machine). **[→ v2 Phase 1]** ([SPEC-004](specs/SPEC-004-session-lifecycle-and-removal.md))
|
- [ ] **Stable machine identity + session lifecycle reaping + operator removal** — P1 — give the agent a deterministic machine-derived `machine_uid` (Windows `MachineGuid`-based) so the same box can't register duplicates (root cause: `agent_id` is a config-file random UUID that a portable/misconfigured run regenerates each launch); key registration on it; add TTL reaping + same-machine supersede as defense-in-depth; and admin-gated per-row + multi-select bulk removal of stale sessions/units. Identity must be bound to the per-machine agent key (spoof guard). Fixes ghost-session accumulation seen on the live console (15 sessions / 0 live, ~10 orphans for one machine). **[→ v2 Phase 1]** ([SPEC-004](specs/SPEC-004-session-lifecycle-and-removal.md))
|
||||||
|
- [ ] **Zero-touch per-site agent enrollment** — P1 — ScreenConnect-class managed enrollment: one signed installer per site, machines self-register on first run and the server mints a per-machine `cak_` bound to a deterministic `machine_uid` (dedups re-installs). Per-site **rotatable** enrollment key (long secret + `vN (XXXX)` fingerprint) — rotating blocks new enrollments from old installers, leaves enrolled agents untouched. Auto-approve + new-enrollment/site-move alert. **Sign base agent once (CI, shipped) + per-site signed wrapper that writes site config around the signed bytes — resolves SPEC-007's signature-vs-appended-config question.** Anticipated/deferred: enrollment policy + licensing, `--enroll-key`/`--reassign` flag overrides, technician-assisted interactive install. **[→ v2 Phase 1]** ([SPEC-016](specs/SPEC-016-zero-touch-enrollment.md))
|
||||||
- [ ] **Machines list view — dual connection indicators + rich rows** — P2 — ScreenConnect "Access"-list parity: per-row Host/Guest two-segment connection bar (Guest=agent online, Host=viewer connected, with names + durations) and rich inline metadata (company, site, device type, tags, logged-on user + idle, client version in red when outdated). Server-enriches `/api/machines` with live session state + SPEC-003 inventory. **[→ v2 Phase 2]** ([SPEC-005](specs/SPEC-005-machines-list-view-parity.md))
|
- [ ] **Machines list view — dual connection indicators + rich rows** — P2 — ScreenConnect "Access"-list parity: per-row Host/Guest two-segment connection bar (Guest=agent online, Host=viewer connected, with names + durations) and rich inline metadata (company, site, device type, tags, logged-on user + idle, client version in red when outdated). Server-enriches `/api/machines` with live session state + SPEC-003 inventory. **[→ v2 Phase 2]** ([SPEC-005](specs/SPEC-005-machines-list-view-parity.md))
|
||||||
- [ ] Machines "by Company" tree nav with per-company counts — P3 — left-nav grouping sidebar (screenshot parity). Follow-up sub-item of SPEC-005.
|
- [ ] Machines "by Company" tree nav with per-company counts — P3 — left-nav grouping sidebar (screenshot parity). Follow-up sub-item of SPEC-005.
|
||||||
- [ ] **Universal machine search ("everything is searchable")** — P2 — server-side `?q=` on `/api/machines` matching case-insensitive substring across ALL attributes (OS, logged-on user, external/private IP, company, site, tag, serial, MAC, version, …), pg_trgm GIN-indexed; multi-term AND + optional field-scoped syntax (`os:`, `user:`, `ip:`). Replaces the hostname-only client filter. Depends on SPEC-003 (attrs must be persisted). **[→ v2 Phase 2]** ([SPEC-006](specs/SPEC-006-universal-machine-search.md))
|
- [ ] **Universal machine search ("everything is searchable")** — P2 — server-side `?q=` on `/api/machines` matching case-insensitive substring across ALL attributes (OS, logged-on user, external/private IP, company, site, tag, serial, MAC, version, …), pg_trgm GIN-indexed; multi-term AND + optional field-scoped syntax (`os:`, `user:`, `ip:`). Replaces the hostname-only client filter. Depends on SPEC-003 (attrs must be persisted). **[→ v2 Phase 2]** ([SPEC-006](specs/SPEC-006-universal-machine-search.md))
|
||||||
- [ ] **Managed-agent installer builder ("Build Installer")** — P2 — dashboard wizard to build a pre-labeled persistent-agent installer (Name/Company/Site/Department/Device Type/Tag/Type) with Download / Copy URL / Send Link, reusing the existing embed-config download path; adds department + device_type to EmbeddedConfig/AgentStatus so labels persist at install time. Pairs with revocable per-machine keys; signature-vs-appended-config is the key open question. **[→ v2 Phase 2]** ([SPEC-007](specs/SPEC-007-managed-agent-installer-builder.md))
|
- [ ] **Managed-agent installer builder ("Build Installer")** — P2 — dashboard wizard to build a pre-labeled persistent-agent installer (Name/Company/Site/Department/Device Type/Tag/Type) with Download / Copy URL / Send Link, reusing the existing embed-config download path; adds department + device_type to EmbeddedConfig/AgentStatus so labels persist at install time. Pairs with revocable per-machine keys; the signature-vs-appended-config question is resolved by SPEC-016 (sign-once base + per-site signed wrapper, no PE append). **[→ v2 Phase 2]** ([SPEC-007](specs/SPEC-007-managed-agent-installer-builder.md))
|
||||||
- [ ] **Valuable error messages (structured errors + no silent swallows)** — P2 — one structured API error envelope with stable codes + a correlation id that also lands in the logs; contextual tracing on server/agent; sweep the 37 `let _ =` swallows (the pattern that hid the migration-005 bug); dashboard surfaces the real cause + id instead of a generic line. **[→ v2 Phase 0/1 conventions]** ([SPEC-008](specs/SPEC-008-valuable-error-messages.md))
|
- [ ] **Valuable error messages (structured errors + no silent swallows)** — P2 — one structured API error envelope with stable codes + a correlation id that also lands in the logs; contextual tracing on server/agent; sweep the 37 `let _ =` swallows (the pattern that hid the migration-005 bug); dashboard surfaces the real cause + id instead of a generic line. **[→ v2 Phase 0/1 conventions]** ([SPEC-008](specs/SPEC-008-valuable-error-messages.md))
|
||||||
- [ ] **Feature-rich, fully-documented management API** — P2 — everything the console can do, callable by API: OpenAPI 3.x generated from code (utoipa) + browsable docs at `/api/docs`, long-lived revocable scoped API tokens (PAT-style, distinct from the 24h JWT + agent keys), an API-completeness gap audit, and consistent pagination/error conventions. Distinct from the ADR-001 RMM integration contract. **[→ v2 Phase 3]** ([SPEC-009](specs/SPEC-009-feature-rich-documented-api.md))
|
- [ ] **Feature-rich, fully-documented management API** — P2 — everything the console can do, callable by API: OpenAPI 3.x generated from code (utoipa) + browsable docs at `/api/docs`, long-lived revocable scoped API tokens (PAT-style, distinct from the 24h JWT + agent keys), an API-completeness gap audit, and consistent pagination/error conventions. Distinct from the ADR-001 RMM integration contract. **[→ v2 Phase 3]** ([SPEC-009](specs/SPEC-009-feature-rich-documented-api.md))
|
||||||
- [ ] **Branding and white-label configuration** — P2 — Allow MSPs to customize logo, colors, and product name for white-labeled remote support. Dashboard admin settings page with logo upload (PNG/SVG, max 2MB), brand hue slider (OKLCH 0-360°, default 184=cyan), product name override, company name, and favicon. Agent tray tooltip uses custom product name from registry. Singleton database table with public GET endpoint for unauthenticated rendering. CSS variables (`--brand-hue`, `--accent`, `--panel`) for dynamic theming. **[→ v2 Phase 2]** ([SPEC-014](specs/SPEC-014-branding-whitelabel.md))
|
- [ ] **Branding and white-label configuration** — P2 — Allow MSPs to customize logo, colors, and product name for white-labeled remote support. Dashboard admin settings page with logo upload (PNG/SVG, max 2MB), brand hue slider (OKLCH 0-360°, default 184=cyan), product name override, company name, and favicon. Agent tray tooltip uses custom product name from registry. Singleton database table with public GET endpoint for unauthenticated rendering. CSS variables (`--brand-hue`, `--accent`, `--panel`) for dynamic theming. **[→ v2 Phase 2]** ([SPEC-014](specs/SPEC-014-branding-whitelabel.md))
|
||||||
|
- [ ] **End-user (sub-user) remote access** — P2 (may be P3) — let a client pay for their employees to reach their *own* machines from home: a deny-by-default `end_user` login role, a locked-down end-user portal listing only granted machines, and Connect reusing the existing session-scoped viewer-token + relay path. Grant primitive already exists (`user_client_access`, migration 002); directory sync (AD/Entra/Google) is a separate future spec. **[→ new capability, post v2-console]** ([SPEC-017](specs/SPEC-017-end-user-remote-access.md))
|
||||||
- [ ] Programmatic session pre-create + viewer-token (integration contract) — P2
|
- [ ] Programmatic session pre-create + viewer-token (integration contract) — P2
|
||||||
|
|
||||||
## Security & Infrastructure
|
## Security & Infrastructure
|
||||||
|
|||||||
244
docs/specs/SPEC-016-zero-touch-enrollment.md
Normal file
244
docs/specs/SPEC-016-zero-touch-enrollment.md
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
# SPEC-016: Zero-Touch Per-Site Agent Enrollment
|
||||||
|
|
||||||
|
**Status:** Proposed
|
||||||
|
**Priority:** P1
|
||||||
|
**Requested By:** Mike (2026-06-02)
|
||||||
|
**Estimated Effort:** X-Large
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Give GuruConnect a ScreenConnect-class managed-agent enrollment flow: a technician runs
|
||||||
|
**one signed installer per site** on every machine at that site — no per-machine key
|
||||||
|
minting, no flags, no typing — and each machine **self-registers** on first run, the
|
||||||
|
server minting it a per-machine `cak_` key bound to a stable, machine-derived
|
||||||
|
`machine_uid`. Each site installer carries a **rotatable per-site enrollment key** (a long
|
||||||
|
server-generated secret) plus a short human-readable **fingerprint** (`vN (XXXX)`) so an
|
||||||
|
operator can tell at a glance whether an installer is current. Rotating a site's key blocks
|
||||||
|
*new* enrollments from old installers while leaving already-enrolled machines untouched
|
||||||
|
(they hold their own `cak_`).
|
||||||
|
|
||||||
|
This is the missing piece that turns the v2 secure-session-core (SPEC-004 per-agent keys +
|
||||||
|
`machine_uid`) into a real product workflow, and it **resolves SPEC-007's open
|
||||||
|
signature-vs-appended-config question**: the agent binary is signed **once** in CI
|
||||||
|
(already shipped via `release.yml`), and per-site customization rides in a thin **signed
|
||||||
|
wrapper** that writes site config to the endpoint at install time — never appended into the
|
||||||
|
signed PE.
|
||||||
|
|
||||||
|
**Success criteria:**
|
||||||
|
1. A tech installs one site installer on N machines; all N appear in the console under the
|
||||||
|
correct company/site, each as a distinct, deduplicated machine — zero per-machine setup.
|
||||||
|
2. Re-installing / re-imaging the same hardware **reuses** the existing machine row (no
|
||||||
|
ghost duplicates — the failure mode SPEC-004 documents).
|
||||||
|
3. Rotating a site's enrollment key makes old installers unable to enroll new machines,
|
||||||
|
while every already-enrolled agent keeps working.
|
||||||
|
4. Every distributed installer is **validly Authenticode-signed** (SmartScreen/WDAC clean).
|
||||||
|
|
||||||
|
## Background — what exists today (confirmed in code)
|
||||||
|
|
||||||
|
- **Embedded config is append-based and breaks signing.** `server/src/api/downloads.rs`
|
||||||
|
(`download_agent`, ~`:152`) reads `static/downloads/guruconnect.exe` and **appends**
|
||||||
|
`MAGIC_MARKER` + `len:u32` + JSON (`:196`) to the end of the PE. The agent reads it back
|
||||||
|
in `agent/src/config.rs` (`read_embedded_config`, `:223`). Appending bytes after a signed
|
||||||
|
PE invalidates the Authenticode signature — so the current customization path and the
|
||||||
|
newly-shipped CI signing are mutually exclusive.
|
||||||
|
- **No self-registration exists.** Per-agent `cak_` keys are minted **admin-only** in
|
||||||
|
`server/src/api/machine_keys.rs` (`create_key`, `:119`; "Admin issued a per-agent key",
|
||||||
|
`:146`). There is no endpoint where an agent first-run exchanges an enrollment credential
|
||||||
|
for its own key.
|
||||||
|
- **Relay already accepts per-agent keys.** `server/src/relay/mod.rs`
|
||||||
|
(`validate_agent_api_key`, `:417`) calls `crate::auth::agent_keys::verify_agent_key`
|
||||||
|
(`:422`) — the `cak_` path — then falls back to the **deprecated** shared `AGENT_API_KEY`
|
||||||
|
(`:444`, logs a "migrate to per-agent `cak_`" warning).
|
||||||
|
- **Key primitives exist.** `server/src/auth/agent_keys.rs`: `generate_agent_key` mints a
|
||||||
|
`cak_`-prefixed high-entropy key (`:36`/`:46`); `verify_agent_key` (`:71`).
|
||||||
|
`server/src/db/agent_keys.rs` already inserts into `connect_agent_keys (machine_id,
|
||||||
|
key_hash, tenant_id)` (`:47`) — the v2 tenancy column is present (migration
|
||||||
|
`004_v2_secure_session_core.sql`).
|
||||||
|
- **Identity is a random config UUID, not machine-derived** — the root cause of duplicates
|
||||||
|
per SPEC-004 (`agent/src/config.rs` `generate_agent_id`, `:90`).
|
||||||
|
- **Agent mode dispatch:** `agent/src/main.rs` `Commands::Install` (`:160`) → `run_install`;
|
||||||
|
`agent/src/config.rs` `detect_run_mode` (`:162`) returns `RunMode::PermanentAgent` when
|
||||||
|
embedded config is present.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
### Included in v1 (CORE)
|
||||||
|
|
||||||
|
1. **`machine_uid` — deterministic machine identity (hardware-salted, per-tenant).** Derive
|
||||||
|
a stable id from the Windows `MachineGuid`
|
||||||
|
(`HKLM\SOFTWARE\Microsoft\Cryptography\MachineGuid`) **salted with stable hardware
|
||||||
|
signals** (SMBIOS UUID / motherboard + disk serial), independent of the config-file
|
||||||
|
`agent_id`. Hardware-derived salt is deliberate: it **survives an OS reinstall/re-image
|
||||||
|
on the same hardware** (so the row is reused — the re-image dedup goal) while keeping
|
||||||
|
distinct physical boxes distinct (a per-install *random* salt would break re-image dedup
|
||||||
|
and is rejected). Uniqueness is scoped **per-tenant** — dedup key `(tenant_id,
|
||||||
|
machine_uid)` — so the same hardware legitimately present in two tenants stays two
|
||||||
|
independent rows. (Shared root with SPEC-004; whichever lands first owns the impl, the
|
||||||
|
other consumes it.) Used as the dedup key for register/move.
|
||||||
|
|
||||||
|
**Collision-gated activation.** The residual collision case is VMs/templates that share a
|
||||||
|
hardware UUID (some hypervisors clone the SMBIOS UUID). When the server detects a
|
||||||
|
`machine_uid` collision (a seemingly-different endpoint resolving to an existing uid), the
|
||||||
|
endpoint does **not** auto-activate: it drops to a **pending** state, fires an alert, and
|
||||||
|
an operator must confirm in the dashboard that the collided endpoint may activate. This is
|
||||||
|
the one deliberate exception to auto-approve (see item 6).
|
||||||
|
|
||||||
|
2. **Per-site enrollment key + fingerprint.**
|
||||||
|
- Long (≥256-bit) server-generated secret per site, stored **hashed** (Argon2id, same
|
||||||
|
as `cak_`/passwords), never recoverable in plaintext after issue.
|
||||||
|
- A non-secret **fingerprint** = monotonic version + short derived code in **hex**,
|
||||||
|
rendered `vN (XXXX)` (e.g. `v3 (7F2A)`), shown in the dashboard, baked into the
|
||||||
|
installer filename, and reported by the agent at enrollment. Hex is deliberate —
|
||||||
|
**not** the RMM word-style code (`GREEN-FALCON`) — so GuruConnect and GuruRMM
|
||||||
|
artifacts are never visually conflated.
|
||||||
|
- **Rotate** regenerates the secret and bumps the version; old installers are rejected
|
||||||
|
for *new* enrollments; existing agents (holding `cak_`) are unaffected.
|
||||||
|
|
||||||
|
3. **Self-registration endpoint.** New `POST /api/enroll` (public, no JWT required —
|
||||||
|
gated by the enrollment key) accepting `{ site_code, enrollment_key, machine_uid,
|
||||||
|
hostname, labels{company,site,department,device_type,tags} }`:
|
||||||
|
- Verify `(site_code, enrollment_key)` against the current per-site key.
|
||||||
|
- **Dedup by `machine_uid`** within the tenant (key `(tenant_id, machine_uid)`): if the machine exists, reuse the row and
|
||||||
|
rotate its `cak_`; else create the machine row.
|
||||||
|
- Mint a `cak_` (reuse `generate_agent_key`), store hashed via `db::agent_keys` bound to
|
||||||
|
`machine_id` (+ `tenant_id` from the site), return the plaintext `cak_` **once**.
|
||||||
|
- Emit an audit event + **new-enrollment alert** (and a **site-move** alert when an
|
||||||
|
existing `machine_uid` enrolls under a different site).
|
||||||
|
- **Rate-limit + lockout** per `(site_code, source-IP)` as defense-in-depth (the key is
|
||||||
|
long, so this is belt-and-suspenders, not load-bearing).
|
||||||
|
|
||||||
|
4. **Agent first-run enrollment.** On `RunMode::PermanentAgent` with no stored `cak_`:
|
||||||
|
read site config → call `/api/enroll` with `machine_uid` → persist the returned `cak_`
|
||||||
|
to a SYSTEM-only protected store (HKLM under a SYSTEM-only ACL, or DPAPI-machine) →
|
||||||
|
connect to `wss://connect.azcomputerguru.com/ws/agent` using the `cak_`. On subsequent
|
||||||
|
runs, use the stored `cak_` directly (no re-enroll).
|
||||||
|
|
||||||
|
5. **Sign-once base + per-site signed wrapper (resolves SPEC-007 open question).**
|
||||||
|
- The base agent is signed once in CI (`release.yml`, already shipped) and stays
|
||||||
|
byte-identical for everyone.
|
||||||
|
- Per-site customization (labels + enrollment key + fingerprint) is delivered to the
|
||||||
|
endpoint **at install time** via a signing-safe channel — NOT appended to the signed
|
||||||
|
PE. **v1 produces BOTH a signed bootstrapper `.exe` and a signed MSI per site**
|
||||||
|
(ScreenConnect parity — manual installs grab the `.exe`, GPO/Intune fleet pushes take
|
||||||
|
the MSI), both wrapping the same sign-once agent and writing the site config to the
|
||||||
|
protected config location. The two differ only in packaging (bootstrapper stub vs.
|
||||||
|
WiX-authored MSI package); both are signed.
|
||||||
|
- **Deprecate the append path** in `downloads.rs` for managed installs (the
|
||||||
|
attended/support-code path is filename-based and unaffected), eliminating the signature-invalidation defect.
|
||||||
|
|
||||||
|
6. **Auto-approve posture (with collision-gate exception).** A self-registered machine is
|
||||||
|
live and controllable immediately (ScreenConnect parity); the new-enrollment alert is the
|
||||||
|
tripwire. The **one** exception is a detected `machine_uid` collision (item 1), which
|
||||||
|
gates the endpoint to **pending** until an operator confirms it in the dashboard.
|
||||||
|
|
||||||
|
### Explicitly out of scope (ANTICIPATED — reserve room, do NOT build in v1)
|
||||||
|
|
||||||
|
The v1 data model and agent mode-dispatch must leave room for these without building them:
|
||||||
|
|
||||||
|
- **Per-site enrollment POLICY** — a `sites.enrollment_policy` field (default
|
||||||
|
`auto-approve`; future `pending-approval`) plus per-seat/per-endpoint licensing controls.
|
||||||
|
Commercial, multi-tenant (the `tenant_id` column already exists). Its own future SPEC.
|
||||||
|
- **Flag overrides** — `--enroll-key` / `--site-code` (generic installer, key supplied on
|
||||||
|
the command line) and `--reassign` (move an existing machine to a new site, gated by
|
||||||
|
possession of the destination site's key, with an **explicit accidental-move guard**:
|
||||||
|
a different-site re-run refuses unless `--reassign` is passed) + cross-client move policy.
|
||||||
|
Backend (`machine_uid` + authorized site + `cak_`) is designed to support it; CLI surface
|
||||||
|
is deferred.
|
||||||
|
- **Technician-assisted interactive install** — `--technician` on a generic installer:
|
||||||
|
prompts for the tech's own server credentials, and on auth presents a **validated**
|
||||||
|
Company/Site/tags picker from the live authorized list (authz-by-identity, full audit
|
||||||
|
trail). Heaviest path (interactive UI + auth/list callback); deferred.
|
||||||
|
|
||||||
|
All three converge on the **same backend operation** delivered in v1: `machine_uid` +
|
||||||
|
authorized site + issued `cak_`. v1 only ships the per-site-embedded-key door.
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
- **Agent** (`agent/`): compute `machine_uid`; first-run enroll → store `cak_`; use stored
|
||||||
|
`cak_` thereafter; read site config from the wrapper-written location instead of an
|
||||||
|
appended PE blob. Touches `config.rs` (`EmbeddedConfig`/`detect_run_mode`/storage),
|
||||||
|
`main.rs` (`Install`/run-mode), a new `enroll` client module, transport auth.
|
||||||
|
- **Relay-server** (`server/`): new `POST /api/enroll`; per-site key issue/rotate/verify;
|
||||||
|
`machine_uid` dedup + site-move on register; audit + alert emission; rate-limit/lockout.
|
||||||
|
Touches `api/` (new `enroll.rs`, `sites` key endpoints), `auth/agent_keys.rs`,
|
||||||
|
`db/agent_keys.rs`, `relay/mod.rs` (enrollment vs. connect), `main.rs` routes.
|
||||||
|
- **Dashboard**: per-site enrollment-key display (fingerprint `vN (XXXX)`), **Rotate**
|
||||||
|
action, "current installer" download wired to the signed wrapper build. (Builder UI is
|
||||||
|
SPEC-007; this spec supplies the key/fingerprint/rotation it consumes.)
|
||||||
|
- **DB migration:** `site_enrollment_keys` (or columns on the site): `site_id`,
|
||||||
|
`key_hash`, `version`, `fingerprint`, `created_at`, `rotated_at`, `active`. Reserve
|
||||||
|
`sites.enrollment_policy` (nullable, default `auto-approve`) for the anticipated policy
|
||||||
|
work. `connect_machines` gains `machine_uid` (unique per tenant: `(tenant_id, machine_uid)`).
|
||||||
|
- **Protobuf** (`proto/guruconnect.proto`): no wire change required for enrollment if
|
||||||
|
`/api/enroll` is REST; `AgentStatus` label fields per SPEC-007 (`department`,
|
||||||
|
`device_type`) ride along if landed together.
|
||||||
|
|
||||||
|
## Security considerations
|
||||||
|
|
||||||
|
- **Two-tier credential model:** low-sensitivity **enrollment key** (gates "may register",
|
||||||
|
shared per site, rotatable) vs. high-sensitivity **per-machine `cak_`** (operating
|
||||||
|
credential, per-machine revocation). Compromise of an enrollment key is recovered by
|
||||||
|
rotating one site — no fleet-wide re-key.
|
||||||
|
- **Enrollment keys stored hashed** (Argon2id); plaintext shown once at issue/rotate.
|
||||||
|
- **`cak_` at rest on the endpoint** is stored as a **DPAPI-machine-encrypted blob inside a
|
||||||
|
SYSTEM-ACL'd location** (HKLM value or `ProgramData` file) — both layers: the SYSTEM ACL
|
||||||
|
stops non-admin users reading it, and DPAPI-machine encryption makes a copied file/export
|
||||||
|
inert off the box. (Local admin/SYSTEM can always recover it; that is accepted — blast
|
||||||
|
radius of one leaked `cak_` is a single, independently-revocable machine.)
|
||||||
|
- **`machine_uid` binding** is the spoof-guard SPEC-004 wants: a `cak_` is bound to a
|
||||||
|
`machine_uid`; a different box presenting another box's `cak_` is detectable.
|
||||||
|
- **Authorization model** for moves/enrolls is possession-of-destination-key in v1
|
||||||
|
(identity-based authz deferred to the technician-assisted path).
|
||||||
|
- **Open registration risk** is mitigated by requiring `(site_code + long key)` and
|
||||||
|
rate-limit/lockout; auto-approve is acceptable because the enrollment key is the gate and
|
||||||
|
every enrollment/site-move fires an alert.
|
||||||
|
- **Audit events:** enroll, re-enroll/reuse, site-move, key-rotate — all logged with
|
||||||
|
`machine_uid`, site, and source IP.
|
||||||
|
|
||||||
|
## Testing strategy
|
||||||
|
|
||||||
|
- **Unit:** `machine_uid` derivation stability; enrollment-key verify/rotate; fingerprint
|
||||||
|
derivation; `cak_` mint/hash/verify; dedup decision (new vs. reuse vs. move).
|
||||||
|
- **Integration:** enroll new → row + `cak_` issued; re-enroll same `machine_uid` → reuse,
|
||||||
|
no duplicate; enroll with rotated (old) key → rejected; old `cak_` still connects after
|
||||||
|
rotation; rate-limit/lockout trips; site-move emits alert.
|
||||||
|
- **Manual:** build a site wrapper installer → run on a clean VM → appears in console under
|
||||||
|
correct site, immediately controllable; re-image VM → same row reused; `signtool verify
|
||||||
|
/pa` passes on the distributed wrapper and the laid-down agent.
|
||||||
|
|
||||||
|
## Effort estimate & dependencies
|
||||||
|
|
||||||
|
- **Size:** X-Large (agent + relay + DB migration + CI build/sign wrapper + dashboard
|
||||||
|
key/rotation surface).
|
||||||
|
- **Depends on:** SPEC-004 `machine_uid` (shared root); the CI signing already shipped
|
||||||
|
(SPEC-001 §2 / `release.yml`).
|
||||||
|
- **Unblocks:** SPEC-007 (installer builder gets a real per-site key + the signing
|
||||||
|
resolution), and the parked managed-agent test deployment on the internal beta machines.
|
||||||
|
- **Relationship to v2 phases:** sits with the Phase-1 secure-session-core (per-agent keys
|
||||||
|
+ identity) and feeds Phase-2 dashboard work.
|
||||||
|
|
||||||
|
## Resolved decisions (2026-06-02, Mike)
|
||||||
|
|
||||||
|
1. **Wrapper shape — BOTH.** v1 ships a signed bootstrapper `.exe` *and* a signed MSI per
|
||||||
|
site (ScreenConnect offers both; manual installs use the `.exe`, GPO/Intune fleet pushes
|
||||||
|
use the MSI). Same sign-once agent inside each.
|
||||||
|
2. **`cak_` storage — BOTH layers.** DPAPI-machine-encrypted blob stored in a SYSTEM-ACL'd
|
||||||
|
location. Non-admins can't read it; a stolen copy is inert off the box.
|
||||||
|
3. **Fingerprint — hex (`7F2A`).** Deliberately *not* the RMM word-code style, so the two
|
||||||
|
products' artifacts are never visually conflated.
|
||||||
|
4. **`machine_uid` — per-tenant scope, hardware-derived salt, collision-gated.** Dedup key
|
||||||
|
`(tenant_id, machine_uid)`; salt from stable hardware signals (survives same-hardware
|
||||||
|
re-image, separates distinct boxes); detected collisions (e.g. template-cloned VMs
|
||||||
|
sharing a hardware UUID) drop to pending + alert and require dashboard confirmation to
|
||||||
|
activate.
|
||||||
|
5. **Attended (support-code) path — unchanged.** `download_support` is filename-based
|
||||||
|
(`GuruConnect-<code>.exe`), not append-based, so renaming never breaks the signature —
|
||||||
|
it is already signing-safe. Only the managed `download_agent` append path is retired.
|
||||||
|
|
||||||
|
## Remaining for planning
|
||||||
|
|
||||||
|
- Exact stable-hardware signal set for the salt (SMBIOS UUID alone vs. + motherboard/disk
|
||||||
|
serial) and hypervisor behavior matrix (which hypervisors duplicate the SMBIOS UUID on
|
||||||
|
clone → exercise the collision-gate).
|
||||||
|
- MSI authoring approach (WiX) and whether per-site config rides as a per-site MSI vs. a
|
||||||
|
base MSI + property/transform.
|
||||||
180
docs/specs/SPEC-017-end-user-remote-access.md
Normal file
180
docs/specs/SPEC-017-end-user-remote-access.md
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
# SPEC-017: End-User (Sub-User) Remote Access
|
||||||
|
|
||||||
|
**Status:** Proposed
|
||||||
|
**Priority:** P2 (may settle to P3 depending on client demand)
|
||||||
|
**Requested By:** Mike (2026-06-02)
|
||||||
|
**Estimated Effort:** Large
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Let a client pay for their own employees to remotely reach **their own work machines** from home
|
||||||
|
through GuruConnect — the Splashtop-Business / unattended-end-user-access model, layered on top of the
|
||||||
|
MSP-technician console GuruConnect ships today. An MSP admin (or, later, a delegated client-company
|
||||||
|
admin) provisions a list of **end-users** and grants each one access to specific managed machines. The
|
||||||
|
end-user signs into a locked-down **end-user portal**, sees only the machines granted to them, and
|
||||||
|
connects — reusing the existing persistent-agent + session-scoped-viewer-token + relay path.
|
||||||
|
|
||||||
|
Success criteria: an `end_user`-role account can log in at a separate portal, see exactly the machines
|
||||||
|
in its grant set (and no others, across no other tenant), launch a control session to an online granted
|
||||||
|
machine, and is hard-denied from every technician/admin API, the agent plane, and any machine it was
|
||||||
|
not granted — with each login and machine access written to the audit log.
|
||||||
|
|
||||||
|
This is a net-new **sellable capability**, not a console-MVP blocker. It is sequenced after the v2
|
||||||
|
console foundations it depends on (tenancy, machine identity, persistent enrollment), which is why it is
|
||||||
|
P2 rather than P1.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
### Included in v1
|
||||||
|
- A new **`end_user`** value for `users.role`, provisioned by an MSP admin, with **deny-by-default**
|
||||||
|
authority: no console permissions, no agent-plane access, machine reach limited strictly to its
|
||||||
|
`user_client_access` grant set within its own tenant.
|
||||||
|
- A **separate end-user login + portal** route (locked-down): lists only granted machines with
|
||||||
|
online/offline state and a Connect action. No admin nav, no other users/machines/companies.
|
||||||
|
- **Admin UI + API** to create/disable end-users and assign/revoke per-machine grants, reusing the
|
||||||
|
existing `user_client_access` table.
|
||||||
|
- **Connect flow** that reuses the landed session-scoped viewer-token mechanism (`ViewerClaims`,
|
||||||
|
`jwt.rs:114`) and the relay enforcement path — no new transport.
|
||||||
|
- A new `connect_sessions.source` value **`end_user`** (migration widening the existing CHECK).
|
||||||
|
- **Audit**: end-user login success/failure and each machine-access grant-check written to
|
||||||
|
`connect_session_events`.
|
||||||
|
- Rate limiting + lockout on the public end-user login.
|
||||||
|
|
||||||
|
### Explicitly out of scope (v1)
|
||||||
|
- **Directory sync (AD / Entra-365 / Google) → end-user list** — its own future spec; v1 is manual
|
||||||
|
list management only.
|
||||||
|
- **Self-service seat purchasing / billing automation.** v1 records/counts seats per tenant; real
|
||||||
|
metering and Syncro/billing wiring is deferred.
|
||||||
|
- **Delegated client-company-admin role** (a client managing its own end-users/grants) — noted as a
|
||||||
|
fast-follow; v1 grants are MSP-admin-managed.
|
||||||
|
- Per-session view-only-vs-control *policy* per end-user (v1 = Control of one's own machine; the
|
||||||
|
`ViewerAccess` split still exists at the token layer).
|
||||||
|
- File transfer, session recording (already out of scope for the broader product v1).
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
### Principal model — `end_user` is a constrained variant of the login plane
|
||||||
|
GuruConnect already has three credential planes that must stay separate (audit-hardened in v2 Phase 1):
|
||||||
|
1. **Login `Claims`** (`jwt.rs:11`) — dashboard users; `role ∈ {admin, operator, viewer}` today.
|
||||||
|
2. **Session-scoped `ViewerClaims`** (`jwt.rs:114`) — 5-min, one session, `purpose=viewer`.
|
||||||
|
3. **Agent `cak_` keys** (`connect_agent_keys`, migration 004) — agents only.
|
||||||
|
|
||||||
|
`end_user` is added as a **fourth role on the login plane** — it issues a normal login JWT
|
||||||
|
(`create_token`, `jwt.rs:161`) carrying `role: "end_user"` and an **empty permission list**. The
|
||||||
|
separation guarantees the v2 audit established are preserved: an `end_user` JWT still cannot be used as
|
||||||
|
a viewer token (lacks `purpose`) or as an agent key (agent plane rejects user JWTs).
|
||||||
|
|
||||||
|
**Critical authz inversion:** `user_client_access` today documents "no entries = access to all (for
|
||||||
|
admins)" (migration 002, lines 25-26). The grant check **must branch on role** — for `end_user`, an
|
||||||
|
empty grant set means **zero** machines, never all. Authz is deny-by-default and grant-scoped; the
|
||||||
|
admin-bypass in `Claims::has_permission` (`jwt.rs:28-33`) must never fire for `end_user`.
|
||||||
|
|
||||||
|
### Agent / Relay-server / Viewer / Dashboard responsibilities
|
||||||
|
- **Agent:** no changes. End-users connect to existing **persistent/unattended** managed agents
|
||||||
|
(consent `not_required` — it is the user's own machine). Optionally honors the SPEC-015 notification
|
||||||
|
overlay if a per-machine policy requires it.
|
||||||
|
- **Relay-server:** no transport change. New end-user auth + portal + connect endpoints; the
|
||||||
|
grant-check + viewer-token mint is the only new server logic on the hot path.
|
||||||
|
- **Viewer:** reuse the React/TS web viewer (`dashboard/src/components/RemoteViewer.tsx`) — the
|
||||||
|
end-user portal embeds the same component with a Control-mode viewer token.
|
||||||
|
- **Dashboard:** new **role-gated end-user portal** route (recommended separate from the technician
|
||||||
|
console — see Open Questions), plus admin screens for end-user + grant management.
|
||||||
|
|
||||||
|
### Database (migrations)
|
||||||
|
- **`user_client_access`** — reused as the grant table; no schema change (already
|
||||||
|
`user_id UUID × client_id UUID → connect_machines(id)`, unique pair, migration 002).
|
||||||
|
- New migration `011_end_user_access.sql`:
|
||||||
|
- Widen `connect_sessions.source` CHECK to `('standalone','gururmm','end_user')` (currently
|
||||||
|
`('standalone','gururmm')`, migration 004, lines 99-102).
|
||||||
|
- Optional `users` columns for the external principal: `mfa_secret TEXT NULL`,
|
||||||
|
`must_change_password BOOLEAN NOT NULL DEFAULT false`, and a partial index for fast
|
||||||
|
`role='end_user'` listing per `tenant_id`.
|
||||||
|
- (Seat tracking, if landed in v1: a lightweight per-tenant `end_user` count view or a
|
||||||
|
`tenant_seats` row — kept minimal.)
|
||||||
|
- Grants are tenant-contained: insert path validates `machine.tenant_id == end_user.tenant_id`.
|
||||||
|
|
||||||
|
### API endpoints / WS messages
|
||||||
|
- `POST /api/enduser/auth/login` — public, rate-limited; returns an `end_user` login JWT.
|
||||||
|
- `GET /api/enduser/machines` — lists only the caller's granted, in-tenant machines + presence.
|
||||||
|
- `POST /api/enduser/machines/:id/connect` — grant-checked; creates a `source=end_user` session and
|
||||||
|
mints a Control `ViewerClaims` token (`create_viewer_token`, `jwt.rs:233`) for that session.
|
||||||
|
- Admin: `POST /api/users` (role=end_user), `POST /api/users/:id/grants`,
|
||||||
|
`DELETE /api/users/:id/grants/:machine_id`, `GET /api/users?role=end_user`.
|
||||||
|
- No new protobuf messages — the WS viewer path and `guruconnect.proto` are unchanged.
|
||||||
|
|
||||||
|
## Implementation details
|
||||||
|
- `server/src/auth/jwt.rs` — extend the role vocabulary doc (`Claims.role`, lines 16-17); add an
|
||||||
|
`is_end_user()` helper and ensure `has_permission` cannot grant `end_user` anything beyond explicit
|
||||||
|
permissions (the admin short-circuit at line 30 must be guarded).
|
||||||
|
- `server/src/auth/mod.rs` — `AuthenticatedUser` (line 29+) gains role-aware helpers; add an extractor
|
||||||
|
/ middleware that rejects non-`end_user` on the `/api/enduser/*` namespace and rejects `end_user` on
|
||||||
|
every console/admin route (deny-by-default allowlist).
|
||||||
|
- `server/src/api/` — new `enduser` handler module (login, machines, connect); admin user+grant
|
||||||
|
handlers extended for `role=end_user` and `user_client_access` writes.
|
||||||
|
- Grant check (shared fn): `machine_id ∈ user_client_access[user] AND machine.tenant_id == user.tenant_id`;
|
||||||
|
used by both `GET /machines` and `connect`.
|
||||||
|
- Session create stamps `source='end_user'`, `is_managed=true`/unattended, `consent_state='not_required'`,
|
||||||
|
then mints the viewer token via the existing path so relay enforcement is unchanged.
|
||||||
|
- `dashboard/src/` — end-user portal route (role-gated), reusing `RemoteViewer.tsx`; admin grant-matrix
|
||||||
|
UI. White-label (SPEC-014) applies to the portal as the most client-facing surface.
|
||||||
|
- Migration `server/migrations/011_end_user_access.sql` as above (idempotent; applied by
|
||||||
|
`sqlx::migrate!` per the migration standard).
|
||||||
|
|
||||||
|
## Security considerations
|
||||||
|
- **Preserve the plane separation** audited in v2 Phase 1 — `end_user` is login-plane only; it can
|
||||||
|
never satisfy `validate_viewer_token` or the agent `cak_` path.
|
||||||
|
- **Deny-by-default, grant-scoped:** empty `user_client_access` for an `end_user` = no access; the
|
||||||
|
admin-bypass must not apply. Every `/api/enduser/*` call re-checks the grant + tenant server-side
|
||||||
|
(never trust a machine id from the client).
|
||||||
|
- **Tenant containment:** an `end_user` and its grants live in one tenant; cross-tenant grants are
|
||||||
|
rejected at write and re-validated at connect. (Full tenant isolation lands with Phase 4; v1 enforces
|
||||||
|
via explicit `tenant_id` equality checks.)
|
||||||
|
- **External-user trust:** these accounts are public-internet-facing from home. Require
|
||||||
|
rate-limiting + lockout on `/api/enduser/auth/login`; support (and preferably require) **TOTP MFA** for
|
||||||
|
`end_user` — schema column included so MFA can be v1 or an immediate fast-follow without a second
|
||||||
|
migration. Argon2id passwords (existing standard).
|
||||||
|
- **Audit:** log each end-user login (success/failure, source IP) and each machine access to
|
||||||
|
`connect_session_events`; the unattended access is to the user's *own* machine but must be fully
|
||||||
|
traceable. Optionally enforce the SPEC-015 overlay per machine policy.
|
||||||
|
- **Threat model:** stolen end-user creds reach only that user's granted machines (blast radius =
|
||||||
|
grant set), never the console, never the agent plane, never another tenant. Disabling the account
|
||||||
|
(`users.enabled=false`) immediately revokes portal + future tokens; the 5-min viewer-token TTL bounds
|
||||||
|
any in-flight session.
|
||||||
|
|
||||||
|
## Testing strategy
|
||||||
|
- **Unit:** grant-check fn (granted / not-granted / cross-tenant / empty-set-for-end_user = deny);
|
||||||
|
`has_permission` never elevates `end_user`; role-namespace middleware (end_user→console = 403,
|
||||||
|
technician→/api/enduser = 403).
|
||||||
|
- **Integration:** end-user login → list shows only granted machines → connect mints a Control viewer
|
||||||
|
token for a `source=end_user` session → relay admits; connect to a non-granted / other-tenant machine
|
||||||
|
→ 403; disabled account → login + token use rejected.
|
||||||
|
- **Manual:** full portal walkthrough from an off-network browser; MFA enrol + challenge; audit rows
|
||||||
|
present for login and access; white-label branding renders on the portal.
|
||||||
|
|
||||||
|
## Effort estimate & dependencies
|
||||||
|
- **Size:** Large (new principal + portal + admin grant UI + auth namespace; transport/agent untouched
|
||||||
|
and the grant table already exists, which holds it below X-Large).
|
||||||
|
- **Depends on (must precede / strongly preferred):**
|
||||||
|
- **Tenancy** (`tenants` + `tenant_id`, migration 004) — needed for containment; full isolation is
|
||||||
|
Phase 4 but v1 uses explicit tenant checks.
|
||||||
|
- **Stable machine identity + persistent enrollment** (SPEC-004 / 008 `machine_uid`, SPEC-016
|
||||||
|
zero-touch `cak_`) — end-users reach persistent managed agents.
|
||||||
|
- **Session-scoped viewer tokens** (v2 Phase 1, landed) — reused directly.
|
||||||
|
- **Pairs with:** SPEC-014 (white-label — the portal is the client-facing surface), SPEC-003/005
|
||||||
|
(machine inventory/list — portal machine rows), SPEC-015 (optional connect-notification overlay).
|
||||||
|
- **Unblocks:** the directory-sync spec (AD/Entra/Google → end-user list), delegated client-admin role,
|
||||||
|
and per-seat billing — all of which build on the `end_user` principal defined here.
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
1. **Same console vs separate end-user portal?** Recommendation: **separate, role-gated route** —
|
||||||
|
smaller attack surface, no risk of leaking technician controls, cleaner white-label. Confirm before
|
||||||
|
build.
|
||||||
|
2. **End-users in the existing `users` table (role=end_user) vs a dedicated `end_users` table?**
|
||||||
|
Recommendation: reuse `users` (the grant FK `user_client_access.user_id` already points there) with
|
||||||
|
hard role guardrails. Revisit if mixing external + internal principals in one table proves risky.
|
||||||
|
3. **MFA in v1 or immediate fast-follow?** Schema is included either way; decide enforcement timing.
|
||||||
|
4. **Who administers grants in v1** — MSP admin only (assumed), or ship the delegated client-company
|
||||||
|
admin role together? (Affects scope/effort materially.)
|
||||||
|
5. **Seat/licensing enforcement depth for v1** — count-and-display vs hard-cap vs billing-integrated.
|
||||||
|
6. **Default access mode** — Control assumed (own machine); should an admin be able to pin a machine to
|
||||||
|
view-only for a given end-user? (Token layer already supports it.)
|
||||||
146
docs/specs/SPEC-018-managed-agent-service-host.md
Normal file
146
docs/specs/SPEC-018-managed-agent-service-host.md
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
# SPEC-018: Managed-Agent SYSTEM Service Host + Session Broker
|
||||||
|
|
||||||
|
**Status:** Proposed
|
||||||
|
**Priority:** P1 (blocks SPEC-016 Phase B end-to-end runtime and SPEC-013)
|
||||||
|
**Requested By:** Mike (2026-06-02)
|
||||||
|
**Estimated Effort:** X-Large
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
Convert the managed/persistent GuruConnect agent from a user-context `HKCU\…\Run` autostart into a
|
||||||
|
**Windows SYSTEM service** that runs unattended — at the login screen, with no user logged in, across
|
||||||
|
reboots — and **brokers per-session capture/input worker processes** into the active interactive
|
||||||
|
desktop. A SYSTEM service lives in the isolated **Session 0** and cannot capture or inject the
|
||||||
|
interactive desktop directly, so the service spawns a worker into the target user session (the
|
||||||
|
ScreenConnect architecture).
|
||||||
|
|
||||||
|
This is foundational, not cosmetic. It unblocks three things at once:
|
||||||
|
1. **SPEC-016 Phase B end-to-end runtime** — the per-machine `cak_` store is ACL'd to SYSTEM +
|
||||||
|
Administrators; today the agent runs as the interactive *user* and can't read its own store (the
|
||||||
|
Phase B C1 *fail-fast guard* exists precisely because of this). Running as SYSTEM makes the store
|
||||||
|
readable and removes the guard.
|
||||||
|
2. **True unattended access** — a user-context agent only runs while that user is logged in. Reaching
|
||||||
|
a rebooted server or a machine sitting at the login screen (table-stakes for remote support)
|
||||||
|
requires SYSTEM.
|
||||||
|
3. **SPEC-013 session selection / backstage** — the session-broker primitive built here is the
|
||||||
|
substrate SPEC-013's session-switching UX drives.
|
||||||
|
|
||||||
|
**Success criteria:** the managed agent installs as an auto-start SYSTEM service; it holds the relay
|
||||||
|
connection and performs SPEC-016 enrollment as SYSTEM (reading/writing the SYSTEM-ACL'd `cak_`); it
|
||||||
|
spawns a capture/input worker into the active interactive session and relays frames; the worker is
|
||||||
|
respawned/retargeted on logon/logoff/console-connect; and the Phase B fail-fast guard is removed
|
||||||
|
because the store is now readable in-context.
|
||||||
|
|
||||||
|
## Background — why this is needed (confirmed in code)
|
||||||
|
|
||||||
|
- The persistent agent autostarts via `HKCU\…\Run` (`agent/src/startup.rs:21`, `STARTUP_KEY` = HKCU)
|
||||||
|
→ interactive-user token, not SYSTEM. The only SYSTEM service today is the separate `sas_service`
|
||||||
|
(Secure Attention Sequence helper).
|
||||||
|
- SPEC-016 Phase B (`agent/src/credential_store.rs`) ACLs the `cak_` store to `*S-1-5-18` (SYSTEM) +
|
||||||
|
`*S-1-5-32-544` (Administrators). In the current user context the agent writes but cannot read it
|
||||||
|
back → the Phase B fail-fast guard (`agent/src/main.rs` `resolve_agent_credential`) emits
|
||||||
|
"must run as the GuruConnect SYSTEM service (see SPEC-018)" instead of bricking.
|
||||||
|
- Capture/input live in the agent process (`agent/src/capture/`, `agent/src/input/`); a Session-0
|
||||||
|
SYSTEM service cannot drive these against the interactive desktop without a per-session worker.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
### Included in v1
|
||||||
|
|
||||||
|
1. **Windows service install/lifecycle** (`agent/src/install.rs` + a new service module): register the
|
||||||
|
managed agent as a **LocalSystem auto-start service** (`CreateServiceW` / a service crate),
|
||||||
|
configure failure/recovery (restart on crash), and **replace the HKCU `Run` autostart for managed
|
||||||
|
mode** (remove the Run entry on service install). Clean uninstall (stop + delete service).
|
||||||
|
2. **Service control loop** (Session 0, SYSTEM): owns the persistent WSS connection to the relay,
|
||||||
|
performs SPEC-016 enrollment as SYSTEM (now able to read/write the `cak_` store), and dispatches
|
||||||
|
session/connect requests to workers. Handles `SERVICE_CONTROL_STOP`/`SHUTDOWN` and
|
||||||
|
`SERVICE_CONTROL_SESSIONCHANGE`.
|
||||||
|
3. **Session broker:** enumerate sessions (`WTSEnumerateSessionsW`), resolve the active interactive
|
||||||
|
session (`WTSGetActiveConsoleSessionId`), obtain its user token (`WTSQueryUserToken` →
|
||||||
|
`DuplicateTokenEx`), and spawn a **per-session capture/input worker** into that session's desktop
|
||||||
|
(`CreateProcessAsUserW`, `winsta0\default`). The worker does DXGI capture + input injection in the
|
||||||
|
user's session; the service relays frames over the existing transport.
|
||||||
|
4. **Service ↔ worker IPC:** a local, ACL'd channel (named pipe `\\.\pipe\guruconnect-<sessionId>`)
|
||||||
|
carrying frames/input/control; pipe ACL restricted to SYSTEM + the target session user.
|
||||||
|
5. **Session-change handling:** on logon/logoff/console-connect/disconnect/lock/unlock, (re)spawn or
|
||||||
|
retarget the worker so the active desktop is always the one being served.
|
||||||
|
6. **Remove the SPEC-016 Phase B fail-fast guard** once the service runs as SYSTEM (the store is
|
||||||
|
readable in-context); keep the SYSTEM+Administrators ACL.
|
||||||
|
|
||||||
|
### Explicitly out of scope (anticipated, separate specs)
|
||||||
|
|
||||||
|
- **Session-selection / backstage UX** — the operator-facing picker and Session-0/secure-desktop
|
||||||
|
command surface are **SPEC-013**; this spec only provides the broker primitive it drives.
|
||||||
|
- **Login-screen / secure-desktop (winlogon) capture** beyond the broker hook — the hard
|
||||||
|
Secure-Desktop case is coordinated with SPEC-013; v1 here targets the active interactive session.
|
||||||
|
- **macOS/Linux service equivalents** — future SPEC-010 (cross-platform agents).
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
- **Agent splits into two roles:**
|
||||||
|
- **service-host** (LocalSystem, Session 0): service lifecycle, relay transport, SPEC-016
|
||||||
|
enrollment + `cak_` store, session broker, IPC server.
|
||||||
|
- **session-worker** (per interactive session, user token): DXGI/GDI capture, input injection,
|
||||||
|
IPC client. Spawned by the service via `CreateProcessAsUserW`.
|
||||||
|
- **Service install** (`install.rs`): `CreateServiceW` with `SERVICE_AUTO_START`, `SERVICE_WIN32_OWN_PROCESS`,
|
||||||
|
recovery actions; uninstall stops + deletes. Replaces managed-mode `HKCU Run`.
|
||||||
|
- **Token handoff:** `WTSGetActiveConsoleSessionId` → `WTSQueryUserToken` → `DuplicateTokenEx`
|
||||||
|
(primary token) → `CreateProcessAsUserW` with `lpDesktop = "winsta0\\default"`.
|
||||||
|
- **IPC:** named pipe per session, length-prefixed protobuf (reuse `proto/` message types where
|
||||||
|
sensible), pipe security descriptor granting only SYSTEM + the session user.
|
||||||
|
- **Session events:** the service registers for `SERVICE_CONTROL_SESSIONCHANGE` and reacts to
|
||||||
|
`WTS_CONSOLE_CONNECT`, `WTS_SESSION_LOGON/LOGOFF`, `WTS_SESSION_LOCK/UNLOCK`.
|
||||||
|
|
||||||
|
## Security considerations
|
||||||
|
|
||||||
|
- **LocalSystem is maximal privilege** — minimize the service's attack surface; validate every
|
||||||
|
relay-delivered command; never spawn a worker except into a legitimately-enumerated active session.
|
||||||
|
- **IPC pipe must be ACL'd** (SYSTEM + the specific session user only) so a non-admin user can't
|
||||||
|
inject capture/input commands by connecting to the pipe.
|
||||||
|
- **Token hygiene:** close duplicated tokens promptly; don't leak SYSTEM or user primary tokens.
|
||||||
|
- The SPEC-016 `cak_` store (SYSTEM-ACL'd) is now correctly readable; the fail-fast guard is removed
|
||||||
|
but the ACL stays.
|
||||||
|
- **Audit:** service start/stop, enrollment-as-SYSTEM, worker spawn, session attach/retarget — written
|
||||||
|
to the existing event pipeline.
|
||||||
|
|
||||||
|
## Implementation details
|
||||||
|
|
||||||
|
- New service module (e.g. `agent/src/service/{mod.rs, broker.rs, ipc.rs}`); worker entry split out of
|
||||||
|
the current capture path. New `Commands` variants or an internal `--service`/`--session-worker`
|
||||||
|
dispatch in `agent/src/main.rs`.
|
||||||
|
- `install.rs`: service create/recovery/delete; drop the managed-mode HKCU `Run` write.
|
||||||
|
- `windows` crate features: `Win32_System_Services`, `Win32_System_RemoteDesktop`
|
||||||
|
(`WTS*`), `Win32_Security`, `Win32_System_Threading` (`CreateProcessAsUserW`),
|
||||||
|
`Win32_System_Pipes`.
|
||||||
|
- Remove the `resolve_agent_credential` fail-fast guard branch added in SPEC-016 Phase B.
|
||||||
|
|
||||||
|
## Testing strategy
|
||||||
|
|
||||||
|
- **Service:** install → auto-start on boot → stop → uninstall on a clean VM.
|
||||||
|
- **`cak_` end-to-end:** SYSTEM service enrolls (SPEC-016), stores + reads the `cak_`, connects — the
|
||||||
|
integration test SPEC-016 Phase B currently cannot run.
|
||||||
|
- **Session broker:** worker spawns into the active session; capture/input work; survives logoff→logon
|
||||||
|
(respawn) and console-connect (retarget); fast-user-switch retarget.
|
||||||
|
- **Security:** non-admin cannot connect to the IPC pipe; worker runs with the user's token (not
|
||||||
|
SYSTEM) in the user's desktop.
|
||||||
|
|
||||||
|
## Effort estimate & dependencies
|
||||||
|
|
||||||
|
- **Size:** X-Large (service host + worker split + token-handoff + IPC + session-change handling +
|
||||||
|
install/uninstall).
|
||||||
|
- **Depends on:** SPEC-016 (enrollment + `cak_` store); the existing capture/input cores.
|
||||||
|
- **Unblocks:** SPEC-016 Phase B end-to-end runtime (and the parked managed-agent enrollment test on
|
||||||
|
the internal beta machines); **SPEC-013** (session selection builds on this broker).
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
1. **Service vs. SYSTEM scheduled task** — a true Windows service (recovery, SCM integration) is the
|
||||||
|
standard, robust choice; recommend service. Lock in planning.
|
||||||
|
2. **One multi-session worker vs. one worker per session** — per-session worker is simpler to reason
|
||||||
|
about and isolates a crash to one session; confirm.
|
||||||
|
3. **IPC transport** — named pipe (recommended) vs. local TCP/loopback; pipe ACLing is the cleaner
|
||||||
|
security story.
|
||||||
|
4. **Login-screen / Secure-Desktop capture** — how much (if any) in this spec vs. deferred to SPEC-013
|
||||||
|
(it needs a worker in the winlogon/secure desktop, a distinct hard problem).
|
||||||
|
5. **Migration** — on upgrade, cleanly transition existing HKCU-`Run` managed installs to the service
|
||||||
|
(remove the Run entry, install the service) without a gap.
|
||||||
129
reports/2026-05-31-gc-audit.md
Normal file
129
reports/2026-05-31-gc-audit.md
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
# GuruConnect Audit Report — 2026-05-31
|
||||||
|
|
||||||
|
**Auditor:** Claude (claude-opus-4-8[1m])
|
||||||
|
**Passes:** Security & Remote-Session Integrity (`--pass=security` only)
|
||||||
|
**Previous audit:** 2026-05-30 (`reports/2026-05-30-gc-audit.md`)
|
||||||
|
**Scope note:** v2 **Phase-1 EXIT gate** re-audit. Confirms the three relay CRITICALs stay closed and
|
||||||
|
the prior net-new HIGH is fixed, and assesses the net-new SPEC-004 surface (Tasks 2/4/5 — machine_uid
|
||||||
|
dedup, session reaping, operator removal) now committed + deployed. Includes **live** boundary tests
|
||||||
|
against the running production binary, not just a code re-derivation.
|
||||||
|
|
||||||
|
**Code under audit:** working tree at tag **v0.3.0 / e967cce** = the binary deployed to prod
|
||||||
|
172.16.3.30:3002 (deployed this session from 96f9c0a; e967cce adds only the version bump + changelog).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Executive Summary
|
||||||
|
|
||||||
|
| Pass | Total | Critical | High | Medium | Low | Info |
|
||||||
|
|------|-------|----------|------|--------|-----|------|
|
||||||
|
| Security & Session | 4 | 0 | 0 | 0 | 0 | 4 |
|
||||||
|
|
||||||
|
**Phase-1 security EXIT gate: PASS.** The relay/server plane is clean. All three 2026-05-29 CRITICALs
|
||||||
|
remain CLOSED (verified in code AND live against the deployed server). The prior net-new HIGH (agent
|
||||||
|
auto-update TLS bypass) and the prior LOW (chat content logged at INFO) are both remediated. The
|
||||||
|
net-new SPEC-004 surface (operator removal, machine_uid dedup gate, session reaper/supersede) audits
|
||||||
|
clean with the keyed-identity security invariant intact end-to-end. No net-new findings.
|
||||||
|
|
||||||
|
**Requires action:** none.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Live functional verification (deployed binary, 172.16.3.30:3002)
|
||||||
|
|
||||||
|
Forged tokens (HS256, real `JWT_SECRET`) exercised the WS auth boundaries directly. Each illegitimate
|
||||||
|
access was REJECTED (4xx, never a 101 upgrade):
|
||||||
|
|
||||||
|
| Check | Result | Proves |
|
||||||
|
|-------|--------|--------|
|
||||||
|
| Login-shape JWT on `/ws/viewer` | **401** | Login token not accepted as a viewer token (`purpose=="viewer"` enforced) — CRITICAL #1 |
|
||||||
|
| Validly-signed viewer token for session AAAA used on session BBBB | **403** | Session binding enforced — a correctly-signed token is refused for the wrong session — CRITICAL #1 |
|
||||||
|
| Login JWT used as agent `api_key` on `/ws/agent` | **401** | Agent plane rejects JWTs (no JWT branch) — CRITICAL #3 |
|
||||||
|
| Wrong-signature token on `/ws/viewer` | **401** | Signature validation holds (control) |
|
||||||
|
|
||||||
|
The session-bind case is the decisive one: a token that WOULD be accepted for its own session is
|
||||||
|
rejected 403 for a different session, proving the binding rather than mere signature validation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## The three relay CRITICALs — verdict
|
||||||
|
|
||||||
|
| CRITICAL | Verdict | Enforced at |
|
||||||
|
|----------|---------|-------------|
|
||||||
|
| #1 any-JWT-joins-any-session | **CLOSED** | mint authz `api/sessions.rs` (is_admin \|\| permission); viewer WS `relay/mod.rs:496` `validate_viewer_token` (sig+expiry+`purpose=="viewer"`); session-bind `relay/mod.rs:527-534` (`claim != requested → 403`) |
|
||||||
|
| #2 viewer-WS blacklist | **CLOSED** (TTL-bounded residual unchanged) | `relay/mod.rs:509` `token_blacklist.is_revoked` before upgrade. Residual: logout revokes login JWT not minted viewer tokens (5-min TTL) — same tracked MEDIUM, no regression |
|
||||||
|
| #3 JWT-accepted-as-agent-key | **CLOSED**, fails closed | `relay/mod.rs:417` `validate_agent_api_key` — no JWT branch; only `cak_` (`auth/agent_keys.rs`, SHA-256 vs `connect_agent_keys`, `revoked_at IS NULL`) or deprecated shared key (WARN). Unresolved machine → 503 (`:303`); client `agent_id` overridden by key identity (`:283`) |
|
||||||
|
|
||||||
|
Live results match these code paths exactly.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prior HIGH — FIXED
|
||||||
|
|
||||||
|
**Agent auto-update TLS bypass → MITM-RCE: CLOSED.** `agent/src/update.rs:21` `dev_insecure_tls()` is
|
||||||
|
`cfg!(debug_assertions)` AND env-var gated, so in a release build the `cfg!` evaluates to `false` and the agent
|
||||||
|
ALWAYS verifies certs. Both `check_for_update` (`:64`) and `download_update` (`:130`) consume it; unit
|
||||||
|
test `test_dev_insecure_tls_release_is_always_false` (`:362`) asserts the release invariant. No
|
||||||
|
`danger_accept_invalid_certs(true)` reachable in production. A signed-manifest defense-in-depth TODO is
|
||||||
|
filed at `install_update` (`:189`) (= tracked task #10, not an exit blocker).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Pass 5: Security & Remote-Session Integrity — net-new SPEC-004 surface
|
||||||
|
|
||||||
|
### [INFO] Operator removal API (`server/src/api/removal.rs`) — clean, admin-gated
|
||||||
|
Every removal handler takes the `AdminUser` extractor as its first argument (runs before any DB
|
||||||
|
mutation): `remove_machine` (`:88`), `remove_session` (`:321`), `bulk_remove_machines` (`:471`).
|
||||||
|
`AdminUser` (`auth/mod.rs:141`) validates JWT (signature + expiry + blacklist `:97`) then requires
|
||||||
|
`is_admin()` else 403 (`:146`). Soft-deletes are parameterized + idempotent (`WHERE … AND deleted_at IS
|
||||||
|
NULL`); bulk bounded (MAX_BATCH 500) with per-id UUID validation + isolated failures; audit
|
||||||
|
(`db/events.rs:126`) records actor + target + trusted-proxy IP, best-effort (cannot be suppressed by
|
||||||
|
attacker-controlled input). Removal is admin-role-gated globally (not per-tenant ACL) — same Phase-1
|
||||||
|
posture as viewer-mint, per-tenant narrowing deferred to SPEC-002 Phase 4. Acceptable by context.
|
||||||
|
|
||||||
|
### [INFO] machine_uid dedup security gate — invariant holds
|
||||||
|
Gate at `relay/mod.rs:352`: `effective_machine_uid = if is_keyed_agent { None } else { claimed }`. The
|
||||||
|
suppressed value (not the raw claim) flows to `register_agent` and `upsert_machine`. Keyed (`cak_`)
|
||||||
|
agents take the agent_id-keyed upsert branch and never write/touch an `ON CONFLICT (machine_uid)` row, so
|
||||||
|
a valid key for machine X cannot repoint machine Y via a claimed uid. An un-keyed uid-spoof can only
|
||||||
|
match a uid-bearing row — which the keyed connect path never creates; the only residual is a legacy
|
||||||
|
pre-keying row, and the startup L1 fix (`main.rs:267-288` via `keyed_machine_ids`, fail-closed on query
|
||||||
|
error) ensures keyed machines are never uid-indexed on restore.
|
||||||
|
|
||||||
|
### [INFO] Session reaper + same-machine supersede — clean, TOCTOU closed
|
||||||
|
`reap_stale_persistent` (`:875`) and supersede (`:322`) select under a read lock then re-assert the full
|
||||||
|
predicate under the write lock via `remove_session_if` (`:755`). Predicate requires
|
||||||
|
`!is_online && is_persistent && viewers.is_empty()` (+ TTL / same-uid) — an online, viewer-attached, or
|
||||||
|
support session is never reaped/superseded. Un-keyed uid-spoof blast radius = denial-of-persistence on
|
||||||
|
an offline same-uid session at worst, never a hijack. Lock order matches `register_agent`; predicate is
|
||||||
|
synchronous (no await under lock).
|
||||||
|
|
||||||
|
### [INFO] General posture — confirmed, no regressions
|
||||||
|
Runtime sqlx parameterized everywhere (no `format!`-built SQL); migrations 008/009 idempotent. Frame
|
||||||
|
caps: agent 4 MiB / viewer 64 KiB applied before upgrade. Input throttle retained. `/api/auth/login`
|
||||||
|
rate-limited (`main.rs:397`). `JWT_SECRET` panics if <32 (`main.rs:143`); agent keys SHA-256; Argon2id
|
||||||
|
passwords; no secret/token/code/PII logged. **Chat content no longer logged** (prior LOW fixed —
|
||||||
|
`relay/mod.rs:829,1428` now log length only).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Definitive answers
|
||||||
|
|
||||||
|
- **(a) Any non-admin removal path?** NO — all three removal handlers gate on `AdminUser` (JWT+blacklist+`is_admin`→403) before any DB mutation.
|
||||||
|
- **(b) Any uid-spoof that repoints/hijacks another machine's row or session (not just denial)?** NO — keyed identity is authoritative and uid-suppressed across connect → upsert → reattach → startup restore. Worst case for an un-keyed spoof is denial-of-persistence on an offline same-uid session.
|
||||||
|
- **(c) Any auth-plane bypass (agent↔viewer credential crossover)?** NO — viewer plane requires a `purpose=="viewer"` session-bound minted token; agent plane requires a `cak_`/shared key with no JWT branch. Confirmed in code and live.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Verdict
|
||||||
|
|
||||||
|
**Phase-1 security EXIT gate: PASS.** Relay/server plane clean; prior HIGH + LOW remediated; SPEC-004
|
||||||
|
surface sound with the keyed-identity invariant intact across the connect path, DB upsert, in-memory
|
||||||
|
reattach, and startup restore. No new CRITICAL/HIGH/MEDIUM/LOW.
|
||||||
|
|
||||||
|
**Tracked, deferred-by-design (not exit blockers):**
|
||||||
|
- Viewer-token logout revocation residual (MEDIUM, TTL-bounded) — `v2-secure-session-core/plan.md`.
|
||||||
|
- Update-binary signature verification (defense-in-depth, task #10) — TODO at `update.rs:189`.
|
||||||
|
|
||||||
|
*Note: only `--pass=security` was run. API-surface, Rust-quality, TypeScript, protocol-integrity,
|
||||||
|
docs-reconciliation, and CI/CD passes were not executed this run.*
|
||||||
159
server/migrations/010_spec016_enrollment.sql
Normal file
159
server/migrations/010_spec016_enrollment.sql
Normal file
@@ -0,0 +1,159 @@
|
|||||||
|
-- Migration: 010_spec016_enrollment.sql
|
||||||
|
-- Purpose: SPEC-016 zero-touch per-site agent enrollment — server-side data model.
|
||||||
|
--
|
||||||
|
-- Adds the per-site enrollment-key table, a minimal sites table to anchor it,
|
||||||
|
-- and the machine-side columns the collision-gated self-registration flow needs.
|
||||||
|
--
|
||||||
|
-- Two-tier credential model (SPEC-016 §Security): a low-sensitivity, rotatable,
|
||||||
|
-- per-site ENROLLMENT KEY (the `cek_` secret stored hashed here) gates "may this
|
||||||
|
-- machine register at all", while the high-sensitivity per-machine `cak_`
|
||||||
|
-- operating credential (connect_agent_keys, migration 004) is minted on a
|
||||||
|
-- successful enroll. Compromise of an enrollment key is recovered by rotating one
|
||||||
|
-- site, not a fleet-wide re-key.
|
||||||
|
--
|
||||||
|
-- DEVIATION FROM SPEC (documented): SPEC-016 §DB-migration describes
|
||||||
|
-- `site_enrollment_keys.site_id` as `fk -> sites`, assuming a sites table already
|
||||||
|
-- exists. It does NOT — in the current schema "site" and "company/organization" are
|
||||||
|
-- free-text columns on connect_machines (migration 005), there is no relational
|
||||||
|
-- sites entity. This migration therefore CREATES a minimal `connect_sites` table
|
||||||
|
-- (the relational anchor the enrollment-key FK and the dashboard per-site key
|
||||||
|
-- display both require) keyed by a natural `site_code` and scoped per-tenant. It is
|
||||||
|
-- intentionally minimal (code + display name + tenant); richer site/company
|
||||||
|
-- modeling is left to future work. The free-text connect_machines.site /
|
||||||
|
-- .organization columns are untouched and continue to carry agent-reported labels.
|
||||||
|
--
|
||||||
|
-- Idempotent: CREATE TABLE/INDEX IF NOT EXISTS, ADD COLUMN IF NOT EXISTS. Applied on
|
||||||
|
-- server startup by sqlx::migrate!(); never pre-applied via psql. Ordered after 009.
|
||||||
|
-- See .claude/standards/gururmm/sqlx-migrations.md.
|
||||||
|
|
||||||
|
-- pgcrypto provides gen_random_uuid(); enabled in 001/004 but re-asserted for safety.
|
||||||
|
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- connect_sites — relational anchor for per-site enrollment (see DEVIATION above)
|
||||||
|
-- ============================================================================
|
||||||
|
-- A site is the unit a single signed installer targets. `site_code` is the
|
||||||
|
-- non-secret, operator-facing identifier the installer carries and the agent sends
|
||||||
|
-- at /api/enroll (e.g. "ACME-PHX"). Uniqueness is per-tenant: the same human-chosen
|
||||||
|
-- code may legitimately exist in two tenants. tenant_id mirrors the nullable,
|
||||||
|
-- default-tenant-backfilled tenancy column used on every other scoped table
|
||||||
|
-- (migration 004); db::tenancy::current_tenant_id() resolves it for now.
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS connect_sites (
    -- Surrogate key; the target of the site_id foreign keys added later in
    -- this migration.
    id                UUID        PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Non-secret, operator-facing code carried by the installer.
    site_code         TEXT        NOT NULL,

    -- Dashboard display name for the site / company (human-readable).
    display_name      TEXT,

    -- Default company label for machines enrolled at this site; mirrors the
    -- free-text connect_machines.organization value agents otherwise
    -- self-report.
    company           TEXT,

    -- Tenancy-ready column (Phase 4); backfilled to the default tenant by the
    -- UPDATE that follows this table.
    tenant_id         UUID,

    -- RESERVED for future per-site enrollment POLICY work (SPEC-016
    -- §out-of-scope). Defaults to 'auto-approve'; a later 'pending-approval'
    -- value is intended to gate new enrollments. NOT enforced in Phase A: it
    -- exists only so the policy SPEC needs no schema change. Do not branch on
    -- this column yet.
    enrollment_policy TEXT        DEFAULT 'auto-approve',

    created_at        TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
|
||||||
|
|
||||||
|
-- Per-tenant uniqueness of the natural site_code so /api/enroll can resolve a site
|
||||||
|
-- deterministically within a tenant while the same code may exist across tenants.
|
||||||
|
-- COALESCE keeps the index usable while tenant_id is still nullable (Phase 1).
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_connect_sites_tenant_code
    ON connect_sites (
        -- A NULL tenant_id is folded onto the default-tenant UUID so such rows
        -- still take part in the uniqueness check.
        COALESCE(tenant_id, '00000000-0000-0000-0000-000000000001'::uuid),
        site_code
    );
|
||||||
|
|
||||||
|
-- Backfill the sites tenant_id to the default tenant (table is empty on a fresh DB;
|
||||||
|
-- no-op there, but keeps the migration self-consistent).
|
||||||
|
UPDATE connect_sites
   SET tenant_id = '00000000-0000-0000-0000-000000000001'
 WHERE tenant_id IS NULL;
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- site_enrollment_keys — rotatable, hashed per-site enrollment secret + fingerprint
|
||||||
|
-- ============================================================================
|
||||||
|
-- Stores ONLY the Argon2id hash of the `cek_` secret; the plaintext is shown once
|
||||||
|
-- at issue/rotate and never recoverable. `version` is the monotonic rotation
|
||||||
|
-- counter; `fingerprint` is the non-secret short hex shown as `vN (XXXX)` in the
|
||||||
|
-- dashboard and baked into the installer filename. `active` marks the current key —
|
||||||
|
-- rotation flips the old key to active=false (blocking NEW enrollments from old
|
||||||
|
-- installers) and inserts a new active row; already-enrolled agents holding their
|
||||||
|
-- own `cak_` are unaffected. Multiple inactive (historical) rows may coexist per
|
||||||
|
-- site; at most one active row is intended (enforced by a partial unique index).
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS site_enrollment_keys (
    id          UUID        PRIMARY KEY DEFAULT gen_random_uuid(),

    -- Owning site; deleting the site removes its keys (ON DELETE CASCADE).
    site_id     UUID        NOT NULL REFERENCES connect_sites(id) ON DELETE CASCADE,

    -- Argon2id hash of the `cek_` enrollment secret. The plaintext is never
    -- stored.
    key_hash    TEXT        NOT NULL,

    -- Monotonic rotation counter (1, 2, 3, ...).
    version     INTEGER     NOT NULL,

    -- Non-secret short hex code derived from the secret (the XXXX in
    -- `vN (XXXX)`). Kept so the dashboard / GET endpoint can display it without
    -- access to the secret.
    fingerprint TEXT        NOT NULL,

    active      BOOLEAN     NOT NULL DEFAULT true,

    created_at  TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- Timestamp at which this key was rotated out (active flipped to false).
    rotated_at  TIMESTAMPTZ
);
|
||||||
|
|
||||||
|
-- Lookup index for the enroll hot path: resolve the active key for a site.
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_site_enrollment_keys_site_active
    ON site_enrollment_keys (
        site_id,
        active
    );
|
||||||
|
|
||||||
|
-- At most one ACTIVE enrollment key per site (the "current" installer key).
|
||||||
|
-- Partial unique index so any number of inactive historical rows may coexist.
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_site_enrollment_keys_one_active
    ON site_enrollment_keys (site_id)
    -- Partial index: only rows with active = true are constrained.
    WHERE active;
|
||||||
|
|
||||||
|
-- ============================================================================
|
||||||
|
-- connect_machines — site binding + enrollment-state collision gate
|
||||||
|
-- ============================================================================
|
||||||
|
-- machine_uid already exists (migration 008) with a partial UNIQUE index on
|
||||||
|
-- (machine_uid) WHERE machine_uid IS NOT NULL. SPEC-016 §item-1 / resolved-decision #4
|
||||||
|
-- call for the dedup key to be PER-TENANT — (tenant_id, machine_uid) — so the same
|
||||||
|
-- hardware legitimately present in two tenants stays two rows. tenant_id is the
|
||||||
|
-- scoping column that exists on connect_machines (migration 004); machines have no
|
||||||
|
-- direct site_id today, so site is tracked separately (site_id below) and tenancy is
|
||||||
|
-- the uniqueness scope, exactly as the spec states.
|
||||||
|
--
|
||||||
|
-- CRITICAL CONSTRAINT (why we ADD rather than REPLACE the 008 index here):
|
||||||
|
-- db::machines::upsert_machine (the live connect-path upsert) uses
|
||||||
|
-- `ON CONFLICT (machine_uid) WHERE machine_uid IS NOT NULL` as its conflict arbiter.
|
||||||
|
-- Postgres matches that arbiter to the EXACT index from migration 008. Dropping that
|
||||||
|
-- index would make the live upsert fail to find an arbiter and error at runtime —
|
||||||
|
-- breaking every un-keyed agent reconnect. So migration 008's global index is LEFT
|
||||||
|
-- IN PLACE (the connect path keeps working unchanged) and the per-tenant index is
|
||||||
|
-- added ALONGSIDE it. In single-tenant Phase 1 the two are equivalent (every row's
|
||||||
|
-- tenant_id is the default tenant), so the per-tenant index adds the SPEC-016 dedup
|
||||||
|
-- semantics without a redundant-uniqueness conflict: a (tenant, uid) pair that is
|
||||||
|
-- unique is also globally unique today. When multi-tenancy activates AND
|
||||||
|
-- upsert_machine's ON CONFLICT is updated to name (tenant_id, machine_uid), a future
|
||||||
|
-- migration drops the global 008 index. Documented as deferred; do not drop it now.
|
||||||
|
|
||||||
|
-- Optional FK to the site a machine enrolled under (NULL for legacy / support-code
|
||||||
|
-- machines that never enrolled through /api/enroll). A site change on re-enroll is
|
||||||
|
-- the "site move" SPEC-016 audits.
|
||||||
|
ALTER TABLE connect_machines
    ADD COLUMN IF NOT EXISTS site_id UUID
        REFERENCES connect_sites(id) ON DELETE SET NULL;
|
||||||
|
|
||||||
|
-- enrollment_state: the collision gate (SPEC-016 §item-1/6). 'active' = live and
|
||||||
|
-- controllable (auto-approve posture); 'pending' = a machine_uid collision was
|
||||||
|
-- detected at enroll and an operator must confirm in the dashboard before the
|
||||||
|
-- endpoint may be controlled. Default 'active' so every legacy/connect-path row is
|
||||||
|
-- unaffected.
|
||||||
|
ALTER TABLE connect_machines
    ADD COLUMN IF NOT EXISTS enrollment_state TEXT
        NOT NULL
        DEFAULT 'active'
        -- Only the two states used by the collision gate are accepted.
        CHECK (enrollment_state IN ('active', 'pending'));
|
||||||
|
|
||||||
|
-- Per-tenant machine_uid uniqueness (SPEC-016). Added ALONGSIDE migration 008's
|
||||||
|
-- global (machine_uid) index (see CRITICAL CONSTRAINT above — the connect-path
|
||||||
|
-- upsert's ON CONFLICT arbiter binds to the 008 index, which must survive). COALESCE
|
||||||
|
-- folds a NULL tenant_id to the default tenant so the index is well-defined while
|
||||||
|
-- tenancy is single-tenant (Phase 1); the WHERE clause excludes NULL machine_uid so
|
||||||
|
-- legacy un-keyed rows coexist freely.
|
||||||
|
CREATE UNIQUE INDEX IF NOT EXISTS idx_connect_machines_tenant_machine_uid
    ON connect_machines (
        -- NULL tenant_id is treated as the default tenant for uniqueness.
        COALESCE(tenant_id, '00000000-0000-0000-0000-000000000001'::uuid),
        machine_uid
    )
    -- Rows without a machine_uid are left unconstrained.
    WHERE machine_uid IS NOT NULL;
|
||||||
1008
server/src/api/enroll.rs
Normal file
1008
server/src/api/enroll.rs
Normal file
File diff suppressed because it is too large
Load Diff
@@ -4,10 +4,12 @@ pub mod auth;
|
|||||||
pub mod auth_logout;
|
pub mod auth_logout;
|
||||||
pub mod changelog;
|
pub mod changelog;
|
||||||
pub mod downloads;
|
pub mod downloads;
|
||||||
|
pub mod enroll;
|
||||||
pub mod machine_keys;
|
pub mod machine_keys;
|
||||||
pub mod releases;
|
pub mod releases;
|
||||||
pub mod removal;
|
pub mod removal;
|
||||||
pub mod sessions;
|
pub mod sessions;
|
||||||
|
pub mod sites;
|
||||||
pub mod users;
|
pub mod users;
|
||||||
|
|
||||||
use axum::{
|
use axum::{
|
||||||
|
|||||||
217
server/src/api/sites.rs
Normal file
217
server/src/api/sites.rs
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
//! Site enrollment-key administration (SPEC-016, admin plane).
|
||||||
|
//!
|
||||||
|
//! Admin (dashboard JWT + admin role) endpoints for the per-site enrollment key
|
||||||
|
//! the dashboard surfaces and rotates:
|
||||||
|
//!
|
||||||
|
//! - `POST /api/sites/:id/enrollment-key/rotate` — regenerate the `cek_` secret,
|
||||||
|
//! bump the monotonic version, derive a new fingerprint, deactivate the prior
|
||||||
|
//! active key, and return the plaintext + fingerprint ONCE. Old installers can no
|
||||||
|
//! longer enroll NEW machines after this; already-enrolled agents (holding their
|
||||||
|
//! own `cak_`) are unaffected (SPEC-016 success-criterion #3). Doubles as
|
||||||
|
//! first-issue when a site has no key yet.
|
||||||
|
//! - `GET /api/sites/:id/enrollment-key` — read the CURRENT non-secret fingerprint
|
||||||
|
//! + version (never the secret). 404 if the site has no active key yet.
|
||||||
|
//!
|
||||||
|
//! Auth mirrors `api::machine_keys`: the [`crate::auth::AdminUser`] extractor gates
|
||||||
|
//! both routes, and they are mounted behind the JWT `auth_layer`.
|
||||||
|
//!
|
||||||
|
//! SECURITY: the plaintext `cek_` is returned exactly once (rotate response),
|
||||||
|
//! never persisted in plaintext and never logged. Read responses expose only the
|
||||||
|
//! version + fingerprint.
|
||||||
|
|
||||||
|
use axum::{
|
||||||
|
extract::{Path, State},
|
||||||
|
http::StatusCode,
|
||||||
|
Json,
|
||||||
|
};
|
||||||
|
use serde::Serialize;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::auth::{enrollment_keys, AdminUser};
|
||||||
|
use crate::db;
|
||||||
|
use crate::AppState;
|
||||||
|
|
||||||
|
/// Standard error envelope (matches `api::machine_keys::ApiError`).
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
pub struct ApiError {
|
||||||
|
pub detail: String,
|
||||||
|
pub error_code: String,
|
||||||
|
pub status_code: u16,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ApiError {
|
||||||
|
fn new(status: StatusCode, code: &str, detail: &str) -> (StatusCode, Json<ApiError>) {
|
||||||
|
(
|
||||||
|
status,
|
||||||
|
Json(ApiError {
|
||||||
|
detail: detail.to_string(),
|
||||||
|
error_code: code.to_string(),
|
||||||
|
status_code: status.as_u16(),
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type ApiResult<T> = Result<T, (StatusCode, Json<ApiError>)>;
|
||||||
|
|
||||||
|
/// Response for a freshly rotated/issued enrollment key. `key` is present ONLY
|
||||||
|
/// here, once.
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
pub struct RotatedEnrollmentKey {
|
||||||
|
pub site_id: Uuid,
|
||||||
|
/// The plaintext `cek_` enrollment key. Shown exactly once — bake it into the
|
||||||
|
/// site installer now; the server keeps only its hash.
|
||||||
|
pub key: String,
|
||||||
|
/// Monotonic rotation version.
|
||||||
|
pub version: i32,
|
||||||
|
/// The non-secret short hex code (the `XXXX` in `vN (XXXX)`).
|
||||||
|
pub fingerprint: String,
|
||||||
|
/// Fully rendered operator-facing fingerprint, e.g. `v3 (7F2A)`.
|
||||||
|
pub fingerprint_label: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Non-secret current-key view for the GET endpoint.
|
||||||
|
#[derive(Debug, Serialize)]
|
||||||
|
pub struct EnrollmentKeyView {
|
||||||
|
pub site_id: Uuid,
|
||||||
|
pub version: i32,
|
||||||
|
pub fingerprint: String,
|
||||||
|
pub fingerprint_label: String,
|
||||||
|
pub active: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn require_db(state: &AppState) -> ApiResult<&db::Database> {
|
||||||
|
state.db.as_ref().ok_or_else(|| {
|
||||||
|
ApiError::new(
|
||||||
|
StatusCode::SERVICE_UNAVAILABLE,
|
||||||
|
"DATABASE_UNAVAILABLE",
|
||||||
|
"Database not available",
|
||||||
|
)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Resolve a site by its UUID path segment, or a 404 envelope.
|
||||||
|
async fn resolve_site(db: &db::Database, site_id: Uuid) -> ApiResult<db::sites::Site> {
|
||||||
|
db::sites::get_site_by_id(db.pool(), site_id)
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
tracing::error!("DB error resolving site: {}", e);
|
||||||
|
ApiError::new(
|
||||||
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
"INTERNAL_ERROR",
|
||||||
|
"Internal server error",
|
||||||
|
)
|
||||||
|
})?
|
||||||
|
.ok_or_else(|| ApiError::new(StatusCode::NOT_FOUND, "SITE_NOT_FOUND", "Site not found"))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// POST /api/sites/:id/enrollment-key/rotate — rotate (or first-issue) a site's
|
||||||
|
/// enrollment key. Returns the plaintext `cek_` + fingerprint once.
|
||||||
|
pub async fn rotate_enrollment_key(
|
||||||
|
AdminUser(admin): AdminUser,
|
||||||
|
State(state): State<AppState>,
|
||||||
|
Path(site_id): Path<Uuid>,
|
||||||
|
) -> ApiResult<(StatusCode, Json<RotatedEnrollmentKey>)> {
|
||||||
|
let db = require_db(&state)?;
|
||||||
|
let site = resolve_site(db, site_id).await?;
|
||||||
|
|
||||||
|
// Mint plaintext + Argon2id hash + fingerprint. Only the hash + fingerprint
|
||||||
|
// are persisted; the plaintext is surfaced once below.
|
||||||
|
let plaintext = enrollment_keys::generate_enrollment_key();
|
||||||
|
let key_hash = enrollment_keys::hash_enrollment_key(&plaintext).map_err(|e| {
|
||||||
|
tracing::error!("Failed to hash enrollment key: {}", e);
|
||||||
|
ApiError::new(
|
||||||
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
"INTERNAL_ERROR",
|
||||||
|
"Failed to hash enrollment key",
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
let fingerprint = enrollment_keys::compute_fingerprint(&plaintext);
|
||||||
|
|
||||||
|
let new_key = db::enrollment_keys::rotate_key(db.pool(), site.id, &key_hash, &fingerprint)
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
tracing::error!("DB error rotating enrollment key: {}", e);
|
||||||
|
ApiError::new(
|
||||||
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
"INTERNAL_ERROR",
|
||||||
|
"Failed to rotate enrollment key",
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let fingerprint_label =
|
||||||
|
enrollment_keys::render_fingerprint(new_key.version, &new_key.fingerprint);
|
||||||
|
|
||||||
|
// Audit WITHOUT key material (no plaintext, no hash).
|
||||||
|
if let Err(e) = db::events::log_enrollment_event(
|
||||||
|
db.pool(),
|
||||||
|
db::events::EventTypes::ENROLLMENT_KEY_ROTATED,
|
||||||
|
serde_json::json!({
|
||||||
|
"site_id": site.id,
|
||||||
|
"site_code": site.site_code,
|
||||||
|
"version": new_key.version,
|
||||||
|
"fingerprint": new_key.fingerprint,
|
||||||
|
"rotated_by": admin.username,
|
||||||
|
}),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
tracing::warn!("[ENROLL] failed to write key-rotate audit event: {}", e);
|
||||||
|
}
|
||||||
|
tracing::info!(
|
||||||
|
"Admin {} rotated enrollment key for site {} to {}",
|
||||||
|
admin.username,
|
||||||
|
site.site_code,
|
||||||
|
fingerprint_label
|
||||||
|
);
|
||||||
|
|
||||||
|
Ok((
|
||||||
|
StatusCode::CREATED,
|
||||||
|
Json(RotatedEnrollmentKey {
|
||||||
|
site_id: site.id,
|
||||||
|
key: plaintext,
|
||||||
|
version: new_key.version,
|
||||||
|
fingerprint: new_key.fingerprint,
|
||||||
|
fingerprint_label,
|
||||||
|
}),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// GET /api/sites/:id/enrollment-key — current non-secret fingerprint + version.
|
||||||
|
pub async fn get_enrollment_key(
|
||||||
|
AdminUser(_admin): AdminUser,
|
||||||
|
State(state): State<AppState>,
|
||||||
|
Path(site_id): Path<Uuid>,
|
||||||
|
) -> ApiResult<Json<EnrollmentKeyView>> {
|
||||||
|
let db = require_db(&state)?;
|
||||||
|
let site = resolve_site(db, site_id).await?;
|
||||||
|
|
||||||
|
let key = db::enrollment_keys::get_active_for_site(db.pool(), site.id)
|
||||||
|
.await
|
||||||
|
.map_err(|e| {
|
||||||
|
tracing::error!("DB error loading enrollment key: {}", e);
|
||||||
|
ApiError::new(
|
||||||
|
StatusCode::INTERNAL_SERVER_ERROR,
|
||||||
|
"INTERNAL_ERROR",
|
||||||
|
"Internal server error",
|
||||||
|
)
|
||||||
|
})?
|
||||||
|
.ok_or_else(|| {
|
||||||
|
ApiError::new(
|
||||||
|
StatusCode::NOT_FOUND,
|
||||||
|
"NO_ENROLLMENT_KEY",
|
||||||
|
"Site has no active enrollment key",
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let fingerprint_label = enrollment_keys::render_fingerprint(key.version, &key.fingerprint);
|
||||||
|
|
||||||
|
Ok(Json(EnrollmentKeyView {
|
||||||
|
site_id: site.id,
|
||||||
|
version: key.version,
|
||||||
|
fingerprint: key.fingerprint,
|
||||||
|
fingerprint_label,
|
||||||
|
active: key.active,
|
||||||
|
}))
|
||||||
|
}
|
||||||
191
server/src/auth/enrollment_keys.rs
Normal file
191
server/src/auth/enrollment_keys.rs
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
//! Per-site enrollment key minting, hashing, verification, and fingerprinting
|
||||||
|
//! (SPEC-016 zero-touch enrollment, auth layer).
|
||||||
|
//!
|
||||||
|
//! This is the low-sensitivity, rotatable side of the two-tier credential model
|
||||||
|
//! (SPEC-016 §Security). A per-site ENROLLMENT key (`cek_` prefix) gates "may
|
||||||
|
//! this machine register at all" at `POST /api/enroll`; a successful enroll mints
|
||||||
|
//! the high-sensitivity per-machine `cak_` operating credential
|
||||||
|
//! ([`crate::auth::agent_keys`]). Compromise of an enrollment key is contained to
|
||||||
|
//! one site and recovered by rotating it.
|
||||||
|
//!
|
||||||
|
//! Lifecycle owned here (the secret side):
|
||||||
|
//!
|
||||||
|
//! - [`generate_enrollment_key`] mints a high-entropy, `cek_`-prefixed plaintext
|
||||||
|
//! secret. Mirrors [`crate::auth::agent_keys::generate_agent_key`]'s entropy
|
||||||
|
//! approach (32 random bytes from the OS CSPRNG, hex-encoded) with a DISTINCT
|
||||||
|
//! prefix so the two key kinds are never confused in logs or storage. The
|
||||||
|
//! plaintext is shown to the operator exactly once at issue/rotate and is NEVER
|
||||||
|
//! persisted or logged.
|
||||||
|
//! - [`hash_enrollment_key`] / [`verify_enrollment_key`] use **Argon2id** (via
|
||||||
|
//! [`crate::auth::password`]). This DIFFERS from `cak_` (which uses SHA-256 for
|
||||||
|
//! a constant-shape equality lookup): SPEC-016 §2 explicitly requires the
|
||||||
|
//! enrollment key be "stored hashed (Argon2id, same as `cak_`/passwords)". The
|
||||||
|
//! trade-off is deliberate — enrollment keys are looked up by `(site, active)`
|
||||||
|
//! first (a small candidate set, usually one row) and only then verified, so the
|
||||||
|
//! per-verify KDF cost is bounded and not on a high-QPS path, while Argon2id
|
||||||
|
//! gives salted, GPU-resistant storage matching the password posture.
|
||||||
|
//! - [`compute_fingerprint`] derives the non-secret short HEX code shown as
|
||||||
|
//! `vN (XXXX)` (SPEC-016 resolved-decision #3 — hex, deliberately NOT the
|
||||||
|
//! GuruRMM word-style code, so the two products' artifacts are never visually
|
||||||
|
//! conflated).
|
||||||
|
//!
|
||||||
|
//! SECURITY: never log a plaintext key or its hash. Functions here return the
|
||||||
|
//! plaintext to the caller (issue/rotate endpoint) but emit no `tracing` output
|
||||||
|
//! containing key material.
|
||||||
|
|
||||||
|
use anyhow::Result;
|
||||||
|
use rand::RngCore;
|
||||||
|
use ring::digest;
|
||||||
|
|
||||||
|
/// Prefix marking a GuruConnect per-site enrollment key. Distinct from the
|
||||||
|
/// per-agent `cak_` prefix so the two key kinds are never confused.
|
||||||
|
pub const ENROLLMENT_KEY_PREFIX: &str = "cek_";
|
||||||
|
|
||||||
|
/// Number of random bytes behind an enrollment key (256 bits of entropy), matching
|
||||||
|
/// [`crate::auth::agent_keys`]. SPEC-016 §2 requires ≥256-bit.
|
||||||
|
const ENROLLMENT_KEY_RANDOM_BYTES: usize = 32;
|
||||||
|
|
||||||
|
/// Number of hex characters in the fingerprint code (the `XXXX` in `vN (XXXX)`).
|
||||||
|
/// Four hex chars = 16 bits — ample to let an operator tell two installers apart at
|
||||||
|
/// a glance; it is a non-secret display aid, not a security control.
|
||||||
|
const FINGERPRINT_HEX_LEN: usize = 4;
|
||||||
|
|
||||||
|
/// Generate a new high-entropy, `cek_`-prefixed per-site enrollment key (plaintext).
|
||||||
|
///
|
||||||
|
/// The returned string is the ONLY time the plaintext exists; the caller must
|
||||||
|
/// surface it to the operator once and store only [`hash_enrollment_key`] of it.
|
||||||
|
/// Uses the OS CSPRNG via `rand::rngs::OsRng`.
|
||||||
|
pub fn generate_enrollment_key() -> String {
|
||||||
|
let mut bytes = [0u8; ENROLLMENT_KEY_RANDOM_BYTES];
|
||||||
|
rand::rngs::OsRng.fill_bytes(&mut bytes);
|
||||||
|
format!("{}{}", ENROLLMENT_KEY_PREFIX, hex_encode(&bytes))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Hash an enrollment key for storage using Argon2id (SPEC-016 §2).
|
||||||
|
///
|
||||||
|
/// Delegates to [`crate::auth::password::hash_password`] so the KDF parameters and
|
||||||
|
/// salt generation match the password posture exactly. Returns the PHC-format
|
||||||
|
/// string Postgres stores in `site_enrollment_keys.key_hash`.
|
||||||
|
pub fn hash_enrollment_key(plaintext: &str) -> Result<String> {
|
||||||
|
crate::auth::password::hash_password(plaintext)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Verify a presented enrollment key against a stored Argon2id hash.
|
||||||
|
///
|
||||||
|
/// Returns `Ok(true)` on a match. A malformed stored hash or a mismatch yields
|
||||||
|
/// `Ok(false)` / an `Err` from the underlying verifier; the caller treats any
|
||||||
|
/// non-`Ok(true)` as a rejection. A cheap structural reject (`cek_` prefix) runs
|
||||||
|
/// first to skip the KDF on obviously-bogus input.
|
||||||
|
///
|
||||||
|
/// SECURITY: only compares; never logs the presented key or the hash.
|
||||||
|
pub fn verify_enrollment_key(presented: &str, stored_hash: &str) -> bool {
|
||||||
|
if !presented.starts_with(ENROLLMENT_KEY_PREFIX) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
crate::auth::password::verify_password(presented, stored_hash).unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Compute the non-secret short HEX fingerprint code for an enrollment key.
|
||||||
|
///
|
||||||
|
/// Derived as the first [`FINGERPRINT_HEX_LEN`] hex chars of the SHA-256 of the
|
||||||
|
/// plaintext secret, uppercased. This is a stable, non-reversible tag of the secret
|
||||||
|
/// (knowing the code does not reveal the key) used purely for display. Pair it with
|
||||||
|
/// the monotonic version via [`render_fingerprint`].
|
||||||
|
pub fn compute_fingerprint(plaintext: &str) -> String {
|
||||||
|
let d = digest::digest(&digest::SHA256, plaintext.as_bytes());
|
||||||
|
let hex = hex_encode(d.as_ref());
|
||||||
|
hex[..FINGERPRINT_HEX_LEN].to_ascii_uppercase()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Build the operator-facing fingerprint label `vN (XXXX)` (SPEC-016 §2).
///
/// `version` supplies the `N` and `code` (normally the output of
/// [`compute_fingerprint`]) supplies the `XXXX`.
///
/// Example: `render_fingerprint(3, "7F2A")` returns `"v3 (7F2A)"`.
pub fn render_fingerprint(version: i32, code: &str) -> String {
    let version_text = version.to_string();
    ["v", version_text.as_str(), " (", code, ")"].concat()
}
|
||||||
|
|
||||||
|
/// Encode `bytes` as lowercase hexadecimal, two characters per byte.
///
/// Implemented by hand so the `hex` crate is not required (the same approach
/// as [`crate::auth::agent_keys`]).
fn hex_encode(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut out = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        // High nibble first, then low nibble.
        out.push(char::from(DIGITS[usize::from(byte >> 4)]));
        out.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    out
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Mint a key and hash it, returning both for verification tests.
    fn minted_with_hash() -> (String, String) {
        let minted = generate_enrollment_key();
        let stored = hash_enrollment_key(&minted).expect("hash");
        (minted, stored)
    }

    #[test]
    fn generated_key_is_prefixed_and_high_entropy() {
        let minted = generate_enrollment_key();
        let expected_len = ENROLLMENT_KEY_PREFIX.len() + ENROLLMENT_KEY_RANDOM_BYTES * 2;

        assert!(minted.starts_with(ENROLLMENT_KEY_PREFIX));
        assert_eq!(minted.len(), expected_len);
    }

    #[test]
    fn generated_keys_are_unique() {
        let first = generate_enrollment_key();
        let second = generate_enrollment_key();
        assert_ne!(first, second);
    }

    #[test]
    fn hash_and_verify_roundtrip() {
        let (minted, stored) = minted_with_hash();
        assert!(verify_enrollment_key(&minted, &stored));
    }

    #[test]
    fn verify_rejects_wrong_key() {
        let (_minted, stored) = minted_with_hash();
        let unrelated = generate_enrollment_key();
        assert!(!verify_enrollment_key(&unrelated, &stored));
    }

    #[test]
    fn verify_rejects_unprefixed_input_without_touching_kdf() {
        let (_minted, stored) = minted_with_hash();
        // Input without the cek_ prefix is turned away by the structural check.
        assert!(!verify_enrollment_key("not-a-key", &stored));
    }

    #[test]
    fn verify_rejects_malformed_stored_hash() {
        let minted = generate_enrollment_key();
        // An unparseable stored hash must yield a rejection, not a panic.
        assert!(!verify_enrollment_key(&minted, "not-a-phc-hash"));
    }

    #[test]
    fn fingerprint_is_stable_uppercase_hex_of_expected_len() {
        let sample = "cek_deadbeef";
        let first = compute_fingerprint(sample);
        let second = compute_fingerprint(sample);

        assert_eq!(first, second);
        assert_eq!(first.len(), FINGERPRINT_HEX_LEN);
        assert!(first.chars().all(|c| c.is_ascii_hexdigit()));
        assert_eq!(first, first.to_ascii_uppercase());
    }

    #[test]
    fn fingerprint_differs_per_key() {
        let left = compute_fingerprint("cek_aaa");
        let right = compute_fingerprint("cek_bbb");
        assert_ne!(left, right);
    }

    #[test]
    fn render_fingerprint_matches_spec_shape() {
        let label = render_fingerprint(3, "7F2A");
        assert_eq!(label, "v3 (7F2A)");
    }
}
|
||||||
@@ -4,6 +4,7 @@
|
|||||||
//! validation for agents.
|
//! validation for agents.
|
||||||
|
|
||||||
pub mod agent_keys;
|
pub mod agent_keys;
|
||||||
|
pub mod enrollment_keys;
|
||||||
pub mod jwt;
|
pub mod jwt;
|
||||||
pub mod password;
|
pub mod password;
|
||||||
pub mod token_blacklist;
|
pub mod token_blacklist;
|
||||||
|
|||||||
141
server/src/db/enrollment_keys.rs
Normal file
141
server/src/db/enrollment_keys.rs
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
//! Per-site enrollment key database operations (SPEC-016 zero-touch enrollment).
|
||||||
|
//!
|
||||||
|
//! Backs the `site_enrollment_keys` table (migration 010). Stores ONLY the
|
||||||
|
//! Argon2id hash of the `cek_` secret plus the non-secret rotation metadata
|
||||||
|
//! (version, fingerprint, active flag). Computing the hash and minting the
|
||||||
|
//! plaintext is [`crate::auth::enrollment_keys`]'s job; this module is
|
||||||
|
//! hash-agnostic persistence and takes already-hashed values.
|
||||||
|
//!
|
||||||
|
//! Rotation invariant: at most one `active` row per site (enforced by a partial
|
||||||
|
//! unique index in migration 010). [`rotate_key`] deactivates the current active
|
||||||
|
//! row and inserts a new active one inside a single transaction so the invariant
|
||||||
|
//! is never transiently violated.
|
||||||
|
//!
|
||||||
|
//! All queries use runtime `sqlx::query()` / `sqlx::query_as()` per the codebase
|
||||||
|
//! convention (no compile-time `query!` macros, no `.sqlx` offline cache).
|
||||||
|
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
/// Per-site enrollment key record.
///
/// `key_hash` is the only representation of the secret the server stores; the
/// plaintext is shown once at issue/rotate and never persisted.
///
/// NOTE(review): `Serialize` is derived, so `key_hash` would be emitted if this
/// struct were ever serialized directly into an API response — confirm callers
/// map it to a redacted view first.
#[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)]
pub struct EnrollmentKey {
    /// Row identifier of this key record.
    pub id: Uuid,
    /// Site this key belongs to.
    pub site_id: Uuid,
    /// Hash of the enrollment secret, stored exactly as the caller supplies it
    /// (this module never hashes).
    pub key_hash: String,
    /// Rotation version: `insert_initial_key` writes 1, `rotate_key` writes the
    /// site's current maximum version plus one.
    pub version: i32,
    /// Non-secret fingerprint supplied by the caller alongside the hash.
    pub fingerprint: String,
    /// Whether this is the site's current key; `rotate_key` clears the flag on
    /// the previously active row.
    pub active: bool,
    /// Creation timestamp. Not written by the INSERTs in this module —
    /// presumably a column default; confirm against migration 010.
    pub created_at: DateTime<Utc>,
    /// Set to `NOW()` by `rotate_key` when it deactivates this row.
    pub rotated_at: Option<DateTime<Utc>>,
}
|
||||||
|
|
||||||
|
/// Fetch the active enrollment key for a site, if any.
|
||||||
|
///
|
||||||
|
/// This is the `/api/enroll` hot path: resolve the one active key whose hash the
|
||||||
|
/// presented `cek_` is verified against. The partial unique index guarantees at
|
||||||
|
/// most one active row, so `fetch_optional` is correct.
|
||||||
|
pub async fn get_active_for_site(
|
||||||
|
pool: &PgPool,
|
||||||
|
site_id: Uuid,
|
||||||
|
) -> Result<Option<EnrollmentKey>, sqlx::Error> {
|
||||||
|
sqlx::query_as::<_, EnrollmentKey>(
|
||||||
|
r#"
|
||||||
|
SELECT id, site_id, key_hash, version, fingerprint, active, created_at, rotated_at
|
||||||
|
FROM site_enrollment_keys
|
||||||
|
WHERE site_id = $1 AND active
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(site_id)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Insert the FIRST enrollment key for a site at version 1 (initial issue).
|
||||||
|
///
|
||||||
|
/// Use [`rotate_key`] for subsequent rotations. Errors with a unique violation if
|
||||||
|
/// the site already has an active key (the caller should rotate instead).
|
||||||
|
#[allow(dead_code)] // Wired by site-admin issue flow; Phase A exposes rotation (which also covers first issue when none exists).
|
||||||
|
pub async fn insert_initial_key(
|
||||||
|
pool: &PgPool,
|
||||||
|
site_id: Uuid,
|
||||||
|
key_hash: &str,
|
||||||
|
fingerprint: &str,
|
||||||
|
) -> Result<EnrollmentKey, sqlx::Error> {
|
||||||
|
sqlx::query_as::<_, EnrollmentKey>(
|
||||||
|
r#"
|
||||||
|
INSERT INTO site_enrollment_keys (site_id, key_hash, version, fingerprint, active)
|
||||||
|
VALUES ($1, $2, 1, $3, true)
|
||||||
|
RETURNING id, site_id, key_hash, version, fingerprint, active, created_at, rotated_at
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(site_id)
|
||||||
|
.bind(key_hash)
|
||||||
|
.bind(fingerprint)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Rotate a site's enrollment key (SPEC-016 §2): deactivate the current active key
/// (if any) and insert a new active key at the next monotonic version, all in one
/// transaction.
///
/// Returns the newly-created active key. If the site has no key yet, this issues
/// version 1 (so rotation also serves as first-issue). The caller passes the
/// already-hashed new secret and its fingerprint; the plaintext is surfaced once by
/// the caller and never reaches this layer.
///
/// The transaction is what keeps the "at most one active key per site" invariant
/// (partial unique index) from being transiently violated between the UPDATE and
/// the INSERT.
///
/// # Errors
///
/// Returns the `sqlx::Error` from any of the three statements or from the commit.
/// Every failure path returns via `?` before `tx.commit()`, so nothing is
/// committed (sqlx documents that an uncommitted `Transaction` rolls back on drop).
///
/// NOTE(review): the `MAX(version)` read takes no lock, so two concurrent
/// rotations for the same site can compute the same `next_version`. Presumably the
/// partial unique index then rejects the second INSERT with a unique violation —
/// confirm callers surface that sensibly (e.g. as a retryable conflict).
pub async fn rotate_key(
    pool: &PgPool,
    site_id: Uuid,
    new_key_hash: &str,
    new_fingerprint: &str,
) -> Result<EnrollmentKey, sqlx::Error> {
    let mut tx = pool.begin().await?;

    // Highest existing version for this site (NULL -> 0 so the first key is v1).
    // MAX() counts inactive rows too, so versions keep increasing across rotations.
    let current_max: Option<i32> =
        sqlx::query_scalar("SELECT MAX(version) FROM site_enrollment_keys WHERE site_id = $1")
            .bind(site_id)
            .fetch_one(&mut *tx)
            .await?;
    let next_version = current_max.unwrap_or(0) + 1;

    // Deactivate the current active key (if any), stamping rotated_at.
    // Must run before the INSERT below: the new row is inserted with active = true.
    sqlx::query(
        r#"
        UPDATE site_enrollment_keys
        SET active = false, rotated_at = NOW()
        WHERE site_id = $1 AND active
        "#,
    )
    .bind(site_id)
    .execute(&mut *tx)
    .await?;

    // Insert the new active key at the next version.
    let new_key = sqlx::query_as::<_, EnrollmentKey>(
        r#"
        INSERT INTO site_enrollment_keys (site_id, key_hash, version, fingerprint, active)
        VALUES ($1, $2, $3, $4, true)
        RETURNING id, site_id, key_hash, version, fingerprint, active, created_at, rotated_at
        "#,
    )
    .bind(site_id)
    .bind(new_key_hash)
    .bind(next_version)
    .bind(new_fingerprint)
    .fetch_one(&mut *tx)
    .await?;

    // Publish the deactivate + insert together.
    tx.commit().await?;
    Ok(new_key)
}
|
||||||
@@ -69,6 +69,40 @@ impl EventTypes {
|
|||||||
pub const MACHINE_REMOVED: &'static str = "machine_removed";
|
pub const MACHINE_REMOVED: &'static str = "machine_removed";
|
||||||
/// An administrator soft-deleted (purged) a session and dropped it in-memory.
|
/// An administrator soft-deleted (purged) a session and dropped it in-memory.
|
||||||
pub const SESSION_REMOVED: &'static str = "session_removed";
|
pub const SESSION_REMOVED: &'static str = "session_removed";
|
||||||
|
|
||||||
|
// Zero-touch enrollment events (SPEC-016). Written by POST /api/enroll and the
|
||||||
|
// site enrollment-key rotation endpoint. These carry no session, so they are
|
||||||
|
// logged via `log_enrollment_event` with `session_id = NULL`; the structured
|
||||||
|
// detail (machine_uid, site_code, fingerprint, etc.) goes in `details` and the
|
||||||
|
// source IP in `ip_address`.
|
||||||
|
/// A new machine self-registered at a site and was minted its first `cak_`.
|
||||||
|
pub const ENROLL_NEW: &'static str = "enroll_new";
|
||||||
|
/// An existing machine_uid re-enrolled at the SAME site — the row was reused and
|
||||||
|
/// a fresh `cak_` minted (re-image / re-install).
|
||||||
|
pub const ENROLL_REUSE: &'static str = "enroll_reuse";
|
||||||
|
/// An existing machine_uid enrolled under a DIFFERENT site — the machine's site
|
||||||
|
/// binding was updated (a "site move"). Fires an alert.
|
||||||
|
///
|
||||||
|
/// NOTE (SPEC-016 Phase A): the unauthenticated enroll path does NOT perform this
|
||||||
|
/// move — a cross-site enroll is REFUSED (`ENROLL_SITE_CONFLICT`) rather than
|
||||||
|
/// silently repointing the machine. This event is reserved for the deliberate
|
||||||
|
/// Phase-B `--reassign` flow (and the dashboard move action) that supersede it.
|
||||||
|
#[allow(dead_code)] // reserved for Phase-B --reassign; not emitted by Phase A enroll
|
||||||
|
pub const ENROLL_SITE_MOVE: &'static str = "enroll_site_move";
|
||||||
|
/// An existing machine_uid presented a valid key for a DIFFERENT site than the one
|
||||||
|
/// the machine is currently bound to. Phase A REFUSES this (no move, no key minted)
|
||||||
|
/// as the accidental-move / cross-site-hijack guard; the deliberate move arrives
|
||||||
|
/// with the Phase-B `--reassign` flow + dashboard. Fires an alert.
|
||||||
|
pub const ENROLL_SITE_CONFLICT: &'static str = "enroll_site_conflict";
|
||||||
|
/// A machine_uid collision was detected at enroll — the endpoint dropped to
|
||||||
|
/// `pending` and awaits operator confirmation in the dashboard. Fires an alert.
|
||||||
|
pub const ENROLL_COLLISION_PENDING: &'static str = "enroll_collision_pending";
|
||||||
|
/// An enroll attempt failed enrollment-key verification (wrong/inactive key or
|
||||||
|
/// unknown site_code). Security audit trail for the open-registration surface.
|
||||||
|
pub const ENROLL_REJECTED: &'static str = "enroll_rejected";
|
||||||
|
/// An administrator rotated a site's enrollment key (new version + fingerprint;
|
||||||
|
/// old installers can no longer enroll NEW machines).
|
||||||
|
pub const ENROLLMENT_KEY_ROTATED: &'static str = "enrollment_key_rotated";
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Log a session event
|
/// Log a session event
|
||||||
@@ -154,6 +188,42 @@ pub async fn log_admin_removal(
|
|||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Log a zero-touch enrollment audit event (SPEC-016).
|
||||||
|
///
|
||||||
|
/// Shares the `connect_session_events` audit table but carries no session
|
||||||
|
/// (`session_id = NULL`, the FK column is nullable) and no viewer — enrollment is
|
||||||
|
/// an unauthenticated agent action, not a viewer/session event. The structured
|
||||||
|
/// detail (machine_uid, site_code, fingerprint version, decision, etc.) goes in
|
||||||
|
/// `details` and the agent's source IP in `ip_address`.
|
||||||
|
///
|
||||||
|
/// Best-effort: a failure to write the audit row must NOT fail the enroll (the
|
||||||
|
/// machine row and `cak_` already exist); the caller logs the error and proceeds,
|
||||||
|
/// matching how the relay and Task-5 removal treat audit writes.
|
||||||
|
pub async fn log_enrollment_event(
|
||||||
|
pool: &PgPool,
|
||||||
|
event_type: &str,
|
||||||
|
details: JsonValue,
|
||||||
|
ip_address: Option<IpAddr>,
|
||||||
|
) -> Result<i64, sqlx::Error> {
|
||||||
|
let ip_str = ip_address.map(|ip| ip.to_string());
|
||||||
|
|
||||||
|
let result = sqlx::query_scalar::<_, i64>(
|
||||||
|
r#"
|
||||||
|
INSERT INTO connect_session_events
|
||||||
|
(session_id, event_type, viewer_id, viewer_name, details, ip_address)
|
||||||
|
VALUES (NULL, $1, NULL, NULL, $2, $3::inet)
|
||||||
|
RETURNING id
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(event_type)
|
||||||
|
.bind(details)
|
||||||
|
.bind(ip_str)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
|
||||||
/// Get events for a session
|
/// Get events for a session
|
||||||
#[allow(dead_code)] // TODO(native-remote-control): consumed by the integration API; see docs/specs/native-remote-control/
|
#[allow(dead_code)] // TODO(native-remote-control): consumed by the integration API; see docs/specs/native-remote-control/
|
||||||
pub async fn get_session_events(
|
pub async fn get_session_events(
|
||||||
|
|||||||
@@ -64,6 +64,16 @@ pub struct Machine {
|
|||||||
/// history) is retained. NULL = live. Nullable, so it is read NULL-tolerantly
|
/// history) is retained. NULL = live. Nullable, so it is read NULL-tolerantly
|
||||||
/// in the manual `FromRow` below.
|
/// in the manual `FromRow` below.
|
||||||
pub deleted_at: Option<DateTime<Utc>>,
|
pub deleted_at: Option<DateTime<Utc>>,
|
||||||
|
/// Relational site binding for a machine enrolled via `/api/enroll` (SPEC-016,
|
||||||
|
/// migration 010). NULL for legacy / support-code / connect-path machines that
|
||||||
|
/// never enrolled through the zero-touch flow. A change of this on re-enroll is
|
||||||
|
/// the "site move" the enroll path audits.
|
||||||
|
pub site_id: Option<Uuid>,
|
||||||
|
/// Collision-gate state (SPEC-016, migration 010): `'active'` (live, auto-approve)
|
||||||
|
/// or `'pending'` (a machine_uid collision was detected at enroll; awaiting
|
||||||
|
/// operator confirmation before the endpoint may be controlled). Non-null with a
|
||||||
|
/// default of `'active'`; read NULL-tolerantly below for defense in depth.
|
||||||
|
pub enrollment_state: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'r> FromRow<'r, PgRow> for Machine {
|
impl<'r> FromRow<'r, PgRow> for Machine {
|
||||||
@@ -83,6 +93,13 @@ impl<'r> FromRow<'r, PgRow> for Machine {
|
|||||||
machine_uid: row.try_get("machine_uid")?,
|
machine_uid: row.try_get("machine_uid")?,
|
||||||
// Schema-nullable (migration 009); decode directly as Option.
|
// Schema-nullable (migration 009); decode directly as Option.
|
||||||
deleted_at: row.try_get("deleted_at")?,
|
deleted_at: row.try_get("deleted_at")?,
|
||||||
|
// Schema-nullable (migration 010); decode directly as Option.
|
||||||
|
site_id: row.try_get("site_id")?,
|
||||||
|
// Non-null with default 'active' (migration 010); read NULL-tolerantly
|
||||||
|
// (older snapshots / partial rows) and fall back to 'active'.
|
||||||
|
enrollment_state: row
|
||||||
|
.try_get::<Option<String>, _>("enrollment_state")?
|
||||||
|
.unwrap_or_else(|| "active".to_string()),
|
||||||
// Nullable-with-default columns mapped to non-`Option` Rust types: read as
|
// Nullable-with-default columns mapped to non-`Option` Rust types: read as
|
||||||
// `Option<T>` and fall back to the type default so a NULL cell never errors.
|
// `Option<T>` and fall back to the type default so a NULL cell never errors.
|
||||||
is_elevated: row
|
is_elevated: row
|
||||||
@@ -166,7 +183,7 @@ pub async fn upsert_machine(
|
|||||||
r#"
|
r#"
|
||||||
INSERT INTO connect_machines (agent_id, hostname, is_persistent, status, last_seen, machine_uid)
|
INSERT INTO connect_machines (agent_id, hostname, is_persistent, status, last_seen, machine_uid)
|
||||||
VALUES ($1, $2, $3, 'online', NOW(), $4)
|
VALUES ($1, $2, $3, 'online', NOW(), $4)
|
||||||
ON CONFLICT (machine_uid) DO UPDATE SET
|
ON CONFLICT (machine_uid) WHERE machine_uid IS NOT NULL DO UPDATE SET
|
||||||
agent_id = EXCLUDED.agent_id,
|
agent_id = EXCLUDED.agent_id,
|
||||||
hostname = EXCLUDED.hostname,
|
hostname = EXCLUDED.hostname,
|
||||||
status = 'online',
|
status = 'online',
|
||||||
@@ -207,6 +224,131 @@ pub async fn upsert_machine(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Find a machine by the SPEC-016 per-tenant dedup key `(tenant_id, machine_uid)`.
|
||||||
|
///
|
||||||
|
/// This is the enroll-time dedup lookup: the same hardware re-enrolling (re-image /
|
||||||
|
/// re-install) resolves to its existing row within the tenant, while the same
|
||||||
|
/// hardware in a DIFFERENT tenant is a distinct row (resolved-decision #4). Tenant
|
||||||
|
/// scoping uses the same default-tenant fold as the unique index so the lookup
|
||||||
|
/// matches the uniqueness guarantee.
|
||||||
|
///
|
||||||
|
/// Unlike `get_machine_by_agent_id`, this deliberately does NOT filter
|
||||||
|
/// `deleted_at IS NULL`: a previously operator-purged machine that legitimately
|
||||||
|
/// re-enrolls must be found so the enroll path can revive it (clearing
|
||||||
|
/// `deleted_at`), mirroring the connect-path revive in `upsert_machine`.
|
||||||
|
pub async fn get_machine_by_tenant_uid(
|
||||||
|
pool: &PgPool,
|
||||||
|
tenant_id: Uuid,
|
||||||
|
machine_uid: &str,
|
||||||
|
) -> Result<Option<Machine>, sqlx::Error> {
|
||||||
|
sqlx::query_as::<_, Machine>(
|
||||||
|
r#"
|
||||||
|
SELECT * FROM connect_machines
|
||||||
|
WHERE machine_uid = $1
|
||||||
|
AND COALESCE(tenant_id, '00000000-0000-0000-0000-000000000001'::uuid) = $2
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(machine_uid)
|
||||||
|
.bind(tenant_id)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parameters for an enroll-time machine create/update (SPEC-016 `/api/enroll`).
///
/// `agent_id` is a freshly minted opaque id for a NEW enrollment (the agent's
/// config UUID story is Phase B; the server only needs a unique non-null value for
/// the `agent_id UNIQUE` column). On REUSE/MOVE the existing row's `agent_id` is
/// preserved (the FK target of any already-minted `cak_`), so the update path does
/// not touch it.
pub struct EnrollMachineParams<'a> {
    /// Written to `connect_machines.agent_id` by `insert_enrolled_machine` only;
    /// `update_enrolled_machine` does not bind it.
    pub agent_id: &'a str,
    /// Hostname written on both insert and update.
    pub hostname: &'a str,
    /// Per-tenant dedup key; written on insert only.
    pub machine_uid: &'a str,
    /// Owning tenant; written on insert only.
    pub tenant_id: Uuid,
    /// Relational site binding; written on both insert and update.
    pub site_id: Uuid,
    /// Company label (-> connect_machines.organization).
    pub company: Option<&'a str>,
    /// Site label (-> connect_machines.site) — the free-text label, distinct from
    /// the relational site_id binding.
    pub site_label: Option<&'a str>,
    /// Tags for the machine. On update an empty slice leaves the stored tags
    /// unchanged; on insert the slice is stored as given.
    pub tags: &'a [String],
    /// 'active' (auto-approve) or 'pending' (collision-gated).
    pub enrollment_state: &'a str,
}
|
||||||
|
|
||||||
|
/// Insert a NEW machine row for a first-time enrollment (SPEC-016).
|
||||||
|
///
|
||||||
|
/// Carries the labels, the relational `site_id`, the per-tenant `machine_uid`, and
|
||||||
|
/// the collision-gate `enrollment_state`. Persistent + online. Returns the created
|
||||||
|
/// row (its `id` is the FK target for the `cak_` the caller mints next).
|
||||||
|
pub async fn insert_enrolled_machine(
|
||||||
|
pool: &PgPool,
|
||||||
|
p: &EnrollMachineParams<'_>,
|
||||||
|
) -> Result<Machine, sqlx::Error> {
|
||||||
|
sqlx::query_as::<_, Machine>(
|
||||||
|
r#"
|
||||||
|
INSERT INTO connect_machines
|
||||||
|
(agent_id, hostname, is_persistent, status, last_seen, machine_uid,
|
||||||
|
tenant_id, site_id, organization, site, tags, enrollment_state)
|
||||||
|
VALUES ($1, $2, true, 'online', NOW(), $3, $4, $5, $6, $7, $8, $9)
|
||||||
|
RETURNING *
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(p.agent_id)
|
||||||
|
.bind(p.hostname)
|
||||||
|
.bind(p.machine_uid)
|
||||||
|
.bind(p.tenant_id)
|
||||||
|
.bind(p.site_id)
|
||||||
|
.bind(p.company)
|
||||||
|
.bind(p.site_label)
|
||||||
|
.bind(p.tags)
|
||||||
|
.bind(p.enrollment_state)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Update an EXISTING machine row on re-enroll / reuse / site-move (SPEC-016).
|
||||||
|
///
|
||||||
|
/// Refreshes hostname, site binding (`site_id`), labels, and `enrollment_state`,
|
||||||
|
/// and revives a soft-deleted row (`deleted_at = NULL`) — a re-enroll of a purged
|
||||||
|
/// host means it is live again, mirroring `upsert_machine`'s revive. Deliberately
|
||||||
|
/// does NOT change `agent_id`: the existing id is the FK target of any prior `cak_`.
|
||||||
|
/// Labels are COALESCE-merged so an enroll that omits a label does not wipe an
|
||||||
|
/// existing value; `tags` is overwritten only when a non-empty set is supplied
|
||||||
|
/// (matching `update_machine_metadata`'s convention).
|
||||||
|
pub async fn update_enrolled_machine(
|
||||||
|
pool: &PgPool,
|
||||||
|
machine_id: Uuid,
|
||||||
|
p: &EnrollMachineParams<'_>,
|
||||||
|
) -> Result<Machine, sqlx::Error> {
|
||||||
|
sqlx::query_as::<_, Machine>(
|
||||||
|
r#"
|
||||||
|
UPDATE connect_machines SET
|
||||||
|
hostname = $2,
|
||||||
|
site_id = $3,
|
||||||
|
organization = COALESCE($4, organization),
|
||||||
|
site = COALESCE($5, site),
|
||||||
|
tags = CASE WHEN $6::text[] = '{}' THEN tags ELSE $6 END,
|
||||||
|
enrollment_state = $7,
|
||||||
|
status = 'online',
|
||||||
|
last_seen = NOW(),
|
||||||
|
deleted_at = NULL
|
||||||
|
WHERE id = $1
|
||||||
|
RETURNING *
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(machine_id)
|
||||||
|
.bind(p.hostname)
|
||||||
|
.bind(p.site_id)
|
||||||
|
.bind(p.company)
|
||||||
|
.bind(p.site_label)
|
||||||
|
.bind(p.tags)
|
||||||
|
.bind(p.enrollment_state)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
/// Update machine status and info
|
/// Update machine status and info
|
||||||
#[allow(dead_code)] // TODO(native-remote-control): consumed by the integration API; see docs/specs/native-remote-control/
|
#[allow(dead_code)] // TODO(native-remote-control): consumed by the integration API; see docs/specs/native-remote-control/
|
||||||
pub async fn update_machine_status(
|
pub async fn update_machine_status(
|
||||||
|
|||||||
@@ -4,10 +4,12 @@
|
|||||||
//! Optional - server works without database if DATABASE_URL not set.
|
//! Optional - server works without database if DATABASE_URL not set.
|
||||||
|
|
||||||
pub mod agent_keys;
|
pub mod agent_keys;
|
||||||
|
pub mod enrollment_keys;
|
||||||
pub mod events;
|
pub mod events;
|
||||||
pub mod machines;
|
pub mod machines;
|
||||||
pub mod releases;
|
pub mod releases;
|
||||||
pub mod sessions;
|
pub mod sessions;
|
||||||
|
pub mod sites;
|
||||||
pub mod support_codes;
|
pub mod support_codes;
|
||||||
pub mod tenancy;
|
pub mod tenancy;
|
||||||
pub mod users;
|
pub mod users;
|
||||||
|
|||||||
94
server/src/db/sites.rs
Normal file
94
server/src/db/sites.rs
Normal file
@@ -0,0 +1,94 @@
|
|||||||
|
//! Site database operations (SPEC-016 zero-touch enrollment).
|
||||||
|
//!
|
||||||
|
//! Backs the `connect_sites` table (migration 010): the relational anchor a
|
||||||
|
//! per-site enrollment key hangs off and the `/api/enroll` flow resolves by
|
||||||
|
//! `site_code`. See the migration header for why this table exists (the prior
|
||||||
|
//! schema modeled "site" only as a free-text column on `connect_machines`).
|
||||||
|
//!
|
||||||
|
//! All queries use runtime `sqlx::query()` / `sqlx::query_as()` per the codebase
|
||||||
|
//! convention (no compile-time `query!` macros, no `.sqlx` offline cache).
|
||||||
|
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use sqlx::PgPool;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
/// Site record from the database.
///
/// Mirrors the columns selected from `connect_sites` by the queries in this
/// module.
#[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)]
pub struct Site {
    /// Row identifier; the lookup key of `get_site_by_id`.
    pub id: Uuid,
    /// Operator-facing code; the lookup key of `get_site_by_code`.
    pub site_code: String,
    /// Optional display name for the site.
    pub display_name: Option<String>,
    /// Optional company label for the site.
    pub company: Option<String>,
    /// Owning tenant. `get_site_by_code` treats NULL as the default tenant
    /// (`00000000-0000-0000-0000-000000000001`).
    pub tenant_id: Option<Uuid>,
    /// RESERVED for future per-site enrollment POLICY work (SPEC-016 §out-of-scope).
    /// Not enforced in Phase A.
    pub enrollment_policy: Option<String>,
    /// Creation timestamp. Not written by `insert_site` — presumably a column
    /// default; confirm against migration 010.
    pub created_at: DateTime<Utc>,
}
|
||||||
|
|
||||||
|
/// Resolve a site by its operator-facing `site_code`, scoped to the given tenant.
|
||||||
|
///
|
||||||
|
/// Tenant scoping uses the same default-tenant fold as the unique index so the
|
||||||
|
/// lookup matches the uniqueness guarantee: `(COALESCE(tenant_id, default),
|
||||||
|
/// site_code)`. Returns `None` if no site with that code exists in the tenant.
|
||||||
|
pub async fn get_site_by_code(
|
||||||
|
pool: &PgPool,
|
||||||
|
site_code: &str,
|
||||||
|
tenant_id: Uuid,
|
||||||
|
) -> Result<Option<Site>, sqlx::Error> {
|
||||||
|
sqlx::query_as::<_, Site>(
|
||||||
|
r#"
|
||||||
|
SELECT id, site_code, display_name, company, tenant_id, enrollment_policy, created_at
|
||||||
|
FROM connect_sites
|
||||||
|
WHERE site_code = $1
|
||||||
|
AND COALESCE(tenant_id, '00000000-0000-0000-0000-000000000001'::uuid) = $2
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(site_code)
|
||||||
|
.bind(tenant_id)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Fetch a site by its primary-key UUID.
|
||||||
|
pub async fn get_site_by_id(pool: &PgPool, id: Uuid) -> Result<Option<Site>, sqlx::Error> {
|
||||||
|
sqlx::query_as::<_, Site>(
|
||||||
|
r#"
|
||||||
|
SELECT id, site_code, display_name, company, tenant_id, enrollment_policy, created_at
|
||||||
|
FROM connect_sites
|
||||||
|
WHERE id = $1
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(id)
|
||||||
|
.fetch_optional(pool)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Insert a new site, returning the created row.
|
||||||
|
///
|
||||||
|
/// `tenant_id` is `None`-tolerant and resolved via `db::tenancy::current_tenant_id()`
|
||||||
|
/// at the call site. Errors with a unique-violation if `(tenant, site_code)` already
|
||||||
|
/// exists (the caller maps that to a 409).
|
||||||
|
#[allow(dead_code)] // Wired by the site-admin API (dashboard site CRUD); Phase A exposes key rotation, not site CRUD.
|
||||||
|
pub async fn insert_site(
|
||||||
|
pool: &PgPool,
|
||||||
|
site_code: &str,
|
||||||
|
display_name: Option<&str>,
|
||||||
|
company: Option<&str>,
|
||||||
|
tenant_id: Option<Uuid>,
|
||||||
|
) -> Result<Site, sqlx::Error> {
|
||||||
|
sqlx::query_as::<_, Site>(
|
||||||
|
r#"
|
||||||
|
INSERT INTO connect_sites (site_code, display_name, company, tenant_id)
|
||||||
|
VALUES ($1, $2, $3, $4)
|
||||||
|
RETURNING id, site_code, display_name, company, tenant_id, enrollment_policy, created_at
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.bind(site_code)
|
||||||
|
.bind(display_name)
|
||||||
|
.bind(company)
|
||||||
|
.bind(tenant_id)
|
||||||
|
.fetch_one(pool)
|
||||||
|
.await
|
||||||
|
}
|
||||||
@@ -448,6 +448,11 @@ async fn main() -> Result<()> {
|
|||||||
)),
|
)),
|
||||||
)
|
)
|
||||||
.route("/api/codes/:code/cancel", post(cancel_code))
|
.route("/api/codes/:code/cancel", post(cancel_code))
|
||||||
|
// Zero-touch enrollment (SPEC-016). PUBLIC: no JWT — the per-site enrollment
|
||||||
|
// key in the body is the gate, and the handler applies its own
|
||||||
|
// per-(site_code, IP) rate limit / lockout (defense-in-depth). Mounted with
|
||||||
|
// the other public API routes.
|
||||||
|
.route("/api/enroll", post(api::enroll::enroll))
|
||||||
// WebSocket endpoints
|
// WebSocket endpoints
|
||||||
.route("/ws/agent", get(relay::agent_ws_handler))
|
.route("/ws/agent", get(relay::agent_ws_handler))
|
||||||
.route("/ws/viewer", get(relay::viewer_ws_handler))
|
.route("/ws/viewer", get(relay::viewer_ws_handler))
|
||||||
@@ -498,6 +503,18 @@ async fn main() -> Result<()> {
|
|||||||
"/api/machines/:agent_id/keys/:key_id",
|
"/api/machines/:agent_id/keys/:key_id",
|
||||||
delete(api::machine_keys::revoke_key),
|
delete(api::machine_keys::revoke_key),
|
||||||
)
|
)
|
||||||
|
// Per-site enrollment key administration (SPEC-016, admin-only / JWT).
|
||||||
|
// Rotate regenerates the cek_ secret + fingerprint (old installers can no
|
||||||
|
// longer enroll new machines); GET returns the current non-secret
|
||||||
|
// fingerprint/version. Both gated by the AdminUser extractor.
|
||||||
|
.route(
|
||||||
|
"/api/sites/:id/enrollment-key",
|
||||||
|
get(api::sites::get_enrollment_key),
|
||||||
|
)
|
||||||
|
.route(
|
||||||
|
"/api/sites/:id/enrollment-key/rotate",
|
||||||
|
post(api::sites::rotate_enrollment_key),
|
||||||
|
)
|
||||||
// REST API - Releases and Version
|
// REST API - Releases and Version
|
||||||
.route("/api/version", get(api::releases::get_version)) // No auth - for agent polling
|
.route("/api/version", get(api::releases::get_version)) // No auth - for agent polling
|
||||||
.route("/api/releases", get(api::releases::list_releases))
|
.route("/api/releases", get(api::releases::list_releases))
|
||||||
|
|||||||
@@ -77,6 +77,19 @@ pub const CODE_VALIDATE_MAX_FAILURES: u32 = 10;
|
|||||||
/// Support-code validate: how long an IP stays locked out once tripped.
|
/// Support-code validate: how long an IP stays locked out once tripped.
|
||||||
pub const CODE_VALIDATE_LOCKOUT: Duration = Duration::from_secs(15 * 60);
|
pub const CODE_VALIDATE_LOCKOUT: Duration = Duration::from_secs(15 * 60);
|
||||||
|
|
||||||
|
/// Enroll (`POST /api/enroll`, SPEC-016): window length (60 seconds).
pub const ENROLL_WINDOW: Duration = Duration::from_secs(60);
/// Enroll: max requests per window per `(site_code, IP)`. A zero-touch site push
/// drives N machines through enroll near-simultaneously, so this is generous
/// (mass-deploy friendly) while still capping a runaway loop. Defense-in-depth: the
/// 256-bit enrollment key is the load-bearing gate, not this cap.
pub const ENROLL_MAX_PER_WINDOW: u32 = 60;
/// Enroll: consecutive FAILED enroll attempts (bad/inactive key, unknown site) from
/// one `(site_code, IP)` that trip the lockout.
pub const ENROLL_MAX_FAILURES: u32 = 20;
/// Enroll: how long a `(site_code, IP)` stays locked out once tripped (15 minutes).
pub const ENROLL_LOCKOUT: Duration = Duration::from_secs(15 * 60);
|
||||||
|
|
||||||
/// Hard cap on the number of distinct IPs tracked by any single limiter map.
|
/// Hard cap on the number of distinct IPs tracked by any single limiter map.
|
||||||
/// Prevents an IP-rotating attacker from growing memory without bound. When the
|
/// Prevents an IP-rotating attacker from growing memory without bound. When the
|
||||||
/// cap is hit, the oldest-windowed entries are pruned. Generous for a real MSP
|
/// cap is hit, the oldest-windowed entries are pruned. Generous for a real MSP
|
||||||
@@ -260,6 +273,150 @@ impl FailureLockout {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// Composite-key limiter for enrollment (keyed by (site_code, IP)) — SPEC-016
|
||||||
|
// ============================================================================
|
||||||
|
//
|
||||||
|
// The login / change-password / code-validate limiters above key purely on IP.
|
||||||
|
// SPEC-016 §3 wants the enroll defense keyed on `(site_code, source-IP)` so a noisy
|
||||||
|
// site push from one office IP cannot lock out a different site enrolling from the
|
||||||
|
// same egress IP. Rather than overload the IP-only maps, this is a small dedicated
|
||||||
|
// composite-key limiter + lockout. It is invoked from the enroll HANDLER (not a
|
||||||
|
// `from_fn` layer) because the `site_code` lives in the JSON body, which a
|
||||||
|
// pre-handler middleware cannot read without consuming it. Documented as
|
||||||
|
// defense-in-depth: the 256-bit enrollment key is the real gate.
|
||||||
|
|
||||||
|
/// Composite limiter key: the site_code and the real client IP.
|
||||||
|
type EnrollKey = (String, IpAddr);
|
||||||
|
|
||||||
|
/// Per-`(site_code, IP)` fixed-window limiter + consecutive-failure lockout.
///
/// Combines both protections behind one lock-guarded map so the enroll handler
/// makes a single allow/deny decision and reports success/failure into the same
/// structure. Self-pruning and size-capped, like the IP-only limiters.
///
/// Cloning is cheap and shares state: the map sits behind an `Arc`, so every
/// clone sees the same entries.
#[derive(Clone)]
pub struct EnrollLimiter {
    /// Shared per-key state, guarded by a mutex.
    inner: std::sync::Arc<Mutex<HashMap<EnrollKey, EnrollEntry>>>,
    /// Attempts admitted per key within one window before further ones are denied.
    max_per_window: u32,
    /// Length of the fixed window.
    window: Duration,
    /// Failure count at which a key becomes locked out.
    max_failures: u32,
    /// How long a lockout lasts; also the idle age used when pruning entries.
    cooldown: Duration,
}
|
||||||
|
|
||||||
|
/// Tracked state for one `(site_code, IP)` key in [`EnrollLimiter`].
#[derive(Debug, Clone, Copy)]
struct EnrollEntry {
    /// Start of the current fixed window.
    window_started: Instant,
    /// Attempts admitted in the current window.
    count: u32,
    /// Failures recorded since the last reset.
    failures: u32,
    /// When set, the key is locked out until this instant.
    locked_until: Option<Instant>,
    /// Last time the key was checked or had a failure recorded.
    last_seen: Instant,
}
|
||||||
|
|
||||||
|
impl EnrollLimiter {
|
||||||
|
pub fn new(
|
||||||
|
max_per_window: u32,
|
||||||
|
window: Duration,
|
||||||
|
max_failures: u32,
|
||||||
|
cooldown: Duration,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
inner: std::sync::Arc::new(Mutex::new(HashMap::new())),
|
||||||
|
max_per_window,
|
||||||
|
window,
|
||||||
|
max_failures,
|
||||||
|
cooldown,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn entry_now() -> EnrollEntry {
|
||||||
|
let now = Instant::now();
|
||||||
|
EnrollEntry {
|
||||||
|
window_started: now,
|
||||||
|
count: 0,
|
||||||
|
failures: 0,
|
||||||
|
locked_until: None,
|
||||||
|
last_seen: now,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Admit one enroll attempt for `(site_code, ip)`. Returns `true` if allowed
|
||||||
|
/// (and counts it). Returns `false` if the key is currently locked out OR over
|
||||||
|
/// the per-window request cap. Clock injected for tests.
|
||||||
|
fn check_at(&self, site_code: &str, ip: IpAddr, now: Instant) -> bool {
|
||||||
|
let mut map = self.inner.lock().unwrap_or_else(|e| e.into_inner());
|
||||||
|
|
||||||
|
if map.len() >= MAX_TRACKED_IPS {
|
||||||
|
let window = self.window;
|
||||||
|
let cooldown = self.cooldown;
|
||||||
|
map.retain(|_, e| {
|
||||||
|
e.locked_until.map(|u| now < u).unwrap_or(false)
|
||||||
|
|| now.duration_since(e.window_started) < window
|
||||||
|
|| now.duration_since(e.last_seen) < cooldown
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let key = (site_code.to_string(), ip);
|
||||||
|
let e = map.entry(key).or_insert_with(Self::entry_now);
|
||||||
|
e.last_seen = now;
|
||||||
|
|
||||||
|
// Lockout takes precedence.
|
||||||
|
if let Some(until) = e.locked_until {
|
||||||
|
if now < until {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Cooldown elapsed — clear it for a fresh start.
|
||||||
|
e.locked_until = None;
|
||||||
|
e.failures = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Roll the fixed window forward if elapsed.
|
||||||
|
if now.duration_since(e.window_started) >= self.window {
|
||||||
|
e.window_started = now;
|
||||||
|
e.count = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if e.count >= self.max_per_window {
|
||||||
|
false
|
||||||
|
} else {
|
||||||
|
e.count += 1;
|
||||||
|
true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Admit one enroll attempt (real clock).
|
||||||
|
pub fn check(&self, site_code: &str, ip: IpAddr) -> bool {
|
||||||
|
self.check_at(site_code, ip, Instant::now())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn record_failure_at(&self, site_code: &str, ip: IpAddr, now: Instant) {
|
||||||
|
let mut map = self.inner.lock().unwrap_or_else(|e| e.into_inner());
|
||||||
|
let key = (site_code.to_string(), ip);
|
||||||
|
let e = map.entry(key).or_insert_with(Self::entry_now);
|
||||||
|
e.last_seen = now;
|
||||||
|
e.failures = e.failures.saturating_add(1);
|
||||||
|
if e.failures >= self.max_failures {
|
||||||
|
e.locked_until = Some(now + self.cooldown);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Record a FAILED enroll attempt (bad key / unknown site) for the key,
|
||||||
|
/// tripping the lockout once the streak reaches `max_failures`.
|
||||||
|
pub fn record_failure(&self, site_code: &str, ip: IpAddr) {
|
||||||
|
self.record_failure_at(site_code, ip, Instant::now());
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Record a SUCCESSFUL enroll for the key, resetting its failure streak.
|
||||||
|
pub fn record_success(&self, site_code: &str, ip: IpAddr) {
|
||||||
|
let mut map = self.inner.lock().unwrap_or_else(|e| e.into_inner());
|
||||||
|
let key = (site_code.to_string(), ip);
|
||||||
|
if let Some(e) = map.get_mut(&key) {
|
||||||
|
e.failures = 0;
|
||||||
|
e.locked_until = None;
|
||||||
|
e.last_seen = Instant::now();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// Shared rate-limit state (lives in AppState)
|
// Shared rate-limit state (lives in AppState)
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
@@ -275,6 +432,9 @@ pub struct RateLimitState {
|
|||||||
pub code_validate: RateLimiter,
|
pub code_validate: RateLimiter,
|
||||||
/// Per-IP lockout on repeated failed code validations (brute-force defense).
|
/// Per-IP lockout on repeated failed code validations (brute-force defense).
|
||||||
pub code_validate_lockout: FailureLockout,
|
pub code_validate_lockout: FailureLockout,
|
||||||
|
/// `POST /api/enroll` (SPEC-016): per-`(site_code, IP)` request cap +
|
||||||
|
/// consecutive-failure lockout. Invoked from the enroll handler.
|
||||||
|
pub enroll: EnrollLimiter,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RateLimitState {
|
impl RateLimitState {
|
||||||
@@ -290,6 +450,12 @@ impl RateLimitState {
|
|||||||
CODE_VALIDATE_MAX_FAILURES,
|
CODE_VALIDATE_MAX_FAILURES,
|
||||||
CODE_VALIDATE_LOCKOUT,
|
CODE_VALIDATE_LOCKOUT,
|
||||||
),
|
),
|
||||||
|
enroll: EnrollLimiter::new(
|
||||||
|
ENROLL_MAX_PER_WINDOW,
|
||||||
|
ENROLL_WINDOW,
|
||||||
|
ENROLL_MAX_FAILURES,
|
||||||
|
ENROLL_LOCKOUT,
|
||||||
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -524,4 +690,51 @@ mod tests {
|
|||||||
assert!(lockout.is_locked_at(ip(8), t0));
|
assert!(lockout.is_locked_at(ip(8), t0));
|
||||||
assert!(!lockout.is_locked_at(ip(9), t0)); // ip9 unaffected
|
assert!(!lockout.is_locked_at(ip(9), t0)); // ip9 unaffected
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -- EnrollLimiter (composite (site_code, IP) key) --------------------------
|
||||||
|
|
||||||
|
#[test]
fn enroll_window_allows_up_to_cap_then_blocks() {
    // Cap of 2 per 60s window; the failure threshold is high enough that the
    // lockout path never interferes.
    let limiter = EnrollLimiter::new(2, Duration::from_secs(60), 100, Duration::from_secs(600));
    let start = Instant::now();

    // Three attempts at the same instant: the first two fit the cap, the third does not.
    let outcomes: Vec<bool> = (0..3)
        .map(|_| limiter.check_at("SITE-A", ip(1), start))
        .collect();

    assert_eq!(outcomes, vec![true, true, false]);
}
|
||||||
|
|
||||||
|
#[test]
fn enroll_is_keyed_by_site_and_ip() {
    // Cap of 1 so a second hit on the same key is denied immediately.
    let limiter = EnrollLimiter::new(1, Duration::from_secs(60), 100, Duration::from_secs(600));
    let start = Instant::now();

    let first = limiter.check_at("SITE-A", ip(1), start);
    let repeat = limiter.check_at("SITE-A", ip(1), start);
    assert!(first);
    assert!(!repeat); // same key, over cap

    // Changing the site while keeping the IP lands in a separate bucket.
    let other_site = limiter.check_at("SITE-B", ip(1), start);
    assert!(other_site);

    // Changing the IP while keeping the site lands in a separate bucket too.
    let other_ip = limiter.check_at("SITE-A", ip(2), start);
    assert!(other_ip);
}
|
||||||
|
|
||||||
|
#[test]
fn enroll_lockout_trips_after_failures_and_blocks_check() {
    // Generous request cap (100) so only the 3-failure lockout can deny.
    let limiter = EnrollLimiter::new(100, Duration::from_secs(60), 3, Duration::from_secs(600));
    let start = Instant::now();

    // Two failures: one short of the threshold.
    for _ in 0..2 {
        limiter.record_failure_at("SITE-A", ip(1), start);
    }
    let admitted_before_trip = limiter.check_at("SITE-A", ip(1), start);
    assert!(admitted_before_trip);

    // The third failure trips the lockout.
    limiter.record_failure_at("SITE-A", ip(1), start);

    // Denied despite being far below the request cap.
    let admitted_after_trip = limiter.check_at("SITE-A", ip(1), start);
    assert!(!admitted_after_trip);
}
|
||||||
|
|
||||||
|
#[test]
fn enroll_success_resets_failure_streak() {
    // Lockout would trip at 2 failures if the streak were not reset.
    let limiter = EnrollLimiter::new(100, Duration::from_secs(60), 2, Duration::from_secs(600));
    let start = Instant::now();
    let (site, addr) = ("SITE-A", ip(1));

    limiter.record_failure_at(site, addr, start);
    limiter.record_success(site, addr); // streak back to zero
    limiter.record_failure_at(site, addr, start);

    // A single failure since the reset is below the threshold, so the key is admitted.
    let admitted = limiter.check_at(site, addr, start);
    assert!(admitted);
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -527,3 +527,60 @@ Reference: SPEC-002 §5; `agent/src/encoder/raw.rs` (salvaged), `proto/guruconne
|
|||||||
- **Rate limiting:** hammer `/api/auth/login` and the code-validate route → confirm throttling/lockout.
|
- **Rate limiting:** hammer `/api/auth/login` and the code-validate route → confirm throttling/lockout.
|
||||||
- **Migrations:** fresh DB applies the v2 migrations cleanly; `_sqlx_migrations` consistent; `tenant_id`
|
- **Migrations:** fresh DB applies the v2 migrations cleanly; `_sqlx_migrations` consistent; `tenant_id`
|
||||||
populated with the default tenant.
|
populated with the default tenant.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Task 9 [PROPOSED 2026-06-01 — provisioning model = TOFU auto-enroll, chosen by Mike]: `cak_` auto-enroll provisioning + shared-key retirement
|
||||||
|
|
||||||
|
> Context: Task 2 built the SERVER `cak_` machinery (mint/SHA-256 hash/verify in `auth/agent_keys.rs`,
|
||||||
|
> relay validation in `validate_agent_api_key`, admin issuance `POST /api/machines/:id/keys`). What's
|
||||||
|
> missing is how an AGENT obtains and uses a `cak_` — today agents still carry the deprecated shared
|
||||||
|
> `AGENT_API_KEY`, so `connect_agent_keys` is empty and the relay logs the DEPRECATED-shared-key warning
|
||||||
|
> for every agent. This task closes that with **trust-on-first-use auto-enroll** so the shared key can be
|
||||||
|
> retired (unblocks task list #5). NOTE: the agent already presents whatever is in its `api_key` slot and
|
||||||
|
> the relay auto-detects `cak_` vs shared — so a `cak_`-keyed agent needs **no change to its auth call**,
|
||||||
|
> only a way to *receive*, *persist*, and *prefer* a `cak_`.
|
||||||
|
|
||||||
|
**Flow (TOFU):**
|
||||||
|
1. **Bootstrap (first connect):** a fresh agent authenticates on `/ws/agent` with a bootstrap secret —
|
||||||
|
interim: the shared `AGENT_API_KEY` (embedded by the download endpoint); target: a single-use,
|
||||||
|
short-lived **enroll token** (more secure TOFU — see Security).
|
||||||
|
2. **Server issues on first connect:** when an agent authed via the bootstrap path (i.e. NOT already
|
||||||
|
`cak_`-keyed) connects and its machine has **no active (non-revoked) `cak_`**, the relay: resolves/creates
|
||||||
|
the machine row (existing `upsert_machine` on `machine_uid` — now functional after the 2026-06-01
|
||||||
|
ON CONFLICT fix), mints a `cak_` (`generate_agent_key` + `db::agent_keys::insert_agent_key` for that
|
||||||
|
`machine_id`), and sends the plaintext key to the agent **once** over a new server→agent message. Only
|
||||||
|
the hash is stored. **Idempotent:** never re-issue if an active key already exists for the machine.
|
||||||
|
3. **Agent receives + persists + prefers:** on `AgentKeyProvision`, the agent persists the `cak_` durably at
|
||||||
|
`%ProgramData%\GuruConnect\agent_key` (restricted ACL, same pattern as `machine_uid`). On startup it loads
|
||||||
|
the persisted `cak_` if present and uses it as its auth key, falling back to the embedded/bootstrap secret
|
||||||
|
only when no `cak_` is stored yet. After provisioning, every reconnect authenticates via `cak_` (no more
|
||||||
|
DEPRECATED-shared-key warning for that agent).
|
||||||
|
4. **Shared-key retirement (phased):** Phase A — shared key stays as the bootstrap so existing+new agents
|
||||||
|
self-enroll; monitor the relay's DEPRECATED-shared-key WARN count until it falls to ~0. Phase B — once the fleet is `cak_`-keyed, restrict the
|
||||||
|
shared `AGENT_API_KEY` to enrollment-only or remove the env entirely (only `cak_` / enroll-token accepted).
|
||||||
|
This is the concrete completion of task-list #5.
|
||||||
|
|
||||||
|
**Protocol (4-artifact drift discipline):** add `AgentKeyProvision { string key = 1; }` (server→agent) to
|
||||||
|
`proto/guruconnect.proto` with a new reserved message ID; regenerate prost on both agent + server; the
|
||||||
|
hand-written `dashboard/src/lib/protobuf.ts` decoder does NOT need it (agent-plane only) but reserve the ID.
|
||||||
|
|
||||||
|
**Files:** `proto/guruconnect.proto` (new message); `server/src/relay/mod.rs` (issue+send on bootstrap connect
|
||||||
|
with no active key); `server/src/db/agent_keys.rs` (add `has_active_key(machine_id)` check; reuse insert);
|
||||||
|
`agent/src/transport/*` (handle inbound `AgentKeyProvision`); `agent/src/config.rs` + a small key-store module
|
||||||
|
(load/persist `cak_`, prefer over bootstrap).
|
||||||
|
|
||||||
|
**Security (TOFU):** the first connect trusts the bootstrap secret — a leaked shared key during the enroll
|
||||||
|
window could enroll a rogue agent; the secure target is a **single-use, short-lived enroll token** per
|
||||||
|
deployment instead of the shared key (shared-key bootstrap is interim convenience). The `cak_` is sent
|
||||||
|
plaintext once over the existing wss/TLS channel; only the hash is stored server-side; the agent stores it
|
||||||
|
locally with restricted ACLs. Revocation via the existing `DELETE /api/machines/:id/keys/:key_id` fails the
|
||||||
|
agent closed; on its next bootstrap connect — which requires the agent to discard the rejected `cak_` and fall back to the bootstrap secret — it re-enrolls. The keyed-agent dedup (Task 3) keeps the
|
||||||
|
authenticated identity authoritative.
|
||||||
|
|
||||||
|
**Verification:** drop a current-build (signed 0.3.0+) agent configured with the shared-key bootstrap →
|
||||||
|
it connects, receives a `cak_`, persists it; restart → it authenticates via the `cak_` (relay shows NO
|
||||||
|
DEPRECATED-shared-key warning) and `connect_agent_keys` holds exactly one active key for the machine; issue
|
||||||
|
is idempotent across reconnects; revoke the key via the admin API → agent rejected, then re-enrolls on next
|
||||||
|
bootstrap connect. Reference: `auth/agent_keys.rs`, `api/machine_keys.rs`, `relay/mod.rs:266-309`
|
||||||
|
(`validate_agent_api_key`), `.claude/standards/security/credential-handling.md`.
|
||||||
|
|||||||
Reference in New Issue
Block a user