claudetools/.claude/scripts/run-onboarding-diagnostic.sh

#!/usr/bin/env bash
# run-onboarding-diagnostic.sh - GuruRMM onboarding diagnostic runner (Phase 1).
#
# Dispatches .claude/scripts/onboarding-diagnostic.ps1 to a Windows agent via the
# GuruRMM RMM API, extracts the fenced JSON result, grades it RED/AMBER/GREEN,
# writes an immutable baseline (JSON + Markdown report) under
# clients/<slug>/onboarding-baselines/, diffs against any prior baseline, and
# alerts #dev-alerts when the grade is RED (critical findings) or AMBER (warnings).
#
# Usage:
#   bash run-onboarding-diagnostic.sh <hostname-or-uuid> [client-slug]
#
# Mirrors the plumbing in .claude/commands/rmm.md (vault auth -> JWT -> dispatch
# -> poll -> command_text/stdout). Read-only against the endpoint; the probe only
# collects, it changes nothing.

# Treat any reference to an unset variable as a fatal error.
set -o nounset

# ---------------------------------------------------------------------------
# Positional arguments
# ---------------------------------------------------------------------------
# $1 = hostname or agent UUID (mandatory), $2 = client slug (optional).
CLIENT_SLUG="${2:-}"
TARGET="${1:-}"

[ -n "$TARGET" ] || {
    echo "[ERROR] Usage: bash run-onboarding-diagnostic.sh <hostname-or-uuid> [client-slug]" >&2
    exit 1
}

# ---------------------------------------------------------------------------
# Bootstrap (resolve repo root, vault, RMM base)
# ---------------------------------------------------------------------------
# Everything is located relative to this script: the probe sits next to it, and
# the repo root is two directories up.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROBE="$SCRIPT_DIR/onboarding-diagnostic.ps1"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
ALERT="$REPO_ROOT/.claude/scripts/post-bot-alert.sh"
VAULT="$REPO_ROOT/.claude/scripts/vault.sh"
# Base URL of the GuruRMM API.
RMM="http://172.16.3.30:3001"

# Nothing can be dispatched without the probe payload.
[ -f "$PROBE" ] || {
    echo "[ERROR] Probe script not found: $PROBE" >&2
    exit 1
}

# jq and curl are used throughout; stop at the first one that is missing.
for tool in jq curl; do
    command -v "$tool" >/dev/null 2>&1 && continue
    echo "[ERROR] Required tool not found: $tool" >&2
    exit 1
done

# post_alert <message>
#   Best-effort bot alert. Runs the script at $ALERT with the message as its
#   only argument, discarding its output and exit status. When $ALERT is not a
#   regular file the call is a silent no-op. Always returns 0 so that alerting
#   can never abort the run.
post_alert() {
    local text="$1"
    [ -f "$ALERT" ] || return 0
    bash "$ALERT" "$text" >/dev/null 2>&1 || true
}

# ---------------------------------------------------------------------------
# Authenticate
# ---------------------------------------------------------------------------
# Read the API login from the vault. The helper's stderr is discarded; a failed
# lookup is detected from the captured value instead.
RMM_EMAIL="$(bash "$VAULT" get-field infrastructure/gururmm-server.sops.yaml credentials.gururmm-api.admin-email 2>/dev/null)"
RMM_PASS="$(bash "$VAULT" get-field infrastructure/gururmm-server.sops.yaml credentials.gururmm-api.admin-password 2>/dev/null)"

# A missing field can surface as an empty string or as the literal "null".
# Both fields are checked for both forms, so a bad vault read is reported here
# rather than as a misleading "login failed" further down.
if [ -z "$RMM_EMAIL" ] || [ -z "$RMM_PASS" ] \
    || [ "$RMM_EMAIL" = "null" ] || [ "$RMM_PASS" = "null" ]; then
    echo "[ERROR] Could not read GuruRMM credentials from vault (infrastructure/gururmm-server.sops.yaml)" >&2
    exit 1
fi

# Build the login body with jq so both values are JSON-escaped correctly.
LOGIN_PAYLOAD="$(jq -nc --arg e "$RMM_EMAIL" --arg p "$RMM_PASS" '{email:$e, password:$p}')"
# Exchange the credentials for a JWT; `// empty` turns a missing token into "".
TOKEN="$(curl -s -m 30 -X POST "$RMM/api/auth/login" \
    -H "Content-Type: application/json" \
    --data-binary "$LOGIN_PAYLOAD" | jq -r '.token // empty')"

if [ -z "$TOKEN" ]; then
    echo "[ERROR] RMM login failed (no token returned)" >&2
    exit 1
fi
echo "[OK] Authenticated to GuruRMM"

# ---------------------------------------------------------------------------
# Resolve agent (by exact UUID, exact hostname, then partial hostname)
# ---------------------------------------------------------------------------
# Fetch the full agent list; it must be a JSON array to be usable.
AGENTS="$(curl -s -m 30 "$RMM/api/agents" -H "Authorization: Bearer $TOKEN")"
if [ -z "$AGENTS" ] || ! printf '%s\n' "$AGENTS" | jq -e 'type=="array"' >/dev/null 2>&1; then
    echo "[ERROR] Could not retrieve agent list" >&2
    exit 1
fi

# UUID-shaped target -> match by id; otherwise match by hostname.
# All comparisons are case-insensitive and tolerate null/non-string fields
# (`// ""` + tostring), so one agent record without a hostname cannot abort the
# jq filter and break resolution for every other agent.
AGENT=""
if printf '%s\n' "$TARGET" | grep -qiE '^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$'; then
    # The shape check above accepts upper-case hex, so the id comparison must
    # be case-insensitive as well.
    AGENT="$(printf '%s\n' "$AGENTS" | jq --arg id "$TARGET" \
        '[.[] | select(((.id // "") | tostring | ascii_downcase) == ($id | ascii_downcase))] | .[0] // empty')"
else
    # exact hostname (case-insensitive) first
    AGENT="$(printf '%s\n' "$AGENTS" | jq --arg h "$TARGET" \
        '[.[] | select(((.hostname // "") | tostring | ascii_downcase) == ($h | ascii_downcase))] | .[0] // empty')"
    if [ -z "$AGENT" ] || [ "$AGENT" = "null" ]; then
        # partial match: target is a substring of the hostname
        MATCHES="$(printf '%s\n' "$AGENTS" | jq --arg h "$TARGET" \
            '[.[] | select(((.hostname // "") | tostring | ascii_downcase) | contains($h | ascii_downcase))]')"
        COUNT="$(printf '%s\n' "$MATCHES" | jq 'length')"
        case "$COUNT" in
            0)
                AGENT=""
                ;;
            1)
                AGENT="$(printf '%s\n' "$MATCHES" | jq '.[0]')"
                ;;
            ''|*[!0-9]*)
                # jq failed to produce a count; do not misreport it as an
                # ambiguous match.
                echo "[ERROR] Could not evaluate hostname matches for '$TARGET'" >&2
                exit 1
                ;;
            *)
                echo "[ERROR] Multiple agents match '$TARGET' - be more specific:" >&2
                printf '%s\n' "$MATCHES" | jq -r '.[] | "  \(.hostname)  (\(.os_type))  id=\(.id)  client=\(.client_name)"' >&2
                exit 1
                ;;
        esac
    fi
fi

if [ -z "$AGENT" ] || [ "$AGENT" = "null" ]; then
    echo "[ERROR] No agent found matching '$TARGET'. Run /rmm agents to list enrolled agents." >&2
    exit 1
fi

# Flatten the resolved agent record into shell variables. Each field has a
# fallback so the variable is always set, even when the API omits the key.
AGENT_ID="$(jq -r '.id // empty' <<<"$AGENT")"
AGENT_HOST="$(jq -r '.hostname // empty' <<<"$AGENT")"
AGENT_OS="$(jq -r '.os_type // empty' <<<"$AGENT")"
AGENT_STATUS="$(jq -r '.status // "unknown"' <<<"$AGENT")"
AGENT_CONNECTED="$(jq -r '.is_connected // "null"' <<<"$AGENT")"
AGENT_CLIENT="$(jq -r '.client_name // empty' <<<"$AGENT")"
AGENT_LAST="$(jq -r '.last_seen // "never"' <<<"$AGENT")"

echo "[OK] Agent: $AGENT_HOST ($AGENT_OS) status=$AGENT_STATUS connected=$AGENT_CONNECTED client=$AGENT_CLIENT last_seen=$AGENT_LAST id=$AGENT_ID"

# The probe is a PowerShell script, so any other OS is rejected up front.
case "$AGENT_OS" in
    windows) ;;
    *)
        echo "[ERROR] This diagnostic is Windows-only. Agent os_type='$AGENT_OS'." >&2
        exit 1
        ;;
esac

# Either signal counts as online: status==online OR is_connected==true
# (is_connected can be null even when online). Only warn when both say no.
if ! { [ "$AGENT_STATUS" = "online" ] || [ "$AGENT_CONNECTED" = "true" ]; }; then
    echo "[WARNING] Agent appears offline (status=$AGENT_STATUS). The command will queue and run when it reconnects."
fi

# slugify_client_name <name>
#   Prints <name> lower-cased, with every run of characters outside [a-z0-9]
#   collapsed to a single '-' and leading/trailing '-' removed. Prints an empty
#   line when the name contains no slug-safe characters at all.
slugify_client_name() {
    printf '%s\n' "$1" | tr '[:upper:]' '[:lower:]' | sed -E 's/[^a-z0-9]+/-/g; s/^-+//; s/-+$//'
}

# Derive client slug if not supplied: prefer explicit arg; else slugify
# client_name; else (no name, or a name that slugifies to nothing) fall back to
# '_unsorted' so the baseline path never contains an empty directory component.
if [ -z "${CLIENT_SLUG:-}" ]; then
    DERIVED_SLUG=""
    if [ -n "${AGENT_CLIENT:-}" ]; then
        DERIVED_SLUG="$(slugify_client_name "$AGENT_CLIENT")"
    fi

    if [ -n "$DERIVED_SLUG" ]; then
        CLIENT_SLUG="$DERIVED_SLUG"
        echo "[INFO] No client slug supplied; derived '$CLIENT_SLUG' from client name '$AGENT_CLIENT'."
    elif [ -n "${AGENT_CLIENT:-}" ]; then
        CLIENT_SLUG="_unsorted"
        echo "[WARNING] Client name '$AGENT_CLIENT' has no slug-safe characters; using '_unsorted'."
    else
        CLIENT_SLUG="_unsorted"
        echo "[WARNING] No client slug and no client name; using '_unsorted'."
    fi
fi

# ---------------------------------------------------------------------------
# Command dispatch helper
# ---------------------------------------------------------------------------
# The agent caps the inline command body at roughly 32-40 KB (above that it
# returns "Failed to execute command" before PowerShell ever runs). The probe is
# ~60 KB, so we cannot send it inline. Instead we:
#   1. base64-encode the probe locally,
#   2. upload it to a temp file on the endpoint in <24 KB chunks (one command
#      each: first writes, the rest append),
#   3. send a final small command that decodes the file to a .ps1, runs it,
#      prints the fenced JSON, and deletes both temp files.
# Each dispatched command stays well under the agent limit, so this scales no
# matter how large the probe grows in later phases.

# Private scratch directory for payloads, chunks and the last command id.
# Only mktemp-created directories are accepted (plain form first, then the
# template form for mktemp variants that require one). There is no fallback to
# a predictable /tmp name, and a failure stops the run here instead of
# surfacing later as unrelated write errors.
WORK_DIR="$(mktemp -d 2>/dev/null || mktemp -d -t onboard-diag.XXXXXX 2>/dev/null)" || WORK_DIR=""
if [ -z "$WORK_DIR" ] || [ ! -d "$WORK_DIR" ]; then
    echo "[ERROR] Could not create a temporary working directory (mktemp -d failed)" >&2
    exit 1
fi

# cleanup: remove the scratch directory; registered on EXIT so it runs on every
# exit path. ${WORK_DIR:?} refuses to expand to an empty path for rm -rf.
cleanup() { rm -rf -- "${WORK_DIR:?}" 2>/dev/null || true; }
trap cleanup EXIT

# dispatch_one <script-file> <timeout_seconds>
#   Sends the contents of <script-file> to agent $AGENT_ID as a PowerShell
#   command, then polls the command record every 5 seconds, up to 72 polls.
#   Globals read: WORK_DIR, RMM, AGENT_ID, TOKEN.
#   Output:  the command's result JSON on stdout once it reaches a terminal
#            status (completed, failed, cancelled or interrupted).
#   Returns: 0 on a terminal status (whatever that status is), 1 when the
#            dispatch is rejected or the poll budget runs out.
#   Side effect: writes the command id to $WORK_DIR/last_cmd_id on success.
dispatch_one() {
    local ps_file="$1"
    local timeout_s="$2"
    local body_file="$WORK_DIR/payload.json"
    local auth_header="Authorization: Bearer $TOKEN"
    local reply command_id state outcome polls

    # --rawfile embeds the script verbatim; jq takes care of the JSON escaping.
    jq -nc --rawfile cmd "$ps_file" --argjson to "$timeout_s" \
        '{command_type:"powershell", command:$cmd, timeout_seconds:$to}' > "$body_file"

    reply="$(curl -s -m 30 -X POST "$RMM/api/agents/$AGENT_ID/command" \
        -H "$auth_header" \
        -H "Content-Type: application/json" \
        --data-binary "@$body_file")"
    command_id="$(jq -r '.command_id // empty' <<<"$reply")"
    if [ -z "$command_id" ]; then
        echo "[ERROR] Dispatch failed: $reply" >&2
        return 1
    fi

    state=""
    for ((polls = 0; polls < 72; polls++)); do
        outcome="$(curl -s -m 30 "$RMM/api/commands/$command_id" -H "$auth_header")"
        state="$(jq -r '.status // empty' <<<"$outcome")"
        case "$state" in
            completed|failed|cancelled|interrupted)
                # Callers invoke this function inside $( ), i.e. in a subshell,
                # so the id is handed back through a file rather than a variable.
                printf '%s' "$command_id" > "$WORK_DIR/last_cmd_id" 2>/dev/null || true
                echo "$outcome"
                return 0
                ;;
        esac
        # running, pending, empty or any unrecognised status: wait and re-poll.
        sleep 5
    done

    echo "[ERROR] Command $command_id did not finish (last status=$state)" >&2
    return 1
}

# ---------------------------------------------------------------------------
# Upload probe (base64, chunked) then execute
# ---------------------------------------------------------------------------
echo "[INFO] Uploading probe to endpoint (chunked base64)..."

# Remote temp file names, unique per run via UTC timestamp + local PID. The
# leading "$env:TEMP" is kept literal here (escaped $) so that PowerShell, not
# bash, expands it on the endpoint.
REMOTE_TAG="grmm_onboard_$(date -u +%Y%m%d%H%M%S)_$$"
REMOTE_B64="\$env:TEMP\\${REMOTE_TAG}.b64"
REMOTE_PS1="\$env:TEMP\\${REMOTE_TAG}.ps1"

# Produce base64 as a single line: try `base64 -w0` first, and if that fails
# fall back to plain `base64` with the newlines stripped.
B64_FILE="$WORK_DIR/probe.b64"
base64 -w0 "$PROBE" > "$B64_FILE" 2>/dev/null || base64 "$PROBE" | tr -d '\n' > "$B64_FILE"
if [ ! -s "$B64_FILE" ]; then
    echo "[ERROR] Could not base64-encode probe: $PROBE" >&2
    exit 1
fi

# Split into 24000-byte chunks. 24000 is a multiple of 4, so every chunk
# boundary falls on a base64 quantum and the remote file is a plain
# concatenation of the chunks.
CHUNK_DIR="$WORK_DIR/chunks"
mkdir -p "$CHUNK_DIR"
if ! split -b 24000 "$B64_FILE" "$CHUNK_DIR/chunk_"; then
    echo "[ERROR] Could not split encoded probe into chunks" >&2
    exit 1
fi

# Collect the chunks with a glob into an array (expansion is sorted, matching
# split's aa, ab, ... suffix order). Unlike parsing `ls`, this survives
# whitespace in the temp path.
CHUNK_FILES=("$CHUNK_DIR"/chunk_*)
if [ ! -f "${CHUNK_FILES[0]}" ]; then
    echo "[ERROR] No chunks were produced from $B64_FILE" >&2
    exit 1
fi
N_CHUNKS="${#CHUNK_FILES[@]}"
echo "[INFO] Probe is $(wc -c < "$PROBE") bytes -> $N_CHUNKS chunk(s)"

IDX=0
for ch in "${CHUNK_FILES[@]}"; do
    IDX=$((IDX + 1))
    # INVARIANT (enforced below): DATA is RFC4648 standard base64 (alphabet
    # A-Za-z0-9+/ with '=' padding). None of those characters are PowerShell
    # metacharacters, so DATA is safe to interpolate raw into the here-doc.
    # If the encoder is ever swapped (e.g. to base64url, which adds '-' and
    # '_'), revisit this check deliberately rather than loosening it silently.
    DATA="$(cat "$ch")"
    case "$DATA" in
        ''|*[!A-Za-z0-9+/=]*)
            echo "[ERROR] Chunk $IDX is empty or not plain base64; refusing to embed it in a PowerShell string" >&2
            exit 1
            ;;
    esac

    # First chunk creates/overwrites the remote file; later chunks append.
    # Neither call adds a newline.
    if [ "$IDX" -eq 1 ]; then
        FILE_OP="WriteAllText"
    else
        FILE_OP="AppendAllText"
    fi

    SCRIPT_FILE="$WORK_DIR/chunkcmd.ps1"
    cat > "$SCRIPT_FILE" <<PS
\$ErrorActionPreference = 'Stop'
[System.IO.File]::${FILE_OP}("$REMOTE_B64", "$DATA")
Write-Output "CHUNK $IDX OK"
PS

    CH_RESULT="$(dispatch_one "$SCRIPT_FILE" 60)" || { echo "[ERROR] Chunk $IDX dispatch failed" >&2; exit 1; }
    CH_STATUS="$(printf '%s\n' "$CH_RESULT" | jq -r '.status')"
    if [ "$CH_STATUS" != "completed" ]; then
        echo "[ERROR] Chunk $IDX upload failed: status=$CH_STATUS stderr=$(printf '%s\n' "$CH_RESULT" | jq -r '.stderr' | head -c 200)" >&2
        exit 1
    fi
    echo "[OK] Uploaded chunk $IDX/$N_CHUNKS"
done

echo "[INFO] Decoding and executing probe on endpoint (timeout 240s)..."

# Last command of the run: rebuild the .ps1 from the uploaded base64 file,
# execute it in a child powershell.exe, and always delete both remote temp
# files afterwards (the finally block). Any exception is reported on stdout as
# a PROBE_RUN_ERROR line.
RUN_SCRIPT="$WORK_DIR/runcmd.ps1"
cat > "$RUN_SCRIPT" <<PS
\$ErrorActionPreference = 'Continue'
try {
    \$b64 = [System.IO.File]::ReadAllText("$REMOTE_B64")
    \$bytes = [System.Convert]::FromBase64String(\$b64)
    [System.IO.File]::WriteAllBytes("$REMOTE_PS1", \$bytes)
    & powershell.exe -NonInteractive -ExecutionPolicy Bypass -File "$REMOTE_PS1"
} catch {
    Write-Output ("PROBE_RUN_ERROR: " + \$_.Exception.Message)
} finally {
    Remove-Item -Path "$REMOTE_B64" -Force -ErrorAction SilentlyContinue
    Remove-Item -Path "$REMOTE_PS1" -Force -ErrorAction SilentlyContinue
}
PS

if ! RESULT="$(dispatch_one "$RUN_SCRIPT" 240)"; then
    echo "[ERROR] Probe execution dispatch failed" >&2
    exit 1
fi
# dispatch_one ran in a subshell, so the command id comes back through a file.
CMD_ID="$(cat "$WORK_DIR/last_cmd_id" 2>/dev/null || echo unknown)"

# Unpack the fields of the command record used below.
FINAL_STATUS="$(jq -r '.status // empty' <<<"$RESULT")"
EXIT_CODE="$(jq -r '.exit_code // "null"' <<<"$RESULT")"
STDOUT="$(jq -r '.stdout // ""' <<<"$RESULT")"
STDERR="$(jq -r '.stderr // ""' <<<"$RESULT")"

echo "[INFO] Probe finished: status=$FINAL_STATUS exit_code=$EXIT_CODE stdout_len=${#STDOUT} stderr_len=${#STDERR} cmd=$CMD_ID"

# ---------------------------------------------------------------------------
# Extract fenced JSON from stdout
# ---------------------------------------------------------------------------
# Keep only the lines strictly between the START and END marker lines; whatever
# was printed around them is dropped. The marker lines themselves are never
# emitted.
DIAG_JSON="$(printf '%s' "$STDOUT" | awk '
    /===DIAG-JSON-END===/   { inside = 0 }
    /===DIAG-JSON-START===/ { inside = 1; next }
    inside
')"

# The extract is usable only if it is non-empty and parses as JSON with a
# truthy .host; otherwise dump what the probe produced and stop.
if [ -n "$DIAG_JSON" ] && jq -e '.host' <<<"$DIAG_JSON" >/dev/null 2>&1; then
    echo "[OK] Extracted diagnostic JSON ($(wc -c <<<"$DIAG_JSON" | tr -d ' ') bytes)"
else
    echo "[ERROR] Could not extract valid diagnostic JSON from probe output." >&2
    echo "[ERROR] status=$FINAL_STATUS exit_code=$EXIT_CODE" >&2
    if [ -n "$STDERR" ]; then
        echo "--- stderr ---" >&2
        printf '%s\n' "$STDERR" | head -40 >&2
    fi
    echo "--- stdout (first 60 lines) ---" >&2
    printf '%s\n' "$STDOUT" | head -60 >&2
    exit 1
fi

# ---------------------------------------------------------------------------
# Grade: RED (any critical) / AMBER (any warning, no critical) / GREEN (none)
# ---------------------------------------------------------------------------
# Fail closed: grading needs a findings ARRAY. If it is missing or has another
# type, every count below would come back empty, the numeric tests would error
# out, and the grade would silently fall through to GREEN.
if ! printf '%s\n' "$DIAG_JSON" | jq -e '(.findings | type) == "array"' >/dev/null 2>&1; then
    echo "[ERROR] Probe JSON has no 'findings' array; refusing to grade (missing findings must not read as GREEN)." >&2
    exit 1
fi

# count_findings <severity> -> number of findings with exactly that severity.
# `.severity?` tolerates entries that are not objects instead of aborting jq.
count_findings() {
    printf '%s\n' "$DIAG_JSON" | jq --arg s "$1" '[.findings[] | select(.severity? == $s)] | length'
}

N_CRIT="$(count_findings critical)"
N_WARN="$(count_findings warning)"
N_UNK="$(count_findings unknown)"
N_INFO="$(count_findings info)"

# Every count must be a plain non-negative integer before it is compared.
for n in "$N_CRIT" "$N_WARN" "$N_UNK" "$N_INFO"; do
    case "$n" in
        ''|*[!0-9]*)
            echo "[ERROR] Could not count findings by severity; refusing to grade." >&2
            exit 1
            ;;
    esac
done

# Only critical and warning findings influence the grade; unknown and info are
# counted for reporting only.
if [ "$N_CRIT" -gt 0 ]; then
    GRADE="RED"
elif [ "$N_WARN" -gt 0 ]; then
    GRADE="AMBER"
else
    GRADE="GREEN"
fi

# Prefer the host name the probe reported; fall back to the RMM agent record.
PROBE_HOST="$(printf '%s\n' "$DIAG_JSON" | jq -r '.host // empty')"
if [ -z "$PROBE_HOST" ]; then
    PROBE_HOST="$AGENT_HOST"
fi
COLLECTED="$(printf '%s\n' "$DIAG_JSON" | jq -r '.collected_at_utc // empty')"

echo "[INFO] Grade=$GRADE  critical=$N_CRIT warning=$N_WARN unknown=$N_UNK info=$N_INFO"

# ---------------------------------------------------------------------------
# Output paths
# ---------------------------------------------------------------------------
BASE_DIR="$REPO_ROOT/clients/$CLIENT_SLUG/onboarding-baselines"
if ! mkdir -p "$BASE_DIR"; then
    echo "[ERROR] Could not create baseline directory: $BASE_DIR" >&2
    exit 1
fi

UTC_STAMP="$(date -u +%Y%m%dT%H%M%S)"
# File-name-safe host: anything outside [A-Za-z0-9._-] becomes '_'.
SAFE_HOST="$(printf '%s\n' "$PROBE_HOST" | sed -E 's/[^A-Za-z0-9._-]+/_/g')"
JSON_PATH="$BASE_DIR/${SAFE_HOST}-${UTC_STAMP}.json"
MD_PATH="$BASE_DIR/${SAFE_HOST}-${UTC_STAMP}.md"

# Immutability guard: the per-second UTC_STAMP can collide if two runs land in
# the same second (or a re-run of the same dispatch). A baseline is immutable
# once written, so never truncate an existing one - append a PID uniquifier
# instead so the prior baseline (JSON or report) survives intact.
if [ -e "$JSON_PATH" ] || [ -e "$MD_PATH" ]; then
    JSON_PATH="${JSON_PATH%.json}-$$.json"
    MD_PATH="${MD_PATH%.md}-$$.md"
fi

# Find the most recent PRIOR baseline json for this host (before we write the
# new one). The glob requires the YYYYMMDDTHHMMSS stamp right after
# "<host>-", so baselines of another host that merely shares the prefix (e.g.
# PC vs PC-2) are not picked up. Glob expansion is sorted, so the last regular
# file seen is the newest stamp.
PRIOR_JSON=""
for candidate in "$BASE_DIR/${SAFE_HOST}-"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]T[0-9][0-9][0-9][0-9][0-9][0-9]*.json; do
    # An unmatched glob stays literal; the -f test filters that out.
    [ -f "$candidate" ] || continue
    PRIOR_JSON="$candidate"
done

# Write the immutable raw snapshot (pretty-printed for readability/diffing).
# On failure remove the partial file: it did not exist before this run, and a
# truncated snapshot must never be mistaken for a baseline later.
if ! printf '%s\n' "$DIAG_JSON" | jq '.' > "$JSON_PATH"; then
    rm -f -- "$JSON_PATH"
    echo "[ERROR] Could not write baseline snapshot: $JSON_PATH" >&2
    exit 1
fi

# ---------------------------------------------------------------------------
# Build the Markdown report
# ---------------------------------------------------------------------------
# Everything echoed inside this brace group is written to the Markdown report:
# the group's stdout is redirected to $MD_PATH at its closing brace. Sections,
# in order: header bullets, findings grouped by severity, inventory summary,
# volumes, network adapters, diff against the prior baseline, footer.
{
    # --- Header: grade, identity and finding counts ------------------------
    echo "# Onboarding Diagnostic Baseline - $PROBE_HOST"
    echo ""
    echo "- **Grade:** $GRADE"
    echo "- **Host:** $PROBE_HOST"
    # Falls back to the slug when the agent record carries no client name.
    echo "- **Client:** ${AGENT_CLIENT:-$CLIENT_SLUG} (\`$CLIENT_SLUG\`)"
    echo "- **Collected (UTC):** $COLLECTED"
    echo "- **Agent ID:** $AGENT_ID"
    echo "- **Command ID:** $CMD_ID"
    echo "- **Findings:** $N_CRIT critical / $N_WARN warning / $N_INFO info / $N_UNK unknown"
    echo ""
    # "?" is printed when the probe JSON has no .os.caption / .os.build.
    OS_CAPTION="$(echo "$DIAG_JSON" | jq -r '.os.caption // "?"')"
    OS_BUILD="$(echo "$DIAG_JSON" | jq -r '.os.build // "?"')"
    echo "- **OS:** $OS_CAPTION (build $OS_BUILD)"
    echo ""
    echo "---"
    echo ""

    # --- Findings, one section per severity (most severe first) ------------
    # A severity with zero findings gets no section at all.
    for sev in critical warning info unknown; do
        SEV_COUNT="$(echo "$DIAG_JSON" | jq --arg s "$sev" '[.findings[] | select(.severity==$s)] | length')"
        [ "$SEV_COUNT" = "0" ] && continue
        SEV_LABEL="$(echo "$sev" | tr '[:lower:]' '[:upper:]')"
        echo "## $SEV_LABEL ($SEV_COUNT)"
        echo ""
        # Each finding renders as a heading plus category/id/detail bullets;
        # the fenced evidence block is emitted only when .evidence is non-empty.
        echo "$DIAG_JSON" | jq -r --arg s "$sev" '
            .findings[] | select(.severity==$s) |
            "### " + .title + "\n" +
            "- **Category:** " + (.category // "?") + "\n" +
            "- **ID:** `" + (.id // "?") + "`\n" +
            "- " + (.detail // "") + "\n" +
            (if (.evidence // "") != "" then "\n```\n" + .evidence + "\n```\n" else "" end)
        '
        echo ""
    done

    # --- Inventory summary from .facts -------------------------------------
    # Every field has a fallback ("?", 0, false or []) so a missing fact
    # renders as a placeholder rather than aborting the jq program.
    echo "---"
    echo ""
    echo "## Inventory Baseline Summary"
    echo ""
    echo "$DIAG_JSON" | jq -r '
        .facts as $f |
        "- **Manufacturer / Model:** " + (($f.hardware.manufacturer // "?") + " / " + ($f.hardware.model // "?")) + "\n" +
        "- **Serial:** " + ($f.hardware.serial // "?") + "\n" +
        "- **CPU:** " + ($f.hardware.cpu // "?") + " (" + (($f.hardware.cpu_cores // 0)|tostring) + " cores / " + (($f.hardware.cpu_logical // 0)|tostring) + " logical)\n" +
        "- **RAM (GB):** " + (($f.hardware.ram_gb // 0)|tostring) + "\n" +
        "- **BIOS:** " + ($f.hardware.bios_version // "?") + " (" + ($f.hardware.bios_date // "?") + ")\n" +
        "- **Chassis is laptop:** " + (($f.is_laptop // false)|tostring) + "\n" +
        "- **TPM present / Secure Boot:** " + (($f.tpm.present // "?")|tostring) + " / " + (($f.secure_boot // "?")|tostring) + "\n" +
        "- **Domain joined:** " + (($f.domain_joined // false)|tostring) + " (" + ($f.domain // "?") + ")\n" +
        "- **OS activation licensed:** " + (($f.activation.licensed // "?")|tostring) + "\n" +
        "- **Uptime (days):** " + (($f.uptime_days // "?")|tostring) + "\n" +
        "- **Pending reboot:** " + (($f.pending_reboot // false)|tostring) + "\n" +
        "- **Installed software count:** " + (($f.installed_software_count // 0)|tostring) + "\n" +
        "- **Scheduled tasks (non-MS, enabled):** " + (($f.scheduled_tasks_count // 0)|tostring) + "\n" +
        "- **Local administrators:** " + (($f.local_administrators // []) | join(", "))
    '
    echo ""
    # One bullet per entry of .facts.volumes (none when the list is absent).
    echo "### Fixed volumes"
    echo ""
    echo "$DIAG_JSON" | jq -r '
        (.facts.volumes // []) | .[] |
        "- " + (.drive // "?") + " - " + ((.free_gb // 0)|tostring) + " GB free of " + ((.size_gb // 0)|tostring) + " GB (" + ((.free_pct // 0)|tostring) + "%)"
    '
    echo ""
    # One bullet per entry of .facts.network_adapters.
    echo "### Network adapters"
    echo ""
    echo "$DIAG_JSON" | jq -r '
        (.facts.network_adapters // []) | .[] |
        "- " + (.description // "?") + " - IP: " + ((.ip // []) | join(", ")) + " - DNS: " + ((.dns // []) | join(", ")) + " - DHCP: " + ((.dhcp // false)|tostring)
    '
    echo ""

    # -----------------------------------------------------------------------
    # DIFF section vs prior baseline
    # -----------------------------------------------------------------------
    # Runs only when a prior snapshot file exists. Both snapshots are read
    # from disk via --slurpfile, which wraps each file in an array - hence the
    # $cur[0] / $old[0] indexing in the jq programs below.
    if [ -n "$PRIOR_JSON" ] && [ -f "$PRIOR_JSON" ]; then
        PRIOR_STAMP="$(basename "$PRIOR_JSON")"
        echo "---"
        echo ""
        echo "## Diff vs Prior Baseline"
        echo ""
        echo "- **Compared against:** \`$PRIOR_STAMP\`"
        echo ""

        # New findings: ids present now but not before.
        # Info-level findings are excluded. index() yields null for an id that
        # is absent, and `not` turns that null into true.
        NEW_FINDINGS="$(jq -n \
            --slurpfile cur "$JSON_PATH" \
            --slurpfile old "$PRIOR_JSON" '
            ($old[0].findings // []) as $o |
            ($cur[0].findings // []) as $c |
            ($o | map(.id)) as $oids |
            [ $c[] | select(.severity!="info") | select(.id as $id | ($oids | index($id)) | not) ]
        ')"
        # Resolved findings: ids present before but not now.
        # Same test as above with the roles of old and current swapped.
        RESOLVED_FINDINGS="$(jq -n \
            --slurpfile cur "$JSON_PATH" \
            --slurpfile old "$PRIOR_JSON" '
            ($old[0].findings // []) as $o |
            ($cur[0].findings // []) as $c |
            ($c | map(.id)) as $cids |
            [ $o[] | select(.severity!="info") | select(.id as $id | ($cids | index($id)) | not) ]
        ')"
        # Regressed: same id, severity got worse (info<warning<critical; unknown treated as warning-level).
        # $om maps each prior finding id to its prior severity; a severity
        # outside the four known values ranks 0.
        REGRESSED="$(jq -n \
            --slurpfile cur "$JSON_PATH" \
            --slurpfile old "$PRIOR_JSON" '
            def rank(s): if s=="critical" then 3 elif s=="warning" then 2 elif s=="unknown" then 2 elif s=="info" then 1 else 0 end;
            ($old[0].findings // []) as $o |
            ($cur[0].findings // []) as $c |
            ($o | map({key:.id, value:.severity}) | from_entries) as $om |
            [ $c[] | select(.id as $id | $om[$id] != null) | select(rank(.severity) > rank($om[.id])) |
              {id, title, was: $om[.id], now: .severity} ]
        ')"

        # Each list below prints "- (none)" when its jq array is empty.
        echo "**New findings:**"
        echo ""
        if [ "$(echo "$NEW_FINDINGS" | jq 'length')" = "0" ]; then
            echo "- (none)"
        else
            echo "$NEW_FINDINGS" | jq -r '.[] | "- [" + (.severity|ascii_upcase) + "] " + .title'
        fi
        echo ""
        echo "**Resolved findings:**"
        echo ""
        if [ "$(echo "$RESOLVED_FINDINGS" | jq 'length')" = "0" ]; then
            echo "- (none)"
        else
            echo "$RESOLVED_FINDINGS" | jq -r '.[] | "- [" + (.severity|ascii_upcase) + "] " + .title'
        fi
        echo ""
        echo "**Regressed findings:**"
        echo ""
        if [ "$(echo "$REGRESSED" | jq 'length')" = "0" ]; then
            echo "- (none)"
        else
            echo "$REGRESSED" | jq -r '.[] | "- " + .title + " (" + .was + " -> " + .now + ")"'
        fi
        echo ""

        # Installed-software deltas
        # Compared by .name only, de-duplicated with `unique`; a version
        # change under the same name is therefore not reported here.
        SW_ADDED="$(jq -n \
            --slurpfile cur "$JSON_PATH" \
            --slurpfile old "$PRIOR_JSON" '
            ((($old[0].facts.installed_software // []) | map(.name)) | unique) as $o |
            ((($cur[0].facts.installed_software // []) | map(.name)) | unique) as $c |
            [ $c[] | select(. as $n | ($o | index($n)) | not) ]
        ')"
        SW_REMOVED="$(jq -n \
            --slurpfile cur "$JSON_PATH" \
            --slurpfile old "$PRIOR_JSON" '
            ((($old[0].facts.installed_software // []) | map(.name)) | unique) as $o |
            ((($cur[0].facts.installed_software // []) | map(.name)) | unique) as $c |
            [ $o[] | select(. as $n | ($c | index($n)) | not) ]
        ')"

        echo "**Software added:**"
        echo ""
        if [ "$(echo "$SW_ADDED" | jq 'length')" = "0" ]; then
            echo "- (none)"
        else
            echo "$SW_ADDED" | jq -r '.[] | "- " + .'
        fi
        echo ""
        echo "**Software removed:**"
        echo ""
        if [ "$(echo "$SW_REMOVED" | jq 'length')" = "0" ]; then
            echo "- (none)"
        else
            echo "$SW_REMOVED" | jq -r '.[] | "- " + .'
        fi
        echo ""
    else
        # No prior snapshot: say so explicitly instead of omitting the section.
        echo "---"
        echo ""
        echo "## Diff vs Prior Baseline"
        echo ""
        echo "- No prior baseline found for this host. This is the first baseline."
        echo ""
    fi

    # --- Footer: points at the raw JSON snapshot written alongside ---------
    echo "---"
    echo ""
    echo "_Generated by run-onboarding-diagnostic.sh (GuruRMM onboarding diagnostic, Phase 1). Raw snapshot: \`$(basename "$JSON_PATH")\` (immutable)._"
} > "$MD_PATH"

# ---------------------------------------------------------------------------
# Alerts (soft-fail): a single line for RED (first 3 critical titles, "+N more"
# beyond that) or a single line for AMBER; GREEN posts nothing.
# ---------------------------------------------------------------------------
case "$GRADE" in
    RED)
        # Quote at most three critical titles; summarise the rest as a count.
        CRIT_TITLES="$(jq -r '[.findings[] | select(.severity=="critical") | .title] | .[0:3] | join("; ")' <<<"$DIAG_JSON")"
        MORE=""
        if [ "$N_CRIT" -gt 3 ]; then
            MORE=" (+$((N_CRIT - 3)) more)"
        fi
        post_alert "[RMM] Onboarding diag $PROBE_HOST ($CLIENT_SLUG) = RED: $N_CRIT critical - ${CRIT_TITLES}${MORE}"
        ;;
    AMBER)
        post_alert "[RMM] Onboarding diag $PROBE_HOST ($CLIENT_SLUG) = AMBER: $N_WARN warning, 0 critical"
        ;;
esac

# ---------------------------------------------------------------------------
# Final console summary
# ---------------------------------------------------------------------------
# Console summary banner, followed by a final "Report path:" line naming the
# Markdown report.
cat <<EOF

==========================================================
 Onboarding diagnostic complete
   Host:   $PROBE_HOST
   Client: ${AGENT_CLIENT:-$CLIENT_SLUG} ($CLIENT_SLUG)
   Grade:  $GRADE  ($N_CRIT critical / $N_WARN warning / $N_INFO info / $N_UNK unknown)
   JSON:   $JSON_PATH
   Report: $MD_PATH
==========================================================

Report path: $MD_PATH
EOF