claudetools/.claude/skills/self-check/scripts/self-check.sh

#!/usr/bin/env bash
# self-check.sh - ClaudeTools harness self-diagnosis / fleet conformance probe.
#
# V1 is a CENSUS tool. Each machine probes its own harness wiring (tools,
# identity, hooks, skills, commands, scripts, connectivity, capability tier),
# grades what it can against the provisional baseline manifest, and can publish
# the result to the coord API so the fleet can be compared and the baseline
# refined from real data. See ../SKILL.md and ../baseline/README.md.
#
# Usage:
#   self-check.sh                 Run checks, print a human report. (default)
#   self-check.sh --json          Emit the structured census JSON to stdout only.
#   self-check.sh --publish       Run checks, then PUT the census to coord (component selfcheck_<host>).
#   self-check.sh fanout          Broadcast a request to ALL_SESSIONS to run /self-check --publish.
#   self-check.sh aggregate       Read every machine's published census and print a fleet table
#                                 plus a proposed-baseline (intersection/union) summary.
#
# Portable: bash 3.2+ (macOS), Git Bash (Windows), Linux. Deps: jq, curl.
# Read-only. It collects and reports; it changes nothing on the machine.

set -u

# ---------------------------------------------------------------------------
# Bootstrap: resolve repo root, identity, coord API, session id
# ---------------------------------------------------------------------------
# Every path is derived from this script's own location: the skill directory
# is one level up, the repo root four levels up, and the baseline manifest
# sits under the skill directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
SKILL_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
MANIFEST="${SKILL_DIR}/baseline/manifest.json"

# jq is a hard dependency: stop with exit code 2 when it is not on PATH.
command -v jq >/dev/null 2>&1 || {
    echo "[ERROR] jq is required and not found on PATH. Install jq, then re-run." >&2
    exit 2
}
# Some Windows jq builds (winget) emit CRLF line endings; a trailing \r corrupts
# every `for x in $(jq ...)` word and `read`-from-@tsv field. Strip \r from all
# jq output (it is insignificant JSON whitespace and never wanted in raw values).
#
# Arguments: passed through to the real jq binary unchanged.
# Outputs:   jq's stdout with every carriage return removed.
# Returns:   jq's own exit status. A pipeline's status is that of its LAST
#            stage (tr, which practically always succeeds), so without the
#            explicit return `jq -e ...` validity tests and `jq ... || fallback`
#            could never observe a jq failure.
jq() {
    command jq "$@" | tr -d '\r'
    return "${PIPESTATUS[0]}"
}
# curl is mandatory: stop with exit code 2 when it is not on PATH.
command -v curl >/dev/null 2>&1 || {
    echo "[ERROR] curl is required and not found on PATH." >&2
    exit 2
}
# The baseline manifest must exist as a regular file; otherwise exit 2.
[ -f "$MANIFEST" ] || {
    echo "[ERROR] Baseline manifest not found: $MANIFEST" >&2
    exit 2
}

# identity.json lookup order: the repo copy wins, then the ~/.claude copy
# (mirrors check-messages.sh). IDENTITY stays empty when neither file exists.
IDENTITY=""
for c in "$REPO_ROOT/.claude/identity.json" "$HOME/.claude/identity.json"; do
    if [ -f "$c" ]; then
        IDENTITY="$c"
        break
    fi
done

idfield() { # dotted.path  -> value or empty
    # No identity file was located: print nothing and report failure.
    if [ -z "$IDENTITY" ]; then
        return 1
    fi
    # "// empty" makes jq print nothing instead of the literal "null".
    jq -r "$1 // empty" "$IDENTITY" 2>/dev/null
}

# Host name with any trailing ".local" removed; the session id is derived from it.
HOSTNAME_RAW="$(hostname 2>/dev/null || echo unknown)"
HOST="${HOSTNAME_RAW%.local}"
SESSION="${HOST}/claude-main"

# Values from identity.json win; each one falls back when the field is empty.
API="$(idfield '.coord_api')"
if [ -z "$API" ]; then
    API="http://172.16.3.30:8001"
fi

PLATFORM="$(idfield '.platform')"
if [ -z "$PLATFORM" ]; then
    case "$(uname -s)" in
        Darwin)               PLATFORM="macos" ;;
        Linux)                PLATFORM="linux" ;;
        CYGWIN*|MINGW*|MSYS*) PLATFORM="windows" ;;
        *)                    PLATFORM="unknown" ;;
    esac
fi

ARCH="$(idfield '.architecture')"
if [ -z "$ARCH" ]; then
    ARCH="$(uname -m 2>/dev/null || echo unknown)"
fi

# ---------------------------------------------------------------------------
# Results accumulation. Each check appends one compact JSON object.
# status in {PASS, WARN, FAIL, SKIP, INFO}. Grade: any FAIL->RED, WARN->AMBER.
# ---------------------------------------------------------------------------
# Prefer a mktemp-generated file; fall back to a PID-suffixed name under
# TMPDIR (or /tmp) when mktemp fails. The file is truncated here and removed
# again when the script exits.
if ! RESULTS_FILE="$(mktemp 2>/dev/null)"; then
    RESULTS_FILE="${TMPDIR:-/tmp}/selfcheck.$$"
fi
: > "$RESULTS_FILE"
trap 'rm -f "$RESULTS_FILE" 2>/dev/null' EXIT

emit() { # id  category  status  detail  [fix]
    # Append one compact JSON object to RESULTS_FILE; the fix hint is optional
    # and recorded as an empty string when omitted.
    local check_id="$1" group="$2" state="$3" message="$4" remedy="${5:-}"
    jq -nc \
        --arg id "$check_id" --arg cat "$group" --arg st "$state" \
        --arg detail "$message" --arg fix "$remedy" \
        '{id:$id,category:$cat,status:$st,detail:$detail,fix:$fix}' \
        >> "$RESULTS_FILE"
}

reachable() {   # exit 0 if HTTP responds
    # Silent probe with the body discarded and a 4-second overall cap; the
    # function's status is curl's exit status.
    curl --silent --output /dev/null --max-time 4 "$1" 2>/dev/null
}

# Line-ending-insensitive equality. A repo copy with LF endings and a ~/.claude
# copy with CRLF endings carry the SAME content (exactly the cross-machine case
# this check polices), so both sides are compared after removing every \r; a
# byte-for-byte cmp of the raw files would flag them as different.
# Arguments: $1, $2 - the two files to compare.
# Returns:   diff's exit status (0 when the stripped contents are equal).
same_content() {
    local left="$1" right="$2"
    diff -q \
        <(tr -d '\r' < "$left") \
        <(tr -d '\r' < "$right") \
        >/dev/null 2>&1
}

# ---------------------------------------------------------------------------
# CHECK: identity
# ---------------------------------------------------------------------------
# Probe identity.json and emit one result per aspect: presence, JSON validity,
# required fields (from the manifest), claudetools_root / vault_path
# resolution, hostname agreement and git author agreement.
# Globals read: IDENTITY, MANIFEST, REPO_ROOT, HOST.
# Returns early, after a single FAIL, when the file is missing or unparseable.
check_identity() {
    if [ -z "$IDENTITY" ]; then
        emit identity.present identity FAIL "identity.json not found (.claude or ~/.claude)" \
            "Run onboarding; create .claude/identity.json then 'bash .claude/scripts/migrate-identity.sh'"
        return
    fi
    # Validate with the real jq binary. The file-level jq() wrapper pipes jq
    # through tr, so its exit status may be tr's instead of jq's; going through
    # it, a corrupt identity.json would be reported as "present and valid".
    # The output is discarded here, so the wrapper's \r stripping is not needed.
    if ! command jq -e . "$IDENTITY" >/dev/null 2>&1; then
        emit identity.parse identity FAIL "identity.json is not valid JSON: $IDENTITY" "Fix the JSON syntax"
        return
    fi
    emit identity.present identity PASS "identity.json present and valid: $IDENTITY"

    # Required fields: collect every manifest-listed field that is absent/empty.
    # f and v are declared local so the loop does not clobber caller globals.
    local missing="" f v
    for f in $(jq -r '.required_identity_fields[]' "$MANIFEST"); do
        v="$(jq -r ".$f // empty" "$IDENTITY" 2>/dev/null)"
        [ -z "$v" ] && missing="$missing $f"
    done
    if [ -n "$missing" ]; then
        emit identity.fields identity WARN "missing/empty identity fields:$missing" \
            "bash .claude/scripts/migrate-identity.sh  (populates machine-specific fields)"
    else
        emit identity.fields identity PASS "all required identity fields present"
    fi

    # --- path fields: identity.json is the map of WHERE things live on this box.
    # It is foundational - every later check trusts claudetools_root / vault_path.
    # Verify they resolve to real locations and that claudetools_root is in fact
    # the repo we are running from (a stale clone path is a silent footgun).
    norm() { # path -> lowercase, forward-slash, drive-letter, no trailing slash
        local p="$1"
        # cygpath only exists on Cygwin/MSYS-style shells; elsewhere p is kept as-is.
        command -v cygpath >/dev/null 2>&1 && p="$(cygpath -m "$p" 2>/dev/null || echo "$p")"
        printf '%s' "$p" | tr 'A-Z' 'a-z' | sed 's#\\#/#g; s#/\{1,\}$##'
    }
    local ctroot; ctroot="$(idfield '.claudetools_root')"
    if [ -z "$ctroot" ]; then
        emit identity.claudetools_root identity FAIL "identity.claudetools_root not set" \
            "Set claudetools_root in identity.json to this repo's absolute path"
    elif [ ! -d "$ctroot" ]; then
        emit identity.claudetools_root identity FAIL "claudetools_root does not exist: $ctroot" \
            "Fix claudetools_root in identity.json (machine moved/renamed the repo?)"
    elif [ "$(norm "$ctroot")" != "$(norm "$REPO_ROOT")" ]; then
        emit identity.claudetools_root identity WARN \
            "claudetools_root ($ctroot) != running repo ($REPO_ROOT)" \
            "Reconcile claudetools_root in identity.json with the repo you actually run from"
    else
        emit identity.claudetools_root identity PASS "claudetools_root resolves to this repo ($ctroot)"
    fi

    local vpath2; vpath2="$(idfield '.vault_path')"
    if [ -z "$vpath2" ]; then
        emit identity.vault_path identity FAIL "identity.vault_path not set (cannot locate the SOPS vault)" \
            "Set vault_path in identity.json to the cloned vault repo path"
    elif [ ! -d "$vpath2" ]; then
        emit identity.vault_path identity FAIL "vault_path does not exist: $vpath2" \
            "Clone the vault repo and set vault_path in identity.json"
    else
        emit identity.vault_path identity PASS "vault_path resolves ($vpath2)"
    fi

    # machine field vs actual hostname (case-insensitive). An empty .machine
    # takes the PASS branch here; the required-fields pass above is what
    # reports it, provided the manifest lists the field.
    local idmach; idmach="$(idfield '.machine')"
    if [ -n "$idmach" ] && [ "$(echo "$idmach" | tr 'A-Z' 'a-z')" != "$(echo "$HOST" | tr 'A-Z' 'a-z')" ]; then
        emit identity.hostname identity WARN "identity.machine='$idmach' != actual hostname '$HOST'" \
            "Update .machine in identity.json (did you clone onto a new box?)"
    else
        emit identity.hostname identity PASS "identity.machine matches hostname ($HOST)"
    fi

    # git config vs identity: only compared when the identity field is non-empty.
    local gn ge idn ide
    gn="$(git -C "$REPO_ROOT" config user.name 2>/dev/null)"
    ge="$(git -C "$REPO_ROOT" config user.email 2>/dev/null)"
    idn="$(idfield '.full_name')"; ide="$(idfield '.email')"
    if [ -n "$idn" ] && [ "$gn" != "$idn" ]; then
        emit identity.git_name identity WARN "git user.name='$gn' != identity.full_name='$idn'" \
            "git config user.name \"$idn\""
    else
        emit identity.git_name identity PASS "git user.name matches identity ($gn)"
    fi
    if [ -n "$ide" ] && [ "$ge" != "$ide" ]; then
        emit identity.git_email identity WARN "git user.email='$ge' != identity.email='$ide'" \
            "git config user.email \"$ide\""
    else
        emit identity.git_email identity PASS "git user.email matches identity ($ge)"
    fi
}

# ---------------------------------------------------------------------------
# CHECK: tooling (required + capability-gated)
# ---------------------------------------------------------------------------
toolver() { # best-effort one-line version
    # First line of "<tool> --version" with stderr hidden; always returns 0,
    # even when the tool is absent or rejects the flag.
    local tool="$1"
    { "$tool" --version 2>/dev/null | head -n 1; } || return 0
}
# Tooling probe in three passes, all driven by the manifest:
#   1. required_tools       - PASS when on PATH, FAIL otherwise
#   2. required_python      - first any_of candidate on PATH wins
#   3. capability_tools     - INFO only (present / absent), never FAIL
check_tools() {
    # Pass 1: required tools (name <TAB> why).
    local tool reason
    while IFS=$'\t' read -r tool reason; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            emit "tool.$tool" tooling FAIL "$tool MISSING (required: $reason)" "Install $tool and ensure it is on PATH"
            continue
        fi
        emit "tool.$tool" tooling PASS "$tool present ($(toolver "$tool"))"
    done < <(jq -r '.required_tools[] | [.name, .why] | @tsv' "$MANIFEST")

    # Pass 2: python - the first listed interpreter found on PATH.
    local interp="" candidate
    for candidate in $(jq -r '.required_python.any_of[]' "$MANIFEST"); do
        command -v "$candidate" >/dev/null 2>&1 || continue
        interp="$candidate"
        break
    done
    if [ -z "$interp" ]; then
        emit tool.python tooling FAIL "no python interpreter found (tried py/python3/python)" "Install Python"
    else
        # WARN only when identity names an interpreter that is not on PATH.
        local wanted; wanted="$(idfield '.python.command')"
        if [ -z "$wanted" ] || command -v "$wanted" >/dev/null 2>&1; then
            emit tool.python tooling PASS "python available ($interp; identity declares '${wanted:-unset}')"
        else
            emit tool.python tooling WARN "identity.python.command='$wanted' not on PATH; '$interp' is available" \
                "Update .python.command in identity.json or re-run migrate-identity.sh"
        fi
    fi

    # Pass 3: capability tools (name <TAB> capability <TAB> why) - informational.
    local cap_tool cap_name cap_reason
    while IFS=$'\t' read -r cap_tool cap_name cap_reason; do
        if ! command -v "$cap_tool" >/dev/null 2>&1; then
            emit "cap.$cap_tool" capability INFO "$cap_tool absent [$cap_name] - capability off ($cap_reason)"
            continue
        fi
        emit "cap.$cap_tool" capability INFO "$cap_tool present [$cap_name] ($(toolver "$cap_tool"))"
    done < <(jq -r '.capability_tools[] | [.name, .capability, .why] | @tsv' "$MANIFEST")
}

# ---------------------------------------------------------------------------
# CHECK: capability tier (ollama) + effective ruleset
# ---------------------------------------------------------------------------
# Resolve the Ollama tier for this machine (local, remote fallback, or none)
# and report the Tier-0 engine the manifest prescribes for that tier.
# Both endpoints are always probed; the local one takes precedence.
check_capability_tier() {
    local endpoint fallback tier engine suffix=""
    local have_local=0 have_remote=0
    endpoint="$(idfield '.ollama.endpoint')"
    fallback="$(idfield '.ollama.fallback')"

    if reachable "http://localhost:11434/api/tags"; then
        have_local=1
    fi
    if [ -n "$fallback" ] && reachable "${fallback%/}/api/tags"; then
        have_remote=1
    fi

    if [ "$have_local" -eq 1 ]; then
        tier="ollama_local"
    elif [ "$have_remote" -eq 1 ]; then
        tier="ollama_remote"
    else
        tier="ollama_none"
    fi
    engine="$(jq -r ".capability_rules.$tier.tier0_engine" "$MANIFEST")"

    case "$tier" in
        ollama_none)
            emit captier.ollama capability WARN "Ollama tier = NONE (local + fallback both unreachable). Effective rule: $engine" \
                "Confirm this machine is meant to run without Ollama; ensure Tier-0 work routes to haiku, not blocked"
            ;;
        *)
            # Call out the mismatch when identity says localhost but only the
            # fallback answered.
            if [ "$tier" = "ollama_remote" ]; then
                case "$endpoint" in
                    *localhost*) suffix=" (identity declares localhost but local is down; using fallback $fallback)" ;;
                esac
            fi
            emit captier.ollama capability PASS "Ollama tier = ${tier#ollama_}${suffix}. Effective Tier-0: $engine"
            ;;
    esac
}

# ---------------------------------------------------------------------------
# CHECK: required scripts + hook files (exist + executable)
# ---------------------------------------------------------------------------
# For every manifest-listed script and hook file (paths relative to the repo
# root): FAIL when missing, WARN when a *.sh / *.template file lacks the
# executable bit, PASS otherwise.
check_files() {
    local entry target
    for entry in $(jq -r '.required_scripts[], .required_hook_files[]' "$MANIFEST"); do
        target="$REPO_ROOT/$entry"
        if [ ! -f "$target" ]; then
            emit "file.$entry" files FAIL "missing: $entry" "Restore via /sync (git pull from Gitea)"
            continue
        fi
        # Only shell scripts and templates are expected to be executable.
        case "$entry" in
            *.sh|*.template)
                if [ ! -x "$target" ]; then
                    emit "file.$entry" files WARN "present but not executable: $entry" "chmod +x \"$entry\""
                    continue
                fi
                ;;
        esac
        emit "file.$entry" files PASS "present: $entry"
    done
}

# ---------------------------------------------------------------------------
# CHECK: settings.json hooks wired correctly
# ---------------------------------------------------------------------------
# Verify that .claude/settings.json exists, parses, and wires every hook the
# manifest requires (an event counts as wired when any hook command under it
# contains the expected substring). Also reports the current-mode marker file.
# Globals read: REPO_ROOT, MANIFEST.
check_settings_hooks() {
    local settings="$REPO_ROOT/.claude/settings.json"
    # Validate with the real jq binary. The file-level jq() wrapper pipes jq
    # through tr, so its exit status may be tr's instead of jq's and an
    # unparseable settings.json would slip past this guard. The output is
    # discarded here, so the wrapper's \r stripping is not needed.
    if [ ! -f "$settings" ] || ! command jq -e . "$settings" >/dev/null 2>&1; then
        emit hooks.settings hooks FAIL "settings.json missing or invalid JSON" "Restore .claude/settings.json via /sync"
        return
    fi
    local ev needle why found
    # NB: omit .matcher from the TSV - an empty middle field collapses under tab
    # IFS (tab is IFS-whitespace), shifting columns. We do not use the matcher here.
    while IFS=$'\t' read -r ev needle why; do
        # Count the hook commands under this event that contain the needle.
        found="$(jq -r --arg ev "$ev" --arg n "$needle" \
            '(.hooks[$ev] // []) | [.[].hooks[]?.command // ""] | map(select(contains($n))) | length' \
            "$settings" 2>/dev/null)"
        # A non-numeric or empty count makes the test fail quietly -> "NOT wired".
        if [ "${found:-0}" -gt 0 ] 2>/dev/null; then
            emit "hook.$ev" hooks PASS "$ev hook wired ($needle)"
        else
            emit "hook.$ev" hooks FAIL "$ev hook NOT wired (expected command containing '$needle' - $why)" \
                "Add the $ev hook to .claude/settings.json (see baseline manifest required_settings_hooks)"
        fi
    done < <(jq -r '.required_settings_hooks[] | [.event, .command_contains, .why] | @tsv' "$MANIFEST")

    # current-mode file (created by UserPromptSubmit hook, but flag if absent)
    if [ -f "$REPO_ROOT/.claude/current-mode" ]; then
        emit hook.current-mode hooks PASS "current-mode present ($(tr -d '[:space:]' < "$REPO_ROOT/.claude/current-mode"))"
    else
        emit hook.current-mode hooks WARN "current-mode missing (auto-created on next prompt)" "echo general > .claude/current-mode"
    fi
}

# ---------------------------------------------------------------------------
# CHECK: git remote + post-commit hooks
# ---------------------------------------------------------------------------
# Check that the repo's 'origin' remote points at the expected host (matched by
# the manifest's host-name substring or internal IP) and, when the manifest
# asks for it, that a post-commit hook file is installed in the main repo.
check_git() {
    local origin_url host_name host_addr hook_expected
    origin_url="$(git -C "$REPO_ROOT" remote get-url origin 2>/dev/null)"
    host_name="$(jq -r '.git.remote_host_contains' "$MANIFEST")"
    host_addr="$(jq -r '.git.remote_host_internal_ip' "$MANIFEST")"

    if [ -z "$origin_url" ]; then
        emit git.remote git WARN "no 'origin' remote on $REPO_ROOT" "git remote add origin <gitea-url>"
    else
        # Fixed-string match against either the host name or the IP.
        if echo "$origin_url" | grep -qF "$host_name" || echo "$origin_url" | grep -qF "$host_addr"; then
            emit git.remote git PASS "origin -> $origin_url"
        else
            emit git.remote git FAIL "origin does not point at ACG Gitea: $origin_url" \
                "git remote set-url origin http://<user>@$host_addr:3000/azcomputerguru/claudetools.git"
        fi
    fi

    hook_expected="$(jq -r '.git.post_commit_hook_expected' "$MANIFEST")"
    [ "$hook_expected" = "true" ] || return 0
    if [ -f "$REPO_ROOT/.git/hooks/post-commit" ]; then
        emit git.post_commit git PASS "main-repo post-commit hook installed"
    else
        emit git.post_commit git WARN "main-repo post-commit hook NOT installed (HOOKS.md mandates dev-alerts hook)" \
            "cp .claude/hooks/post-commit.template .git/hooks/post-commit && chmod +x .git/hooks/post-commit"
    fi
}

# ---------------------------------------------------------------------------
# CHECK: skills + commands conformance vs manifest
# ---------------------------------------------------------------------------
# Compare the skills and slash commands found in the repo with the manifest:
#   - listed and present       -> PASS (a skill dir with no SKILL.md, no *.md
#                                 and no scripts/ subdir is WARNed as empty)
#   - listed but missing       -> FAIL
#   - present but not listed   -> INFO (drift to report, not fail in V1)
# Globals read: REPO_ROOT, MANIFEST.
check_skills_commands() {
    # cmd_file is local so the commands loop does not clobber a caller's global.
    local name dir known cmd_file
    # skills present
    for name in $(jq -r '.skills[]' "$MANIFEST"); do
        dir="$REPO_ROOT/.claude/skills/$name"
        if [ -d "$dir" ]; then
            if [ -f "$dir/SKILL.md" ] || ls "$dir"/*.md >/dev/null 2>&1 || [ -d "$dir/scripts" ]; then
                emit "skill.$name" skills PASS "skill present: $name"
            else
                emit "skill.$name" skills WARN "skill dir present but looks empty: $name" "Restore skill contents via /sync"
            fi
        else
            emit "skill.$name" skills FAIL "skill MISSING: $name" "Restore .claude/skills/$name via /sync"
        fi
    done
    # extra skills not in manifest (drift to report, not fail in V1).
    # known is "|a|b|c|" so a name can be matched as the whole token "|name|".
    known="|$(jq -r '.skills[]' "$MANIFEST" | tr '\n' '|')"
    for dir in "$REPO_ROOT"/.claude/skills/*/; do
        [ -d "$dir" ] || continue   # unmatched glob stays literal - skip it
        name="$(basename "$dir")"
        case "$known" in *"|$name|"*) ;; *) emit "skill.extra.$name" skills INFO "skill present but NOT in baseline: $name (census candidate)" ;; esac
    done

    # commands present
    for name in $(jq -r '.commands[]' "$MANIFEST"); do
        if [ -f "$REPO_ROOT/.claude/commands/$name.md" ]; then
            emit "cmd.$name" commands PASS "command present: /$name"
        else
            emit "cmd.$name" commands FAIL "command MISSING: /$name" "Restore .claude/commands/$name.md via /sync"
        fi
    done
    # extra commands (README.md is documentation, not a command)
    known="|$(jq -r '.commands[]' "$MANIFEST" | tr '\n' '|')"
    for cmd_file in "$REPO_ROOT"/.claude/commands/*.md; do
        [ -f "$cmd_file" ] || continue
        name="$(basename "$cmd_file" .md)"
        [ "$name" = "README" ] && continue
        case "$known" in *"|$name|"*) ;; *) emit "cmd.extra.$name" commands INFO "command present but NOT in baseline: /$name (census candidate)" ;; esac
    done
}

# ---------------------------------------------------------------------------
# CHECK: vault decrypt readiness
# ---------------------------------------------------------------------------
# Vault readiness ladder; each rung stops the check when it fails:
#   vault_path set -> directory exists -> sops and age on PATH ->
#   "vault.sh list" runs cleanly (only attempted when vault.sh is executable).
check_vault() {
    local vault_dir helper
    vault_dir="$(idfield '.vault_path')"
    if [ -z "$vault_dir" ]; then
        emit vault.path vault WARN "identity.vault_path not set" "Set vault_path in identity.json"
        return
    fi
    if [ ! -d "$vault_dir" ]; then
        emit vault.path vault FAIL "vault_path does not exist: $vault_dir" "Clone the vault repo and set vault_path"
        return
    fi
    emit vault.path vault PASS "vault repo present: $vault_dir"

    # Both tools are needed before a listing is attempted.
    if ! { command -v sops >/dev/null 2>&1 && command -v age >/dev/null 2>&1; }; then
        emit vault.tools vault FAIL "sops/age missing - cannot decrypt vault" "Install sops and age"
        return
    fi

    # Lightweight readiness: vault.sh list should enumerate entries without error.
    helper="$REPO_ROOT/.claude/scripts/vault.sh"
    [ -x "$helper" ] || return 0
    if bash "$helper" list >/dev/null 2>&1; then
        emit vault.list vault PASS "vault.sh list succeeded (sops/age wired)"
    else
        emit vault.list vault WARN "vault.sh list failed - check age key + SOPS_AGE_KEY_FILE" \
            "Verify age key at the SOPS recipient path; run: bash .claude/scripts/vault.sh list"
    fi
}

# ---------------------------------------------------------------------------
# CHECK: connectivity
# ---------------------------------------------------------------------------
# Probe each manifest connectivity target (name <TAB> url <TAB> required).
# Reachable -> PASS; unreachable -> FAIL when required, WARN otherwise.
check_connectivity() {
    local label endpoint mandatory
    while IFS=$'\t' read -r label endpoint mandatory; do
        if reachable "$endpoint"; then
            emit "net.$label" connectivity PASS "$label reachable ($endpoint)"
            continue
        fi
        case "$mandatory" in
            true)
                emit "net.$label" connectivity FAIL "$label UNREACHABLE ($endpoint)" "Check VPN/Tailscale/network to 172.16.3.x"
                ;;
            *)
                emit "net.$label" connectivity WARN "$label unreachable ($endpoint) - off-network is OK" ""
                ;;
        esac
    done < <(jq -r '.connectivity[] | [.name, .url, (.required|tostring)] | @tsv' "$MANIFEST")
}

# ---------------------------------------------------------------------------
# CHECK: duplicate command/skill definitions across search roots.
# Claude Code resolves slash commands and skills from BOTH the repo
# (.claude/commands, .claude/skills) and the user profile (~/.claude/...). When
# the same name exists in both with DIFFERENT content, the harness may resolve a
# different one than you expect - the "same /cmd, different behaviour on the Mac"
# bug. Divergent = WARN; identical = INFO (redundant copy that WILL drift).
# ---------------------------------------------------------------------------
# Walk both kinds (commands, skills) and compare every name that exists in the
# repo AND under ~/.claude. Commands are compared by *.md content; skills by
# their SKILL.md. Each divergent pair gets its own WARN; a per-kind summary
# (INFO for identical duplicates, PASS for none) is emitted only when nothing
# diverged.
check_duplicates() {
    local kind repo_side user_side
    for kind in commands skills; do
        repo_side="$REPO_ROOT/.claude/$kind"
        user_side="$HOME/.claude/$kind"
        [ -d "$repo_side" ] || continue
        if [ ! -d "$user_side" ]; then
            emit "dup.$kind" duplicates PASS "no user-level ~/.claude/$kind (single source: repo)"
            continue
        fi

        # Counters restart at zero for each kind.
        local item mine theirs diverged=0 identical=0
        case "$kind" in
            commands)
                for mine in "$repo_side"/*.md; do
                    [ -f "$mine" ] || continue
                    item="$(basename "$mine" .md)"
                    [ "$item" = "README" ] && continue
                    theirs="$user_side/$item.md"
                    [ -f "$theirs" ] || continue
                    [ "$mine" -ef "$theirs" ] && continue   # symlink to the same file - cannot drift
                    if ! same_content "$mine" "$theirs"; then
                        diverged=$((diverged+1))
                        emit "dup.cmd.$item" duplicates WARN \
                            "/$item is DIVERGENT: repo and ~/.claude copies differ (harness may run the wrong one)" \
                            "Reconcile: diff \"$mine\" \"$theirs\"  then make ~/.claude/commands/$item.md match the repo (or remove it)"
                    else
                        identical=$((identical+1))
                    fi
                done
                ;;
            *)
                for mine in "$repo_side"/*/; do
                    [ -d "$mine" ] || continue
                    item="$(basename "$mine")"
                    theirs="$user_side/$item"
                    [ -d "$theirs" ] || continue
                    [ "$mine" -ef "$theirs" ] && continue   # symlinked dir - cannot drift
                    # Only compare when BOTH have a SKILL.md; otherwise not comparable
                    # (script-only / *.md-only skills) - skip rather than miscount.
                    [ -f "$mine/SKILL.md" ] || continue
                    [ -f "$theirs/SKILL.md" ] || continue
                    if ! same_content "$mine/SKILL.md" "$theirs/SKILL.md"; then
                        diverged=$((diverged+1))
                        emit "dup.skill.$item" duplicates WARN \
                            "skill '$item' is DIVERGENT: repo and ~/.claude SKILL.md differ" \
                            "Reconcile ~/.claude/skills/$item with the repo copy (or remove the user-level one)"
                    else
                        identical=$((identical+1))
                    fi
                done
                ;;
        esac

        # The per-item WARNs already tell the story when anything diverged.
        [ "$diverged" -eq 0 ] || continue
        if [ "$identical" -gt 0 ]; then
            emit "dup.$kind" duplicates INFO \
                "$identical $kind exist in BOTH repo and ~/.claude (identical now, but a redundant copy that can drift)" \
                "Consider a single source of truth for $kind to prevent future divergence"
        else
            emit "dup.$kind" duplicates PASS "no duplicate $kind across roots"
        fi
    done
}

# ---------------------------------------------------------------------------
# CHECK: rogue memories that contradict settings/identity.
# Deterministic core only: index integrity + a conservative, manifest-declared
# set of contradiction patterns evaluated against this machine's identity. The
# SEMANTIC contradiction pass (reasoning over all memories vs identity/settings)
# is a judgment task and is delegated to the model in SKILL.md, not grep.
# ---------------------------------------------------------------------------
# Deterministic memory checks:
#   1. the memory directory and its MEMORY.md index exist;
#   2. every other *.md file in the directory is mentioned in the index;
#   3. manifest-declared contradiction patterns, each evaluated only when this
#      machine's identity.<when_field> equals when_equals.
# Globals read: REPO_ROOT, MANIFEST.
check_memory() {
    local mdir="$REPO_ROOT/.claude/memory" idx="$REPO_ROOT/.claude/memory/MEMORY.md"
    if [ ! -d "$mdir" ]; then
        emit memory.dir memory WARN "no .claude/memory directory" "Expected the shared memory store; restore via /sync"
        return
    fi
    if [ ! -f "$idx" ]; then
        emit memory.index memory WARN "MEMORY.md index missing" "Create .claude/memory/MEMORY.md (the loaded index)"
    else
        # orphan detection: every *.md (except MEMORY.md) should be referenced in the index
        local f base orphans=0
        for f in "$mdir"/*.md; do
            [ -f "$f" ] || continue
            base="$(basename "$f")"
            [ "$base" = "MEMORY.md" ] && continue
            if ! grep -qF "$base" "$idx" 2>/dev/null; then
                orphans=$((orphans+1))
            fi
        done
        if [ "$orphans" -gt 0 ]; then
            emit memory.orphans memory WARN "$orphans memory file(s) not referenced in MEMORY.md (orphaned)" \
                "Run /memory-dream or add the missing index lines"
        else
            emit memory.index memory PASS "MEMORY.md index present; no orphaned memory files"
        fi
    fi

    # Manifest-declared contradiction patterns. Each entry:
    #   { when_field, when_equals, grep, why }  - only evaluated when this
    #   machine's identity.<when_field> == when_equals, so a pattern fires only
    #   where it is actually a contradiction (e.g. prescribing python3 on a `py` box).
    #
    # Fields are fetched one at a time by index with `jq -r`, NOT via @tsv:
    # @tsv rewrites backslashes as "\\" (and tabs/newlines as escapes), which
    # silently corrupts any regex containing an escape such as "\.", and an
    # empty field would collapse under tab IFS and shift the columns.
    local total i=0 wf we gx why hits
    # null -> "", everything else rendered as text (true/false/numbers included).
    local as_text='| if . == null then "" else tostring end'
    total="$(jq -r '(.memory.contradiction_patterns // []) | length' "$MANIFEST" 2>/dev/null)"
    case "$total" in ''|*[!0-9]*) total=0 ;; esac   # unreadable count -> no patterns
    while [ "$i" -lt "$total" ]; do
        wf="$(jq -r ".memory.contradiction_patterns[$i].when_field $as_text" "$MANIFEST" 2>/dev/null)"
        we="$(jq -r ".memory.contradiction_patterns[$i].when_equals $as_text" "$MANIFEST" 2>/dev/null)"
        gx="$(jq -r ".memory.contradiction_patterns[$i].grep $as_text" "$MANIFEST" 2>/dev/null)"
        why="$(jq -r ".memory.contradiction_patterns[$i].why $as_text" "$MANIFEST" 2>/dev/null)"
        i=$((i+1))
        # An empty pattern would match every file; treat the entry as malformed.
        { [ -n "$wf" ] && [ -n "$gx" ]; } || continue
        [ "$(idfield ".$wf")" = "$we" ] || continue
        # -e keeps a pattern that starts with "-" from being parsed as an option.
        hits="$(grep -rliE -e "$gx" -- "$mdir" 2>/dev/null | grep -vF 'MEMORY.md' | head -5 | tr '\n' ' ')"
        [ -n "$hits" ] || continue
        emit "memory.contradiction.$wf" memory WARN \
            "memory may contradict identity.$wf=$we ($why): $hits" \
            "Review the listed memory file(s); correct or delete if they prescribe the wrong behaviour for this machine"
    done
}

# ---------------------------------------------------------------------------
# Build the census JSON from accumulated results
# ---------------------------------------------------------------------------
# Fold the accumulated per-check results into the census document on stdout.
# Grade: RED when any result is FAIL, else AMBER when any is WARN, else GREEN.
# Globals read: RESULTS_FILE, MANIFEST, HOST, SESSION, PLATFORM, ARCH, RUN_TS.
build_census() {
    local n_fail n_warn verdict schema
    n_fail="$(jq -s '[.[]|select(.status=="FAIL")]|length' "$RESULTS_FILE")"
    n_warn="$(jq -s '[.[]|select(.status=="WARN")]|length' "$RESULTS_FILE")"
    if [ "$n_fail" -gt 0 ]; then
        verdict="RED"
    elif [ "$n_warn" -gt 0 ]; then
        verdict="AMBER"
    else
        verdict="GREEN"
    fi
    schema="$(jq -r '.schema_version' "$MANIFEST")"

    # Slurp every result object into one array and wrap it with the metadata.
    jq -s \
        --arg host "$HOST" --arg session "$SESSION" --arg platform "$PLATFORM" --arg arch "$ARCH" \
        --arg grade "$verdict" --arg ts "$RUN_TS" \
        --arg mver "$schema" \
        '{
            host:$host, session:$session, platform:$platform, arch:$arch,
            grade:$grade, generated_at:$ts, manifest_version:$mver,
            summary: { pass:([.[]|select(.status=="PASS")]|length),
                       warn:([.[]|select(.status=="WARN")]|length),
                       fail:([.[]|select(.status=="FAIL")]|length),
                       info:([.[]|select(.status=="INFO")]|length) },
            results: .
         }' "$RESULTS_FILE"
}

# ---------------------------------------------------------------------------
# Human report
# ---------------------------------------------------------------------------
# Render the census ($1, JSON text) as a human-readable report on stdout:
# a banner with grade and counts, then failures, warnings and info lines,
# then a per-category tally of passing checks.
print_report() {
    local census="$1" grade tallies schema rule
    rule="============================================================"
    grade="$(echo "$census" | jq -r .grade)"
    tallies="$(echo "$census" | jq -r '.summary | "PASS \(.pass)  WARN \(.warn)  FAIL \(.fail)  INFO \(.info)"')"
    schema="$(echo "$census" | jq -r .manifest_version)"

    printf '\n'
    printf '%s\n' "$rule"
    printf '%s\n' " ClaudeTools self-check - $HOST ($PLATFORM/$ARCH)"
    printf '%s\n' " Grade: $grade   $tallies"
    printf '%s\n' " Manifest: $schema (provisional)   $RUN_TS"
    printf '%s\n' "$rule"
    # FAIL then WARN then INFO; PASS summarized per category
    echo "$census" | jq -r '
        def mark(s): if s=="FAIL" then "[FAIL]" elif s=="WARN" then "[WARN]"
                     elif s=="INFO" then "[INFO]" elif s=="SKIP" then "[SKIP]" else "[ OK ]" end;
        (.results | map(select(.status=="FAIL"))) as $f
        | (.results | map(select(.status=="WARN"))) as $w
        | (.results | map(select(.status=="INFO"))) as $i
        | (if ($f|length)>0 then "\nFAILURES:" else empty end),
          ($f[] | "  [FAIL] \(.category)/\(.id): \(.detail)" + (if .fix!="" then "\n         fix: \(.fix)" else "" end)),
          (if ($w|length)>0 then "\nWARNINGS:" else empty end),
          ($w[] | "  [WARN] \(.category)/\(.id): \(.detail)" + (if .fix!="" then "\n         fix: \(.fix)" else "" end)),
          (if ($i|length)>0 then "\nINFO / capability:" else empty end),
          ($i[] | "  [INFO] \(.detail)")
    '
    # per-category PASS counts
    printf '\n'
    printf '%s\n' "PASS by category:"
    echo "$census" | jq -r '.results | map(select(.status=="PASS")) | group_by(.category)[] | "  \(.[0].category): \(length) ok"'
    printf '%s\n' "$rule"
}

# ---------------------------------------------------------------------------
# Publish census to coord API.
# The coord API uses the PATH-PARAM form: PUT /api/coord/components/{pk}/{comp}
# with a body of {state, version, notes, updated_by} (the body form 405s).
# The component segment must be slash-free (a slash 404s, even URL-encoded), so
# the per-machine component is "selfcheck_<host>" (NOT "selfcheck/<host>").
# ---------------------------------------------------------------------------
COMPONENT="selfcheck_$HOST"
# PUT the census ($1, JSON text) to the coord API as this machine's component.
# The grade becomes .state and the compacted census is carried in .notes.
# When the PUT does not succeed the same path/body is appended to
# .claude/coord-queue.jsonl so a later drain can replay it.
# Globals read: API, COMPONENT, MANIFEST, SESSION, REPO_ROOT, RUN_TS.
publish_census() {
    local census="$1" grade compact body path rc=0
    grade="$(echo "$census" | jq -r .grade)"
    compact="$(echo "$census" | jq -c .)"
    path="/api/coord/components/claudetools/$COMPONENT"
    body="$(jq -nc --arg state "$grade" \
        --arg ver "$(jq -r '.schema_version' "$MANIFEST")" --arg notes "$compact" --arg by "$SESSION" \
        '{state:$state, version:$ver, notes:$notes, updated_by:$by}')"
    # -f (--fail): without it curl exits 0 even when the server answers with an
    # HTTP error status (e.g. the 404/405 cases described for this endpoint),
    # and a rejected census would be announced as published.
    curl -sf -m 8 -X PUT "$API$path" -H "Content-Type: application/json" -d "$body" >/dev/null 2>&1 || rc=$?
    if [ "$rc" -eq 0 ]; then
        echo "[OK] Published census to coord: component $COMPONENT = $grade"
        return 0
    fi
    # softfail per coordination protocol - queue the SAME path/body so a
    # later /sync drain replays a request that actually works.
    local q="$REPO_ROOT/.claude/coord-queue.jsonl"
    jq -nc --arg path "$path" --argjson b "$body" --arg ts "$RUN_TS" \
        '{ts:$ts, method:"PUT", path:$path, body:$b}' >> "$q" 2>/dev/null
    if [ "$rc" -eq 22 ]; then
        # curl's exit code 22 is what --fail reports for an HTTP error response.
        echo "[WARN] coord rejected the census (HTTP error); census queued to .claude/coord-queue.jsonl"
    else
        echo "[WARN] coord unreachable; census queued to .claude/coord-queue.jsonl"
    fi
}

# ---------------------------------------------------------------------------
# Subcommand: fanout - request all instances to run /self-check --publish
# ---------------------------------------------------------------------------
# Broadcast a message to ALL_SESSIONS through the coord API asking every
# instance to run /self-check, remediate locally and publish its census.
# Exits the script with status 1 when the broadcast does not succeed.
# Globals read: API, SESSION, RUN_TS.
do_fanout() {
    local subj body payload
    subj="[self-check] Fleet census + self-remediation request"
    body="On THIS machine: (1) run /self-check ; (2) apply the suggested fix commands it prints for any FAIL/WARN - fix your OWN machine, locally, with your operator present (nobody fixes you remotely) ; (3) re-run /self-check to confirm GREEN ; (4) run /self-check --publish to report your census (component selfcheck_<host>) to coord. The check is read-only; only --publish writes (your census only). Requested by $SESSION at $RUN_TS."
    payload="$(jq -nc --arg from "$SESSION" --arg subj "$subj" --arg body "$body" \
        '{from_session:$from, to_session:"ALL_SESSIONS", project_key:"claudetools", subject:$subj, body:$body}')"
    # -f (--fail): without it curl exits 0 even when the server answers with an
    # HTTP error status, and a rejected broadcast would be reported as sent.
    if curl -sf -m 8 -X POST "$API/api/coord/messages" -H "Content-Type: application/json" -d "$payload" >/dev/null 2>&1; then
        echo "[OK] Broadcast census request to ALL_SESSIONS."
    else
        echo "[ERROR] Failed to broadcast (coord unreachable or request rejected)." >&2
        exit 1
    fi
}

# ---------------------------------------------------------------------------
# Subcommand: aggregate - read all published censuses, build fleet view
# ---------------------------------------------------------------------------
# do_aggregate
#   Fetch every component row for project "claudetools" from coord, keep the
#   selfcheck* rows whose .notes parse as a JSON object (the published
#   census), and print: a per-machine table, the tools that PASS on all vs
#   only some machines, and a per-machine list of fix commands for every
#   machine whose grade is not GREEN.
#   Globals read: API.
#   Exits the script with status 1 when coord cannot be read; returns
#   normally (after a hint) when no census has been published yet.
do_aggregate() {
    local comps censuses n needfix
    # --fail: an HTTP 4xx/5xx answer yields no body and a non-zero exit, so a
    # server-side error is reported as an error, not as "no censuses yet".
    comps="$(curl -s --fail -m 8 "$API/api/coord/components?project_key=claudetools" 2>/dev/null)"
    if [ -z "$comps" ]; then
        echo "[ERROR] coord unreachable or returned an HTTP error." >&2
        exit 1
    fi
    # The coord API returns {states:[...], total:N}; each row's grade is .state and
    # the full census JSON is in .notes. Keep selfcheck_* rows with parseable notes.
    # (.components / bare-array kept as defensive fallbacks.) Notes that parse
    # to something other than an object are dropped: they cannot be a census.
    censuses="$(printf '%s\n' "$comps" | jq -c '
        ( .states? // .components? // (if type=="array" then . else [] end) ) as $rows
        | ($rows // [])
        | map(select((.component? // "") | startswith("selfcheck")))
        | map(.notes | (try fromjson catch empty) | select(type=="object"))
    ' 2>/dev/null)"
    n="$(printf '%s\n' "$censuses" | jq 'length' 2>/dev/null || echo 0)"
    # Anything that is not a plain non-negative integer counts as "none".
    case "$n" in ''|*[!0-9]*) n=0 ;; esac
    if [ "$n" -eq 0 ]; then
        echo "No published censuses found yet. Run 'self-check.sh fanout', then have each machine run /self-check --publish."
        return
    fi
    echo "============================================================"
    echo " Fleet census: $n machine(s) reporting"
    echo "============================================================"
    # Prefer an aligned table; fall back to plain spacing when `column` is
    # missing or fails.
    printf '%s\n' "$censuses" | jq -r '.[] | "  \(.grade)\t\(.host)\t\(.platform)/\(.arch)\tP\(.summary.pass) W\(.summary.warn) F\(.summary.fail)\t\(.generated_at)"' | column -t -s$'\t' 2>/dev/null \
        || printf '%s\n' "$censuses" | jq -r '.[] | "  \(.grade)  \(.host)  \(.platform)/\(.arch)  P\(.summary.pass) W\(.summary.warn) F\(.summary.fail)"'

    echo ""
    echo "Proposed baseline (intersection = required everywhere; symmetric diff = capability-gated):"
    # Tools present on every machine vs only some, derived from tool.* PASS results.
    # A census without .results contributes an empty tool list instead of
    # aborting the whole summary; ltrimstr avoids needing regex support in jq.
    printf '%s\n' "$censuses" | jq -r '
        [ .[] | { host:.host, tools:( (.results // []) | map(select((.id|startswith("tool."))) | select(.status=="PASS") | (.id|ltrimstr("tool."))) ) } ] as $m
        | ($m|length) as $count
        | ([ $m[].tools[] ] | unique) as $all
        | "  tools on ALL \($count): " + ( [ $all[] | . as $t | select( ([ $m[] | select(.tools|index($t)) ]|length) == $count ) ] | join(", ") ),
          "  tools on SOME only:    " + ( [ $all[] | . as $t | select( ([ $m[] | select(.tools|index($t)) ]|length) <  $count ) ] | join(", ") )
    ' 2>/dev/null
    echo ""
    echo "Machines that must self-remediate (RED/AMBER) - each fixes ITSELF, then re-runs + re-publishes:"
    # Results with a missing/null/empty .fix carry no command and are skipped
    # (a bare `.fix != ""` would let null through and print "- null").
    needfix="$(printf '%s\n' "$censuses" | jq -r '
        .[] | select(.grade!="GREEN")
        | "  \(.host) [\(.grade)] should run, in order:\n"
          + ( [ (.results // [])[] | select(.status=="FAIL" or .status=="WARN") | select((.fix // "") != "")
                | "      - \(.fix)" ] | join("\n") )
          + "\n      then: /self-check --publish"
    ' 2>/dev/null)"
    if [ -n "$needfix" ]; then
        echo "$needfix"
    else
        echo "  (none - whole fleet is GREEN)"
    fi
    echo "============================================================"
    echo "We do NOT fix remote machines. Relay each machine's fix list to its operator;"
    echo "they self-remediate locally, re-run /self-check, and re-publish until GREEN."
    echo "Once the fleet is reporting consistently, ratify baseline/manifest.json with Mike."
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
# Timestamp for this run: the caller normally supplies a real UTC stamp via
# SELFCHECK_TS (see SKILL.md); when it is unset or empty, ask `date`, and
# settle for the literal "unknown" if that fails too.
if [ -n "${SELFCHECK_TS:-}" ]; then
    RUN_TS="$SELFCHECK_TS"
else
    RUN_TS="$(date -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || echo unknown)"
fi

MODE="${1:-report}"

# Fleet-level subcommands do not probe this machine; handle them and stop.
if [ "$MODE" = "fanout" ]; then
    do_fanout
    exit 0
elif [ "$MODE" = "aggregate" ]; then
    do_aggregate
    exit 0
fi

# Every other mode needs the full set of local probes, in this order.
check_identity
check_tools
check_capability_tier
check_files
check_settings_hooks
check_git
check_skills_commands
check_duplicates
check_memory
check_vault
check_connectivity

CENSUS="$(build_census)"

# --json emits the raw census only; every other mode prints the human report,
# and --publish additionally pushes the census to coord.
if [ "$MODE" = "--json" ]; then
    echo "$CENSUS"
else
    print_report "$CENSUS"
    if [ "$MODE" = "--publish" ]; then
        publish_census "$CENSUS"
    fi
fi

# Exit status mirrors the grade for scripting: AMBER -> 1, RED -> 2,
# GREEN (or anything unrecognised) -> 0.
GR="$(echo "$CENSUS" | jq -r .grade)"
if [ "$GR" = "AMBER" ]; then
    exit 1
fi
if [ "$GR" = "RED" ]; then
    exit 2
fi
exit 0