harness: fleet-wide functional-error + correction + friction logging

Add .claude/scripts/log-skill-error.sh — the canonical agent error log helper (writes errorlog.md in DATE | MACHINE | skill | [type] error format, soft-fails). Three categories: execution failures (default), user corrections (--correction), and preventable self-inflicted friction (--friction; cite ref= when it repeats a documented gotcha). Goal: stop paying tokens twice for the same avoidable mistake. - CLAUDE.md: make logging mandatory for all skills + corrections + friction. - skill-creator: new skills must wire in the helper (guidance + checklist). - Retrofit every skill script's genuine failure branches to call the helper (b2/bitdefender/mailprotector/packetdial/coord python CLIs; remediation-tool + onboard365 bash; vault, rmm-auth, post-bot-alert, agy, grok, 1password, run-onboarding-diagnostic). Handled conditions + self-tests left alone. - errorlog.md: broaden header to cover skills + harness + corrections; seed this session's corrections (INKY, Mail.Send token-audience, omnibox-strictness) and friction (git-bash /tmp, env-persistence, argv-limit, PowerShell var-case). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-15 11:39:43 -07:00
parent 927a06a0cf
commit 9960da5f9a
29 changed files with 388 additions and 36 deletions
--- a/.claude/scripts/discord-dm.sh
+++ b/.claude/scripts/discord-dm.sh
@@ -78,7 +78,11 @@ if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
  ENV_FILE="$ROOT/projects/discord-bot/.env"
  [ -f "$ENV_FILE" ] && TOKEN="$(grep -iE '^[[:space:]]*DISCORD_TOKEN[[:space:]]*=' "$ENV_FILE" | head -1 | sed -E 's/^[^=]*=[[:space:]]*//; s/^["'"'"']//; s/["'"'"'][[:space:]]*$//')"
 fi
-if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then echo "[ERROR] no bot token (vault + .env both empty)" >&2; exit 2; fi
+if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
+  echo "[ERROR] no bot token (vault + .env both empty)" >&2
+  bash "$ROOT/.claude/scripts/log-skill-error.sh" "discord-dm" "no Discord bot token (vault projects/discord-bot/bot-token + .env both empty)" >/dev/null 2>&1
+  exit 2
+fi

 auth=(-H "Authorization: Bot ${TOKEN}" -H "Content-Type: application/json" -H "User-Agent: ${UA}")

@@ -87,7 +91,11 @@ if [ "$MODE" = "dm" ]; then
  DM="$(printf '%s' "$(jq -nc --arg r "$TARGET" '{recipient_id:$r}')" | \
    curl -s -m 15 "${auth[@]}" -X POST "$API/users/@me/channels" --data-binary @-)"
  CHID="$(printf '%s' "$DM" | jq -r '.id // empty' 2>/dev/null)"
-  if [ -z "$CHID" ]; then echo "[ERROR] could not open DM channel for $LABEL: $DM" >&2; exit 3; fi
+  if [ -z "$CHID" ]; then
+    echo "[ERROR] could not open DM channel for $LABEL: $DM" >&2
+    bash "$ROOT/.claude/scripts/log-skill-error.sh" "discord-dm" "failed to open DM channel for $LABEL" --context "resp=${DM:0:80}" >/dev/null 2>&1
+    exit 3
+  fi
  TARGET="$CHID"
 fi

@@ -103,4 +111,5 @@ if [ "$HTTP" = "200" ]; then
  exit 0
 fi
 echo "[ERROR] discord-dm: Discord returned ${HTTP:-no-response} — ${BODY}" >&2
+bash "$ROOT/.claude/scripts/log-skill-error.sh" "discord-dm" "Discord send to $LABEL failed" --context "http=${HTTP:-none} resp=${BODY:0:80}" >/dev/null 2>&1
 exit 3
--- a/.claude/scripts/log-skill-error.sh
+++ b/.claude/scripts/log-skill-error.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+# log-skill-error.sh — append an entry to errorlog.md in the canonical format,
+# for later linting that feeds skill fixes, CLAUDE.md rules, and memory cleanup.
+#
+# Despite the name this is the GENERAL agent error/correction/friction log — it
+# captures three things (see --type below):
+#   1. skill/command FUNCTIONAL failures (API/auth/unexpected-response/bad-exit)
+#   2. user CORRECTIONS of an improper assumption I made (--correction)
+#   3. preventable self-inflicted FRICTION that wasted tokens (--friction) —
+#      harness/env/tool misuse, ESPECIALLY a repeat of an already-documented
+#      gotcha (that means a rule or memory isn't working and needs strengthening)
+#
+# Do NOT call it for expected/handled conditions (a search with no matches, a
+# "no unread messages", a user declining a prompt) — only real, preventable,
+# pattern-worthy events.
+#
+# Usage:
+#   bash log-skill-error.sh <skill-or-command> "<brief error>"
+#   echo "<brief error>" | bash log-skill-error.sh <skill-or-command>
+#   bash log-skill-error.sh <skill> "<error>" --context "op=send id=123 http=403"
+#   bash log-skill-error.sh <skill/context> "<what I wrongly assumed + the correction>" --correction
+#
+# Categories (all feed the lint that improves skills, CLAUDE.md, and memory):
+#   (default) execution failure — API/auth failure, unexpected response, bad exit.
+#   --correction — the USER corrected an improper assumption/approach I made.
+#   --friction   — preventable self-inflicted error that wasted tokens (harness/env/
+#                  tool misuse). If it repeats a documented gotcha, note it in
+#                  --context (e.g. ref=feedback_tmp_path_windows) — that's the signal
+#                  a rule/memory needs strengthening.
+#   (--type <other> also supported; tags the error column as [<type>].)
+#   bash log-skill-error.sh <context> "<what wasted tokens + the fix>" --friction --context "ref=<memory>"
+#
+# Writes:  YYYY-MM-DD | MACHINE | <skill> | [<type>] <error> [ctx: <context>]
+# (newest entry inserted at the top, just under the append marker).
+#
+# Soft-fail by design: this NEVER breaks the caller. Missing log or empty message
+# -> prints a [WARN] to stderr and exits 0; missing jq just falls back to `hostname`.
+set -u
+ROOT="${CLAUDETOOLS_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
+
+SKILL="${1:-unknown}"; shift || true
+CONTEXT=""
+ETYPE=""          # "" / exec = execution failure; "correction" / "friction" / any --type value = tagged
+ARGS=()
+while [ $# -gt 0 ]; do   # value flags shift by 1 when the value is missing: a failed `shift 2` would loop forever
+  case "$1" in
+    --context) CONTEXT="${2:-}"; shift $(( $# > 1 ? 2 : 1 ));;
+    --type) ETYPE="${2:-}"; shift $(( $# > 1 ? 2 : 1 ));;
+    --correction) ETYPE="correction"; shift;;
+    --friction) ETYPE="friction"; shift;;
+    *) ARGS+=("$1"); shift;;
+  esac
+done
+MSG="${ARGS[*]:-}"
+if [ -z "$MSG" ] && [ ! -t 0 ]; then MSG="$(cat)"; fi
+if [ -z "$MSG" ]; then echo "[WARN] log-skill-error: empty message, nothing logged" >&2; exit 0; fi
+
+LOG="$ROOT/errorlog.md"
+if [ ! -f "$LOG" ]; then echo "[WARN] log-skill-error: $LOG not found" >&2; exit 0; fi
+
+DATE="$(date -u +%F)"
+IDF="$ROOT/.claude/identity.json"
+MACHINE=""
+if command -v jq >/dev/null 2>&1 && [ -f "$IDF" ]; then
+  MACHINE="$(jq -r '.machine_name // .hostname // empty' "$IDF" 2>/dev/null)"
+fi
+[ -z "$MACHINE" ] && MACHINE="$(hostname 2>/dev/null || echo unknown)"
+
+# fold the context in FIRST (response snippets may hold newlines), then normalize so each entry is one line
+[ -n "$CONTEXT" ] && MSG="$MSG [ctx: $CONTEXT]"
+MSG="$(printf '%s' "$MSG" | tr '\n' ' ' | sed 's/[[:space:]]\{1,\}/ /g; s/^ //; s/ $//')"
+# Tag non-execution categories at the start of the error column for easy linting
+# (e.g. grep "\[correction\]" errorlog.md to surface improper-assumption patterns).
+if [ -n "$ETYPE" ] && [ "$ETYPE" != "exec" ]; then MSG="[$ETYPE] $MSG"; fi
+ENTRY="$DATE | $MACHINE | $SKILL | $MSG"
+# Entry/marker reach awk via ENVIRON: `awk -v` would interpret backslash escapes (e.g. C:\temp).
+MARK="<!-- Append entries below this line -->"
+TMP="$LOG.tmp.$$"
+if ENTRY="$ENTRY" MARK="$MARK" awk '
+  { print }
+  ($0==ENVIRON["MARK"] && !done) { print ""; print ENVIRON["ENTRY"]; done=1 }
+  END { if (!done) { print ""; print ENVIRON["ENTRY"] } }   # marker missing -> append at end
+' "$LOG" > "$TMP" 2>/dev/null && mv "$TMP" "$LOG" 2>/dev/null; then
+  echo "[OK] logged skill error to errorlog.md ($SKILL)"
+else
+  rm -f "$TMP" 2>/dev/null
+  echo "[WARN] log-skill-error: could not write $LOG" >&2
+fi
+exit 0
--- a/.claude/scripts/post-bot-alert.sh
+++ b/.claude/scripts/post-bot-alert.sh
@@ -86,4 +86,8 @@ if [ "$HTTP" = "200" ]; then
 fi

 echo "[WARNING] post-bot-alert: Discord returned ${HTTP:-no-response} — ${BODY}" >&2
+# Log the Discord POST failure (non-200 / unreachable) once. Do NOT route this
+# through post-bot-alert itself — that would recurse; log-skill-error.sh only
+# writes to errorlog.md. Soft-fail preserved: this never changes the exit 0.
+bash "$ROOT/.claude/scripts/log-skill-error.sh" "post-bot-alert" "Discord POST failed (non-200/unreachable)" --context "channel=${CHANNEL_NAME} http=${HTTP:-none} resp=${BODY:0:80}" >/dev/null 2>&1 || true
 exit 0
--- a/.claude/scripts/rmm-auth.sh
+++ b/.claude/scripts/rmm-auth.sh
@@ -11,19 +11,27 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 IDENTITY_FILE="$REPO_ROOT/.claude/identity.json"

+# Functional-error logger. MUST stay silent on stdout (this script's stdout is
+# eval'd by the caller) — log-skill-error.sh prints an [OK] line to stdout on
+# success, so redirecting everything to /dev/null here is required, not optional.
+_logerr() { bash "$REPO_ROOT/.claude/scripts/log-skill-error.sh" "rmm-auth" "$@" >/dev/null 2>&1 || true; }
+
 if [ ! -f "$IDENTITY_FILE" ]; then
+    _logerr "identity.json not found; RMM auth cannot resolve vault" --context "path=$IDENTITY_FILE"
    echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] identity.json not found' >&2"
    exit 1
 fi

 VAULT_PATH=$(jq -r '.vault_path // empty' "$IDENTITY_FILE")
 if [ -z "$VAULT_PATH" ]; then
+    _logerr "vault_path not in identity.json; RMM auth failed" --context "path=$IDENTITY_FILE"
    echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] vault_path not in identity.json' >&2"
    exit 1
 fi

 VAULT_SH="$VAULT_PATH/scripts/vault.sh"
 if [ ! -f "$VAULT_SH" ]; then
+    _logerr "vault.sh not found at resolved vault_path; RMM auth failed" --context "path=$VAULT_SH"
    echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] vault.sh not found at $VAULT_SH' >&2"
    exit 1
 fi
@@ -35,6 +43,7 @@ RMM_EMAIL=$(bash "$VAULT_SH" get-field infrastructure/gururmm-server.sops.yaml c
 RMM_PASS=$(bash "$VAULT_SH" get-field infrastructure/gururmm-server.sops.yaml credentials.gururmm-api.admin-password 2>/dev/null)

 if [ -z "$RMM_EMAIL" ] || [ -z "$RMM_PASS" ]; then
+    _logerr "vault read of GuruRMM API credentials failed (empty email/password)" --context "entry=infrastructure/gururmm-server.sops.yaml"
    echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] Failed to get RMM credentials from vault' >&2"
    exit 1
 fi
@@ -45,6 +54,7 @@ JWT=$(curl -s -X POST "$RMM_URL/api/auth/login" -H "Content-Type: application/js
 TOKEN=$(echo "$JWT" | jq -r '.token // empty')

 if [ -z "$TOKEN" ]; then
+    _logerr "RMM login failed (no token returned from /api/auth/login)" --context "url=$RMM_URL resp=${JWT:0:80}"
    echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] RMM login failed: $JWT' >&2"
    exit 1
 fi
--- a/.claude/scripts/rmm-search.sh
+++ b/.claude/scripts/rmm-search.sh
@@ -37,9 +37,17 @@ if [ -z "$QUERY" ] && [ -z "$CLIENT" ] && [ "$LISTC" -eq 0 ]; then
 fi

 eval "$(bash "$ROOT/.claude/scripts/rmm-auth.sh" 2>/dev/null)" >/dev/null
-if [ -z "${TOKEN:-}" ] || [ -z "${RMM:-}" ]; then echo "[ERROR] RMM auth failed (see rmm-auth.sh)" >&2; exit 1; fi
+if [ -z "${TOKEN:-}" ] || [ -z "${RMM:-}" ]; then
+  echo "[ERROR] RMM auth failed (see rmm-auth.sh)" >&2
+  bash "$ROOT/.claude/scripts/log-skill-error.sh" "rmm-search" "RMM auth failed via rmm-auth.sh (no TOKEN/RMM)" >/dev/null 2>&1
+  exit 1
+fi
 AGENTS=$(curl -s "$RMM/api/agents" -H "Authorization: Bearer $TOKEN")
-if [ -z "$AGENTS" ] || [ "${AGENTS:0:1}" != "[" ]; then echo "[ERROR] could not fetch agents: ${AGENTS:0:160}" >&2; exit 1; fi
+if [ -z "$AGENTS" ] || [ "${AGENTS:0:1}" != "[" ]; then
+  echo "[ERROR] could not fetch agents: ${AGENTS:0:160}" >&2
+  bash "$ROOT/.claude/scripts/log-skill-error.sh" "rmm-search" "GET /api/agents returned non-array/empty" --context "resp=${AGENTS:0:80}" >/dev/null 2>&1
+  exit 1
+fi

 # Pipe agents on stdin (payload too large for argv on Windows); flags via env.
 printf '%s' "$AGENTS" | QUERY="$QUERY" CLIENT="$CLIENT" ONLINE="$ONLINE" JSON="$JSON" LISTC="$LISTC" LIMIT="$LIMIT" \
--- a/.claude/scripts/run-onboarding-diagnostic.sh
+++ b/.claude/scripts/run-onboarding-diagnostic.sh
@@ -37,6 +37,11 @@ PROBE="$SCRIPT_DIR/onboarding-diagnostic.ps1"
 ALERT="$REPO_ROOT/.claude/scripts/post-bot-alert.sh"
 RMM="http://172.16.3.30:3001"

+# Functional-error logger (skill name "rmm-diagnose"). Logs genuine operational
+# failures (auth, vault, dispatch) — NOT the RED/AMBER/GREEN diagnostic grade,
+# which is a normal by-design result. Soft-fails; never breaks the run.
+_logerr() { bash "$REPO_ROOT/.claude/scripts/log-skill-error.sh" "rmm-diagnose" "$@" >/dev/null 2>&1 || true; }
+
 if [ ! -f "$PROBE" ]; then
    echo "[ERROR] Probe script not found: $PROBE" >&2
    exit 1
@@ -65,6 +70,7 @@ RMM_PASS="$(bash "$VAULT" get-field infrastructure/gururmm-server.sops.yaml cred

 if [ -z "$RMM_EMAIL" ] || [ -z "$RMM_PASS" ] || [ "$RMM_EMAIL" = "null" ]; then
    echo "[ERROR] Could not read GuruRMM credentials from vault (infrastructure/gururmm-server.sops.yaml)" >&2
+    _logerr "vault read of GuruRMM credentials failed (empty/null)" --context "entry=infrastructure/gururmm-server.sops.yaml"
    exit 1
 fi

@@ -75,6 +81,7 @@ TOKEN="$(curl -s -m 30 -X POST "$RMM/api/auth/login" \

 if [ -z "$TOKEN" ]; then
    echo "[ERROR] RMM login failed (no token returned)" >&2
+    _logerr "RMM login failed (no token returned from /api/auth/login)" --context "url=$RMM"
    exit 1
 fi
 echo "[OK] Authenticated to GuruRMM"
@@ -85,6 +92,7 @@ echo "[OK] Authenticated to GuruRMM"
 AGENTS="$(curl -s -m 30 "$RMM/api/agents" -H "Authorization: Bearer $TOKEN")"
 if [ -z "$AGENTS" ] || ! echo "$AGENTS" | jq -e 'type=="array"' >/dev/null 2>&1; then
    echo "[ERROR] Could not retrieve agent list" >&2
+    _logerr "GET /api/agents returned non-array/empty" --context "resp=${AGENTS:0:80}"
    exit 1
 fi

@@ -263,6 +271,7 @@ PS
    CH_STATUS="$(echo "$CH_RESULT" | jq -r '.status')"
    if [ "$CH_STATUS" != "completed" ]; then
        echo "[ERROR] Chunk $IDX upload failed: status=$CH_STATUS stderr=$(echo "$CH_RESULT" | jq -r '.stderr' | head -c 200)" >&2
+        _logerr "probe chunk upload failed" --context "host=$AGENT_HOST idx=$IDX/$N_CHUNKS status=$CH_STATUS"
        exit 1
    fi
    echo "[OK] Uploaded chunk $IDX/$N_CHUNKS"
@@ -288,7 +297,7 @@ try {
 }
 PS

-RESULT="$(dispatch_one "$RUN_SCRIPT" "$EXEC_TIMEOUT")" || { echo "[ERROR] Probe execution dispatch failed" >&2; exit 1; }
+RESULT="$(dispatch_one "$RUN_SCRIPT" "$EXEC_TIMEOUT")" || { echo "[ERROR] Probe execution dispatch failed" >&2; _logerr "probe execution dispatch failed" --context "host=$AGENT_HOST agent=$AGENT_ID"; exit 1; }
 CMD_ID="$(cat "$WORK_DIR/last_cmd_id" 2>/dev/null || echo unknown)"

 FINAL_STATUS="$(echo "$RESULT" | jq -r '.status // empty')"
@@ -317,6 +326,7 @@ if [ -z "$DIAG_JSON" ] || ! echo "$DIAG_JSON" | jq -e '.host' >/dev/null 2>&1; t
    fi
    echo "--- stdout (first 60 lines) ---" >&2
    printf '%s\n' "$STDOUT" | head -60 >&2
+    _logerr "could not extract valid diagnostic JSON from probe output" --context "host=$AGENT_HOST status=$FINAL_STATUS exit=$EXIT_CODE"
    exit 1
 fi

--- a/.claude/scripts/vault.sh
+++ b/.claude/scripts/vault.sh
@@ -16,9 +16,12 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 CLAUDETOOLS_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
 IDENTITY_FILE="$CLAUDETOOLS_ROOT/.claude/identity.json"

+_logerr() { bash "$CLAUDETOOLS_ROOT/.claude/scripts/log-skill-error.sh" "vault" "$@" >/dev/null 2>&1 || true; }
+
 if [[ ! -f "$IDENTITY_FILE" ]]; then
    echo "[ERROR] .claude/identity.json not found at $IDENTITY_FILE" >&2
    echo "        Run onboarding to create it, or add vault_path manually." >&2
+    _logerr "identity.json not found; vault read cannot resolve vault_path" --context "path=$IDENTITY_FILE"
    exit 1
 fi

@@ -40,6 +43,7 @@ fi
 if [[ -z "$VAULT_ROOT" ]]; then
    echo "[ERROR] vault_path not set in $IDENTITY_FILE" >&2
    echo "        Add: \"vault_path\": \"/path/to/vault\"" >&2
+    _logerr "vault_path not set in identity.json; vault read failed" --context "path=$IDENTITY_FILE"
    exit 1
 fi

@@ -48,6 +52,7 @@ REAL_VAULT_SH="$VAULT_ROOT/scripts/vault.sh"
 if [[ ! -f "$REAL_VAULT_SH" ]]; then
    echo "[ERROR] vault.sh not found at $REAL_VAULT_SH" >&2
    echo "        Check vault_path in $IDENTITY_FILE" >&2
+    _logerr "real vault.sh not found at resolved vault_path; vault read failed" --context "path=$REAL_VAULT_SH"
    exit 1
 fi