harness: fleet-wide functional-error + correction + friction logging

Add .claude/scripts/log-skill-error.sh — the canonical agent error log helper
(writes errorlog.md in DATE | MACHINE | skill | [type] error format, soft-fails).
Three categories: execution failures (default), user corrections (--correction),
and preventable self-inflicted friction (--friction; cite ref= when it repeats a
documented gotcha). Goal: stop paying tokens twice for the same avoidable mistake.

- CLAUDE.md: make logging mandatory for all skills + corrections + friction.
- skill-creator: new skills must wire in the helper (guidance + checklist).
- Retrofit every skill script's genuine failure branches to call the helper
  (b2/bitdefender/mailprotector/packetdial/coord python CLIs; remediation-tool
  + onboard365 bash; vault, rmm-auth, post-bot-alert, agy, grok, 1password,
  run-onboarding-diagnostic). Handled conditions + self-tests left alone.
- errorlog.md: broaden header to cover skills + harness + corrections; seed this
  session's corrections (INKY, Mail.Send token-audience, omnibox-strictness) and
  friction (git-bash /tmp, env-persistence, argv-limit, PowerShell var-case).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-15 11:39:43 -07:00
parent 927a06a0cf
commit 9960da5f9a
29 changed files with 388 additions and 36 deletions

View File

@@ -78,7 +78,11 @@ if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
ENV_FILE="$ROOT/projects/discord-bot/.env"
[ -f "$ENV_FILE" ] && TOKEN="$(grep -iE '^[[:space:]]*DISCORD_TOKEN[[:space:]]*=' "$ENV_FILE" | head -1 | sed -E 's/^[^=]*=[[:space:]]*//; s/^["'"'"']//; s/["'"'"'][[:space:]]*$//')"
fi
if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then echo "[ERROR] no bot token (vault + .env both empty)" >&2; exit 2; fi
if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
echo "[ERROR] no bot token (vault + .env both empty)" >&2
bash "$ROOT/.claude/scripts/log-skill-error.sh" "discord-dm" "no Discord bot token (vault projects/discord-bot/bot-token + .env both empty)" >/dev/null 2>&1
exit 2
fi
auth=(-H "Authorization: Bot ${TOKEN}" -H "Content-Type: application/json" -H "User-Agent: ${UA}")
@@ -87,7 +91,11 @@ if [ "$MODE" = "dm" ]; then
DM="$(printf '%s' "$(jq -nc --arg r "$TARGET" '{recipient_id:$r}')" | \
curl -s -m 15 "${auth[@]}" -X POST "$API/users/@me/channels" --data-binary @-)"
CHID="$(printf '%s' "$DM" | jq -r '.id // empty' 2>/dev/null)"
if [ -z "$CHID" ]; then echo "[ERROR] could not open DM channel for $LABEL: $DM" >&2; exit 3; fi
if [ -z "$CHID" ]; then
echo "[ERROR] could not open DM channel for $LABEL: $DM" >&2
bash "$ROOT/.claude/scripts/log-skill-error.sh" "discord-dm" "failed to open DM channel for $LABEL" --context "resp=${DM:0:80}" >/dev/null 2>&1
exit 3
fi
TARGET="$CHID"
fi
@@ -103,4 +111,5 @@ if [ "$HTTP" = "200" ]; then
exit 0
fi
echo "[ERROR] discord-dm: Discord returned ${HTTP:-no-response}${BODY}" >&2
bash "$ROOT/.claude/scripts/log-skill-error.sh" "discord-dm" "Discord send to $LABEL failed" --context "http=${HTTP:-none} resp=${BODY:0:80}" >/dev/null 2>&1
exit 3

View File

@@ -0,0 +1,89 @@
#!/usr/bin/env bash
# log-skill-error.sh — append an entry to errorlog.md in the canonical format,
# for later linting that feeds skill fixes, CLAUDE.md rules, and memory cleanup.
#
# Despite the name this is the GENERAL agent error/correction/friction log — it
# captures three things (see Categories below):
#   1. skill/command FUNCTIONAL failures (API/auth/unexpected-response/bad-exit)
#   2. user CORRECTIONS of an improper assumption I made (--correction)
#   3. preventable self-inflicted FRICTION that wasted tokens (--friction) —
#      harness/env/tool misuse, ESPECIALLY a repeat of an already-documented
#      gotcha (that means a rule or memory isn't working and needs strengthening)
#
# Do NOT call it for expected/handled conditions (a search with no matches, a
# "no unread messages", a user declining a prompt) — only real, preventable,
# pattern-worthy events.
#
# Usage:
#   bash log-skill-error.sh <skill-or-command> "<brief error>"
#   echo "<brief error>" | bash log-skill-error.sh <skill-or-command>
#   bash log-skill-error.sh <skill> "<error>" --context "op=send id=123 http=403"
#   bash log-skill-error.sh <skill/context> "<what I wrongly assumed + the correction>" --correction
#   bash log-skill-error.sh <context> "<what wasted tokens + the fix>" --friction --context "ref=<memory>"
#
# Categories (all feed the lint that improves skills, CLAUDE.md, and memory):
#   (default)     execution failure — API/auth failure, unexpected response, bad exit.
#   --correction  the USER corrected an improper assumption/approach I made.
#   --friction    preventable self-inflicted error that wasted tokens (harness/env/
#                 tool misuse). If it repeats a documented gotcha, note it in
#                 --context (e.g. ref=feedback_tmp_path_windows) — that's the signal
#                 a rule/memory needs strengthening.
#   (--type <other> also supported; tags the error column as [<type>].)
#
# Environment:
#   CLAUDETOOLS_ROOT  repo root holding errorlog.md; defaults to two directories
#                     above this script.
#
# Writes: YYYY-MM-DD | MACHINE | <skill> | [<type>] <error> [ctx: <context>]
#   (newest entry inserted at the top, just under the append marker; if the
#   marker line is missing the entry is appended at the end of the file).
#
# Output: one "[OK] ..." line on stdout on success, "[WARN] ..." on stderr
#   otherwise. Callers whose stdout is parsed must redirect it.
#
# Soft-fail by design: this NEVER breaks the caller. An empty message or a
# missing/unwritable errorlog.md prints a [WARN] to stderr; a missing jq or
# identity.json just falls back to `hostname`. The exit status is always 0.
set -u

# Collapse every run of whitespace (newlines included) in $1 into one space and
# trim both ends, so the result always fits on a single log line.
_lse_oneline() {
  printf '%s' "$1" | tr '\n' ' ' | sed 's/[[:space:]]\{1,\}/ /g; s/^ //; s/ $//'
}

# Parse the command line, build the entry and insert it into errorlog.md.
# Arguments: <skill> [message words...] [--context STR] [--type STR]
#            [--correction] [--friction]; the message is read from stdin when
#            no message words are given and stdin is not a terminal.
# Returns:   always 0 (soft-fail).
main() {
  local root skill msg log stamp idf machine entry tmp
  local context="" etype=""
  local mark="<!-- Append entries below this line -->"
  local -a words=()

  root="${CLAUDETOOLS_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
  skill="${1:-unknown}"
  shift || true

  while [ $# -gt 0 ]; do
    case "$1" in
      # A value-taking flag given as the LAST argument has no value. A plain
      # `shift 2` would then fail without shifting anything and spin this loop
      # forever, so consume only what is actually present.
      --context) context="${2:-}"; shift "$(( $# > 1 ? 2 : 1 ))" ;;
      --type)    etype="${2:-}";   shift "$(( $# > 1 ? 2 : 1 ))" ;;
      --correction) etype="correction"; shift ;;
      --friction)   etype="friction";   shift ;;
      *) words+=("$1"); shift ;;
    esac
  done

  msg="${words[*]:-}"
  if [ -z "$msg" ] && [ ! -t 0 ]; then msg="$(cat)"; fi
  # Normalize BEFORE the emptiness check so a whitespace-only message is
  # rejected instead of producing a blank entry.
  msg="$(_lse_oneline "$msg")"
  if [ -z "$msg" ]; then
    echo "[WARN] log-skill-error: empty message, nothing logged" >&2
    return 0
  fi

  log="$root/errorlog.md"
  if [ ! -f "$log" ]; then
    echo "[WARN] log-skill-error: $log not found" >&2
    return 0
  fi

  stamp="$(date -u +%F)"
  idf="$root/.claude/identity.json"
  machine=""
  if command -v jq >/dev/null 2>&1 && [ -f "$idf" ]; then
    machine="$(jq -r '.machine_name // .hostname // empty' "$idf" 2>/dev/null)"
  fi
  [ -z "$machine" ] && machine="$(hostname 2>/dev/null || echo unknown)"

  # Skip a context that is empty or whitespace-only.
  if [ -n "${context//[[:space:]]/}" ]; then msg="$msg [ctx: $context]"; fi
  # Tag non-execution categories at the start of the error column for easy
  # linting (e.g. grep "\[correction\]" errorlog.md).
  if [ -n "$etype" ] && [ "$etype" != "exec" ]; then msg="[$etype] $msg"; fi

  # Normalize the whole entry: --context often carries a raw response snippet
  # that may contain newlines, and every entry must stay on one line.
  entry="$(_lse_oneline "$stamp | $machine | $skill | $msg")"

  # The temp file lives next to the log so the final mv is a same-directory
  # rename. The entry travels through the environment rather than `awk -v`,
  # because -v interprets backslash escapes and would mangle Windows paths.
  tmp="$log.tmp.$$"
  if LSE_ENTRY="$entry" LSE_MARK="$mark" awk '
      BEGIN { entry = ENVIRON["LSE_ENTRY"]; mark = ENVIRON["LSE_MARK"] }
      # Compare with any trailing CR removed so a CRLF log still matches.
      { print; line = $0; sub(/\r$/, "", line) }
      (line == mark && !done) { print ""; print entry; done = 1 }
      END { if (!done) { print ""; print entry } }   # marker missing -> append at end
    ' "$log" > "$tmp" 2>/dev/null && mv "$tmp" "$log" 2>/dev/null; then
    echo "[OK] logged skill error to errorlog.md ($skill)"
  else
    rm -f "$tmp" 2>/dev/null
    echo "[WARN] log-skill-error: could not write $log" >&2
  fi
  return 0
}

# `|| true` keeps the script's exit status 0 whatever main reports.
main "$@" || true

View File

@@ -86,4 +86,8 @@ if [ "$HTTP" = "200" ]; then
fi
echo "[WARNING] post-bot-alert: Discord returned ${HTTP:-no-response}${BODY}" >&2
# Log the Discord POST failure (non-200 / unreachable) once. Do NOT route this
# through post-bot-alert itself — that would recurse; log-skill-error.sh only
# writes to errorlog.md. Soft-fail preserved: this never changes the exit 0.
bash "$ROOT/.claude/scripts/log-skill-error.sh" "post-bot-alert" "Discord POST failed (non-200/unreachable)" --context "channel=${CHANNEL_NAME} http=${HTTP:-none} resp=${BODY:0:80}" >/dev/null 2>&1 || true
exit 0

View File

@@ -11,19 +11,27 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
IDENTITY_FILE="$REPO_ROOT/.claude/identity.json"
# Functional-error logger. MUST stay silent on stdout (this script's stdout is
# eval'd by the caller). log-skill-error.sh prints its "[OK] logged ..." line on
# stdout and its warnings on stderr, so both streams are discarded here.
_logerr() {
  local logger="$REPO_ROOT/.claude/scripts/log-skill-error.sh"
  # Best-effort: a logging failure must never change this script's outcome.
  bash "$logger" "rmm-auth" "$@" >/dev/null 2>&1 || :
}
if [ ! -f "$IDENTITY_FILE" ]; then
_logerr "identity.json not found; RMM auth cannot resolve vault" --context "path=$IDENTITY_FILE"
echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] identity.json not found' >&2"
exit 1
fi
VAULT_PATH=$(jq -r '.vault_path // empty' "$IDENTITY_FILE")
if [ -z "$VAULT_PATH" ]; then
_logerr "vault_path not in identity.json; RMM auth failed" --context "path=$IDENTITY_FILE"
echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] vault_path not in identity.json' >&2"
exit 1
fi
VAULT_SH="$VAULT_PATH/scripts/vault.sh"
if [ ! -f "$VAULT_SH" ]; then
_logerr "vault.sh not found at resolved vault_path; RMM auth failed" --context "path=$VAULT_SH"
echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] vault.sh not found at $VAULT_SH' >&2"
exit 1
fi
@@ -35,6 +43,7 @@ RMM_EMAIL=$(bash "$VAULT_SH" get-field infrastructure/gururmm-server.sops.yaml c
RMM_PASS=$(bash "$VAULT_SH" get-field infrastructure/gururmm-server.sops.yaml credentials.gururmm-api.admin-password 2>/dev/null)
if [ -z "$RMM_EMAIL" ] || [ -z "$RMM_PASS" ]; then
_logerr "vault read of GuruRMM API credentials failed (empty email/password)" --context "entry=infrastructure/gururmm-server.sops.yaml"
echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] Failed to get RMM credentials from vault' >&2"
exit 1
fi
@@ -45,6 +54,7 @@ JWT=$(curl -s -X POST "$RMM_URL/api/auth/login" -H "Content-Type: application/js
TOKEN=$(echo "$JWT" | jq -r '.token // empty')
if [ -z "$TOKEN" ]; then
_logerr "RMM login failed (no token returned from /api/auth/login)" --context "url=$RMM_URL resp=${JWT:0:80}"
echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] RMM login failed: $JWT' >&2"
exit 1
fi

View File

@@ -37,9 +37,17 @@ if [ -z "$QUERY" ] && [ -z "$CLIENT" ] && [ "$LISTC" -eq 0 ]; then
fi
eval "$(bash "$ROOT/.claude/scripts/rmm-auth.sh" 2>/dev/null)" >/dev/null
if [ -z "${TOKEN:-}" ] || [ -z "${RMM:-}" ]; then echo "[ERROR] RMM auth failed (see rmm-auth.sh)" >&2; exit 1; fi
if [ -z "${TOKEN:-}" ] || [ -z "${RMM:-}" ]; then
echo "[ERROR] RMM auth failed (see rmm-auth.sh)" >&2
bash "$ROOT/.claude/scripts/log-skill-error.sh" "rmm-search" "RMM auth failed via rmm-auth.sh (no TOKEN/RMM)" >/dev/null 2>&1
exit 1
fi
AGENTS=$(curl -s "$RMM/api/agents" -H "Authorization: Bearer $TOKEN")
if [ -z "$AGENTS" ] || [ "${AGENTS:0:1}" != "[" ]; then echo "[ERROR] could not fetch agents: ${AGENTS:0:160}" >&2; exit 1; fi
if [ -z "$AGENTS" ] || [ "${AGENTS:0:1}" != "[" ]; then
echo "[ERROR] could not fetch agents: ${AGENTS:0:160}" >&2
bash "$ROOT/.claude/scripts/log-skill-error.sh" "rmm-search" "GET /api/agents returned non-array/empty" --context "resp=${AGENTS:0:80}" >/dev/null 2>&1
exit 1
fi
# Pipe agents on stdin (payload too large for argv on Windows); flags via env.
printf '%s' "$AGENTS" | QUERY="$QUERY" CLIENT="$CLIENT" ONLINE="$ONLINE" JSON="$JSON" LISTC="$LISTC" LIMIT="$LIMIT" \

View File

@@ -37,6 +37,11 @@ PROBE="$SCRIPT_DIR/onboarding-diagnostic.ps1"
ALERT="$REPO_ROOT/.claude/scripts/post-bot-alert.sh"
RMM="http://172.16.3.30:3001"
# Functional-error logger (skill name "rmm-diagnose"). Logs genuine operational
# failures (auth, vault, dispatch) — NOT the RED/AMBER/GREEN diagnostic grade,
# which is a normal by-design result. Soft-fails; never breaks the run.
_logerr() {
  local logger="$REPO_ROOT/.claude/scripts/log-skill-error.sh"
  # All helper output is discarded and its exit status ignored, so logging
  # can never abort or pollute the diagnostic run.
  bash "$logger" "rmm-diagnose" "$@" >/dev/null 2>&1 || :
}
if [ ! -f "$PROBE" ]; then
echo "[ERROR] Probe script not found: $PROBE" >&2
exit 1
@@ -65,6 +70,7 @@ RMM_PASS="$(bash "$VAULT" get-field infrastructure/gururmm-server.sops.yaml cred
if [ -z "$RMM_EMAIL" ] || [ -z "$RMM_PASS" ] || [ "$RMM_EMAIL" = "null" ]; then
echo "[ERROR] Could not read GuruRMM credentials from vault (infrastructure/gururmm-server.sops.yaml)" >&2
_logerr "vault read of GuruRMM credentials failed (empty/null)" --context "entry=infrastructure/gururmm-server.sops.yaml"
exit 1
fi
@@ -75,6 +81,7 @@ TOKEN="$(curl -s -m 30 -X POST "$RMM/api/auth/login" \
if [ -z "$TOKEN" ]; then
echo "[ERROR] RMM login failed (no token returned)" >&2
_logerr "RMM login failed (no token returned from /api/auth/login)" --context "url=$RMM"
exit 1
fi
echo "[OK] Authenticated to GuruRMM"
@@ -85,6 +92,7 @@ echo "[OK] Authenticated to GuruRMM"
AGENTS="$(curl -s -m 30 "$RMM/api/agents" -H "Authorization: Bearer $TOKEN")"
if [ -z "$AGENTS" ] || ! echo "$AGENTS" | jq -e 'type=="array"' >/dev/null 2>&1; then
echo "[ERROR] Could not retrieve agent list" >&2
_logerr "GET /api/agents returned non-array/empty" --context "resp=${AGENTS:0:80}"
exit 1
fi
@@ -263,6 +271,7 @@ PS
CH_STATUS="$(echo "$CH_RESULT" | jq -r '.status')"
if [ "$CH_STATUS" != "completed" ]; then
echo "[ERROR] Chunk $IDX upload failed: status=$CH_STATUS stderr=$(echo "$CH_RESULT" | jq -r '.stderr' | head -c 200)" >&2
_logerr "probe chunk upload failed" --context "host=$AGENT_HOST idx=$IDX/$N_CHUNKS status=$CH_STATUS"
exit 1
fi
echo "[OK] Uploaded chunk $IDX/$N_CHUNKS"
@@ -288,7 +297,7 @@ try {
}
PS
RESULT="$(dispatch_one "$RUN_SCRIPT" "$EXEC_TIMEOUT")" || { echo "[ERROR] Probe execution dispatch failed" >&2; exit 1; }
RESULT="$(dispatch_one "$RUN_SCRIPT" "$EXEC_TIMEOUT")" || { echo "[ERROR] Probe execution dispatch failed" >&2; _logerr "probe execution dispatch failed" --context "host=$AGENT_HOST agent=$AGENT_ID"; exit 1; }
CMD_ID="$(cat "$WORK_DIR/last_cmd_id" 2>/dev/null || echo unknown)"
FINAL_STATUS="$(echo "$RESULT" | jq -r '.status // empty')"
@@ -317,6 +326,7 @@ if [ -z "$DIAG_JSON" ] || ! echo "$DIAG_JSON" | jq -e '.host' >/dev/null 2>&1; t
fi
echo "--- stdout (first 60 lines) ---" >&2
printf '%s\n' "$STDOUT" | head -60 >&2
_logerr "could not extract valid diagnostic JSON from probe output" --context "host=$AGENT_HOST status=$FINAL_STATUS exit=$EXIT_CODE"
exit 1
fi

View File

@@ -16,9 +16,12 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CLAUDETOOLS_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
IDENTITY_FILE="$CLAUDETOOLS_ROOT/.claude/identity.json"
_logerr() {
  local logger="$CLAUDETOOLS_ROOT/.claude/scripts/log-skill-error.sh"
  # Fire-and-forget: helper output is discarded and its exit status ignored.
  bash "$logger" "vault" "$@" >/dev/null 2>&1 || :
}
if [[ ! -f "$IDENTITY_FILE" ]]; then
echo "[ERROR] .claude/identity.json not found at $IDENTITY_FILE" >&2
echo " Run onboarding to create it, or add vault_path manually." >&2
_logerr "identity.json not found; vault read cannot resolve vault_path" --context "path=$IDENTITY_FILE"
exit 1
fi
@@ -40,6 +43,7 @@ fi
if [[ -z "$VAULT_ROOT" ]]; then
echo "[ERROR] vault_path not set in $IDENTITY_FILE" >&2
echo " Add: \"vault_path\": \"/path/to/vault\"" >&2
_logerr "vault_path not set in identity.json; vault read failed" --context "path=$IDENTITY_FILE"
exit 1
fi
@@ -48,6 +52,7 @@ REAL_VAULT_SH="$VAULT_ROOT/scripts/vault.sh"
if [[ ! -f "$REAL_VAULT_SH" ]]; then
echo "[ERROR] vault.sh not found at $REAL_VAULT_SH" >&2
echo " Check vault_path in $IDENTITY_FILE" >&2
_logerr "real vault.sh not found at resolved vault_path; vault read failed" --context "path=$REAL_VAULT_SH"
exit 1
fi