harness: fleet-wide functional-error + correction + friction logging
Add .claude/scripts/log-skill-error.sh — the canonical agent error log helper (writes errorlog.md in DATE | MACHINE | skill | [type] error format, soft-fails). Three categories: execution failures (default), user corrections (--correction), and preventable self-inflicted friction (--friction; cite ref= when it repeats a documented gotcha). Goal: stop paying tokens twice for the same avoidable mistake. - CLAUDE.md: make logging mandatory for all skills + corrections + friction. - skill-creator: new skills must wire in the helper (guidance + checklist). - Retrofit every skill script's genuine failure branches to call the helper (b2/bitdefender/mailprotector/packetdial/coord python CLIs; remediation-tool + onboard365 bash; vault, rmm-auth, post-bot-alert, agy, grok, 1password, run-onboarding-diagnostic). Handled conditions + self-tests left alone. - errorlog.md: broaden header to cover skills + harness + corrections; seed this session's corrections (INKY, Mail.Send token-audience, omnibox-strictness) and friction (git-bash /tmp, env-persistence, argv-limit, PowerShell var-case). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -78,7 +78,11 @@ if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
|
||||
ENV_FILE="$ROOT/projects/discord-bot/.env"
|
||||
[ -f "$ENV_FILE" ] && TOKEN="$(grep -iE '^[[:space:]]*DISCORD_TOKEN[[:space:]]*=' "$ENV_FILE" | head -1 | sed -E 's/^[^=]*=[[:space:]]*//; s/^["'"'"']//; s/["'"'"'][[:space:]]*$//')"
|
||||
fi
|
||||
if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then echo "[ERROR] no bot token (vault + .env both empty)" >&2; exit 2; fi
|
||||
if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
|
||||
echo "[ERROR] no bot token (vault + .env both empty)" >&2
|
||||
bash "$ROOT/.claude/scripts/log-skill-error.sh" "discord-dm" "no Discord bot token (vault projects/discord-bot/bot-token + .env both empty)" >/dev/null 2>&1
|
||||
exit 2
|
||||
fi
|
||||
|
||||
auth=(-H "Authorization: Bot ${TOKEN}" -H "Content-Type: application/json" -H "User-Agent: ${UA}")
|
||||
|
||||
@@ -87,7 +91,11 @@ if [ "$MODE" = "dm" ]; then
|
||||
DM="$(printf '%s' "$(jq -nc --arg r "$TARGET" '{recipient_id:$r}')" | \
|
||||
curl -s -m 15 "${auth[@]}" -X POST "$API/users/@me/channels" --data-binary @-)"
|
||||
CHID="$(printf '%s' "$DM" | jq -r '.id // empty' 2>/dev/null)"
|
||||
if [ -z "$CHID" ]; then echo "[ERROR] could not open DM channel for $LABEL: $DM" >&2; exit 3; fi
|
||||
if [ -z "$CHID" ]; then
|
||||
echo "[ERROR] could not open DM channel for $LABEL: $DM" >&2
|
||||
bash "$ROOT/.claude/scripts/log-skill-error.sh" "discord-dm" "failed to open DM channel for $LABEL" --context "resp=${DM:0:80}" >/dev/null 2>&1
|
||||
exit 3
|
||||
fi
|
||||
TARGET="$CHID"
|
||||
fi
|
||||
|
||||
@@ -103,4 +111,5 @@ if [ "$HTTP" = "200" ]; then
|
||||
exit 0
|
||||
fi
|
||||
echo "[ERROR] discord-dm: Discord returned ${HTTP:-no-response} — ${BODY}" >&2
|
||||
bash "$ROOT/.claude/scripts/log-skill-error.sh" "discord-dm" "Discord send to $LABEL failed" --context "http=${HTTP:-none} resp=${BODY:0:80}" >/dev/null 2>&1
|
||||
exit 3
|
||||
|
||||
89
.claude/scripts/log-skill-error.sh
Normal file
89
.claude/scripts/log-skill-error.sh
Normal file
@@ -0,0 +1,89 @@
|
||||
#!/usr/bin/env bash
# log-skill-error.sh — append an entry to errorlog.md in the canonical format,
# for later linting that feeds skill fixes, CLAUDE.md rules, and memory cleanup.
#
# Despite the name this is the GENERAL agent error/correction/friction log — it
# captures three things (see "Categories" below):
#   1. skill/command FUNCTIONAL failures (API/auth/unexpected-response/bad-exit)
#   2. user CORRECTIONS of an improper assumption I made (--correction)
#   3. preventable self-inflicted FRICTION that wasted tokens (--friction) —
#      harness/env/tool misuse, ESPECIALLY a repeat of an already-documented
#      gotcha (that means a rule or memory isn't working and needs strengthening)
#
# Do NOT call it for expected/handled conditions (a search with no matches, a
# "no unread messages", a user declining a prompt) — only real, preventable,
# pattern-worthy events.
#
# Usage:
#   bash log-skill-error.sh <skill-or-command> "<brief error>"
#   echo "<brief error>" | bash log-skill-error.sh <skill-or-command>
#   bash log-skill-error.sh <skill> "<error>" --context "op=send id=123 http=403"
#   bash log-skill-error.sh <skill/context> "<what I wrongly assumed + the correction>" --correction
#   bash log-skill-error.sh <context> "<what wasted tokens + the fix>" --friction --context "ref=<memory>"
#
# Arguments:
#   $1            skill/command name for the third column (default: "unknown").
#   remaining     every non-flag word is joined with spaces into the message.
#                 With no message words, the message is read from stdin, but
#                 only when stdin is not a terminal.
#
# Categories (all feed the lint that improves skills, CLAUDE.md, and memory):
#   (default)     execution failure — API/auth failure, unexpected response, bad exit.
#   --correction  the USER corrected an improper assumption/approach I made.
#   --friction    preventable self-inflicted error that wasted tokens (harness/env/
#                 tool misuse). If it repeats a documented gotcha, note it in
#                 --context (e.g. ref=feedback_tmp_path_windows) — that's the signal
#                 a rule/memory needs strengthening.
#   --type <t>    any other tag; shows up as [<t>] at the start of the error
#                 column ("exec" or an empty value means the untagged default).
#
# Environment:
#   CLAUDETOOLS_ROOT  repo root; defaults to two directories above this script.
#
# Writes: YYYY-MM-DD | MACHINE | <skill> | [<type>] <error> [ctx: <context>]
#   (newest entry inserted at the top, just under the append marker; if the
#   marker line is missing the entry is appended at the end of the file).
#
# Soft-fail by design: this NEVER breaks the caller. A missing log, an empty
# message, or a failed write prints a [WARN] to stderr and exits 0. A missing
# jq or identity.json just falls back to `hostname` for the MACHINE column.
set -u

ROOT="${CLAUDETOOLS_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"

# Collapse every run of whitespace (newlines included) in $1 to a single space
# and trim both ends, so the value can never split a log entry across lines.
_one_line() {
  printf '%s' "$1" | tr '\n' ' ' | sed 's/[[:space:]]\{1,\}/ /g; s/^ //; s/ $//'
}

SKILL="${1:-unknown}"; shift || true
CONTEXT=""
ETYPE=""   # "" or "exec" = execution failure (untagged); anything else becomes the [tag]
ARGS=()
while [ $# -gt 0 ]; do
  case "$1" in
    --context|--type)
      if [ "$1" = "--context" ]; then CONTEXT="${2:-}"; else ETYPE="${2:-}"; fi
      # A value-taking flag passed last has no value to consume. `shift 2`
      # would then fail WITHOUT shifting and this loop would never terminate.
      if [ $# -ge 2 ]; then shift 2; else shift; fi
      ;;
    --correction) ETYPE="correction"; shift ;;
    --friction)   ETYPE="friction"; shift ;;
    *)            ARGS+=("$1"); shift ;;
  esac
done

MSG="${ARGS[*]:-}"
if [ -z "$MSG" ] && [ ! -t 0 ]; then MSG="$(cat)"; fi
# Normalize BEFORE the emptiness check so a whitespace-only message is rejected
# instead of being written as a blank entry.
MSG="$(_one_line "$MSG")"
if [ -z "$MSG" ]; then echo "[WARN] log-skill-error: empty message, nothing logged" >&2; exit 0; fi

LOG="$ROOT/errorlog.md"
if [ ! -f "$LOG" ]; then echo "[WARN] log-skill-error: $LOG not found" >&2; exit 0; fi

DATE="$(date -u +%F)"
IDF="$ROOT/.claude/identity.json"
MACHINE=""
if command -v jq >/dev/null 2>&1 && [ -f "$IDF" ]; then
  MACHINE="$(jq -r '.machine_name // .hostname // empty' "$IDF" 2>/dev/null)"
fi
[ -z "$MACHINE" ] && MACHINE="$(hostname 2>/dev/null || echo unknown)"

# Context often carries a raw response snippet (resp=...), which may contain
# newlines; give it the same one-line treatment as the message.
CONTEXT="$(_one_line "$CONTEXT")"
[ -n "$CONTEXT" ] && MSG="$MSG [ctx: $CONTEXT]"

# Tag non-execution categories at the start of the error column for easy linting
# (e.g. grep "\[correction\]" errorlog.md to surface improper-assumption patterns).
if [ -n "$ETYPE" ] && [ "$ETYPE" != "exec" ]; then MSG="[$ETYPE] $MSG"; fi
ENTRY="$DATE | $MACHINE | $SKILL | $MSG"

MARK="<!-- Append entries below this line -->"
TMP="$LOG.tmp.$$"
# The entry and marker are handed to awk through the environment rather than
# `-v`: awk applies escape-sequence processing to -v values, which would mangle
# any backslash in the message (Windows paths, JSON escapes).
if LOG_ENTRY="$ENTRY" LOG_MARK="$MARK" awk '
  BEGIN { entry = ENVIRON["LOG_ENTRY"]; mark = ENVIRON["LOG_MARK"] }
  { print }
  !done {
    line = $0
    sub(/\r$/, "", line)   # still recognise the marker in a CRLF-terminated log
    if (line == mark) { print ""; print entry; done = 1 }
  }
  END { if (!done) { print ""; print entry } }   # marker missing -> append at end
' "$LOG" > "$TMP" 2>/dev/null && mv "$TMP" "$LOG" 2>/dev/null; then
  echo "[OK] logged skill error to errorlog.md ($SKILL)"
else
  rm -f "$TMP" 2>/dev/null
  echo "[WARN] log-skill-error: could not write $LOG" >&2
fi
exit 0
|
||||
@@ -86,4 +86,8 @@ if [ "$HTTP" = "200" ]; then
|
||||
fi
|
||||
|
||||
echo "[WARNING] post-bot-alert: Discord returned ${HTTP:-no-response} — ${BODY}" >&2
|
||||
# Log the Discord POST failure (non-200 / unreachable) once. Do NOT route this
|
||||
# through post-bot-alert itself — that would recurse; log-skill-error.sh only
|
||||
# writes to errorlog.md. Soft-fail preserved: this never changes the exit 0.
|
||||
bash "$ROOT/.claude/scripts/log-skill-error.sh" "post-bot-alert" "Discord POST failed (non-200/unreachable)" --context "channel=${CHANNEL_NAME} http=${HTTP:-none} resp=${BODY:0:80}" >/dev/null 2>&1 || true
|
||||
exit 0
|
||||
|
||||
@@ -11,19 +11,27 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
IDENTITY_FILE="$REPO_ROOT/.claude/identity.json"
|
||||
|
||||
# Functional-error logger. MUST stay silent on stdout (this script's stdout is
|
||||
# eval'd by the caller) — log-skill-error.sh prints its [OK] line to stdout, so we
|
||||
# redirect everything to /dev/null to be safe.
|
||||
# Record a functional error under the "rmm-auth" skill name. Arguments are
# forwarded unchanged to log-skill-error.sh; all of the helper's output is
# discarded and the function always succeeds.
_logerr() {
  local helper="$REPO_ROOT/.claude/scripts/log-skill-error.sh"
  bash "$helper" "rmm-auth" "$@" >/dev/null 2>&1 || true
}
|
||||
|
||||
if [ ! -f "$IDENTITY_FILE" ]; then
|
||||
_logerr "identity.json not found; RMM auth cannot resolve vault" --context "path=$IDENTITY_FILE"
|
||||
echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] identity.json not found' >&2"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
VAULT_PATH=$(jq -r '.vault_path // empty' "$IDENTITY_FILE")
|
||||
if [ -z "$VAULT_PATH" ]; then
|
||||
_logerr "vault_path not in identity.json; RMM auth failed" --context "path=$IDENTITY_FILE"
|
||||
echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] vault_path not in identity.json' >&2"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
VAULT_SH="$VAULT_PATH/scripts/vault.sh"
|
||||
if [ ! -f "$VAULT_SH" ]; then
|
||||
_logerr "vault.sh not found at resolved vault_path; RMM auth failed" --context "path=$VAULT_SH"
|
||||
echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] vault.sh not found at $VAULT_SH' >&2"
|
||||
exit 1
|
||||
fi
|
||||
@@ -35,6 +43,7 @@ RMM_EMAIL=$(bash "$VAULT_SH" get-field infrastructure/gururmm-server.sops.yaml c
|
||||
RMM_PASS=$(bash "$VAULT_SH" get-field infrastructure/gururmm-server.sops.yaml credentials.gururmm-api.admin-password 2>/dev/null)
|
||||
|
||||
if [ -z "$RMM_EMAIL" ] || [ -z "$RMM_PASS" ]; then
|
||||
_logerr "vault read of GuruRMM API credentials failed (empty email/password)" --context "entry=infrastructure/gururmm-server.sops.yaml"
|
||||
echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] Failed to get RMM credentials from vault' >&2"
|
||||
exit 1
|
||||
fi
|
||||
@@ -45,6 +54,7 @@ JWT=$(curl -s -X POST "$RMM_URL/api/auth/login" -H "Content-Type: application/js
|
||||
TOKEN=$(echo "$JWT" | jq -r '.token // empty')
|
||||
|
||||
if [ -z "$TOKEN" ]; then
|
||||
_logerr "RMM login failed (no token returned from /api/auth/login)" --context "url=$RMM_URL resp=${JWT:0:80}"
|
||||
echo "export TOKEN=''; export RMM=''; export REPO_ROOT=''; echo '[ERROR] RMM login failed: $JWT' >&2"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -37,9 +37,17 @@ if [ -z "$QUERY" ] && [ -z "$CLIENT" ] && [ "$LISTC" -eq 0 ]; then
|
||||
fi
|
||||
|
||||
eval "$(bash "$ROOT/.claude/scripts/rmm-auth.sh" 2>/dev/null)" >/dev/null
|
||||
if [ -z "${TOKEN:-}" ] || [ -z "${RMM:-}" ]; then echo "[ERROR] RMM auth failed (see rmm-auth.sh)" >&2; exit 1; fi
|
||||
if [ -z "${TOKEN:-}" ] || [ -z "${RMM:-}" ]; then
|
||||
echo "[ERROR] RMM auth failed (see rmm-auth.sh)" >&2
|
||||
bash "$ROOT/.claude/scripts/log-skill-error.sh" "rmm-search" "RMM auth failed via rmm-auth.sh (no TOKEN/RMM)" >/dev/null 2>&1
|
||||
exit 1
|
||||
fi
|
||||
AGENTS=$(curl -s "$RMM/api/agents" -H "Authorization: Bearer $TOKEN")
|
||||
if [ -z "$AGENTS" ] || [ "${AGENTS:0:1}" != "[" ]; then echo "[ERROR] could not fetch agents: ${AGENTS:0:160}" >&2; exit 1; fi
|
||||
if [ -z "$AGENTS" ] || [ "${AGENTS:0:1}" != "[" ]; then
|
||||
echo "[ERROR] could not fetch agents: ${AGENTS:0:160}" >&2
|
||||
bash "$ROOT/.claude/scripts/log-skill-error.sh" "rmm-search" "GET /api/agents returned non-array/empty" --context "resp=${AGENTS:0:80}" >/dev/null 2>&1
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Pipe agents on stdin (payload too large for argv on Windows); flags via env.
|
||||
printf '%s' "$AGENTS" | QUERY="$QUERY" CLIENT="$CLIENT" ONLINE="$ONLINE" JSON="$JSON" LISTC="$LISTC" LIMIT="$LIMIT" \
|
||||
|
||||
@@ -37,6 +37,11 @@ PROBE="$SCRIPT_DIR/onboarding-diagnostic.ps1"
|
||||
ALERT="$REPO_ROOT/.claude/scripts/post-bot-alert.sh"
|
||||
RMM="http://172.16.3.30:3001"
|
||||
|
||||
# Functional-error logger (skill name "rmm-diagnose"). Logs genuine operational
|
||||
# failures (auth, vault, dispatch) — NOT the RED/AMBER/GREEN diagnostic grade,
|
||||
# which is a normal by-design result. Soft-fails; never breaks the run.
|
||||
# Forward the given message/flags to log-skill-error.sh under the skill name
# "rmm-diagnose". Helper output is silenced and its exit status ignored, so a
# logging problem can never abort the run.
_logerr() {
  local helper="$REPO_ROOT/.claude/scripts/log-skill-error.sh"
  bash "$helper" "rmm-diagnose" "$@" >/dev/null 2>&1 || true
}
|
||||
|
||||
if [ ! -f "$PROBE" ]; then
|
||||
echo "[ERROR] Probe script not found: $PROBE" >&2
|
||||
exit 1
|
||||
@@ -65,6 +70,7 @@ RMM_PASS="$(bash "$VAULT" get-field infrastructure/gururmm-server.sops.yaml cred
|
||||
|
||||
if [ -z "$RMM_EMAIL" ] || [ -z "$RMM_PASS" ] || [ "$RMM_EMAIL" = "null" ]; then
|
||||
echo "[ERROR] Could not read GuruRMM credentials from vault (infrastructure/gururmm-server.sops.yaml)" >&2
|
||||
_logerr "vault read of GuruRMM credentials failed (empty/null)" --context "entry=infrastructure/gururmm-server.sops.yaml"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -75,6 +81,7 @@ TOKEN="$(curl -s -m 30 -X POST "$RMM/api/auth/login" \
|
||||
|
||||
if [ -z "$TOKEN" ]; then
|
||||
echo "[ERROR] RMM login failed (no token returned)" >&2
|
||||
_logerr "RMM login failed (no token returned from /api/auth/login)" --context "url=$RMM"
|
||||
exit 1
|
||||
fi
|
||||
echo "[OK] Authenticated to GuruRMM"
|
||||
@@ -85,6 +92,7 @@ echo "[OK] Authenticated to GuruRMM"
|
||||
AGENTS="$(curl -s -m 30 "$RMM/api/agents" -H "Authorization: Bearer $TOKEN")"
|
||||
if [ -z "$AGENTS" ] || ! echo "$AGENTS" | jq -e 'type=="array"' >/dev/null 2>&1; then
|
||||
echo "[ERROR] Could not retrieve agent list" >&2
|
||||
_logerr "GET /api/agents returned non-array/empty" --context "resp=${AGENTS:0:80}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -263,6 +271,7 @@ PS
|
||||
CH_STATUS="$(echo "$CH_RESULT" | jq -r '.status')"
|
||||
if [ "$CH_STATUS" != "completed" ]; then
|
||||
echo "[ERROR] Chunk $IDX upload failed: status=$CH_STATUS stderr=$(echo "$CH_RESULT" | jq -r '.stderr' | head -c 200)" >&2
|
||||
_logerr "probe chunk upload failed" --context "host=$AGENT_HOST idx=$IDX/$N_CHUNKS status=$CH_STATUS"
|
||||
exit 1
|
||||
fi
|
||||
echo "[OK] Uploaded chunk $IDX/$N_CHUNKS"
|
||||
@@ -288,7 +297,7 @@ try {
|
||||
}
|
||||
PS
|
||||
|
||||
RESULT="$(dispatch_one "$RUN_SCRIPT" "$EXEC_TIMEOUT")" || { echo "[ERROR] Probe execution dispatch failed" >&2; exit 1; }
|
||||
RESULT="$(dispatch_one "$RUN_SCRIPT" "$EXEC_TIMEOUT")" || { echo "[ERROR] Probe execution dispatch failed" >&2; _logerr "probe execution dispatch failed" --context "host=$AGENT_HOST agent=$AGENT_ID"; exit 1; }
|
||||
CMD_ID="$(cat "$WORK_DIR/last_cmd_id" 2>/dev/null || echo unknown)"
|
||||
|
||||
FINAL_STATUS="$(echo "$RESULT" | jq -r '.status // empty')"
|
||||
@@ -317,6 +326,7 @@ if [ -z "$DIAG_JSON" ] || ! echo "$DIAG_JSON" | jq -e '.host' >/dev/null 2>&1; t
|
||||
fi
|
||||
echo "--- stdout (first 60 lines) ---" >&2
|
||||
printf '%s\n' "$STDOUT" | head -60 >&2
|
||||
_logerr "could not extract valid diagnostic JSON from probe output" --context "host=$AGENT_HOST status=$FINAL_STATUS exit=$EXIT_CODE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
@@ -16,9 +16,12 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
CLAUDETOOLS_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
IDENTITY_FILE="$CLAUDETOOLS_ROOT/.claude/identity.json"
|
||||
|
||||
# Log a functional error under the "vault" skill name by passing the arguments
# through to log-skill-error.sh. Nothing the helper prints reaches the caller,
# and the function returns success regardless of the helper's exit status.
_logerr() {
  local helper="$CLAUDETOOLS_ROOT/.claude/scripts/log-skill-error.sh"
  bash "$helper" "vault" "$@" >/dev/null 2>&1 || true
}
|
||||
|
||||
if [[ ! -f "$IDENTITY_FILE" ]]; then
|
||||
echo "[ERROR] .claude/identity.json not found at $IDENTITY_FILE" >&2
|
||||
echo " Run onboarding to create it, or add vault_path manually." >&2
|
||||
_logerr "identity.json not found; vault read cannot resolve vault_path" --context "path=$IDENTITY_FILE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -40,6 +43,7 @@ fi
|
||||
if [[ -z "$VAULT_ROOT" ]]; then
|
||||
echo "[ERROR] vault_path not set in $IDENTITY_FILE" >&2
|
||||
echo " Add: \"vault_path\": \"/path/to/vault\"" >&2
|
||||
_logerr "vault_path not set in identity.json; vault read failed" --context "path=$IDENTITY_FILE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -48,6 +52,7 @@ REAL_VAULT_SH="$VAULT_ROOT/scripts/vault.sh"
|
||||
if [[ ! -f "$REAL_VAULT_SH" ]]; then
|
||||
echo "[ERROR] vault.sh not found at $REAL_VAULT_SH" >&2
|
||||
echo " Check vault_path in $IDENTITY_FILE" >&2
|
||||
_logerr "real vault.sh not found at resolved vault_path; vault read failed" --context "path=$REAL_VAULT_SH"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
Reference in New Issue
Block a user