harness: fleet-wide functional-error + correction + friction logging
Add .claude/scripts/log-skill-error.sh — the canonical agent error log helper (writes errorlog.md in DATE | MACHINE | skill | [type] error format, soft-fails). Three categories: execution failures (default), user corrections (--correction), and preventable self-inflicted friction (--friction; cite ref= when it repeats a documented gotcha). Goal: stop paying tokens twice for the same avoidable mistake. - CLAUDE.md: make logging mandatory for all skills + corrections + friction. - skill-creator: new skills must wire in the helper (guidance + checklist). - Retrofit every skill script's genuine failure branches to call the helper (b2/bitdefender/mailprotector/packetdial/coord python CLIs; remediation-tool + onboard365 bash; vault, rmm-auth, post-bot-alert, agy, grok, 1password, run-onboarding-diagnostic). Handled conditions + self-tests left alone. - errorlog.md: broaden header to cover skills + harness + corrections; seed this session's corrections (INKY, Mail.Send token-audience, omnibox-strictness) and friction (git-bash /tmp, env-persistence, argv-limit, PowerShell var-case). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -37,6 +37,11 @@ PROBE="$SCRIPT_DIR/onboarding-diagnostic.ps1"
|
||||
# Path of the alert-posting helper script under the repo's .claude/scripts directory.
ALERT="$REPO_ROOT/.claude/scripts/post-bot-alert.sh"
|
||||
# Base URL of the GuruRMM API; the login and agent-list requests append their
# endpoint paths (/api/auth/login, /api/agents) to it.
# NOTE(review): plain http:// — the login request and the bearer token sent to
# /api/agents are not transport-encrypted; confirm this is acceptable on this network.
RMM="http://172.16.3.30:3001"
|
||||
|
||||
# Functional-error logger (skill name "rmm-diagnose"). Logs genuine operational
|
||||
# failures (auth, vault, dispatch) — NOT the RED/AMBER/GREEN diagnostic grade,
|
||||
# which is a normal by-design result. Soft-fails; never breaks the run.
|
||||
_logerr() {
  # Location of the shared error-log helper inside the repository.
  local helper="$REPO_ROOT/.claude/scripts/log-skill-error.sh"

  # Forward every caller argument after the fixed skill name. All helper output
  # is discarded and a non-zero exit is ignored, so logging cannot break the run.
  bash "$helper" "rmm-diagnose" "$@" >/dev/null 2>&1 || true
}
|
||||
|
||||
if [ ! -f "$PROBE" ]; then
|
||||
echo "[ERROR] Probe script not found: $PROBE" >&2
|
||||
exit 1
|
||||
@@ -65,6 +70,7 @@ RMM_PASS="$(bash "$VAULT" get-field infrastructure/gururmm-server.sops.yaml cred
|
||||
|
||||
# Abort before attempting a login when the vault did not yield usable GuruRMM
# credentials. A field counts as missing when it is empty or the literal string
# "null" (presumably what the vault helper prints for an absent field — confirm).
# Both fields get the same null check, matching the "(empty/null)" wording of
# the logged message; previously a "null" password slipped through and only
# surfaced later as a login failure.
if [ -z "$RMM_EMAIL" ] || [ -z "$RMM_PASS" ] \
  || [ "$RMM_EMAIL" = "null" ] || [ "$RMM_PASS" = "null" ]; then
  echo "[ERROR] Could not read GuruRMM credentials from vault (infrastructure/gururmm-server.sops.yaml)" >&2
  # Record the operational failure; _logerr soft-fails so it cannot mask the exit below.
  _logerr "vault read of GuruRMM credentials failed (empty/null)" --context "entry=infrastructure/gururmm-server.sops.yaml"
  exit 1
fi
|
||||
|
||||
@@ -75,6 +81,7 @@ TOKEN="$(curl -s -m 30 -X POST "$RMM/api/auth/login" \
|
||||
|
||||
# Guard clause: without a token from /api/auth/login nothing further can run,
# so report on stderr, record the failure in the error log, and stop.
[ -n "$TOKEN" ] || {
  echo "[ERROR] RMM login failed (no token returned)" >&2
  _logerr "RMM login failed (no token returned from /api/auth/login)" --context "url=$RMM"
  exit 1
}

echo "[OK] Authenticated to GuruRMM"
|
||||
@@ -85,6 +92,7 @@ echo "[OK] Authenticated to GuruRMM"
|
||||
# Fetch the agent list with the bearer token obtained at login (30 s cap, silent).
AGENTS="$(curl -s -m 30 -H "Authorization: Bearer $TOKEN" "$RMM/api/agents")"

# The response is usable only when it is non-empty AND parses as a JSON array;
# anything else is an operational failure. The first 80 characters of the
# response are attached to the log entry as context.
if ! { [ -n "$AGENTS" ] && echo "$AGENTS" | jq -e 'type=="array"' >/dev/null 2>&1; }; then
  echo "[ERROR] Could not retrieve agent list" >&2
  _logerr "GET /api/agents returned non-array/empty" --context "resp=${AGENTS:0:80}"
  exit 1
fi
|
||||
|
||||
@@ -263,6 +271,7 @@ PS
|
||||
# Pull the command status out of the chunk-upload result JSON.
CH_STATUS="$(echo "$CH_RESULT" | jq -r '.status')"

case "$CH_STATUS" in
  completed)
    echo "[OK] Uploaded chunk $IDX/$N_CHUNKS"
    ;;
  *)
    # Any status other than "completed" aborts the run. At most 200 bytes of
    # the remote stderr are shown to keep the error line readable.
    CH_STDERR_SNIPPET="$(echo "$CH_RESULT" | jq -r '.stderr' | head -c 200)"
    echo "[ERROR] Chunk $IDX upload failed: status=$CH_STATUS stderr=$CH_STDERR_SNIPPET" >&2
    _logerr "probe chunk upload failed" --context "host=$AGENT_HOST idx=$IDX/$N_CHUNKS status=$CH_STATUS"
    exit 1
    ;;
esac
|
||||
@@ -288,7 +297,7 @@ try {
|
||||
}
|
||||
PS
|
||||
|
||||
RESULT="$(dispatch_one "$RUN_SCRIPT" "$EXEC_TIMEOUT")" || { echo "[ERROR] Probe execution dispatch failed" >&2; exit 1; }
|
||||
# Dispatch the probe run script and capture its result. A non-zero status from
# dispatch_one is an operational failure: report it, log it, and stop.
if ! RESULT="$(dispatch_one "$RUN_SCRIPT" "$EXEC_TIMEOUT")"; then
  echo "[ERROR] Probe execution dispatch failed" >&2
  _logerr "probe execution dispatch failed" --context "host=$AGENT_HOST agent=$AGENT_ID"
  exit 1
fi
|
||||
CMD_ID="$(cat "$WORK_DIR/last_cmd_id" 2>/dev/null || echo unknown)"
|
||||
|
||||
FINAL_STATUS="$(echo "$RESULT" | jq -r '.status // empty')"
|
||||
@@ -317,6 +326,7 @@ if [ -z "$DIAG_JSON" ] || ! echo "$DIAG_JSON" | jq -e '.host' >/dev/null 2>&1; t
|
||||
fi
|
||||
echo "--- stdout (first 60 lines) ---" >&2
|
||||
printf '%s\n' "$STDOUT" | head -60 >&2
|
||||
_logerr "could not extract valid diagnostic JSON from probe output" --context "host=$AGENT_HOST status=$FINAL_STATUS exit=$EXIT_CODE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
Reference in New Issue
Block a user