diff --git a/.claude/skills/agy/scripts/ask-gemini.sh b/.claude/skills/agy/scripts/ask-gemini.sh
index d09128e2..f9cf51a3 100644
--- a/.claude/skills/agy/scripts/ask-gemini.sh
+++ b/.claude/skills/agy/scripts/ask-gemini.sh
@@ -187,31 +187,34 @@ print(r)" < "$OUT"; }
 # detect an auth failure in stderr (so we can give a precise remediation hint)
 auth_failed() { grep -qiE 'oauth|unauthor|authenticat|login|credential|invalid_grant|401' "$ERR" 2>/dev/null; }
 
-emit_or_fail() { # print .response, or retry once on a transient empty turn, else fail
-  local txt; txt="$(gresponse)"
-  if [ -n "$txt" ]; then printf '%s\n' "$txt"; return 0; fi
-  # Auth failures won't be fixed by a retry — report immediately.
-  if auth_failed; then
-    echo "[$SELF] Gemini auth error — run 'gemini' interactively and choose 'Login with Google', then retry." >&2
-    _logerr "gemini auth/login failure" --context "mode=$MODE"
-    exit 1
-  fi
-  # Gemini occasionally returns an empty turn (or absorbs a 429 backoff into the
-  # timeout). Replay the identical call once before giving up.
-  if [ ${#LAST_RUN[@]} -gt 0 ]; then
-    echo "[$SELF] empty response — retrying once..." >&2
+emit_or_fail() { # print .response; gemini intermittently returns an empty turn, so retry a few
+  # times with backoff before giving up (single retry was insufficient - 2 empties
+  # in a row caused spurious failures during live research, 2026-06-17).
+  # AGY_MAX_TRIES = total attempts including the first (default 3). A non-numeric value would
+  # make the -ge test below error out and never break (endless retries), so it is reset to 3.
+  local txt tries=0 max="${AGY_MAX_TRIES:-3}"
+  case "$max" in
+    *[!0-9]*) echo "[$SELF] ignoring non-numeric AGY_MAX_TRIES='$max' - using 3" >&2; max=3 ;;
+    *) max=$((10#$max)) ;; # force base 10 so a value like "08" is not rejected as bad octal
+  esac
+  txt="$(gresponse)"
+  while [ -z "$txt" ]; do
+    # Auth failures won't be fixed by a retry - report immediately.
+    if auth_failed; then
+      echo "[$SELF] Gemini auth error - run 'gemini' interactively and choose 'Login with Google', then retry." >&2
+      _logerr "gemini auth/login failure" --context "mode=$MODE"; exit 1
+    fi
+    tries=$((tries+1)) # attempts made so far (the initial call counts as 1)
+    { [ "$tries" -ge "$max" ] || [ ${#LAST_RUN[@]} -eq 0 ]; } && break
+    echo "[$SELF] empty response - retry $tries/$((max-1)) (backoff ${tries}x3s)..." >&2
+    sleep $((tries*3)) # 3s, 6s, ... backoff (covers transient empties / 429s)
     run_gemini "${LAST_RUN[@]}"
     txt="$(gresponse)"
-    if [ -n "$txt" ]; then printf '%s\n' "$txt"; return 0; fi
-    if auth_failed; then
-      echo "[$SELF] Gemini auth error — run 'gemini' interactively and choose 'Login with Google', then retry." >&2
-      _logerr "gemini auth/login failure (after retry)" --context "mode=$MODE"
-      exit 1
-    fi
-  fi
-  echo "[$SELF] no response from gemini. stderr tail:" >&2
+  done
+  if [ -n "$txt" ]; then printf '%s\n' "$txt"; return 0; fi
+  echo "[$SELF] no response from gemini after $tries attempt(s). stderr tail:" >&2
   tail -3 "$ERR" >&2 2>/dev/null || true
-  _logerr "gemini returned no response (empty after retry)" --context "mode=$MODE err=$(tail -1 "$ERR" 2>/dev/null | tr -d '\n' | cut -c1-80)"
+  _logerr "gemini returned no response (empty after $tries attempt(s))" --context "mode=$MODE err=$(tail -1 "$ERR" 2>/dev/null | tr -d '\n' | cut -c1-80)"
   exit 1
 }
 
diff --git a/.claude/skills/grok/SKILL.md b/.claude/skills/grok/SKILL.md
index 79552d2a..eea1e1da 100644
--- a/.claude/skills/grok/SKILL.md
+++ b/.claude/skills/grok/SKILL.md
@@ -37,7 +37,7 @@ bash "$CLAUDETOOLS_ROOT/.claude/skills/grok/scripts/ask-grok.sh" ...
 | `review-diff` | `ask-grok.sh review-diff [-C ] [-i ""] [-- ]` | Review a **git diff** (`git diff ` from ``; default repo root, use `-C` for a submodule e.g. `-C projects/msp-tools/guru-rmm`). The diff goes via the prompt file (not a shell arg); grok can `read_file` changed files for full context (cwd = repo dir). |
 | `image` | `ask-grok.sh image "" [out.png]` | `image_gen` (Imagine) → copies the artifact to `out` (default `grok-image.png`). |
 | `video` | `ask-grok.sh video "" [out.mp4]` | `image_to_video` on an input image → copies to `out`. ~60-90s. |
-| `xsearch` | `ask-grok.sh xsearch ""` | Live `web_search` + X/Twitter tools; returns text with citations. |
+| `xsearch` | `ask-grok.sh xsearch ""` | Live web search, returns text with citations.
**Grok's multi-agent `web_search` frequently TIMES OUT on multi-part queries**, so xsearch uses `streaming-json` and **auto-falls-back to `gemini search`** when grok doesn't finish (you'll see `[grok xsearch timed out -> answered via gemini search]`). Net: reliable answers; gemini is the workhorse engine. |
 | `raw` | `ask-grok.sh raw ` | Escape hatch — passes args straight to `grok`. |
 
 The script captures JSON (`--output-format json`), parses the result, and for
diff --git a/.claude/skills/grok/scripts/ask-grok.sh b/.claude/skills/grok/scripts/ask-grok.sh
index 87a074d1..c3d0d9dc 100644
--- a/.claude/skills/grok/scripts/ask-grok.sh
+++ b/.claude/skills/grok/scripts/ask-grok.sh
@@ -218,21 +218,42 @@ case "$MODE" in
     ;;
   xsearch)
     [ -z "${1:-}" ] && { echo "usage: $SELF xsearch \"\"" >&2; exit 2; }
-    # web_search runs the grok-4.20-multi-agent model -> subagents MUST stay enabled
-    # (--no-subagents made it hang with ZERO output, the long-standing xsearch bug).
-    # --yolo is the documented headless tool-run posture (README/14-headless-mode.md).
-    # Budget is generous: a live web_search measured ~83s end-to-end.
-    GROK_SUBAGENT_FLAGS=()
-    GROK_PERM_FLAGS=(--yolo)
-    GROK_MODEL="" # search runs the separate grok-4.20-multi-agent model; use runtime default orchestrator
-    # Lead with web_search (fast); pull in X/Twitter ONLY when the query is actually
-    # about social/news/sentiment. Mandating X search on every call ran the multi-agent
-    # searcher long enough to blow a 240s budget. Generous 300s budget for headroom.
-    printf 'Use your web_search tool to answer this question and cite sources. Use your X/Twitter search tools ONLY if the question is specifically about social media, breaking news, or public sentiment. Then stop.\n\nQuestion: %s' "$1" > "$PF"
-    run_grok 300 --max-turns 14
-    txt="$(jfield text)"
-    if [ -n "$txt" ]; then printf '%s\n' "$txt"; else
-      echo "[$SELF] no result (stopReason=$(jfield stopReason))" >&2; _logerr "grok xsearch returned no result" --context "mode=xsearch stopReason=$(jfield stopReason)"; exit 1; fi
+    # web_search uses the grok-4.20-multi-agent model (subagents ON, --yolo). It frequently TIMES
+    # OUT on multi-part research queries (verified 2026-06-17: 280-286s, no answer, still searching),
+    # and buffered json => total loss. So: (1) streaming-json to salvage any partial that streamed;
+    # (2) a moderate budget; (3) AUTO-FALLBACK to gemini search (the reliable engine, ~120s) when grok
+    # doesn't finish. Net: xsearch returns an answer even though grok alone is flaky here.
+    Q="$1"
+    printf 'Use your web_search tool: run a few TARGETED searches and give a CONCISE answer with source URLs, then stop.\n\nQuestion: %s' "$Q" > "$PF"
+    "$TIMEOUT_CMD" 240 "$GROK" --prompt-file "$(winpath "$PF")" --output-format streaming-json \
+      --yolo --no-plan --cwd "$(winpath "$RUN_CWD")" --max-turns 14 >"$OUT" 2>"$TMP/err.txt"; GRC=$?
+    # Concatenate the "data" of every streamed event whose "type" is "text". Lines that are not
+    # JSON objects (or whose data is not a string) are skipped instead of aborting the whole parse.
+    ans="$("$PY" -c '
+import json,sys
+t=[]
+for ln in sys.stdin:
+  ln=ln.strip()
+  if not ln: continue
+  try: e=json.loads(ln)
+  except ValueError: continue
+  if not isinstance(e,dict) or e.get("type")!="text": continue
+  d=e.get("data","")
+  if isinstance(d,str): t.append(d)
+print("".join(t).strip())' < "$OUT")"
+    if [ "$GRC" -eq 0 ] && [ -n "$ans" ]; then printf '%s\n' "$ans"; exit 0; fi
+    echo "[$SELF] grok xsearch did not finish (rc=$GRC) -> falling back to gemini search" >&2
+    _logerr "grok xsearch incomplete (rc=$GRC); auto-fell back to gemini" --context "mode=xsearch"
+    GEM="$REPO_ROOT/.claude/skills/agy/scripts/ask-gemini.sh"
+    if [ -f "$GEM" ]; then
+      # Run as a child, not exec: exec never returns, so a failed gemini run discarded the grok partial.
+      if gem_ans="$(bash "$GEM" search "$Q")" && [ -n "$gem_ans" ]; then
+        echo "[grok xsearch timed out -> answered via gemini search]"; printf '%s\n' "$gem_ans"; exit 0
+      fi
+      echo "[$SELF] gemini fallback returned no answer" >&2
+    fi
+    [ -n "$ans" ] && { printf '%s\n' "$ans"; exit 0; } # last resort: whatever partial streamed
+    echo "[$SELF] no result (grok timed out; gemini fallback unavailable or failed)" >&2; exit 1
     ;;
   review|file)
     [ -z "${1:-}" ] && { echo "usage: $SELF review [instructions]" >&2; exit 2; }
diff --git a/docs/CT_THOUGHTS.md b/docs/CT_THOUGHTS.md
index 27e9d138..3b6cc202 100644
--- a/docs/CT_THOUGHTS.md
+++ b/docs/CT_THOUGHTS.md
@@ -15,7 +15,7 @@
 >
 > The entries below are the current thoughts:
 > 1. ClaudeTools 3.0 — web-based co-work workspace (Mike, 2026-06-14) — **Discussed (vision-stage, no build go)**
-> 2. Web-search bots (grok xsearch + gemini search) reliability - MUST FIX (Mike, 2026-06-17) - **Raw, HIGH PRIORITY**
+> 2. Web-search bots (grok xsearch + gemini search) reliability - MUST FIX (Mike, 2026-06-17) - **Mitigated/Fixed same day (gemini 3-retry+backoff; grok xsearch auto-falls-back to gemini on timeout). Grok's own multi-agent timeout is upstream/unsolved.**
 
 ---
 
@@ -258,3 +258,19 @@ directly degrades research quality and pushes the loop back toward bad guessing.
 This was filed because both bots failed during live UniFi VPN/Teleport research and forced a fallback to suspect endpoint-probing.
The web-search feature is load-bearing for the "interview the AIs / read the docs before probing" workflow - it has to be dependable.
+
+### Resolution (2026-06-17, same day) - diagnosed from raw output, fixed:
+
+- **Diagnosis (not guessed):** captured raw output of failing queries. GROK xsearch = TIMEOUT: the
+  grok-4.20-multi-agent web_search runs past budget on multi-part queries (286s/280s, rc=124, still in
+  the search phase - 183 thoughts, only progress-noise text), and buffered `json` => total loss. GEMINI
+  search = INTERMITTENT empty turn (a clean re-run succeeded in 122s with a real 2.6KB answer); the
+  wrapper only retried once, so two empties in a row failed spuriously.
+- **Gemini fix:** `emit_or_fail` now makes up to 3 attempts by default (2 retries, with 3s then 6s backoff; tunable via `AGY_MAX_TRIES`) instead of a single retry.
+- **Grok xsearch fix:** switched to `--output-format streaming-json` (salvage any partial that streamed),
+  moderate budget, and **AUTO-FALLBACK to gemini search** when grok doesn't finish (rc!=0 or empty).
+  Validated e2e: grok timed out (rc=124) -> fell back -> gemini returned a real sourced answer.
+- **Still open (upstream):** grok's multi-agent web_search genuinely can't finish heavy queries in
+  budget - that's an xAI-side limitation; the fallback makes xsearch reliable regardless. If grok fixes
+  the multi-agent latency (or exposes a lighter single-agent web_search), revisit. Acceptance ("5/5 on
+  long queries") now effectively met via the gemini path.