feat: session recovery toolset (orphan detector + /recover)

Reconstructs session logs from Claude Code transcripts when a session crashes or is closed before /save. Two entry points: - /recover <uuid|latest> : manual, Claude-reviewed reconstruction - detect_orphaned_sessions.py : scheduled scan that auto-builds logs for substantive, unsaved, not-yet-recovered transcripts (banner-marked RECOVERED-UNVERIFIED), commits them, and posts a #bot-alerts FYI. recover_session.py is the shared engine: Python extracts the verbatim command/config/reference timeline; Ollama drafts prose-only narrative. Machine-local ledger (.claude/state/) prevents reprocessing. Reviewed: git add scoped to own files, ledger written only after successful push, per-uuid idempotency, --max cap for unattended runs. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-01 18:33:07 -07:00
parent e8144a862e
commit eed3ece2c7
9 changed files with 1897 additions and 0 deletions
--- a/.claude/scripts/detect_orphaned_sessions.py
+++ b/.claude/scripts/detect_orphaned_sessions.py
@@ -0,0 +1,431 @@
+#!/usr/bin/env python3
+"""detect_orphaned_sessions.py -- find and auto-recover unsaved Claude Code sessions.
+
+A session is "orphaned" when its transcript records substantive (mutating) work
+but the session was never saved (no /save, /scc, or /checkpoint, and no write into
+a session-logs/ path). This script scans the per-machine transcript directory,
+classifies each idle transcript via the recover_session engine, auto-builds a
+banner-marked recovery log for each orphan, records every processed uuid in a
+machine-local ledger so it is never re-scanned, commits + pushes the recovered
+logs, and posts an FYI to #bot-alerts.
+
+Modes:
+  (default)        full run: build logs, commit, push, update ledger, alert
+  --dry-run        scan + print a report table; write/commit/alert nothing
+  --idle-min N     minutes of mtime-idle before a transcript is eligible (default 90)
+  --max N          max orphan logs to build per run, oldest-first (default 25)
+  --no-commit      build logs, but skip git commit/push; recovered uuids stay
+                   out of the ledger, so the next run re-attempts them
+  --no-alert       build + commit + push + ledger, but skip the Discord alert
+
+The detector NEVER touches sync.sh; it does its own git add/commit/push so it has
+no surprising side effects. Soft-fails on git/alert errors (work is already saved
+to disk -- those are best-effort).
+
+stdlib only; targets Python 3.11+.
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+# Import the shared engine (same directory).
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+import recover_session as engine  # noqa: E402
+
+
+LEDGER_REL = Path(".claude") / "state" / "recovered-sessions.json"
+
+
+def _now_iso() -> str:
+    return datetime.now(timezone.utc).isoformat()
+
+
+def ledger_path() -> Path:
+    return engine.repo_root() / LEDGER_REL
+
+
+def load_ledger() -> dict:
+    p = ledger_path()
+    if p.exists():
+        try:
+            return json.loads(p.read_text(encoding="utf-8"))
+        except (OSError, ValueError):
+            return {}
+    return {}
+
+
+def save_ledger(ledger: dict) -> None:
+    p = ledger_path()
+    p.parent.mkdir(parents=True, exist_ok=True)
+    p.write_text(json.dumps(ledger, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
+
+
+def _scope_str(scope: dict) -> str:
+    t = scope.get("type", "general")
+    if t == "general":
+        return "general"
+    return f"{t}:{scope.get('slug', '?')}"
+
+
+def scan(idle_min: int, ledger: dict) -> tuple[list[dict], list[dict]]:
+    """Scan transcripts.
+
+    Returns (eligible, recoverable):
+      eligible    -- every transcript that is past idle and not already in ledger
+                     (each a dict with parsed metadata + verdict fields)
+      recoverable -- the subset that are orphans (substantive and not saved)
+    """
+    base = engine.transcript_base_dir()
+    now = datetime.now().timestamp()
+    idle_secs = idle_min * 60
+
+    eligible: list[dict] = []
+    recoverable: list[dict] = []
+
+    if not base.is_dir():
+        return eligible, recoverable
+
+    for jf in sorted(base.glob("*.jsonl")):
+        uuid = jf.stem
+        try:
+            mtime = jf.stat().st_mtime
+        except OSError:
+            continue
+        # Skip recently-active sessions.
+        if (now - mtime) < idle_secs:
+            continue
+        # Skip anything already processed.
+        if uuid in ledger:
+            continue
+
+        parsed = engine.parse_transcript(jf)
+        verdict = engine.classify(parsed)
+        orphan = bool(verdict["substantive"] and not verdict["saved"])
+        rec = {
+            "uuid": uuid,
+            "path": jf,
+            "mtime": mtime,
+            "substantive": verdict["substantive"],
+            "saved": verdict["saved"],
+            "orphan": orphan,
+            "scope": verdict["scope"],
+            "title": verdict["title"],
+            "parsed": parsed,
+        }
+        # would-write path (metadata-cheap; no Ollama)
+        rec["would_write"] = str(
+            engine.compute_output_path(parsed, verdict["scope"], verdict["title"])
+        )
+        eligible.append(rec)
+        if orphan:
+            recoverable.append(rec)
+
+    # Process OLDEST-FIRST so a capped run drains the longest-waiting orphans
+    # first. Prefer the transcript's first_ts when available; fall back to mtime.
+    def _age_key(r: dict):
+        ts = (r.get("parsed").first_ts if r.get("parsed") else "") or ""
+        if ts:
+            try:
+                return datetime.fromisoformat(ts.replace("Z", "+00:00")).timestamp()
+            except ValueError:
+                pass
+        return r.get("mtime", 0.0)
+
+    eligible.sort(key=_age_key)
+    recoverable.sort(key=_age_key)
+
+    return eligible, recoverable
+
+
+def print_dry_run_table(eligible: list[dict]) -> None:
+    if not eligible:
+        print("[INFO] No eligible (past-idle, unprocessed) transcripts found.")
+        return
+    headers = ["uuid", "mtime", "subst", "saved", "orphan", "scope", "would-write-path"]
+    rows = []
+    for r in eligible:
+        mt = datetime.fromtimestamp(r["mtime"]).strftime("%Y-%m-%d %H:%M")
+        rows.append(
+            [
+                r["uuid"][:8],
+                mt,
+                "yes" if r["substantive"] else "no",
+                "yes" if r["saved"] else "no",
+                "YES" if r["orphan"] else "no",
+                _scope_str(r["scope"]),
+                r["would_write"],
+            ]
+        )
+    widths = [len(h) for h in headers]
+    for row in rows:
+        for i, cell in enumerate(row):
+            widths[i] = max(widths[i], len(str(cell)))
+    fmt = "  ".join("{:<" + str(w) + "}" for w in widths)
+    print(fmt.format(*headers))
+    print(fmt.format(*["-" * w for w in widths]))
+    for row in rows:
+        print(fmt.format(*[str(c) for c in row]))
+    n_orphan = sum(1 for r in eligible if r["orphan"])
+    print()
+    print(f"[INFO] {len(eligible)} eligible, {n_orphan} orphan(s) would be recovered.")
+
+
+def _existing_recovered_for_uuid(out_dir: Path, uuid: str) -> Path | None:
+    """Return a prior recovered log for THIS uuid in ``out_dir``, if one exists.
+
+    The tool's own collision filename embeds the 8-char uuid prefix as a trailing
+    ``-recovered-...-<short>.md`` suffix (see ``compute_output_path``). Matching on
+    that prefix lets a re-run overwrite its OWN prior draft for the same uuid in
+    place -- the one safe overwrite -- instead of minting a second suffixed copy.
+
+    Only files that are clearly recovered drafts (``-recovered-`` in the name AND
+    ending in ``-<short>.md``) are considered. A genuine non-recovered human log
+    will never match, so its suffix protection is preserved.
+    """
+    if not out_dir.is_dir():
+        return None
+    short = uuid[:8]
+    suffix = f"-{short}.md"
+    for f in out_dir.glob(f"*-recovered-*{suffix}"):
+        if f.is_file() and f.name.endswith(suffix):
+            return f
+    return None
+
+
+def recover_one(rec: dict) -> str:
+    """Build + write the recovery log for one orphan. Returns the written path.
+
+    Idempotent per-uuid: if a prior recovered draft for THIS uuid already exists
+    in the target directory (a run that died after writing but before the ledger
+    was updated), overwrite that same file in place rather than creating a new
+    suffixed copy. Never overwrites a non-recovered human log.
+    """
+    parsed = rec["parsed"]
+    markdown, meta = engine.build_log(parsed)
+    out_path = Path(meta["path_would_be"])
+    prior = _existing_recovered_for_uuid(out_path.parent, rec["uuid"])
+    if prior is not None:
+        out_path = prior
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(markdown, encoding="utf-8")
+    rec["written"] = str(out_path)
+    rec["date"] = meta["date"]
+    return str(out_path)
+
+
+def git(*args: str) -> subprocess.CompletedProcess:
+    return subprocess.run(
+        ["git", *args],
+        cwd=str(engine.repo_root()),
+        capture_output=True,
+        text=True,
+        timeout=120,
+    )
+
+
+def _current_branch() -> str:
+    """Return the current git branch name, or empty string if undeterminable."""
+    res = git("rev-parse", "--abbrev-ref", "HEAD")
+    if res.returncode == 0:
+        name = res.stdout.strip()
+        if name and name != "HEAD":
+            return name
+    return ""
+
+
+def commit_and_push(written_paths: list[str], count: int) -> bool:
+    """Stage only the recovered logs, commit, push. Soft-fail on errors.
+
+    NEVER stages the ledger -- it is machine-local and correctly gitignored;
+    appending it to ``git add`` aborts the whole add (exit 1) and stages nothing.
+
+    Returns True only when BOTH the commit AND the push succeed. On any failure
+    returns False so the caller knows not to mark these uuids ``recovered`` (the
+    next run must re-attempt them).
+    """
+    root = engine.repo_root()
+    rel_paths = []
+    for p in written_paths:
+        try:
+            rel_paths.append(str(Path(p).resolve().relative_to(root)))
+        except ValueError:
+            rel_paths.append(p)
+
+    add = git("add", "--", *rel_paths)
+    if add.returncode != 0:
+        print(f"[WARNING] git add failed; logs are on disk but uncommitted: {add.stderr.strip()}", file=sys.stderr)
+        return False
+
+    msg = (
+        f"chore: auto-recover {count} unsaved session log(s)\n\n"
+        f"{engine._COMMIT_FOOTER}"
+    )
+    commit = git("commit", "-m", msg)
+    if commit.returncode != 0:
+        # Nothing to commit, or hook failure -- soft-fail.
+        print(f"[WARNING] git commit returned non-zero: {commit.stdout.strip()} {commit.stderr.strip()}", file=sys.stderr)
+        return False
+    print(f"[OK] committed {count} recovered log(s).")
+
+    branch = _current_branch()
+    if branch:
+        push = git("push", "origin", branch)
+    else:
+        push = git("push")
+    if push.returncode != 0:
+        target = f"origin {branch}" if branch else "origin"
+        print(
+            f"[WARNING] git push to {target} failed (commit is local): {push.stderr.strip()}",
+            file=sys.stderr,
+        )
+        return False
+    print(f"[OK] pushed to origin{(' ' + branch) if branch else ''}.")
+    return True
+
+
+def post_alert(recovered: list[dict]) -> None:
+    """Post an FYI to #bot-alerts via post-bot-alert.sh. Soft-fail."""
+    script = engine.repo_root() / ".claude" / "scripts" / "post-bot-alert.sh"
+    if not script.exists():
+        print("[WARNING] post-bot-alert.sh not found; alert skipped.", file=sys.stderr)
+        return
+    bash = shutil.which("bash")
+    if not bash:
+        print(
+            "[WARNING] 'bash' not found on PATH (restricted scheduler env?); "
+            "#bot-alerts FYI skipped. Recovered logs are already committed.",
+            file=sys.stderr,
+        )
+        return
+    lines = [
+        f"[INFO] Auto-recovered {len(recovered)} unsaved session log(s) -- "
+        f"already saved to the repo; FYI, please review and remove the UNVERIFIED banner:"
+    ]
+    for r in recovered:
+        lines.append(
+            f"- {r['uuid'][:8]} | {r.get('date', '?')} | {_scope_str(r['scope'])} | {r.get('written', '?')}"
+        )
+    message = "\n".join(lines)
+    try:
+        res = subprocess.run(
+            [bash, str(script), message, "bot"],
+            cwd=str(engine.repo_root()),
+            capture_output=True,
+            text=True,
+            timeout=30,
+        )
+        out = (res.stdout or "").strip() or (res.stderr or "").strip()
+        if out:
+            print(out)
+    except (OSError, subprocess.SubprocessError) as e:
+        print(f"[WARNING] alert post failed: {e}", file=sys.stderr)
+
+
+def main(argv: list[str] | None = None) -> int:
+    # Force UTF-8 stdout (Windows console defaults to cp1252; titles/paths in
+    # the dry-run table can contain characters outside that codepage).
+    try:
+        sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+    except (AttributeError, ValueError):
+        pass
+
+    parser = argparse.ArgumentParser(
+        description="Detect and auto-recover unsaved Claude Code sessions."
+    )
+    parser.add_argument("--dry-run", action="store_true", help="scan + print report; no writes/commit/alert")
+    parser.add_argument("--idle-min", type=int, default=90, help="minutes of mtime-idle before eligible (default 90)")
+    parser.add_argument("--max", type=int, default=25, dest="max_recover", help="max orphan logs to build per run, oldest-first (default 25)")
+    parser.add_argument("--no-commit", action="store_true", help="skip git commit/push")
+    parser.add_argument("--no-alert", action="store_true", help="skip the Discord alert")
+    args = parser.parse_args(argv)
+
+    # Respect the ledger in both modes (dry-run still skips already-processed).
+    ledger = load_ledger()
+
+    eligible, recoverable = scan(args.idle_min, ledger)
+
+    if args.dry_run:
+        print_dry_run_table(eligible)
+        return 0
+
+    if not eligible:
+        print("[INFO] No eligible transcripts to process.")
+        return 0
+
+    written_paths: list[str] = []
+    recovered_recs: list[dict] = []
+    deferred = 0
+    built = 0
+
+    for rec in eligible:
+        uuid = rec["uuid"]
+        if rec["orphan"]:
+            # Cap actual log-builds per run (oldest-first). Remaining orphans are
+            # left OUT of the ledger so the next run re-attempts them.
+            if built >= args.max_recover:
+                deferred += 1
+                continue
+            try:
+                path = recover_one(rec)
+            except Exception as e:  # noqa: BLE001 -- never let one bad transcript abort the run
+                print(f"[WARNING] failed to recover {uuid[:8]}: {e}", file=sys.stderr)
+                # No on-disk artifact -> safe to mark immediately.
+                ledger[uuid] = {"verdict": "error", "at": _now_iso(), "path": None, "error": str(e)}
+                continue
+            built += 1
+            written_paths.append(path)
+            recovered_recs.append(rec)
+            print(f"[OK] recovered {uuid[:8]} -> {path}")
+        elif rec["saved"]:
+            # No on-disk artifact -> safe to mark immediately.
+            ledger[uuid] = {"verdict": "skipped-saved", "at": _now_iso(), "path": None}
+        else:
+            ledger[uuid] = {"verdict": "skipped-trivial", "at": _now_iso(), "path": None}
+
+    if deferred:
+        print(f"[INFO] {deferred} more orphan(s) deferred to next run (--max {args.max_recover}).")
+
+    # Persist the skipped/error verdicts now (they have no artifact, so they are
+    # safe regardless of the commit/push outcome below).
+    save_ledger(ledger)
+
+    if not recovered_recs:
+        print("[INFO] No orphans recovered (all eligible sessions were saved or trivial).")
+        return 0
+
+    if not args.no_commit:
+        pushed = commit_and_push(written_paths, len(recovered_recs))
+        if pushed:
+            # H1: only mark uuids 'recovered' AFTER a successful commit+push, so a
+            # push failure leaves them out of the ledger for the next run to retry.
+            for rec in recovered_recs:
+                ledger[rec["uuid"]] = {
+                    "verdict": "recovered",
+                    "at": _now_iso(),
+                    "path": rec.get("written"),
+                }
+            save_ledger(ledger)
+        else:
+            print(
+                "[WARNING] commit/push did not succeed; recovered uuids left UNLEDGERED "
+                "so the next run re-attempts them (logs are on disk).",
+                file=sys.stderr,
+            )
+    else:
+        print("[INFO] --no-commit set; recovered logs left unstaged and UNLEDGERED (next run will re-attempt).")
+
+    if not args.no_alert:
+        post_alert(recovered_recs)
+    else:
+        print("[INFO] --no-alert set; Discord alert skipped.")
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/.claude/scripts/recover_session.py
+++ b/.claude/scripts/recover_session.py
--- a/.claude/scripts/register-orphan-detector.ps1
+++ b/.claude/scripts/register-orphan-detector.ps1
@@ -0,0 +1,95 @@
+# register-orphan-detector.ps1
+# Register the "ClaudeTools - Orphaned Session Detector" scheduled task on this
+# Windows machine. The task runs detect_orphaned_sessions.py, which scans the
+# per-machine Claude Code transcript directory for unsaved substantive sessions,
+# auto-builds banner-marked recovery logs, commits + pushes them, and posts an
+# FYI to #bot-alerts.
+#
+# Mirrors the GrepAI watcher registration pattern in .claude/OLLAMA.md.
+#
+# Triggers:
+#   - AtLogOn (catch sessions lost since the last logon)
+#   - Daily, repeating every 4 hours (catch crashes during a long workday;
+#     4h cadence pairs with the detector's 90-minute idle gate so an active
+#     session is never grabbed mid-flight)
+#
+# Idempotent: -Force replaces any existing task with the same name.
+# This script only REGISTERS the task. It does not run the detector now.
+#
+# Run from an ordinary (non-admin) PowerShell:
+#   powershell -ExecutionPolicy Bypass -File D:\claudetools\.claude\scripts\register-orphan-detector.ps1
+
+# Treat cmdlet errors as terminating so a failed step stops the script instead
+# of falling through to Register-ScheduledTask with half-built inputs.
+$ErrorActionPreference = "Stop"
+
+$TaskName    = "ClaudeTools - Orphaned Session Detector"
+
+# Resolve the repo root portably. Prefer claudetools_root from identity.json
+# (per-machine, gitignored); fall back to two levels up from this script
+# (.claude/scripts/ -> repo root), resolved to a full path.
+$ScriptDir   = $PSScriptRoot
+$FallbackRoot = (Resolve-Path (Join-Path $ScriptDir "..\..")).Path
+$IdentityPath = Join-Path $FallbackRoot ".claude\identity.json"
+$RepoRoot    = $FallbackRoot
+if (Test-Path $IdentityPath) {
+    try {
+        $identity = Get-Content -Raw -Path $IdentityPath | ConvertFrom-Json
+        # Only trust claudetools_root when it is set AND points at an existing path.
+        if ($identity.claudetools_root -and (Test-Path $identity.claudetools_root)) {
+            $RepoRoot = (Resolve-Path $identity.claudetools_root).Path
+        }
+    } catch {
+        # Unreadable/invalid identity.json is non-fatal: keep the fallback root.
+        Write-Host "[WARNING] Could not parse $IdentityPath; using $FallbackRoot" -ForegroundColor Yellow
+    }
+}
+$Script      = Join-Path $RepoRoot ".claude\scripts\detect_orphaned_sessions.py"
+
+# Refuse to register a task that would point at a non-existent detector.
+if (-not (Test-Path $Script)) {
+    Write-Host "[ERROR] Detector not found at $Script" -ForegroundColor Red
+    exit 1
+}
+
+# Resolve the py launcher's full path (the action's Execute wants an absolute
+# path; "py" alone usually resolves but we pin it for reliability under the
+# Task Scheduler's environment).
+$PyCmd = Get-Command py -ErrorAction SilentlyContinue
+if ($null -ne $PyCmd) {
+    $PyPath = $PyCmd.Source
+} else {
+    $PyPath = "py"  # fall back to PATH resolution at run time
+}
+
+# The script path is wrapped in escaped double quotes so a repo path containing
+# spaces is passed to the launcher as one argument. The detector is started
+# with no extra flags, i.e. its default (full-run) mode.
+$Action = New-ScheduledTaskAction `
+    -Execute $PyPath `
+    -Argument "`"$Script`"" `
+    -WorkingDirectory $RepoRoot
+
+# Trigger 1: at logon for the current user.
+$TriggerLogon = New-ScheduledTaskTrigger -AtLogOn -User $env:USERNAME
+
+# Trigger 2: daily at a fixed start, repeating every 4 hours all day.
+# The repetition pattern is built on a throwaway -Once trigger and its
+# .Repetition object is copied onto the daily trigger.
+$TriggerDaily = New-ScheduledTaskTrigger -Daily -At 9am
+$TriggerDaily.Repetition = (New-ScheduledTaskTrigger `
+    -Once -At 9am `
+    -RepetitionInterval (New-TimeSpan -Hours 4) `
+    -RepetitionDuration (New-TimeSpan -Hours 24)).Repetition
+
+# Task settings: 30-minute execution time limit per run.
+# NOTE(review): -MultipleInstances IgnoreNew presumably drops a new start while
+# a previous run is still active, and -StartWhenAvailable presumably runs a
+# missed start once the machine is available -- confirm against the
+# New-ScheduledTaskSettingsSet documentation.
+$Settings = New-ScheduledTaskSettingsSet `
+    -ExecutionTimeLimit (New-TimeSpan -Minutes 30) `
+    -MultipleInstances IgnoreNew `
+    -StartWhenAvailable `
+    -DontStopOnIdleEnd
+
+# Register (or, via -Force, replace) the task with both triggers; the returned
+# task object is discarded so only the summary lines below are printed.
+Register-ScheduledTask `
+    -TaskName $TaskName `
+    -Action $Action `
+    -Trigger $TriggerLogon, $TriggerDaily `
+    -Settings $Settings `
+    -Description "Scans Claude Code transcripts for unsaved substantive sessions and auto-recovers them into session logs." `
+    -Force | Out-Null
+
+# Summary of what was registered, plus copy-paste commands for follow-up.
+Write-Host "[OK] Registered scheduled task '$TaskName'."
+Write-Host "[INFO] Action:   $PyPath `"$Script`""
+Write-Host "[INFO] WorkDir:  $RepoRoot"
+Write-Host "[INFO] Triggers: AtLogOn ($env:USERNAME) + daily every 4h"
+Write-Host "[INFO] To inspect:  Get-ScheduledTask -TaskName '$TaskName' | Format-List"
+Write-Host "[INFO] To run now:  Start-ScheduledTask -TaskName '$TaskName'"
+Write-Host "[INFO] To remove:   Unregister-ScheduledTask -TaskName '$TaskName' -Confirm:`$false"