feat: session recovery toolset (orphan detector + /recover)

Reconstructs session logs from Claude Code transcripts when a session
crashes or is closed before /save. Two entry points:

- /recover <uuid|latest> : manual, Claude-reviewed reconstruction
- detect_orphaned_sessions.py : scheduled scan that auto-builds logs for
  substantive, unsaved, not-yet-recovered transcripts (banner-marked
  RECOVERED-UNVERIFIED), commits them, and posts a #bot-alerts FYI.

recover_session.py is the shared engine: Python extracts the verbatim
command/config/reference timeline; Ollama drafts prose-only narrative.
Machine-local ledger (.claude/state/) prevents reprocessing. Reviewed:
git add scoped to own files, ledger written only after successful push,
per-uuid idempotency, --max cap for unattended runs.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-01 18:33:07 -07:00
parent e8144a862e
commit eed3ece2c7
9 changed files with 1897 additions and 0 deletions

View File

@@ -0,0 +1,431 @@
#!/usr/bin/env python3
"""detect_orphaned_sessions.py -- find and auto-recover unsaved Claude Code sessions.
A session is "orphaned" when its transcript records substantive (mutating) work
but the session was never saved (no /save, /scc, or /checkpoint, and no write into
a session-logs/ path). This script scans the per-machine transcript directory,
classifies each idle transcript via the recover_session engine, auto-builds a
banner-marked recovery log for each orphan, records every processed uuid in a
machine-local ledger so it is never re-scanned, commits + pushes the recovered
logs, and posts an FYI to #bot-alerts.
Modes:
(default) full run: build logs, update ledger, commit, push, alert
--dry-run scan + print a report table; write/commit/alert nothing
--idle-min N minutes of mtime-idle before a transcript is eligible (default 90)
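--max N cap on orphan logs built per run, oldest-first (default 25)
--no-commit build logs, but skip git commit/push (recovered uuids stay out of the ledger, so the next run re-attempts them)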
--no-alert build + ledger + commit, but skip the Discord alert
The detector NEVER touches sync.sh; it does its own git add/commit/push so it has
no surprising side effects. Soft-fails on git/alert errors (work is already saved
to disk -- those are best-effort).
stdlib only; targets Python 3.11+.
"""
from __future__ import annotations
import argparse
import json
import shutil
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
# Import the shared engine (same directory).
sys.path.insert(0, str(Path(__file__).resolve().parent))
import recover_session as engine # noqa: E402
# Ledger location relative to the repo root (joined onto engine.repo_root() by
# ledger_path()). The ledger maps each processed transcript uuid to its verdict
# so later scans skip it.
LEDGER_REL = Path(".claude") / "state" / "recovered-sessions.json"
def _now_iso() -> str:
    """Return the current time in UTC as an ISO-8601 string (with offset)."""
    stamp = datetime.now(tz=timezone.utc)
    return stamp.isoformat()
def ledger_path() -> Path:
    """Return the ledger file path: ``LEDGER_REL`` under ``engine.repo_root()``."""
    root = engine.repo_root()
    return root / LEDGER_REL
def load_ledger() -> dict:
    """Load the processed-session ledger.

    Returns:
        The ledger mapping (uuid -> verdict record), or ``{}`` when the file
        is missing, unreadable, not valid JSON, or valid JSON that is not an
        object.

    A missing file is the normal first-run case and is silent. Any other
    failure is reported on stderr, because falling back to an empty ledger
    means every past-idle transcript will be re-scanned.
    """
    p = ledger_path()
    try:
        data = json.loads(p.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"[WARNING] ledger {p} is unreadable ({e}); treating it as empty.", file=sys.stderr)
        return {}
    if not isinstance(data, dict):
        # e.g. ``null`` or ``[]``: callers do ``uuid in ledger`` and
        # ``ledger[uuid] = ...``, which need a real mapping.
        print(
            f"[WARNING] ledger {p} does not contain a JSON object; treating it as empty.",
            file=sys.stderr,
        )
        return {}
    return data
def save_ledger(ledger: dict) -> None:
    """Persist ``ledger`` as indented UTF-8 JSON at ``ledger_path()``.

    The payload is written to a sibling ``*.tmp`` file and then renamed over
    the real ledger, so an interrupted write cannot leave a truncated ledger
    behind (a truncated ledger would load as empty and cause every session to
    be reprocessed).

    Raises:
        OSError: if the directory cannot be created or the file cannot be
            written/renamed. The temp file is removed before re-raising.
    """
    p = ledger_path()
    p.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(ledger, indent=2, ensure_ascii=False) + "\n"
    tmp = p.with_name(p.name + ".tmp")
    try:
        tmp.write_text(payload, encoding="utf-8")
        # Path.replace overwrites an existing destination in one rename.
        tmp.replace(p)
    except OSError:
        tmp.unlink(missing_ok=True)
        raise
def _scope_str(scope: dict) -> str:
    """Render a scope dict as a short label.

    A missing or ``"general"`` type yields ``"general"``; anything else yields
    ``"<type>:<slug>"``, with ``?`` standing in for a missing slug.
    """
    kind = scope.get("type", "general")
    return "general" if kind == "general" else f"{kind}:{scope.get('slug', '?')}"
def scan(idle_min: int, ledger: dict) -> tuple[list[dict], list[dict]]:
    """Scan the transcript directory for unprocessed, idle sessions.

    Args:
        idle_min: minutes a transcript's mtime must be in the past before it
            is considered (recently-touched sessions are skipped).
        ledger: already-processed uuids; any transcript whose stem is a key
            here is skipped.

    Returns:
        ``(eligible, recoverable)``, both sorted oldest-first:
          eligible    -- every transcript past idle and not in the ledger
                         (each a dict with parsed metadata + verdict fields)
          recoverable -- the subset that are orphans (substantive, not saved)

    A transcript the engine cannot parse or classify is reported on stderr and
    skipped for this run instead of aborting the whole scan. It is not added to
    either list, so it is not ledgered and will be retried next run.
    """
    base = engine.transcript_base_dir()
    now = datetime.now().timestamp()
    idle_secs = idle_min * 60
    eligible: list[dict] = []
    recoverable: list[dict] = []
    if not base.is_dir():
        return eligible, recoverable

    for jf in sorted(base.glob("*.jsonl")):
        uuid = jf.stem
        try:
            mtime = jf.stat().st_mtime
        except OSError:
            # File vanished or is unreadable between glob and stat.
            continue
        # Skip recently-active sessions.
        if (now - mtime) < idle_secs:
            continue
        # Skip anything already processed.
        if uuid in ledger:
            continue
        try:
            parsed = engine.parse_transcript(jf)
            verdict = engine.classify(parsed)
            orphan = bool(verdict["substantive"] and not verdict["saved"])
            rec = {
                "uuid": uuid,
                "path": jf,
                "mtime": mtime,
                "substantive": verdict["substantive"],
                "saved": verdict["saved"],
                "orphan": orphan,
                "scope": verdict["scope"],
                "title": verdict["title"],
                "parsed": parsed,
                # would-write path (metadata-cheap; no Ollama)
                "would_write": str(
                    engine.compute_output_path(parsed, verdict["scope"], verdict["title"])
                ),
            }
        except Exception as e:  # noqa: BLE001 -- one bad transcript must not abort the scan
            print(
                f"[WARNING] could not classify {uuid[:8]}; skipped this run: {e}",
                file=sys.stderr,
            )
            continue
        eligible.append(rec)
        if orphan:
            recoverable.append(rec)

    # Process OLDEST-FIRST so a capped run drains the longest-waiting orphans
    # first. Prefer the transcript's first_ts when available; fall back to mtime.
    def _age_key(r: dict):
        ts = (r.get("parsed").first_ts if r.get("parsed") else "") or ""
        if ts:
            try:
                return datetime.fromisoformat(ts.replace("Z", "+00:00")).timestamp()
            except ValueError:
                pass
        return r.get("mtime", 0.0)

    eligible.sort(key=_age_key)
    recoverable.sort(key=_age_key)
    return eligible, recoverable
def print_dry_run_table(eligible: list[dict]) -> None:
    """Print the ``--dry-run`` report: one left-aligned row per eligible
    transcript, followed by a summary line. Prints a single notice and returns
    when there is nothing eligible.
    """
    if not eligible:
        print("[INFO] No eligible (past-idle, unprocessed) transcripts found.")
        return

    def _flag(value, yes: str = "yes") -> str:
        return yes if value else "no"

    headers = ["uuid", "mtime", "subst", "saved", "orphan", "scope", "would-write-path"]
    rows = [
        [
            r["uuid"][:8],
            datetime.fromtimestamp(r["mtime"]).strftime("%Y-%m-%d %H:%M"),
            _flag(r["substantive"]),
            _flag(r["saved"]),
            _flag(r["orphan"], "YES"),
            _scope_str(r["scope"]),
            r["would_write"],
        ]
        for r in eligible
    ]

    # Each column is as wide as its longest cell, header included.
    widths = [max(len(str(cell)) for cell in column) for column in zip(headers, *rows)]
    fmt = " ".join(f"{{:<{w}}}" for w in widths)

    print(fmt.format(*headers))
    print(fmt.format(*("-" * w for w in widths)))
    for row in rows:
        print(fmt.format(*map(str, row)))

    n_orphan = len([r for r in eligible if r["orphan"]])
    print()
    print(f"[INFO] {len(eligible)} eligible, {n_orphan} orphan(s) would be recovered.")
def _existing_recovered_for_uuid(out_dir: Path, uuid: str) -> Path | None:
    """Find an earlier recovered draft for ``uuid`` inside ``out_dir``.

    A draft is recognised purely by filename: it must contain ``-recovered-``
    and end in ``-<first 8 chars of uuid>.md``. Returns the first such regular
    file the glob yields, or ``None`` when ``out_dir`` is not a directory or
    nothing matches.

    Because only ``-recovered-`` names qualify, a log without that marker is
    never returned, so a caller that overwrites the result only ever replaces
    one of the tool's own drafts.
    """
    if not out_dir.is_dir():
        return None
    tail = "-" + uuid[:8] + ".md"
    matches = (
        candidate
        for candidate in out_dir.glob("*-recovered-*" + tail)
        if candidate.is_file() and candidate.name.endswith(tail)
    )
    return next(matches, None)
def recover_one(rec: dict) -> str:
    """Build the recovery log for one orphan and write it to disk.

    The engine proposes a destination (``meta["path_would_be"]``). If that
    directory already holds a recovered draft for this same uuid, that file is
    overwritten in place instead, so repeated runs for one uuid keep a single
    draft. Only files matched by ``_existing_recovered_for_uuid`` are reused.

    Side effects: records ``rec["written"]`` and ``rec["date"]``.

    Returns:
        The path that was written, as a string.
    """
    markdown, meta = engine.build_log(rec["parsed"])
    planned = Path(meta["path_would_be"])
    prior = _existing_recovered_for_uuid(planned.parent, rec["uuid"])
    target = planned if prior is None else prior

    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(markdown, encoding="utf-8")

    written = str(target)
    rec["written"] = written
    rec["date"] = meta["date"]
    return written
def git(*args: str) -> subprocess.CompletedProcess:
    """Run ``git <args>`` in the repo root with captured text output.

    Never raises for a git that hangs or cannot be launched: callers only
    inspect ``returncode``/``stdout``/``stderr`` and are documented as
    soft-failing, so those conditions are reported as a non-zero result.

    Returns:
        The ``CompletedProcess`` from ``subprocess.run``. On a timeout (120s)
        a synthetic result with ``returncode`` 124 is returned; when git
        cannot be started (e.g. not on PATH) the synthetic ``returncode`` is
        127. In both cases ``stdout`` is empty and ``stderr`` holds the reason.
    """
    cmd = ["git", *args]
    cwd = str(engine.repo_root())
    try:
        return subprocess.run(
            cmd,
            cwd=cwd,
            capture_output=True,
            text=True,
            timeout=120,
        )
    except subprocess.TimeoutExpired:
        return subprocess.CompletedProcess(
            cmd, 124, stdout="", stderr="git timed out after 120s"
        )
    except OSError as e:
        return subprocess.CompletedProcess(
            cmd, 127, stdout="", stderr=f"could not run git: {e}"
        )
def _current_branch() -> str:
    """Return the checked-out branch name, or ``""`` when it cannot be determined
    (git failed, printed nothing, or printed the literal ``HEAD``).
    """
    res = git("rev-parse", "--abbrev-ref", "HEAD")
    if res.returncode != 0:
        return ""
    name = res.stdout.strip()
    return "" if name in ("", "HEAD") else name
def commit_and_push(written_paths: list[str], count: int) -> bool:
    """Stage only the recovered logs, commit only those paths, push. Soft-fail.

    NEVER stages the ledger -- it is machine-local and correctly gitignored;
    appending it to ``git add`` aborts the whole add (exit 1) and stages nothing.

    The commit is given the same pathspec as the add, so changes that were
    already staged in the index for other files are not swept into this
    automated commit.

    Args:
        written_paths: recovered log files to commit.
        count: number of recovered logs, used in the commit message and output.

    Returns:
        True only when BOTH the commit AND the push succeed. On any failure
        (including an empty ``written_paths``) returns False so the caller
        knows not to mark these uuids ``recovered`` (the next run must
        re-attempt them).
    """
    root = engine.repo_root()
    rel_paths = []
    for p in written_paths:
        try:
            rel_paths.append(str(Path(p).resolve().relative_to(root)))
        except ValueError:
            # Not expressible relative to root; hand git the path as given.
            rel_paths.append(p)

    if not rel_paths:
        # Without a pathspec the commit below would no longer be scoped.
        print("[WARNING] no recovered logs to commit; commit/push skipped.", file=sys.stderr)
        return False

    add = git("add", "--", *rel_paths)
    if add.returncode != 0:
        print(f"[WARNING] git add failed; logs are on disk but uncommitted: {add.stderr.strip()}", file=sys.stderr)
        return False

    msg = (
        f"chore: auto-recover {count} unsaved session log(s)\n\n"
        f"{engine._COMMIT_FOOTER}"
    )
    # Pathspec after "--" limits the commit to the recovered logs only.
    commit = git("commit", "-m", msg, "--", *rel_paths)
    if commit.returncode != 0:
        # Nothing to commit, or hook failure -- soft-fail.
        print(f"[WARNING] git commit returned non-zero: {commit.stdout.strip()} {commit.stderr.strip()}", file=sys.stderr)
        return False
    print(f"[OK] committed {count} recovered log(s).")

    branch = _current_branch()
    if branch:
        push = git("push", "origin", branch)
    else:
        push = git("push")
    if push.returncode != 0:
        target = f"origin {branch}" if branch else "origin"
        print(
            f"[WARNING] git push to {target} failed (commit is local): {push.stderr.strip()}",
            file=sys.stderr,
        )
        return False
    print(f"[OK] pushed to origin{(' ' + branch) if branch else ''}.")
    return True
def post_alert(recovered: list[dict]) -> None:
    """Post an FYI to #bot-alerts via post-bot-alert.sh. Soft-fail.

    Args:
        recovered: records for the recovered sessions; each contributes one
            ``uuid | date | scope | path`` line. An empty list posts nothing.

    Every failure mode (script missing, no ``bash`` on PATH, launch error,
    timeout, non-zero exit) is reported as a warning on stderr and otherwise
    ignored; this function never raises for them.
    """
    if not recovered:
        return
    script = engine.repo_root() / ".claude" / "scripts" / "post-bot-alert.sh"
    if not script.exists():
        print("[WARNING] post-bot-alert.sh not found; alert skipped.", file=sys.stderr)
        return
    bash = shutil.which("bash")
    if not bash:
        print(
            "[WARNING] 'bash' not found on PATH (restricted scheduler env?); "
            "#bot-alerts FYI skipped. Recovered logs are already committed.",
            file=sys.stderr,
        )
        return
    lines = [
        f"[INFO] Auto-recovered {len(recovered)} unsaved session log(s) -- "
        f"already saved to the repo; FYI, please review and remove the UNVERIFIED banner:"
    ]
    lines.extend(
        f"- {r['uuid'][:8]} | {r.get('date', '?')} | {_scope_str(r['scope'])} | {r.get('written', '?')}"
        for r in recovered
    )
    message = "\n".join(lines)
    try:
        res = subprocess.run(
            [bash, str(script), message, "bot"],
            cwd=str(engine.repo_root()),
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (OSError, subprocess.SubprocessError) as e:
        print(f"[WARNING] alert post failed: {e}", file=sys.stderr)
        return
    out = (res.stdout or "").strip() or (res.stderr or "").strip()
    if out:
        print(out)
    if res.returncode != 0:
        # The script ran but reported failure; surface it instead of staying silent.
        print(
            f"[WARNING] post-bot-alert.sh exited with status {res.returncode}; "
            "alert may not have been delivered.",
            file=sys.stderr,
        )
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: scan, recover orphans, ledger, commit, alert.

    Args:
        argv: argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Always 0; git and alert problems are soft failures reported on stderr.

    Ledger rules:
      * saved / trivial / errored sessions have no on-disk artifact and are
        ledgered immediately;
      * recovered sessions are ledgered only after a successful commit+push,
        so a failed push (or ``--no-commit``) leaves them for the next run.

    The alert is posted only for recoveries that were committed and pushed.
    Its text says the logs are already in the repo, and un-pushed recoveries
    are retried by the next run, which would otherwise alert for them again.
    """
    # Force UTF-8 stdout (Windows console defaults to cp1252; titles/paths in
    # the dry-run table can contain characters outside that codepage).
    try:
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")
    except (AttributeError, ValueError):
        pass

    parser = argparse.ArgumentParser(
        description="Detect and auto-recover unsaved Claude Code sessions."
    )
    parser.add_argument("--dry-run", action="store_true", help="scan + print report; no writes/commit/alert")
    parser.add_argument("--idle-min", type=int, default=90, help="minutes of mtime-idle before eligible (default 90)")
    parser.add_argument("--max", type=int, default=25, dest="max_recover", help="max orphan logs to build per run, oldest-first (default 25)")
    parser.add_argument("--no-commit", action="store_true", help="skip git commit/push")
    parser.add_argument("--no-alert", action="store_true", help="skip the Discord alert")
    args = parser.parse_args(argv)

    # Respect the ledger in both modes (dry-run still skips already-processed).
    ledger = load_ledger()
    # Only the full eligible list is needed; each record carries its own
    # "orphan" flag, so the separate orphan-only list is not used here.
    eligible, _ = scan(args.idle_min, ledger)

    if args.dry_run:
        print_dry_run_table(eligible)
        return 0
    if not eligible:
        print("[INFO] No eligible transcripts to process.")
        return 0

    written_paths: list[str] = []
    recovered_recs: list[dict] = []
    deferred = 0
    built = 0
    for rec in eligible:
        uuid = rec["uuid"]
        if rec["orphan"]:
            # Cap actual log-builds per run (oldest-first). Remaining orphans are
            # left OUT of the ledger so the next run re-attempts them.
            if built >= args.max_recover:
                deferred += 1
                continue
            try:
                path = recover_one(rec)
            except Exception as e:  # noqa: BLE001 -- never let one bad transcript abort the run
                print(f"[WARNING] failed to recover {uuid[:8]}: {e}", file=sys.stderr)
                # No on-disk artifact -> safe to mark immediately.
                ledger[uuid] = {"verdict": "error", "at": _now_iso(), "path": None, "error": str(e)}
                continue
            built += 1
            written_paths.append(path)
            recovered_recs.append(rec)
            print(f"[OK] recovered {uuid[:8]} -> {path}")
        elif rec["saved"]:
            # No on-disk artifact -> safe to mark immediately.
            ledger[uuid] = {"verdict": "skipped-saved", "at": _now_iso(), "path": None}
        else:
            ledger[uuid] = {"verdict": "skipped-trivial", "at": _now_iso(), "path": None}

    if deferred:
        print(f"[INFO] {deferred} more orphan(s) deferred to next run (--max {args.max_recover}).")

    # Persist the skipped/error verdicts now (they have no artifact, so they are
    # safe regardless of the commit/push outcome below).
    save_ledger(ledger)

    if not recovered_recs:
        print("[INFO] No orphans recovered (all eligible sessions were saved or trivial).")
        return 0

    pushed = False
    if args.no_commit:
        print("[INFO] --no-commit set; recovered logs left unstaged and UNLEDGERED (next run will re-attempt).")
    else:
        pushed = commit_and_push(written_paths, len(recovered_recs))
        if pushed:
            # H1: only mark uuids 'recovered' AFTER a successful commit+push, so a
            # push failure leaves them out of the ledger for the next run to retry.
            for rec in recovered_recs:
                ledger[rec["uuid"]] = {
                    "verdict": "recovered",
                    "at": _now_iso(),
                    "path": rec.get("written"),
                }
            save_ledger(ledger)
        else:
            print(
                "[WARNING] commit/push did not succeed; recovered uuids left UNLEDGERED "
                "so the next run re-attempts them (logs are on disk).",
                file=sys.stderr,
            )

    if args.no_alert:
        print("[INFO] --no-alert set; Discord alert skipped.")
    elif pushed:
        post_alert(recovered_recs)
    else:
        # Unledgered recoveries are rebuilt by the next run; alerting now would
        # claim they are in the repo and then repeat on every retry.
        print(
            "[INFO] Discord alert skipped: recovered logs were not committed and pushed; "
            "the next run will re-attempt them."
        )
    return 0
# Script entry point: run main() and use its return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,95 @@
# register-orphan-detector.ps1
#
# Registers the "ClaudeTools - Orphaned Session Detector" scheduled task on
# this Windows machine. The task launches detect_orphaned_sessions.py, which
# scans the per-machine Claude Code transcript directory for unsaved
# substantive sessions, auto-builds banner-marked recovery logs, commits +
# pushes them, and posts an FYI to #bot-alerts.
#
# Follows the GrepAI watcher registration pattern in .claude/OLLAMA.md.
#
# Triggers:
#   - AtLogOn: picks up sessions lost since the last logon.
#   - Daily, repeating every 4 hours: picks up crashes during a long workday.
#     The 4h cadence pairs with the detector's 90-minute idle gate so an
#     active session is never grabbed mid-flight.
#
# Idempotent: -Force replaces any existing task with the same name.
# This script only REGISTERS the task. It does not run the detector now.
#
# Run from an ordinary (non-admin) PowerShell:
#   powershell -ExecutionPolicy Bypass -File D:\claudetools\.claude\scripts\register-orphan-detector.ps1

$ErrorActionPreference = "Stop"
$TaskName = "ClaudeTools - Orphaned Session Detector"

# Locate the repo root portably. Default: two levels above this script
# (.claude/scripts/ -> repo root), as a full path. If the per-machine,
# gitignored identity.json names an existing claudetools_root, that wins.
$FallbackRoot = (Resolve-Path (Join-Path $PSScriptRoot "..\..")).Path
$IdentityPath = Join-Path $FallbackRoot ".claude\identity.json"
$RepoRoot = $FallbackRoot
if (Test-Path $IdentityPath) {
    try {
        $identity = Get-Content -Raw -Path $IdentityPath | ConvertFrom-Json
        $configuredRoot = $identity.claudetools_root
        if ($configuredRoot -and (Test-Path $configuredRoot)) {
            $RepoRoot = (Resolve-Path $configuredRoot).Path
        }
    } catch {
        Write-Host "[WARNING] Could not parse $IdentityPath; using $FallbackRoot" -ForegroundColor Yellow
    }
}

$Script = Join-Path $RepoRoot ".claude\scripts\detect_orphaned_sessions.py"
if (-not (Test-Path $Script)) {
    Write-Host "[ERROR] Detector not found at $Script" -ForegroundColor Red
    exit 1
}

# Pin the py launcher to its full path when it can be found (more reliable
# under the Task Scheduler's environment); otherwise leave "py" to be
# resolved from PATH at run time.
$PyCmd = Get-Command py -ErrorAction SilentlyContinue
$PyPath = if ($null -ne $PyCmd) { $PyCmd.Source } else { "py" }

$actionParams = @{
    Execute          = $PyPath
    Argument         = "`"$Script`""
    WorkingDirectory = $RepoRoot
}
$Action = New-ScheduledTaskAction @actionParams

# Trigger 1: at logon for the current user.
$TriggerLogon = New-ScheduledTaskTrigger -AtLogOn -User $env:USERNAME

# Trigger 2: daily at a fixed start, repeating every 4 hours all day. The
# repetition settings are taken from a throwaway -Once trigger and copied
# onto the daily trigger.
$repeatParams = @{
    Once               = $true
    At                 = "9am"
    RepetitionInterval = New-TimeSpan -Hours 4
    RepetitionDuration = New-TimeSpan -Hours 24
}
$TriggerDaily = New-ScheduledTaskTrigger -Daily -At 9am
$TriggerDaily.Repetition = (New-ScheduledTaskTrigger @repeatParams).Repetition

$settingsParams = @{
    ExecutionTimeLimit = New-TimeSpan -Minutes 30
    MultipleInstances  = "IgnoreNew"
    StartWhenAvailable = $true
    DontStopOnIdleEnd  = $true
}
$Settings = New-ScheduledTaskSettingsSet @settingsParams

$registerParams = @{
    TaskName    = $TaskName
    Action      = $Action
    Trigger     = @($TriggerLogon, $TriggerDaily)
    Settings    = $Settings
    Description = "Scans Claude Code transcripts for unsaved substantive sessions and auto-recovers them into session logs."
    Force       = $true
}
Register-ScheduledTask @registerParams | Out-Null

Write-Host "[OK] Registered scheduled task '$TaskName'."
Write-Host "[INFO] Action: $PyPath `"$Script`""
Write-Host "[INFO] WorkDir: $RepoRoot"
Write-Host "[INFO] Triggers: AtLogOn ($env:USERNAME) + daily every 4h"
Write-Host "[INFO] To inspect: Get-ScheduledTask -TaskName '$TaskName' | Format-List"
Write-Host "[INFO] To run now: Start-ScheduledTask -TaskName '$TaskName'"
Write-Host "[INFO] To remove: Unregister-ScheduledTask -TaskName '$TaskName' -Confirm:`$false"