From 6b0ce9aa0479f644cf77612379180ed3b675fc4b Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Fri, 5 Jun 2026 18:50:15 -0700 Subject: [PATCH] feat(sync): serialize sync.sh with a per-machine lock; per-session log filenames Multiple concurrent Claude sessions (and the scheduled-task sync) were stepping on each other's git state. sync.sh now takes an atomic mkdir lock in .git/ around the whole run (stage/commit/fetch/rebase/push + vault), exits 75 (EX_TEMPFAIL = deferred) on contention instead of racing, and reclaims stale/dead-owner locks with a re-verify-before-clear guard (closes two TOCTOU races caught in review). /save now mandates per-session-unique log filenames (never the bare YYYY-MM-DD-session.md). Docs updated for the lock + deferred-exit semantics. Note: git add -A is still the catch-all sweep; full per-session commit isolation and routing /scc + /checkpoint through the lock are follow-ups. --- .claude/commands/save.md | 2 +- .claude/commands/sync.md | 2 ++ .claude/scripts/sync.sh | 31 ++++++++++++++++++++----------- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/.claude/commands/save.md b/.claude/commands/save.md index bfd42e9..9aecb86 100644 --- a/.claude/commands/save.md +++ b/.claude/commands/save.md @@ -96,7 +96,7 @@ The article + `wiki/index.md` are committed alongside the session log by `sync.s bash .claude/scripts/sync.sh ``` -`sync.sh` is **serialized by a per-machine lock** (`.git/claudetools-sync.lock`) so concurrent sessions or the scheduled-task sync cannot interleave commits/rebases; if another sync is mid-flight it waits up to ~120s, then skips (the next sync catches up). It then handles: reconcile this machine's `git config user.name/email` to `.claude/identity.json` (so commit authorship can't drift), stage all changes with `git add -A` (after purging garbled Windows path-as-filename cruft), auto-commit, fetch + rebase, push, then the same flow for the vault repo, then surface cross-user `## Note for ` blocks. +`sync.sh` is **serialized by a per-machine lock** (`.git/claudetools-sync.lock`) so concurrent sessions or the scheduled-task sync cannot interleave commits/rebases; if another sync is mid-flight it waits up to ~120s, then **exits 75 (deferred)** rather than racing — the next sync catches up. On a 75, do NOT print a success summary; report "**sync deferred — another sync is running; your session log is written locally and will sync on the next run**". Otherwise it: reconciles this machine's `git config user.name/email` to `.claude/identity.json` (so commit authorship can't drift), stages all changes with `git add -A` (after purging garbled Windows path-as-filename cruft), auto-commits, fetch + rebase, push, then the same flow for the vault repo, then surfaces cross-user `## Note for ` blocks. > Note: `git add -A` is still the catch-all sweep, so a save run will also pick up any *other* dirty files in the shared tree. The lock prevents two syncs from racing, and per-session-unique log filenames prevent log overwrites — but the bare-`add -A` capture means full per-session commit isolation is a later step (see the isolation plan: drop blind `add -A` in favour of explicit per-session staging). For now, avoid running `/save` from two sessions at the exact same moment. diff --git a/.claude/commands/sync.md b/.claude/commands/sync.md index 7d8a431..57dbd87 100644 --- a/.claude/commands/sync.md +++ b/.claude/commands/sync.md @@ -50,6 +50,8 @@ Invokes `bash .claude/scripts/sync.sh`, which: The script is the single source of truth for git operations. Both `/sync` and `/save` invoke it. +**Concurrency:** the run is serialized by a per-machine lock (`.git/claudetools-sync.lock`) so two syncs (e.g. interactive + the scheduled-task sync, or two Claude sessions) can't interleave staging/commit/rebase/push. If another sync is already running, this run waits up to ~120s then **exits 75 (EX_TEMPFAIL = deferred, not a failure)** — report it as deferred, not synced; the next run catches up. Stale locks (owner process dead, or older than 10 min) are auto-reclaimed. + --- ## Cross-user note handling (CRITICAL) diff --git a/.claude/scripts/sync.sh b/.claude/scripts/sync.sh index 5e870d5..904db66 100755 --- a/.claude/scripts/sync.sh +++ b/.claude/scripts/sync.sh @@ -166,7 +166,7 @@ sync_pid_alive() { } acquire_sync_lock() { - local waited=0 owner_pid owner_ts now mtime lock_age stale_aside + local waited=0 owner_pid owner_ts now mtime lock_age stale_aside re_pid re_now re_mtime re_age while true; do if mkdir "$SYNC_LOCK_DIR" 2>/dev/null; then SYNC_LOCK_OWNED=1 @@ -198,17 +198,26 @@ acquire_sync_lock() { lock_age=$(( now - mtime )) if { [ "$mtime" -gt 0 ] && [ "$lock_age" -ge "$SYNC_LOCK_STALE" ]; } \ || { [ -n "$owner_pid" ] && ! sync_pid_alive "$owner_pid"; }; then - echo -e "${YELLOW}[WARNING]${NC} removing stale sync lock (held by PID ${owner_pid:-?} since ${owner_ts}, age ${lock_age}s)" - # Atomically claim the right to clear the stale lock. Only ONE racer can rename - # the canonical dir aside (rename source vanishes after the first; the loser's mv - # fails and it re-evaluates next pass). The canonical lock name is thereafter only - # ever recreated by the atomic mkdir at the top, so a live freshly-acquired lock - # can never be rm'd out from under its owner. - stale_aside="${SYNC_LOCK_DIR}.stale.$$" - if mv "$SYNC_LOCK_DIR" "$stale_aside" 2>/dev/null; then - rm -rf "$stale_aside" 2>/dev/null || true + # Re-verify staleness IMMEDIATELY before clearing. Between the check + # above and here, another racer may have already cleared the stale + # lock and acquired a fresh, LIVE one. Re-read owner.pid + mtime NOW; + # only rename-aside if it is STILL stale this instant. A freshly + # acquired winner has a live PID and fresh mtime, so the loser falls + # through to the live-lock wait path instead of stealing the lock. + re_pid=$(cat "$SYNC_LOCK_DIR/owner.pid" 2>/dev/null || echo "") + re_now=$(date +%s 2>/dev/null || echo 0) + re_mtime=$(stat -c %Y "$SYNC_LOCK_DIR" 2>/dev/null || stat -f %m "$SYNC_LOCK_DIR" 2>/dev/null || echo 0) + re_age=$(( re_now - re_mtime )) + if { [ "$re_mtime" -gt 0 ] && [ "$re_age" -ge "$SYNC_LOCK_STALE" ]; } \ + || { [ -n "$re_pid" ] && ! sync_pid_alive "$re_pid"; }; then + echo -e "${YELLOW}[WARNING]${NC} removing stale sync lock (held by PID ${re_pid:-?} since ${owner_ts}, age ${re_age}s)" + stale_aside="${SYNC_LOCK_DIR}.stale.$$" + if mv "$SYNC_LOCK_DIR" "$stale_aside" 2>/dev/null; then + rm -rf "$stale_aside" 2>/dev/null || true + fi fi - continue # retry mkdir immediately + sleep 1 # insurance: never tight-spin if clearing persistently fails + continue fi # Live lock. If we've waited the full budget, skip (a duplicate sync is