refactor(sync): share the sync lock with /scc and /checkpoint

Extract the per-machine concurrency lock from sync.sh into a sourceable library (.claude/scripts/sync-lock.sh), plus a `run <cmd>` wrapper that locks the current repo. The wrapper uses the same lock-dir basename, so it mutually excludes with sync.sh in the ClaudeTools repo and scopes itself to whichever project repo it runs in. sync.sh now sources the library (behavior identical — verified by review). /scc routes its commit+push through the locked, rebase-safe sync.sh and replaces the bare YYYY-MM-DD-session.md filename with the per-session-unique one. /checkpoint now stages and commits atomically under the repo lock, so a concurrent session in a shared worktree can't be swept in. This closes the remaining commit paths that bypassed the lock shipped in 6b0ce9a.
2026-06-05 19:13:40 -07:00
parent 6b0ce9aa04
commit 353ba6363c
4 changed files with 215 additions and 117 deletions
--- a/.claude/scripts/sync.sh
+++ b/.claude/scripts/sync.sh
@@ -130,107 +130,18 @@ echo -e "${GREEN}[OK]${NC} Working directory: $(pwd)"
 # submodule update, staging, commit, fetch, rebase, push — and by extension the
 # vault phase) behind a single per-machine lock.
 #
-# PORTABILITY: `flock` is frequently ABSENT on Git Bash (MSYS2), so we can't
-# depend on it. An atomic `mkdir` is the lowest common denominator — it fails if
-# the directory already exists, atomically, on every platform we run on (Windows
-# Git Bash, macOS, Linux). The lock lives under .git/ (never tracked, so a blind
-# `git add -A` can't stage it) and is scoped to this repo.
+# The lock primitive (mkdir-atomic lock, stale detection, ownership-checked
+# release, exit-75-on-contention) lives in the SHAREABLE library sync-lock.sh so
+# other commit paths (/scc, /checkpoint) can contend on the SAME lock dir. We
+# set SYNC_LOCK_DIR explicitly, source the library (which defines the vars +
+# functions but installs NO trap and acquires NOTHING on source), then install
+# our own EXIT/INT/TERM trap and acquire — exactly as before. We are already
+# cd'd into REPO_ROOT; the source path is absolute anyway, so any CWD works.
 SYNC_LOCK_DIR="$REPO_ROOT/.git/claudetools-sync.lock"
-SYNC_LOCK_WAIT=120     # max seconds to wait for a held lock before skipping the run
-SYNC_LOCK_STALE=600    # seconds after which a held lock is treated as stale (10 min)
-SYNC_LOCK_OWNED=0      # becomes 1 only once THIS run owns the lock (gates release)
+# shellcheck source=./sync-lock.sh
+source "$REPO_ROOT/.claude/scripts/sync-lock.sh"

-# Idempotent release — only removes the lock if THIS process actually owns it
-# (stored PID == $$), so a "skipping this run" exit can never clobber the lock
-# held by the live sync we deferred to. Installed as an EXIT trap because the
-# script runs under `set -e`: the lock must be released on error exits too.
-# (There is no pre-existing EXIT trap in this script, so this adds a fresh one.)
-release_sync_lock() {
-    if [ "$SYNC_LOCK_OWNED" = "1" ] && [ -d "$SYNC_LOCK_DIR" ]; then
-        local owner_pid
-        owner_pid=$(cat "$SYNC_LOCK_DIR/owner.pid" 2>/dev/null || echo "")
-        if [ -z "$owner_pid" ] || [ "$owner_pid" = "$$" ]; then
-            rm -rf "$SYNC_LOCK_DIR" 2>/dev/null || true
-        fi
-        SYNC_LOCK_OWNED=0
-    fi
-}
 trap release_sync_lock EXIT INT TERM
-
-# Portable liveness check. `kill -0 <pid>` works on Git Bash (it maps to the
-# Windows process table), macOS, and Linux; guarded so a bad/empty PID is "dead".
-sync_pid_alive() {
-    local pid="$1"
-    [ -n "$pid" ] || return 1
-    kill -0 "$pid" 2>/dev/null
-}
-
-acquire_sync_lock() {
-    local waited=0 owner_pid owner_ts now mtime lock_age stale_aside re_pid re_now re_mtime re_age
-    while true; do
-        if mkdir "$SYNC_LOCK_DIR" 2>/dev/null; then
-            SYNC_LOCK_OWNED=1
-            printf '%s' "$$" > "$SYNC_LOCK_DIR/owner.pid" 2>/dev/null || true
-            # PID + ISO timestamp inside the lock dir, for diagnostics.
-            {
-                printf 'pid=%s\n'     "$$"
-                printf 'iso=%s\n'     "$(date -u "+%Y-%m-%dT%H:%M:%SZ")"
-                printf 'machine=%s\n' "$MACHINE"
-            } > "$SYNC_LOCK_DIR/owner" 2>/dev/null || true
-            # Defense-in-depth: confirm we still own the dir we just created. If
-            # owner.pid isn't ours, drop ownership and re-evaluate (never fatal
-            # under set -e — comparison is cheap and the body just loops).
-            if [ "$(cat "$SYNC_LOCK_DIR/owner.pid" 2>/dev/null)" != "$$" ]; then
-                SYNC_LOCK_OWNED=0; continue
-            fi
-            return 0
-        fi
-
-        # mkdir failed -> the lock is held. Decide whether it's stale or live.
-        owner_pid=$(cat "$SYNC_LOCK_DIR/owner.pid" 2>/dev/null || echo "")
-        owner_ts=$(sed -n 's/^iso=//p' "$SYNC_LOCK_DIR/owner" 2>/dev/null | head -1)
-        [ -n "$owner_ts" ] || owner_ts="unknown"
-
-        # Stale if the dir is older than the threshold OR the owner PID is dead.
-        # `stat -c` is GNU/Git-Bash, `stat -f` is BSD/macOS; fall back to 0.
-        now=$(date +%s 2>/dev/null || echo 0)
-        mtime=$(stat -c %Y "$SYNC_LOCK_DIR" 2>/dev/null || stat -f %m "$SYNC_LOCK_DIR" 2>/dev/null || echo 0)
-        lock_age=$(( now - mtime ))
-        if { [ "$mtime" -gt 0 ] && [ "$lock_age" -ge "$SYNC_LOCK_STALE" ]; } \
-           || { [ -n "$owner_pid" ] && ! sync_pid_alive "$owner_pid"; }; then
-            # Re-verify staleness IMMEDIATELY before clearing. Between the check
-            # above and here, another racer may have already cleared the stale
-            # lock and acquired a fresh, LIVE one. Re-read owner.pid + mtime NOW;
-            # only rename-aside if it is STILL stale this instant. A freshly
-            # acquired winner has a live PID and fresh mtime, so the loser falls
-            # through to the live-lock wait path instead of stealing the lock.
-            re_pid=$(cat "$SYNC_LOCK_DIR/owner.pid" 2>/dev/null || echo "")
-            re_now=$(date +%s 2>/dev/null || echo 0)
-            re_mtime=$(stat -c %Y "$SYNC_LOCK_DIR" 2>/dev/null || stat -f %m "$SYNC_LOCK_DIR" 2>/dev/null || echo 0)
-            re_age=$(( re_now - re_mtime ))
-            if { [ "$re_mtime" -gt 0 ] && [ "$re_age" -ge "$SYNC_LOCK_STALE" ]; } \
-               || { [ -n "$re_pid" ] && ! sync_pid_alive "$re_pid"; }; then
-                echo -e "${YELLOW}[WARNING]${NC} removing stale sync lock (held by PID ${re_pid:-?} since ${owner_ts}, age ${re_age}s)"
-                stale_aside="${SYNC_LOCK_DIR}.stale.$$"
-                if mv "$SYNC_LOCK_DIR" "$stale_aside" 2>/dev/null; then
-                    rm -rf "$stale_aside" 2>/dev/null || true
-                fi
-            fi
-            sleep 1    # insurance: never tight-spin if clearing persistently fails
-            continue
-        fi
-
-        # Live lock. If we've waited the full budget, skip (a duplicate sync is
-        # harmless to drop — the next scheduled/interactive run catches up).
-        if [ "$waited" -ge "$SYNC_LOCK_WAIT" ]; then
-            echo -e "${YELLOW}[WARNING]${NC} another sync is in progress (held by PID ${owner_pid:-?} since ${owner_ts}); skipping this run"
-            exit 75   # EX_TEMPFAIL: deferred (another sync in progress), not a real success
-        fi
-        sleep 2
-        waited=$(( waited + 2 ))
-    done
-}
-
 acquire_sync_lock
 echo -e "${GREEN}[OK]${NC} Acquired sync lock ($SYNC_LOCK_DIR)"
 # --- end concurrency lock ----------------------------------------------------