From 73573800b0d6d25d516411831bb059b0c5e86f4d Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Tue, 12 May 2026 08:45:33 -0700 Subject: [PATCH] =?UTF-8?q?feat:=20coord=20API=20=E2=80=94=20no-auth,=20DB?= =?UTF-8?q?=20softfail=20503,=20agent=20tracking=20protocol?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - coord routers: removed JWT auth requirement (internal-only endpoints) - error_handler: SQLAlchemy OperationalError/DisconnectionError → 503 with Retry-After: 30 header instead of 500 - /health: live DB probe (SELECT 1) instead of static response - CLAUDE.md: "Live State Tracking" section with full agent protocol for all projects — session start, lock claim/release, component state updates, softfail + local queue catch-up - COORDINATION_PROTOCOL.md: softfail/catch-up section + server-side 503 behavior documented Co-Authored-By: Claude Sonnet 4.6 --- .claude/CLAUDE.md | 14 +---------- .claude/COORDINATION_PROTOCOL.md | 43 +++++++++++++++++++++++++++++++- api/main.py | 17 ++++++++++--- api/middleware/error_handler.py | 15 ++++++++++- api/routers/coord_components.py | 3 --- api/routers/coord_locks.py | 6 ----- api/routers/coord_messages.py | 6 ----- api/routers/coord_status.py | 2 -- api/routers/coord_work_items.py | 6 ----- api/routers/coord_workflows.py | 6 ----- 10 files changed, 70 insertions(+), 48 deletions(-) diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index ac3a0c8..b669167 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -252,19 +252,7 @@ Full protocol reference: `.claude/COORDINATION_PROTOCOL.md` ### Cross-Session Messages (MANDATORY) -At session start and after every `/sync`, check for unread messages: -``` -GET http://172.16.3.30:8001/api/coord/messages?to_session=&unread_only=true -``` -If unread messages exist, display each one prominently before any other work: -``` -============================================================ -MESSAGE FROM 
-============================================================ - -============================================================ -``` -Mark as read via `PUT /api/coord/messages/{id}/read` after displaying. +See the **Session Start Protocol** in "Live State Tracking" above. Messages must be displayed and marked read before any other work. Also scan session logs pulled during `/sync` for legacy `## Note for ` sections (transitional — older sessions still use markdown). diff --git a/.claude/COORDINATION_PROTOCOL.md b/.claude/COORDINATION_PROTOCOL.md index e3f4c47..e150765 100644 --- a/.claude/COORDINATION_PROTOCOL.md +++ b/.claude/COORDINATION_PROTOCOL.md @@ -2,7 +2,7 @@ Cross-session coordination uses the ClaudeTools API at `http://172.16.3.30:8001/api/coord/`. This replaces PROJECT_STATE.md files. -All endpoints require a `session_id` string identifying the current session (e.g., `DESKTOP-0O8A1RL/claude-main`). No auth token required for coordination endpoints. +No auth token required for coordination endpoints — they are internal-only on the 172.16.3.30 private network. Pass `session_id` in the request body or as a query parameter to identify the calling session (e.g., `DESKTOP-0O8A1RL/claude-main`). --- @@ -187,6 +187,47 @@ Free-form — add new slugs as needed. Does NOT foreign-key to the projects tabl --- +## Softfail and Catch-Up + +The coordination API must never block work. If it is unavailable: + +**On any network error, timeout, or 5xx response:** +1. Log the failed call to `.claude/coord-queue.jsonl` (one JSON object per line): + ```json + {"ts":"2026-05-12T15:30:00Z","method":"PUT","path":"/api/coord/components/gururmm/server","body":{"state":"deployed","version":"0.3.0","notes":"...","updated_by":"DESKTOP-0O8A1RL/claude-main"}} + ``` +2. Continue working. Do not retry immediately. + +**On 503 with `Retry-After` header:** +Wait the specified seconds, then retry once. If the retry also fails, queue it. 
+ +**Catch-up (session start and after `/sync`):** +```bash +# If coord-queue.jsonl exists and is non-empty: +while read -r line; do + method=$(echo "$line" | jq -r .method) + path=$(echo "$line" | jq -r .path) + body=$(echo "$line" | jq -r .body) + curl -s -X "$method" "http://172.16.3.30:8001$path" -H "Content-Type: application/json" -d "$body" +done < .claude/coord-queue.jsonl +# Remove the file only if all calls succeeded +``` + +The queue file lives in `.claude/coord-queue.jsonl` (gitignored — local to each workstation). + +--- + +## API Softfail Behavior (Server Side) + +When the MariaDB database is unavailable: +- Coord endpoints return `503 Service Unavailable` with header `Retry-After: 30` +- Response body: `{"error": "Database unavailable. Retry after 30 seconds.", "path": "<request path>"}` +- `GET /health` reflects DB status: it returns `503` with header `Retry-After: 30` and body `{"status":"degraded","database":"disconnected"}` + +This behavior is implemented in the API server and does not need to be coded by agents. + +--- + ## Migration Note `projects/*/PROJECT_STATE.md` files are ARCHIVED — read-only historical reference. Do not edit them. Use this API for all live coordination going forward. 
diff --git a/api/main.py b/api/main.py index 1794dca..cb7a66d 100644 --- a/api/main.py +++ b/api/main.py @@ -3,8 +3,10 @@ ClaudeTools FastAPI Application Main entry point for the ClaudeTools MSP management system API """ +import sqlalchemy as sa from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import Response from contextlib import asynccontextmanager from api.config import get_settings @@ -108,10 +110,17 @@ async def root(): @app.get("/health") async def health_check(): """Health check endpoint for monitoring""" - return { - "status": "healthy", - "database": "connected" - } + try: + with engine.connect() as conn: + conn.execute(sa.text("SELECT 1")) + return {"status": "healthy", "database": "connected"} + except Exception: + return Response( + content='{"status":"degraded","database":"disconnected"}', + status_code=503, + media_type="application/json", + headers={"Retry-After": "30"}, + ) # Register routers diff --git a/api/middleware/error_handler.py b/api/middleware/error_handler.py index 43f0675..64c3351 100644 --- a/api/middleware/error_handler.py +++ b/api/middleware/error_handler.py @@ -10,7 +10,7 @@ from typing import Any, Dict, Optional from fastapi import FastAPI, Request, status from fastapi.exceptions import RequestValidationError from fastapi.responses import JSONResponse -from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.exc import DisconnectionError, OperationalError, SQLAlchemyError class ClaudeToolsException(Exception): @@ -278,6 +278,17 @@ async def sqlalchemy_exception_handler( ) +async def db_unavailable_exception_handler(request: Request, exc: Exception) -> JSONResponse: + return JSONResponse( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + content={ + "error": "Database unavailable. 
Retry after 30 seconds.", + "path": str(request.url.path), + }, + headers={"Retry-After": "30"}, + ) + + async def generic_exception_handler(request: Request, exc: Exception) -> JSONResponse: """ Handler for unhandled exceptions. @@ -321,4 +332,6 @@ def register_exception_handlers(app: FastAPI) -> None: app.add_exception_handler(ClaudeToolsException, claudetools_exception_handler) app.add_exception_handler(RequestValidationError, validation_exception_handler) app.add_exception_handler(SQLAlchemyError, sqlalchemy_exception_handler) + app.add_exception_handler(OperationalError, db_unavailable_exception_handler) + app.add_exception_handler(DisconnectionError, db_unavailable_exception_handler) app.add_exception_handler(Exception, generic_exception_handler) diff --git a/api/routers/coord_components.py b/api/routers/coord_components.py index ba52225..41000d8 100644 --- a/api/routers/coord_components.py +++ b/api/routers/coord_components.py @@ -4,7 +4,6 @@ from fastapi import APIRouter, Depends, Query, status from sqlalchemy.orm import Session from api.database import get_db -from api.middleware.auth import get_current_user from api.schemas.coord_component_state import CoordComponentStateUpsert, CoordComponentStateResponse from api.services import coord_component_service @@ -15,7 +14,6 @@ router = APIRouter() def list_component_states( project_key: str | None = Query(default=None), db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """List all component states, optionally filtered by project.""" states = coord_component_service.get_component_states(db, project_key=project_key) @@ -31,7 +29,6 @@ def upsert_component_state( component: str, data: CoordComponentStateUpsert, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Create or update the state of a component within a project.""" state = coord_component_service.upsert_component_state(db, project_key, component, data) diff --git 
a/api/routers/coord_locks.py b/api/routers/coord_locks.py index 52c4c7c..a5dc364 100644 --- a/api/routers/coord_locks.py +++ b/api/routers/coord_locks.py @@ -6,7 +6,6 @@ from fastapi import APIRouter, Depends, Query, status from sqlalchemy.orm import Session from api.database import get_db -from api.middleware.auth import get_current_user from api.schemas.coord_session_lock import CoordSessionLockCreate, CoordSessionLockResponse from api.services import coord_lock_service @@ -20,7 +19,6 @@ def list_active_locks( skip: int = Query(default=0, ge=0), limit: int = Query(default=100, ge=1, le=1000), db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """List currently active locks with optional filters.""" locks, total = coord_lock_service.get_active_locks( @@ -39,7 +37,6 @@ def check_resource_locked( project_key: str = Query(...), resource: str = Query(...), db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Check whether a resource is currently locked.""" lock = coord_lock_service.check_resource_locked(db, project_key, resource) @@ -52,7 +49,6 @@ def check_resource_locked( def claim_lock( data: CoordSessionLockCreate, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Claim a resource lock for a session.""" lock = coord_lock_service.claim_lock(db, data) @@ -63,7 +59,6 @@ def claim_lock( def release_all_session_locks( session_id: str = Query(..., description="Release all active locks held by this session"), db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Release all active locks for a session (call on session end).""" return coord_lock_service.release_all_session_locks(db, session_id) @@ -74,7 +69,6 @@ def release_lock( lock_id: UUID, session_id: str = Query(..., description="Must match the session that claimed the lock"), db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Release a specific 
lock by ID.""" lock = coord_lock_service.release_lock(db, lock_id, session_id) diff --git a/api/routers/coord_messages.py b/api/routers/coord_messages.py index 2cc3489..2b79ef8 100644 --- a/api/routers/coord_messages.py +++ b/api/routers/coord_messages.py @@ -6,7 +6,6 @@ from fastapi import APIRouter, Depends, Query, status from sqlalchemy.orm import Session from api.database import get_db -from api.middleware.auth import get_current_user from api.schemas.coord_message import CoordMessageCreate, CoordMessageResponse from api.services import coord_message_service @@ -20,7 +19,6 @@ def list_messages( skip: int = Query(default=0, ge=0), limit: int = Query(default=100, ge=1, le=1000), db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """List messages with optional filters.""" messages, total = coord_message_service.get_messages( @@ -38,7 +36,6 @@ def list_messages( def get_unread_count( session_id: str = Query(...), db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Return the count of unread messages for a session.""" count = coord_message_service.get_unread_count(db, session_id) @@ -49,7 +46,6 @@ def get_unread_count( def send_message( data: CoordMessageCreate, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Send a message to a session or broadcast.""" msg = coord_message_service.send_message(db, data) @@ -60,7 +56,6 @@ def send_message( def mark_message_read( message_id: UUID, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Mark a message as read.""" msg = coord_message_service.mark_read(db, message_id) @@ -71,7 +66,6 @@ def mark_message_read( def delete_message( message_id: UUID, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Delete a message.""" return coord_message_service.delete_message(db, message_id) diff --git a/api/routers/coord_status.py b/api/routers/coord_status.py index 
9ac1994..ce1dd2e 100644 --- a/api/routers/coord_status.py +++ b/api/routers/coord_status.py @@ -4,7 +4,6 @@ from fastapi import APIRouter, Depends, status from sqlalchemy.orm import Session from api.database import get_db -from api.middleware.auth import get_current_user from api.schemas.coord_session_lock import CoordSessionLockResponse from api.schemas.coord_workflow import CoordWorkflowResponse from api.schemas.coord_component_state import CoordComponentStateResponse @@ -16,7 +15,6 @@ router = APIRouter() @router.get("", response_model=dict, status_code=status.HTTP_200_OK) def get_coordination_status( db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Return a cross-project snapshot: active locks, in-progress workflows, component states, unread message counts.""" active_locks, lock_total = coord_lock_service.get_active_locks(db, limit=200) diff --git a/api/routers/coord_work_items.py b/api/routers/coord_work_items.py index a67cb30..bb45a49 100644 --- a/api/routers/coord_work_items.py +++ b/api/routers/coord_work_items.py @@ -6,7 +6,6 @@ from fastapi import APIRouter, Depends, Query, status from sqlalchemy.orm import Session from api.database import get_db -from api.middleware.auth import get_current_user from api.schemas.coord_work_item import CoordWorkItemCreate, CoordWorkItemResponse, CoordWorkItemUpdate from api.services import coord_work_item_service @@ -22,7 +21,6 @@ def list_work_items( skip: int = Query(default=0, ge=0), limit: int = Query(default=100, ge=1, le=1000), db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """List work items with optional filters.""" items, total = coord_work_item_service.get_work_items( @@ -46,7 +44,6 @@ def list_work_items( def create_work_item( data: CoordWorkItemCreate, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Create a new work item within a workflow.""" item = coord_work_item_service.create_work_item(db, 
data) @@ -57,7 +54,6 @@ def create_work_item( def get_work_item( item_id: UUID, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Get a work item by ID.""" item = coord_work_item_service.get_work_item_by_id(db, item_id) @@ -69,7 +65,6 @@ def update_work_item( item_id: UUID, data: CoordWorkItemUpdate, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Update a work item.""" item = coord_work_item_service.update_work_item(db, item_id, data) @@ -80,7 +75,6 @@ def update_work_item( def delete_work_item( item_id: UUID, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Delete a work item.""" return coord_work_item_service.delete_work_item(db, item_id) diff --git a/api/routers/coord_workflows.py b/api/routers/coord_workflows.py index 4fe4001..e7d112e 100644 --- a/api/routers/coord_workflows.py +++ b/api/routers/coord_workflows.py @@ -6,7 +6,6 @@ from fastapi import APIRouter, Depends, Query, status from sqlalchemy.orm import Session from api.database import get_db -from api.middleware.auth import get_current_user from api.schemas.coord_workflow import CoordWorkflowCreate, CoordWorkflowResponse, CoordWorkflowUpdate from api.schemas.coord_work_item import CoordWorkItemResponse from api.services import coord_workflow_service, coord_work_item_service @@ -21,7 +20,6 @@ def list_workflows( skip: int = Query(default=0, ge=0), limit: int = Query(default=100, ge=1, le=1000), db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """List workflows with optional filters.""" workflows, total = coord_workflow_service.get_workflows( @@ -39,7 +37,6 @@ def list_workflows( def create_workflow( data: CoordWorkflowCreate, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Create a new coordination workflow.""" workflow = coord_workflow_service.create_workflow(db, data) @@ -50,7 +47,6 @@ def create_workflow( def 
get_workflow( workflow_id: UUID, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Get a workflow by ID including its work items.""" workflow = coord_workflow_service.get_workflow_by_id(db, workflow_id) @@ -66,7 +62,6 @@ def update_workflow( workflow_id: UUID, data: CoordWorkflowUpdate, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Update a workflow.""" workflow = coord_workflow_service.update_workflow(db, workflow_id, data) @@ -77,7 +72,6 @@ def update_workflow( def delete_workflow( workflow_id: UUID, db: Session = Depends(get_db), - current_user: dict = Depends(get_current_user), ): """Delete a workflow and its work items (cascade).""" return coord_workflow_service.delete_workflow(db, workflow_id)