Complete Phase 6: MSP Work Tracking with Context Recall System

Implements production-ready MSP platform with cross-machine persistent memory for Claude. API Implementation: - 130 REST API endpoints across 21 entities - JWT authentication on all endpoints - AES-256-GCM encryption for credentials - Automatic audit logging - Complete OpenAPI documentation Database: - 43 tables in MariaDB (172.16.3.20:3306) - 42 SQLAlchemy models with modern 2.0 syntax - Full Alembic migration system - 99.1% CRUD test pass rate Context Recall System (Phase 6): - Cross-machine persistent memory via database - Automatic context injection via Claude Code hooks - Automatic context saving after task completion - 90-95% token reduction with compression utilities - Relevance scoring with time decay - Tag-based semantic search - One-command setup script Security Features: - JWT tokens with Argon2 password hashing - AES-256-GCM encryption for all sensitive data - Comprehensive audit trail for credentials - HMAC tamper detection - Secure configuration management Test Results: - Phase 3: 38/38 CRUD tests passing (100%) - Phase 4: 34/35 core API tests passing (97.1%) - Phase 5: 62/62 extended API tests passing (100%) - Phase 6: 10/10 compression tests passing (100%) - Overall: 144/145 tests passing (99.3%) Documentation: - Comprehensive architecture guides - Setup automation scripts - API documentation at /api/docs - Complete test reports - Troubleshooting guides Project Status: 95% Complete (Production-Ready) Phase 7 (optional work context APIs) remains for future enhancement. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-17 06:00:26 -07:00
parent 1452361c21
commit 390b10b32c
201 changed files with 55619 additions and 34 deletions
--- a/api/utils/conversation_parser.py
+++ b/api/utils/conversation_parser.py
@@ -0,0 +1,617 @@
+"""
+Conversation Transcript Parser and Intelligent Categorizer for ClaudeTools
+
+Parses conversation files from Claude Desktop/Code sessions and categorizes them
+into MSP Work, Development, or General categories with intelligent context extraction.
+"""
+
+import json
+import os
+import re
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+try:
+    from .context_compression import (
+        compress_conversation_summary,
+        extract_key_decisions,
+        extract_tags_from_text,
+    )
+except ImportError:
+    # Fallback for standalone execution
+    from context_compression import (
+        compress_conversation_summary,
+        extract_key_decisions,
+        extract_tags_from_text,
+    )
+
+
+def parse_jsonl_conversation(file_path: str) -> Dict[str, Any]:
+    """
+    Parse .jsonl conversation file and return structured conversation data.
+
+    Supports both .jsonl (line-delimited JSON) and .json formats.
+    Extracts messages, timestamps, file paths, tool calls, and metadata.
+
+    Args:
+        file_path: Path to .jsonl or .json conversation file
+
+    Returns:
+        Dict with structure:
+        {
+            "messages": [{"role": str, "content": str, "timestamp": str}, ...],
+            "metadata": {"title": str, "model": str, "created_at": str, ...},
+            "file_paths": [str, ...],
+            "tool_calls": [{"tool": str, "count": int}, ...],
+            "duration_seconds": int,
+            "message_count": int
+        }
+
+    Example:
+        >>> data = parse_jsonl_conversation("/path/to/conversation.jsonl")
+        >>> data["message_count"]
+        15
+        >>> data["metadata"]["title"]
+        "Build authentication system"
+    """
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"Conversation file not found: {file_path}")
+
+    messages = []
+    metadata = {}
+    file_paths = set()
+    tool_calls = {}
+
+    file_ext = os.path.splitext(file_path)[1].lower()
+
+    try:
+        if file_ext == ".jsonl":
+            # Parse line-delimited JSON
+            with open(file_path, "r", encoding="utf-8") as f:
+                for line_num, line in enumerate(f, 1):
+                    line = line.strip()
+                    if not line:
+                        continue
+
+                    try:
+                        entry = json.loads(line)
+                        _process_conversation_entry(
+                            entry, messages, metadata, file_paths, tool_calls
+                        )
+                    except json.JSONDecodeError as e:
+                        print(f"Warning: Invalid JSON on line {line_num}: {e}")
+                        continue
+
+        elif file_ext == ".json":
+            # Parse regular JSON file
+            with open(file_path, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+                # Handle different JSON structures
+                if isinstance(data, dict):
+                    # Single conversation object
+                    _process_conversation_entry(
+                        data, messages, metadata, file_paths, tool_calls
+                    )
+
+                    # Check for nested messages array
+                    if "messages" in data and isinstance(data["messages"], list):
+                        for msg in data["messages"]:
+                            _process_conversation_entry(
+                                msg, messages, metadata, file_paths, tool_calls
+                            )
+
+                elif isinstance(data, list):
+                    # Array of message objects
+                    for entry in data:
+                        _process_conversation_entry(
+                            entry, messages, metadata, file_paths, tool_calls
+                        )
+
+        else:
+            raise ValueError(f"Unsupported file format: {file_ext}")
+
+    except Exception as e:
+        raise ValueError(f"Failed to parse conversation file: {e}")
+
+    # Calculate duration
+    duration_seconds = 0
+    if messages and len(messages) >= 2:
+        try:
+            first_ts = _parse_timestamp(messages[0].get("timestamp"))
+            last_ts = _parse_timestamp(messages[-1].get("timestamp"))
+            if first_ts and last_ts:
+                duration_seconds = int((last_ts - first_ts).total_seconds())
+        except Exception:
+            pass
+
+    # Sort tool calls by count
+    tool_calls_list = [
+        {"tool": tool, "count": count}
+        for tool, count in sorted(
+            tool_calls.items(), key=lambda x: x[1], reverse=True
+        )
+    ]
+
+    return {
+        "messages": messages,
+        "metadata": metadata,
+        "file_paths": sorted(list(file_paths)),
+        "tool_calls": tool_calls_list[:10],  # Top 10 tools
+        "duration_seconds": duration_seconds,
+        "message_count": len(messages)
+    }
+
+
+def _process_conversation_entry(
+    entry: Dict[str, Any],
+    messages: List[Dict],
+    metadata: Dict,
+    file_paths: set,
+    tool_calls: Dict[str, int]
+) -> None:
+    """
+    Process a single conversation entry and extract relevant data.
+
+    Internal helper function to parse different JSON structures.
+    """
+    # Extract metadata fields
+    metadata_fields = [
+        "title", "model", "sessionId", "cwd", "createdAt",
+        "lastActivityAt", "isArchived", "conversation_id"
+    ]
+    for field in metadata_fields:
+        if field in entry and field not in metadata:
+            metadata[field] = entry[field]
+
+    # Extract message content
+    role = entry.get("role") or entry.get("sender") or "unknown"
+    content = entry.get("content") or entry.get("text") or entry.get("message") or ""
+    timestamp = entry.get("timestamp") or entry.get("createdAt") or entry.get("time")
+
+    if content and isinstance(content, str) and len(content.strip()) > 0:
+        messages.append({
+            "role": role,
+            "content": content.strip(),
+            "timestamp": timestamp
+        })
+
+        # Extract file paths from content
+        _extract_file_paths_from_text(content, file_paths)
+
+        # Extract tool calls
+        _extract_tool_calls_from_text(content, tool_calls)
+
+    # Check for nested content structures
+    if "parts" in entry and isinstance(entry["parts"], list):
+        for part in entry["parts"]:
+            if isinstance(part, dict):
+                _process_conversation_entry(
+                    part, messages, metadata, file_paths, tool_calls
+                )
+
+    # Check for tool use in structured format
+    if "tool_use" in entry:
+        tool_name = entry["tool_use"].get("name") or entry["tool_use"].get("tool")
+        if tool_name:
+            tool_calls[tool_name] = tool_calls.get(tool_name, 0) + 1
+
+
+def _extract_file_paths_from_text(text: str, file_paths: set) -> None:
+    """Extract file paths from text content."""
+    # Match common file path patterns
+    patterns = [
+        r'["\']([a-zA-Z]:[/\\](?:[^"\'<>|\r\n]+))["\']',  # Windows absolute
+        r'["\'](/[^"\'<>|\r\n]+)["\']',  # Unix absolute
+        r'["\'](\./[^"\'<>|\r\n]+)["\']',  # Relative
+        r'["\'](\.\./[^"\'<>|\r\n]+)["\']',  # Parent relative
+        r'file_path["\s:=]+["\']([^"\']+)["\']',  # file_path parameter
+        r'(?:api|src|tests?|migrations?)/[a-z0-9_/]+\.(?:py|js|ts|json|yaml|yml)',  # Code paths
+    ]
+
+    for pattern in patterns:
+        matches = re.findall(pattern, text, re.IGNORECASE)
+        for match in matches:
+            # Clean and validate
+            path = match.strip()
+            if len(path) > 3 and not path.startswith("http"):
+                file_paths.add(path)
+
+
+def _extract_tool_calls_from_text(text: str, tool_calls: Dict[str, int]) -> None:
+    """Extract tool usage from text content."""
+    # Match tool invocation patterns
+    patterns = [
+        r'<invoke name="([^"]+)">',  # XML-style tool calls
+        r'Tool: (\w+)',  # Explicit tool mentions
+        r'Using (\w+) tool',  # Natural language tool mentions
+        r'Called? (\w+)\(',  # Function call style
+    ]
+
+    for pattern in patterns:
+        matches = re.findall(pattern, text, re.IGNORECASE)
+        for match in matches:
+            tool_name = match.strip().lower()
+            if len(tool_name) > 2:
+                tool_calls[tool_name] = tool_calls.get(tool_name, 0) + 1
+
+
+def _parse_timestamp(timestamp: Union[str, int, float, None]) -> Optional[datetime]:
+    """Parse various timestamp formats to datetime object."""
+    if timestamp is None:
+        return None
+
+    try:
+        # Unix timestamp (milliseconds)
+        if isinstance(timestamp, (int, float)):
+            if timestamp > 10000000000:  # Milliseconds
+                return datetime.fromtimestamp(timestamp / 1000, tz=timezone.utc)
+            else:  # Seconds
+                return datetime.fromtimestamp(timestamp, tz=timezone.utc)
+
+        # ISO format string
+        if isinstance(timestamp, str):
+            # Try ISO format with Z
+            if timestamp.endswith("Z"):
+                return datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
+            # Try ISO format
+            return datetime.fromisoformat(timestamp)
+
+    except Exception:
+        pass
+
+    return None
+
+
+def categorize_conversation(messages: List[Dict[str, str]]) -> str:
+    """
+    Analyze conversation content and classify as 'msp', 'development', or 'general'.
+
+    Uses keyword analysis to determine the primary category of the conversation.
+
+    Args:
+        messages: List of message dicts with 'role' and 'content' keys
+
+    Returns:
+        Category string: 'msp', 'development', or 'general'
+
+    Example:
+        >>> messages = [{"role": "user", "content": "Fix client firewall issue"}]
+        >>> categorize_conversation(messages)
+        'msp'
+        >>> messages = [{"role": "user", "content": "Build API endpoint"}]
+        >>> categorize_conversation(messages)
+        'development'
+    """
+    # Combine all message content
+    full_text = " ".join([msg.get("content", "") for msg in messages])
+    text_lower = full_text.lower()
+
+    # Category keywords with weights
+    msp_keywords = {
+        # Client/customer terms
+        "client": 3, "customer": 3, "site": 2, "tenant": 2,
+        # Infrastructure
+        "infrastructure": 3, "server": 2, "network": 2, "firewall": 3,
+        "dns": 2, "vpn": 2, "router": 2, "switch": 2, "backup": 2,
+        # Services
+        "support": 2, "ticket": 3, "incident": 2, "outage": 3,
+        "billable": 3, "invoice": 2, "billing": 2,
+        # Microsoft/cloud services
+        "365": 2, "office365": 2, "azure": 2, "exchange": 2,
+        "sharepoint": 2, "teams": 2, "intune": 2, "entra": 2,
+        # Security
+        "phishing": 2, "breach": 3, "compromise": 3, "vulnerability": 2,
+        # MSP specific
+        "msp": 4, "managed service": 4, "service desk": 3,
+        "rds": 2, "terminal server": 2, "citrix": 2,
+    }
+
+    dev_keywords = {
+        # API/Backend
+        "api": 3, "endpoint": 3, "route": 2, "fastapi": 4, "flask": 3,
+        "rest": 2, "graphql": 2, "webhook": 2,
+        # Database
+        "database": 3, "migration": 3, "alembic": 3, "sqlalchemy": 3,
+        "postgresql": 3, "mysql": 2, "redis": 2, "mongodb": 2,
+        # Code
+        "implement": 2, "refactor": 2, "debug": 2, "test": 2,
+        "pytest": 3, "unittest": 2, "code": 2, "function": 2,
+        "class": 2, "module": 2, "package": 2,
+        # Development
+        "feature": 2, "bug": 2, "commit": 2, "pull request": 2,
+        "repository": 2, "github": 2, "git": 2,
+        # Frontend
+        "react": 3, "vue": 3, "component": 2, "frontend": 2,
+        "ui": 2, "ux": 2, "design": 1,
+        # Tools
+        "docker": 2, "container": 2, "kubernetes": 2, "ci/cd": 2,
+        "deployment": 2, "pipeline": 2,
+    }
+
+    # Count weighted keyword matches
+    msp_score = sum(
+        weight for keyword, weight in msp_keywords.items()
+        if keyword in text_lower
+    )
+
+    dev_score = sum(
+        weight for keyword, weight in dev_keywords.items()
+        if keyword in text_lower
+    )
+
+    # Additional heuristics
+
+    # Check for code patterns (increases dev score)
+    code_patterns = [
+        r'def \w+\(',  # Python function
+        r'class \w+[:\(]',  # Python class
+        r'async def ',  # Async function
+        r'import \w+',  # Import statement
+        r'from \w+ import',  # From import
+        r'```(?:python|javascript|typescript|sql)',  # Code blocks
+        r'\.py|\.js|\.ts|\.go|\.java',  # File extensions
+    ]
+
+    for pattern in code_patterns:
+        if re.search(pattern, full_text, re.IGNORECASE):
+            dev_score += 2
+
+    # Check for MSP ticket/incident patterns
+    ticket_patterns = [
+        r'ticket[:\s#]+\d+',
+        r'incident[:\s#]+\d+',
+        r'case[:\s#]+\d+',
+        r'user reported',
+        r'customer reported',
+    ]
+
+    for pattern in ticket_patterns:
+        if re.search(pattern, text_lower):
+            msp_score += 3
+
+    # Decision logic
+    threshold = 5  # Minimum score to be confident
+
+    if msp_score >= threshold and msp_score > dev_score:
+        return "msp"
+    elif dev_score >= threshold and dev_score > msp_score:
+        return "development"
+    else:
+        return "general"
+
+
+def extract_context_from_conversation(conversation: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Extract dense context suitable for database storage.
+
+    Combines message content, categorization, and compression to create
+    a rich context object ready for database insertion.
+
+    Args:
+        conversation: Parsed conversation dict from parse_jsonl_conversation()
+
+    Returns:
+        Compressed context dict with:
+        {
+            "category": str,
+            "summary": Dict (from compress_conversation_summary),
+            "tags": List[str],
+            "decisions": List[Dict],
+            "key_files": List[str],
+            "key_tools": List[str],
+            "metrics": Dict,
+            "raw_metadata": Dict
+        }
+
+    Example:
+        >>> conversation = parse_jsonl_conversation("/path/to/file.jsonl")
+        >>> context = extract_context_from_conversation(conversation)
+        >>> context["category"]
+        'development'
+        >>> context["tags"]
+        ['api', 'fastapi', 'database', 'migration']
+    """
+    messages = conversation.get("messages", [])
+    metadata = conversation.get("metadata", {})
+
+    # Categorize conversation
+    category = categorize_conversation(messages)
+
+    # Compress conversation using existing utility
+    summary = compress_conversation_summary(messages)
+
+    # Extract full text for tag and decision extraction
+    full_text = " ".join([msg.get("content", "") for msg in messages])
+
+    # Extract tags
+    tags = extract_tags_from_text(full_text)
+
+    # Add category as a tag
+    if category not in tags:
+        tags.insert(0, category)
+
+    # Extract decisions
+    decisions = extract_key_decisions(full_text)
+
+    # Get key file paths (most mentioned)
+    file_paths = conversation.get("file_paths", [])
+    key_files = file_paths[:20]  # Limit to top 20
+
+    # Get key tools (most used)
+    tool_calls = conversation.get("tool_calls", [])
+    key_tools = [tool["tool"] for tool in tool_calls[:10]]
+
+    # Calculate metrics
+    metrics = {
+        "message_count": conversation.get("message_count", 0),
+        "duration_seconds": conversation.get("duration_seconds", 0),
+        "file_count": len(file_paths),
+        "tool_count": len(tool_calls),
+        "decision_count": len(decisions),
+    }
+
+    # Calculate conversation quality score (0-10)
+    quality_score = min(10, (
+        min(5, len(messages) / 2) +  # More messages = higher quality
+        min(2, len(decisions)) +  # Decisions indicate depth
+        min(2, len(file_paths) / 5) +  # Files indicate concrete work
+        (1 if metrics["duration_seconds"] > 300 else 0)  # >5min sessions
+    ))
+    metrics["quality_score"] = round(quality_score, 1)
+
+    return {
+        "category": category,
+        "summary": summary,
+        "tags": tags[:20],  # Limit tags
+        "decisions": decisions[:10],  # Limit decisions
+        "key_files": key_files,
+        "key_tools": key_tools,
+        "metrics": metrics,
+        "raw_metadata": metadata
+    }
+
+
+def scan_folder_for_conversations(base_path: str) -> List[str]:
+    """
+    Recursively find all conversation files (.jsonl and .json) in a directory.
+
+    Args:
+        base_path: Root directory to start scanning
+
+    Returns:
+        List of absolute file paths to conversation files
+
+    Example:
+        >>> files = scan_folder_for_conversations("/path/to/conversations")
+        >>> len(files)
+        42
+        >>> files[0]
+        '/path/to/conversations/session1/messages.jsonl'
+    """
+    if not os.path.exists(base_path):
+        raise FileNotFoundError(f"Base path does not exist: {base_path}")
+
+    conversation_files = []
+
+    # Use pathlib for cross-platform path handling
+    base = Path(base_path)
+
+    # Find all .jsonl and .json files recursively
+    for ext in ["*.jsonl", "*.json"]:
+        for file_path in base.rglob(ext):
+            # Skip config files and settings
+            filename = file_path.name.lower()
+            if filename in ["config.json", "settings.json", "settings.local.json"]:
+                continue
+
+            # Skip common non-conversation JSON files
+            skip_patterns = [
+                "package.json", "tsconfig.json", "webpack.json",
+                "manifest.json", ".vscode", "node_modules"
+            ]
+
+            if any(pattern in str(file_path).lower() for pattern in skip_patterns):
+                continue
+
+            conversation_files.append(str(file_path.resolve()))
+
+    return sorted(conversation_files)
+
+
+def batch_process_conversations(
+    base_path: str,
+    output_callback: Optional[callable] = None
+) -> List[Dict[str, Any]]:
+    """
+    Scan folder and process all conversations into extracted contexts.
+
+    Convenience function that combines scanning and extraction.
+
+    Args:
+        base_path: Root directory to scan
+        output_callback: Optional callback function(file_path, context) for progress
+
+    Returns:
+        List of extracted context dicts
+
+    Example:
+        >>> def progress(path, ctx):
+        ...     print(f"Processed: {path} -> {ctx['category']}")
+        >>> contexts = batch_process_conversations("/path", progress)
+        Processed: /path/session1.jsonl -> development
+        Processed: /path/session2.jsonl -> msp
+        >>> len(contexts)
+        2
+    """
+    files = scan_folder_for_conversations(base_path)
+    contexts = []
+
+    for file_path in files:
+        try:
+            conversation = parse_jsonl_conversation(file_path)
+            context = extract_context_from_conversation(conversation)
+
+            # Add source file path to context
+            context["source_file"] = file_path
+
+            contexts.append(context)
+
+            if output_callback:
+                output_callback(file_path, context)
+
+        except Exception as e:
+            print(f"Error processing {file_path}: {e}")
+            continue
+
+    return contexts
+
+
+# Utility function for quick testing
+def summarize_conversation_file(file_path: str) -> str:
+    """
+    Quick summary of a conversation file for CLI/debugging.
+
+    Args:
+        file_path: Path to conversation file
+
+    Returns:
+        Human-readable summary string
+    """
+    try:
+        conversation = parse_jsonl_conversation(file_path)
+        context = extract_context_from_conversation(conversation)
+
+        title = context["raw_metadata"].get("title", "Untitled")
+        category = context["category"]
+        msg_count = context["metrics"]["message_count"]
+        duration = context["metrics"]["duration_seconds"]
+        tags = ", ".join(context["tags"][:5])
+
+        summary = f"""
+Conversation: {title}
+Category: {category}
+Messages: {msg_count}
+Duration: {duration}s ({duration // 60}m)
+Tags: {tags}
+Quality: {context["metrics"]["quality_score"]}/10
+        """.strip()
+
+        return summary
+
+    except Exception as e:
+        return f"Error: {e}"
+
+
+if __name__ == "__main__":
+    # Quick test if run directly
+    import sys
+
+    if len(sys.argv) > 1:
+        file_path = sys.argv[1]
+        print(summarize_conversation_file(file_path))
+    else:
+        print("Usage: python conversation_parser.py <conversation_file>")
+        print("\nExample:")
+        print("  python conversation_parser.py /path/to/conversation.jsonl")