sync: Auto-sync from acg-guru-5070 at 2026-03-21 16:34:05

Synced files: - Session logs updated - Latest context and credentials - Command/directive updates Machine: acg-guru-5070 Timestamp: 2026-03-21 16:34:05 Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-03-21 16:34:05 -07:00
parent 37aaa6660b
commit a29d00c6b2
2 changed files with 539 additions and 0 deletions
--- a/projects/radio-show/audio-processor/gpu_debug_transcribe.py
+++ b/projects/radio-show/audio-processor/gpu_debug_transcribe.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+"""GPU-monitored batch transcription with diagnostics.
+
+Monitors GPU health before, during, and after each episode transcription.
+Logs temperature, power, utilization, and memory to detect what triggers
+the NVRM rpcSendMessage failure (status 0x00000062).
+"""
+
+import subprocess
+import sys
+import time
+import signal
+import threading
+import os
+from datetime import datetime
+from pathlib import Path
+
+LOG_DIR = Path("gpu-debug-logs")
+LOG_DIR.mkdir(exist_ok=True)
+LOG_FILE = LOG_DIR / f"gpu_monitor_{datetime.now():%Y%m%d_%H%M%S}.log"
+
+# Episodes to transcribe (remaining ones)
+EPISODES = [
+    "training-data/episodes/2011-06-04-hr1.mp3",
+    "training-data/episodes/2011-09-10-hr1.mp3",
+    "training-data/episodes/2014-s6e05.mp3",
+    "training-data/episodes/2015-s7e30.mp3",
+    "training-data/episodes/2016-s8e42.mp3",
+    "training-data/episodes/2017-s9e26.mp3",
+    "training-data/episodes/2018-s10e17.mp3",
+    "training-data/episodes/2018-s10e21.mp3",
+]
+
+stop_monitor = threading.Event()
+
+
+def log(msg: str):
+    ts = datetime.now().strftime("%H:%M:%S.%f")[:-3]
+    line = f"[{ts}] {msg}"
+    print(line)
+    with open(LOG_FILE, "a") as f:
+        f.write(line + "\n")
+
+
+def gpu_query() -> dict | None:
+    """Query GPU stats via nvidia-smi. Returns None if GPU is in error state."""
+    try:
+        result = subprocess.run(
+            ["nvidia-smi",
+             "--query-gpu=temperature.gpu,power.draw,utilization.gpu,utilization.memory,"
+             "memory.used,memory.total,clocks.current.sm,clocks.current.memory,"
+             "pstate,fan.speed",
+             "--format=csv,noheader,nounits"],
+            capture_output=True, text=True, timeout=5
+        )
+        if result.returncode != 0:
+            return None
+        parts = [p.strip() for p in result.stdout.strip().split(",")]
+        # Check for ERR! or [N/A] in any field
+        if any("ERR" in p or "[N/A]" in p for p in parts[:4]):
+            return {"error": True, "raw": result.stdout.strip()}
+        return {
+            "temp_c": parts[0],
+            "power_w": parts[1],
+            "gpu_util": parts[2],
+            "mem_util": parts[3],
+            "mem_used_mb": parts[4],
+            "mem_total_mb": parts[5],
+            "sm_clock_mhz": parts[6],
+            "mem_clock_mhz": parts[7],
+            "pstate": parts[8],
+            "fan": parts[9],
+            "error": False,
+        }
+    except (subprocess.TimeoutExpired, Exception) as e:
+        return {"error": True, "raw": str(e)}
+
+
+def gpu_health_check() -> bool:
+    """Returns True if GPU is healthy."""
+    stats = gpu_query()
+    if stats is None or stats.get("error"):
+        log(f"GPU ERROR: {stats}")
+        return False
+    return True
+
+
+def gpu_status_str(stats: dict) -> str:
+    if stats.get("error"):
+        return f"ERR! raw={stats.get('raw', 'unknown')}"
+    return (f"T={stats['temp_c']}C P={stats['power_w']}W "
+            f"GPU={stats['gpu_util']}% MEM={stats['mem_util']}% "
+            f"VRAM={stats['mem_used_mb']}/{stats['mem_total_mb']}MB "
+            f"SM={stats['sm_clock_mhz']}MHz MEMCLK={stats['mem_clock_mhz']}MHz "
+            f"PState={stats['pstate']} Fan={stats['fan']}")
+
+
+def monitor_thread(interval: float = 2.0):
+    """Background thread that logs GPU stats at regular intervals."""
+    while not stop_monitor.is_set():
+        stats = gpu_query()
+        if stats:
+            log(f"MONITOR: {gpu_status_str(stats)}")
+            if stats.get("error"):
+                log("MONITOR: GPU ENTERED ERROR STATE!")
+                # Check dmesg for the smoking gun
+                try:
+                    result = subprocess.run(
+                        ["sudo", "dmesg", "-T", "--level=err,warn"],
+                        capture_output=True, text=True, timeout=5
+                    )
+                    nvrm_lines = [l for l in result.stdout.splitlines()
+                                  if "NVRM" in l or "nvidia" in l.lower()]
+                    for line in nvrm_lines[-5:]:
+                        log(f"DMESG: {line}")
+                except Exception:
+                    pass
+        stop_monitor.wait(interval)
+
+
+def check_runtime_d3():
+    """Check and log Runtime D3 power management status."""
+    try:
+        power_file = Path("/proc/driver/nvidia/gpus/0000:02:00.0/power")
+        if power_file.exists():
+            log(f"GPU Power Management:\n{power_file.read_text()}")
+
+        # Check if dynamic power management is enabled
+        result = subprocess.run(
+            ["cat", "/sys/bus/pci/devices/0000:02:00.0/power/runtime_status"],
+            capture_output=True, text=True, timeout=5
+        )
+        log(f"PCI runtime_status: {result.stdout.strip()}")
+
+        result = subprocess.run(
+            ["cat", "/sys/bus/pci/devices/0000:02:00.0/power/control"],
+            capture_output=True, text=True, timeout=5
+        )
+        log(f"PCI power control: {result.stdout.strip()}")
+
+        result = subprocess.run(
+            ["cat", "/sys/bus/pci/devices/0000:02:00.0/power/runtime_enabled"],
+            capture_output=True, text=True, timeout=5
+        )
+        log(f"PCI runtime_enabled: {result.stdout.strip()}")
+
+    except Exception as e:
+        log(f"Power check error: {e}")
+
+
+def check_nvidia_persistence():
+    """Check persistence mode."""
+    try:
+        result = subprocess.run(
+            ["nvidia-smi", "--query-gpu=persistence_mode", "--format=csv,noheader"],
+            capture_output=True, text=True, timeout=5
+        )
+        log(f"Persistence mode: {result.stdout.strip()}")
+    except Exception as e:
+        log(f"Persistence check error: {e}")
+
+
+def transcribe_one(episode_path: str) -> bool:
+    """Transcribe a single episode with GPU health monitoring. Returns success."""
+    name = Path(episode_path).stem
+    output_dir = f"training-data/transcripts/{name}"
+
+    if Path(output_dir).exists() and (Path(output_dir) / "transcript.json").exists():
+        log(f"SKIP: {name} already transcribed")
+        return True
+
+    # Pre-flight GPU check
+    log(f"PRE-FLIGHT: Checking GPU before {name}")
+    stats = gpu_query()
+    if not stats or stats.get("error"):
+        log(f"PRE-FLIGHT FAIL: GPU already in error state! Stats: {stats}")
+        return False
+    log(f"PRE-FLIGHT: {gpu_status_str(stats)}")
+
+    # Quick CUDA test
+    log("PRE-FLIGHT: Testing CUDA...")
+    try:
+        import torch
+        if not torch.cuda.is_available():
+            log("PRE-FLIGHT FAIL: torch.cuda.is_available() = False")
+            return False
+        # Small allocation test
+        x = torch.randn(100, 100, device="cuda")
+        y = x @ x
+        del x, y
+        torch.cuda.synchronize()
+        torch.cuda.empty_cache()
+        log(f"PRE-FLIGHT: CUDA OK, allocated={torch.cuda.memory_allocated() / 1024**2:.0f}MB")
+    except Exception as e:
+        log(f"PRE-FLIGHT FAIL: CUDA test error: {e}")
+        return False
+
+    # Transcribe
+    log(f"START: {name} ({episode_path})")
+    start_time = time.time()
+
+    try:
+        from src.transcriber import transcribe
+        transcript = transcribe(episode_path)
+        transcript.save(Path(output_dir))
+        elapsed = time.time() - start_time
+        log(f"DONE: {name} in {elapsed:.1f}s ({elapsed/60:.1f}min), "
+            f"{len(transcript.segments)} segments")
+    except Exception as e:
+        elapsed = time.time() - start_time
+        log(f"FAIL: {name} after {elapsed:.1f}s: {type(e).__name__}: {e}")
+
+        # Post-failure GPU check
+        stats = gpu_query()
+        log(f"POST-FAIL: {gpu_status_str(stats) if stats else 'query failed'}")
+        return False
+
+    # Post-transcription GPU check
+    stats = gpu_query()
+    if stats and not stats.get("error"):
+        log(f"POST: {gpu_status_str(stats)}")
+    else:
+        log(f"POST: GPU entered error state after transcription! {stats}")
+
+    # Cool-down: clear CUDA cache, let GPU idle briefly
+    try:
+        import torch
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+    except Exception:
+        pass
+
+    log("COOLDOWN: Waiting 10s between episodes...")
+    time.sleep(10)
+
+    return True
+
+
+def main():
+    log("=" * 60)
+    log("GPU Debug Batch Transcription")
+    log(f"Driver: {subprocess.getoutput('nvidia-smi --query-gpu=driver_version --format=csv,noheader')}")
+    log(f"CUDA version: {subprocess.getoutput('nvidia-smi --query-gpu=cuda_version --format=csv,noheader 2>/dev/null') or 'N/A'}")
+    log("=" * 60)
+
+    # Check power management
+    check_runtime_d3()
+    check_nvidia_persistence()
+
+    # Initial GPU state
+    stats = gpu_query()
+    if not stats or stats.get("error"):
+        log(f"ABORT: GPU already in error state at startup: {stats}")
+        sys.exit(1)
+    log(f"INITIAL: {gpu_status_str(stats)}")
+
+    # Start background monitor (every 5 seconds during transcription)
+    monitor = threading.Thread(target=monitor_thread, args=(5.0,), daemon=True)
+    monitor.start()
+
+    # Filter to only episodes that need transcription
+    remaining = []
+    for ep in EPISODES:
+        name = Path(ep).stem
+        out = Path(f"training-data/transcripts/{name}/transcript.json")
+        if out.exists():
+            log(f"ALREADY DONE: {name}")
+        else:
+            remaining.append(ep)
+
+    log(f"QUEUE: {len(remaining)} episodes to transcribe")
+
+    completed = 0
+    failed = 0
+    for ep in remaining:
+        success = transcribe_one(ep)
+        if success:
+            completed += 1
+        else:
+            failed += 1
+            log(f"STOPPING: GPU failure detected after {completed} episodes, {failed} failed")
+            # Log final state
+            stats = gpu_query()
+            log(f"FINAL: {gpu_status_str(stats) if stats else 'query failed'}")
+            break
+
+    stop_monitor.set()
+    log(f"SUMMARY: {completed} completed, {failed} failed, "
+        f"{len(remaining) - completed - failed} remaining")
+    log(f"Log saved to: {LOG_FILE}")
+
+
+if __name__ == "__main__":
+    main()