claudetools/projects/radio-show/audio-processor/gpu_debug_transcribe.py

#!/usr/bin/env python3
"""GPU-monitored batch transcription with diagnostics.

Monitors GPU health before, during, and after each episode transcription.
Logs temperature, power, utilization, and memory to detect what triggers
the NVRM rpcSendMessage failure (status 0x00000062).
"""

import subprocess
import sys
import time
import signal
import threading
import os
from datetime import datetime
from pathlib import Path

# Directory (relative to the current working directory) that holds the
# per-run monitor logs. It is created at import time so log() can append to
# LOG_FILE right away.
LOG_DIR = Path("gpu-debug-logs")
LOG_DIR.mkdir(exist_ok=True)
# One log file per run; the timestamp is taken once, when the module loads.
LOG_FILE = LOG_DIR / f"gpu_monitor_{datetime.now():%Y%m%d_%H%M%S}.log"

# Episodes to transcribe (remaining ones). Paths are relative to the current
# working directory; main() additionally skips any whose transcript.json
# already exists.
EPISODES = [
    "training-data/episodes/2011-06-04-hr1.mp3",
    "training-data/episodes/2011-09-10-hr1.mp3",
    "training-data/episodes/2014-s6e05.mp3",
    "training-data/episodes/2015-s7e30.mp3",
    "training-data/episodes/2016-s8e42.mp3",
    "training-data/episodes/2017-s9e26.mp3",
    "training-data/episodes/2018-s10e17.mp3",
    "training-data/episodes/2018-s10e21.mp3",
]

# Set by main() once the batch is finished; monitor_thread() polls it and also
# uses it as an interruptible sleep between samples.
stop_monitor = threading.Event()


def log(msg: str):
    """Print a timestamped message and append the same line to LOG_FILE.

    The prefix is the local wall-clock time truncated to milliseconds
    (``[HH:MM:SS.mmm]``). The log file is reopened in append mode on every
    call.
    """
    now = datetime.now()
    # Whole milliseconds, zero-padded to three digits.
    stamp = f"{now:%H:%M:%S}.{now.microsecond // 1000:03d}"
    entry = f"[{stamp}] {msg}"
    print(entry)
    with open(LOG_FILE, "a") as fh:
        print(entry, file=fh)


def gpu_query() -> dict | None:
    """Query GPU stats via nvidia-smi. Returns None if GPU is in error state."""
    try:
        result = subprocess.run(
            ["nvidia-smi",
             "--query-gpu=temperature.gpu,power.draw,utilization.gpu,utilization.memory,"
             "memory.used,memory.total,clocks.current.sm,clocks.current.memory,"
             "pstate,fan.speed",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5
        )
        if result.returncode != 0:
            return None
        parts = [p.strip() for p in result.stdout.strip().split(",")]
        # Check for ERR! or [N/A] in any field
        if any("ERR" in p or "[N/A]" in p for p in parts[:4]):
            return {"error": True, "raw": result.stdout.strip()}
        return {
            "temp_c": parts[0],
            "power_w": parts[1],
            "gpu_util": parts[2],
            "mem_util": parts[3],
            "mem_used_mb": parts[4],
            "mem_total_mb": parts[5],
            "sm_clock_mhz": parts[6],
            "mem_clock_mhz": parts[7],
            "pstate": parts[8],
            "fan": parts[9],
            "error": False,
        }
    except (subprocess.TimeoutExpired, Exception) as e:
        return {"error": True, "raw": str(e)}


def gpu_health_check() -> bool:
    """Run one gpu_query() and report whether the GPU looks healthy.

    A failed query (``None``) or a result flagged with ``"error"`` counts as
    unhealthy and is logged before returning False.
    """
    stats = gpu_query()
    healthy = stats is not None and not stats.get("error")
    if not healthy:
        log(f"GPU ERROR: {stats}")
    return healthy


def gpu_status_str(stats: dict) -> str:
    """Format a gpu_query() result dict as a single-line summary.

    Error results render as ``ERR! raw=<text>`` (``unknown`` when no raw text
    is present); healthy results list temperature, power, utilization, VRAM,
    clocks, P-state and fan in that order.
    """
    if not stats.get("error"):
        segments = (
            f"T={stats['temp_c']}C P={stats['power_w']}W",
            f"GPU={stats['gpu_util']}% MEM={stats['mem_util']}%",
            f"VRAM={stats['mem_used_mb']}/{stats['mem_total_mb']}MB",
            f"SM={stats['sm_clock_mhz']}MHz MEMCLK={stats['mem_clock_mhz']}MHz",
            f"PState={stats['pstate']} Fan={stats['fan']}",
        )
        return " ".join(segments)
    return f"ERR! raw={stats.get('raw', 'unknown')}"


def _log_recent_nvrm_dmesg():
    """Log the last five NVIDIA-related warning/error lines from dmesg."""
    try:
        result = subprocess.run(
            # -n keeps sudo non-interactive: without cached credentials it
            # fails immediately instead of prompting for a password from a
            # background thread.
            ["sudo", "-n", "dmesg", "-T", "--level=err,warn"],
            capture_output=True, text=True, timeout=5
        )
    except Exception as e:
        # Best-effort diagnostics: report the failure but never let it kill
        # the monitor thread.
        log(f"DMESG: unavailable ({type(e).__name__}: {e})")
        return

    if result.returncode != 0:
        log(f"DMESG: unavailable (exit {result.returncode}): "
            f"{result.stderr.strip()}")
        return

    nvrm_lines = [entry for entry in result.stdout.splitlines()
                  if "NVRM" in entry or "nvidia" in entry.lower()]
    for entry in nvrm_lines[-5:]:
        log(f"DMESG: {entry}")


def monitor_thread(interval: float = 2.0):
    """Background thread that logs GPU stats at regular intervals.

    Runs until ``stop_monitor`` is set. Every sample is logged; when the
    sample indicates an error (an error-flagged result, or ``None`` because
    nvidia-smi exited non-zero) the most recent NVIDIA kernel messages are
    logged as well.

    Args:
        interval: Seconds to wait between samples. The wait ends early when
            ``stop_monitor`` is set.
    """
    while not stop_monitor.is_set():
        stats = gpu_query()
        if stats is None:
            # gpu_query() returns None when nvidia-smi exits non-zero; that
            # must be treated as an error state rather than skipped silently.
            log("MONITOR: nvidia-smi query failed (non-zero exit)")
            in_error = True
        else:
            log(f"MONITOR: {gpu_status_str(stats)}")
            in_error = bool(stats.get("error"))

        if in_error:
            log("MONITOR: GPU ENTERED ERROR STATE!")
            # Check dmesg for the smoking gun
            _log_recent_nvrm_dmesg()

        stop_monitor.wait(interval)


def check_runtime_d3(pci_address: str = "0000:02:00.0"):
    """Check and log Runtime D3 power management status.

    Logs the contents of the NVIDIA driver's per-GPU ``power`` file (when it
    exists) and the PCI runtime power-management attributes
    ``runtime_status``, ``control`` and ``runtime_enabled``. Each read is
    best-effort: a failure is logged and the remaining checks still run.

    Args:
        pci_address: ``domain:bus:device.function`` of the GPU as it appears
            under ``/proc/driver/nvidia/gpus`` and ``/sys/bus/pci/devices``.
            Defaults to the address this script was originally written for.
    """
    power_file = Path("/proc/driver/nvidia/gpus") / pci_address / "power"
    try:
        if power_file.exists():
            log(f"GPU Power Management:\n{power_file.read_text()}")
    except (OSError, UnicodeError) as e:
        log(f"Power check error: {e}")

    # Read the sysfs attributes directly instead of spawning `cat` for each.
    sysfs_power_dir = Path("/sys/bus/pci/devices") / pci_address / "power"
    for label, attribute in (
        ("PCI runtime_status", "runtime_status"),
        ("PCI power control", "control"),
        ("PCI runtime_enabled", "runtime_enabled"),
    ):
        try:
            value = (sysfs_power_dir / attribute).read_text().strip()
        except (OSError, UnicodeError) as e:
            # Say why the value is missing rather than logging an empty field.
            log(f"{label}: unavailable ({e})")
        else:
            log(f"{label}: {value}")


def check_nvidia_persistence():
    """Log the persistence-mode value reported by nvidia-smi.

    Any failure while running the query is logged instead of raised.
    """
    command = [
        "nvidia-smi",
        "--query-gpu=persistence_mode",
        "--format=csv,noheader",
    ]
    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, timeout=5
        )
        mode = completed.stdout.strip()
        log(f"Persistence mode: {mode}")
    except Exception as exc:
        log(f"Persistence check error: {exc}")


def transcribe_one(episode_path: str) -> bool:
    """Transcribe a single episode with GPU health monitoring.

    Steps, each logged:
      1. Skip if ``training-data/transcripts/<stem>/transcript.json`` exists.
      2. Pre-flight: nvidia-smi query, then a small CUDA matmul via torch.
      3. Call ``src.transcriber.transcribe`` and save the result into the
         output directory.
      4. Post-run nvidia-smi query, CUDA cache cleanup, 10 s cooldown.

    Args:
        episode_path: Path to the audio file; its stem names the output
            directory.

    Returns:
        True if the episode was already transcribed or was transcribed and
        saved. False if a pre-flight check failed or transcription/saving
        raised. A GPU error detected only *after* a successful save is
        logged but still returns True.
    """
    name = Path(episode_path).stem
    output_dir = Path(f"training-data/transcripts/{name}")

    # The transcript file can only exist if its directory does, so a single
    # check is enough.
    if (output_dir / "transcript.json").exists():
        log(f"SKIP: {name} already transcribed")
        return True

    # Pre-flight GPU check
    log(f"PRE-FLIGHT: Checking GPU before {name}")
    stats = gpu_query()
    if not stats or stats.get("error"):
        log(f"PRE-FLIGHT FAIL: GPU already in error state! Stats: {stats}")
        return False
    log(f"PRE-FLIGHT: {gpu_status_str(stats)}")

    # Quick CUDA test
    log("PRE-FLIGHT: Testing CUDA...")
    try:
        import torch
        if not torch.cuda.is_available():
            log("PRE-FLIGHT FAIL: torch.cuda.is_available() = False")
            return False
        # Small allocation + matmul; synchronize so any CUDA error surfaces
        # here rather than in the middle of the transcription.
        x = torch.randn(100, 100, device="cuda")
        y = x @ x
        del x, y
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        log(f"PRE-FLIGHT: CUDA OK, allocated={torch.cuda.memory_allocated() / 1024**2:.0f}MB")
    except Exception as e:
        log(f"PRE-FLIGHT FAIL: CUDA test error: {e}")
        return False

    # Transcribe
    log(f"START: {name} ({episode_path})")
    start_time = time.time()

    try:
        from src.transcriber import transcribe
        transcript = transcribe(episode_path)
        transcript.save(output_dir)
        elapsed = time.time() - start_time
        log(f"DONE: {name} in {elapsed:.1f}s ({elapsed/60:.1f}min), "
            f"{len(transcript.segments)} segments")
    except Exception as e:
        elapsed = time.time() - start_time
        log(f"FAIL: {name} after {elapsed:.1f}s: {type(e).__name__}: {e}")

        # Post-failure GPU check
        stats = gpu_query()
        log(f"POST-FAIL: {gpu_status_str(stats) if stats else 'query failed'}")
        return False

    # Post-transcription GPU check
    stats = gpu_query()
    if stats and not stats.get("error"):
        log(f"POST: {gpu_status_str(stats)}")
    else:
        log(f"POST: GPU entered error state after transcription! {stats}")

    # Cool-down: clear CUDA cache, let GPU idle briefly
    try:
        import torch
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    except Exception as e:
        # A CUDA error surfacing here is the kind of evidence this script
        # collects, so record it. The transcript is already saved, so the
        # episode still counts as done.
        log(f"COOLDOWN: CUDA cleanup failed: {type(e).__name__}: {e}")

    log("COOLDOWN: Waiting 10s between episodes...")
    time.sleep(10)

    return True


def main():
    """Run the monitored batch: report environment, then transcribe the queue.

    Logs driver information and power-management settings, aborts with exit
    status 1 if the GPU is already in an error state, starts the background
    monitor, and transcribes every episode in EPISODES that has no
    transcript.json yet. Processing stops at the first failed episode. A
    summary is logged at the end.
    """
    banner = "=" * 60
    log(banner)
    log("GPU Debug Batch Transcription")
    driver = subprocess.getoutput(
        "nvidia-smi --query-gpu=driver_version --format=csv,noheader"
    )
    log(f"Driver: {driver}")
    cuda = subprocess.getoutput(
        "nvidia-smi --query-gpu=cuda_version --format=csv,noheader 2>/dev/null"
    )
    log(f"CUDA version: {cuda or 'N/A'}")
    log(banner)

    # Power-management settings
    check_runtime_d3()
    check_nvidia_persistence()

    # The GPU must be healthy before anything else is attempted.
    stats = gpu_query()
    gpu_ok = bool(stats) and not stats.get("error")
    if not gpu_ok:
        log(f"ABORT: GPU already in error state at startup: {stats}")
        sys.exit(1)
    log(f"INITIAL: {gpu_status_str(stats)}")

    # Background sampler, one reading every 5 seconds while transcribing.
    monitor = threading.Thread(
        target=monitor_thread, kwargs={"interval": 5.0}, daemon=True
    )
    monitor.start()

    # Build the work queue, dropping episodes that already have a transcript.
    remaining = []
    for ep in EPISODES:
        name = Path(ep).stem
        if Path(f"training-data/transcripts/{name}/transcript.json").exists():
            log(f"ALREADY DONE: {name}")
            continue
        remaining.append(ep)

    log(f"QUEUE: {len(remaining)} episodes to transcribe")

    completed = failed = 0
    for ep in remaining:
        if transcribe_one(ep):
            completed += 1
            continue

        # First failure ends the batch; capture the GPU's final state.
        failed += 1
        log(f"STOPPING: GPU failure detected after {completed} episodes, {failed} failed")
        final_stats = gpu_query()
        log(f"FINAL: {gpu_status_str(final_stats) if final_stats else 'query failed'}")
        break

    stop_monitor.set()
    leftover = len(remaining) - completed - failed
    log(f"SUMMARY: {completed} completed, {failed} failed, {leftover} remaining")
    log(f"Log saved to: {LOG_FILE}")


# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()