#!/usr/bin/env python3 """GPU-monitored batch transcription with diagnostics. Monitors GPU health before, during, and after each episode transcription. Logs temperature, power, utilization, and memory to detect what triggers the NVRM rpcSendMessage failure (status 0x00000062). """ import subprocess import sys import time import signal import threading import os from datetime import datetime from pathlib import Path LOG_DIR = Path("gpu-debug-logs") LOG_DIR.mkdir(exist_ok=True) LOG_FILE = LOG_DIR / f"gpu_monitor_{datetime.now():%Y%m%d_%H%M%S}.log" # Episodes to transcribe (remaining ones) EPISODES = [ "training-data/episodes/2011-06-04-hr1.mp3", "training-data/episodes/2011-09-10-hr1.mp3", "training-data/episodes/2014-s6e05.mp3", "training-data/episodes/2015-s7e30.mp3", "training-data/episodes/2016-s8e42.mp3", "training-data/episodes/2017-s9e26.mp3", "training-data/episodes/2018-s10e17.mp3", "training-data/episodes/2018-s10e21.mp3", ] stop_monitor = threading.Event() def log(msg: str): ts = datetime.now().strftime("%H:%M:%S.%f")[:-3] line = f"[{ts}] {msg}" print(line) with open(LOG_FILE, "a") as f: f.write(line + "\n") def gpu_query() -> dict | None: """Query GPU stats via nvidia-smi. Returns None if GPU is in error state.""" try: result = subprocess.run( ["nvidia-smi", "--query-gpu=temperature.gpu,power.draw,utilization.gpu,utilization.memory," "memory.used,memory.total,clocks.current.sm,clocks.current.memory," "pstate,fan.speed", "--format=csv,noheader,nounits"], capture_output=True, text=True, timeout=5 ) if result.returncode != 0: return None parts = [p.strip() for p in result.stdout.strip().split(",")] # Check for ERR! or [N/A] in any field if any("ERR" in p or "[N/A]" in p for p in parts[:4]): return {"error": True, "raw": result.stdout.strip()} return { "temp_c": parts[0], "power_w": parts[1], "gpu_util": parts[2], "mem_util": parts[3], "mem_used_mb": parts[4], "mem_total_mb": parts[5], "sm_clock_mhz": parts[6], "mem_clock_mhz": parts[7], "pstate": parts[8], "fan": parts[9], "error": False, } except (subprocess.TimeoutExpired, Exception) as e: return {"error": True, "raw": str(e)} def gpu_health_check() -> bool: """Returns True if GPU is healthy.""" stats = gpu_query() if stats is None or stats.get("error"): log(f"GPU ERROR: {stats}") return False return True def gpu_status_str(stats: dict) -> str: if stats.get("error"): return f"ERR! raw={stats.get('raw', 'unknown')}" return (f"T={stats['temp_c']}C P={stats['power_w']}W " f"GPU={stats['gpu_util']}% MEM={stats['mem_util']}% " f"VRAM={stats['mem_used_mb']}/{stats['mem_total_mb']}MB " f"SM={stats['sm_clock_mhz']}MHz MEMCLK={stats['mem_clock_mhz']}MHz " f"PState={stats['pstate']} Fan={stats['fan']}") def monitor_thread(interval: float = 2.0): """Background thread that logs GPU stats at regular intervals.""" while not stop_monitor.is_set(): stats = gpu_query() if stats: log(f"MONITOR: {gpu_status_str(stats)}") if stats.get("error"): log("MONITOR: GPU ENTERED ERROR STATE!") # Check dmesg for the smoking gun try: result = subprocess.run( ["sudo", "dmesg", "-T", "--level=err,warn"], capture_output=True, text=True, timeout=5 ) nvrm_lines = [l for l in result.stdout.splitlines() if "NVRM" in l or "nvidia" in l.lower()] for line in nvrm_lines[-5:]: log(f"DMESG: {line}") except Exception: pass stop_monitor.wait(interval) def check_runtime_d3(): """Check and log Runtime D3 power management status.""" try: power_file = Path("/proc/driver/nvidia/gpus/0000:02:00.0/power") if power_file.exists(): log(f"GPU Power Management:\n{power_file.read_text()}") # Check if dynamic power management is enabled result = subprocess.run( ["cat", "/sys/bus/pci/devices/0000:02:00.0/power/runtime_status"], capture_output=True, text=True, timeout=5 ) log(f"PCI runtime_status: {result.stdout.strip()}") result = subprocess.run( ["cat", "/sys/bus/pci/devices/0000:02:00.0/power/control"], capture_output=True, text=True, timeout=5 ) log(f"PCI power control: {result.stdout.strip()}") result = subprocess.run( ["cat", "/sys/bus/pci/devices/0000:02:00.0/power/runtime_enabled"], capture_output=True, text=True, timeout=5 ) log(f"PCI runtime_enabled: {result.stdout.strip()}") except Exception as e: log(f"Power check error: {e}") def check_nvidia_persistence(): """Check persistence mode.""" try: result = subprocess.run( ["nvidia-smi", "--query-gpu=persistence_mode", "--format=csv,noheader"], capture_output=True, text=True, timeout=5 ) log(f"Persistence mode: {result.stdout.strip()}") except Exception as e: log(f"Persistence check error: {e}") def transcribe_one(episode_path: str) -> bool: """Transcribe a single episode with GPU health monitoring. Returns success.""" name = Path(episode_path).stem output_dir = f"training-data/transcripts/{name}" if Path(output_dir).exists() and (Path(output_dir) / "transcript.json").exists(): log(f"SKIP: {name} already transcribed") return True # Pre-flight GPU check log(f"PRE-FLIGHT: Checking GPU before {name}") stats = gpu_query() if not stats or stats.get("error"): log(f"PRE-FLIGHT FAIL: GPU already in error state! Stats: {stats}") return False log(f"PRE-FLIGHT: {gpu_status_str(stats)}") # Quick CUDA test log("PRE-FLIGHT: Testing CUDA...") try: import torch if not torch.cuda.is_available(): log("PRE-FLIGHT FAIL: torch.cuda.is_available() = False") return False # Small allocation test x = torch.randn(100, 100, device="cuda") y = x @ x del x, y torch.cuda.synchronize() torch.cuda.empty_cache() log(f"PRE-FLIGHT: CUDA OK, allocated={torch.cuda.memory_allocated() / 1024**2:.0f}MB") except Exception as e: log(f"PRE-FLIGHT FAIL: CUDA test error: {e}") return False # Transcribe log(f"START: {name} ({episode_path})") start_time = time.time() try: from src.transcriber import transcribe transcript = transcribe(episode_path) transcript.save(Path(output_dir)) elapsed = time.time() - start_time log(f"DONE: {name} in {elapsed:.1f}s ({elapsed/60:.1f}min), " f"{len(transcript.segments)} segments") except Exception as e: elapsed = time.time() - start_time log(f"FAIL: {name} after {elapsed:.1f}s: {type(e).__name__}: {e}") # Post-failure GPU check stats = gpu_query() log(f"POST-FAIL: {gpu_status_str(stats) if stats else 'query failed'}") return False # Post-transcription GPU check stats = gpu_query() if stats and not stats.get("error"): log(f"POST: {gpu_status_str(stats)}") else: log(f"POST: GPU entered error state after transcription! {stats}") # Cool-down: clear CUDA cache, let GPU idle briefly try: import torch torch.cuda.empty_cache() torch.cuda.synchronize() except Exception: pass log("COOLDOWN: Waiting 10s between episodes...") time.sleep(10) return True def main(): log("=" * 60) log("GPU Debug Batch Transcription") log(f"Driver: {subprocess.getoutput('nvidia-smi --query-gpu=driver_version --format=csv,noheader')}") log(f"CUDA version: {subprocess.getoutput('nvidia-smi --query-gpu=cuda_version --format=csv,noheader 2>/dev/null') or 'N/A'}") log("=" * 60) # Check power management check_runtime_d3() check_nvidia_persistence() # Initial GPU state stats = gpu_query() if not stats or stats.get("error"): log(f"ABORT: GPU already in error state at startup: {stats}") sys.exit(1) log(f"INITIAL: {gpu_status_str(stats)}") # Start background monitor (every 5 seconds during transcription) monitor = threading.Thread(target=monitor_thread, args=(5.0,), daemon=True) monitor.start() # Filter to only episodes that need transcription remaining = [] for ep in EPISODES: name = Path(ep).stem out = Path(f"training-data/transcripts/{name}/transcript.json") if out.exists(): log(f"ALREADY DONE: {name}") else: remaining.append(ep) log(f"QUEUE: {len(remaining)} episodes to transcribe") completed = 0 failed = 0 for ep in remaining: success = transcribe_one(ep) if success: completed += 1 else: failed += 1 log(f"STOPPING: GPU failure detected after {completed} episodes, {failed} failed") # Log final state stats = gpu_query() log(f"FINAL: {gpu_status_str(stats) if stats else 'query failed'}") break stop_monitor.set() log(f"SUMMARY: {completed} completed, {failed} failed, " f"{len(remaining) - completed - failed} remaining") log(f"Log saved to: {LOG_FILE}") if __name__ == "__main__": main()