Synced files: - Session logs updated - Latest context and credentials - Command/directive updates Machine: acg-guru-5070 Timestamp: 2026-03-21 16:34:05 Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
295 lines
9.7 KiB
Python
295 lines
9.7 KiB
Python
#!/usr/bin/env python3
|
|
"""GPU-monitored batch transcription with diagnostics.
|
|
|
|
Monitors GPU health before, during, and after each episode transcription.
|
|
Logs temperature, power, utilization, and memory to detect what triggers
|
|
the NVRM rpcSendMessage failure (status 0x00000062).
|
|
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import signal
|
|
import threading
|
|
import os
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Directory (relative to the current working directory) that receives the
# diagnostic logs. It is created as soon as this module is loaded.
LOG_DIR: Path = Path("gpu-debug-logs")
LOG_DIR.mkdir(exist_ok=True)

# One log file per run; the name carries the timestamp taken at module load.
LOG_FILE: Path = LOG_DIR / f"gpu_monitor_{datetime.now():%Y%m%d_%H%M%S}.log"

# Episodes to transcribe (remaining ones)
EPISODES: list[str] = [
    "training-data/episodes/2011-06-04-hr1.mp3",
    "training-data/episodes/2011-09-10-hr1.mp3",
    "training-data/episodes/2014-s6e05.mp3",
    "training-data/episodes/2015-s7e30.mp3",
    "training-data/episodes/2016-s8e42.mp3",
    "training-data/episodes/2017-s9e26.mp3",
    "training-data/episodes/2018-s10e17.mp3",
    "training-data/episodes/2018-s10e21.mp3",
]

# Set to ask the background GPU monitor loop to exit.
stop_monitor: threading.Event = threading.Event()
|
|
|
|
|
|
# log() is called from the main thread and from the background monitor
# thread; the lock keeps each console line and its file record intact and in
# the same order in both places.
_LOG_LOCK = threading.Lock()


def log(msg: str) -> None:
    """Print a timestamped message and append it to LOG_FILE.

    Args:
        msg: Text to record. It is prefixed with the current local time
            (HH:MM:SS.mmm) in square brackets.
    """
    with _LOG_LOCK:
        # Take the timestamp under the lock so records are written in
        # timestamp order.
        ts = datetime.now().strftime("%H:%M:%S.%f")[:-3]
        line = f"[{ts}] {msg}"
        print(line)
        # Explicit encoding: messages embed raw tool output, and the default
        # would otherwise depend on the locale.
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
|
|
|
|
|
|
def gpu_query() -> dict | None:
|
|
"""Query GPU stats via nvidia-smi. Returns None if GPU is in error state."""
|
|
try:
|
|
result = subprocess.run(
|
|
["nvidia-smi",
|
|
"--query-gpu=temperature.gpu,power.draw,utilization.gpu,utilization.memory,"
|
|
"memory.used,memory.total,clocks.current.sm,clocks.current.memory,"
|
|
"pstate,fan.speed",
|
|
"--format=csv,noheader,nounits"],
|
|
capture_output=True, text=True, timeout=5
|
|
)
|
|
if result.returncode != 0:
|
|
return None
|
|
parts = [p.strip() for p in result.stdout.strip().split(",")]
|
|
# Check for ERR! or [N/A] in any field
|
|
if any("ERR" in p or "[N/A]" in p for p in parts[:4]):
|
|
return {"error": True, "raw": result.stdout.strip()}
|
|
return {
|
|
"temp_c": parts[0],
|
|
"power_w": parts[1],
|
|
"gpu_util": parts[2],
|
|
"mem_util": parts[3],
|
|
"mem_used_mb": parts[4],
|
|
"mem_total_mb": parts[5],
|
|
"sm_clock_mhz": parts[6],
|
|
"mem_clock_mhz": parts[7],
|
|
"pstate": parts[8],
|
|
"fan": parts[9],
|
|
"error": False,
|
|
}
|
|
except (subprocess.TimeoutExpired, Exception) as e:
|
|
return {"error": True, "raw": str(e)}
|
|
|
|
|
|
def gpu_health_check() -> bool:
    """Query the GPU once and report whether it looks healthy.

    Returns:
        True when gpu_query() yields stats without the error flag. False
        when the query returns None or an error dict; in that case the
        result is also written to the log.
    """
    snapshot = gpu_query()
    healthy = snapshot is not None and not snapshot.get("error")
    if not healthy:
        log(f"GPU ERROR: {snapshot}")
    return healthy
|
|
|
|
|
|
def gpu_status_str(stats: dict) -> str:
    """Render a stats dict as a single log-friendly line.

    Args:
        stats: A dict carrying an "error" flag. Without the flag set, the
            keys temp_c, power_w, gpu_util, mem_util, mem_used_mb,
            mem_total_mb, sm_clock_mhz, mem_clock_mhz, pstate and fan are
            required. With the flag set, only the optional "raw" key is read.

    Returns:
        The formatted status line, or an "ERR! raw=..." line for an error
        dict ("unknown" when no raw text is present).
    """
    if not stats.get("error"):
        thermal = f"T={stats['temp_c']}C P={stats['power_w']}W "
        load = f"GPU={stats['gpu_util']}% MEM={stats['mem_util']}% "
        vram = f"VRAM={stats['mem_used_mb']}/{stats['mem_total_mb']}MB "
        clocks = f"SM={stats['sm_clock_mhz']}MHz MEMCLK={stats['mem_clock_mhz']}MHz "
        tail = f"PState={stats['pstate']} Fan={stats['fan']}"
        return thermal + load + vram + clocks + tail

    return f"ERR! raw={stats.get('raw', 'unknown')}"
|
|
|
|
|
|
def _log_recent_nvidia_dmesg(max_lines: int = 5) -> None:
    """Log the last few NVIDIA-related warning/error lines from dmesg."""
    try:
        result = subprocess.run(
            # -n: never prompt for a password; a prompt would stall the
            # monitor until the timeout on every iteration.
            ["sudo", "-n", "dmesg", "-T", "--level=err,warn"],
            capture_output=True, text=True, timeout=5,
        )
    except (subprocess.SubprocessError, OSError, ValueError) as e:
        # Best effort: the dmesg excerpt is a bonus, never fatal.
        log(f"MONITOR: dmesg check failed: {e}")
        return

    if result.returncode != 0:
        log(f"MONITOR: dmesg exited with status {result.returncode}: "
            f"{result.stderr.strip()}")
        return

    nvrm_lines = [entry for entry in result.stdout.splitlines()
                  if "NVRM" in entry or "nvidia" in entry.lower()]
    for entry in nvrm_lines[-max_lines:]:
        log(f"DMESG: {entry}")


def monitor_thread(interval: float = 2.0):
    """Background thread that logs GPU stats at regular intervals.

    Loops until ``stop_monitor`` is set. Each iteration logs one MONITOR
    line; when the GPU query reports an error, or nvidia-smi itself fails,
    the most recent NVIDIA-related dmesg lines are logged as well.

    Args:
        interval: Seconds to wait between queries. The wait ends early when
            ``stop_monitor`` is set.
    """
    while not stop_monitor.is_set():
        stats = gpu_query()

        if stats is None:
            # gpu_query() returns None when nvidia-smi exits non-zero. That
            # is a failure too and must not pass silently.
            log("MONITOR: ERR! nvidia-smi query failed (non-zero exit status)")
            in_error = True
        else:
            log(f"MONITOR: {gpu_status_str(stats)}")
            in_error = bool(stats.get("error"))

        if in_error:
            log("MONITOR: GPU ENTERED ERROR STATE!")
            # Check dmesg for the smoking gun
            _log_recent_nvidia_dmesg()

        stop_monitor.wait(interval)
|
|
|
|
|
|
def check_runtime_d3(pci_addr: str = "0000:02:00.0") -> None:
    """Check and log Runtime D3 power management status.

    Logs the NVIDIA driver's power file under /proc (when it exists) and the
    PCI runtime power-management attributes under /sys for one device. Each
    item is read independently, so one unreadable file does not hide the
    others.

    Args:
        pci_addr: PCI address of the GPU as it appears in the /proc and /sys
            paths. Defaults to the address this script was written for.
    """
    power_file = Path(f"/proc/driver/nvidia/gpus/{pci_addr}/power")
    try:
        if power_file.exists():
            log(f"GPU Power Management:\n{power_file.read_text()}")
    except (OSError, ValueError) as e:
        log(f"Power check error: {e}")

    # Check if dynamic power management is enabled
    sysfs_power = Path(f"/sys/bus/pci/devices/{pci_addr}/power")
    attributes = (
        ("runtime_status", "PCI runtime_status"),
        ("control", "PCI power control"),
        ("runtime_enabled", "PCI runtime_enabled"),
    )
    for attr, label in attributes:
        try:
            value = (sysfs_power / attr).read_text().strip()
        except (OSError, ValueError) as e:
            # Say why the value is missing instead of logging an empty field.
            value = f"unavailable ({e})"
        log(f"{label}: {value}")
|
|
|
|
|
|
def check_nvidia_persistence():
    """Log the persistence mode reported by nvidia-smi.

    Any failure while running the command or logging its output is reported
    as a "Persistence check error" log line rather than raised.
    """
    command = [
        "nvidia-smi",
        "--query-gpu=persistence_mode",
        "--format=csv,noheader",
    ]
    try:
        proc = subprocess.run(command, capture_output=True, text=True, timeout=5)
        mode = proc.stdout.strip()
        log(f"Persistence mode: {mode}")
    except Exception as err:
        log(f"Persistence check error: {err}")
|
|
|
|
|
|
def transcribe_one(episode_path: str, cooldown_s: float = 10.0) -> bool:
    """Transcribe a single episode with GPU health monitoring.

    Steps: skip if a transcript already exists, pre-flight GPU query, small
    CUDA allocation test, transcription, post-run GPU query, CUDA cache
    cleanup, then a cool-down pause.

    Args:
        episode_path: Path to the audio file. Its stem names the output
            directory under training-data/transcripts/.
        cooldown_s: Seconds to idle after a successful transcription.

    Returns:
        True if the episode was already transcribed or was transcribed and
        saved. False if a pre-flight check failed or transcription raised.
        A GPU error seen only in the post-run query is logged and does not
        change the result.
    """
    name = Path(episode_path).stem
    output_dir = Path("training-data/transcripts") / name

    # The transcript file can only exist if its directory does, so one check
    # is enough.
    if (output_dir / "transcript.json").exists():
        log(f"SKIP: {name} already transcribed")
        return True

    # Pre-flight GPU check
    log(f"PRE-FLIGHT: Checking GPU before {name}")
    stats = gpu_query()
    if not stats or stats.get("error"):
        log(f"PRE-FLIGHT FAIL: GPU already in error state! Stats: {stats}")
        return False
    log(f"PRE-FLIGHT: {gpu_status_str(stats)}")

    # Quick CUDA test
    log("PRE-FLIGHT: Testing CUDA...")
    try:
        import torch

        if not torch.cuda.is_available():
            log("PRE-FLIGHT FAIL: torch.cuda.is_available() = False")
            return False
        # Small allocation test
        x = torch.randn(100, 100, device="cuda")
        y = x @ x
        del x, y
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        log(f"PRE-FLIGHT: CUDA OK, allocated={torch.cuda.memory_allocated() / 1024**2:.0f}MB")
    except Exception as e:
        log(f"PRE-FLIGHT FAIL: CUDA test error: {e}")
        return False

    # Transcribe
    log(f"START: {name} ({episode_path})")
    # Monotonic clock: the measured duration is immune to wall-clock
    # adjustments during a long transcription.
    start_time = time.monotonic()

    try:
        from src.transcriber import transcribe

        transcript = transcribe(episode_path)
        transcript.save(output_dir)
        elapsed = time.monotonic() - start_time
        log(f"DONE: {name} in {elapsed:.1f}s ({elapsed/60:.1f}min), "
            f"{len(transcript.segments)} segments")
    except Exception as e:
        elapsed = time.monotonic() - start_time
        log(f"FAIL: {name} after {elapsed:.1f}s: {type(e).__name__}: {e}")

        # Post-failure GPU check
        stats = gpu_query()
        log(f"POST-FAIL: {gpu_status_str(stats) if stats else 'query failed'}")
        return False

    # Post-transcription GPU check
    stats = gpu_query()
    if stats and not stats.get("error"):
        log(f"POST: {gpu_status_str(stats)}")
    else:
        log(f"POST: GPU entered error state after transcription! {stats}")

    # Cool-down: clear CUDA cache, let GPU idle briefly
    try:
        import torch

        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    except Exception as e:
        # Still best effort, but a failing CUDA call here is itself a useful
        # diagnostic, so record it instead of discarding it.
        log(f"COOLDOWN: CUDA cleanup failed: {type(e).__name__}: {e}")

    log(f"COOLDOWN: Waiting {cooldown_s:g}s between episodes...")
    time.sleep(cooldown_s)

    return True
|
|
|
|
|
|
def main():
    """Run the monitored batch transcription.

    Logs driver and power-management information, aborts with exit status 1
    if the GPU is already in an error state, then transcribes every episode
    in EPISODES that has no transcript yet while a background thread logs
    GPU stats. The run stops at the first failed episode. The monitor is
    always stopped and the summary always logged, even if the run is
    interrupted or raises.
    """
    log("=" * 60)
    log("GPU Debug Batch Transcription")
    log(f"Driver: {subprocess.getoutput('nvidia-smi --query-gpu=driver_version --format=csv,noheader')}")
    log(f"CUDA version: {subprocess.getoutput('nvidia-smi --query-gpu=cuda_version --format=csv,noheader 2>/dev/null') or 'N/A'}")
    log("=" * 60)

    # Check power management
    check_runtime_d3()
    check_nvidia_persistence()

    # Initial GPU state
    stats = gpu_query()
    if not stats or stats.get("error"):
        log(f"ABORT: GPU already in error state at startup: {stats}")
        sys.exit(1)
    log(f"INITIAL: {gpu_status_str(stats)}")

    # Start background monitor (every 5 seconds during transcription)
    monitor = threading.Thread(target=monitor_thread, args=(5.0,), daemon=True)
    monitor.start()

    remaining = []
    completed = 0
    failed = 0
    try:
        # Filter to only episodes that need transcription
        for ep in EPISODES:
            name = Path(ep).stem
            out = Path(f"training-data/transcripts/{name}/transcript.json")
            if out.exists():
                log(f"ALREADY DONE: {name}")
            else:
                remaining.append(ep)

        log(f"QUEUE: {len(remaining)} episodes to transcribe")

        for ep in remaining:
            if transcribe_one(ep):
                completed += 1
                continue

            failed += 1
            log(f"STOPPING: GPU failure detected after {completed} episodes, {failed} failed")
            # Log final state
            stats = gpu_query()
            log(f"FINAL: {gpu_status_str(stats) if stats else 'query failed'}")
            break
    finally:
        stop_monitor.set()
        # Wait for the monitor to finish its current iteration so its last
        # lines land before the summary. The wait is bounded because the
        # thread is a daemon and must never block shutdown.
        monitor.join(timeout=15)

        log(f"SUMMARY: {completed} completed, {failed} failed, "
            f"{len(remaining) - completed - failed} remaining")
        log(f"Log saved to: {LOG_FILE}")


if __name__ == "__main__":
    main()
|