sync: Auto-sync from acg-guru-5070 at 2026-03-21 16:34:05
Synced files: - Session logs updated - Latest context and credentials - Command/directive updates Machine: acg-guru-5070 Timestamp: 2026-03-21 16:34:05 Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
294
projects/radio-show/audio-processor/gpu_debug_transcribe.py
Normal file
294
projects/radio-show/audio-processor/gpu_debug_transcribe.py
Normal file
@@ -0,0 +1,294 @@
|
||||
#!/usr/bin/env python3
|
||||
"""GPU-monitored batch transcription with diagnostics.
|
||||
|
||||
Monitors GPU health before, during, and after each episode transcription.
|
||||
Logs temperature, power, utilization, and memory to detect what triggers
|
||||
the NVRM rpcSendMessage failure (status 0x00000062).
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import signal
|
||||
import threading
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Per-run monitor logs go here. The path is relative, so the directory is
# created (at import time) under the current working directory.
LOG_DIR = Path("gpu-debug-logs")
LOG_DIR.mkdir(exist_ok=True)
# One log file per run, named after the start time (YYYYmmdd_HHMMSS).
LOG_FILE = LOG_DIR / "gpu_monitor_{}.log".format(
    datetime.now().strftime("%Y%m%d_%H%M%S")
)

# Episodes still waiting to be transcribed, in processing order.
EPISODES = [
    "training-data/episodes/2011-06-04-hr1.mp3",
    "training-data/episodes/2011-09-10-hr1.mp3",
    "training-data/episodes/2014-s6e05.mp3",
    "training-data/episodes/2015-s7e30.mp3",
    "training-data/episodes/2016-s8e42.mp3",
    "training-data/episodes/2017-s9e26.mp3",
    "training-data/episodes/2018-s10e17.mp3",
    "training-data/episodes/2018-s10e21.mp3",
]

# Signals the background monitor loop to stop.
stop_monitor = threading.Event()
|
||||
|
||||
|
||||
def log(msg: str) -> None:
    """Print *msg* with a millisecond timestamp and append it to ``LOG_FILE``.

    The file is reopened in append mode on every call and closed before
    returning, so no log line is left sitting in a long-lived Python buffer.

    Args:
        msg: Text to record. The ``[HH:MM:SS.mmm]`` prefix is added here.
    """
    # %f is microseconds; dropping the last three digits leaves milliseconds.
    ts = datetime.now().strftime("%H:%M:%S.%f")[:-3]
    line = f"[{ts}] {msg}"
    # Flush so the line is not lost in a stdout buffer (e.g. when output is
    # redirected to a file) if the process dies right after this call.
    print(line, flush=True)
    # Pin the encoding instead of relying on the locale default, so logged
    # tool output containing non-ASCII text cannot fail to encode.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")
|
||||
|
||||
|
||||
def gpu_query() -> dict | None:
|
||||
"""Query GPU stats via nvidia-smi. Returns None if GPU is in error state."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["nvidia-smi",
|
||||
"--query-gpu=temperature.gpu,power.draw,utilization.gpu,utilization.memory,"
|
||||
"memory.used,memory.total,clocks.current.sm,clocks.current.memory,"
|
||||
"pstate,fan.speed",
|
||||
"--format=csv,noheader,nounits"],
|
||||
capture_output=True, text=True, timeout=5
|
||||
)
|
||||
if result.returncode != 0:
|
||||
return None
|
||||
parts = [p.strip() for p in result.stdout.strip().split(",")]
|
||||
# Check for ERR! or [N/A] in any field
|
||||
if any("ERR" in p or "[N/A]" in p for p in parts[:4]):
|
||||
return {"error": True, "raw": result.stdout.strip()}
|
||||
return {
|
||||
"temp_c": parts[0],
|
||||
"power_w": parts[1],
|
||||
"gpu_util": parts[2],
|
||||
"mem_util": parts[3],
|
||||
"mem_used_mb": parts[4],
|
||||
"mem_total_mb": parts[5],
|
||||
"sm_clock_mhz": parts[6],
|
||||
"mem_clock_mhz": parts[7],
|
||||
"pstate": parts[8],
|
||||
"fan": parts[9],
|
||||
"error": False,
|
||||
}
|
||||
except (subprocess.TimeoutExpired, Exception) as e:
|
||||
return {"error": True, "raw": str(e)}
|
||||
|
||||
|
||||
def gpu_health_check() -> bool:
    """Report whether a fresh GPU query came back without an error.

    Returns:
        ``False`` (after logging a ``GPU ERROR`` line with the query result)
        when ``gpu_query()`` yields ``None`` or a dict whose ``"error"`` entry
        is truthy; ``True`` otherwise.
    """
    stats = gpu_query()
    healthy = stats is not None and not stats.get("error")
    if not healthy:
        log(f"GPU ERROR: {stats}")
    return healthy
|
||||
|
||||
|
||||
def gpu_status_str(stats: dict) -> str:
|
||||
if stats.get("error"):
|
||||
return f"ERR! raw={stats.get('raw', 'unknown')}"
|
||||
return (f"T={stats['temp_c']}C P={stats['power_w']}W "
|
||||
f"GPU={stats['gpu_util']}% MEM={stats['mem_util']}% "
|
||||
f"VRAM={stats['mem_used_mb']}/{stats['mem_total_mb']}MB "
|
||||
f"SM={stats['sm_clock_mhz']}MHz MEMCLK={stats['mem_clock_mhz']}MHz "
|
||||
f"PState={stats['pstate']} Fan={stats['fan']}")
|
||||
|
||||
|
||||
def _log_recent_nvidia_dmesg() -> None:
    """Log the last five NVIDIA-related kernel error/warning lines from dmesg."""
    try:
        result = subprocess.run(
            # -n makes sudo fail instead of prompting for a password, which
            # would otherwise stall this thread until the timeout expires.
            ["sudo", "-n", "dmesg", "-T", "--level=err,warn"],
            capture_output=True, text=True, timeout=5,
        )
    except Exception as e:
        # Best effort only, but say why the kernel log could not be read.
        log(f"DMESG: unavailable ({type(e).__name__}: {e})")
        return
    if result.returncode != 0:
        log(f"DMESG: unavailable (exit {result.returncode}): {result.stderr.strip()}")
        return
    nvrm_lines = [entry for entry in result.stdout.splitlines()
                  if "NVRM" in entry or "nvidia" in entry.lower()]
    for entry in nvrm_lines[-5:]:
        log(f"DMESG: {entry}")


def monitor_thread(interval: float = 2.0):
    """Background thread that logs GPU stats at regular intervals.

    Runs until ``stop_monitor`` is set. Each pass logs one ``MONITOR`` line.
    When the query reports an error, or nvidia-smi itself exits non-zero
    (``gpu_query()`` returns ``None``), it also logs the most recent
    NVIDIA-related kernel messages.

    Args:
        interval: Seconds to wait between queries.
    """
    while not stop_monitor.is_set():
        stats = gpu_query()
        if stats is None:
            # A non-zero nvidia-smi exit is itself a failure signal; logging
            # nothing here would hide the event this monitor exists to catch.
            log("MONITOR: ERR! nvidia-smi exited with a non-zero status")
        else:
            log(f"MONITOR: {gpu_status_str(stats)}")

        if stats is None or stats.get("error"):
            log("MONITOR: GPU ENTERED ERROR STATE!")
            # Check dmesg for the smoking gun
            _log_recent_nvidia_dmesg()

        # Event.wait doubles as an interruptible sleep: it returns early once
        # stop_monitor is set.
        stop_monitor.wait(interval)
|
||||
|
||||
|
||||
def check_runtime_d3(pci_addr: str = "0000:02:00.0"):
    """Check and log Runtime D3 power management status.

    Logs the NVIDIA driver's ``power`` file under ``/proc`` (when present)
    and the PCI ``runtime_status``, ``control`` and ``runtime_enabled``
    attributes under ``/sys`` for the given device. Failures are logged and
    never raised; each attribute is reported independently, so one missing
    file does not hide the others.

    Args:
        pci_addr: PCI address of the GPU as it appears in the ``/proc`` and
            ``/sys`` paths. Defaults to the address this script was written
            for.
    """
    power_file = Path("/proc/driver/nvidia/gpus") / pci_addr / "power"
    try:
        if power_file.exists():
            log(f"GPU Power Management:\n{power_file.read_text()}")
    except (OSError, UnicodeError) as e:
        log(f"Power check error: {e}")

    # Read the sysfs attributes directly instead of spawning `cat` for each.
    sysfs_power = Path("/sys/bus/pci/devices") / pci_addr / "power"
    attributes = (
        ("PCI runtime_status", "runtime_status"),
        ("PCI power control", "control"),
        ("PCI runtime_enabled", "runtime_enabled"),
    )
    for label, attr in attributes:
        try:
            value = (sysfs_power / attr).read_text().strip()
        except (OSError, UnicodeError) as e:
            # Say why the value is missing rather than logging an empty field.
            value = f"unavailable ({e})"
        log(f"{label}: {value}")
|
||||
|
||||
|
||||
def check_nvidia_persistence():
    """Log what nvidia-smi prints for the ``persistence_mode`` query.

    Any exception raised along the way is logged as a
    ``Persistence check error`` instead of propagating.
    """
    command = [
        "nvidia-smi",
        "--query-gpu=persistence_mode",
        "--format=csv,noheader",
    ]
    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, timeout=5
        )
        mode = completed.stdout.strip()
        log(f"Persistence mode: {mode}")
    except Exception as e:
        log(f"Persistence check error: {e}")
|
||||
|
||||
|
||||
def _cuda_preflight() -> bool:
    """Run a tiny CUDA matmul, log the outcome, and return whether it worked."""
    log("PRE-FLIGHT: Testing CUDA...")
    try:
        import torch
        if not torch.cuda.is_available():
            log("PRE-FLIGHT FAIL: torch.cuda.is_available() = False")
            return False
        # Small allocation test
        x = torch.randn(100, 100, device="cuda")
        y = x @ x
        del x, y
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        log(f"PRE-FLIGHT: CUDA OK, allocated={torch.cuda.memory_allocated() / 1024**2:.0f}MB")
    except Exception as e:
        log(f"PRE-FLIGHT FAIL: CUDA test error: {e}")
        return False
    return True


def transcribe_one(episode_path: str, cooldown_s: float = 10.0) -> bool:
    """Transcribe a single episode with GPU health monitoring.

    Steps: skip if ``transcript.json`` already exists, query the GPU, run a
    small CUDA test, transcribe and save, query the GPU again, clear the CUDA
    cache, then sleep for the cool-down period.

    Args:
        episode_path: Path of the audio file. Its stem names the output
            directory under ``training-data/transcripts``.
        cooldown_s: Seconds to idle after a successful transcription.

    Returns:
        ``True`` when the episode was already transcribed or was transcribed
        and saved now (even if the GPU reports an error afterwards; that is
        logged). ``False`` when a pre-flight check fails or transcription
        raises.
    """
    name = Path(episode_path).stem
    output_dir = Path("training-data/transcripts") / name

    # The file can only exist if its directory does, so one check suffices.
    if (output_dir / "transcript.json").exists():
        log(f"SKIP: {name} already transcribed")
        return True

    # Pre-flight GPU check
    log(f"PRE-FLIGHT: Checking GPU before {name}")
    stats = gpu_query()
    if not stats or stats.get("error"):
        log(f"PRE-FLIGHT FAIL: GPU already in error state! Stats: {stats}")
        return False
    log(f"PRE-FLIGHT: {gpu_status_str(stats)}")

    # Quick CUDA test
    if not _cuda_preflight():
        return False

    # Transcribe
    log(f"START: {name} ({episode_path})")
    # Monotonic clock: the measured duration is immune to wall-clock changes.
    start_time = time.monotonic()

    try:
        from src.transcriber import transcribe
        transcript = transcribe(episode_path)
        transcript.save(output_dir)
        elapsed = time.monotonic() - start_time
        log(f"DONE: {name} in {elapsed:.1f}s ({elapsed/60:.1f}min), "
            f"{len(transcript.segments)} segments")
    except Exception as e:
        elapsed = time.monotonic() - start_time
        log(f"FAIL: {name} after {elapsed:.1f}s: {type(e).__name__}: {e}")

        # Post-failure GPU check
        stats = gpu_query()
        log(f"POST-FAIL: {gpu_status_str(stats) if stats else 'query failed'}")
        return False

    # Post-transcription GPU check
    stats = gpu_query()
    if stats and not stats.get("error"):
        log(f"POST: {gpu_status_str(stats)}")
    else:
        log(f"POST: GPU entered error state after transcription! {stats}")

    # Cool-down: clear CUDA cache, let GPU idle briefly
    try:
        import torch
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    except Exception as e:
        # Still best effort, but a CUDA error at this point is exactly the
        # kind of evidence this script collects, so record it.
        log(f"COOLDOWN: CUDA cleanup failed: {type(e).__name__}: {e}")

    log(f"COOLDOWN: Waiting {cooldown_s:g}s between episodes...")
    time.sleep(cooldown_s)

    return True
|
||||
|
||||
|
||||
def main() -> int:
    """Run the monitored batch transcription over ``EPISODES``.

    Logs driver and power-management information, aborts if the GPU is
    already unhealthy, starts the background monitor, then transcribes every
    episode that has no ``transcript.json`` yet, stopping at the first
    failure.

    Returns:
        ``0`` when no episode failed, ``1`` when the batch stopped on a
        failure.

    Raises:
        SystemExit: With status 1 when the initial GPU query reports an error.
    """
    log("=" * 60)
    log("GPU Debug Batch Transcription")
    log(f"Driver: {subprocess.getoutput('nvidia-smi --query-gpu=driver_version --format=csv,noheader')}")
    log(f"CUDA version: {subprocess.getoutput('nvidia-smi --query-gpu=cuda_version --format=csv,noheader 2>/dev/null') or 'N/A'}")
    log("=" * 60)

    # Check power management
    check_runtime_d3()
    check_nvidia_persistence()

    # Initial GPU state
    stats = gpu_query()
    if not stats or stats.get("error"):
        log(f"ABORT: GPU already in error state at startup: {stats}")
        sys.exit(1)
    log(f"INITIAL: {gpu_status_str(stats)}")

    # Start background monitor (every 5 seconds during transcription)
    monitor = threading.Thread(target=monitor_thread, args=(5.0,), daemon=True)
    monitor.start()

    remaining = []
    completed = 0
    failed = 0
    try:
        # Filter to only episodes that need transcription
        for ep in EPISODES:
            name = Path(ep).stem
            out = Path(f"training-data/transcripts/{name}/transcript.json")
            if out.exists():
                log(f"ALREADY DONE: {name}")
            else:
                remaining.append(ep)

        log(f"QUEUE: {len(remaining)} episodes to transcribe")

        for ep in remaining:
            if transcribe_one(ep):
                completed += 1
                continue

            failed += 1
            log(f"STOPPING: GPU failure detected after {completed} episodes, {failed} failed")
            # Log final state
            stats = gpu_query()
            log(f"FINAL: {gpu_status_str(stats) if stats else 'query failed'}")
            break
    finally:
        # Stop the monitor even if the loop raised, and wait for its current
        # pass to finish so its output does not interleave with the summary.
        # The timeout keeps a stuck monitor from blocking exit.
        stop_monitor.set()
        monitor.join(timeout=15)

    log(f"SUMMARY: {completed} completed, {failed} failed, "
        f"{len(remaining) - completed - failed} remaining")
    log(f"Log saved to: {LOG_FILE}")

    # A non-zero status lets wrapper scripts notice that the batch stopped early.
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user