sync: Auto-sync from acg-guru-5070 at 2026-03-21 16:34:05

Synced files:
- Session logs updated
- Latest context and credentials
- Command/directive updates

Machine: acg-guru-5070
Timestamp: 2026-03-21 16:34:05

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-03-21 16:34:05 -07:00
parent 37aaa6660b
commit a29d00c6b2
2 changed files with 539 additions and 0 deletions

View File

@@ -0,0 +1,294 @@
#!/usr/bin/env python3
"""GPU-monitored batch transcription with diagnostics.
Monitors GPU health before, during, and after each episode transcription.
Logs temperature, power, utilization, and memory to detect what triggers
the NVRM rpcSendMessage failure (status 0x00000062).
"""
import subprocess
import sys
import time
import signal
import threading
import os
from datetime import datetime
from pathlib import Path
# Directory that collects the GPU diagnostic logs; created on import if absent.
LOG_DIR = Path("gpu-debug-logs")
LOG_DIR.mkdir(exist_ok=True)

# One log file per run, named after the moment the module was loaded.
LOG_FILE = LOG_DIR / "gpu_monitor_{:%Y%m%d_%H%M%S}.log".format(datetime.now())

# Episodes to transcribe (remaining ones)
EPISODES = [
    f"training-data/episodes/{stem}.mp3"
    for stem in (
        "2011-06-04-hr1",
        "2011-09-10-hr1",
        "2014-s6e05",
        "2015-s7e30",
        "2016-s8e42",
        "2017-s9e26",
        "2018-s10e17",
        "2018-s10e21",
    )
]

# Set by main() to tell the background monitor loop to stop polling.
stop_monitor = threading.Event()
def log(msg: str):
    """Print *msg* with a millisecond timestamp and append it to LOG_FILE.

    The file is opened in append mode and closed again on every call.
    """
    # %f yields microseconds; dropping the last three digits leaves milliseconds.
    stamp = f"{datetime.now():%H:%M:%S.%f}"[:-3]
    entry = "[{}] {}".format(stamp, msg)
    print(entry)
    with open(LOG_FILE, "a") as handle:
        handle.write(entry + "\n")
def gpu_query() -> dict | None:
"""Query GPU stats via nvidia-smi. Returns None if GPU is in error state."""
try:
result = subprocess.run(
["nvidia-smi",
"--query-gpu=temperature.gpu,power.draw,utilization.gpu,utilization.memory,"
"memory.used,memory.total,clocks.current.sm,clocks.current.memory,"
"pstate,fan.speed",
"--format=csv,noheader,nounits"],
capture_output=True, text=True, timeout=5
)
if result.returncode != 0:
return None
parts = [p.strip() for p in result.stdout.strip().split(",")]
# Check for ERR! or [N/A] in any field
if any("ERR" in p or "[N/A]" in p for p in parts[:4]):
return {"error": True, "raw": result.stdout.strip()}
return {
"temp_c": parts[0],
"power_w": parts[1],
"gpu_util": parts[2],
"mem_util": parts[3],
"mem_used_mb": parts[4],
"mem_total_mb": parts[5],
"sm_clock_mhz": parts[6],
"mem_clock_mhz": parts[7],
"pstate": parts[8],
"fan": parts[9],
"error": False,
}
except (subprocess.TimeoutExpired, Exception) as e:
return {"error": True, "raw": str(e)}
def gpu_health_check() -> bool:
    """Probe the GPU once; log the failing stats and return False if unhealthy.

    Unhealthy means gpu_query() returned None or a dict with a truthy
    "error" entry.
    """
    snapshot = gpu_query()
    healthy = snapshot is not None and not snapshot.get("error")
    if not healthy:
        log(f"GPU ERROR: {snapshot}")
    return healthy
def gpu_status_str(stats: dict) -> str:
    """Render a stats dict from gpu_query() as a single log-friendly line.

    Error dicts become ``ERR! raw=...``; healthy dicts are rendered with
    temperature, power, utilisation, VRAM, clocks, P-state and fan fields.
    """
    if stats.get("error"):
        raw = stats.get("raw", "unknown")
        return "ERR! raw={}".format(raw)

    template = (
        "T={temp_c}C P={power_w}W "
        "GPU={gpu_util}% MEM={mem_util}% "
        "VRAM={mem_used_mb}/{mem_total_mb}MB "
        "SM={sm_clock_mhz}MHz MEMCLK={mem_clock_mhz}MHz "
        "PState={pstate} Fan={fan}"
    )
    return template.format_map(stats)
def _log_recent_nvidia_dmesg():
    """Log the last five NVRM/nvidia lines from the kernel err/warn log."""
    try:
        result = subprocess.run(
            ["sudo", "dmesg", "-T", "--level=err,warn"],
            capture_output=True, text=True, timeout=5
        )
    except Exception as e:
        # Best-effort diagnostics: never let a dmesg problem stop monitoring,
        # but record why the kernel log is missing from this report.
        log(f"MONITOR: dmesg unavailable: {type(e).__name__}: {e}")
        return
    nvrm_lines = [line for line in result.stdout.splitlines()
                  if "NVRM" in line or "nvidia" in line.lower()]
    for line in nvrm_lines[-5:]:
        log(f"DMESG: {line}")


def monitor_thread(interval: float = 2.0):
    """Background loop that logs GPU stats until ``stop_monitor`` is set.

    Args:
        interval: Seconds to wait between polls.

    Every poll is logged. When the query yields no stats at all (None) or an
    error dict, the error state is flagged and recent NVIDIA kernel messages
    are captured from dmesg.
    """
    while not stop_monitor.is_set():
        stats = gpu_query()
        if stats is None:
            # A None result used to be skipped silently, hiding the very
            # failure this monitor exists to catch.
            log("MONITOR: GPU query failed (no stats returned)")
        else:
            log(f"MONITOR: {gpu_status_str(stats)}")
        if stats is None or stats.get("error"):
            log("MONITOR: GPU ENTERED ERROR STATE!")
            # Check dmesg for the smoking gun
            _log_recent_nvidia_dmesg()
        # Event.wait doubles as an interruptible sleep, so shutdown is prompt.
        stop_monitor.wait(interval)
def check_runtime_d3(pci_address: str = "0000:02:00.0"):
    """Check and log Runtime D3 power management status.

    Args:
        pci_address: PCI address (domain:bus:device.function) of the GPU to
            inspect. Defaults to the address this script was written for.

    Logs the NVIDIA driver's per-GPU ``power`` file when it exists, followed
    by the PCI device's ``runtime_status``, ``control`` and ``runtime_enabled``
    power attributes. Any exception is logged and swallowed so that a failed
    check never aborts the run.
    """
    sysfs_attrs = (
        ("runtime_status", "PCI runtime_status"),
        ("control", "PCI power control"),
        ("runtime_enabled", "PCI runtime_enabled"),
    )
    try:
        power_file = Path(f"/proc/driver/nvidia/gpus/{pci_address}/power")
        if power_file.exists():
            log(f"GPU Power Management:\n{power_file.read_text()}")
        # Check if dynamic power management is enabled
        sysfs_power_dir = f"/sys/bus/pci/devices/{pci_address}/power"
        for attr, label in sysfs_attrs:
            # NOTE(review): read through `cat` with a timeout, presumably so a
            # wedged device cannot block this process -- confirm before
            # replacing with a direct file read.
            result = subprocess.run(
                ["cat", f"{sysfs_power_dir}/{attr}"],
                capture_output=True, text=True, timeout=5
            )
            if result.returncode == 0:
                value = result.stdout.strip()
            else:
                # Surface why the attribute is missing instead of logging an
                # empty value.
                value = f"unavailable ({result.stderr.strip()})"
            log(f"{label}: {value}")
    except Exception as e:
        log(f"Power check error: {e}")
def check_nvidia_persistence():
    """Log the persistence-mode value reported by nvidia-smi.

    Any failure (missing binary, timeout, ...) is logged rather than raised.
    """
    query_cmd = [
        "nvidia-smi",
        "--query-gpu=persistence_mode",
        "--format=csv,noheader",
    ]
    try:
        proc = subprocess.run(query_cmd, capture_output=True, text=True, timeout=5)
        mode = proc.stdout.strip()
        log(f"Persistence mode: {mode}")
    except Exception as exc:
        log(f"Persistence check error: {exc}")
def transcribe_one(episode_path: str) -> bool:
    """Transcribe one episode, bracketing the work with GPU health probes.

    Steps: skip if a transcript already exists, probe the GPU, run a small
    CUDA allocation/matmul, transcribe and save, probe the GPU again, then
    clear the CUDA cache and pause for ten seconds.

    Returns:
        True when the episode was skipped or transcribed; False when a
        pre-flight check fails or transcription raises.
    """
    name = Path(episode_path).stem
    out_dir = Path(f"training-data/transcripts/{name}")
    if out_dir.exists() and (out_dir / "transcript.json").exists():
        log(f"SKIP: {name} already transcribed")
        return True

    # --- Pre-flight: nvidia-smi must report a healthy GPU ---
    log(f"PRE-FLIGHT: Checking GPU before {name}")
    before = gpu_query()
    if not before or before.get("error"):
        log(f"PRE-FLIGHT FAIL: GPU already in error state! Stats: {before}")
        return False
    log(f"PRE-FLIGHT: {gpu_status_str(before)}")

    # --- Pre-flight: CUDA must be able to allocate and compute ---
    log("PRE-FLIGHT: Testing CUDA...")
    try:
        import torch
        if not torch.cuda.is_available():
            log("PRE-FLIGHT FAIL: torch.cuda.is_available() = False")
            return False
        # Tiny allocation plus matmul to exercise the device end to end.
        probe = torch.randn(100, 100, device="cuda")
        product = probe @ probe
        del probe, product
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        allocated_mb = torch.cuda.memory_allocated() / 1024**2
        log(f"PRE-FLIGHT: CUDA OK, allocated={allocated_mb:.0f}MB")
    except Exception as exc:
        log(f"PRE-FLIGHT FAIL: CUDA test error: {exc}")
        return False

    # --- Transcription ---
    log(f"START: {name} ({episode_path})")
    began = time.time()
    try:
        from src.transcriber import transcribe
        transcript = transcribe(episode_path)
        transcript.save(out_dir)
        took = time.time() - began
        segment_count = len(transcript.segments)
        log(f"DONE: {name} in {took:.1f}s ({took/60:.1f}min), "
            f"{segment_count} segments")
    except Exception as exc:
        took = time.time() - began
        log(f"FAIL: {name} after {took:.1f}s: {type(exc).__name__}: {exc}")
        # Capture the GPU state right after the failure.
        after_fail = gpu_query()
        log(f"POST-FAIL: {gpu_status_str(after_fail) if after_fail else 'query failed'}")
        return False

    # --- Post-transcription probe ---
    after = gpu_query()
    if not after or after.get("error"):
        log(f"POST: GPU entered error state after transcription! {after}")
    else:
        log(f"POST: {gpu_status_str(after)}")

    # --- Cool-down: best-effort cache clear, then let the GPU idle briefly ---
    try:
        import torch
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    except Exception:
        pass
    log("COOLDOWN: Waiting 10s between episodes...")
    time.sleep(10)
    return True
def main():
    """Run the monitored batch: log the environment, then transcribe the queue.

    Exits with status 1 if the GPU is already in an error state at startup.
    The batch stops at the first episode whose transcription reports failure.
    """
    rule = "=" * 60
    log(rule)
    log("GPU Debug Batch Transcription")
    driver = subprocess.getoutput('nvidia-smi --query-gpu=driver_version --format=csv,noheader')
    log(f"Driver: {driver}")
    cuda = subprocess.getoutput('nvidia-smi --query-gpu=cuda_version --format=csv,noheader 2>/dev/null') or 'N/A'
    log(f"CUDA version: {cuda}")
    log(rule)

    # Power-management settings that may be relevant to the failure.
    check_runtime_d3()
    check_nvidia_persistence()

    # Refuse to start when the GPU is already unhealthy.
    initial = gpu_query()
    if not initial or initial.get("error"):
        log(f"ABORT: GPU already in error state at startup: {initial}")
        sys.exit(1)
    log(f"INITIAL: {gpu_status_str(initial)}")

    # Background monitor, polling every 5 seconds during transcription.
    monitor = threading.Thread(target=monitor_thread, args=(5.0,), daemon=True)
    monitor.start()

    # Keep only the episodes that do not have a transcript yet.
    remaining = []
    for ep in EPISODES:
        name = Path(ep).stem
        if Path(f"training-data/transcripts/{name}/transcript.json").exists():
            log(f"ALREADY DONE: {name}")
            continue
        remaining.append(ep)
    log(f"QUEUE: {len(remaining)} episodes to transcribe")

    completed = 0
    failed = 0
    for ep in remaining:
        if transcribe_one(ep):
            completed += 1
            continue
        failed += 1
        log(f"STOPPING: GPU failure detected after {completed} episodes, {failed} failed")
        # Record the GPU state at the moment the batch is abandoned.
        final = gpu_query()
        log(f"FINAL: {gpu_status_str(final) if final else 'query failed'}")
        break

    stop_monitor.set()
    untouched = len(remaining) - completed - failed
    log(f"SUMMARY: {completed} completed, {failed} failed, "
        f"{untouched} remaining")
    log(f"Log saved to: {LOG_FILE}")


if __name__ == "__main__":
    main()