sync: Auto-sync from acg-guru-5070 at 2026-03-21 16:34:05

Synced files:
- Session logs updated
- Latest context and credentials
- Command/directive updates

Machine: acg-guru-5070
Timestamp: 2026-03-21 16:34:05

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-03-21 16:34:05 -07:00
parent 37aaa6660b
commit a29d00c6b2
2 changed files with 539 additions and 0 deletions

View File

@@ -0,0 +1,294 @@
#!/usr/bin/env python3
"""GPU-monitored batch transcription with diagnostics.
Monitors GPU health before, during, and after each episode transcription.
Logs temperature, power, utilization, and memory to detect what triggers
the NVRM rpcSendMessage failure (status 0x00000062).
"""
import subprocess
import sys
import time
import signal
import threading
import os
from datetime import datetime
from pathlib import Path
# Directory that collects one timestamped log file per run. It is created at
# import time so log() can append to LOG_FILE immediately.
LOG_DIR = Path("gpu-debug-logs")
LOG_DIR.mkdir(exist_ok=True)
LOG_FILE = LOG_DIR.joinpath(
    "gpu_monitor_" + datetime.now().strftime("%Y%m%d_%H%M%S") + ".log"
)

# Episodes still waiting to be transcribed, processed in this order.
EPISODES = [
    "training-data/episodes/2011-06-04-hr1.mp3",
    "training-data/episodes/2011-09-10-hr1.mp3",
    "training-data/episodes/2014-s6e05.mp3",
    "training-data/episodes/2015-s7e30.mp3",
    "training-data/episodes/2016-s8e42.mp3",
    "training-data/episodes/2017-s9e26.mp3",
    "training-data/episodes/2018-s10e17.mp3",
    "training-data/episodes/2018-s10e21.mp3",
]

# Set by main() to ask the background monitor thread to exit its loop.
stop_monitor = threading.Event()
# log() is called from both the main thread and the background monitor
# thread; the lock keeps each console/file line pair from interleaving.
_LOG_LOCK = threading.Lock()


def log(msg: str):
    """Print a millisecond-timestamped line and append it to LOG_FILE.

    Args:
        msg: Text to record; it is prefixed with ``[HH:MM:SS.mmm]``.
    """
    ts = datetime.now().strftime("%H:%M:%S.%f")[:-3]
    line = f"[{ts}] {msg}"
    with _LOG_LOCK:
        # Flush so the last lines survive if the machine hangs while the
        # script's stdout is piped or redirected.
        print(line, flush=True)
        # Explicit encoding: logged dmesg/nvidia-smi text must not depend on
        # the locale's default codec.
        with open(LOG_FILE, "a", encoding="utf-8") as f:
            f.write(line + "\n")
def gpu_query() -> dict | None:
"""Query GPU stats via nvidia-smi. Returns None if GPU is in error state."""
try:
result = subprocess.run(
["nvidia-smi",
"--query-gpu=temperature.gpu,power.draw,utilization.gpu,utilization.memory,"
"memory.used,memory.total,clocks.current.sm,clocks.current.memory,"
"pstate,fan.speed",
"--format=csv,noheader,nounits"],
capture_output=True, text=True, timeout=5
)
if result.returncode != 0:
return None
parts = [p.strip() for p in result.stdout.strip().split(",")]
# Check for ERR! or [N/A] in any field
if any("ERR" in p or "[N/A]" in p for p in parts[:4]):
return {"error": True, "raw": result.stdout.strip()}
return {
"temp_c": parts[0],
"power_w": parts[1],
"gpu_util": parts[2],
"mem_util": parts[3],
"mem_used_mb": parts[4],
"mem_total_mb": parts[5],
"sm_clock_mhz": parts[6],
"mem_clock_mhz": parts[7],
"pstate": parts[8],
"fan": parts[9],
"error": False,
}
except (subprocess.TimeoutExpired, Exception) as e:
return {"error": True, "raw": str(e)}
def gpu_health_check() -> bool:
    """Return True when a GPU query succeeds without an error flag.

    An unhealthy result (no answer, or an answer marked as an error) is
    logged before False is returned.
    """
    snapshot = gpu_query()
    healthy = snapshot is not None and not snapshot.get("error")
    if not healthy:
        log(f"GPU ERROR: {snapshot}")
    return healthy
def gpu_status_str(stats: dict) -> str:
if stats.get("error"):
return f"ERR! raw={stats.get('raw', 'unknown')}"
return (f"T={stats['temp_c']}C P={stats['power_w']}W "
f"GPU={stats['gpu_util']}% MEM={stats['mem_util']}% "
f"VRAM={stats['mem_used_mb']}/{stats['mem_total_mb']}MB "
f"SM={stats['sm_clock_mhz']}MHz MEMCLK={stats['mem_clock_mhz']}MHz "
f"PState={stats['pstate']} Fan={stats['fan']}")
def _log_recent_nvrm_dmesg():
    """Log the last five NVIDIA-related kernel error/warning lines from dmesg."""
    try:
        result = subprocess.run(
            # -n: never prompt for a password. A prompt would stall this
            # background thread until the timeout expires.
            ["sudo", "-n", "dmesg", "-T", "--level=err,warn"],
            capture_output=True, text=True, timeout=5,
        )
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        log(f"MONITOR: dmesg unavailable: {e}")
        return
    if result.returncode != 0:
        log(f"MONITOR: dmesg failed (rc={result.returncode}): "
            f"{result.stderr.strip()}")
        return
    nvrm_lines = [ln for ln in result.stdout.splitlines()
                  if "NVRM" in ln or "nvidia" in ln.lower()]
    for line in nvrm_lines[-5:]:
        log(f"DMESG: {line}")


def monitor_thread(interval: float = 2.0):
    """Background thread that logs GPU stats at regular intervals.

    Runs until the module-level ``stop_monitor`` event is set. Whenever a
    sample is an error (or nvidia-smi itself fails), recent NVIDIA kernel
    messages are logged as well.

    Args:
        interval: Seconds to wait between samples.
    """
    while not stop_monitor.is_set():
        stats = gpu_query()
        if stats is None:
            # gpu_query() returns None when nvidia-smi exits non-zero; that
            # is a failure worth recording, not a sample to skip.
            log("MONITOR: nvidia-smi query failed (non-zero exit status)")
            _log_recent_nvrm_dmesg()
        else:
            log(f"MONITOR: {gpu_status_str(stats)}")
            if stats.get("error"):
                log("MONITOR: GPU ENTERED ERROR STATE!")
                # Check dmesg for the smoking gun
                _log_recent_nvrm_dmesg()
        # Event.wait doubles as an interruptible sleep.
        stop_monitor.wait(interval)
def check_runtime_d3(pci_addr: str = "0000:02:00.0"):
    """Check and log Runtime D3 power management status.

    Logs the NVIDIA driver's per-GPU power report (when present) and the
    PCI device's ``runtime_status``, ``control`` and ``runtime_enabled``
    power attributes. An attribute that cannot be read is logged as
    unavailable instead of as an empty value.

    Args:
        pci_addr: PCI address of the GPU, as used in the /proc and /sys
            paths. Defaults to the address this script was written for.
    """
    power_file = Path(f"/proc/driver/nvidia/gpus/{pci_addr}/power")
    try:
        if power_file.exists():
            log(f"GPU Power Management:\n{power_file.read_text()}")
    except OSError as e:
        log(f"Power check error: {e}")

    # Read the sysfs attributes directly rather than spawning `cat`.
    sysfs_power = Path(f"/sys/bus/pci/devices/{pci_addr}/power")
    attributes = (
        ("PCI runtime_status", "runtime_status"),
        ("PCI power control", "control"),
        ("PCI runtime_enabled", "runtime_enabled"),
    )
    for label, attr in attributes:
        try:
            value = (sysfs_power / attr).read_text().strip()
        except (OSError, ValueError) as e:
            value = f"unavailable ({e})"
        log(f"{label}: {value}")
def check_nvidia_persistence():
    """Log the persistence-mode value reported by nvidia-smi.

    Any failure while running the query or logging its result is reported
    as a "Persistence check error" line instead of being raised.
    """
    command = [
        "nvidia-smi",
        "--query-gpu=persistence_mode",
        "--format=csv,noheader",
    ]
    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, timeout=5
        )
        mode = completed.stdout.strip()
        log(f"Persistence mode: {mode}")
    except Exception as exc:
        log(f"Persistence check error: {exc}")
def transcribe_one(episode_path: str, cooldown_s: float = 10.0) -> bool:
    """Transcribe a single episode with GPU health monitoring.

    Steps: skip if a transcript already exists, run a pre-flight GPU query
    and a small CUDA allocation test, transcribe and save, log the GPU
    state afterwards, then clear the CUDA cache and idle for a cooldown.

    Args:
        episode_path: Path to the audio file. Its stem names the output
            directory under ``training-data/transcripts``.
        cooldown_s: Seconds to idle after a successful transcription.
            Values <= 0 skip the wait.

    Returns:
        True if the episode was already transcribed or was transcribed and
        saved (even if the post-transcription GPU query reports an error).
        False if a pre-flight check or the transcription itself failed.
    """
    name = Path(episode_path).stem
    output_dir = Path("training-data/transcripts") / name
    # The transcript file can only exist if its directory does, so a single
    # check is sufficient.
    if (output_dir / "transcript.json").exists():
        log(f"SKIP: {name} already transcribed")
        return True

    # Pre-flight GPU check
    log(f"PRE-FLIGHT: Checking GPU before {name}")
    stats = gpu_query()
    if not stats or stats.get("error"):
        log(f"PRE-FLIGHT FAIL: GPU already in error state! Stats: {stats}")
        return False
    log(f"PRE-FLIGHT: {gpu_status_str(stats)}")

    # Quick CUDA test
    log("PRE-FLIGHT: Testing CUDA...")
    try:
        import torch
        if not torch.cuda.is_available():
            log("PRE-FLIGHT FAIL: torch.cuda.is_available() = False")
            return False
        # Small allocation test
        x = torch.randn(100, 100, device="cuda")
        y = x @ x
        del x, y
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        log(f"PRE-FLIGHT: CUDA OK, allocated={torch.cuda.memory_allocated() / 1024**2:.0f}MB")
    except Exception as e:
        log(f"PRE-FLIGHT FAIL: CUDA test error: {e}")
        return False

    # Transcribe
    log(f"START: {name} ({episode_path})")
    # Monotonic clock: the elapsed time must not jump if the wall clock is
    # adjusted during a long transcription.
    start_time = time.monotonic()
    try:
        from src.transcriber import transcribe
        transcript = transcribe(episode_path)
        transcript.save(output_dir)
        elapsed = time.monotonic() - start_time
        log(f"DONE: {name} in {elapsed:.1f}s ({elapsed/60:.1f}min), "
            f"{len(transcript.segments)} segments")
    except Exception as e:
        # Deliberately broad: any failure here is recorded together with the
        # GPU state so the batch can stop cleanly.
        elapsed = time.monotonic() - start_time
        log(f"FAIL: {name} after {elapsed:.1f}s: {type(e).__name__}: {e}")
        # Post-failure GPU check
        stats = gpu_query()
        log(f"POST-FAIL: {gpu_status_str(stats) if stats else 'query failed'}")
        return False

    # Post-transcription GPU check
    stats = gpu_query()
    if stats and not stats.get("error"):
        log(f"POST: {gpu_status_str(stats)}")
    else:
        log(f"POST: GPU entered error state after transcription! {stats}")

    # Cool-down: clear CUDA cache, let GPU idle briefly
    try:
        import torch
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    except Exception as e:
        # Best effort, but a failure here is itself a diagnostic signal.
        log(f"COOLDOWN: CUDA cleanup failed: {type(e).__name__}: {e}")
    if cooldown_s > 0:
        log(f"COOLDOWN: Waiting {cooldown_s:g}s between episodes...")
        time.sleep(cooldown_s)
    return True
def main():
    """Run the monitored batch: log environment info, then transcribe.

    Logs driver and power-management details, aborts with exit status 1 if
    the GPU is already in an error state, starts the background monitor,
    transcribes every episode in EPISODES that has no transcript yet, and
    stops at the first failure. A summary is logged at the end.
    """
    log("=" * 60)
    log("GPU Debug Batch Transcription")
    log(f"Driver: {subprocess.getoutput('nvidia-smi --query-gpu=driver_version --format=csv,noheader')}")
    log(f"CUDA version: {subprocess.getoutput('nvidia-smi --query-gpu=cuda_version --format=csv,noheader 2>/dev/null') or 'N/A'}")
    log("=" * 60)

    # Check power management
    check_runtime_d3()
    check_nvidia_persistence()

    # Initial GPU state
    stats = gpu_query()
    if not stats or stats.get("error"):
        log(f"ABORT: GPU already in error state at startup: {stats}")
        sys.exit(1)
    log(f"INITIAL: {gpu_status_str(stats)}")

    # Start background monitor (every 5 seconds during transcription)
    monitor = threading.Thread(target=monitor_thread, args=(5.0,), daemon=True)
    monitor.start()

    completed = 0
    failed = 0
    try:
        # Filter to only episodes that need transcription
        remaining = []
        for ep in EPISODES:
            name = Path(ep).stem
            out = Path(f"training-data/transcripts/{name}/transcript.json")
            if out.exists():
                log(f"ALREADY DONE: {name}")
            else:
                remaining.append(ep)
        log(f"QUEUE: {len(remaining)} episodes to transcribe")

        for ep in remaining:
            if transcribe_one(ep):
                completed += 1
                continue
            failed += 1
            log(f"STOPPING: GPU failure detected after {completed} episodes, {failed} failed")
            # Log final state
            stats = gpu_query()
            log(f"FINAL: {gpu_status_str(stats) if stats else 'query failed'}")
            break
    finally:
        # Always stop the monitor, even if an unexpected exception escapes,
        # and wait for its current sample so its output does not interleave
        # with the summary below.
        stop_monitor.set()
        monitor.join(timeout=15)

    log(f"SUMMARY: {completed} completed, {failed} failed, "
        f"{len(remaining) - completed - failed} remaining")
    log(f"Log saved to: {LOG_FILE}")


if __name__ == "__main__":
    main()

View File

@@ -575,3 +575,248 @@ done
```
Then: run speaker identification across all transcribed episodes, cluster non-host voices, begin element fingerprinting.
## Update: 15:00 — Dataforth Email, GPU Debug, VWP Citrix→Hyper-V Migration, ScreenConnect
### Session Summary
Multi-task session: Dataforth email forwarding, GPU error diagnosis for voice training, and major VWP infrastructure migration (Citrix XenServer → Hyper-V). Installed ScreenConnect on VWP-FILES via PowerShell Direct.
### 1. Dataforth Email Forwarding (dataforthgit@)
**Task:** AJ (Angel Lopez) at Dataforth needs messages sent to dataforthgit@dataforth.com forwarded to him.
**Discovery:** `dataforthgit@dataforth.com` is an existing alias on the **Support** shared mailbox (`support@dataforth.com`).
**Solution:** Created inbox rule on Support mailbox via Graph API:
- **Rule:** "Forward dataforthgit@ to AJ Lopez"
- **Trigger:** recipientContains `dataforthgit@dataforth.com`
- **Action:** Forward to `alopez@dataforth.com`
- **Rule ID:** `AQAAAFO12jE=`
**Auth used:** Claude-MSP-Access multi-tenant app:
- Tenant ID: `7dfa3ce8-c496-4b51-ab8d-bd3dcd78b584`
- App ID: `fabb3421-8b34-484b-bc17-e46de9703418`
- Client Secret: `~QJ8Q~NyQSs4OcGqHZyPrA2CVnq9KBfKiimntbMO`
### 2. GPU Error Diagnosis (RTX 5070 Ti)
**Problem:** GPU entered error state during voice training batch transcription (same issue as previous session). `nvidia-smi` shows ERR! across all fields. The GPU failed ~40 min into transcription.
**Root cause investigation:**
- `NVRM: _issueRpcLarge: rpcSendMessage failed with status 0x00000062 for fn 76!` — repeating every 100ms
- No Xid errors in dmesg — only RPC communication failures
- **Runtime D3 (fine-grained power management) is enabled** — prime suspect for GPU hang during sustained compute
- GPU is in D0 power state, video memory active
- Error first appeared at 4335 seconds after boot (~72 min)
- `torch.cuda.is_available()` returned True initially, GPU loaded model into VRAM then failed
**Fix applied (pending reboot):**
- Created `/etc/modprobe.d/nvidia-no-d3.conf`: `options nvidia NVreg_DynamicPowerManagement=0`
- Plan: After reboot, run `sudo nvidia-smi -pm 1` (persistence mode)
**Diagnostic script created:** `projects/radio-show/audio-processor/gpu_debug_transcribe.py`
- Monitors GPU temp, power, utilization, VRAM, clocks every 5 seconds
- Pre-flight CUDA health check before each episode
- 10-second cooldown between episodes
- Stops at first GPU error and logs state
- Saves logs to `gpu-debug-logs/`
**Transcription status:** Only `2010-10-02-hr1` completed. 8 episodes remaining:
- 2011-06-04-hr1, 2011-09-10-hr1, 2014-s6e05, 2015-s7e30, 2016-s8e42, 2017-s9e26, 2018-s10e17, 2018-s10e21
**After reboot commands:**
```bash
sudo nvidia-smi -pm 1
source /home/guru/.local/share/radio-processor/bin/activate
cd /home/guru/ClaudeTools/projects/radio-show/audio-processor
python3 gpu_debug_transcribe.py
```
### 3. VWP Citrix XenServer → Hyper-V Migration
#### VPN Access
**Critical:** Run `sudo tailscale down` before connecting to the VWP VPN — D2TESTNAS advertises `192.168.0.0/24` for Dataforth over Tailscale, which conflicts with VWP's identical `192.168.0.0/24` subnet.
**Starlink subnet conflict:** Starlink was on `192.168.4.0/24`, same as VPN tunnel. User changed Starlink to `10.0.3.x/16` to resolve.
**Working VPN command (split tunnel):**
```bash
sudo tailscale down
sudo openvpn --config ~/Downloads/OpenVPN-Server.ovpn --auth-user-pass /etc/openvpn/vwp-auth.txt --group nobody --daemon vwp-vpn --log /tmp/vwp-vpn.log --route-noexec
# Then manually add split routes:
sudo ip route add 172.16.9.0/24 dev tun0
sudo ip route add 192.168.0.0/24 dev tun0
sudo ip route add 192.168.3.0/24 dev tun0
```
**Key:** Must use `--route-noexec` to prevent full-tunnel `0.0.0.0/1` redirect, then manually add split routes.
#### VPN Credentials
- **Auth file:** `/etc/openvpn/vwp-auth.txt` (sysadmin / r3tr0gradE99#)
- **Remote:** 4.18.160.106:1194 TCP
- **VPN IP assigned:** 192.168.4.2 or 192.168.4.3
#### WinRM Access to Hyper-V
**Installed `pywinrm`** (`pip install --user --break-system-packages pywinrm`) for remote PowerShell via WinRM.
**WinRM enabled on VWP-HYPERV1** (user ran on console):
```powershell
Enable-PSRemoting -Force
Set-Item WSMan:\localhost\Client\TrustedHosts -Value "*" -Force
New-NetFirewallRule -DisplayName "WinRM All" -Direction Inbound -Protocol TCP -LocalPort 5985 -Action Allow
```
**Python WinRM usage:**
```python
import winrm
s = winrm.Session('http://172.16.9.184:5985/wsman', auth=('sysadmin', 'r3tr0gradE99#'), transport='ntlm')
r = s.run_ps("hostname")
print(r.std_out.decode().strip())
```
#### Hyper-V Host Status (VWP-HYPERV1)
- **Hostname:** VWP-HYPERV1
- **IP:** 172.16.9.184
- **OS:** Windows Server 2025 Standard
- **Specs:** 64 vCPUs (Xeon Platinum 8180M), 256GB RAM, PowerEdge R740
- **Disk:** 10.5TB free on C:
- **vSwitch:** "Intel(R) Ethernet 10G 4P X550/I350 rNDC - Virtual Switch" (External, NIC1 at 1Gbps)
- **Physical NICs:** NIC1 (up, 1Gbps), NIC2/3/4 (disconnected)
- **Native VLAN:** 172.16.9.x (untagged)
**Existing VMs on Hyper-V:**
| VM | State | Gen | RAM | vCPUs |
|----|-------|-----|-----|-------|
| VWP-DC1 | Running | 2 | ~7.4GB | 56 |
| VWP-FILES | Running | 2 | 2GB | 16 |
**VLAN configuration:**
- Native/untagged: 172.16.9.0/24 (VWP LAN)
- VLAN 2: 192.168.0.0/24 (OldNet)
- VLAN 99: 192.168.3.0/24 (Mgt)
- UDM trunks all VLANs, defaults to selected VLAN for untagged
#### XenServer VM Inventory (source)
| VM | OS | IP | State | vCPUs | RAM | Disk |
|----|----|----|-------|-------|-----|------|
| server 2012 R2 | Server 2012 R2 Standard | 192.168.0.19 | running | 4 | 16GB | 200GB |
| BACKUP-SRV | Server 2019 Datacenter | 192.168.0.22 | running | 2 | 15GB | 240GB |
| server 2003 | Server 2003 Enterprise SP2 | 192.168.0.20 | running | 4 | 3GB | 130GB |
| XP | Windows XP | none | running | 2 | 3GB | 40GB |
| Windows 7 (32-bit) | Windows 7 | 192.168.0.40 | halted | 2 | 4GB | 80GB |
#### Server 2012 R2 Migration (IN PROGRESS)
**VDI Export running on XenServer:**
- VDI UUID: `e65ccf95-0bc7-4530-ac91-c418e667e1de`
- VM UUID: `298da244-79b5-84ed-d6e0-694825697096`
- Export command: `xe vdi-export uuid=e65ccf95-0bc7-4530-ac91-c418e667e1de filename=/mnt/hyperv/server2012r2.vhd format=vhd`
- PID: 26610 (nohup, survives disconnects)
- Destination: `//172.16.9.184/Migration` mounted at `/mnt/hyperv` (SMBv2)
- Progress at last check: **65GB of ~200GB** (~4GB/min, ~35 min remaining)
- Transfer rate: ~4GB/min over 1Gbps link
**SMB share created on Hyper-V:**
```powershell
New-SmbShare -Name 'Migration' -Path 'C:\Migration' -FullAccess 'Everyone'
New-NetFirewallRule -DisplayName 'SMB from XenServer' -Direction Inbound -Protocol TCP -LocalPort 445 -RemoteAddress 192.168.0.0/24 -Action Allow
```
**Mount on XenServer:**
```bash
mount.cifs //172.16.9.184/Migration /mnt/hyperv -o username=sysadmin,password=r3tr0gradE99#,domain=VWP,vers=2.0
```
**Planned VM creation (after export completes):**
- Generation 1 (BIOS/MBR from XenServer)
- 4 vCPUs, 16GB RAM
- NIC on VLAN 2 (192.168.0.x)
- Attach server2012r2.vhd from C:\Migration
- Boot and install Hyper-V integration services
#### ITSvc Share (C:\Shares\ITSvc on VWP-HYPERV1)
Contains installers:
- VWP-ScreenConnect.ClientSetup.msi (27.6MB)
- VWPScreenConnect.ClientSetup.exe (19.1MB)
- Ninite installers (Chrome, Firefox, .NET, WizTree)
- ISO subfolder
### 4. ScreenConnect on VWP-FILES
**VWP-FILES VM details:**
- **Hostname:** VWP-FILES.VWP.US
- **IP:** 172.16.9.107
- **OS:** Windows Server 2019 Standard
- **Hyper-V Gen:** 2
- **RAM / vCPUs:** 2GB / 16 vCPUs
**PowerShell Direct credentials:** `VWP\sysadmin` / `r3tr0gradE99#`
**Installation:** MSI copied via `Copy-VMFile` (Hyper-V Guest Service Interface), installed via PowerShell Direct:
```powershell
Copy-VMFile -Name 'VWP-FILES' -SourcePath 'C:\Shares\ITSvc\VWP-ScreenConnect.ClientSetup.msi' -DestinationPath 'C:\Temp\VWP-ScreenConnect.ClientSetup.msi' -CreateFullPath -FileSource Host
Invoke-Command -VMName 'VWP-FILES' -Credential ... -ScriptBlock { Start-Process msiexec.exe -ArgumentList '/i C:\Temp\VWP-ScreenConnect.ClientSetup.msi /quiet /norestart' -Wait }
```
**Issue:** Service installed but stopped immediately — "Your host has ended the remote session." User had accidentally deleted the unit in ScreenConnect console.
**Fix:** Uninstalled (`msiexec /x ... /quiet /norestart`), reinstalled same MSI. Service now **Running**.
**Service:** `ScreenConnect Client (1912bf3444b41a08)` — connects to `instance-kgc7jt-relay.screenconnect.com:443`
### 5. Memory Saved
- `reference_dataforth_contact.md` — AJ at Dataforth, dataforthgit@ email forwarding
### Credentials Used This Session
```
### Dataforth M365 (Graph API)
- Tenant ID: 7dfa3ce8-c496-4b51-ab8d-bd3dcd78b584
- App ID: fabb3421-8b34-484b-bc17-e46de9703418
- Client Secret: ~QJ8Q~NyQSs4OcGqHZyPrA2CVnq9KBfKiimntbMO
### VWP VPN
- Auth file: /etc/openvpn/vwp-auth.txt
- User: sysadmin / r3tr0gradE99#
- Remote: 4.18.160.106:1194 TCP
### VWP XenServer (192.168.0.104)
- SSH: root / r3tr0gradE99!
- Note: $'...' quoting for !
### VWP-HYPERV1 (172.16.9.184)
- WinRM: sysadmin / r3tr0gradE99# (NTLM)
- URL: http://172.16.9.184:5985/wsman
### VWP-DC1 (172.16.9.2)
- Domain: VWP\sysadmin / r3tr0gradE99#
### VWP-FILES (172.16.9.107)
- PowerShell Direct: VWP\sysadmin / r3tr0gradE99#
### VWP iDRAC - XenServer R720 (192.168.3.30)
- SSH: root / r3tr0gradE99#
- SSH flags: -o KexAlgorithms=+diffie-hellman-group14-sha1 -o HostKeyAlgorithms=+ssh-rsa -o Ciphers=+aes128-cbc,aes256-cbc
```
### Pending/Incomplete Tasks
1. **VDI export in progress** — Server 2012 R2 exporting from XenServer to Hyper-V, ~65GB/200GB done, PID 26610 on XenServer
2. **Create Server 2012 R2 VM on Hyper-V** — After export: Gen1, 4 vCPU, 16GB RAM, VLAN 2, attach VHD
3. **GPU debug after reboot** — Run `sudo nvidia-smi -pm 1` then `python3 gpu_debug_transcribe.py`
4. **Server 2003 data migration** — Move shares/data from 192.168.0.20 (G: drive) to VWP-FILES after 2012 R2 migration
5. **Remaining XenServer VMs** — BACKUP-SRV, server 2003, XP, Windows 7 all need migration
6. **pywinrm installed** — `pip install --user --break-system-packages pywinrm` on workstation
### Files Created/Modified This Session
- `/etc/modprobe.d/nvidia-no-d3.conf` — Disable GPU Runtime D3 power management
- `projects/radio-show/audio-processor/gpu_debug_transcribe.py` — GPU diagnostic batch transcription script
- `~/.claude/projects/-home-guru-ClaudeTools/memory/reference_dataforth_contact.md` — AJ/dataforthgit memory
- `~/.claude/projects/-home-guru-ClaudeTools/memory/MEMORY.md` — Updated index