From a29d00c6b239a0efa0b08228b0bd41352aafbbb9 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Sat, 21 Mar 2026 16:34:05 -0700 Subject: [PATCH] sync: Auto-sync from acg-guru-5070 at 2026-03-21 16:34:05 Synced files: - Session logs updated - Latest context and credentials - Command/directive updates Machine: acg-guru-5070 Timestamp: 2026-03-21 16:34:05 Co-Authored-By: Claude Sonnet 4.5 --- .../audio-processor/gpu_debug_transcribe.py | 294 ++++++++++++++++++ session-logs/2026-03-21-session.md | 245 +++++++++++++++ 2 files changed, 539 insertions(+) create mode 100644 projects/radio-show/audio-processor/gpu_debug_transcribe.py diff --git a/projects/radio-show/audio-processor/gpu_debug_transcribe.py b/projects/radio-show/audio-processor/gpu_debug_transcribe.py new file mode 100644 index 0000000..cb04a2c --- /dev/null +++ b/projects/radio-show/audio-processor/gpu_debug_transcribe.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +"""GPU-monitored batch transcription with diagnostics. + +Monitors GPU health before, during, and after each episode transcription. +Logs temperature, power, utilization, and memory to detect what triggers +the NVRM rpcSendMessage failure (status 0x00000062). +""" + +import subprocess +import sys +import time +import signal +import threading +import os +from datetime import datetime +from pathlib import Path + +LOG_DIR = Path("gpu-debug-logs") +LOG_DIR.mkdir(exist_ok=True) +LOG_FILE = LOG_DIR / f"gpu_monitor_{datetime.now():%Y%m%d_%H%M%S}.log" + +# Episodes to transcribe (remaining ones) +EPISODES = [ + "training-data/episodes/2011-06-04-hr1.mp3", + "training-data/episodes/2011-09-10-hr1.mp3", + "training-data/episodes/2014-s6e05.mp3", + "training-data/episodes/2015-s7e30.mp3", + "training-data/episodes/2016-s8e42.mp3", + "training-data/episodes/2017-s9e26.mp3", + "training-data/episodes/2018-s10e17.mp3", + "training-data/episodes/2018-s10e21.mp3", +] + +stop_monitor = threading.Event() + + +def log(msg: str): + ts = datetime.now().strftime("%H:%M:%S.%f")[:-3] + line = f"[{ts}] {msg}" + print(line) + with open(LOG_FILE, "a") as f: + f.write(line + "\n") + + +def gpu_query() -> dict | None: + """Query GPU stats via nvidia-smi. Returns None if GPU is in error state.""" + try: + result = subprocess.run( + ["nvidia-smi", + "--query-gpu=temperature.gpu,power.draw,utilization.gpu,utilization.memory," + "memory.used,memory.total,clocks.current.sm,clocks.current.memory," + "pstate,fan.speed", + "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=5 + ) + if result.returncode != 0: + return None + parts = [p.strip() for p in result.stdout.strip().split(",")] + # Check for ERR! or [N/A] in any field + if any("ERR" in p or "[N/A]" in p for p in parts[:4]): + return {"error": True, "raw": result.stdout.strip()} + return { + "temp_c": parts[0], + "power_w": parts[1], + "gpu_util": parts[2], + "mem_util": parts[3], + "mem_used_mb": parts[4], + "mem_total_mb": parts[5], + "sm_clock_mhz": parts[6], + "mem_clock_mhz": parts[7], + "pstate": parts[8], + "fan": parts[9], + "error": False, + } + except (subprocess.TimeoutExpired, Exception) as e: + return {"error": True, "raw": str(e)} + + +def gpu_health_check() -> bool: + """Returns True if GPU is healthy.""" + stats = gpu_query() + if stats is None or stats.get("error"): + log(f"GPU ERROR: {stats}") + return False + return True + + +def gpu_status_str(stats: dict) -> str: + if stats.get("error"): + return f"ERR! raw={stats.get('raw', 'unknown')}" + return (f"T={stats['temp_c']}C P={stats['power_w']}W " + f"GPU={stats['gpu_util']}% MEM={stats['mem_util']}% " + f"VRAM={stats['mem_used_mb']}/{stats['mem_total_mb']}MB " + f"SM={stats['sm_clock_mhz']}MHz MEMCLK={stats['mem_clock_mhz']}MHz " + f"PState={stats['pstate']} Fan={stats['fan']}") + + +def monitor_thread(interval: float = 2.0): + """Background thread that logs GPU stats at regular intervals.""" + while not stop_monitor.is_set(): + stats = gpu_query() + if stats: + log(f"MONITOR: {gpu_status_str(stats)}") + if stats.get("error"): + log("MONITOR: GPU ENTERED ERROR STATE!") + # Check dmesg for the smoking gun + try: + result = subprocess.run( + ["sudo", "dmesg", "-T", "--level=err,warn"], + capture_output=True, text=True, timeout=5 + ) + nvrm_lines = [l for l in result.stdout.splitlines() + if "NVRM" in l or "nvidia" in l.lower()] + for line in nvrm_lines[-5:]: + log(f"DMESG: {line}") + except Exception: + pass + stop_monitor.wait(interval) + + +def check_runtime_d3(): + """Check and log Runtime D3 power management status.""" + try: + power_file = Path("/proc/driver/nvidia/gpus/0000:02:00.0/power") + if power_file.exists(): + log(f"GPU Power Management:\n{power_file.read_text()}") + + # Check if dynamic power management is enabled + result = subprocess.run( + ["cat", "/sys/bus/pci/devices/0000:02:00.0/power/runtime_status"], + capture_output=True, text=True, timeout=5 + ) + log(f"PCI runtime_status: {result.stdout.strip()}") + + result = subprocess.run( + ["cat", "/sys/bus/pci/devices/0000:02:00.0/power/control"], + capture_output=True, text=True, timeout=5 + ) + log(f"PCI power control: {result.stdout.strip()}") + + result = subprocess.run( + ["cat", "/sys/bus/pci/devices/0000:02:00.0/power/runtime_enabled"], + capture_output=True, text=True, timeout=5 + ) + log(f"PCI runtime_enabled: {result.stdout.strip()}") + + except Exception as e: + log(f"Power check error: {e}") + + +def check_nvidia_persistence(): + """Check persistence mode.""" + try: + result = subprocess.run( + ["nvidia-smi", "--query-gpu=persistence_mode", "--format=csv,noheader"], + capture_output=True, text=True, timeout=5 + ) + log(f"Persistence mode: {result.stdout.strip()}") + except Exception as e: + log(f"Persistence check error: {e}") + + +def transcribe_one(episode_path: str) -> bool: + """Transcribe a single episode with GPU health monitoring. Returns success.""" + name = Path(episode_path).stem + output_dir = f"training-data/transcripts/{name}" + + if Path(output_dir).exists() and (Path(output_dir) / "transcript.json").exists(): + log(f"SKIP: {name} already transcribed") + return True + + # Pre-flight GPU check + log(f"PRE-FLIGHT: Checking GPU before {name}") + stats = gpu_query() + if not stats or stats.get("error"): + log(f"PRE-FLIGHT FAIL: GPU already in error state! Stats: {stats}") + return False + log(f"PRE-FLIGHT: {gpu_status_str(stats)}") + + # Quick CUDA test + log("PRE-FLIGHT: Testing CUDA...") + try: + import torch + if not torch.cuda.is_available(): + log("PRE-FLIGHT FAIL: torch.cuda.is_available() = False") + return False + # Small allocation test + x = torch.randn(100, 100, device="cuda") + y = x @ x + del x, y + torch.cuda.synchronize() + torch.cuda.empty_cache() + log(f"PRE-FLIGHT: CUDA OK, allocated={torch.cuda.memory_allocated() / 1024**2:.0f}MB") + except Exception as e: + log(f"PRE-FLIGHT FAIL: CUDA test error: {e}") + return False + + # Transcribe + log(f"START: {name} ({episode_path})") + start_time = time.time() + + try: + from src.transcriber import transcribe + transcript = transcribe(episode_path) + transcript.save(Path(output_dir)) + elapsed = time.time() - start_time + log(f"DONE: {name} in {elapsed:.1f}s ({elapsed/60:.1f}min), " + f"{len(transcript.segments)} segments") + except Exception as e: + elapsed = time.time() - start_time + log(f"FAIL: {name} after {elapsed:.1f}s: {type(e).__name__}: {e}") + + # Post-failure GPU check + stats = gpu_query() + log(f"POST-FAIL: {gpu_status_str(stats) if stats else 'query failed'}") + return False + + # Post-transcription GPU check + stats = gpu_query() + if stats and not stats.get("error"): + log(f"POST: {gpu_status_str(stats)}") + else: + log(f"POST: GPU entered error state after transcription! {stats}") + + # Cool-down: clear CUDA cache, let GPU idle briefly + try: + import torch + torch.cuda.empty_cache() + torch.cuda.synchronize() + except Exception: + pass + + log("COOLDOWN: Waiting 10s between episodes...") + time.sleep(10) + + return True + + +def main(): + log("=" * 60) + log("GPU Debug Batch Transcription") + log(f"Driver: {subprocess.getoutput('nvidia-smi --query-gpu=driver_version --format=csv,noheader')}") + log(f"CUDA version: {subprocess.getoutput('nvidia-smi --query-gpu=cuda_version --format=csv,noheader 2>/dev/null') or 'N/A'}") + log("=" * 60) + + # Check power management + check_runtime_d3() + check_nvidia_persistence() + + # Initial GPU state + stats = gpu_query() + if not stats or stats.get("error"): + log(f"ABORT: GPU already in error state at startup: {stats}") + sys.exit(1) + log(f"INITIAL: {gpu_status_str(stats)}") + + # Start background monitor (every 5 seconds during transcription) + monitor = threading.Thread(target=monitor_thread, args=(5.0,), daemon=True) + monitor.start() + + # Filter to only episodes that need transcription + remaining = [] + for ep in EPISODES: + name = Path(ep).stem + out = Path(f"training-data/transcripts/{name}/transcript.json") + if out.exists(): + log(f"ALREADY DONE: {name}") + else: + remaining.append(ep) + + log(f"QUEUE: {len(remaining)} episodes to transcribe") + + completed = 0 + failed = 0 + for ep in remaining: + success = transcribe_one(ep) + if success: + completed += 1 + else: + failed += 1 + log(f"STOPPING: GPU failure detected after {completed} episodes, {failed} failed") + # Log final state + stats = gpu_query() + log(f"FINAL: {gpu_status_str(stats) if stats else 'query failed'}") + break + + stop_monitor.set() + log(f"SUMMARY: {completed} completed, {failed} failed, " + f"{len(remaining) - completed - failed} remaining") + log(f"Log saved to: {LOG_FILE}") + + +if __name__ == "__main__": + main() diff --git a/session-logs/2026-03-21-session.md b/session-logs/2026-03-21-session.md index 72b875d..a9297ec 100644 --- a/session-logs/2026-03-21-session.md +++ b/session-logs/2026-03-21-session.md @@ -575,3 +575,248 @@ done ``` Then: run speaker identification across all transcribed episodes, cluster non-host voices, begin element fingerprinting. + +## Update: 15:00 — Dataforth Email, GPU Debug, VWP Citrix→Hyper-V Migration, ScreenConnect + +### Session Summary + +Multi-task session: Dataforth email forwarding, GPU error diagnosis for voice training, and major VWP infrastructure migration (Citrix XenServer → Hyper-V). Installed ScreenConnect on VWP-FILES via PowerShell Direct. + +### 1. Dataforth Email Forwarding (dataforthgit@) + +**Task:** AJ (Angel Lopez) at Dataforth needs messages sent to dataforthgit@dataforth.com forwarded to him. + +**Discovery:** `dataforthgit@dataforth.com` is an existing alias on the **Support** shared mailbox (`support@dataforth.com`). + +**Solution:** Created inbox rule on Support mailbox via Graph API: +- **Rule:** "Forward dataforthgit@ to AJ Lopez" +- **Trigger:** recipientContains `dataforthgit@dataforth.com` +- **Action:** Forward to `alopez@dataforth.com` +- **Rule ID:** `AQAAAFO12jE=` + +**Auth used:** Claude-MSP-Access multi-tenant app: +- Tenant ID: `7dfa3ce8-c496-4b51-ab8d-bd3dcd78b584` +- App ID: `fabb3421-8b34-484b-bc17-e46de9703418` +- Client Secret: `~QJ8Q~NyQSs4OcGqHZyPrA2CVnq9KBfKiimntbMO` + +### 2. GPU Error Diagnosis (RTX 5070 Ti) + +**Problem:** GPU entered error state during voice training batch transcription (same issue as previous session). `nvidia-smi` shows ERR! across all fields. The GPU failed ~40 min into transcription. + +**Root cause investigation:** +- `NVRM: _issueRpcLarge: rpcSendMessage failed with status 0x00000062 for fn 76!` — repeating every 100ms +- No Xid errors in dmesg — only RPC communication failures +- **Runtime D3 (fine-grained power management) is enabled** — prime suspect for GPU hang during sustained compute +- GPU is in D0 power state, video memory active +- Error first appeared at 4335 seconds after boot (~72 min) +- `torch.cuda.is_available()` returned True initially, GPU loaded model into VRAM then failed + +**Fix applied (pending reboot):** +- Created `/etc/modprobe.d/nvidia-no-d3.conf`: `options nvidia NVreg_DynamicPowerManagement=0` +- Plan: After reboot, run `sudo nvidia-smi -pm 1` (persistence mode) + +**Diagnostic script created:** `projects/radio-show/audio-processor/gpu_debug_transcribe.py` +- Monitors GPU temp, power, utilization, VRAM, clocks every 5 seconds +- Pre-flight CUDA health check before each episode +- 10-second cooldown between episodes +- Stops at first GPU error and logs state +- Saves logs to `gpu-debug-logs/` + +**Transcription status:** Only `2010-10-02-hr1` completed. 8 episodes remaining: +- 2011-06-04-hr1, 2011-09-10-hr1, 2014-s6e05, 2015-s7e30, 2016-s8e42, 2017-s9e26, 2018-s10e17, 2018-s10e21 + +**After reboot commands:** +```bash +sudo nvidia-smi -pm 1 +source /home/guru/.local/share/radio-processor/bin/activate +cd /home/guru/ClaudeTools/projects/radio-show/audio-processor +python3 gpu_debug_transcribe.py +``` + +### 3. VWP Citrix XenServer → Hyper-V Migration + +#### VPN Access + +**Critical:** Must `sudo tailscale down` before VWP VPN — D2TESTNAS advertises `192.168.0.0/24` for Dataforth which conflicts with VWP's same subnet. + +**Starlink subnet conflict:** Starlink was on `192.168.4.0/24`, same as VPN tunnel. User changed Starlink to `10.0.3.x/16` to resolve. + +**Working VPN command (split tunnel):** +```bash +sudo tailscale down +sudo openvpn --config ~/Downloads/OpenVPN-Server.ovpn --auth-user-pass /etc/openvpn/vwp-auth.txt --group nobody --daemon vwp-vpn --log /tmp/vwp-vpn.log --route-noexec +# Then manually add split routes: +sudo ip route add 172.16.9.0/24 dev tun0 +sudo ip route add 192.168.0.0/24 dev tun0 +sudo ip route add 192.168.3.0/24 dev tun0 +``` + +**Key:** Must use `--route-noexec` to prevent full-tunnel `0.0.0.0/1` redirect, then manually add split routes. + +#### VPN Credentials +- **Auth file:** `/etc/openvpn/vwp-auth.txt` (sysadmin / r3tr0gradE99#) +- **Remote:** 4.18.160.106:1194 TCP +- **VPN IP assigned:** 192.168.4.2 or 192.168.4.3 + +#### WinRM Access to Hyper-V + +**Installed `pywinrm`** (`pip install --user --break-system-packages pywinrm`) for remote PowerShell via WinRM. + +**WinRM enabled on VWP-HYPERV1** (user ran on console): +```powershell +Enable-PSRemoting -Force +Set-Item WSMan:\localhost\Client\TrustedHosts -Value "*" -Force +New-NetFirewallRule -DisplayName "WinRM All" -Direction Inbound -Protocol TCP -LocalPort 5985 -Action Allow +``` + +**Python WinRM usage:** +```python +import winrm +s = winrm.Session('http://172.16.9.184:5985/wsman', auth=('sysadmin', 'r3tr0gradE99#'), transport='ntlm') +r = s.run_ps("hostname") +print(r.std_out.decode().strip()) +``` + +#### Hyper-V Host Status (VWP-HYPERV1) + +- **Hostname:** VWP-HYPERV1 +- **IP:** 172.16.9.184 +- **OS:** Windows Server 2025 Standard +- **Specs:** 64 vCPUs (Xeon Platinum 8180M), 256GB RAM, PowerEdge R740 +- **Disk:** 10.5TB free on C: +- **vSwitch:** "Intel(R) Ethernet 10G 4P X550/I350 rNDC - Virtual Switch" (External, NIC1 at 1Gbps) +- **Physical NICs:** NIC1 (up, 1Gbps), NIC2/3/4 (disconnected) +- **Native VLAN:** 172.16.9.x (untagged) + +**Existing VMs on Hyper-V:** +| VM | State | Gen | RAM | vCPUs | +|----|-------|-----|-----|-------| +| VWP-DC1 | Running | 2 | ~7.4GB | 56 | +| VWP-FILES | Running | 2 | 2GB | 16 | + +**VLAN configuration:** +- Native/untagged: 172.16.9.0/24 (VWP LAN) +- VLAN 2: 192.168.0.0/24 (OldNet) +- VLAN 99: 192.168.3.0/24 (Mgt) +- UDM trunks all VLANs, defaults to selected VLAN for untagged + +#### XenServer VM Inventory (source) + +| VM | OS | IP | State | vCPUs | RAM | Disk | +|----|----|----|-------|-------|-----|------| +| server 2012 R2 | Server 2012 R2 Standard | 192.168.0.19 | running | 4 | 16GB | 200GB | +| BACKUP-SRV | Server 2019 Datacenter | 192.168.0.22 | running | 2 | 15GB | 240GB | +| server 2003 | Server 2003 Enterprise SP2 | 192.168.0.20 | running | 4 | 3GB | 130GB | +| XP | Windows XP | none | running | 2 | 3GB | 40GB | +| Windows 7 (32-bit) | Windows 7 | 192.168.0.40 | halted | 2 | 4GB | 80GB | + +#### Server 2012 R2 Migration (IN PROGRESS) + +**VDI Export running on XenServer:** +- VDI UUID: `e65ccf95-0bc7-4530-ac91-c418e667e1de` +- VM UUID: `298da244-79b5-84ed-d6e0-694825697096` +- Export command: `xe vdi-export uuid=e65ccf95-0bc7-4530-ac91-c418e667e1de filename=/mnt/hyperv/server2012r2.vhd format=vhd` +- PID: 26610 (nohup, survives disconnects) +- Destination: `//172.16.9.184/Migration` mounted at `/mnt/hyperv` (SMBv2) +- Progress at last check: **65GB of ~200GB** (~4GB/min, ~35 min remaining) +- Transfer rate: ~4GB/min over 1Gbps link + +**SMB share created on Hyper-V:** +```powershell +New-SmbShare -Name 'Migration' -Path 'C:\Migration' -FullAccess 'Everyone' +New-NetFirewallRule -DisplayName 'SMB from XenServer' -Direction Inbound -Protocol TCP -LocalPort 445 -RemoteAddress 192.168.0.0/24 -Action Allow +``` + +**Mount on XenServer:** +```bash +mount.cifs //172.16.9.184/Migration /mnt/hyperv -o username=sysadmin,password=r3tr0gradE99#,domain=VWP,vers=2.0 +``` + +**Planned VM creation (after export completes):** +- Generation 1 (BIOS/MBR from XenServer) +- 4 vCPUs, 16GB RAM +- NIC on VLAN 2 (192.168.0.x) +- Attach server2012r2.vhd from C:\Migration +- Boot and install Hyper-V integration services + +#### ITSvc Share (C:\Shares\ITSvc on VWP-HYPERV1) +Contains installers: +- VWP-ScreenConnect.ClientSetup.msi (27.6MB) +- VWPScreenConnect.ClientSetup.exe (19.1MB) +- Ninite installers (Chrome, Firefox, .NET, WizTree) +- ISO subfolder + +### 4. ScreenConnect on VWP-FILES + +**VWP-FILES VM details:** +- **Hostname:** VWP-FILES.VWP.US +- **IP:** 172.16.9.107 +- **OS:** Windows Server 2019 Standard +- **Hyper-V Gen:** 2 +- **RAM:** 2GB, 16 vCPUs + +**PowerShell Direct credentials:** `VWP\sysadmin` / `r3tr0gradE99#` + +**Installation:** MSI copied via `Copy-VMFile` (Hyper-V Guest Service Interface), installed via PowerShell Direct: +```powershell +Copy-VMFile -Name 'VWP-FILES' -SourcePath 'C:\Shares\ITSvc\VWP-ScreenConnect.ClientSetup.msi' -DestinationPath 'C:\Temp\VWP-ScreenConnect.ClientSetup.msi' -CreateFullPath -FileSource Host +Invoke-Command -VMName 'VWP-FILES' -Credential ... -ScriptBlock { Start-Process msiexec.exe -ArgumentList '/i C:\Temp\VWP-ScreenConnect.ClientSetup.msi /quiet /norestart' -Wait } +``` + +**Issue:** Service installed but stopped immediately — "Your host has ended the remote session." User had accidentally deleted the unit in ScreenConnect console. + +**Fix:** Uninstalled (`msiexec /x ... /quiet /norestart`), reinstalled same MSI. Service now **Running**. + +**Service:** `ScreenConnect Client (1912bf3444b41a08)` — connects to `instance-kgc7jt-relay.screenconnect.com:443` + +### 5. Memory Saved + +- `reference_dataforth_contact.md` — AJ at Dataforth, dataforthgit@ email forwarding + +### Credentials Used This Session + +``` +### Dataforth M365 (Graph API) +- Tenant ID: 7dfa3ce8-c496-4b51-ab8d-bd3dcd78b584 +- App ID: fabb3421-8b34-484b-bc17-e46de9703418 +- Client Secret: ~QJ8Q~NyQSs4OcGqHZyPrA2CVnq9KBfKiimntbMO + +### VWP VPN +- Auth file: /etc/openvpn/vwp-auth.txt +- User: sysadmin / r3tr0gradE99# +- Remote: 4.18.160.106:1194 TCP + +### VWP XenServer (192.168.0.104) +- SSH: root / r3tr0gradE99! +- Note: $'...' quoting for ! + +### VWP-HYPERV1 (172.16.9.184) +- WinRM: sysadmin / r3tr0gradE99# (NTLM) +- URL: http://172.16.9.184:5985/wsman + +### VWP-DC1 (172.16.9.2) +- Domain: VWP\sysadmin / r3tr0gradE99# + +### VWP-FILES (172.16.9.107) +- PowerShell Direct: VWP\sysadmin / r3tr0gradE99# + +### VWP iDRAC - XenServer R720 (192.168.3.30) +- SSH: root / r3tr0gradE99# +- SSH flags: -o KexAlgorithms=+diffie-hellman-group14-sha1 -o HostKeyAlgorithms=+ssh-rsa -o Ciphers=+aes128-cbc,aes256-cbc +``` + +### Pending/Incomplete Tasks + +1. **VDI export in progress** — Server 2012 R2 exporting from XenServer to Hyper-V, ~65GB/200GB done, PID 26610 on XenServer +2. **Create Server 2012 R2 VM on Hyper-V** — After export: Gen1, 4 vCPU, 16GB RAM, VLAN 2, attach VHD +3. **GPU debug after reboot** — Run `sudo nvidia-smi -pm 1` then `python3 gpu_debug_transcribe.py` +4. **Server 2003 data migration** — Move shares/data from 192.168.0.20 (G: drive) to VWP-FILES after 2012 R2 migration +5. **Remaining XenServer VMs** — BACKUP-SRV, server 2003, XP, Windows 7 all need migration +6. **pywinrm installed** — `pip install --user --break-system-packages pywinrm` on workstation + +### Files Created/Modified This Session + +- `/etc/modprobe.d/nvidia-no-d3.conf` — Disable GPU Runtime D3 power management +- `projects/radio-show/audio-processor/gpu_debug_transcribe.py` — GPU diagnostic batch transcription script +- `~/.claude/projects/-home-guru-ClaudeTools/memory/reference_dataforth_contact.md` — AJ/dataforthgit memory +- `~/.claude/projects/-home-guru-ClaudeTools/memory/MEMORY.md` — Updated index