sync: Auto-sync from acg-guru-5070 at 2026-03-21 16:34:05
Synced files: - Session logs updated - Latest context and credentials - Command/directive updates Machine: acg-guru-5070 Timestamp: 2026-03-21 16:34:05 Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
294
projects/radio-show/audio-processor/gpu_debug_transcribe.py
Normal file
294
projects/radio-show/audio-processor/gpu_debug_transcribe.py
Normal file
@@ -0,0 +1,294 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""GPU-monitored batch transcription with diagnostics.
|
||||||
|
|
||||||
|
Monitors GPU health before, during, and after each episode transcription.
|
||||||
|
Logs temperature, power, utilization, and memory to detect what triggers
|
||||||
|
the NVRM rpcSendMessage failure (status 0x00000062).
|
||||||
|
"""
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import signal
|
||||||
|
import threading
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Every run writes to its own timestamped file inside this directory
# (created relative to the current working directory if missing).
LOG_DIR = Path("gpu-debug-logs")
LOG_DIR.mkdir(exist_ok=True)
LOG_FILE = LOG_DIR / datetime.now().strftime("gpu_monitor_%Y%m%d_%H%M%S.log")

# Episodes to transcribe (remaining ones), processed in this order.
EPISODES = [
    "training-data/episodes/2011-06-04-hr1.mp3",
    "training-data/episodes/2011-09-10-hr1.mp3",
    "training-data/episodes/2014-s6e05.mp3",
    "training-data/episodes/2015-s7e30.mp3",
    "training-data/episodes/2016-s8e42.mp3",
    "training-data/episodes/2017-s9e26.mp3",
    "training-data/episodes/2018-s10e17.mp3",
    "training-data/episodes/2018-s10e21.mp3",
]

# Set once the batch is finished so the background monitor loop exits.
stop_monitor = threading.Event()
|
||||||
|
|
||||||
|
|
||||||
|
def log(msg: str) -> None:
    """Print a timestamped message and append it to the run's log file.

    Args:
        msg: Text to record. It is prefixed with the current local time
            (millisecond resolution) in ``[HH:MM:SS.mmm]`` form.
    """
    ts = datetime.now().strftime("%H:%M:%S.%f")[:-3]
    line = f"[{ts}] {msg}"
    # Flush immediately: this script exists to diagnose hangs, so the last
    # lines before a lock-up must not sit in a stdout buffer.
    print(line, flush=True)
    # Explicit encoding so the log does not depend on the process locale;
    # the file is reopened per call so each line is on disk once we return.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
def gpu_query() -> dict | None:
|
||||||
|
"""Query GPU stats via nvidia-smi. Returns None if GPU is in error state."""
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
["nvidia-smi",
|
||||||
|
"--query-gpu=temperature.gpu,power.draw,utilization.gpu,utilization.memory,"
|
||||||
|
"memory.used,memory.total,clocks.current.sm,clocks.current.memory,"
|
||||||
|
"pstate,fan.speed",
|
||||||
|
"--format=csv,noheader,nounits"],
|
||||||
|
capture_output=True, text=True, timeout=5
|
||||||
|
)
|
||||||
|
if result.returncode != 0:
|
||||||
|
return None
|
||||||
|
parts = [p.strip() for p in result.stdout.strip().split(",")]
|
||||||
|
# Check for ERR! or [N/A] in any field
|
||||||
|
if any("ERR" in p or "[N/A]" in p for p in parts[:4]):
|
||||||
|
return {"error": True, "raw": result.stdout.strip()}
|
||||||
|
return {
|
||||||
|
"temp_c": parts[0],
|
||||||
|
"power_w": parts[1],
|
||||||
|
"gpu_util": parts[2],
|
||||||
|
"mem_util": parts[3],
|
||||||
|
"mem_used_mb": parts[4],
|
||||||
|
"mem_total_mb": parts[5],
|
||||||
|
"sm_clock_mhz": parts[6],
|
||||||
|
"mem_clock_mhz": parts[7],
|
||||||
|
"pstate": parts[8],
|
||||||
|
"fan": parts[9],
|
||||||
|
"error": False,
|
||||||
|
}
|
||||||
|
except (subprocess.TimeoutExpired, Exception) as e:
|
||||||
|
return {"error": True, "raw": str(e)}
|
||||||
|
|
||||||
|
|
||||||
|
def gpu_health_check() -> bool:
    """Probe the GPU once and report whether the probe came back clean.

    Returns:
        True when ``gpu_query()`` produced a result without its ``error``
        flag set; otherwise the result is logged and False is returned.
    """
    stats = gpu_query()
    healthy = stats is not None and not stats.get("error")
    if not healthy:
        log(f"GPU ERROR: {stats}")
    return healthy
|
||||||
|
|
||||||
|
|
||||||
|
def gpu_status_str(stats: dict) -> str:
    """Render a GPU stats dict as a single log-friendly line.

    Args:
        stats: Mapping with an optional ``error`` flag. When the flag is
            truthy only ``raw`` is used; otherwise all reading keys
            (``temp_c`` ... ``fan``) must be present.

    Returns:
        ``"ERR! raw=..."`` for an error result, else a space-separated
        summary of every reading.
    """
    if stats.get("error"):
        raw = stats.get("raw", "unknown")
        return f"ERR! raw={raw}"

    segments = [
        f"T={stats['temp_c']}C",
        f"P={stats['power_w']}W",
        f"GPU={stats['gpu_util']}%",
        f"MEM={stats['mem_util']}%",
        f"VRAM={stats['mem_used_mb']}/{stats['mem_total_mb']}MB",
        f"SM={stats['sm_clock_mhz']}MHz",
        f"MEMCLK={stats['mem_clock_mhz']}MHz",
        f"PState={stats['pstate']}",
        f"Fan={stats['fan']}",
    ]
    return " ".join(segments)
|
||||||
|
|
||||||
|
|
||||||
|
def _log_recent_nvrm_dmesg(limit: int = 5):
    """Log the last *limit* NVIDIA-related err/warn kernel messages (best effort)."""
    try:
        result = subprocess.run(
            # -n: never prompt for a password; this runs in a background
            # thread where nobody can answer the prompt.
            ["sudo", "-n", "dmesg", "-T", "--level=err,warn"],
            capture_output=True, text=True, timeout=5
        )
    except Exception as e:
        # Best effort only, but say why nothing was captured.
        log(f"DMESG: unavailable ({type(e).__name__}: {e})")
        return

    if result.returncode != 0:
        log(f"DMESG: unavailable (exit {result.returncode}): {result.stderr.strip()}")
        return

    nvrm_lines = [entry for entry in result.stdout.splitlines()
                  if "NVRM" in entry or "nvidia" in entry.lower()]
    for entry in nvrm_lines[-limit:]:
        log(f"DMESG: {entry}")


def monitor_thread(interval: float = 2.0):
    """Background thread that logs GPU stats at regular intervals.

    Runs until ``stop_monitor`` is set. Each tick logs one status line.
    The first tick on which the GPU is found in an error state (including
    a failed ``nvidia-smi`` call) additionally logs a banner and the most
    recent NVIDIA kernel messages; this is repeated only if the GPU
    recovers and fails again, so a long outage does not flood the log.

    Args:
        interval: Seconds to wait between probes.
    """
    in_error = False
    while not stop_monitor.is_set():
        stats = gpu_query()
        if stats is None:
            # gpu_query() returns None when nvidia-smi exits non-zero; that
            # is itself a failure signal and must not pass silently.
            log("MONITOR: nvidia-smi query failed (non-zero exit)")
            failed = True
        else:
            log(f"MONITOR: {gpu_status_str(stats)}")
            failed = bool(stats.get("error"))

        if failed and not in_error:
            log("MONITOR: GPU ENTERED ERROR STATE!")
            # Check dmesg for the smoking gun
            _log_recent_nvrm_dmesg()
        in_error = failed

        # Event.wait doubles as an interruptible sleep.
        stop_monitor.wait(interval)
|
||||||
|
|
||||||
|
|
||||||
|
def check_runtime_d3(pci_addr: str = "0000:02:00.0"):
    """Check and log Runtime D3 power management status.

    Logs the NVIDIA driver's ``power`` report (when that proc file exists)
    and the PCI device's ``runtime_status``, ``control`` and
    ``runtime_enabled`` power attributes. Each attribute is read
    independently, so one unreadable file does not hide the others.

    Args:
        pci_addr: PCI address (``domain:bus:device.function``) of the GPU to
            inspect. Defaults to the address this script was written for.
    """
    try:
        power_file = Path(f"/proc/driver/nvidia/gpus/{pci_addr}/power")
        if power_file.exists():
            log(f"GPU Power Management:\n{power_file.read_text()}")
    except OSError as e:
        log(f"Power check error: {e}")

    # Check if dynamic power management is enabled
    sysfs_power = Path(f"/sys/bus/pci/devices/{pci_addr}/power")
    attributes = (
        ("PCI runtime_status", "runtime_status"),
        ("PCI power control", "control"),
        ("PCI runtime_enabled", "runtime_enabled"),
    )
    for label, attr in attributes:
        try:
            value = (sysfs_power / attr).read_text().strip()
        except OSError as e:
            # Report the reason instead of logging a blank value.
            value = f"unavailable ({e})"
        log(f"{label}: {value}")
|
||||||
|
|
||||||
|
|
||||||
|
def check_nvidia_persistence():
    """Log the persistence-mode value reported by ``nvidia-smi``.

    Any exception raised while querying or logging is reported as a
    "Persistence check error" line instead of propagating.
    """
    query_cmd = [
        "nvidia-smi",
        "--query-gpu=persistence_mode",
        "--format=csv,noheader",
    ]
    try:
        completed = subprocess.run(
            query_cmd, capture_output=True, text=True, timeout=5
        )
        mode = completed.stdout.strip()
        log(f"Persistence mode: {mode}")
    except Exception as e:
        log(f"Persistence check error: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def _cuda_preflight() -> bool:
    """Run a tiny CUDA matmul to confirm the device accepts work; log the outcome."""
    log("PRE-FLIGHT: Testing CUDA...")
    try:
        import torch
        if not torch.cuda.is_available():
            log("PRE-FLIGHT FAIL: torch.cuda.is_available() = False")
            return False
        # Small allocation test
        x = torch.randn(100, 100, device="cuda")
        y = x @ x
        del x, y
        torch.cuda.synchronize()
        torch.cuda.empty_cache()
        log(f"PRE-FLIGHT: CUDA OK, allocated={torch.cuda.memory_allocated() / 1024**2:.0f}MB")
    except Exception as e:
        log(f"PRE-FLIGHT FAIL: CUDA test error: {e}")
        return False
    return True


def _cuda_cooldown():
    """Release cached CUDA memory (best effort), then idle for 10 seconds."""
    try:
        import torch
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    except Exception as e:
        # Still best effort, but a failing cleanup is itself a clue when
        # hunting a GPU fault, so record it instead of discarding it.
        log(f"COOLDOWN: CUDA cleanup failed: {type(e).__name__}: {e}")

    log("COOLDOWN: Waiting 10s between episodes...")
    time.sleep(10)


def transcribe_one(episode_path: str) -> bool:
    """Transcribe a single episode with GPU health monitoring.

    Steps: skip if a transcript already exists, probe the GPU, run a small
    CUDA test, transcribe and save, probe the GPU again, then cool down.

    Args:
        episode_path: Path of the audio file. Its stem names the output
            directory under ``training-data/transcripts/``.

    Returns:
        True if the episode was already transcribed or was transcribed and
        saved now. A GPU error detected *after* a successful save is logged
        but still counts as success. False if a pre-flight check or the
        transcription itself failed.
    """
    name = Path(episode_path).stem
    output_dir = Path(f"training-data/transcripts/{name}")

    # transcript.json can only exist if its directory does, so a single
    # check is enough.
    if (output_dir / "transcript.json").exists():
        log(f"SKIP: {name} already transcribed")
        return True

    # Pre-flight GPU check
    log(f"PRE-FLIGHT: Checking GPU before {name}")
    stats = gpu_query()
    if not stats or stats.get("error"):
        log(f"PRE-FLIGHT FAIL: GPU already in error state! Stats: {stats}")
        return False
    log(f"PRE-FLIGHT: {gpu_status_str(stats)}")

    # Quick CUDA test
    if not _cuda_preflight():
        return False

    # Transcribe
    log(f"START: {name} ({episode_path})")
    # Monotonic clock: the duration must not be skewed by wall-clock jumps.
    start_time = time.monotonic()

    try:
        # Imported lazily so the heavy transcriber only loads once the GPU
        # has passed its pre-flight checks.
        from src.transcriber import transcribe
        transcript = transcribe(episode_path)
        transcript.save(output_dir)
        elapsed = time.monotonic() - start_time
        log(f"DONE: {name} in {elapsed:.1f}s ({elapsed/60:.1f}min), "
            f"{len(transcript.segments)} segments")
    except Exception as e:
        elapsed = time.monotonic() - start_time
        log(f"FAIL: {name} after {elapsed:.1f}s: {type(e).__name__}: {e}")

        # Post-failure GPU check
        stats = gpu_query()
        log(f"POST-FAIL: {gpu_status_str(stats) if stats else 'query failed'}")
        return False

    # Post-transcription GPU check
    stats = gpu_query()
    if stats and not stats.get("error"):
        log(f"POST: {gpu_status_str(stats)}")
    else:
        log(f"POST: GPU entered error state after transcription! {stats}")

    # Cool-down: clear CUDA cache, let GPU idle briefly
    _cuda_cooldown()

    return True
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the monitored batch transcription.

    Logs driver and power-management details, aborts with exit status 1 if
    the GPU is already unhealthy, then transcribes every episode that has
    no ``transcript.json`` yet while a background thread logs GPU stats.
    Processing stops at the first failed episode. The process exits with
    status 1 if any episode failed, matching the startup abort.
    """
    log("=" * 60)
    log("GPU Debug Batch Transcription")
    log(f"Driver: {subprocess.getoutput('nvidia-smi --query-gpu=driver_version --format=csv,noheader')}")
    log(f"CUDA version: {subprocess.getoutput('nvidia-smi --query-gpu=cuda_version --format=csv,noheader 2>/dev/null') or 'N/A'}")
    log("=" * 60)

    # Check power management
    check_runtime_d3()
    check_nvidia_persistence()

    # Initial GPU state
    stats = gpu_query()
    if not stats or stats.get("error"):
        log(f"ABORT: GPU already in error state at startup: {stats}")
        sys.exit(1)
    log(f"INITIAL: {gpu_status_str(stats)}")

    # Start background monitor (every 5 seconds during transcription)
    monitor = threading.Thread(target=monitor_thread, args=(5.0,), daemon=True)
    monitor.start()

    remaining = []
    completed = 0
    failed = 0
    try:
        # Filter to only episodes that need transcription
        for ep in EPISODES:
            name = Path(ep).stem
            out = Path(f"training-data/transcripts/{name}/transcript.json")
            if out.exists():
                log(f"ALREADY DONE: {name}")
            else:
                remaining.append(ep)

        log(f"QUEUE: {len(remaining)} episodes to transcribe")

        for ep in remaining:
            if transcribe_one(ep):
                completed += 1
                continue

            failed += 1
            log(f"STOPPING: GPU failure detected after {completed} episodes, {failed} failed")
            # Log final state
            stats = gpu_query()
            log(f"FINAL: {gpu_status_str(stats) if stats else 'query failed'}")
            break
    finally:
        # Always stop the monitor, even if an episode raised, and wait for
        # its in-flight probe so no MONITOR line lands after the summary.
        stop_monitor.set()
        monitor.join(timeout=15)

    log(f"SUMMARY: {completed} completed, {failed} failed, "
        f"{len(remaining) - completed - failed} remaining")
    log(f"Log saved to: {LOG_FILE}")

    if failed:
        # A batch that stopped on a failure must not look successful to a
        # calling shell or scheduler.
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
@@ -575,3 +575,248 @@ done
|
|||||||
```
|
```
|
||||||
|
|
||||||
Then: run speaker identification across all transcribed episodes, cluster non-host voices, begin element fingerprinting.
|
Then: run speaker identification across all transcribed episodes, cluster non-host voices, begin element fingerprinting.
|
||||||
|
|
||||||
|
## Update: 15:00 — Dataforth Email, GPU Debug, VWP Citrix→Hyper-V Migration, ScreenConnect
|
||||||
|
|
||||||
|
### Session Summary
|
||||||
|
|
||||||
|
Multi-task session: Dataforth email forwarding, GPU error diagnosis for voice training, and major VWP infrastructure migration (Citrix XenServer → Hyper-V). Installed ScreenConnect on VWP-FILES via PowerShell Direct.
|
||||||
|
|
||||||
|
### 1. Dataforth Email Forwarding (dataforthgit@)
|
||||||
|
|
||||||
|
**Task:** AJ (Angel Lopez) at Dataforth needs messages sent to dataforthgit@dataforth.com forwarded to him.
|
||||||
|
|
||||||
|
**Discovery:** `dataforthgit@dataforth.com` is an existing alias on the **Support** shared mailbox (`support@dataforth.com`).
|
||||||
|
|
||||||
|
**Solution:** Created inbox rule on Support mailbox via Graph API:
|
||||||
|
- **Rule:** "Forward dataforthgit@ to AJ Lopez"
|
||||||
|
- **Trigger:** recipientContains `dataforthgit@dataforth.com`
|
||||||
|
- **Action:** Forward to `alopez@dataforth.com`
|
||||||
|
- **Rule ID:** `AQAAAFO12jE=`
|
||||||
|
|
||||||
|
**Auth used:** Claude-MSP-Access multi-tenant app:
|
||||||
|
- Tenant ID: `7dfa3ce8-c496-4b51-ab8d-bd3dcd78b584`
|
||||||
|
- App ID: `fabb3421-8b34-484b-bc17-e46de9703418`
|
||||||
|
- Client Secret: `~QJ8Q~NyQSs4OcGqHZyPrA2CVnq9KBfKiimntbMO`
|
||||||
|
|
||||||
|
### 2. GPU Error Diagnosis (RTX 5070 Ti)
|
||||||
|
|
||||||
|
**Problem:** GPU entered error state during voice training batch transcription (same issue as previous session). `nvidia-smi` shows ERR! across all fields. The GPU failed ~40 min into transcription.
|
||||||
|
|
||||||
|
**Root cause investigation:**
|
||||||
|
- `NVRM: _issueRpcLarge: rpcSendMessage failed with status 0x00000062 for fn 76!` — repeating every 100ms
|
||||||
|
- No Xid errors in dmesg — only RPC communication failures
|
||||||
|
- **Runtime D3 (fine-grained power management) is enabled** — prime suspect for GPU hang during sustained compute
|
||||||
|
- GPU is in D0 power state, video memory active
|
||||||
|
- Error first appeared at 4335 seconds after boot (~72 min)
|
||||||
|
- `torch.cuda.is_available()` returned True initially, GPU loaded model into VRAM then failed
|
||||||
|
|
||||||
|
**Fix applied (pending reboot):**
|
||||||
|
- Created `/etc/modprobe.d/nvidia-no-d3.conf`: `options nvidia NVreg_DynamicPowerManagement=0`
|
||||||
|
- Plan: After reboot, run `sudo nvidia-smi -pm 1` (persistence mode)
|
||||||
|
|
||||||
|
**Diagnostic script created:** `projects/radio-show/audio-processor/gpu_debug_transcribe.py`
|
||||||
|
- Monitors GPU temp, power, utilization, VRAM, clocks every 5 seconds
|
||||||
|
- Pre-flight CUDA health check before each episode
|
||||||
|
- 10-second cooldown between episodes
|
||||||
|
- Stops at first GPU error and logs state
|
||||||
|
- Saves logs to `gpu-debug-logs/`
|
||||||
|
|
||||||
|
**Transcription status:** Only `2010-10-02-hr1` completed. 8 episodes remaining:
|
||||||
|
- 2011-06-04-hr1, 2011-09-10-hr1, 2014-s6e05, 2015-s7e30, 2016-s8e42, 2017-s9e26, 2018-s10e17, 2018-s10e21
|
||||||
|
|
||||||
|
**After reboot commands:**
|
||||||
|
```bash
|
||||||
|
sudo nvidia-smi -pm 1
|
||||||
|
source /home/guru/.local/share/radio-processor/bin/activate
|
||||||
|
cd /home/guru/ClaudeTools/projects/radio-show/audio-processor
|
||||||
|
python3 gpu_debug_transcribe.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. VWP Citrix XenServer → Hyper-V Migration
|
||||||
|
|
||||||
|
#### VPN Access
|
||||||
|
|
||||||
|
**Critical:** Run `sudo tailscale down` before connecting to the VWP VPN — D2TESTNAS advertises `192.168.0.0/24` (for Dataforth) over Tailscale, which conflicts with VWP's identical `192.168.0.0/24` subnet.
|
||||||
|
|
||||||
|
**Starlink subnet conflict:** Starlink was on `192.168.4.0/24`, the same subnet as the VPN tunnel. The user changed Starlink to `10.0.3.x/16` to resolve the conflict.
|
||||||
|
|
||||||
|
**Working VPN command (split tunnel):**
|
||||||
|
```bash
|
||||||
|
sudo tailscale down
|
||||||
|
sudo openvpn --config ~/Downloads/OpenVPN-Server.ovpn --auth-user-pass /etc/openvpn/vwp-auth.txt --group nobody --daemon vwp-vpn --log /tmp/vwp-vpn.log --route-noexec
|
||||||
|
# Then manually add split routes:
|
||||||
|
sudo ip route add 172.16.9.0/24 dev tun0
|
||||||
|
sudo ip route add 192.168.0.0/24 dev tun0
|
||||||
|
sudo ip route add 192.168.3.0/24 dev tun0
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key:** Must use `--route-noexec` to prevent full-tunnel `0.0.0.0/1` redirect, then manually add split routes.
|
||||||
|
|
||||||
|
#### VPN Credentials
|
||||||
|
- **Auth file:** `/etc/openvpn/vwp-auth.txt` (sysadmin / r3tr0gradE99#)
|
||||||
|
- **Remote:** 4.18.160.106:1194 TCP
|
||||||
|
- **VPN IP assigned:** 192.168.4.2 or 192.168.4.3
|
||||||
|
|
||||||
|
#### WinRM Access to Hyper-V
|
||||||
|
|
||||||
|
**Installed `pywinrm`** (`pip install --user --break-system-packages pywinrm`) for remote PowerShell via WinRM.
|
||||||
|
|
||||||
|
**WinRM enabled on VWP-HYPERV1** (user ran on console):
|
||||||
|
```powershell
|
||||||
|
Enable-PSRemoting -Force
|
||||||
|
Set-Item WSMan:\localhost\Client\TrustedHosts -Value "*" -Force
|
||||||
|
New-NetFirewallRule -DisplayName "WinRM All" -Direction Inbound -Protocol TCP -LocalPort 5985 -Action Allow
|
||||||
|
```
|
||||||
|
|
||||||
|
**Python WinRM usage:**
|
||||||
|
```python
|
||||||
|
import winrm
|
||||||
|
s = winrm.Session('http://172.16.9.184:5985/wsman', auth=('sysadmin', 'r3tr0gradE99#'), transport='ntlm')
|
||||||
|
r = s.run_ps("hostname")
|
||||||
|
print(r.std_out.decode().strip())
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Hyper-V Host Status (VWP-HYPERV1)
|
||||||
|
|
||||||
|
- **Hostname:** VWP-HYPERV1
|
||||||
|
- **IP:** 172.16.9.184
|
||||||
|
- **OS:** Windows Server 2025 Standard
|
||||||
|
- **Specs:** 64 vCPUs (Xeon Platinum 8180M), 256GB RAM, PowerEdge R740
|
||||||
|
- **Disk:** 10.5TB free on C:
|
||||||
|
- **vSwitch:** "Intel(R) Ethernet 10G 4P X550/I350 rNDC - Virtual Switch" (External, NIC1 at 1Gbps)
|
||||||
|
- **Physical NICs:** NIC1 (up, 1Gbps), NIC2/3/4 (disconnected)
|
||||||
|
- **Native VLAN:** 172.16.9.x (untagged)
|
||||||
|
|
||||||
|
**Existing VMs on Hyper-V:**
|
||||||
|
| VM | State | Gen | RAM | vCPUs |
|
||||||
|
|----|-------|-----|-----|-------|
|
||||||
|
| VWP-DC1 | Running | 2 | ~7.4GB | 56 |
|
||||||
|
| VWP-FILES | Running | 2 | 2GB | 16 |
|
||||||
|
|
||||||
|
**VLAN configuration:**
|
||||||
|
- Native/untagged: 172.16.9.0/24 (VWP LAN)
|
||||||
|
- VLAN 2: 192.168.0.0/24 (OldNet)
|
||||||
|
- VLAN 99: 192.168.3.0/24 (Mgt)
|
||||||
|
- The UDM trunks all VLANs; untagged traffic defaults to the VLAN selected on the port
|
||||||
|
|
||||||
|
#### XenServer VM Inventory (source)
|
||||||
|
|
||||||
|
| VM | OS | IP | State | vCPUs | RAM | Disk |
|
||||||
|
|----|----|----|-------|-------|-----|------|
|
||||||
|
| server 2012 R2 | Server 2012 R2 Standard | 192.168.0.19 | running | 4 | 16GB | 200GB |
|
||||||
|
| BACKUP-SRV | Server 2019 Datacenter | 192.168.0.22 | running | 2 | 15GB | 240GB |
|
||||||
|
| server 2003 | Server 2003 Enterprise SP2 | 192.168.0.20 | running | 4 | 3GB | 130GB |
|
||||||
|
| XP | Windows XP | none | running | 2 | 3GB | 40GB |
|
||||||
|
| Windows 7 (32-bit) | Windows 7 | 192.168.0.40 | halted | 2 | 4GB | 80GB |
|
||||||
|
|
||||||
|
#### Server 2012 R2 Migration (IN PROGRESS)
|
||||||
|
|
||||||
|
**VDI Export running on XenServer:**
|
||||||
|
- VDI UUID: `e65ccf95-0bc7-4530-ac91-c418e667e1de`
|
||||||
|
- VM UUID: `298da244-79b5-84ed-d6e0-694825697096`
|
||||||
|
- Export command: `xe vdi-export uuid=e65ccf95-0bc7-4530-ac91-c418e667e1de filename=/mnt/hyperv/server2012r2.vhd format=vhd`
|
||||||
|
- PID: 26610 (nohup, survives disconnects)
|
||||||
|
- Destination: `//172.16.9.184/Migration` mounted at `/mnt/hyperv` (SMBv2)
|
||||||
|
- Progress at last check: **65GB of ~200GB** (~4GB/min, ~35 min remaining)
|
||||||
|
- Transfer rate: ~4GB/min over 1Gbps link
|
||||||
|
|
||||||
|
**SMB share created on Hyper-V:**
|
||||||
|
```powershell
|
||||||
|
New-SmbShare -Name 'Migration' -Path 'C:\Migration' -FullAccess 'Everyone'
|
||||||
|
New-NetFirewallRule -DisplayName 'SMB from XenServer' -Direction Inbound -Protocol TCP -LocalPort 445 -RemoteAddress 192.168.0.0/24 -Action Allow
|
||||||
|
```
|
||||||
|
|
||||||
|
**Mount on XenServer:**
|
||||||
|
```bash
|
||||||
|
mount.cifs //172.16.9.184/Migration /mnt/hyperv -o username=sysadmin,password=r3tr0gradE99#,domain=VWP,vers=2.0
|
||||||
|
```
|
||||||
|
|
||||||
|
**Planned VM creation (after export completes):**
|
||||||
|
- Generation 1 (BIOS/MBR from XenServer)
|
||||||
|
- 4 vCPUs, 16GB RAM
|
||||||
|
- NIC on VLAN 2 (192.168.0.x)
|
||||||
|
- Attach server2012r2.vhd from C:\Migration
|
||||||
|
- Boot and install Hyper-V integration services
|
||||||
|
|
||||||
|
#### ITSvc Share (C:\Shares\ITSvc on VWP-HYPERV1)
|
||||||
|
Contains installers:
|
||||||
|
- VWP-ScreenConnect.ClientSetup.msi (27.6MB)
|
||||||
|
- VWPScreenConnect.ClientSetup.exe (19.1MB)
|
||||||
|
- Ninite installers (Chrome, Firefox, .NET, WizTree)
|
||||||
|
- ISO subfolder
|
||||||
|
|
||||||
|
### 4. ScreenConnect on VWP-FILES
|
||||||
|
|
||||||
|
**VWP-FILES VM details:**
|
||||||
|
- **Hostname:** VWP-FILES.VWP.US
|
||||||
|
- **IP:** 172.16.9.107
|
||||||
|
- **OS:** Windows Server 2019 Standard
|
||||||
|
- **Hyper-V Gen:** 2
|
||||||
|
- **RAM:** 2GB, 16 vCPUs
|
||||||
|
|
||||||
|
**PowerShell Direct credentials:** `VWP\sysadmin` / `r3tr0gradE99#`
|
||||||
|
|
||||||
|
**Installation:** MSI copied via `Copy-VMFile` (Hyper-V Guest Service Interface), installed via PowerShell Direct:
|
||||||
|
```powershell
|
||||||
|
Copy-VMFile -Name 'VWP-FILES' -SourcePath 'C:\Shares\ITSvc\VWP-ScreenConnect.ClientSetup.msi' -DestinationPath 'C:\Temp\VWP-ScreenConnect.ClientSetup.msi' -CreateFullPath -FileSource Host
|
||||||
|
Invoke-Command -VMName 'VWP-FILES' -Credential ... -ScriptBlock { Start-Process msiexec.exe -ArgumentList '/i C:\Temp\VWP-ScreenConnect.ClientSetup.msi /quiet /norestart' -Wait }
|
||||||
|
```
|
||||||
|
|
||||||
|
**Issue:** Service installed but stopped immediately — "Your host has ended the remote session." User had accidentally deleted the unit in ScreenConnect console.
|
||||||
|
|
||||||
|
**Fix:** Uninstalled (`msiexec /x ... /quiet /norestart`), reinstalled same MSI. Service now **Running**.
|
||||||
|
|
||||||
|
**Service:** `ScreenConnect Client (1912bf3444b41a08)` — connects to `instance-kgc7jt-relay.screenconnect.com:443`
|
||||||
|
|
||||||
|
### 5. Memory Saved
|
||||||
|
|
||||||
|
- `reference_dataforth_contact.md` — AJ at Dataforth, dataforthgit@ email forwarding
|
||||||
|
|
||||||
|
### Credentials Used This Session
|
||||||
|
|
||||||
|
```
|
||||||
|
### Dataforth M365 (Graph API)
|
||||||
|
- Tenant ID: 7dfa3ce8-c496-4b51-ab8d-bd3dcd78b584
|
||||||
|
- App ID: fabb3421-8b34-484b-bc17-e46de9703418
|
||||||
|
- Client Secret: ~QJ8Q~NyQSs4OcGqHZyPrA2CVnq9KBfKiimntbMO
|
||||||
|
|
||||||
|
### VWP VPN
|
||||||
|
- Auth file: /etc/openvpn/vwp-auth.txt
|
||||||
|
- User: sysadmin / r3tr0gradE99#
|
||||||
|
- Remote: 4.18.160.106:1194 TCP
|
||||||
|
|
||||||
|
### VWP XenServer (192.168.0.104)
|
||||||
|
- SSH: root / r3tr0gradE99!
|
||||||
|
- Note: $'...' quoting for !
|
||||||
|
|
||||||
|
### VWP-HYPERV1 (172.16.9.184)
|
||||||
|
- WinRM: sysadmin / r3tr0gradE99# (NTLM)
|
||||||
|
- URL: http://172.16.9.184:5985/wsman
|
||||||
|
|
||||||
|
### VWP-DC1 (172.16.9.2)
|
||||||
|
- Domain: VWP\sysadmin / r3tr0gradE99#
|
||||||
|
|
||||||
|
### VWP-FILES (172.16.9.107)
|
||||||
|
- PowerShell Direct: VWP\sysadmin / r3tr0gradE99#
|
||||||
|
|
||||||
|
### VWP iDRAC - XenServer R720 (192.168.3.30)
|
||||||
|
- SSH: root / r3tr0gradE99#
|
||||||
|
- SSH flags: -o KexAlgorithms=+diffie-hellman-group14-sha1 -o HostKeyAlgorithms=+ssh-rsa -o Ciphers=+aes128-cbc,aes256-cbc
|
||||||
|
```
|
||||||
|
|
||||||
|
### Pending/Incomplete Tasks
|
||||||
|
|
||||||
|
1. **VDI export in progress** — Server 2012 R2 exporting from XenServer to Hyper-V, ~65GB/200GB done, PID 26610 on XenServer
|
||||||
|
2. **Create Server 2012 R2 VM on Hyper-V** — After export: Gen1, 4 vCPU, 16GB RAM, VLAN 2, attach VHD
|
||||||
|
3. **GPU debug after reboot** — Run `sudo nvidia-smi -pm 1` then `python3 gpu_debug_transcribe.py`
|
||||||
|
4. **Server 2003 data migration** — Move shares/data from 192.168.0.20 (G: drive) to VWP-FILES after 2012 R2 migration
|
||||||
|
5. **Remaining XenServer VMs** — BACKUP-SRV, server 2003, XP, Windows 7 all need migration
|
||||||
|
6. **pywinrm installed** — `pip install --user --break-system-packages pywinrm` on workstation
|
||||||
|
|
||||||
|
### Files Created/Modified This Session
|
||||||
|
|
||||||
|
- `/etc/modprobe.d/nvidia-no-d3.conf` — Disable GPU Runtime D3 power management
|
||||||
|
- `projects/radio-show/audio-processor/gpu_debug_transcribe.py` — GPU diagnostic batch transcription script
|
||||||
|
- `~/.claude/projects/-home-guru-ClaudeTools/memory/reference_dataforth_contact.md` — AJ/dataforthgit memory
|
||||||
|
- `~/.claude/projects/-home-guru-ClaudeTools/memory/MEMORY.md` — Updated index
|
||||||
|
|||||||
Reference in New Issue
Block a user