From 03b930a83bcf20e00262d06bca57383c24a7f073 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Mon, 27 Apr 2026 14:42:21 -0700 Subject: [PATCH] sync: auto-sync from GURU-BEAST-ROG at 2026-04-27 14:42:18 Author: Mike Swanson Machine: GURU-BEAST-ROG Timestamp: 2026-04-27 14:42:18 --- .../machine_windows_guru_setup_status.md | 4 +- .claude/scheduled_tasks.lock | 2 +- .../audio-processor/download_test_episodes.py | 38 +++++++ .../2026-04-27-4090-benchmark-and-test-set.md | 99 +++++++++++++++++++ 4 files changed, 140 insertions(+), 3 deletions(-) create mode 100644 projects/radio-show/audio-processor/download_test_episodes.py create mode 100644 projects/radio-show/audio-processor/session-logs/2026-04-27-4090-benchmark-and-test-set.md diff --git a/.claude/memory/machine_windows_guru_setup_status.md b/.claude/memory/machine_windows_guru_setup_status.md index 30727c44..aa6736fc 100644 --- a/.claude/memory/machine_windows_guru_setup_status.md +++ b/.claude/memory/machine_windows_guru_setup_status.md @@ -7,7 +7,7 @@ type: reference # Windows Machine Setup Status (GURU-BEAST-ROG) **Created:** 2026-03-23 -**Updated:** 2026-04-26 +**Updated:** 2026-04-27 **Machine:** GURU-BEAST-ROG (Windows 11 Pro, i9-14900K, 128GB DDR5, RTX 4090) ## Software Status @@ -50,4 +50,4 @@ type: reference ## Remaining TODO - [ ] Deploy SSH pubkey to infrastructure servers (OwnCloud, Jupiter, etc.) -- [ ] Vault rotation to add GURU-BEAST-ROG's own age key as recipient (currently using shared ACG-5070 key) +- [x] ~~Vault rotation to add GURU-BEAST-ROG's own age key as recipient~~ — Completed 2026-04-27 (vault commit 73de020). Public key `age17nqczmkmnqj970v96w6wsyu72556psmrzhps8vm90fn67p8vqu4s3ze4ms` added to `keys/recipients.txt` and `.sops.yaml` creation rules. 
diff --git a/.claude/scheduled_tasks.lock b/.claude/scheduled_tasks.lock index 0af41499..91cd9b52 100644 --- a/.claude/scheduled_tasks.lock +++ b/.claude/scheduled_tasks.lock @@ -1 +1 @@ -{"sessionId":"d6600899-b6a9-4073-b362-d7d5aa0dd8dd","pid":6332,"acquiredAt":1777065966112} \ No newline at end of file +{"sessionId":"4d2b9d2c-b660-489f-8598-0a87605389c2","pid":45148,"acquiredAt":1777321662742} \ No newline at end of file diff --git a/projects/radio-show/audio-processor/download_test_episodes.py b/projects/radio-show/audio-processor/download_test_episodes.py new file mode 100644 index 00000000..2b2c5aa5 --- /dev/null +++ b/projects/radio-show/audio-processor/download_test_episodes.py @@ -0,0 +1,38 @@ +import os +import sys +import paramiko + +password = os.environ.get('IX_PASSWORD') +if not password: + print('IX_PASSWORD env var not set', file=sys.stderr) + sys.exit(1) + +client = paramiko.SSHClient() +client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) +client.connect('172.16.3.10', username='root', password=password, + look_for_keys=False, allow_agent=False, timeout=30) +sftp = client.open_sftp() + +os.makedirs('test-data/episodes', exist_ok=True) + +downloads = [ + ('/home/gurushow/public_html/archive/2011/3-12-11 HR 1.mp3', 'test-data/episodes/2011-03-12-hr1.mp3'), + ('/home/gurushow/public_html/archive/2012/3 - March/3-10-12HR1.mp3', 'test-data/episodes/2012-03-10-hr1.mp3'), + ('/home/gurushow/public_html/archive/2012/6 - June/6-9-12-HR1.mp3', 'test-data/episodes/2012-06-09-hr1.mp3'), + ('/home/gurushow/public_html/archive/2014/06/s6e19.mp3', 'test-data/episodes/2014-s6e19.mp3'), + ('/home/gurushow/public_html/archive/2016/06/s8e43.mp3', 'test-data/episodes/2016-s8e43.mp3'), + ('/home/gurushow/public_html/archive/2017/04/s9e30.mp3', 'test-data/episodes/2017-s9e30.mp3'), +] + +for remote, local in downloads: + if os.path.exists(local): + print(f'[skip] {local} already exists') + continue + size_mb = sftp.stat(remote).st_size / 1024 / 1024 + 
print(f'Downloading {local} ({size_mb:.1f} MB)...', flush=True) + sftp.get(remote, local) + print(' done', flush=True) + +sftp.close() +client.close() +print('All downloads complete.') diff --git a/projects/radio-show/audio-processor/session-logs/2026-04-27-4090-benchmark-and-test-set.md b/projects/radio-show/audio-processor/session-logs/2026-04-27-4090-benchmark-and-test-set.md new file mode 100644 index 00000000..2a645822 --- /dev/null +++ b/projects/radio-show/audio-processor/session-logs/2026-04-27-4090-benchmark-and-test-set.md @@ -0,0 +1,99 @@ +# Session Log — 2026-04-27 (continuation) + +**Project:** The Computer Guru Show — Archive Mining System +**Goal:** RTX 4090 perf comparison + run unseen test episodes through full pipeline (transcribe / diarize / Q&A) +**Machine:** GURU-BEAST-ROG (RTX 4090, 24GB) +**User:** Mike Swanson (mike) + +Companion to `2026-04-27-diarization-pipeline.md` (DESKTOP-0O8A1RL, RTX 5070 Ti). + +--- + +## Headline + +**Diarization on RTX 4090: 308.9x realtime — 2.07x the RTX 5070 Ti baseline (149.5x).** + +21,374s of audio across 6 unseen test episodes diarized in 69.2s wall time. + +--- + +## Setup Notes + +- ffmpeg/ffprobe not present on GURU-BEAST-ROG. Installed `Gyan.FFmpeg 8.1` via winget. The voice profiler shells out to ffprobe for duration; without it the pipeline crashes on the first episode. +- The repo already contained `benchmark.py` (transcribe + diarize + Q&A on `test-data/episodes/`, hardcoded 5070 Ti baseline). Used as-is. (BENCH_SETUP.md should mention ffmpeg as a prereq.) +- Voice profiles, training data, and test MP3s were already synced to this machine via the prior auto-sync. 
+ +--- + +## Phase 1 — Whisper Transcription (large-v3, faster-whisper) + +| Episode | Audio | Wall | RTF | +|---|---|---|---| +| 2011-03-12-hr1 | 2509s | 198.2s | 12.7x | +| 2012-03-10-hr1 | 2634s | 208.7s | 12.6x | +| 2012-06-09-hr1 | 2648s | 192.5s | 13.8x | +| 2014-s6e19 | 2914s | 167.0s | 17.5x | +| 2016-s8e43 | 5326s | 339.1s | 15.7x | +| 2017-s9e30 | 5343s | 341.2s | 15.7x | +| **Total** | **21374s** | **1446.6s** | **14.8x** | + +Faster-whisper large-v3, beam_size=5, fp16 on the 4090. + +--- + +## Phase 2 — Diarization + +| Episode | Audio | Wall | RTF | Turns | HOST | CALLER | +|---|---|---|---|---|---|---| +| 2011-03-12-hr1 | 2509s | 16.1s | 155.6x | 19 | 2470s | 125s | +| 2012-03-10-hr1 | 2634s | 7.3s | 361.6x | 19 | 2615s | 105s | +| 2012-06-09-hr1 | 2648s | 7.8s | 338.3x | 11 | 2500s | 195s | +| 2014-s6e19 | 2914s | 8.3s | 352.6x | 28 | 2635s | 410s | +| 2016-s8e43 | 5326s | 14.7s | 361.8x | 112 | 4710s | 1170s | +| 2017-s9e30 | 5343s | 15.0s | 356.9x | 55 | 4950s | 660s | +| **Total** | **21374s** | **69.2s** | **308.9x** | 244 | 19880s | 2665s | + +**vs RTX 5070 Ti baseline: 149.5x → 308.9x (+159.4x, +106.6%).** + +Episode 1 carries the cold-start penalty (CUDA init + WavLM load): 155.6x. Warm episodes 2-6 cluster at 338-362x. The 308.9x total includes that cold-start episode; the 5070 Ti baseline (149.5x) also included its first-episode cold start, so this is a fair comparison. + +--- + +## Phase 3 — Q&A Extraction + +| Episode | Q&A pairs | +|---|---| +| 2011-03-12-hr1 | 3 | +| 2012-03-10-hr1 | 2 | +| 2012-06-09-hr1 | 3 | +| 2014-s6e19 | 1 | +| 2016-s8e43 | 5 | +| 2017-s9e30 | 5 | +| **Total** | **19** | + +Density: **3.2 pairs/episode** on the unseen test set vs **3.0 pairs/episode** on the 9-episode training set (27 pairs). Pair count generalizes — no evidence of overfitting, and the promo/bumper filter from the earlier session continues to suppress false positives on unseen content. 
+ +The 2014-s6e19 outlier (1 pair / 410s caller time) likely reflects show content rather than a pipeline issue — caller segments don't always parse as cleanly into Q-then-A structure. Worth ear-checking that one before drawing conclusions. + +--- + +## Generalization Findings + +- **Untrained year:** The two 2012 episodes (year never seen during training) produced clean HOST/CALLER labels and reasonable Q&A counts. Voice profile composite generalizes across the production-era boundary. +- **No all-HOST failures:** Every test episode hit caller segments. The 0.85 threshold + identification fix from the prior session hold up on unseen content. +- **Show duration scaling:** Both 89-minute episodes (s8e43, s9e30) hit ~360x realtime, indicating diarization wall time is dominated by audio duration, not turn count. + +--- + +## Files Written + +- `test-data/transcripts/<episode>/transcript.json` (6 files) +- `test-data/transcripts/<episode>/diarization.json` (6 files) + +No archive DB on this machine — test-set diarization is not patched anywhere. If we want the test episodes searchable in `archive.db`, that would happen on DESKTOP-0O8A1RL where the index lives. + +--- + +## Note for Mike + +`BENCH_SETUP.md` Step 2 (Python environment) should add `winget install Gyan.FFmpeg` (or equivalent) — the script silently fails at the first diarize call without ffprobe on PATH. Easy doc fix; flagging here so it doesn't get lost.