diff --git a/projects/radio-show/audio-processor/download_test_episodes.py b/projects/radio-show/audio-processor/download_test_episodes.py index 2b2c5aa..f37715b 100644 --- a/projects/radio-show/audio-processor/download_test_episodes.py +++ b/projects/radio-show/audio-processor/download_test_episodes.py @@ -16,12 +16,15 @@ sftp = client.open_sftp() os.makedirs('test-data/episodes', exist_ok=True) downloads = [ - ('/home/gurushow/public_html/archive/2011/3-12-11 HR 1.mp3', 'test-data/episodes/2011-03-12-hr1.mp3'), - ('/home/gurushow/public_html/archive/2012/3 - March/3-10-12HR1.mp3', 'test-data/episodes/2012-03-10-hr1.mp3'), - ('/home/gurushow/public_html/archive/2012/6 - June/6-9-12-HR1.mp3', 'test-data/episodes/2012-06-09-hr1.mp3'), - ('/home/gurushow/public_html/archive/2014/06/s6e19.mp3', 'test-data/episodes/2014-s6e19.mp3'), - ('/home/gurushow/public_html/archive/2016/06/s8e43.mp3', 'test-data/episodes/2016-s8e43.mp3'), - ('/home/gurushow/public_html/archive/2017/04/s9e30.mp3', 'test-data/episodes/2017-s9e30.mp3'), + ('/home/gurushow/public_html/archive/2010/COMPUTER GURU 5-8-10 hour 1.mp3', 'test-data/episodes/2010-05-08-hr1.mp3'), + ('/home/gurushow/public_html/archive/2011/3-12-11 HR 1.mp3', 'test-data/episodes/2011-03-12-hr1.mp3'), + ('/home/gurushow/public_html/archive/2012/3 - March/3-10-12HR1.mp3', 'test-data/episodes/2012-03-10-hr1.mp3'), + ('/home/gurushow/public_html/archive/2012/6 - June/6-9-12-HR1.mp3', 'test-data/episodes/2012-06-09-hr1.mp3'), + ('/home/gurushow/public_html/archive/2014/06/s6e19.mp3', 'test-data/episodes/2014-s6e19.mp3'), + ('/home/gurushow/public_html/archive/2015/01/s7e19.mp3', 'test-data/episodes/2015-s7e19.mp3'), + ('/home/gurushow/public_html/archive/2016/06/s8e43.mp3', 'test-data/episodes/2016-s8e43.mp3'), + ('/home/gurushow/public_html/archive/2017/04/s9e30.mp3', 'test-data/episodes/2017-s9e30.mp3'), + ('/home/gurushow/public_html/archive/2018/01/s10e18.mp3', 'test-data/episodes/2018-s10e18.mp3'), ] for remote, local in 
downloads: diff --git a/projects/radio-show/audio-processor/session-logs/2026-04-27-4090-benchmark-and-test-set.md b/projects/radio-show/audio-processor/session-logs/2026-04-27-4090-benchmark-and-test-set.md index b8ef1be..fa7ef85 100644 --- a/projects/radio-show/audio-processor/session-logs/2026-04-27-4090-benchmark-and-test-set.md +++ b/projects/radio-show/audio-processor/session-logs/2026-04-27-4090-benchmark-and-test-set.md @@ -121,18 +121,47 @@ archive.db is not on this machine — index update happens on DESKTOP-0O8A1RL. --- -## Tara distribution across the test set (post-rename diarization) +## Per-year test set (one episode per year, expanded) -After the rename, the diarizer's per-episode `speaker_map` shows Tara in **all 6** test episodes — well beyond the 2014+2016 the 5070 Ti session log claimed. +Mike asked to expand from the original 6 to one episode per year. Added: +- 2010: `2010-05-08-hr1.mp3` (May 2010, earliest available; avoids training's Oct 2) +- 2015: `2015-s7e19.mp3` (Jan 2015; avoids training's s7e30) +- 2018: `2018-s10e18.mp3` (only 3 non-training episodes exist for 2018) -| Episode | Tara (seconds) | % of audio | Read | -|---|---|---|---| -| 2011-03-12-hr1 | 140s (2:20) | 5.6% | likely false positive — Mike confirms 2011 was pure call-in | -| 2012-03-10-hr1 | 30s (0:30) | 1.1% | likely false positive — 2012 was pure call-in | -| 2012-06-09-hr1 | 340s (5:40) | 12.8% | suspicious — too much for noise; awaiting Mike confirm | -| 2014-s6e19 | 680s (11:20) | 23.3% | confirmed (Mike) | -| 2016-s8e43 | 1890s (31:30) | 35.5% | confirmed (Mike) | -| 2017-s9e30 | 610s (10:10) | 11.4% | plausible — pending Mike confirm; 5070 Ti log only listed Tara in 2014+2016 | +Archive has no 2019 directory (years 2010-2018, no 2013 either). Rob's "2018/2019 appearances" are constrained to the 5 available 2018 episodes only. 
+ +### Diarization across all 9 episodes + +| Year | Episode | Audio | Tara | % | HOST | CALLER (suspect) | Q&A | +|---|---|---|---|---|---|---|---| +| 2010 | 05-08-hr1 | 42:57 | 0:30 | 1.2% | 2325s | **355s** | 4 | +| 2011 | 03-12-hr1 | 41:49 | 2:20 | 5.6% | 2455s | 70s | 1 | +| 2012 | 03-10-hr1 | 43:54 | 0:30 | 1.1% | 2615s | 90s | 2 | +| 2012 | 06-09-hr1 | 44:08 | 5:40 | 12.8% | 2500s | 10s | 0 | +| 2014 | s6e19 | 48:34 | 11:20 | 23.3% | 2625s | 30s | 0 | +| 2015 | s7e19 | 47:13 | 4:40 | 9.9% | 2690s | 45s | 1 | +| 2016 | s8e43 | 88:46 | 31:30 | 35.5% | 4615s | 140s | 2 | +| 2017 | s9e30 | 89:03 | 10:10 | 11.4% | 4945s | 350s | 4 | +| 2018 | s10e18 | 85:45 | 14:40 | 17.1% | 4745s | 230s | 3 | +| **Total** | | **8h 52m** | **1h 21m** | **15.3%** | | **1320s** | **17** | + +### Read on each row + +| Episode | Tara reading | +|---|---| +| 2010-05-08-hr1 | likely false positive (30s); 2010 was pre-Tara; could be Randall or a producer | +| 2011-03-12-hr1 | likely false positive; 2011 was pure call-in per Mike | +| 2012-03-10-hr1 | likely false positive; 2012 was pure call-in per Mike | +| 2012-06-09-hr1 | suspicious (5:40 is too much for noise); pending Mike spot-check | +| 2014-s6e19 | confirmed Tara | +| 2015-s7e19 | substantial (4:40) — plausibly Tara was on early 2015; Mike to confirm | +| 2016-s8e43 | confirmed Tara | +| 2017-s9e30 | plausible Tara (or another co-host); Mike to confirm | +| 2018-s10e18 | **could be Rob, not Tara** — Mike flagged Rob for 2018/2019 appearances. The cosine-similarity threshold may be matching Rob as Tara because the two co-hosts have similar acoustic properties. Worth having Mike spot-check a sample. | + +### Q&A counts caveat + +The Q&A column is still suspect because **every voice that isn't Mike-or-Tara is labeled CALLER**, including Randall, Rob, and any on-air producer (Andrew/Shannon/Ken/etc.). The 2010 episode in particular shows 355s CALLER and 4 Q&A — but per Mike's roster, that CALLER bucket likely includes a co-host or producer, not real callers. 
Spot-check before treating early-years Q&A as ground truth. **Mike's broader correction (2026-04-27):** - **Co-hosts** rotated through over the years. Confirmed: Tara, Randall (early years), Rob (early years + occasional 2018/2019). diff --git a/projects/radio-show/audio-processor/test-data/transcripts/2011-03-12-hr1/diarization.json b/projects/radio-show/audio-processor/test-data/transcripts/2011-03-12-hr1/diarization.json index b022683..63cb04b 100644 --- a/projects/radio-show/audio-processor/test-data/transcripts/2011-03-12-hr1/diarization.json +++ b/projects/radio-show/audio-processor/test-data/transcripts/2011-03-12-hr1/diarization.json @@ -1,9 +1,9 @@ { "num_speakers": 3, "speaker_map": { + "CALLER": "CALLER", "HOST": "HOST", - "CO-HOST": "CO-HOST", - "CALLER": "CALLER" + "CO-HOST": "CO-HOST" }, "turns": [ { diff --git a/projects/radio-show/audio-processor/test-data/transcripts/2012-03-10-hr1/diarization.json b/projects/radio-show/audio-processor/test-data/transcripts/2012-03-10-hr1/diarization.json index 789d23f..b69e922 100644 --- a/projects/radio-show/audio-processor/test-data/transcripts/2012-03-10-hr1/diarization.json +++ b/projects/radio-show/audio-processor/test-data/transcripts/2012-03-10-hr1/diarization.json @@ -1,9 +1,9 @@ { "num_speakers": 3, "speaker_map": { + "CALLER": "CALLER", "HOST": "HOST", - "CO-HOST": "CO-HOST", - "CALLER": "CALLER" + "CO-HOST": "CO-HOST" }, "turns": [ { diff --git a/projects/radio-show/audio-processor/test-data/transcripts/2012-06-09-hr1/diarization.json b/projects/radio-show/audio-processor/test-data/transcripts/2012-06-09-hr1/diarization.json index fa16b08..f5ea9df 100644 --- a/projects/radio-show/audio-processor/test-data/transcripts/2012-06-09-hr1/diarization.json +++ b/projects/radio-show/audio-processor/test-data/transcripts/2012-06-09-hr1/diarization.json @@ -1,9 +1,9 @@ { "num_speakers": 3, "speaker_map": { + "CALLER": "CALLER", "HOST": "HOST", - "CO-HOST": "CO-HOST", - "CALLER": "CALLER" + "CO-HOST": "CO-HOST" }, 
"turns": [ { diff --git a/projects/radio-show/audio-processor/test-data/transcripts/2014-s6e19/diarization.json b/projects/radio-show/audio-processor/test-data/transcripts/2014-s6e19/diarization.json index b2a19ae..023b81b 100644 --- a/projects/radio-show/audio-processor/test-data/transcripts/2014-s6e19/diarization.json +++ b/projects/radio-show/audio-processor/test-data/transcripts/2014-s6e19/diarization.json @@ -1,9 +1,9 @@ { "num_speakers": 3, "speaker_map": { - "CO-HOST": "CO-HOST", "CALLER": "CALLER", - "HOST": "HOST" + "HOST": "HOST", + "CO-HOST": "CO-HOST" }, "turns": [ { diff --git a/projects/radio-show/audio-processor/test-data/transcripts/2016-s8e43/diarization.json b/projects/radio-show/audio-processor/test-data/transcripts/2016-s8e43/diarization.json index 4232d1c..4fdcac2 100644 --- a/projects/radio-show/audio-processor/test-data/transcripts/2016-s8e43/diarization.json +++ b/projects/radio-show/audio-processor/test-data/transcripts/2016-s8e43/diarization.json @@ -1,9 +1,9 @@ { "num_speakers": 3, "speaker_map": { - "CO-HOST": "CO-HOST", "CALLER": "CALLER", - "HOST": "HOST" + "HOST": "HOST", + "CO-HOST": "CO-HOST" }, "turns": [ { diff --git a/projects/radio-show/audio-processor/test-data/transcripts/2017-s9e30/diarization.json b/projects/radio-show/audio-processor/test-data/transcripts/2017-s9e30/diarization.json index 2b30f53..b655ed2 100644 --- a/projects/radio-show/audio-processor/test-data/transcripts/2017-s9e30/diarization.json +++ b/projects/radio-show/audio-processor/test-data/transcripts/2017-s9e30/diarization.json @@ -1,9 +1,9 @@ { "num_speakers": 3, "speaker_map": { + "CALLER": "CALLER", "HOST": "HOST", - "CO-HOST": "CO-HOST", - "CALLER": "CALLER" + "CO-HOST": "CO-HOST" }, "turns": [ {