radio show: co-host voice profile, Q&A extraction fixes, archive index

- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike)
- diarizer.py: add CO-HOST speaker label for cohost-role profiles
- voice_profiler.py: emit "Cohost: <name>" label for cohost role
- qa_extractor.py: overlap resolution at load time (midpoint boundary split),
  4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns),
  _preceded_by_caller_intro() helper, _PHONE_GREETING pattern,
  751-1041 + "we'll get your problem solved" promo signatures
- benchmark.py: use src.transcriber.transcribe with batch_size=16
- add index_test_episodes.py and build_cohost_profile.py scripts
- add .gitignore (exclude episodes, transcripts, *.db, .venv)
- session log: 2026-04-27-qa-extraction-cohost-indexing.md

Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs.
archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 14:41:04 -07:00
parent 79abef9dc9
commit e9ac607500
55 changed files with 649 additions and 100 deletions
--- a/projects/radio-show/audio-processor/benchmark.py
+++ b/projects/radio-show/audio-processor/benchmark.py
@@ -57,13 +57,15 @@ trans_results = []
 trans_total_audio = 0.0
 trans_total_wall  = 0.0

+import json
+from src.transcriber import transcribe as _transcribe
+
 for ep in EPISODES:
    trans_ep_dir = TRANS_DIR / ep.stem
    trans_ep_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = trans_ep_dir / "transcript.json"

    if transcript_path.exists():
-        import json
        with open(transcript_path) as f:
            td = json.load(f)
        dur = td.get("duration", 0)
@@ -74,30 +76,15 @@ for ep in EPISODES:
    console.print(f"  Transcribing {ep.name}...")
    t0 = time.monotonic()

-    from faster_whisper import WhisperModel
-    if not hasattr(sys, "_whisper_model"):
-        console.print("  [dim]Loading Whisper large-v3...[/dim]")
-        sys._whisper_model = WhisperModel("large-v3", device=device, compute_type="float16")
-
-    model = sys._whisper_model
-    segments_iter, info = model.transcribe(str(ep), language="en", beam_size=5)
-
-    import json
-    segs = []
-    for seg in segments_iter:
-        segs.append({"id": seg.id, "start": seg.start, "end": seg.end, "text": seg.text})
-
-    duration = info.duration
+    transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
    wall = time.monotonic() - t0
-    rtf  = duration / wall
+    rtf  = transcript.duration / wall

-    result = {"duration": duration, "language": "en", "segments": segs}
-    with open(transcript_path, "w") as f:
-        json.dump(result, f)
+    transcript.save(trans_ep_dir)

-    console.print(f"  [green]{ep.stem}: {duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
-    trans_results.append((ep, transcript_path, duration, wall))
-    trans_total_audio += duration
+    console.print(f"  [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
+    trans_results.append((ep, transcript_path, transcript.duration, wall))
+    trans_total_audio += transcript.duration
    trans_total_wall  += wall

 if trans_total_wall > 0: