Files
claudetools/projects/radio-show/audio-processor/benchmark.py
Mike Swanson 1b574caba4 radio: transcript-driven speaker name resolution (oracle)
New module src/speaker_oracle.py extracts speaker introductions from
transcripts ("let's talk to William", "we have Clay from the Nerd Junkies",
"in Tara's place, we have Clay", "thanks for the call <name>") and binds
them to non-HOST diarization turns. Pure post-pass on diarization JSONs,
no audio processing — corrects audio-only cosine errors using Mike's
deterministic on-air announcements.

Algorithm:
- Extract intros: regex patterns for caller pickups, guest intros,
  fill-in announcements, caller closes. Case-strict (rejects mid-sentence
  lowercase matches), with a blacklist of common false-positive words.
  Deduplicates same-name intros within 5s.
- Resolve speakers: for each non-HOST turn, find the LATEST opening intro
  at or before turn.start (with 8s forward tolerance for boundary slop).
  Later intros implicitly close earlier callers, so the most recent
  intro wins. No artificial lookback limit (callers can talk for 10+ min).
- Falls back to caller_close patterns within 30s after a turn ends.

Validation on 9-episode test set:
  2018-s10e18: Christopher 190s correctly named (was mislabeled "Tara")
  2012-06-09 : Kay 160s correctly named (was mislabeled "Tara")
  2015-s7e19 : Clay 45s as fillin for Tara, William 40s as caller
  2016-s8e43 : Charles 630s, Bruce 210s, John 205s — most callers named
  2017-s9e30 : Denise 295s, Tom 115s, Elaine 85s, Jeff 10s
  Many other callers across all episodes correctly named.

The remaining unnamed CO-HOST/CALLER time (~5-10% of non-HOST time) is
genuine co-host banter or callers whom Mike never explicitly introduced on air.

benchmark.py: adds Phase 2.5 "Name Resolution" between diarization and
Q&A extraction. Prints named-speaker breakdown per episode. Doesn't
modify diarization JSONs (resolution is computed on demand).

Next step: feed named turns into qa_extractor so Q&A pairs get caller
name attached for searchability. Also: bootstrap recurring-speaker
profiles (Tara, Tony, Rob, Randall, producers) by accumulating
intro-tagged windows across the full archive once download completes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 16:48:16 -07:00

219 lines
8.0 KiB
Python

"""
Benchmark: transcribe + diarize + Q&A extraction on every test episode
found under test-data/episodes/.
Reports per-episode and total realtime factors.
Compare to the DESKTOP-0O8A1RL (RTX 5070 Ti) diarization baseline; the
reference realtime factor is the BASELINE_RTF constant defined below.
"""
# Process bootstrap. The statement order below is deliberate: environment
# variables are set before the project imports run, and ensure_cuda_libs()
# is called before torch is imported. Do not regroup these imports.
import os
import sys
import time

# Exported to the environment; the running interpreter's stdout is switched
# to UTF-8 explicitly just below, when the stream supports reconfigure().
os.environ["PYTHONIOENCODING"] = "utf-8"
_reconfigure_stdout = getattr(sys.stdout, "reconfigure", None)
if _reconfigure_stdout is not None:
    _reconfigure_stdout(encoding="utf-8")

# NOTE(review): presumably keeps Hugging Face libraries from reaching the
# network during the benchmark — set before any src.* module is imported.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

from pathlib import Path

from src.gpu import ensure_cuda_libs

# Runs before `import torch`; presumably makes the CUDA libraries
# discoverable for torch — confirm against src/gpu.py.
ensure_cuda_libs()

import torch

from src.config import load_config
from src.diarizer import diarize, VoiceProfileStore
from src.qa_extractor import load_diarized_transcript, extract_qa_pairs

from rich.console import Console
from rich.table import Table

# Shared console used for every status line and the final results table.
console = Console()
# Reference machine and its measured diarization speed, used in the final
# comparison line.
BASELINE_RTX = "RTX 5070 Ti (DESKTOP-0O8A1RL)"
# Realtime factor measured 2026-04-27 (post co-host + batched Whisper).
BASELINE_RTF = 209.7

# Test fixtures live next to this script.
BASE = Path(__file__).parent
_test_data = BASE / "test-data"
EPISODES = list((_test_data / "episodes").glob("*.mp3"))
EPISODES.sort()
TRANS_DIR = _test_data / "transcripts"

config = load_config()
device = "cuda" if torch.cuda.is_available() else "cpu"

# Nothing to benchmark without input audio: bail out before printing the banner.
if not EPISODES:
    console.print("[red]No test episodes found in test-data/episodes/ — run Step 4 in BENCH_SETUP.md[/red]")
    raise SystemExit(1)

# Banner: device (with GPU name when on CUDA), baseline, and episode count.
device_label = device
if device == "cuda":
    device_label += f" ({torch.cuda.get_device_name(0)})"
console.print("\n[bold]Computer Guru Show — Diarization Benchmark[/bold]")
console.print(f"Device : {device_label}")
console.print(f"Baseline: {BASELINE_RTX} @ {BASELINE_RTF}x realtime")
console.print(f"Episodes: {len(EPISODES)}\n")

# Voice profiles are handed to diarize() later; the run is aborted if the
# store comes back with no embeddings.
_profiles_dir = config.resolve_path(config.diarization.voice_profiles_dir)
voice_profiles = VoiceProfileStore(_profiles_dir)
if not voice_profiles.embeddings:
    console.print("[red]No voice profiles loaded — copy voice-profiles/ from DESKTOP-0O8A1RL (see BENCH_SETUP.md Step 3)[/red]")
    raise SystemExit(1)
# ── Phase 1: Transcription ─────────────────────────────────────────────────
# For every episode, reuse an existing transcript.json or transcribe the audio.
# Each episode contributes one (episode, transcript_path, audio_seconds,
# wall_seconds) tuple to `trans_results`, which the later phases iterate.
# Cached transcripts are recorded with wall_seconds == 0.0 and are left out of
# the transcription totals.
console.print("[bold]Phase 1: Transcription[/bold]")
trans_results = []
trans_total_audio = 0.0
trans_total_wall = 0.0
import json
from src.transcriber import transcribe as _transcribe

for ep in EPISODES:
    trans_ep_dir = TRANS_DIR / ep.stem
    trans_ep_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = trans_ep_dir / "transcript.json"

    if transcript_path.exists():
        # Decode explicitly as UTF-8 rather than the platform's locale
        # encoding. NOTE(review): assumes transcript.save() writes UTF-8 (or
        # pure-ASCII) JSON — confirm against src/transcriber.py.
        with open(transcript_path, encoding="utf-8") as f:
            td = json.load(f)
        dur = td.get("duration", 0)
        console.print(f" [dim]{ep.stem}: already transcribed ({dur:.0f}s)[/dim]")
        trans_results.append((ep, transcript_path, dur, 0.0))
        continue

    console.print(f" Transcribing {ep.name}...")
    t0 = time.monotonic()
    transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
    wall = time.monotonic() - t0
    # Same zero-guard as the diarization phase uses for its realtime factor.
    rtf = transcript.duration / wall if wall > 0 else 0
    transcript.save(trans_ep_dir)
    console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
    trans_results.append((ep, transcript_path, transcript.duration, wall))
    trans_total_audio += transcript.duration
    trans_total_wall += wall

# Only report a total when at least one episode was actually transcribed.
if trans_total_wall > 0:
    console.print(f" Transcription total: {trans_total_audio:.0f}s audio in {trans_total_wall:.1f}s = {trans_total_audio/trans_total_wall:.1f}x realtime\n")
# ── Phase 2: Diarization ───────────────────────────────────────────────────
# Diarize every episode, save the result next to its transcript, and collect
# one row per episode in `diar_rows` for the summary table. `total_rtf` (audio
# seconds per wall-clock second over all episodes) is compared against the
# baseline at the end of the script.
import json

console.print("[bold]Phase 2: Diarization[/bold]")
diar_rows = []
diar_total_audio = 0.0
diar_total_wall = 0.0
for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem

    # Fallback: Phase 1 reported no duration, so read it from the transcript
    # file. Decoded as UTF-8 rather than the platform's locale encoding.
    # NOTE(review): assumes the transcript JSON is UTF-8/ASCII — confirm.
    if audio_dur == 0:
        with open(transcript_path, encoding="utf-8") as f:
            audio_dur = json.load(f).get("duration", 0)

    t0 = time.monotonic()
    result = diarize(ep, voice_profiles=voice_profiles, host_match_threshold=0.85,
                     transcript_path=transcript_path)
    wall = time.monotonic() - t0
    rtf = audio_dur / wall if wall > 0 else 0
    result.save(trans_ep_dir)

    # Talk time per label; only turns labelled exactly "HOST" / "CALLER" are
    # counted here.
    host_s = sum(t.end - t.start for t in result.turns if t.speaker == "HOST")
    caller_s = sum(t.end - t.start for t in result.turns if t.speaker == "CALLER")

    diar_rows.append({
        "episode": ep.stem,
        "audio_s": audio_dur,
        "wall_s": wall,
        "rtf": rtf,
        "turns": len(result.turns),
        "host_s": host_s,
        "caller_s": caller_s,
    })
    diar_total_audio += audio_dur
    diar_total_wall += wall
    console.print(
        f" {ep.stem}: {len(result.turns)} turns | "
        f"HOST {host_s:.0f}s / CALLER {caller_s:.0f}s "
        f"[{wall:.1f}s wall / {rtf:.1f}x realtime]"
    )

total_rtf = diar_total_audio / diar_total_wall if diar_total_wall > 0 else 0
# ── Phase 2.5: Speaker name resolution from transcript intros ───────────────
# For each episode that has a diarization.json, resolve speaker names from the
# transcript and print a per-episode breakdown. Nothing is written back to
# disk by this block.
console.print("\n[bold]Phase 2.5: Name Resolution[/bold]")
from src.speaker_oracle import resolve_from_files, named_speaker_summary
import json as _json

# Summary labels that count as a resolved (named) speaker.
_RESOLVED_PREFIXES = ("caller:", "guest:", "fillin:")

for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem
    diarization_path = trans_ep_dir / "diarization.json"
    if not diarization_path.exists():
        continue

    # Decoded as UTF-8 rather than the platform's locale encoding.
    # NOTE(review): assumes the transcript JSON is UTF-8/ASCII — confirm.
    with open(transcript_path, encoding="utf-8") as f:
        td = _json.load(f)
    duration = td.get("duration", audio_dur or 0)

    named = resolve_from_files(transcript_path, diarization_path)
    # presumably maps a speaker label to seconds of talk time — verify against
    # src/speaker_oracle.py
    summary = named_speaker_summary(named, duration)

    # Show only resolved names (caller/guest/fillin) — drop HOST/BUMPER/UNKNOWN
    resolved = {k: v for k, v in summary.items()
                if k.startswith(_RESOLVED_PREFIXES)}
    unresolved_caller = summary.get("CALLER", 0) + summary.get("CO-HOST", 0)

    if resolved or unresolved_caller:
        # Take everything after the first colon and trim it, so both
        # "caller:Name" and "caller: Name" work; the filter above only
        # guarantees the colon, not a following space.
        names_str = ", ".join(
            f"{k.partition(':')[2].strip()} ({v:.0f}s)" for k, v in resolved.items()
        )
        # The opening bracket is escaped: Rich would otherwise parse
        # "[unresolved: ...]" as a markup tag and drop it from the output.
        console.print(
            f" {ep.stem}: {len(resolved)} named ({names_str or 'none'})"
            + (f" \\[unresolved: {unresolved_caller:.0f}s]" if unresolved_caller else "")
        )
# ── Phase 3: Q&A extraction ────────────────────────────────────────────────
# Load each episode's transcript together with its diarization, extract Q&A
# pairs, and remember (episode stem, pair count) for the summary table.
console.print("\n[bold]Phase 3: Q&A Extraction[/bold]")
qa_rows = []
for episode, transcript_json, *_rest in trans_results:
    stem = episode.stem
    diarization_json = TRANS_DIR / stem / "diarization.json"
    qa_pairs = extract_qa_pairs(
        load_diarized_transcript(transcript_json, diarization_json)
    )
    pair_count = len(qa_pairs)
    qa_rows.append((stem, pair_count))
    console.print(f" {stem}: {pair_count} Q&A pairs")
# ── Summary ────────────────────────────────────────────────────────────────
# Render the per-episode results table (totals in the footer), then compare
# the overall diarization realtime factor with the baseline machine.
console.print()
table = Table(title="Diarization Benchmark Results", show_footer=True)

# (header, footer) per column; None means the column is added without a footer.
_columns = (
    ("Episode", "TOTAL"),
    ("Audio", f"{diar_total_audio:.0f}s"),
    ("Wall", f"{diar_total_wall:.1f}s"),
    ("RTF", f"[bold]{total_rtf:.1f}x[/bold]"),
    ("Turns", None),
    ("Q&A pairs", None),
)
for _header, _footer in _columns:
    if _footer is None:
        table.add_column(_header)
    else:
        table.add_column(_header, footer=_footer)

# diar_rows and qa_rows were both built by iterating trans_results, so they
# pair up episode by episode.
for diar, (_stem, qa_count) in zip(diar_rows, qa_rows):
    audio_cell = f"{diar['audio_s']:.0f}s"
    wall_cell = f"{diar['wall_s']:.1f}s"
    rtf_cell = f"{diar['rtf']:.1f}x"
    table.add_row(
        diar["episode"],
        audio_cell,
        wall_cell,
        rtf_cell,
        str(diar["turns"]),
        str(qa_count),
    )
console.print(table)

# Absolute and relative difference from the baseline; a "+" is prepended for
# non-negative deltas (negative numbers already carry their own sign).
delta = total_rtf - BASELINE_RTF
delta_pct = delta / BASELINE_RTF * 100
sign = "+" if delta >= 0 else ""
comparison = (
    f"\n[bold]vs {BASELINE_RTX}:[/bold] "
    f"{BASELINE_RTF:.1f}x -> {total_rtf:.1f}x "
    f"({sign}{delta:.1f}x, {sign}{delta_pct:.1f}%)"
)
console.print(comparison)

gpu_name = torch.cuda.get_device_name(0) if device == "cuda" else "CPU"
console.print(f"\nGPU: {gpu_name}")