Files
claudetools/projects/radio-show/audio-processor/benchmark.py
Mike Swanson 488bf5849e radio: attach caller names to Q&A pairs from transcript intros
QAPair gets caller_name and caller_role fields populated by a new
attach_caller_names(pairs, transcript_segments) helper. For each pair,
finds the active opening intro at the question_start time (8s forward
tolerance, no backward limit — a caller's call can run for 10+ minutes
and the intro happens once at the start) and attaches the speaker name.

Validation on 9-episode test set:
  19/19 Q&A pairs (100%) now have caller names attached.

Examples of corrections from oracle attribution:
  2018-s10e18 @ 73:36  Christopher (was misattributed to "Tara")
  2015-s7e19 @ 35:45   William     (was misattributed to "Tara")
  2010-05-08-hr1       Jackie x3, Bruce
  2012-03-10-hr1       Adam x2
  2016-s8e43           John, Doug
  2017-s9e30           Tom, Denise x3, Charlie

speaker_oracle.py: adds speaker_at(time, intros) helper used both by the
existing resolve_speakers() and the new caller-name attachment. Also
adds the "let's fit/bring/put X in/on" intro pattern variant (caught
Charlie at 70:21 in 2017-s9e30 that "talk to X" missed).

download_full_archive.py: SSH keepalive every 30s + per-file retry-on-
failure (up to 3 attempts with reconnect). Earlier run hung on a dead
connection at file 109 of 589 with no recovery; restarted run is now
running at ~10 MB/s vs ~2-3 MB/s before.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 16:55:31 -07:00

227 lines
8.3 KiB
Python

"""
Benchmark: transcribe + diarize + Q&A extraction on the test episodes found in
test-data/episodes/.
Reports per-episode and total realtime factors.
Compare to the DESKTOP-0O8A1RL (RTX 5070 Ti) diarization baseline stored in
BASELINE_RTF below (an earlier measurement was 149.5x realtime).
"""
import os
import sys
import time

# Force UTF-8 on stdout so the non-ASCII characters printed by this script
# do not depend on the console's default encoding.
os.environ["PYTHONIOENCODING"] = "utf-8"
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Set ahead of the project/model imports below; presumably this keeps
# Hugging Face Transformers from reaching the network — confirm.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

from pathlib import Path

from src.gpu import ensure_cuda_libs

# Called before `import torch`; keep this ordering (presumably it makes the
# CUDA libraries loadable — confirm in src/gpu.py before moving it).
ensure_cuda_libs()

import torch

from src.config import load_config
from src.diarizer import diarize, VoiceProfileStore
from src.qa_extractor import load_diarized_transcript, extract_qa_pairs, attach_caller_names
from rich.console import Console
from rich.table import Table

# Shared Rich console used for every line of benchmark output.
console = Console()

# Reference machine and its realtime factor, used in the final comparison.
BASELINE_RTX = "RTX 5070 Ti (DESKTOP-0O8A1RL)"
BASELINE_RTF = 209.7  # realtime factor measured 2026-04-27 (post co-host + batched Whisper)

# Test fixtures live next to this script: episodes are read from
# test-data/episodes/*.mp3, per-episode outputs go under test-data/transcripts/.
BASE = Path(__file__).parent
EPISODES = sorted((BASE / "test-data" / "episodes").glob("*.mp3"))
TRANS_DIR = BASE / "test-data" / "transcripts"

config = load_config()

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Nothing to benchmark without audio: report and stop with exit status 1.
if not EPISODES:
    console.print("[red]No test episodes found in test-data/episodes/ — run Step 4 in BENCH_SETUP.md[/red]")
    raise SystemExit(1)

# Banner: what is being run, on which device, against which baseline.
gpu_suffix = f" ({torch.cuda.get_device_name(0)})" if device == "cuda" else ""
console.print("\n[bold]Computer Guru Show — Diarization Benchmark[/bold]")
console.print(f"Device : {device}" + gpu_suffix)
console.print(f"Baseline: {BASELINE_RTX} @ {BASELINE_RTF}x realtime")
console.print(f"Episodes: {len(EPISODES)}\n")

# Load the stored voice profiles from the configured directory; the
# diarization phase passes this store to diarize().
voice_profiles = VoiceProfileStore(
    config.resolve_path(config.diarization.voice_profiles_dir)
)

# An empty profile store is treated as a setup error rather than benchmarked.
if not voice_profiles.embeddings:
    console.print("[red]No voice profiles loaded — copy voice-profiles/ from DESKTOP-0O8A1RL (see BENCH_SETUP.md Step 3)[/red]")
    raise SystemExit(1)
# ── Phase 1: Transcription ─────────────────────────────────────────────────
# Transcribe every episode with the "large-v3" model unless a transcript.json
# already exists for it. Cached episodes are recorded with a wall time of 0.0
# and are left out of the transcription totals.
console.print("[bold]Phase 1: Transcription[/bold]")
trans_results = []  # (episode path, transcript path, audio seconds, wall seconds)
trans_total_audio = 0.0
trans_total_wall = 0.0
import json
from src.transcriber import transcribe as _transcribe
for ep in EPISODES:
    trans_ep_dir = TRANS_DIR / ep.stem
    trans_ep_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = trans_ep_dir / "transcript.json"

    if transcript_path.exists():
        # Reuse the cached transcript; only its duration is needed here.
        # NOTE(review): open() uses the platform default encoding — confirm it
        # matches whatever transcript.save() writes.
        with open(transcript_path) as f:
            td = json.load(f)
        dur = td.get("duration", 0)
        console.print(f" [dim]{ep.stem}: already transcribed ({dur:.0f}s)[/dim]")
        trans_results.append((ep, transcript_path, dur, 0.0))
        continue

    console.print(f" Transcribing {ep.name}...")
    t0 = time.monotonic()
    transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
    wall = time.monotonic() - t0
    # Guard the division the same way the diarization phase does, so a zero
    # elapsed reading cannot raise ZeroDivisionError.
    rtf = transcript.duration / wall if wall > 0 else 0
    transcript.save(trans_ep_dir)
    console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
    trans_results.append((ep, transcript_path, transcript.duration, wall))
    trans_total_audio += transcript.duration
    trans_total_wall += wall

# Only print a total when at least one episode was actually transcribed.
if trans_total_wall > 0:
    console.print(f" Transcription total: {trans_total_audio:.0f}s audio in {trans_total_wall:.1f}s = {trans_total_audio/trans_total_wall:.1f}x realtime\n")
# ── Phase 2: Diarization ───────────────────────────────────────────────────
# Diarize every episode (always re-run, never cached), time it, and collect
# one row per episode for the summary table. `json` is the module imported
# in Phase 1; it is not re-imported inside the loop.
console.print("[bold]Phase 2: Diarization[/bold]")
diar_rows = []
diar_total_audio = 0.0
diar_total_wall = 0.0
for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem

    # Fall back to the duration stored in the transcript when Phase 1
    # reported none.
    if audio_dur == 0:
        with open(transcript_path) as f:
            audio_dur = json.load(f).get("duration", 0)

    t0 = time.monotonic()
    result = diarize(ep, voice_profiles=voice_profiles, host_match_threshold=0.85,
                     transcript_path=transcript_path)
    wall = time.monotonic() - t0
    rtf = audio_dur / wall if wall > 0 else 0
    result.save(trans_ep_dir)

    # Total speaking time attributed to the HOST and CALLER labels.
    host_s = sum(t.end - t.start for t in result.turns if t.speaker == "HOST")
    caller_s = sum(t.end - t.start for t in result.turns if t.speaker == "CALLER")

    diar_rows.append({
        "episode": ep.stem,
        "audio_s": audio_dur,
        "wall_s": wall,
        "rtf": rtf,
        "turns": len(result.turns),
        "host_s": host_s,
        "caller_s": caller_s,
    })
    diar_total_audio += audio_dur
    diar_total_wall += wall
    console.print(
        f" {ep.stem}: {len(result.turns)} turns | "
        f"HOST {host_s:.0f}s / CALLER {caller_s:.0f}s "
        f"[{wall:.1f}s wall / {rtf:.1f}x realtime]"
    )

# Overall realtime factor across all episodes; 0 when nothing was timed.
total_rtf = diar_total_audio / diar_total_wall if diar_total_wall > 0 else 0
# ── Phase 2.5: Speaker name resolution from transcript intros ───────────────
# For each episode with a diarization.json, resolve speaker names and print
# how many seconds each named speaker talks, plus any time still labelled
# CALLER or CO-HOST.
console.print("\n[bold]Phase 2.5: Name Resolution[/bold]")
from src.speaker_oracle import resolve_from_files, named_speaker_summary
import json as _json
for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem
    diarization_path = trans_ep_dir / "diarization.json"
    if not diarization_path.exists():
        continue

    with open(transcript_path) as f:
        td = _json.load(f)
    duration = td.get("duration", audio_dur or 0)

    named = resolve_from_files(transcript_path, diarization_path)
    summary = named_speaker_summary(named, duration)

    # Show only resolved names (caller/guest/fillin) — drop HOST/BUMPER/UNKNOWN
    resolved = {k: v for k, v in summary.items()
                if k.startswith(("caller:", "guest:", "fillin:"))}
    unresolved_caller = summary.get("CALLER", 0) + summary.get("CO-HOST", 0)

    if resolved or unresolved_caller:
        # Split on the first ":" only and strip, so the display name is
        # extracted whether or not the key has a space after the colon
        # (splitting on ": " raised IndexError for keys without the space).
        names_str = ", ".join(
            f"{k.split(':', 1)[1].strip()} ({v:.0f}s)" for k, v in resolved.items()
        )
        # The opening bracket is backslash-escaped: Rich would otherwise parse
        # "[unresolved: ...]" as a markup tag and drop it from the output.
        console.print(
            f" {ep.stem}: {len(resolved)} named ({names_str or 'none'})"
            + (f" \\[unresolved: {unresolved_caller:.0f}s]" if unresolved_caller else "")
        )
# ── Phase 3: Q&A extraction ────────────────────────────────────────────────
# Build Q&A pairs from each diarized transcript, attach caller names, and
# record the pair count per episode for the summary table.
console.print("\n[bold]Phase 3: Q&A Extraction[/bold]")
qa_rows = []
for episode, transcript_file, _audio_s, _wall_s in trans_results:
    episode_dir = TRANS_DIR / episode.stem
    diarized_segments = load_diarized_transcript(
        transcript_file, episode_dir / "diarization.json"
    )
    pairs = extract_qa_pairs(diarized_segments)

    # Attach caller names from transcript intros
    with open(transcript_file) as fh:
        transcript_json = _json.load(fh)
    attach_caller_names(pairs, transcript_json.get("segments", []))

    # Names of the pairs that ended up with a caller attached, in pair order.
    caller_names = [p.caller_name for p in pairs if p.caller_name]
    joined_names = ", ".join(caller_names)

    qa_rows.append((episode.stem, len(pairs)))
    console.print(
        f" {episode.stem}: {len(pairs)} pairs "
        f"({len(caller_names)} named: {joined_names})"
    )
# ── Summary ────────────────────────────────────────────────────────────────
# Render one table row per episode (diarization timings plus Q&A pair count),
# then compare the overall realtime factor against the recorded baseline.
console.print()
table = Table(title="Diarization Benchmark Results", show_footer=True)

# Columns that carry a footer cell with the run totals.
totals_columns = [
    ("Episode", "TOTAL"),
    ("Audio", f"{diar_total_audio:.0f}s"),
    ("Wall", f"{diar_total_wall:.1f}s"),
    ("RTF", f"[bold]{total_rtf:.1f}x[/bold]"),
]
for header, footer_text in totals_columns:
    table.add_column(header, footer=footer_text)
# Columns without a footer.
for header in ("Turns", "Q&A pairs"):
    table.add_column(header)

# diar_rows and qa_rows are paired positionally.
for diar, (_stem, qa_count) in zip(diar_rows, qa_rows):
    cells = [
        diar["episode"],
        f"{diar['audio_s']:.0f}s",
        f"{diar['wall_s']:.1f}s",
        f"{diar['rtf']:.1f}x",
        str(diar["turns"]),
        str(qa_count),
    ]
    table.add_row(*cells)
console.print(table)

# Difference from the baseline, absolute and as a percentage. Non-negative
# values get an explicit "+"; negative values already carry their own "-".
delta = total_rtf - BASELINE_RTF
if delta >= 0:
    sign = "+"
else:
    sign = ""
delta_pct = delta / BASELINE_RTF * 100
console.print(
    f"\n[bold]vs {BASELINE_RTX}:[/bold] "
    f"{BASELINE_RTF:.1f}x -> {total_rtf:.1f}x "
    f"({sign}{delta:.1f}x, {sign}{delta_pct:.1f}%)"
)

gpu_label = torch.cuda.get_device_name(0) if device == "cuda" else "CPU"
console.print(f"\nGPU: {gpu_label}")