New module src/speaker_oracle.py extracts speaker introductions from
transcripts ("let's talk to William", "we have Clay from the Nerd Junkies",
"in Tara's place, we have Clay", "thanks for the call <name>") and binds
them to non-HOST diarization turns. Pure post-pass on diarization JSONs,
no audio processing — corrects audio-only cosine errors using Mike's
deterministic on-air announcements.
Algorithm:
- Extract intros: regex patterns for caller pickups, guest intros,
fill-in announcements, caller closes. Case-strict (rejects mid-sentence
lowercase matches), with a blacklist of common false-positive words.
Deduplicates same-name intros within 5s.
- Resolve speakers: for each non-HOST turn, find the LATEST opening intro
at or before turn.start (with 8s forward tolerance for boundary slop).
Later intros implicitly close earlier callers, so the most recent
intro wins. No artificial lookback limit (callers can talk for 10+ min).
- Falls back to caller_close patterns within 30s after a turn ends.
Validation on 9-episode test set:
2018-s10e18: Christopher 190s correctly named (was mislabeled "Tara")
2012-06-09 : Kay 160s correctly named (was mislabeled "Tara")
2015-s7e19 : Clay 45s as fillin for Tara, William 40s as caller
2016-s8e43 : Charles 630s, Bruce 210s, John 205s — most callers named
2017-s9e30 : Denise 295s, Tom 115s, Elaine 85s, Jeff 10s
Many other callers across all episodes correctly named.
The remaining unnamed CO-HOST/CALLER turns (~5-10% of non-HOST time) are
genuine co-host banter or callers whom Mike never explicitly introduces.
benchmark.py: adds Phase 2.5 "Name Resolution" between diarization and
Q&A extraction. Prints named-speaker breakdown per episode. Doesn't
modify diarization JSONs (resolution is computed on demand).
Next step: feed named turns into qa_extractor so Q&A pairs get caller
name attached for searchability. Also: bootstrap recurring-speaker
profiles (Tara, Tony, Rob, Randall, producers) by accumulating
intro-tagged windows across the full archive once download completes.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
219 lines
8.0 KiB
Python
219 lines
8.0 KiB
Python
"""
|
|
Benchmark: transcribe + diarize + speaker-name resolution + Q&A extraction
on every test episode found in test-data/episodes/.

Reports per-episode and total realtime factors.

Compare to the DESKTOP-0O8A1RL (RTX 5070 Ti) diarization baseline recorded
in BASELINE_RTX / BASELINE_RTF below.
"""

# NOTE: the import order in this section is deliberate. Environment variables
# are set, and ensure_cuda_libs() is called, *before* torch and the project
# modules are imported. Do not regroup these imports alphabetically.
import os
import sys
import time

# Switch stdout to UTF-8 when the stream supports it; the environment variable
# is exported as well (presumably for child processes -- confirm).
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
os.environ.update(
    PYTHONIOENCODING="utf-8",
    TRANSFORMERS_OFFLINE="1",  # presumably keeps model loading off the network -- confirm
)

from pathlib import Path
from src.gpu import ensure_cuda_libs

# Runs before `import torch` in the original script; keep it that way.
ensure_cuda_libs()

import torch
from src.config import load_config
from src.diarizer import diarize, VoiceProfileStore
from src.qa_extractor import load_diarized_transcript, extract_qa_pairs
from rich.console import Console
from rich.table import Table

# Shared Rich console used by every phase below.
console = Console()

# Reference machine and the throughput it achieved; used for the final
# comparison line of the report.
BASELINE_RTX = "RTX 5070 Ti (DESKTOP-0O8A1RL)"
BASELINE_RTF = 209.7  # realtime factor measured 2026-04-27 (post co-host + batched Whisper)

# Test fixtures live in test-data/ next to this script.
BASE = Path(__file__).parent
_TEST_DATA = BASE / "test-data"
EPISODES = sorted((_TEST_DATA / "episodes").glob("*.mp3"))
TRANS_DIR = _TEST_DATA / "transcripts"

config = load_config()
device = "cuda" if torch.cuda.is_available() else "cpu"

# Preflight: nothing to benchmark without at least one episode.
if not EPISODES:
    console.print("[red]No test episodes found in test-data/episodes/ — run Step 4 in BENCH_SETUP.md[/red]")
    sys.exit(1)

# Banner describing the run.
console.print("\n[bold]Computer Guru Show — Diarization Benchmark[/bold]")
_device_suffix = f" ({torch.cuda.get_device_name(0)})" if device == "cuda" else ""
console.print(f"Device : {device}{_device_suffix}")
console.print(f"Baseline: {BASELINE_RTX} @ {BASELINE_RTF}x realtime")
console.print(f"Episodes: {len(EPISODES)}\n")

# Preflight: the diarizer is given these voice profiles, so an empty store
# aborts the run.
_profiles_dir = config.resolve_path(config.diarization.voice_profiles_dir)
voice_profiles = VoiceProfileStore(_profiles_dir)
if not voice_profiles.embeddings:
    console.print("[red]No voice profiles loaded — copy voice-profiles/ from DESKTOP-0O8A1RL (see BENCH_SETUP.md Step 3)[/red]")
    sys.exit(1)

# ── Phase 1: Transcription ─────────────────────────────────────────────────
#
# Transcribe each episode unless <TRANS_DIR>/<episode stem>/transcript.json
# already exists. Cached episodes are listed with 0.0 wall time and are left
# out of the transcription totals, so the reported realtime factor covers only
# work actually done in this run.

console.print("[bold]Phase 1: Transcription[/bold]")

# One (episode_path, transcript_path, audio_seconds, wall_seconds) tuple per
# episode; consumed by every later phase.
trans_results = []
trans_total_audio = 0.0
trans_total_wall = 0.0

import json
from src.transcriber import transcribe as _transcribe

for ep in EPISODES:
    trans_ep_dir = TRANS_DIR / ep.stem
    trans_ep_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = trans_ep_dir / "transcript.json"

    if transcript_path.exists():
        # Read as UTF-8 explicitly: the locale default (e.g. cp1252 on Windows)
        # can fail on non-ASCII transcript text.
        # NOTE(review): assumes transcript.save() writes UTF-8/ASCII JSON -- confirm.
        with open(transcript_path, encoding="utf-8") as f:
            td = json.load(f)
        dur = td.get("duration", 0)
        console.print(f" [dim]{ep.stem}: already transcribed ({dur:.0f}s)[/dim]")
        trans_results.append((ep, transcript_path, dur, 0.0))
        continue

    console.print(f" Transcribing {ep.name}...")
    t0 = time.monotonic()

    transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
    wall = time.monotonic() - t0
    # Same zero-wall guard as the diarization phase uses.
    rtf = transcript.duration / wall if wall > 0 else 0

    transcript.save(trans_ep_dir)

    console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
    trans_results.append((ep, transcript_path, transcript.duration, wall))
    trans_total_audio += transcript.duration
    trans_total_wall += wall

# Only print a total when at least one episode was transcribed in this run.
if trans_total_wall > 0:
    console.print(f" Transcription total: {trans_total_audio:.0f}s audio in {trans_total_wall:.1f}s = {trans_total_audio/trans_total_wall:.1f}x realtime\n")

# ── Phase 2: Diarization ───────────────────────────────────────────────────
#
# Diarize every episode (always re-run; this is the timed part of the
# benchmark), save the result next to the transcript, and collect one row per
# episode for the summary table.

console.print("[bold]Phase 2: Diarization[/bold]")

diar_rows = []
diar_total_audio = 0.0
diar_total_wall = 0.0

for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem

    if audio_dur == 0:
        # Phase 1 reported no duration; fall back to whatever transcript.json
        # records. `json` is already imported at module level by Phase 1.
        with open(transcript_path, encoding="utf-8") as f:
            audio_dur = json.load(f).get("duration", 0)

    t0 = time.monotonic()
    result = diarize(
        ep,
        voice_profiles=voice_profiles,
        host_match_threshold=0.85,
        transcript_path=transcript_path,
    )
    wall = time.monotonic() - t0
    rtf = audio_dur / wall if wall > 0 else 0

    result.save(trans_ep_dir)

    # Talk time per label; only turns labelled exactly HOST / CALLER count.
    host_s = sum(t.end - t.start for t in result.turns if t.speaker == "HOST")
    caller_s = sum(t.end - t.start for t in result.turns if t.speaker == "CALLER")

    diar_rows.append({
        "episode": ep.stem,
        "audio_s": audio_dur,
        "wall_s": wall,
        "rtf": rtf,
        "turns": len(result.turns),
        "host_s": host_s,
        "caller_s": caller_s,
    })
    diar_total_audio += audio_dur
    diar_total_wall += wall

    # The bracketed timing starts with a digit, so Rich does not read it as a
    # markup tag.
    console.print(
        f" {ep.stem}: {len(result.turns)} turns | "
        f"HOST {host_s:.0f}s / CALLER {caller_s:.0f}s "
        f"[{wall:.1f}s wall / {rtf:.1f}x realtime]"
    )

# Aggregate realtime factor across all episodes (0 if nothing was timed).
total_rtf = diar_total_audio / diar_total_wall if diar_total_wall > 0 else 0

# ── Phase 2.5: Speaker name resolution from transcript intros ───────────────
#
# For each episode that has a diarization.json, resolve speaker names from the
# transcript and print how much time each named speaker received. Nothing is
# written back to disk in this phase.

console.print("\n[bold]Phase 2.5: Name Resolution[/bold]")

from src.speaker_oracle import resolve_from_files, named_speaker_summary
import json as _json

for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem
    diarization_path = trans_ep_dir / "diarization.json"
    if not diarization_path.exists():
        continue

    # Explicit UTF-8: the locale default encoding can fail on non-ASCII text.
    # NOTE(review): assumes the transcript JSON is written as UTF-8/ASCII -- confirm.
    with open(transcript_path, encoding="utf-8") as f:
        td = _json.load(f)
    duration = td.get("duration", audio_dur or 0)

    named = resolve_from_files(transcript_path, diarization_path)
    summary = named_speaker_summary(named, duration)

    # Show only resolved names (caller/guest/fillin) — drop HOST/BUMPER/UNKNOWN
    resolved = {
        k: v for k, v in summary.items()
        if k.startswith(("caller:", "guest:", "fillin:"))
    }
    unresolved_caller = summary.get("CALLER", 0) + summary.get("CO-HOST", 0)

    if resolved or unresolved_caller:
        # Take the name after the first colon. The filter above matches on
        # "role:" without a trailing space, so splitting on ": " could raise
        # IndexError; partition + strip handles both "role:Name" and
        # "role: Name".
        names_str = ", ".join(
            f"{k.partition(':')[2].strip()} ({v:.0f}s)" for k, v in resolved.items()
        )
        # The opening bracket is backslash-escaped: Rich would otherwise parse
        # "[unresolved: …]" as a markup tag and drop it from the output.
        unresolved_note = (
            f" \\[unresolved: {unresolved_caller:.0f}s]" if unresolved_caller else ""
        )
        console.print(
            f" {ep.stem}: {len(resolved)} named ({names_str or 'none'})"
            + unresolved_note
        )

# ── Phase 3: Q&A extraction ────────────────────────────────────────────────
#
# Combine each transcript with its diarization, extract Q&A pairs, and record
# the pair count per episode for the summary table.

console.print("\n[bold]Phase 3: Q&A Extraction[/bold]")

# (episode stem, number of Q&A pairs), in the same order as trans_results.
qa_rows = []
for ep, transcript_path, *_ in trans_results:
    diarization_path = TRANS_DIR / ep.stem / "diarization.json"
    diarized_segments = load_diarized_transcript(transcript_path, diarization_path)
    pair_count = len(extract_qa_pairs(diarized_segments))
    qa_rows.append((ep.stem, pair_count))
    console.print(f" {ep.stem}: {pair_count} Q&A pairs")

# ── Summary ────────────────────────────────────────────────────────────────
#
# Render the per-episode results table (with totals in the footer), then the
# comparison against the baseline machine and the device used for this run.

console.print()
table = Table(title="Diarization Benchmark Results", show_footer=True)

# (header, footer) per column; an empty footer leaves that footer cell blank.
_columns = [
    ("Episode", "TOTAL"),
    ("Audio", f"{diar_total_audio:.0f}s"),
    ("Wall", f"{diar_total_wall:.1f}s"),
    ("RTF", f"[bold]{total_rtf:.1f}x[/bold]"),
    ("Turns", ""),
    ("Q&A pairs", ""),
]
for _header, _footer in _columns:
    table.add_column(_header, footer=_footer)

# diar_rows and qa_rows were both built by iterating trans_results, so they
# pair up positionally.
for diar_row, (_stem, qa_count) in zip(diar_rows, qa_rows):
    table.add_row(
        diar_row["episode"],
        f"{diar_row['audio_s']:.0f}s",
        f"{diar_row['wall_s']:.1f}s",
        f"{diar_row['rtf']:.1f}x",
        str(diar_row["turns"]),
        str(qa_count),
    )

console.print(table)

# Difference to the baseline, absolute and relative; negative numbers already
# carry their own sign, so only non-negative ones get an explicit "+".
delta = total_rtf - BASELINE_RTF
sign = "+" if delta >= 0 else ""
delta_pct = delta / BASELINE_RTF * 100
console.print(
    f"\n[bold]vs {BASELINE_RTX}:[/bold] "
    f"{BASELINE_RTF:.1f}x -> {total_rtf:.1f}x "
    f"({sign}{delta:.1f}x, {sign}{delta_pct:.1f}%)"
)

gpu_label = torch.cuda.get_device_name(0) if device == "cuda" else "CPU"
console.print(f"\nGPU: {gpu_label}")