QAPair gets caller_name and caller_role fields populated by a new attach_caller_names(pairs, transcript_segments) helper. For each pair, finds the active opening intro at the question_start time (8s forward tolerance, no backward limit — a caller's call can run for 10+ minutes and the intro happens once at the start) and attaches the speaker name. Validation on 9-episode test set: 19/19 Q&A pairs (100%) now have caller names attached. Examples of corrections from oracle attribution: 2018-s10e18 @ 73:36 Christopher (was misattributed to "Tara") 2015-s7e19 @ 35:45 William (was misattributed to "Tara") 2010-05-08-hr1 Jackie x3, Bruce 2012-03-10-hr1 Adam x2 2016-s8e43 John, Doug 2017-s9e30 Tom, Denise x3, Charlie speaker_oracle.py: adds speaker_at(time, intros) helper used both by the existing resolve_speakers() and the new caller-name attachment. Also adds the "let's fit/bring/put X in/on" intro pattern variant (caught Charlie at 70:21 in 2017-s9e30 that "talk to X" missed). download_full_archive.py: SSH keepalive every 30s + per-file retry-on-failure (up to 3 attempts with reconnect). Earlier run hung on a dead connection at file 109 of 589 with no recovery; restarted run is now running at ~10 MB/s vs ~2-3 MB/s before. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
227 lines
8.3 KiB
Python
227 lines
8.3 KiB
Python
"""
Benchmark: transcribe + diarize + Q&A extraction on the local test episodes.

Runs every ``*.mp3`` found under ``test-data/episodes/`` through the pipeline
and reports per-episode and total realtime factors. The diarization total is
compared against the reference machine recorded in ``BASELINE_RTX`` /
``BASELINE_RTF`` below (those constants are the single source of truth for the
baseline figure).
"""
import os
import sys
import time

# NOTE(review): PYTHONIOENCODING is only consulted at interpreter startup, so
# this assignment can only influence child processes; the current stdout is
# switched to UTF-8 explicitly by the reconfigure() call below.
os.environ["PYTHONIOENCODING"] = "utf-8"
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
# Presumably has to be set before the src.* imports load transformers — confirm.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

from pathlib import Path
from src.gpu import ensure_cuda_libs

# Deliberately called before `import torch`; keep this ordering.
ensure_cuda_libs()

import torch
from src.config import load_config
from src.diarizer import diarize, VoiceProfileStore
from src.qa_extractor import load_diarized_transcript, extract_qa_pairs, attach_caller_names
from rich.console import Console
from rich.table import Table

console = Console()

BASELINE_RTX = "RTX 5070 Ti (DESKTOP-0O8A1RL)"
BASELINE_RTF = 209.7  # realtime factor measured 2026-04-27 (post co-host + batched Whisper)

# Test fixtures live next to this script.
BASE = Path(__file__).parent
EPISODES = sorted((BASE / "test-data" / "episodes").glob("*.mp3"))
TRANS_DIR = BASE / "test-data" / "transcripts"

config = load_config()
device = "cuda" if torch.cuda.is_available() else "cpu"

# Nothing to benchmark without audio: bail out with setup instructions.
if not EPISODES:
    console.print("[red]No test episodes found in test-data/episodes/ — run Step 4 in BENCH_SETUP.md[/red]")
    sys.exit(1)

console.print("\n[bold]Computer Guru Show — Diarization Benchmark[/bold]")
console.print(f"Device : {device}" + (f" ({torch.cuda.get_device_name(0)})" if device == "cuda" else ""))
console.print(f"Baseline: {BASELINE_RTX} @ {BASELINE_RTF}x realtime")
console.print(f"Episodes: {len(EPISODES)}\n")

voice_profiles = VoiceProfileStore(
    config.resolve_path(config.diarization.voice_profiles_dir)
)
# The diarization phase is handed this store; abort early if it is empty.
if not voice_profiles.embeddings:
    console.print("[red]No voice profiles loaded — copy voice-profiles/ from DESKTOP-0O8A1RL (see BENCH_SETUP.md Step 3)[/red]")
    sys.exit(1)
# ── Phase 1: Transcription ─────────────────────────────────────────────────
# Transcribe each episode that has no cached transcript.json yet and time the
# fresh runs. Episodes with a cached transcript are reused and contribute no
# wall time to the totals.

console.print("[bold]Phase 1: Transcription[/bold]")

# One (episode_path, transcript_path, audio_seconds, wall_seconds) tuple per
# episode; the later phases iterate over this list.
trans_results = []
trans_total_audio = 0.0
trans_total_wall = 0.0

import json
from src.transcriber import transcribe as _transcribe

for ep in EPISODES:
    trans_ep_dir = TRANS_DIR / ep.stem
    trans_ep_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = trans_ep_dir / "transcript.json"

    if transcript_path.exists():
        # Explicit UTF-8: without it open() falls back to the locale codec,
        # which is not UTF-8 on a default Windows install.
        with open(transcript_path, encoding="utf-8") as f:
            td = json.load(f)
        dur = td.get("duration", 0)
        console.print(f" [dim]{ep.stem}: already transcribed ({dur:.0f}s)[/dim]")
        # Wall time 0.0 marks "not measured in this run".
        trans_results.append((ep, transcript_path, dur, 0.0))
        continue

    console.print(f" Transcribing {ep.name}...")
    t0 = time.monotonic()

    transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
    wall = time.monotonic() - t0
    # Guarded the same way as the diarization phase so a zero-length timing
    # can never raise ZeroDivisionError.
    rtf = transcript.duration / wall if wall > 0 else 0

    # Presumably writes transcript.json into trans_ep_dir — confirm in src.transcriber.
    transcript.save(trans_ep_dir)

    console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
    trans_results.append((ep, transcript_path, transcript.duration, wall))
    trans_total_audio += transcript.duration
    trans_total_wall += wall

# Only print a total when at least one episode was actually transcribed.
if trans_total_wall > 0:
    console.print(f" Transcription total: {trans_total_audio:.0f}s audio in {trans_total_wall:.1f}s = {trans_total_audio/trans_total_wall:.1f}x realtime\n")
# ── Phase 2: Diarization ───────────────────────────────────────────────────
# Diarize every episode, time it, and collect per-episode rows for the summary
# table. This is the phase whose total realtime factor is compared against the
# baseline.

console.print("[bold]Phase 2: Diarization[/bold]")

# Imported once here instead of on every loop iteration.
import json

diar_rows = []
diar_total_audio = 0.0
diar_total_wall = 0.0

for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem

    if audio_dur == 0:
        # The recorded duration is zero; fall back to the transcript's own
        # "duration" field. Explicit UTF-8 avoids the locale-dependent default.
        with open(transcript_path, encoding="utf-8") as f:
            audio_dur = json.load(f).get("duration", 0)

    t0 = time.monotonic()
    result = diarize(
        ep,
        voice_profiles=voice_profiles,
        host_match_threshold=0.85,
        transcript_path=transcript_path,
    )
    wall = time.monotonic() - t0
    rtf = audio_dur / wall if wall > 0 else 0

    # Later phases read diarization.json from this directory; presumably
    # save() is what writes it — confirm in src.diarizer.
    result.save(trans_ep_dir)

    # Total speaking time per label, summed over the diarized turns.
    host_s = sum(t.end - t.start for t in result.turns if t.speaker == "HOST")
    caller_s = sum(t.end - t.start for t in result.turns if t.speaker == "CALLER")

    diar_rows.append({
        "episode": ep.stem,
        "audio_s": audio_dur,
        "wall_s": wall,
        "rtf": rtf,
        "turns": len(result.turns),
        "host_s": host_s,
        "caller_s": caller_s,
    })
    diar_total_audio += audio_dur
    diar_total_wall += wall

    console.print(
        f" {ep.stem}: {len(result.turns)} turns | "
        f"HOST {host_s:.0f}s / CALLER {caller_s:.0f}s "
        f"[{wall:.1f}s wall / {rtf:.1f}x realtime]"
    )

# Aggregate realtime factor across all episodes (0 when nothing was timed).
total_rtf = diar_total_audio / diar_total_wall if diar_total_wall > 0 else 0
# ── Phase 2.5: Speaker name resolution from transcript intros ───────────────
# For each episode that has a diarization.json, resolve speaker names and print
# how many seconds each named speaker accounts for.

console.print("\n[bold]Phase 2.5: Name Resolution[/bold]")

from src.speaker_oracle import resolve_from_files, named_speaker_summary
import json as _json

for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem
    diarization_path = trans_ep_dir / "diarization.json"
    if not diarization_path.exists():
        continue

    # Explicit UTF-8 avoids the locale-dependent default codec.
    with open(transcript_path, encoding="utf-8") as f:
        td = _json.load(f)
    duration = td.get("duration", audio_dur or 0)

    named = resolve_from_files(transcript_path, diarization_path)
    summary = named_speaker_summary(named, duration)

    # Show only resolved names (caller/guest/fillin) — drop HOST/BUMPER/UNKNOWN
    resolved = {k: v for k, v in summary.items()
                if k.startswith(("caller:", "guest:", "fillin:"))}
    unresolved_caller = summary.get("CALLER", 0) + summary.get("CO-HOST", 0)

    if resolved or unresolved_caller:
        # Take everything after the first colon and trim it. The filter above
        # accepts keys with or without a space after the colon, so splitting
        # on ": " could raise IndexError for a key such as "caller:Name".
        labels = []
        for key, seconds in resolved.items():
            display_name = key.partition(":")[2].strip()
            labels.append(f"{display_name} ({seconds:.0f}s)")
        names_str = ", ".join(labels)
        console.print(
            f" {ep.stem}: {len(resolved)} named ({names_str or 'none'})"
            + (f" [unresolved: {unresolved_caller:.0f}s]" if unresolved_caller else "")
        )
# ── Phase 3: Q&A extraction ────────────────────────────────────────────────
# Extract Q&A pairs from each diarized transcript, attach caller names, and
# record the pair count per episode for the summary table.

console.print("\n[bold]Phase 3: Q&A Extraction[/bold]")

# Imported here so this phase does not rely on an alias created by an earlier
# phase of the script.
import json as _json

# One (episode_stem, pair_count) tuple per episode, in trans_results order.
qa_rows = []
for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem
    diarization_path = trans_ep_dir / "diarization.json"
    segments = load_diarized_transcript(transcript_path, diarization_path)
    pairs = extract_qa_pairs(segments)

    # Attach caller names from transcript intros. The return value is unused
    # and caller_name is read from the pairs afterwards, so this presumably
    # updates them in place — confirm in src.qa_extractor.
    # Explicit UTF-8 avoids the locale-dependent default codec.
    with open(transcript_path, encoding="utf-8") as f:
        td = _json.load(f)
    attach_caller_names(pairs, td.get("segments", []))

    # Collect the attached names once; both the count and the list come from it.
    caller_names = [p.caller_name for p in pairs if p.caller_name]
    named = len(caller_names)
    name_str = ", ".join(caller_names) or "—"
    qa_rows.append((ep.stem, len(pairs)))
    console.print(f" {ep.stem}: {len(pairs)} pairs ({named} named: {name_str})")
# ── Summary ────────────────────────────────────────────────────────────────
# Render the per-episode results table, then the comparison against the
# recorded baseline and the device the run used.

console.print()

table = Table(title="Diarization Benchmark Results", show_footer=True)

# (header, footer) per column; None means the column gets no footer argument.
column_specs = (
    ("Episode", "TOTAL"),
    ("Audio", f"{diar_total_audio:.0f}s"),
    ("Wall", f"{diar_total_wall:.1f}s"),
    ("RTF", f"[bold]{total_rtf:.1f}x[/bold]"),
    ("Turns", None),
    ("Q&A pairs", None),
)
for header, footer_text in column_specs:
    if footer_text is None:
        table.add_column(header)
    else:
        table.add_column(header, footer=footer_text)

# Diarization rows and Q&A rows are paired positionally.
for diar, (_stem, pair_count) in zip(diar_rows, qa_rows):
    cells = (
        diar["episode"],
        f"{diar['audio_s']:.0f}s",
        f"{diar['wall_s']:.1f}s",
        f"{diar['rtf']:.1f}x",
        str(diar["turns"]),
        str(pair_count),
    )
    table.add_row(*cells)

console.print(table)

# Difference from the baseline; a negative number already carries its own
# minus sign, so only non-negative values get an explicit "+".
rtf_delta = total_rtf - BASELINE_RTF
plus = "" if rtf_delta < 0 else "+"
pct_delta = rtf_delta / BASELINE_RTF * 100
comparison = (
    f"\n[bold]vs {BASELINE_RTX}:[/bold] "
    f"{BASELINE_RTF:.1f}x -> {total_rtf:.1f}x "
    f"({plus}{rtf_delta:.1f}x, {plus}{pct_delta:.1f}%)"
)
console.print(comparison)

gpu_label = torch.cuda.get_device_name(0) if device == "cuda" else "CPU"
console.print(f"\nGPU: {gpu_label}")