""" Benchmark: transcribe + diarize + Q&A extraction on the 6 test episodes. Reports per-episode and total realtime factors. Compare to DESKTOP-0O8A1RL (RTX 5070 Ti) baseline: 149.5x realtime for diarization. """ import sys, os, time os.environ["PYTHONIOENCODING"] = "utf-8" if hasattr(sys.stdout, "reconfigure"): sys.stdout.reconfigure(encoding="utf-8") os.environ["TRANSFORMERS_OFFLINE"] = "1" from pathlib import Path from src.gpu import ensure_cuda_libs ensure_cuda_libs() import torch from src.config import load_config from src.diarizer import diarize, VoiceProfileStore from src.qa_extractor import load_diarized_transcript, extract_qa_pairs from rich.console import Console from rich.table import Table console = Console() BASELINE_RTX = "RTX 5070 Ti (DESKTOP-0O8A1RL)" BASELINE_RTF = 149.5 # realtime factor measured 2026-04-27 BASE = Path(__file__).parent EPISODES = sorted((BASE / "test-data" / "episodes").glob("*.mp3")) TRANS_DIR = BASE / "test-data" / "transcripts" config = load_config() device = "cuda" if torch.cuda.is_available() else "cpu" if not EPISODES: console.print("[red]No test episodes found in test-data/episodes/ — run Step 4 in BENCH_SETUP.md[/red]") sys.exit(1) console.print(f"\n[bold]Computer Guru Show — Diarization Benchmark[/bold]") console.print(f"Device : {device}" + (f" ({torch.cuda.get_device_name(0)})" if device == "cuda" else "")) console.print(f"Baseline: {BASELINE_RTX} @ {BASELINE_RTF}x realtime") console.print(f"Episodes: {len(EPISODES)}\n") voice_profiles = VoiceProfileStore( config.resolve_path(config.diarization.voice_profiles_dir) ) if not voice_profiles.embeddings: console.print("[red]No voice profiles loaded — copy voice-profiles/ from DESKTOP-0O8A1RL (see BENCH_SETUP.md Step 3)[/red]") sys.exit(1) # ── Phase 1: Transcription ───────────────────────────────────────────────── console.print("[bold]Phase 1: Transcription[/bold]") trans_results = [] trans_total_audio = 0.0 trans_total_wall = 0.0 import json from src.transcriber import transcribe as _transcribe for ep in EPISODES: trans_ep_dir = TRANS_DIR / ep.stem trans_ep_dir.mkdir(parents=True, exist_ok=True) transcript_path = trans_ep_dir / "transcript.json" if transcript_path.exists(): with open(transcript_path) as f: td = json.load(f) dur = td.get("duration", 0) console.print(f" [dim]{ep.stem}: already transcribed ({dur:.0f}s)[/dim]") trans_results.append((ep, transcript_path, dur, 0.0)) continue console.print(f" Transcribing {ep.name}...") t0 = time.monotonic() transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16) wall = time.monotonic() - t0 rtf = transcript.duration / wall transcript.save(trans_ep_dir) console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]") trans_results.append((ep, transcript_path, transcript.duration, wall)) trans_total_audio += transcript.duration trans_total_wall += wall if trans_total_wall > 0: console.print(f" Transcription total: {trans_total_audio:.0f}s audio in {trans_total_wall:.1f}s = {trans_total_audio/trans_total_wall:.1f}x realtime\n") # ── Phase 2: Diarization ─────────────────────────────────────────────────── console.print("[bold]Phase 2: Diarization[/bold]") diar_rows = [] diar_total_audio = 0.0 diar_total_wall = 0.0 for ep, transcript_path, audio_dur, _ in trans_results: trans_ep_dir = TRANS_DIR / ep.stem diarization_path = trans_ep_dir / "diarization.json" if audio_dur == 0: import json with open(transcript_path) as f: audio_dur = json.load(f).get("duration", 0) t0 = time.monotonic() result = diarize(ep, voice_profiles=voice_profiles, host_match_threshold=0.85) wall = time.monotonic() - t0 rtf = audio_dur / wall if wall > 0 else 0 result.save(trans_ep_dir) host_s = sum(t.end - t.start for t in result.turns if t.speaker == "HOST") caller_s = sum(t.end - t.start for t in result.turns if t.speaker == "CALLER") diar_rows.append({ "episode": ep.stem, "audio_s": audio_dur, "wall_s": wall, "rtf": rtf, "turns": len(result.turns), "host_s": host_s, "caller_s": caller_s, }) diar_total_audio += audio_dur diar_total_wall += wall console.print( f" {ep.stem}: {len(result.turns)} turns | " f"HOST {host_s:.0f}s / CALLER {caller_s:.0f}s " f"[{wall:.1f}s wall / {rtf:.1f}x realtime]" ) total_rtf = diar_total_audio / diar_total_wall if diar_total_wall > 0 else 0 # ── Phase 3: Q&A extraction ──────────────────────────────────────────────── console.print("\n[bold]Phase 3: Q&A Extraction[/bold]") qa_rows = [] for ep, transcript_path, audio_dur, _ in trans_results: trans_ep_dir = TRANS_DIR / ep.stem diarization_path = trans_ep_dir / "diarization.json" segments = load_diarized_transcript(transcript_path, diarization_path) pairs = extract_qa_pairs(segments) qa_rows.append((ep.stem, len(pairs))) console.print(f" {ep.stem}: {len(pairs)} Q&A pairs") # ── Summary ──────────────────────────────────────────────────────────────── console.print() table = Table(title="Diarization Benchmark Results", show_footer=True) table.add_column("Episode", footer="TOTAL") table.add_column("Audio", footer=f"{diar_total_audio:.0f}s") table.add_column("Wall", footer=f"{diar_total_wall:.1f}s") table.add_column("RTF", footer=f"[bold]{total_rtf:.1f}x[/bold]") table.add_column("Turns") table.add_column("Q&A pairs") for row, (ep_stem, qa_count) in zip(diar_rows, qa_rows): table.add_row( row["episode"], f"{row['audio_s']:.0f}s", f"{row['wall_s']:.1f}s", f"{row['rtf']:.1f}x", str(row["turns"]), str(qa_count), ) console.print(table) delta = total_rtf - BASELINE_RTF sign = "+" if delta >= 0 else "" console.print( f"\n[bold]vs {BASELINE_RTX}:[/bold] " f"{BASELINE_RTF:.1f}x -> {total_rtf:.1f}x " f"({sign}{delta:.1f}x, {sign}{delta/BASELINE_RTF*100:.1f}%)" ) console.print( f"\nGPU: {torch.cuda.get_device_name(0) if device == 'cuda' else 'CPU'}" )