Files
claudetools/projects/radio-show/audio-processor/benchmark.py
Mike Swanson b9a4bb8807 scc: 4090 benchmark with new code state — 338.1x diarize, 94.8x transcribe
Re-ran benchmark.py on GURU-BEAST-ROG against the post-overhaul code
(co-host profile, batched Whisper int8_float16, revised Q&A extractor).

Results vs 5070 Ti baseline:
- Diarization: 209.7x -> 338.1x (+61.2%)
- Transcription: 63.8x -> 94.8x (+48.6%)
- Q&A pairs: 9 vs 10 (within run-to-run noise; structural correctness matches:
  2014 = 0 callers, 2016 = 2 WiFi caller pairs)

Setup change: BENCH_SETUP.md now lists ffmpeg as a Step-2 prereq
(winget install Gyan.FFmpeg). Was missing on this machine and the pipeline
fails silently at the first diarize call without ffprobe.

Code change: benchmark.py BASELINE_RTF updated 149.5 -> 209.7 to reflect
the 5070 Ti's post-overhaul measurement (e9ac607).

Data: 6 test episode transcripts and diarizations regenerated under the
new code path (batched Whisper output + co-host-aware speaker_map).

Correction memory: voice-profiles/tom/ directory + 5070 Ti session log
fabricated a co-host named "Tom" — Mike confirms no such person exists on
the show. The audio profile is real and the diarization separation is
sound, but the human identity attached to it is wrong. Saved under
.claude/memory/radio_show_no_cohost_named_tom.md pending Mike providing
the correct name for rename.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 14:54:07 -07:00

186 lines
6.6 KiB
Python

"""
Benchmark: transcribe + diarize + Q&A extraction on the 6 test episodes.
Reports per-episode and total realtime factors.
Compare to DESKTOP-0O8A1RL (RTX 5070 Ti) baseline: 209.7x realtime for diarization (see BASELINE_RTF).
"""
import os
import sys
import time

# Request UTF-8 I/O and, where the stream supports it, switch stdout to UTF-8:
# several messages printed below contain non-ASCII characters (em dash).
os.environ["PYTHONIOENCODING"] = "utf-8"
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
# NOTE(review): presumably keeps Hugging Face transformers off the network.
# It is set ahead of the src.* imports, so keep it above them.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

from pathlib import Path

from src.gpu import ensure_cuda_libs

# Called before `import torch`. NOTE(review): looks like it prepares the CUDA
# library search path; confirm in src.gpu before reordering.
ensure_cuda_libs()

import torch
from src.config import load_config
from src.diarizer import VoiceProfileStore, diarize
from src.qa_extractor import extract_qa_pairs, load_diarized_transcript
from rich.console import Console
from rich.table import Table

console = Console()

# Reference machine and its diarization realtime factor; the summary at the
# end of the script reports this run's delta against it.
BASELINE_RTX = "RTX 5070 Ti (DESKTOP-0O8A1RL)"
BASELINE_RTF = 209.7  # realtime factor measured 2026-04-27 (post co-host + batched Whisper)

# Test fixtures live next to this script.
BASE = Path(__file__).parent
TRANS_DIR = BASE / "test-data" / "transcripts"
EPISODES = sorted((BASE / "test-data" / "episodes").glob("*.mp3"))

config = load_config()
device = "cuda" if torch.cuda.is_available() else "cpu"

# Preflight 1: nothing to benchmark without episodes.
if not EPISODES:
    console.print("[red]No test episodes found in test-data/episodes/ — run Step 4 in BENCH_SETUP.md[/red]")
    sys.exit(1)

gpu_suffix = f" ({torch.cuda.get_device_name(0)})" if device == "cuda" else ""
console.print("\n[bold]Computer Guru Show — Diarization Benchmark[/bold]")
console.print(f"Device : {device}{gpu_suffix}")
console.print(f"Baseline: {BASELINE_RTX} @ {BASELINE_RTF}x realtime")
console.print(f"Episodes: {len(EPISODES)}\n")

# Preflight 2: diarization below is given these profiles, so stop early if the
# store came up empty.
profiles_dir = config.resolve_path(config.diarization.voice_profiles_dir)
voice_profiles = VoiceProfileStore(profiles_dir)
if not voice_profiles.embeddings:
    console.print("[red]No voice profiles loaded — copy voice-profiles/ from DESKTOP-0O8A1RL (see BENCH_SETUP.md Step 3)[/red]")
    sys.exit(1)
# ── Phase 1: Transcription ─────────────────────────────────────────────────
# Transcribe each episode that has no transcript.json yet and time the call.
# Episodes with an existing transcript are recorded with 0.0 wall time and are
# left out of the transcription totals.
console.print("[bold]Phase 1: Transcription[/bold]")
# One (episode, transcript_path, audio_seconds, wall_seconds) tuple per
# episode; the later phases iterate this list.
trans_results = []
trans_total_audio = 0.0
trans_total_wall = 0.0
import json
from src.transcriber import transcribe as _transcribe

for ep in EPISODES:
    trans_ep_dir = TRANS_DIR / ep.stem
    trans_ep_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = trans_ep_dir / "transcript.json"

    if transcript_path.exists():
        # Explicit UTF-8: without it open() decodes with the locale codec,
        # which can fail on non-ASCII transcript text.
        # NOTE(review): assumes transcript.save() writes UTF-8 JSON — confirm.
        with open(transcript_path, encoding="utf-8") as f:
            td = json.load(f)
        dur = td.get("duration", 0)
        console.print(f" [dim]{ep.stem}: already transcribed ({dur:.0f}s)[/dim]")
        trans_results.append((ep, transcript_path, dur, 0.0))
        continue

    console.print(f" Transcribing {ep.name}...")
    t0 = time.monotonic()
    transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
    wall = time.monotonic() - t0
    # Guard against a zero-length timing, matching the diarization phase.
    rtf = transcript.duration / wall if wall > 0 else 0
    # Saving happens after the timer stops, so it is not part of the RTF.
    transcript.save(trans_ep_dir)
    console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
    trans_results.append((ep, transcript_path, transcript.duration, wall))
    trans_total_audio += transcript.duration
    trans_total_wall += wall

# Only report a total when at least one episode was actually transcribed.
if trans_total_wall > 0:
    console.print(f" Transcription total: {trans_total_audio:.0f}s audio in {trans_total_wall:.1f}s = {trans_total_audio/trans_total_wall:.1f}x realtime\n")
# ── Phase 2: Diarization ───────────────────────────────────────────────────
# Diarize every episode (there is no cache check here) and time only the
# diarize() call; saving the result happens after the timer stops.
console.print("[bold]Phase 2: Diarization[/bold]")
import json  # hoisted out of the loop body

diar_rows = []
diar_total_audio = 0.0
diar_total_wall = 0.0
for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem

    if audio_dur == 0:
        # No duration came through from Phase 1: fall back to the saved
        # transcript. Explicit UTF-8 avoids locale-dependent decoding.
        # NOTE(review): assumes the transcript JSON is UTF-8 — confirm.
        with open(transcript_path, encoding="utf-8") as f:
            audio_dur = json.load(f).get("duration", 0)

    t0 = time.monotonic()
    result = diarize(ep, voice_profiles=voice_profiles, host_match_threshold=0.85)
    wall = time.monotonic() - t0
    rtf = audio_dur / wall if wall > 0 else 0
    result.save(trans_ep_dir)

    # Total speaking time per role, summed over the diarized turns.
    host_s = sum(t.end - t.start for t in result.turns if t.speaker == "HOST")
    caller_s = sum(t.end - t.start for t in result.turns if t.speaker == "CALLER")

    diar_rows.append({
        "episode": ep.stem,
        "audio_s": audio_dur,
        "wall_s": wall,
        "rtf": rtf,
        "turns": len(result.turns),
        "host_s": host_s,
        "caller_s": caller_s,
    })
    diar_total_audio += audio_dur
    diar_total_wall += wall
    console.print(
        f" {ep.stem}: {len(result.turns)} turns | "
        f"HOST {host_s:.0f}s / CALLER {caller_s:.0f}s "
        f"[{wall:.1f}s wall / {rtf:.1f}x realtime]"
    )

# Aggregate realtime factor across all episodes (0 if nothing was timed).
total_rtf = diar_total_audio / diar_total_wall if diar_total_wall > 0 else 0
# ── Phase 3: Q&A extraction ────────────────────────────────────────────────
# For each episode, combine its transcript with the diarization.json in the
# same per-episode directory and count the Q&A pairs that come out.
# NOTE(review): diarization.json is presumably written by result.save() in
# the diarization phase — confirm in src.diarizer.
console.print("\n[bold]Phase 3: Q&A Extraction[/bold]")
qa_rows = []
for ep, transcript_path, _audio_dur, _wall in trans_results:
    diarization_path = TRANS_DIR / ep.stem / "diarization.json"
    qa_count = len(
        extract_qa_pairs(load_diarized_transcript(transcript_path, diarization_path))
    )
    # (episode stem, pair count) — consumed by the summary table.
    qa_rows.append((ep.stem, qa_count))
    console.print(f" {ep.stem}: {qa_count} Q&A pairs")
# ── Summary ────────────────────────────────────────────────────────────────
# Render one table row per episode with totals in the footer, then compare the
# aggregate diarization realtime factor against the recorded baseline.
console.print()
table = Table(title="Diarization Benchmark Results", show_footer=True)
# (header, extra add_column kwargs); only the first four columns have a footer.
column_specs = [
    ("Episode", {"footer": "TOTAL"}),
    ("Audio", {"footer": f"{diar_total_audio:.0f}s"}),
    ("Wall", {"footer": f"{diar_total_wall:.1f}s"}),
    ("RTF", {"footer": f"[bold]{total_rtf:.1f}x[/bold]"}),
    ("Turns", {}),
    ("Q&A pairs", {}),
]
for header, extra in column_specs:
    table.add_column(header, **extra)

# diar_rows and qa_rows are both built in trans_results order, so they pair up
# positionally.
for diar, (_, qa_count) in zip(diar_rows, qa_rows):
    table.add_row(
        diar["episode"],
        f"{diar['audio_s']:.0f}s",
        f"{diar['wall_s']:.1f}s",
        f"{diar['rtf']:.1f}x",
        str(diar["turns"]),
        str(qa_count),
    )
console.print(table)

# Absolute and relative change versus the baseline; the "+" format flag
# prints an explicit sign for non-negative values.
delta = total_rtf - BASELINE_RTF
pct = delta / BASELINE_RTF * 100
console.print(
    f"\n[bold]vs {BASELINE_RTX}:[/bold] "
    f"{BASELINE_RTF:.1f}x -> {total_rtf:.1f}x "
    f"({delta:+.1f}x, {pct:+.1f}%)"
)

gpu_name = torch.cuda.get_device_name(0) if device == "cuda" else "CPU"
console.print(f"\nGPU: {gpu_name}")