Re-ran benchmark.py on GURU-BEAST-ROG against the post-overhaul code
(co-host profile, batched Whisper int8_float16, revised Q&A extractor).
Results vs 5070 Ti baseline:
- Diarization: 209.7x -> 338.1x (+61.2%)
- Transcription: 63.8x -> 94.8x (+48.6%)
- Q&A pairs: 9 vs 10 (within run-to-run noise; structural correctness matches:
2014 = 0 callers, 2016 = 2 WiFi caller pairs)
Setup change: BENCH_SETUP.md now lists ffmpeg as a Step-2 prereq
(winget install Gyan.FFmpeg). Was missing on this machine and the pipeline
fails silently at the first diarize call without ffprobe.
Code change: benchmark.py BASELINE_RTF updated 149.5 -> 209.7 to reflect
the 5070 Ti's post-overhaul measurement (e9ac607).
Data: 6 test episode transcripts and diarizations regenerated under the
new code path (batched Whisper output + co-host-aware speaker_map).
Correction memory: voice-profiles/tom/ directory + 5070 Ti session log
fabricated a co-host named "Tom" — Mike confirms no such person exists on
the show. The audio profile is real and the diarization separation is
sound, but the human identity attached to it is wrong. Saved under
.claude/memory/radio_show_no_cohost_named_tom.md pending Mike providing
the correct name for rename.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
186 lines
6.6 KiB
Python
186 lines
6.6 KiB
Python
"""
|
|
Benchmark: transcribe + diarize + Q&A extraction on the 6 test episodes.
|
|
Reports per-episode and total realtime factors.
|
|
Compare to DESKTOP-0O8A1RL (RTX 5070 Ti) baseline: 209.7x realtime for diarization.
|
|
"""
|
|
# ── Bootstrap: environment, imports, constants ─────────────────────────────
# NOTE: statement order in this section is deliberate. Environment variables
# are set and ensure_cuda_libs() is called *before* the heavy imports that
# follow them; do not regroup the imports without checking why.
import os
import sys
import time

# Advertise UTF-8 as the I/O encoding through the environment and, when the
# stream supports it, switch the current stdout to UTF-8 as well.
os.environ["PYTHONIOENCODING"] = "utf-8"
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Set before the model-related imports below — presumably so Hugging Face
# libraries stay offline; TODO confirm against the pipeline's expectations.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

from pathlib import Path
from src.gpu import ensure_cuda_libs

# Called before `import torch`; presumably prepares CUDA library lookup —
# verify in src/gpu.py before moving it.
ensure_cuda_libs()

import torch
from src.config import load_config
from src.diarizer import diarize, VoiceProfileStore
from src.qa_extractor import load_diarized_transcript, extract_qa_pairs
from rich.console import Console
from rich.table import Table

# Shared rich console used for every message the benchmark prints.
console = Console()

# Reference machine and its diarization realtime factor; the summary at the
# end of the script reports this run relative to these values.
BASELINE_RTX = "RTX 5070 Ti (DESKTOP-0O8A1RL)"
BASELINE_RTF = 209.7  # realtime factor measured 2026-04-27 (post co-host + batched Whisper)

# Test inputs (MP3 episodes, sorted by path) and the directory that receives
# one sub-directory of transcript/diarization output per episode.
BASE = Path(__file__).parent
_TEST_DATA = BASE / "test-data"
EPISODES = sorted((_TEST_DATA / "episodes").glob("*.mp3"))
TRANS_DIR = _TEST_DATA / "transcripts"

config = load_config()

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
|
|
|
|
# ── Preflight checks and banner ────────────────────────────────────────────
# Abort early when there is nothing to benchmark.
if not EPISODES:
    console.print("[red]No test episodes found in test-data/episodes/ — run Step 4 in BENCH_SETUP.md[/red]")
    sys.exit(1)

console.print("\n[bold]Computer Guru Show — Diarization Benchmark[/bold]")
console.print(f"Device : {device}" + (f" ({torch.cuda.get_device_name(0)})" if device == "cuda" else ""))
console.print(f"Baseline: {BASELINE_RTX} @ {BASELINE_RTF}x realtime")
console.print(f"Episodes: {len(EPISODES)}\n")

# The pipeline has been observed to fail silently at the first diarize call
# when ffprobe is missing, so surface that up front. This is a warning rather
# than a hard exit because the tools might be reachable by some means other
# than PATH — NOTE(review): promote to sys.exit(1) if PATH is the only lookup.
import shutil

_missing_tools = [tool for tool in ("ffmpeg", "ffprobe") if shutil.which(tool) is None]
if _missing_tools:
    console.print(
        f"[yellow]Warning: {', '.join(_missing_tools)} not found on PATH — "
        "diarization may fail silently (see BENCH_SETUP.md Step 2: "
        "winget install Gyan.FFmpeg)[/yellow]"
    )

# Load the speaker voice profiles from the directory named in the config;
# the benchmark refuses to run when none were loaded.
voice_profiles = VoiceProfileStore(
    config.resolve_path(config.diarization.voice_profiles_dir)
)
if not voice_profiles.embeddings:
    console.print("[red]No voice profiles loaded — copy voice-profiles/ from DESKTOP-0O8A1RL (see BENCH_SETUP.md Step 3)[/red]")
    sys.exit(1)
|
|
|
|
# ── Phase 1: Transcription ─────────────────────────────────────────────────
# Transcribe every episode that does not already have a transcript.json and
# time the ones that are actually transcribed. Episodes with an existing
# transcript are reused and recorded with a wall time of 0.0, so they do not
# contribute to the Phase 1 totals.

console.print("[bold]Phase 1: Transcription[/bold]")

# One (episode path, transcript.json path, audio seconds, wall seconds) tuple
# per episode; consumed by the later phases.
trans_results = []
trans_total_audio = 0.0
trans_total_wall = 0.0

import json
from src.transcriber import transcribe as _transcribe

for ep in EPISODES:
    trans_ep_dir = TRANS_DIR / ep.stem
    trans_ep_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = trans_ep_dir / "transcript.json"

    if transcript_path.exists():
        # Explicit UTF-8: without it the file is decoded with the platform
        # locale codec, which is not UTF-8 on a default Windows setup.
        # NOTE(review): assumes transcripts are written as UTF-8/ASCII JSON —
        # confirm against Transcript.save.
        with open(transcript_path, encoding="utf-8") as f:
            td = json.load(f)
        dur = td.get("duration", 0)
        console.print(f" [dim]{ep.stem}: already transcribed ({dur:.0f}s)[/dim]")
        trans_results.append((ep, transcript_path, dur, 0.0))
        continue

    console.print(f" Transcribing {ep.name}...")
    t0 = time.monotonic()

    transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
    wall = time.monotonic() - t0
    # Guard the division the same way Phase 2 does, so a zero elapsed time
    # reports 0 instead of raising ZeroDivisionError.
    rtf = transcript.duration / wall if wall > 0 else 0

    # Presumably writes transcript.json into the episode directory — the
    # later phases read transcript_path; verify against Transcript.save.
    transcript.save(trans_ep_dir)

    console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
    trans_results.append((ep, transcript_path, transcript.duration, wall))
    trans_total_audio += transcript.duration
    trans_total_wall += wall

# Only print a total when at least one episode was transcribed in this run.
if trans_total_wall > 0:
    console.print(f" Transcription total: {trans_total_audio:.0f}s audio in {trans_total_wall:.1f}s = {trans_total_audio/trans_total_wall:.1f}x realtime\n")
|
|
|
|
# ── Phase 2: Diarization ───────────────────────────────────────────────────
# Diarize every episode, time each call, and collect one summary row per
# episode plus running totals for the final table.

console.print("[bold]Phase 2: Diarization[/bold]")

diar_rows = []
diar_total_audio = 0.0
diar_total_wall = 0.0

for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem

    # Fallback: when Phase 1 recorded no duration, re-read it from the saved
    # transcript. Uses the module-level `json` import from Phase 1 and an
    # explicit UTF-8 decode (the locale codec is not UTF-8 on default Windows).
    if audio_dur == 0:
        with open(transcript_path, encoding="utf-8") as f:
            audio_dur = json.load(f).get("duration", 0)

    t0 = time.monotonic()
    result = diarize(ep, voice_profiles=voice_profiles, host_match_threshold=0.85)
    wall = time.monotonic() - t0
    rtf = audio_dur / wall if wall > 0 else 0

    # Presumably writes diarization.json into the episode directory, which
    # Phase 3 reads — verify against the diarization result's save().
    result.save(trans_ep_dir)

    # Total speaking time for the turns labelled HOST and CALLER; turns with
    # any other speaker label are counted in "turns" but not in these sums.
    host_s = sum(t.end - t.start for t in result.turns if t.speaker == "HOST")
    caller_s = sum(t.end - t.start for t in result.turns if t.speaker == "CALLER")

    diar_rows.append({
        "episode": ep.stem,
        "audio_s": audio_dur,
        "wall_s": wall,
        "rtf": rtf,
        "turns": len(result.turns),
        "host_s": host_s,
        "caller_s": caller_s,
    })
    diar_total_audio += audio_dur
    diar_total_wall += wall

    console.print(
        f" {ep.stem}: {len(result.turns)} turns | "
        f"HOST {host_s:.0f}s / CALLER {caller_s:.0f}s "
        f"[{wall:.1f}s wall / {rtf:.1f}x realtime]"
    )

# Aggregate realtime factor over all episodes (0 when nothing was timed).
total_rtf = diar_total_audio / diar_total_wall if diar_total_wall > 0 else 0
|
|
|
|
# ── Phase 3: Q&A extraction ────────────────────────────────────────────────
# Merge each episode's transcript with its diarization, extract Q&A pairs,
# and record how many pairs were found per episode.

console.print("\n[bold]Phase 3: Q&A Extraction[/bold]")

# (episode stem, number of Q&A pairs) in the same order as trans_results.
qa_rows = []
for episode, transcript_json, _audio_s, _wall_s in trans_results:
    episode_dir = TRANS_DIR / episode.stem
    diarized_segments = load_diarized_transcript(
        transcript_json,
        episode_dir / "diarization.json",
    )
    pair_count = len(extract_qa_pairs(diarized_segments))
    qa_rows.append((episode.stem, pair_count))
    console.print(f" {episode.stem}: {pair_count} Q&A pairs")
|
|
|
|
# ── Summary ────────────────────────────────────────────────────────────────
# Render the per-episode results table, then compare the aggregate
# diarization realtime factor against the recorded baseline.

console.print()

table = Table(title="Diarization Benchmark Results", show_footer=True)

# Columns that carry a totals footer, in display order.
for heading, footer_text in (
    ("Episode", "TOTAL"),
    ("Audio", f"{diar_total_audio:.0f}s"),
    ("Wall", f"{diar_total_wall:.1f}s"),
    ("RTF", f"[bold]{total_rtf:.1f}x[/bold]"),
):
    table.add_column(heading, footer=footer_text)

# Columns without a footer.
table.add_column("Turns")
table.add_column("Q&A pairs")

# diar_rows and qa_rows were both built by iterating trans_results, so they
# pair up positionally; only the count is needed from each qa_rows entry.
for diar_row, (_stem, qa_count) in zip(diar_rows, qa_rows):
    cells = (
        diar_row["episode"],
        f"{diar_row['audio_s']:.0f}s",
        f"{diar_row['wall_s']:.1f}s",
        f"{diar_row['rtf']:.1f}x",
        str(diar_row["turns"]),
        str(qa_count),
    )
    table.add_row(*cells)

console.print(table)

# Difference to the baseline, absolute and as a percentage. Non-negative
# deltas get an explicit "+"; negative ones already carry their own "-".
delta = total_rtf - BASELINE_RTF
if delta >= 0:
    sign = "+"
else:
    sign = ""
delta_pct = delta / BASELINE_RTF * 100

console.print(
    f"\n[bold]vs {BASELINE_RTX}:[/bold] "
    f"{BASELINE_RTF:.1f}x -> {total_rtf:.1f}x "
    f"({sign}{delta:.1f}x, {sign}{delta_pct:.1f}%)"
)

# Name the hardware this run executed on.
gpu_label = torch.cuda.get_device_name(0) if device == 'cuda' else 'CPU'
console.print(f"\nGPU: {gpu_label}")
|