Files
claudetools/projects/radio-show/audio-processor/benchmark.py
Mike Swanson c760e430c0 radio: bumper detection in diarizer + full archive download script
Adds a transcript-driven bumper filter to the diarization pipeline. When
a transcript segment matches qa_extractor's promo/bumper signatures, the
overlapping audio windows are labeled BUMPER and the WavLM cosine match
is skipped. Prevents music/promo from being matched against speaker
profiles (the failure mode Mike caught in 2018-s10e18 @ 09:20-10:05).

Code changes:
- src/voice_profiler.py: identify_speakers() takes optional skip_ranges
  parameter; windows whose midpoint falls in a skip range get labeled
  "[bumper]" and skip cosine match
- src/diarizer.py: diarize() takes optional transcript_path; pre-computes
  bumper time ranges via qa_extractor._is_promo_or_bumper, passes to
  identify_speakers; adds BUMPER speaker label
- benchmark.py: passes transcript_path to diarize()

Aggregate impact across 9-episode test set:
  Tara attribution: 4880s -> 3680s  (-1200s / -25%)
  Q&A pairs: 17 -> 19 (+2)
    (bumper-flagged segments had been disrupting conversation detection
     in 2017-s9e30 and 2018-s10e18)
  CALLER total: 1320s -> 1190s  (bumpers previously labeled CALLER moved)
  Per-episode bumpers caught: 1-8, total ~165 bumper segments across set

Remaining Tara false positives are real callers acoustically similar to
Tara (Christopher in 2018, Kay in 2012, William and Charles in 2015) and
guest Clay in 2015-s7e19 — those need profile rebuild + Clay profile,
not bumper filtering.

Adds download_full_archive.py — resumable mirror-style downloader that
walks IX server's /home/gurushow/public_html/archive/{year}/ and copies
all MP3s to archive-data/episodes/. Run is in progress (~589 files,
~10-15GB). Used to source clean profile windows for the remaining
co-hosts (Tara rebuild, Clay, Tony, Rob, Randall, producers).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 16:17:50 -07:00

187 lines
6.7 KiB
Python

"""
Benchmark: transcribe + diarize + Q&A extraction on every MP3 found in
test-data/episodes/.
Reports per-episode and total realtime factors.
Compare to the DESKTOP-0O8A1RL (RTX 5070 Ti) diarization baseline stored in
BASELINE_RTF below (this docstring previously quoted 149.5x realtime).
"""
import sys
import os
import time

# Console messages below contain non-ASCII characters (e.g. the em dash in the
# banner), so force UTF-8 output where the stream supports reconfiguration.
os.environ["PYTHONIOENCODING"] = "utf-8"
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# NOTE(review): set before torch / src.* are imported — presumably so the
# model libraries pulled in by those imports see it. Keep it ahead of them.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

from pathlib import Path

from src.gpu import ensure_cuda_libs

# Called before `import torch`; the ordering is preserved deliberately.
ensure_cuda_libs()

import torch

from src.config import load_config
from src.diarizer import diarize, VoiceProfileStore
from src.qa_extractor import load_diarized_transcript, extract_qa_pairs
from rich.console import Console
from rich.table import Table

console = Console()

# Reference machine and the diarization speed the summary is compared against.
BASELINE_RTX = "RTX 5070 Ti (DESKTOP-0O8A1RL)"
BASELINE_RTF = 209.7  # realtime factor measured 2026-04-27 (post co-host + batched Whisper)

# Test fixtures live next to this script.
BASE = Path(__file__).parent
EPISODES = sorted((BASE / "test-data" / "episodes").glob("*.mp3"))
TRANS_DIR = BASE / "test-data" / "transcripts"

config = load_config()

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Nothing to benchmark without episodes: explain and stop.
if not EPISODES:
    console.print("[red]No test episodes found in test-data/episodes/ — run Step 4 in BENCH_SETUP.md[/red]")
    sys.exit(1)

# Run banner: device, baseline and episode count.
console.print("\n[bold]Computer Guru Show — Diarization Benchmark[/bold]")
device_line = f"Device : {device}"
if device == "cuda":
    device_line += f" ({torch.cuda.get_device_name(0)})"
console.print(device_line)
console.print(f"Baseline: {BASELINE_RTX} @ {BASELINE_RTF}x realtime")
console.print(f"Episodes: {len(EPISODES)}\n")

# Load the speaker profiles used by diarization; abort if none are available.
profiles_dir = config.resolve_path(config.diarization.voice_profiles_dir)
voice_profiles = VoiceProfileStore(profiles_dir)
if not voice_profiles.embeddings:
    console.print("[red]No voice profiles loaded — copy voice-profiles/ from DESKTOP-0O8A1RL (see BENCH_SETUP.md Step 3)[/red]")
    sys.exit(1)
# ── Phase 1: Transcription ─────────────────────────────────────────────────
# For each episode, reuse <TRANS_DIR>/<episode stem>/transcript.json when it
# already exists; otherwise run the transcriber, time it and save the result.
# Builds trans_results: one (episode path, transcript path, audio seconds,
# wall seconds) tuple per episode, with wall == 0.0 for cached transcripts.
# Only freshly transcribed episodes count towards the Phase 1 totals.
import json

from src.transcriber import transcribe as _transcribe

console.print("[bold]Phase 1: Transcription[/bold]")
trans_results = []
trans_total_audio = 0.0
trans_total_wall = 0.0

for ep in EPISODES:
    trans_ep_dir = TRANS_DIR / ep.stem
    trans_ep_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = trans_ep_dir / "transcript.json"

    if transcript_path.exists():
        # Read JSON as UTF-8 explicitly instead of the platform default
        # encoding, which can fail on non-ASCII transcript text.
        # NOTE(review): assumes transcript.save() writes UTF-8/ASCII — confirm.
        with open(transcript_path, encoding="utf-8") as f:
            td = json.load(f)
        dur = td.get("duration", 0)
        console.print(f" [dim]{ep.stem}: already transcribed ({dur:.0f}s)[/dim]")
        trans_results.append((ep, transcript_path, dur, 0.0))
        continue

    console.print(f" Transcribing {ep.name}...")
    t0 = time.monotonic()
    transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
    wall = time.monotonic() - t0
    # Guard the division the same way Phase 2 does, so a zero-length timing
    # interval reports 0x instead of raising ZeroDivisionError.
    rtf = transcript.duration / wall if wall > 0 else 0
    # NOTE(review): presumably writes transcript.json into trans_ep_dir, which
    # is the path recorded in trans_results — confirm against the transcriber.
    transcript.save(trans_ep_dir)
    console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
    trans_results.append((ep, transcript_path, transcript.duration, wall))
    trans_total_audio += transcript.duration
    trans_total_wall += wall

# The total line is skipped when every episode came from cache (no wall time).
if trans_total_wall > 0:
    console.print(f" Transcription total: {trans_total_audio:.0f}s audio in {trans_total_wall:.1f}s = {trans_total_audio/trans_total_wall:.1f}x realtime\n")
# ── Phase 2: Diarization ───────────────────────────────────────────────────
# Diarize every episode from Phase 1, timing only the diarize() call, and save
# the result into the episode's transcript directory. Builds diar_rows (one
# stats dict per episode) plus the running totals and total_rtf that the
# summary reads.
import json  # hoisted out of the loop body; harmless if already imported

console.print("[bold]Phase 2: Diarization[/bold]")
diar_rows = []
diar_total_audio = 0.0
diar_total_wall = 0.0

for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem

    if audio_dur == 0:
        # Phase 1 reported no duration: fall back to the value stored in the
        # transcript JSON. Read as UTF-8 explicitly instead of relying on the
        # platform default encoding.
        # NOTE(review): assumes the transcript file is UTF-8/ASCII — confirm.
        with open(transcript_path, encoding="utf-8") as f:
            audio_dur = json.load(f).get("duration", 0)

    t0 = time.monotonic()
    # Per the commit notes, transcript_path enables the transcript-driven
    # bumper filter inside diarize().
    result = diarize(
        ep,
        voice_profiles=voice_profiles,
        host_match_threshold=0.85,
        transcript_path=transcript_path,
    )
    wall = time.monotonic() - t0
    rtf = audio_dur / wall if wall > 0 else 0
    # Saving happens after the timer stops, so it is not part of the RTF.
    result.save(trans_ep_dir)

    # Seconds of audio attributed to each of the two reported speaker labels.
    host_s = sum(t.end - t.start for t in result.turns if t.speaker == "HOST")
    caller_s = sum(t.end - t.start for t in result.turns if t.speaker == "CALLER")

    diar_rows.append({
        "episode": ep.stem,
        "audio_s": audio_dur,
        "wall_s": wall,
        "rtf": rtf,
        "turns": len(result.turns),
        "host_s": host_s,
        "caller_s": caller_s,
    })
    diar_total_audio += audio_dur
    diar_total_wall += wall
    console.print(
        f" {ep.stem}: {len(result.turns)} turns | "
        f"HOST {host_s:.0f}s / CALLER {caller_s:.0f}s "
        f"[{wall:.1f}s wall / {rtf:.1f}x realtime]"
    )

# Aggregate realtime factor across all episodes (0 when nothing was timed).
total_rtf = diar_total_audio / diar_total_wall if diar_total_wall > 0 else 0
# ── Phase 3: Q&A extraction ────────────────────────────────────────────────
# Combine each episode's transcript with the diarization.json in its transcript
# directory, extract Q&A pairs, and record (episode stem, pair count) in
# qa_rows for the summary table.
console.print("\n[bold]Phase 3: Q&A Extraction[/bold]")
qa_rows = []
for episode, transcript_json, _audio_s, _wall_s in trans_results:
    diarization_json = TRANS_DIR / episode.stem / "diarization.json"
    diarized_segments = load_diarized_transcript(transcript_json, diarization_json)
    pair_count = len(extract_qa_pairs(diarized_segments))
    qa_rows.append((episode.stem, pair_count))
    console.print(f" {episode.stem}: {pair_count} Q&A pairs")
# ── Summary ────────────────────────────────────────────────────────────────
# Render the per-episode results table with totals in the footer, then the
# comparison of the aggregate realtime factor against the stored baseline.
console.print()
table = Table(title="Diarization Benchmark Results", show_footer=True)

# (header, footer) per column; an empty footer is the add_column default.
column_specs = (
    ("Episode", "TOTAL"),
    ("Audio", f"{diar_total_audio:.0f}s"),
    ("Wall", f"{diar_total_wall:.1f}s"),
    ("RTF", f"[bold]{total_rtf:.1f}x[/bold]"),
    ("Turns", ""),
    ("Q&A pairs", ""),
)
for header, footer in column_specs:
    table.add_column(header, footer=footer)

# diar_rows and qa_rows are built from the same episode list, in the same order.
for stats, (_stem, qa_count) in zip(diar_rows, qa_rows):
    cells = [
        stats["episode"],
        f"{stats['audio_s']:.0f}s",
        f"{stats['wall_s']:.1f}s",
        f"{stats['rtf']:.1f}x",
        str(stats["turns"]),
        str(qa_count),
    ]
    table.add_row(*cells)
console.print(table)

# Absolute and relative difference from the baseline; negative values already
# carry their own minus sign, so only non-negative ones get an explicit "+".
delta = total_rtf - BASELINE_RTF
percent = delta / BASELINE_RTF * 100
if delta >= 0:
    sign = "+"
else:
    sign = ""
console.print(
    f"\n[bold]vs {BASELINE_RTX}:[/bold] "
    f"{BASELINE_RTF:.1f}x -> {total_rtf:.1f}x "
    f"({sign}{delta:.1f}x, {sign}{percent:.1f}%)"
)

gpu_name = torch.cuda.get_device_name(0) if device == "cuda" else "CPU"
console.print(f"\nGPU: {gpu_name}")