Adds a transcript-driven bumper filter to the diarization pipeline. When
a transcript segment matches qa_extractor's promo/bumper signatures, the
overlapping audio windows are labeled BUMPER and the WavLM cosine match
is skipped. Prevents music/promo from being matched against speaker
profiles (the failure mode Mike caught in 2018-s10e18 @ 09:20-10:05).
Code changes:
- src/voice_profiler.py: identify_speakers() takes optional skip_ranges
parameter; windows whose midpoint falls in a skip range get labeled
"[bumper]" and skip cosine match
- src/diarizer.py: diarize() takes optional transcript_path; pre-computes
bumper time ranges via qa_extractor._is_promo_or_bumper, passes to
identify_speakers; adds BUMPER speaker label
- benchmark.py: passes transcript_path to diarize()
Aggregate impact across 9-episode test set:
- Tara attribution: 4880s -> 3680s (-1200s / -25%)
- Q&A pairs: 17 -> 19 (+2)
  (bumper-flagged segments had been disrupting conversation detection
  in 2017-s9e30 and 2018-s10e18)
- CALLER total: 1320s -> 1190s (bumpers previously labeled CALLER moved)
- Per-episode bumpers caught: 1-8, total ~165 bumper segments across set
Remaining Tara false positives are real callers acoustically similar to
Tara (Christopher in 2018, Kay in 2012, William and Charles in 2015) and
guest Clay in 2015-s7e19 — those need profile rebuild + Clay profile,
not bumper filtering.
Adds download_full_archive.py — resumable mirror-style downloader that
walks IX server's /home/gurushow/public_html/archive/{year}/ and copies
all MP3s to archive-data/episodes/. Run is in progress (~589 files,
~10-15GB). Used to source clean profile windows for the remaining
co-hosts (Tara rebuild, Clay, Tony, Rob, Randall, producers).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
187 lines
6.7 KiB
Python
187 lines
6.7 KiB
Python
"""
Benchmark: transcribe + diarize + Q&A extraction on the local test episodes.

Every MP3 found under test-data/episodes/ is processed. Per-episode and total
realtime factors are reported, and the diarization total is compared against
the reference machine DESKTOP-0O8A1RL (RTX 5070 Ti); the reference figure is
BASELINE_RTF below.
"""
import os
import sys
import time

# Force UTF-8 console output. PYTHONIOENCODING is only read at interpreter
# startup, so for this process the reconfigure() call is what takes effect;
# the environment variable is inherited by any child processes.
os.environ["PYTHONIOENCODING"] = "utf-8"
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
# Set before the project imports below — presumably keeps Hugging Face
# transformers from reaching the network during a benchmark run.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

from pathlib import Path
from src.gpu import ensure_cuda_libs
# NOTE(review): called before `import torch` on purpose, presumably so the
# CUDA libraries are discoverable at torch import time — keep this ordering.
ensure_cuda_libs()

import torch
from src.config import load_config
from src.diarizer import diarize, VoiceProfileStore
from src.qa_extractor import load_diarized_transcript, extract_qa_pairs
from rich.console import Console
from rich.table import Table

console = Console()

BASELINE_RTX = "RTX 5070 Ti (DESKTOP-0O8A1RL)"
BASELINE_RTF = 209.7  # realtime factor measured 2026-04-27 (post co-host + batched Whisper)

BASE = Path(__file__).parent
EPISODES = sorted((BASE / "test-data" / "episodes").glob("*.mp3"))
TRANS_DIR = BASE / "test-data" / "transcripts"

config = load_config()
device = "cuda" if torch.cuda.is_available() else "cpu"

# Nothing to benchmark without input audio.
if not EPISODES:
    console.print("[red]No test episodes found in test-data/episodes/ — run Step 4 in BENCH_SETUP.md[/red]")
    sys.exit(1)

console.print("\n[bold]Computer Guru Show — Diarization Benchmark[/bold]")
console.print(f"Device : {device}" + (f" ({torch.cuda.get_device_name(0)})" if device == "cuda" else ""))
console.print(f"Baseline: {BASELINE_RTX} @ {BASELINE_RTF}x realtime")
console.print(f"Episodes: {len(EPISODES)}\n")

voice_profiles = VoiceProfileStore(
    config.resolve_path(config.diarization.voice_profiles_dir)
)
# The script refuses to run when the profile store reports no embeddings.
if not voice_profiles.embeddings:
    console.print("[red]No voice profiles loaded — copy voice-profiles/ from DESKTOP-0O8A1RL (see BENCH_SETUP.md Step 3)[/red]")
    sys.exit(1)

# ── Phase 1: Transcription ─────────────────────────────────────────────────
# Transcribe each episode unless its transcript.json already exists.
# trans_results collects one tuple per episode:
#   (episode_path, transcript_path, audio_seconds, wall_seconds)
# Cached episodes record 0.0 wall seconds and are left out of the phase totals,
# so the printed total reflects only the work done in this run.

console.print("[bold]Phase 1: Transcription[/bold]")

trans_results = []
trans_total_audio = 0.0
trans_total_wall = 0.0

import json
from src.transcriber import transcribe as _transcribe

for ep in EPISODES:
    trans_ep_dir = TRANS_DIR / ep.stem
    trans_ep_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = trans_ep_dir / "transcript.json"

    if transcript_path.exists():
        # Explicit encoding: the default is locale-dependent (e.g. cp1252 on
        # Windows). NOTE(review): assumes transcripts are saved as UTF-8 —
        # confirm against the transcript save() implementation.
        with open(transcript_path, encoding="utf-8") as f:
            td = json.load(f)
        dur = td.get("duration", 0)
        console.print(f" [dim]{ep.stem}: already transcribed ({dur:.0f}s)[/dim]")
        trans_results.append((ep, transcript_path, dur, 0.0))
        continue

    console.print(f" Transcribing {ep.name}...")
    t0 = time.monotonic()

    transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
    wall = time.monotonic() - t0
    # Guarded like the Phase 2 computation so a zero-length timing interval
    # cannot raise ZeroDivisionError.
    rtf = transcript.duration / wall if wall > 0 else 0

    # Presumably writes transcript.json into trans_ep_dir; later phases read
    # transcript_path — TODO confirm the file name used by save().
    transcript.save(trans_ep_dir)

    console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
    trans_results.append((ep, transcript_path, transcript.duration, wall))
    trans_total_audio += transcript.duration
    trans_total_wall += wall

# Only report a total when at least one episode was actually transcribed.
if trans_total_wall > 0:
    console.print(f" Transcription total: {trans_total_audio:.0f}s audio in {trans_total_wall:.1f}s = {trans_total_audio/trans_total_wall:.1f}x realtime\n")

# ── Phase 2: Diarization ───────────────────────────────────────────────────
# Diarize every episode (always re-run; only this phase is timed against the
# baseline). diar_rows collects one dict per episode for the summary table.

console.print("[bold]Phase 2: Diarization[/bold]")

diar_rows = []
diar_total_audio = 0.0
diar_total_wall = 0.0

for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem

    if audio_dur == 0:
        # Fallback: re-read the duration from the transcript on disk.
        # `json` is already imported at module level in Phase 1.
        # NOTE(review): assumes transcripts are saved as UTF-8 — confirm.
        with open(transcript_path, encoding="utf-8") as f:
            audio_dur = json.load(f).get("duration", 0)

    t0 = time.monotonic()
    # transcript_path is passed through so diarize() can use the transcript;
    # see src/diarizer.py for what it does with it.
    result = diarize(ep, voice_profiles=voice_profiles, host_match_threshold=0.85,
                     transcript_path=transcript_path)
    wall = time.monotonic() - t0
    rtf = audio_dur / wall if wall > 0 else 0

    # Presumably writes diarization.json into trans_ep_dir, which Phase 3
    # reads — TODO confirm the file name used by save().
    result.save(trans_ep_dir)

    # Seconds of speech attributed to the two labels reported here; turns with
    # any other speaker label are counted in "turns" but not in these sums.
    host_s = sum(t.end - t.start for t in result.turns if t.speaker == "HOST")
    caller_s = sum(t.end - t.start for t in result.turns if t.speaker == "CALLER")

    diar_rows.append({
        "episode": ep.stem,
        "audio_s": audio_dur,
        "wall_s": wall,
        "rtf": rtf,
        "turns": len(result.turns),
        "host_s": host_s,
        "caller_s": caller_s,
    })
    diar_total_audio += audio_dur
    diar_total_wall += wall

    console.print(
        f" {ep.stem}: {len(result.turns)} turns | "
        f"HOST {host_s:.0f}s / CALLER {caller_s:.0f}s "
        f"[{wall:.1f}s wall / {rtf:.1f}x realtime]"
    )

# Aggregate realtime factor across all episodes (0 when nothing was timed).
total_rtf = diar_total_audio / diar_total_wall if diar_total_wall > 0 else 0

# ── Phase 3: Q&A extraction ────────────────────────────────────────────────
# For each episode, combine its transcript with the diarization.json stored in
# the episode's transcript directory and count the extracted Q&A pairs.
# qa_rows holds (episode_stem, pair_count) in the same order as trans_results.

console.print("\n[bold]Phase 3: Q&A Extraction[/bold]")

qa_rows = []
for episode_path, episode_transcript, _audio_s, _wall_s in trans_results:
    stem = episode_path.stem
    episode_diarization = TRANS_DIR / stem / "diarization.json"
    qa_pairs = extract_qa_pairs(
        load_diarized_transcript(episode_transcript, episode_diarization)
    )
    pair_count = len(qa_pairs)
    qa_rows.append((stem, pair_count))
    console.print(f" {stem}: {pair_count} Q&A pairs")

# ── Summary ────────────────────────────────────────────────────────────────
# Render one table row per episode (diarization timings plus Q&A pair count),
# with totals in the footer, then compare the aggregate diarization realtime
# factor against the recorded baseline.

console.print()
table = Table(title="Diarization Benchmark Results", show_footer=True)

# Columns that carry a footer cell, in display order.
footed_columns = (
    ("Episode", "TOTAL"),
    ("Audio", f"{diar_total_audio:.0f}s"),
    ("Wall", f"{diar_total_wall:.1f}s"),
    ("RTF", f"[bold]{total_rtf:.1f}x[/bold]"),
)
for heading, footer_text in footed_columns:
    table.add_column(heading, footer=footer_text)
table.add_column("Turns")
table.add_column("Q&A pairs")

# diar_rows and qa_rows were both built by iterating trans_results, so they
# pair up positionally; the stem stored in qa_rows is not needed here.
for diar, (_stem, qa_count) in zip(diar_rows, qa_rows):
    cells = [
        diar["episode"],
        f"{diar['audio_s']:.0f}s",
        f"{diar['wall_s']:.1f}s",
        f"{diar['rtf']:.1f}x",
        str(diar["turns"]),
        str(qa_count),
    ]
    table.add_row(*cells)

console.print(table)

# Absolute and relative change versus the baseline; negatives already carry
# their own minus sign, so only non-negative values get an explicit "+".
delta = total_rtf - BASELINE_RTF
delta_pct = delta / BASELINE_RTF * 100
sign = "+" if delta >= 0 else ""
comparison = (
    f"\n[bold]vs {BASELINE_RTX}:[/bold] "
    f"{BASELINE_RTF:.1f}x -> {total_rtf:.1f}x "
    f"({sign}{delta:.1f}x, {sign}{delta_pct:.1f}%)"
)
console.print(comparison)

gpu_label = torch.cuda.get_device_name(0) if device == 'cuda' else 'CPU'
console.print(f"\nGPU: {gpu_label}")