- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike) - diarizer.py: add CO-HOST speaker label for cohost-role profiles - voice_profiler.py: emit "Cohost: <name>" label for cohost role - qa_extractor.py: overlap resolution at load time (midpoint boundary split), 4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns), _preceded_by_caller_intro() helper, _PHONE_GREETING pattern, 751-1041 + "we'll get your problem solved" promo signatures - benchmark.py: use src.transcriber.transcribe with batch_size=16 - add index_test_episodes.py and build_cohost_profile.py scripts - add .gitignore (exclude episodes, transcripts, *.db, .venv) - session log: 2026-04-27-qa-extraction-cohost-indexing.md Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs. archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
186 lines
6.6 KiB
Python
186 lines
6.6 KiB
Python
"""Benchmark the pipeline on the 6 test episodes.

Runs transcription, diarization and Q&A extraction, then reports per-episode
and total realtime factors.

Compare to DESKTOP-0O8A1RL (RTX 5070 Ti) baseline: 149.5x realtime for diarization.
"""

import sys
import os
import time

# Environment is configured before the project / torch imports further down,
# so the statement order in this header is deliberate — do not sort it.
os.environ["PYTHONIOENCODING"] = "utf-8"
if hasattr(sys.stdout, "reconfigure"):
    # Not every stdout object offers reconfigure(), hence the hasattr guard.
    sys.stdout.reconfigure(encoding="utf-8")
os.environ["TRANSFORMERS_OFFLINE"] = "1"

from pathlib import Path

from src.gpu import ensure_cuda_libs

# Called before `import torch`; presumably makes the CUDA libraries
# loadable — confirm against src/gpu.py.
ensure_cuda_libs()

import torch

from src.config import load_config
from src.diarizer import diarize, VoiceProfileStore
from src.qa_extractor import load_diarized_transcript, extract_qa_pairs

from rich.console import Console
from rich.table import Table
|
|
|
|
console = Console()

# Reference machine and the realtime factor the final summary compares against.
BASELINE_RTX = "RTX 5070 Ti (DESKTOP-0O8A1RL)"
BASELINE_RTF = 149.5  # realtime factor measured 2026-04-27

# All inputs and outputs live under test-data/ next to this script.
BASE = Path(__file__).parent
EPISODES = sorted((BASE / "test-data" / "episodes").glob("*.mp3"))
TRANS_DIR = BASE / "test-data" / "transcripts"

config = load_config()
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Preflight: nothing to benchmark without episodes.
if not EPISODES:
    console.print("[red]No test episodes found in test-data/episodes/ — run Step 4 in BENCH_SETUP.md[/red]")
    sys.exit(1)

# Banner. The GPU name is only looked up when CUDA is actually in use.
gpu_suffix = f" ({torch.cuda.get_device_name(0)})" if device == "cuda" else ""
console.print("\n[bold]Computer Guru Show — Diarization Benchmark[/bold]")
console.print(f"Device : {device}" + gpu_suffix)
console.print(f"Baseline: {BASELINE_RTX} @ {BASELINE_RTF}x realtime")
console.print(f"Episodes: {len(EPISODES)}\n")

# Preflight: the diarization phase is given these profiles, so abort early
# when the store comes back empty.
profiles_dir = config.resolve_path(config.diarization.voice_profiles_dir)
voice_profiles = VoiceProfileStore(profiles_dir)
if not voice_profiles.embeddings:
    console.print("[red]No voice profiles loaded — copy voice-profiles/ from DESKTOP-0O8A1RL (see BENCH_SETUP.md Step 3)[/red]")
    sys.exit(1)
|
|
|
|
# ── Phase 1: Transcription ─────────────────────────────────────────────────
#
# Produces test-data/transcripts/<episode>/transcript.json for every episode.
# An episode whose transcript.json already exists is NOT re-transcribed: it is
# recorded with a wall time of 0.0 and left out of the transcription totals,
# so the totals only describe work actually done in this run.

import json

from src.transcriber import transcribe as _transcribe

console.print("[bold]Phase 1: Transcription[/bold]")

# One tuple per episode, consumed by the later phases:
# (episode path, transcript.json path, audio seconds, wall seconds).
trans_results = []
trans_total_audio = 0.0
trans_total_wall = 0.0

for ep in EPISODES:
    trans_ep_dir = TRANS_DIR / ep.stem
    trans_ep_dir.mkdir(parents=True, exist_ok=True)
    transcript_path = trans_ep_dir / "transcript.json"

    if transcript_path.exists():
        # Explicit UTF-8: the platform default encoding (e.g. cp1252 on
        # Windows) can fail on non-ASCII transcript text.
        with open(transcript_path, encoding="utf-8") as f:
            td = json.load(f)
        dur = td.get("duration", 0)
        console.print(f" [dim]{ep.stem}: already transcribed ({dur:.0f}s)[/dim]")
        trans_results.append((ep, transcript_path, dur, 0.0))
        continue

    console.print(f" Transcribing {ep.name}...")
    t0 = time.monotonic()
    transcript = _transcribe(ep, model_size="large-v3", device=device, batch_size=16)
    wall = time.monotonic() - t0
    # Same zero guard as the diarization phase uses for its realtime factor.
    rtf = transcript.duration / wall if wall > 0 else 0

    # Saving happens after the timer stops, so it is not part of `wall`.
    transcript.save(trans_ep_dir)

    console.print(f" [green]{ep.stem}: {transcript.duration:.0f}s audio in {wall:.1f}s = {rtf:.1f}x realtime[/green]")
    trans_results.append((ep, transcript_path, transcript.duration, wall))
    trans_total_audio += transcript.duration
    trans_total_wall += wall

# Skipped entirely when every episode was served from an existing transcript.
if trans_total_wall > 0:
    console.print(f" Transcription total: {trans_total_audio:.0f}s audio in {trans_total_wall:.1f}s = {trans_total_audio/trans_total_wall:.1f}x realtime\n")
|
|
|
|
# ── Phase 2: Diarization ───────────────────────────────────────────────────
#
# Diarization is always re-run for every episode (there is no cache check),
# because its wall time is the number this benchmark exists to measure.

console.print("[bold]Phase 2: Diarization[/bold]")

diar_rows = []  # one dict per episode, rendered in the summary table
diar_total_audio = 0.0
diar_total_wall = 0.0

for ep, transcript_path, audio_dur, _ in trans_results:
    trans_ep_dir = TRANS_DIR / ep.stem

    if audio_dur == 0:
        # Phase 1 reported no duration; fall back to whatever transcript.json
        # says now. `json` is already imported at module level in Phase 1.
        with open(transcript_path, encoding="utf-8") as f:
            audio_dur = json.load(f).get("duration", 0)

    # Only the diarize() call is timed; saving the result is outside the window.
    t0 = time.monotonic()
    result = diarize(ep, voice_profiles=voice_profiles, host_match_threshold=0.85)
    wall = time.monotonic() - t0
    rtf = audio_dur / wall if wall > 0 else 0

    result.save(trans_ep_dir)

    # Speaking time per label. Turns carrying any other speaker label still
    # count toward "turns" but toward neither of these two sums.
    host_s = sum(t.end - t.start for t in result.turns if t.speaker == "HOST")
    caller_s = sum(t.end - t.start for t in result.turns if t.speaker == "CALLER")

    diar_rows.append({
        "episode": ep.stem,
        "audio_s": audio_dur,
        "wall_s": wall,
        "rtf": rtf,
        "turns": len(result.turns),
        "host_s": host_s,
        "caller_s": caller_s,
    })
    diar_total_audio += audio_dur
    diar_total_wall += wall

    console.print(
        f" {ep.stem}: {len(result.turns)} turns | "
        f"HOST {host_s:.0f}s / CALLER {caller_s:.0f}s "
        f"[{wall:.1f}s wall / {rtf:.1f}x realtime]"
    )

# Aggregate realtime factor across all episodes; 0 when nothing was timed.
total_rtf = diar_total_audio / diar_total_wall if diar_total_wall > 0 else 0
|
|
|
|
# ── Phase 3: Q&A extraction ────────────────────────────────────────────────
#
# Reads each episode's transcript.json together with the diarization.json
# under the same episode directory and counts the extracted Q&A pairs.
# This phase is not timed.

console.print("\n[bold]Phase 3: Q&A Extraction[/bold]")

qa_rows = []  # (episode stem, number of Q&A pairs), same order as trans_results
for ep, transcript_path, *_ in trans_results:
    episode_dir = TRANS_DIR / ep.stem
    segments = load_diarized_transcript(
        transcript_path,
        episode_dir / "diarization.json",
    )
    pair_count = len(extract_qa_pairs(segments))
    qa_rows.append((ep.stem, pair_count))
    console.print(f" {ep.stem}: {pair_count} Q&A pairs")
|
|
|
|
# ── Summary ────────────────────────────────────────────────────────────────
#
# Renders one table row per episode (diarization timings plus the Q&A pair
# count), totals in the footer, then the comparison against the baseline.

console.print()

table = Table(title="Diarization Benchmark Results", show_footer=True)
# Columns that carry a footer cell.
for heading, footer_text in (
    ("Episode", "TOTAL"),
    ("Audio", f"{diar_total_audio:.0f}s"),
    ("Wall", f"{diar_total_wall:.1f}s"),
    ("RTF", f"[bold]{total_rtf:.1f}x[/bold]"),
):
    table.add_column(heading, footer=footer_text)
# Columns without a footer.
table.add_column("Turns")
table.add_column("Q&A pairs")

# diar_rows and qa_rows were both built by iterating trans_results, so
# zipping them pairs each episode's timings with its own Q&A count.
for diar, (_, qa_count) in zip(diar_rows, qa_rows):
    cells = (
        diar["episode"],
        f"{diar['audio_s']:.0f}s",
        f"{diar['wall_s']:.1f}s",
        f"{diar['rtf']:.1f}x",
        str(diar["turns"]),
        str(qa_count),
    )
    table.add_row(*cells)

console.print(table)

# Difference to the baseline. Negative numbers already print their own "-",
# so only non-negative deltas get an explicit "+".
delta = total_rtf - BASELINE_RTF
if delta >= 0:
    sign = "+"
else:
    sign = ""
percent = delta / BASELINE_RTF * 100
console.print(
    f"\n[bold]vs {BASELINE_RTX}:[/bold] "
    f"{BASELINE_RTF:.1f}x -> {total_rtf:.1f}x "
    f"({sign}{delta:.1f}x, {sign}{percent:.1f}%)"
)

gpu_label = torch.cuda.get_device_name(0) if device == "cuda" else "CPU"
console.print(f"\nGPU: {gpu_label}")
|