radio show: co-host voice profile, Q&A extraction fixes, archive index
- Build Tom (co-host) voice profile (44 embeddings, 0.698 similarity to Mike)
- diarizer.py: add CO-HOST speaker label for cohost-role profiles
- voice_profiler.py: emit "Cohost: <name>" label for cohost role
- qa_extractor.py: overlap resolution at load time (midpoint boundary split), 4s CALLER-preference threshold, turn-based caller-intro lookback (2 HOST turns), _preceded_by_caller_intro() helper, _PHONE_GREETING pattern, 751-1041 + "we'll get your problem solved" promo signatures
- benchmark.py: use src.transcriber.transcribe with batch_size=16
- add index_test_episodes.py and build_cohost_profile.py scripts
- add .gitignore (exclude episodes, transcripts, *.db, .venv)
- session log: 2026-04-27-qa-extraction-cohost-indexing.md

Result: 2016-s8e43 drops from 12 false-positive Q&A pairs to 2 real caller pairs. archive.db: 6 episodes, 762 segments, 10 Q&A pairs, FTS5 search verified.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
115
projects/radio-show/audio-processor/build_cohost_profile.py
Normal file
115
projects/radio-show/audio-processor/build_cohost_profile.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""
|
||||
Build voice profile for Tom (co-host) from known co-host speech windows.
|
||||
|
||||
Uses CALLER-labeled windows from the first 60 min of co-host-era episodes,
|
||||
before any real callers would have called in.
|
||||
"""
|
||||
import os, sys
|
||||
# Process-wide environment tweaks; these run before the heavier imports below.
# NOTE(review): PYTHONIOENCODING is normally read at interpreter startup, so
# setting it here presumably only matters for child processes — the explicit
# reconfigure() call is what switches this process's stdout to UTF-8.
# NOTE(review): TRANSFORMERS_OFFLINE=1 presumably keeps model loading off the
# network — confirm against the libraries used by src.voice_profiler.
os.environ.update(
    {
        "PYTHONIOENCODING": "utf-8",
        "TRANSFORMERS_OFFLINE": "1",
    }
)

# Not every stdout object offers reconfigure(); only call it when it is there.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
|
||||
|
||||
from pathlib import Path
|
||||
import json
|
||||
import numpy as np
|
||||
from src.gpu import ensure_cuda_libs
|
||||
ensure_cuda_libs()
|
||||
|
||||
import torch
|
||||
from src.voice_profiler import VoiceProfiler, SpeakerProfile
|
||||
from rich.console import Console
|
||||
|
||||
# Rich console used for all progress and status output in this script.
console = Console()

# All data locations are resolved relative to the directory holding this script.
BASE = Path(__file__).parent
PROFILES_DIR = BASE.joinpath("voice-profiles")
EPISODES_DIR = BASE.joinpath("test-data", "episodes")
TRANS_DIR = BASE.joinpath("test-data", "transcripts")

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
console.print(f"Device: {device}")

# The profiler is pointed at the voice-profiles directory and the chosen device.
profiler = VoiceProfiler(PROFILES_DIR, device=device)
|
||||
|
||||
# Hand-picked (start_second, end_second) windows of Tom speaking, keyed by
# episode file name.
#   * Taken from CALLER turns in the diarization that fall within the first
#     60 min of the episode (before real callers).
#   * Windows at 0-40s are excluded (promo/jingle, not Tom's voice).
TOM_WINDOWS = {
    "2014-s6e19.mp3": [(195, 260), (320, 425), (600, 650), (675, 710)],
    "2016-s8e43.mp3": [
        (100, 115), (135, 160), (270, 295), (575, 605),
        (1185, 1235), (1790, 1870), (2020, 2055),
    ],
}
|
||||
|
||||
# Name under which the co-host profile is stored in profiler.profiles.
COHOST_NAME = "Tom"

# Register an empty co-host profile only when none exists under that name;
# an already-present profile is picked up as-is and extended.
if COHOST_NAME not in profiler.profiles:
    empty_cohost = SpeakerProfile(
        name=COHOST_NAME,
        role="cohost",
        embeddings=[],
        source_episodes=[],
    )
    profiler.profiles[COHOST_NAME] = empty_cohost

profile = profiler.profiles[COHOST_NAME]

console.print(f"\n[bold]Building co-host profile for: {COHOST_NAME}[/bold]")
|
||||
|
||||
# Chunking parameters. They are the same for every episode, so they are set
# once here instead of being re-assigned on each loop iteration.
# NOTE(review): SAMPLE_RATE is used to turn seconds into sample indices, which
# assumes _load_full_audio() returns audio at 16 kHz — confirm.
SAMPLE_RATE = 16000
chunk_s = 10.0
chunk_step = int(chunk_s)                   # chunk length / stride in whole seconds
chunk_samples = int(chunk_s * SAMPLE_RATE)  # chunk length in samples

# For every listed episode that exists on disk, cut each window into
# consecutive, non-overlapping 10-second chunks and add one embedding per
# chunk to the co-host profile.
for ep_name, windows in TOM_WINDOWS.items():
    ep_path = EPISODES_DIR / ep_name
    if not ep_path.exists():
        console.print(f"[yellow] Skipping {ep_name} — not found[/yellow]")
        continue

    console.print(f"\n Loading {ep_name}...")
    audio = profiler._load_full_audio(ep_path)
    # Presumably makes sure the embedding model is loaded before embedding.
    profiler._get_model()

    for win_start, win_end in windows:
        # A chunk starting at t fits in the window when t + chunk_step <= win_end.
        # range() excludes its stop value, hence the "+ 1": without it the last
        # full chunk is dropped whenever the window length is an exact multiple
        # of the chunk length (and a window of exactly one chunk yields nothing).
        last_start_exclusive = win_end - chunk_step + 1
        for chunk_start in range(win_start, last_start_exclusive, chunk_step):
            s = chunk_start * SAMPLE_RATE
            e = s + chunk_samples
            if e > len(audio):
                # Window runs past the end of the loaded audio; later chunks
                # in this window would too.
                break
            # Best-effort per chunk: a failed embedding is reported and
            # skipped so the remaining chunks are still processed.
            try:
                emb = profiler._embed_audio_np(audio[s:e])
            except Exception as ex:
                console.print(f" [red]Failed @ {chunk_start}s: {ex}[/red]")
            else:
                profile.embeddings.append(emb)
                console.print(f" [dim]+1 embedding @ {chunk_start}s[/dim]")

    # Record the episode once, even if the script is rerun against a profile
    # that already lists it.
    if ep_name not in profile.source_episodes:
        profile.source_episodes.append(ep_name)
|
||||
|
||||
# Nothing was embedded (e.g. no episode file was found): report and exit
# with a non-zero status instead of computing a composite.
if not profile.embeddings:
    console.print("[red]No embeddings collected — check episode paths[/red]")
    sys.exit(1)

# Compute the profile's composite from the collected embeddings, then report
# the totals.
profile.compute_composite()
summary = (
    f"\n[green]Tom profile built: {profile.num_samples} embeddings "
    f"from {len(profile.source_episodes)} episodes[/green]"
)
console.print(summary)
|
||||
|
||||
# Sanity check: report the cosine similarity between the co-host composite and
# the "Mike Swanson" composite, when both are available.
mike = profiler.profiles.get("Mike Swanson")
can_compare = (
    bool(mike)
    and mike.composite_embedding is not None
    and profile.composite_embedding is not None
)
if can_compare:
    host_vec = mike.composite_embedding
    cohost_vec = profile.composite_embedding
    dot = np.dot(host_vec, cohost_vec)
    # The small epsilon keeps the division defined for zero-length vectors.
    scale = np.linalg.norm(host_vec) * np.linalg.norm(cohost_vec) + 1e-8
    sim = float(dot / scale)
    console.print(f"Tom vs Mike similarity: {sim:.3f} (lower is better separation)")

# Save the profiles via the profiler, with or without the comparison above.
profiler.save_profiles()
console.print("[bold green]Profile saved.[/bold green]")
|
||||
Reference in New Issue
Block a user