Mike confirmed there is no co-host named "Tom" — the voice in 2014-s6e19 and 2016-s8e43 is Tara. The 5070 Ti session fabricated the Tom identity. The voice profile itself (44 embeddings, 0.698 cosine vs Mike) is correct; only the human label was wrong. Rename swept: - voice-profiles/tom/ -> voice-profiles/tara/ (git mv preserves all .npy) - voice-profiles/profiles.json: "Tom" key -> "Tara" - build_cohost_profile.py: TOM_WINDOWS -> TARA_WINDOWS, COHOST_NAME, comments - 2026-04-27-qa-extraction-cohost-indexing.md: correction header + body sweep - 2026-04-27-4090-benchmark-and-test-set.md: closure note - .claude/memory/radio_show_no_cohost_named_tom.md: resolution + speaker roster Diarization re-run after rename so speaker_map emits "Cohost: Tara". Q&A counts unchanged (rename is label-only): 9 pairs across 6 test episodes. Tara distribution from the post-rename diarization (per-episode % of audio): 2011-03-12-hr1 140s 5.6% likely false positive (call-in only) 2012-03-10-hr1 30s 1.1% likely false positive (call-in only) 2012-06-09-hr1 340s 12.8% suspicious — pending Mike confirm 2014-s6e19 680s 23.3% confirmed 2016-s8e43 1890s 35.5% confirmed 2017-s9e30 610s 11.4% plausible — pending Mike confirm Broader speaker-roster context Mike provided this session (saved to memory): the show has had multiple co-hosts (Tara, Randall, Rob) plus producers/board ops (Andrew, Shannon, Ken, others) who would sometimes go on-air. Only Tara has a profile so far. Every other speaker is currently labeled CALLER, which means small CO-HOST attributions in unexpected episodes (e.g. 2011/2012) may actually be a producer rather than a false positive — Mike to spot-check. Action item before full-archive run: build profiles for Randall, Rob, and the named producers to avoid systematic Q&A false positives in early-years and 2018/2019 episodes. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
116 lines
3.6 KiB
Python
116 lines
3.6 KiB
Python
"""
|
|
Build voice profile for Tara (co-host) from known co-host speech windows.
|
|
|
|
Uses CALLER-labeled windows from the first 60 min of co-host-era episodes,
|
|
before any real callers would have called in.
|
|
"""
|
|
import os, sys
|
|
os.environ["PYTHONIOENCODING"] = "utf-8"
|
|
os.environ["TRANSFORMERS_OFFLINE"] = "1"
|
|
if hasattr(sys.stdout, "reconfigure"):
|
|
sys.stdout.reconfigure(encoding="utf-8")
|
|
|
|
from pathlib import Path
|
|
import json
|
|
import numpy as np
|
|
from src.gpu import ensure_cuda_libs
|
|
ensure_cuda_libs()
|
|
|
|
import torch
|
|
from src.voice_profiler import VoiceProfiler, SpeakerProfile
|
|
from rich.console import Console
|
|
|
|
# Single rich console used for all status output in this script.
console = Console()

# Data locations, all resolved relative to the directory holding this script.
BASE = Path(__file__).parent
PROFILES_DIR = BASE.joinpath("voice-profiles")
EPISODES_DIR = BASE.joinpath("test-data", "episodes")
TRANS_DIR = BASE.joinpath("test-data", "transcripts")

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
console.print(f"Device: {device}")

# Profiler bound to the profiles directory and the selected device.
profiler = VoiceProfiler(PROFILES_DIR, device=device)
|
|
|
|
# Known speech windows for Tara, keyed by episode file name.
# Each window is a (start, end) pair in seconds, taken from CALLER turns in the
# diarization that fall within the first 60 min (before real callers).
# Windows at 0-40s are excluded (promo/jingle, not Tara's voice).
TARA_WINDOWS = {
    "2014-s6e19.mp3": [(195, 260), (320, 425), (600, 650), (675, 710)],
    "2016-s8e43.mp3": [
        (100, 115), (135, 160), (270, 295), (575, 605),
        (1185, 1235), (1790, 1870), (2020, 2055),
    ],
}

# Human-readable label under which the co-host profile is stored.
COHOST_NAME = "Tara"
|
|
|
|
# Reuse the co-host profile if the profiler already holds one under this name;
# otherwise register a fresh, empty profile.
# NOTE(review): when a profile already exists, the embeddings collected below
# are appended to whatever it already contains — confirm that accumulating
# across re-runs is intended.
if COHOST_NAME in profiler.profiles:
    profile = profiler.profiles[COHOST_NAME]
else:
    profile = SpeakerProfile(
        name=COHOST_NAME,
        role="cohost",
        embeddings=[],
        source_episodes=[],
    )
    profiler.profiles[COHOST_NAME] = profile

console.print(f"\n[bold]Building co-host profile for: {COHOST_NAME}[/bold]")
|
|
|
|
# Fixed-length chunking parameters. They do not depend on the episode, so they
# are defined once instead of being recomputed inside the loop.
# NOTE(review): assumes profiler._load_full_audio returns samples at 16 kHz —
# confirm against VoiceProfiler.
SAMPLE_RATE = 16000
chunk_s = 10.0
chunk_step = int(chunk_s)
chunk_samples = int(chunk_s * SAMPLE_RATE)

# Walk every known window of every episode and embed it in 10 s chunks.
for ep_name, windows in TARA_WINDOWS.items():
    ep_path = EPISODES_DIR / ep_name
    if not ep_path.exists():
        console.print(f"[yellow] Skipping {ep_name} — not found[/yellow]")
        continue

    console.print(f"\n Loading {ep_name}...")
    audio = profiler._load_full_audio(ep_path)
    profiler._get_model()

    added = 0  # embeddings contributed by this episode
    for win_start, win_end in windows:
        # The "+ 1" keeps a chunk that ends exactly at win_end. Without it,
        # range()'s exclusive stop dropped the last full chunk of any window
        # whose length is an exact multiple of the chunk length, e.g. the
        # 640-650 s chunk of window (600, 650).
        for chunk_start in range(win_start, win_end - chunk_step + 1, chunk_step):
            s = chunk_start * SAMPLE_RATE
            e = s + chunk_samples
            if e > len(audio):
                # Chunk would run past the end of the audio; later chunks in
                # this window would too.
                break
            try:
                emb = profiler._embed_audio_np(audio[s:e])
            except Exception as ex:
                # Best-effort: report the failed chunk and carry on.
                console.print(f" [red]Failed @ {chunk_start}s: {ex}[/red]")
            else:
                profile.embeddings.append(emb)
                added += 1
                console.print(f" [dim]+1 embedding @ {chunk_start}s[/dim]")

    # Record the episode only if it contributed at least one embedding, and
    # only once, so the reported episode count stays accurate on re-runs.
    if added and ep_name not in profile.source_episodes:
        profile.source_episodes.append(ep_name)
|
|
|
|
# Stop before saving when the profile holds no embeddings at all.
if not profile.embeddings:
    console.print("[red]No embeddings collected — check episode paths[/red]")
    sys.exit(1)

profile.compute_composite()
# Messages use COHOST_NAME rather than a hardcoded label, so a future rename
# only needs to touch the constant.
console.print(
    f"\n[green]{COHOST_NAME} profile built: {profile.num_samples} embeddings "
    f"from {len(profile.source_episodes)} episodes[/green]"
)

# Verify: cosine similarity between the co-host composite and Mike's composite,
# to check that the two voices are separated.
mike = profiler.profiles.get("Mike Swanson")
if mike and mike.composite_embedding is not None and profile.composite_embedding is not None:
    host_vec = mike.composite_embedding
    cohost_vec = profile.composite_embedding
    # The 1e-8 term guards against division by zero for a zero-norm vector.
    denom = np.linalg.norm(host_vec) * np.linalg.norm(cohost_vec) + 1e-8
    sim = float(np.dot(host_vec, cohost_vec) / denom)
    console.print(f"{COHOST_NAME} vs Mike similarity: {sim:.3f} (lower is better separation)")
else:
    # Make the skipped check visible instead of passing over it silently.
    console.print(
        f"[yellow]Skipping similarity check — composite embedding missing for "
        f"Mike Swanson or {COHOST_NAME}[/yellow]"
    )

profiler.save_profiles()
console.print("[bold green]Profile saved.[/bold green]")
|