Files
claudetools/projects/radio-show/audio-processor/build_clay_profile.py
Mike Swanson 4c89402df8 radio: skip Clay profile build (failed) — accept 2015-s7e19 Q&A as noisy
First attempt at Clay's voice profile from 2015-s7e19 produced
Clay-vs-Mike cosine similarity of 0.994 — essentially a Mike clone.
Root cause: 10s WavLM x-vector chunks averaged Mike's frequent
interjections together with Clay's dialogue, and Mike's well-trained
profile dominated the resulting embedding signal.

Mike's call: skip Clay, accept the 2015-s7e19 Q&A as noisy. Clay rarely
appears in other episodes, so the cost of not having his profile is
bounded to this one episode plus any rare future appearances.

Cleanup:
- voice-profiles/clay/ removed
- voice-profiles/profiles.json: Clay entry removed
- Memory updated to record the decision and the failure mode

Kept build_clay_profile.py in-repo as documentation of the attempt and
the Mike-similarity-filter pattern. Useful starting point if a future
attempt provides cleaner pure-Clay timestamps.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 16:36:46 -07:00

137 lines
5.0 KiB
Python

"""
Build voice profile for Clay (Nerd Junkies — fill-in for Tara) from
hand-picked windows in 2015-s7e19.
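Includes a Mike-similarity filter (skip any chunk whose cosine vs Mike's
composite is >= MIKE_FILTER_THRESHOLD) meant to keep Mike's interjections
during Clay's monologues from contaminating Clay's profile. The filter was
designed around a 0.85 cutoff, but MIKE_FILTER_THRESHOLD is currently 1.01,
which effectively disables it (see the note next to that constant).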
"""
import os
import sys

# Both variables are assigned before the remaining imports in this file run,
# so anything that reads them at import time sees these values.
# NOTE(review): PYTHONIOENCODING is consulted by the interpreter at startup,
# so assigning it here presumably only affects child processes; the
# reconfigure() call below is what switches this process's stdout to UTF-8.
os.environ["PYTHONIOENCODING"] = "utf-8"
# NOTE(review): presumably keeps Hugging Face transformers from going to the
# network for model files — confirm against the transformers docs.
os.environ["TRANSFORMERS_OFFLINE"] = "1"

# Not every object installed as sys.stdout offers reconfigure(), hence the
# hasattr guard. The call must be indented under the `if` to be its body.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
from pathlib import Path
import numpy as np
from src.gpu import ensure_cuda_libs
# Called between the two imports on purpose, i.e. before torch is imported.
# NOTE(review): presumably this makes the CUDA shared libraries discoverable
# so that the torch import below can load them — confirm in src/gpu.py.
ensure_cuda_libs()
import torch
from src.voice_profiler import VoiceProfiler, SpeakerProfile
from rich.console import Console
# Single rich Console instance; all progress and diagnostic output of this
# script is printed through it.
console = Console()
# All data locations are derived from the directory that holds this script.
BASE = Path(__file__).parent
# Directory handed to VoiceProfiler for the stored speaker profiles.
PROFILES_DIR = BASE.joinpath("voice-profiles")
# Directory searched for the episode audio files named in CLAY_WINDOWS.
EPISODES_DIR = BASE.joinpath("test-data", "episodes")
# Key under which the resulting profile is stored in profiler.profiles.
COHOST_NAME = "Clay"

# Cosine-similarity cutoff against Mike's composite embedding: a chunk at or
# above it is skipped. Cosine similarity cannot exceed 1.0, so 1.01 means no
# chunk is ever skipped. The filter is switched off because Mike interjects
# throughout these windows and his profile is broad: every chunk matched him
# at 0.92+, so a real cutoff would have dropped everything. Sorting Mike
# chunks into Mike's bucket and Clay chunks into Clay's is left to the cosine
# comparison at diarization time.
MIKE_FILTER_THRESHOLD = 1.01  # effectively disabled

# Hand-picked (start_second, end_second) ranges, per episode file, in
# 2015-s7e19. Transcript-vetted: Mike+Clay banter only, no callers.
CLAY_WINDOWS = {
    "2015-s7e19.mp3": [
        (90, 150),     # 01:30-02:30: Clay introducing the Nerd Junkies team
        (2520, 2640),  # 42:00-44:00: Clay's 2014 gaming year-in-review
        (2730, 2820),  # 45:30-47:00: Clay on VR/Oculus
    ],
}
# Prefer CUDA when torch reports it as available; the chosen device string is
# handed to VoiceProfiler.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
console.print(f"Device: {device}")

profiler = VoiceProfiler(PROFILES_DIR, device=device)

# Mike's composite embedding is the reference for the per-chunk similarity
# check and for the final diagnostic, so the script stops if it is missing.
mike = profiler.profiles.get("Mike Swanson")
if not (mike is not None and mike.composite_embedding is not None):
    console.print("[red]Mike's profile not loaded — abort.[/red]")
    sys.exit(1)
# Reuse the co-host's profile when one is already registered; otherwise
# register an empty one for the embeddings collected below.
if COHOST_NAME not in profiler.profiles:
    profiler.profiles[COHOST_NAME] = SpeakerProfile(
        name=COHOST_NAME, role="cohost", embeddings=[], source_episodes=[],
    )
profile = profiler.profiles[COHOST_NAME]

console.print(f"\n[bold]Building voice profile: {COHOST_NAME}[/bold]")
console.print(f" Mike-similarity filter @ >= {MIKE_FILTER_THRESHOLD}")

# Norm of Mike's composite embedding, computed once and reused as part of the
# denominator of every cosine similarity against him.
mike_norm = np.linalg.norm(mike.composite_embedding)

# Running tallies: chunks added, chunks skipped as Mike, chunks that raised.
kept = skipped_mike = failed = 0
# Embedding extraction: cut every vetted window into fixed-length chunks,
# embed each chunk, and collect the embeddings into the co-host's profile.
# The chunking constants do not depend on the episode, so they are set once.
# Assumes the array returned by _load_full_audio is sampled at 16 kHz —
# TODO confirm against VoiceProfiler.
SAMPLE_RATE = 16000
chunk_s = 10.0
chunk_step_s = int(chunk_s)  # range() needs an integer step
chunk_samples = int(chunk_s * SAMPLE_RATE)

for ep_name, windows in CLAY_WINDOWS.items():
    ep_path = EPISODES_DIR / ep_name
    if not ep_path.exists():
        console.print(f"[yellow] Skipping {ep_name} — not found[/yellow]")
        continue

    console.print(f"\n Loading {ep_name}...")
    audio = profiler._load_full_audio(ep_path)
    # NOTE(review): presumably loads/caches the embedding model before the
    # first chunk is embedded — confirm in VoiceProfiler.
    profiler._get_model()

    for win_start, win_end in windows:
        # The "+ 1" keeps the last chunk that still fits inside the window.
        # With an exclusive stop of `win_end - chunk_step_s`, a window whose
        # length is a multiple of the chunk size lost its final chunk
        # (e.g. 90-150 s produced 5 chunks instead of 6).
        last_start_exclusive = win_end - chunk_step_s + 1
        for chunk_start in range(win_start, last_start_exclusive, chunk_step_s):
            s = int(chunk_start * SAMPLE_RATE)
            e = s + chunk_samples
            if e > len(audio):
                # The window runs past the end of the file; later chunks of
                # this window would too.
                break

            # Best effort per chunk: a failure is counted and reported, and
            # the remaining chunks are still processed.
            try:
                emb = profiler._embed_audio_np(audio[s:e])
                # Cosine similarity against Mike; 1e-8 guards a zero norm.
                mike_sim = float(np.dot(mike.composite_embedding, emb) /
                                 (mike_norm * np.linalg.norm(emb) + 1e-8))
            except Exception as ex:
                failed += 1
                console.print(f" [red]Failed @ {chunk_start}s: {ex}[/red]")
                continue

            # Skip chunks that match Mike strongly (Mike interjections).
            if mike_sim >= MIKE_FILTER_THRESHOLD:
                skipped_mike += 1
                console.print(f" [dim yellow]skip Mike @ {chunk_start}s "
                              f"(sim={mike_sim:.2f})[/dim yellow]")
                continue

            profile.embeddings.append(emb)
            kept += 1
            console.print(f" [dim]+1 @ {chunk_start}s (mike={mike_sim:.2f})[/dim]")

    # Record the episode once, even when the script is re-run against a
    # profile that already lists it.
    if ep_name not in profile.source_episodes:
        profile.source_episodes.append(ep_name)
def _cosine(a, b):
    """Return the cosine similarity of *a* and *b* as a Python float.

    The 1e-8 added to the denominator guards against a zero norm.
    """
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))


# With no embeddings there is nothing to build a composite from, so stop
# here, before anything is saved.
if not profile.embeddings:
    console.print("[red]No embeddings collected — check windows / Mike threshold[/red]")
    sys.exit(1)

profile.compute_composite()
summary = (
    f"\n[green]{COHOST_NAME} profile built: {profile.num_samples} embeddings, "
    f"skipped {skipped_mike} as Mike, {failed} failed[/green]"
)
console.print(summary)

# Diagnostics: how close the new composite sits to the existing profiles.
clay_composite = profile.composite_embedding
mike_sim = _cosine(mike.composite_embedding, clay_composite)
console.print(f"[bold]Clay vs Mike similarity:[/bold] {mike_sim:.3f} (lower is better separation)")

# The comparison against Tara is optional: it is reported only when her
# profile is loaded and has a composite embedding.
tara = profiler.profiles.get("Tara")
if tara and tara.composite_embedding is not None:
    tara_sim = _cosine(tara.composite_embedding, clay_composite)
    console.print(f"[bold]Clay vs Tara similarity:[/bold] {tara_sim:.3f}")

profiler.save_profiles()
console.print("[bold green]Profile saved.[/bold green]")