First attempt at Clay's voice profile from 2015-s7e19 produced a Clay-vs-Mike cosine similarity of 0.994 — essentially a Mike clone.

Root cause: the 10s WavLM x-vector chunks averaged Mike's frequent interjections together with Clay's dialogue, and Mike's well-trained profile dominated the resulting embedding signal.

Mike's call: skip Clay and accept the 2015-s7e19 Q&A as noisy. Clay rarely appears in other episodes, so the cost of not having his profile is bounded to this one episode plus any rare future appearances.

Cleanup:
- voice-profiles/clay/ removed
- voice-profiles/profiles.json: Clay entry removed
- Memory updated to record the decision and the failure mode

Kept build_clay_profile.py in-repo as documentation of the attempt and the Mike-similarity-filter pattern. Useful starting point if a future attempt provides cleaner pure-Clay timestamps.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
137 lines
5.0 KiB
Python
137 lines
5.0 KiB
Python
"""
|
|
Build voice profile for Clay (Nerd Junkies — fill-in for Tara) from
|
|
hand-picked windows in 2015-s7e19.
|
|
|
|
Adds a Mike-similarity filter (skip any chunk whose cosine vs Mike's
|
|
composite is >= 0.85) so Mike's interjections during Clay's monologues
|
|
don't contaminate Clay's profile.
|
|
"""
|
|
import os, sys
|
|
os.environ["PYTHONIOENCODING"] = "utf-8"
|
|
os.environ["TRANSFORMERS_OFFLINE"] = "1"
|
|
if hasattr(sys.stdout, "reconfigure"):
|
|
sys.stdout.reconfigure(encoding="utf-8")
|
|
|
|
from pathlib import Path
|
|
import numpy as np
|
|
from src.gpu import ensure_cuda_libs
|
|
ensure_cuda_libs()
|
|
|
|
import torch
|
|
from src.voice_profiler import VoiceProfiler, SpeakerProfile
|
|
from rich.console import Console
|
|
|
|
console = Console()
|
|
|
|
# All data locations are resolved relative to the directory holding this script.
BASE = Path(__file__).parent
PROFILES_DIR = BASE / "voice-profiles"
EPISODES_DIR = BASE / "test-data" / "episodes"

# Clay windows in 2015-s7e19 as (start_seconds, end_seconds) pairs
# (transcript-vetted: Mike+Clay banter, no callers in these ranges).
# Chunks matching Mike's profile are only dropped at build time when
# MIKE_FILTER_THRESHOLD below is lowered to a reachable value; with the
# current setting every chunk in these windows is kept.
CLAY_WINDOWS = {
    "2015-s7e19.mp3": [
        (90, 150),     # 01:30-02:30 — Clay introducing Nerd Junkies team
        (2520, 2640),  # 42:00-44:00 — Clay's 2014 gaming year-in-review
        (2730, 2820),  # 45:30-47:00 — Clay on VR/Oculus
    ],
}

# Key under which the profile is stored in the profiler's profile mapping.
COHOST_NAME = "Clay"

# Mike-filter would drop everything (Mike's profile matches at 0.92+ on
# any chunk in these windows because Mike is interjecting and his profile
# is broad). Disabled — relying on cosine comparison at diarization time
# to put Mike chunks in Mike's bucket and Clay chunks in Clay's.
# A cosine similarity can never reach 1.01, so the ">=" test never fires.
MIKE_FILTER_THRESHOLD = 1.01  # effectively disabled

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
console.print(f"Device: {device}")

profiler = VoiceProfiler(PROFILES_DIR, device=device)

# Mike's composite embedding is the reference every chunk is compared with,
# so the script cannot continue without it.
mike = profiler.profiles.get("Mike Swanson")
mike_ready = mike is not None and mike.composite_embedding is not None
if not mike_ready:
    console.print("[red]Mike's profile not loaded — abort.[/red]")
    sys.exit(1)

# Reuse an already-registered profile for the co-host, or register an empty one.
try:
    profile = profiler.profiles[COHOST_NAME]
except KeyError:
    profile = SpeakerProfile(
        name=COHOST_NAME,
        role="cohost",
        embeddings=[],
        source_episodes=[],
    )
    profiler.profiles[COHOST_NAME] = profile

console.print(f"\n[bold]Building voice profile: {COHOST_NAME}[/bold]")
console.print(f" Mike-similarity filter @ >= {MIKE_FILTER_THRESHOLD}")

# Norm of Mike's composite, computed once for the per-chunk cosine similarity.
mike_norm = np.linalg.norm(mike.composite_embedding)

# Counters reported in the build summary.
kept = 0
skipped_mike = 0
failed = 0

# Chunking parameters are loop-invariant, so they are defined once up front.
SAMPLE_RATE = 16000  # assumes _load_full_audio() yields 16 kHz samples — TODO confirm
chunk_s = 10.0
chunk_step = int(chunk_s)
chunk_samples = int(chunk_s * SAMPLE_RATE)

for ep_name, windows in CLAY_WINDOWS.items():
    ep_path = EPISODES_DIR / ep_name
    if not ep_path.exists():
        console.print(f"[yellow] Skipping {ep_name} — not found[/yellow]")
        continue

    console.print(f"\n Loading {ep_name}...")
    audio = profiler._load_full_audio(ep_path)
    # Presumably loads the embedding model before the first chunk — confirm.
    profiler._get_model()

    for win_start, win_end in windows:
        # range() excludes its stop value, so "+ 1" is needed to include the
        # last start position whose full chunk still ends inside the window
        # (e.g. 140 for the 90-150 window).  Without it the final chunk of
        # every window was silently dropped.
        last_start_exclusive = win_end - chunk_step + 1
        for chunk_start in range(win_start, last_start_exclusive, chunk_step):
            s = chunk_start * SAMPLE_RATE
            e = s + chunk_samples
            if e > len(audio):
                # The episode is shorter than this window: nothing further in
                # this window can be cut as a full chunk.
                break

            try:
                emb = profiler._embed_audio_np(audio[s:e])
                # Cosine similarity vs Mike's composite; the epsilon guards
                # against a zero-norm embedding.
                mike_sim = float(
                    np.dot(mike.composite_embedding, emb)
                    / (mike_norm * np.linalg.norm(emb) + 1e-8)
                )
            except Exception as ex:
                # Best-effort: one bad chunk must not abort the whole build.
                failed += 1
                console.print(f" [red]Failed @ {chunk_start}s: {ex}[/red]")
                continue

            # Skip chunks that match Mike strongly (Mike interjections)
            if mike_sim >= MIKE_FILTER_THRESHOLD:
                skipped_mike += 1
                console.print(f" [dim yellow]skip Mike @ {chunk_start}s "
                              f"(sim={mike_sim:.2f})[/dim yellow]")
                continue

            profile.embeddings.append(emb)
            kept += 1
            console.print(f" [dim]+1 @ {chunk_start}s (mike={mike_sim:.2f})[/dim]")

    profile.source_episodes.append(ep_name)

def _cosine(a, b):
    """Return the cosine similarity of two vectors as a float.

    The 1e-8 term in the denominator avoids division by zero for a
    zero-norm vector.
    """
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8))


# Nothing was collected: report it and stop before computing or saving anything.
if not profile.embeddings:
    console.print("[red]No embeddings collected — check windows / Mike threshold[/red]")
    sys.exit(1)

# Presumably fills in profile.composite_embedding, which is read below — confirm.
profile.compute_composite()
console.print(
    f"\n[green]{COHOST_NAME} profile built: {profile.num_samples} embeddings, "
    f"skipped {skipped_mike} as Mike, {failed} failed[/green]"
)

# Diagnostics: how close the new composite sits to the existing profiles.
mike_sim = _cosine(mike.composite_embedding, profile.composite_embedding)
console.print(f"[bold]Clay vs Mike similarity:[/bold] {mike_sim:.3f} (lower is better separation)")

# Tara's profile is optional; the comparison is skipped when it is absent.
tara = profiler.profiles.get("Tara")
if tara and tara.composite_embedding is not None:
    tara_sim = _cosine(tara.composite_embedding, profile.composite_embedding)
    console.print(f"[bold]Clay vs Tara similarity:[/bold] {tara_sim:.3f}")

profiler.save_profiles()
console.print("[bold green]Profile saved.[/bold green]")