claudetools/projects/radio-show/audio-processor/build_clay_profile.py

"""
Build voice profile for Clay (Nerd Junkies — fill-in for Tara) from
hand-picked windows in 2015-s7e19.

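Includes a Mike-similarity filter (skip any chunk whose cosine vs Mike's
composite is >= MIKE_FILTER_THRESHOLD), originally intended to run at 0.85
so Mike's interjections during Clay's monologues don't contaminate Clay's
profile. The threshold is currently set to 1.01, which effectively
disables the filter (see the note next to MIKE_FILTER_THRESHOLD).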
"""
# Process bootstrap. The order of the statements below matters: the
# environment variables are set before the third-party imports that follow.
import os, sys
# NOTE(review): PYTHONIOENCODING is read at interpreter start-up, so setting
# it here can only influence child processes; this process's stdout is
# handled by the reconfigure() call below.
os.environ["PYTHONIOENCODING"] = "utf-8"
# Presumably keeps the Hugging Face stack from reaching the network when the
# embedding model is loaded — confirm against the voice profiler's loader.
os.environ["TRANSFORMERS_OFFLINE"] = "1"
# The console output of this script contains non-ASCII characters (em
# dashes), so force UTF-8 on stdout where the stream supports it.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

from pathlib import Path
import numpy as np
from src.gpu import ensure_cuda_libs
# Called before torch is imported; presumably makes the CUDA libraries
# loadable for torch — confirm in src.gpu.
ensure_cuda_libs()

import torch
from src.voice_profiler import VoiceProfiler, SpeakerProfile
from rich.console import Console

# Single rich console shared by every status message in this script.
console = Console()

# Data locations, all resolved relative to the folder holding this script.
BASE = Path(__file__).parent
PROFILES_DIR = BASE.joinpath("voice-profiles")
EPISODES_DIR = BASE.joinpath("test-data", "episodes")

# Hand-picked Clay windows, keyed by episode file name. Each entry is a
# (start, end) pair in seconds, spelled as minutes * 60 + seconds so it can
# be read directly against the transcript timestamps. The ranges are
# transcript-vetted: Mike+Clay banter, no callers. Whether chunks that
# resemble Mike are dropped at build time is governed by
# MIKE_FILTER_THRESHOLD below.
CLAY_WINDOWS = {
    "2015-s7e19.mp3": [
        (1 * 60 + 30, 2 * 60 + 30),  # 01:30-02:30 — Clay introducing Nerd Junkies team
        (42 * 60, 44 * 60),          # 42:00-44:00 — Clay's 2014 gaming year-in-review
        (45 * 60 + 30, 47 * 60),     # 45:30-47:00 — Clay on VR/Oculus
    ],
}

# Name under which the co-host's profile is stored.
COHOST_NAME = "Clay"

# A real threshold would drop every chunk: Mike interjects throughout these
# windows and his profile is broad, so it scores 0.92+ against any chunk.
# The filter is therefore switched off with a value no cosine similarity
# can reach; separating Mike chunks from Clay chunks is left to the cosine
# comparison performed at diarization time.
MIKE_FILTER_THRESHOLD = 1.01  # effectively disabled

# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
console.print(f"Device: {device}")

profiler = VoiceProfiler(PROFILES_DIR, device=device)

# Mike's composite embedding is needed for the similarity computations
# further down, so the script cannot continue without it.
mike = profiler.profiles.get("Mike Swanson")
mike_ready = mike is not None and mike.composite_embedding is not None
if not mike_ready:
    console.print("[red]Mike's profile not loaded — abort.[/red]")
    sys.exit(1)

# Register an empty co-host profile the first time this script runs; an
# already-registered profile is reused and extended instead.
known_profiles = profiler.profiles
if COHOST_NAME not in known_profiles:
    fresh_profile = SpeakerProfile(
        name=COHOST_NAME,
        role="cohost",
        embeddings=[],
        source_episodes=[],
    )
    known_profiles[COHOST_NAME] = fresh_profile

profile = known_profiles[COHOST_NAME]

console.print(f"\n[bold]Building voice profile: {COHOST_NAME}[/bold]")
console.print(f"  Mike-similarity filter @ >= {MIKE_FILTER_THRESHOLD}")

# Norm of Mike's composite embedding, computed once and reused for every
# cosine similarity below (and by the diagnostics after the loop).
mike_norm = np.linalg.norm(mike.composite_embedding)

# Chunking parameters do not depend on the episode, so they are defined once
# instead of being recomputed on every loop iteration.
SAMPLE_RATE = 16000  # assumes _load_full_audio yields 16 kHz samples — TODO confirm
chunk_s = 10.0
chunk_step = int(chunk_s)                    # chunk length in whole seconds
chunk_samples = int(chunk_s * SAMPLE_RATE)   # chunk length in samples

# Counters reported in the summary printed after the loop.
kept = 0
skipped_mike = 0
failed = 0

for ep_name, windows in CLAY_WINDOWS.items():
    ep_path = EPISODES_DIR / ep_name
    if not ep_path.exists():
        console.print(f"[yellow]  Skipping {ep_name} — not found[/yellow]")
        continue

    console.print(f"\n  Loading {ep_name}...")
    audio = profiler._load_full_audio(ep_path)
    profiler._get_model()

    for win_start, win_end in windows:
        # Walk the window in back-to-back chunks. The "+ 1" keeps the final
        # chunk that ends exactly at win_end; without it the last full chunk
        # of every window (e.g. 140-150 s of a 90-150 s window) was dropped.
        for chunk_start in range(win_start, win_end - chunk_step + 1, chunk_step):
            s = chunk_start * SAMPLE_RATE
            e = s + chunk_samples
            if e > len(audio):
                # The window runs past the end of the recording.
                break
            try:
                emb = profiler._embed_audio_np(audio[s:e])
                # Cosine similarity against Mike's composite; the epsilon
                # guards against a zero-norm embedding.
                mike_sim = float(np.dot(mike.composite_embedding, emb) /
                                 (mike_norm * np.linalg.norm(emb) + 1e-8))
            except Exception as ex:
                # Best effort: a chunk that cannot be embedded is counted and
                # reported, and the remaining chunks are still processed.
                failed += 1
                console.print(f"    [red]Failed @ {chunk_start}s: {ex}[/red]")
                continue

            # Skip chunks that match Mike strongly (Mike interjections)
            if mike_sim >= MIKE_FILTER_THRESHOLD:
                skipped_mike += 1
                console.print(f"    [dim yellow]skip Mike @ {chunk_start}s "
                              f"(sim={mike_sim:.2f})[/dim yellow]")
                continue

            profile.embeddings.append(emb)
            kept += 1
            console.print(f"    [dim]+1 @ {chunk_start}s (mike={mike_sim:.2f})[/dim]")

    # Record the episode once, even when the script is re-run against an
    # existing profile.
    if ep_name not in profile.source_episodes:
        profile.source_episodes.append(ep_name)

# Without any embeddings there is nothing to build a composite from, so stop
# here, before save_profiles() is reached.
if not profile.embeddings:
    console.print("[red]No embeddings collected — check windows / Mike threshold[/red]")
    sys.exit(1)

profile.compute_composite()
summary = (
    f"\n[green]{COHOST_NAME} profile built: {profile.num_samples} embeddings, "
    f"skipped {skipped_mike} as Mike, {failed} failed[/green]"
)
console.print(summary)

# Diagnostics: cosine similarity of the new composite against the other
# hosts' composites (epsilon guards against a zero norm).
clay_vec = profile.composite_embedding
clay_norm = np.linalg.norm(clay_vec)

mike_dot = np.dot(mike.composite_embedding, clay_vec)
mike_sim = float(mike_dot / (mike_norm * clay_norm + 1e-8))
console.print(f"[bold]Clay vs Mike similarity:[/bold] {mike_sim:.3f} (lower is better separation)")

# Tara's profile is optional; the comparison is only printed when it exists
# and has a composite embedding.
tara = profiler.profiles.get("Tara")
if tara and tara.composite_embedding is not None:
    tara_vec = tara.composite_embedding
    tara_dot = np.dot(tara_vec, clay_vec)
    tara_sim = float(tara_dot / (np.linalg.norm(tara_vec) * clay_norm + 1e-8))
    console.print(f"[bold]Clay vs Tara similarity:[/bold] {tara_sim:.3f}")

profiler.save_profiles()
console.print("[bold green]Profile saved.[/bold green]")