# claudetools/projects/radio-show/audio-processor/build_cohost_profile.py

"""
Build voice profile for Tom (co-host) from known co-host speech windows.

Uses CALLER-labeled windows from the first 60 min of co-host-era episodes,
before any real callers would have called in.
"""
import os
import sys

# Set before the third-party imports further down, presumably so those
# libraries see the values at import time.  TRANSFORMERS_OFFLINE=1 is the
# Hugging Face switch for "do not touch the network".
os.environ.update(
    PYTHONIOENCODING="utf-8",
    TRANSFORMERS_OFFLINE="1",
)

# Switch stdout to UTF-8 when the stream supports reconfiguration; streams
# without a `reconfigure` attribute are left untouched.
_reconfigure_stdout = getattr(sys.stdout, "reconfigure", None)
if _reconfigure_stdout is not None:
    _reconfigure_stdout(encoding="utf-8")

from pathlib import Path
import json
import numpy as np
# NOTE(review): ensure_cuda_libs() is deliberately called before `import torch`
# — presumably so the CUDA shared libraries can be located when torch loads.
# Confirm against src/gpu.py before reordering these imports.
from src.gpu import ensure_cuda_libs
ensure_cuda_libs()

import torch
from src.voice_profiler import VoiceProfiler, SpeakerProfile
from rich.console import Console

console = Console()

# Every data directory is resolved relative to the folder holding this script.
BASE = Path(__file__).parent
PROFILES_DIR = BASE.joinpath("voice-profiles")
EPISODES_DIR = BASE.joinpath("test-data", "episodes")
TRANS_DIR = BASE.joinpath("test-data", "transcripts")

# Use the GPU whenever torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
console.print(f"Device: {device}")

# The profiler is pointed at PROFILES_DIR and runs on the chosen device.
profiler = VoiceProfiler(PROFILES_DIR, device=device)

# Hand-picked (start, end) spans, in seconds from the start of each episode,
# in which Tom is the one speaking.  They are the CALLER-labeled diarization
# turns that fall inside the first 60 minutes, i.e. before real callers.
# Anything in the opening 0-40 s is left out: that is promo/jingle audio,
# not Tom's voice.
TOM_WINDOWS = {
    "2014-s6e19.mp3": [(195, 260), (320, 425), (600, 650), (675, 710)],
    "2016-s8e43.mp3": [
        (100, 115), (135, 160), (270, 295), (575, 605),
        (1185, 1235), (1790, 1870), (2020, 2055),
    ],
}

# Key under which the co-host profile is stored in `profiler.profiles`.
COHOST_NAME = "Tom"

# Reuse the entry already present in `profiler.profiles` for the co-host;
# only when there is none do we register a blank profile to fill in below.
profiles_by_name = profiler.profiles
if COHOST_NAME not in profiles_by_name:
    blank_profile = SpeakerProfile(
        name=COHOST_NAME,
        role="cohost",
        embeddings=[],
        source_episodes=[],
    )
    profiles_by_name[COHOST_NAME] = blank_profile

profile = profiles_by_name[COHOST_NAME]
console.print(f"\n[bold]Building co-host profile for: {COHOST_NAME}[/bold]")

# Chunking parameters are the same for every episode, so define them once.
SAMPLE_RATE = 16000  # samples per second; assumes _load_full_audio yields 16 kHz audio — TODO confirm
chunk_s = 10.0       # length of each embedded chunk, in seconds
chunk_step_s = int(chunk_s)
chunk_samples = int(chunk_s * SAMPLE_RATE)

# Walk every labelled window of every episode, cut it into back-to-back
# 10-second chunks and add one embedding per chunk to the co-host profile.
# NOTE(review): embeddings are appended to whatever the profile already holds;
# if a previous run's embeddings are loaded from disk, re-running this script
# will add the same chunks again — confirm whether the profile should be reset.
for ep_name, windows in TOM_WINDOWS.items():
    ep_path = EPISODES_DIR / ep_name
    if not ep_path.exists():
        console.print(f"[yellow]  Skipping {ep_name} — not found[/yellow]")
        continue

    console.print(f"\n  Loading {ep_name}...")
    audio = profiler._load_full_audio(ep_path)
    profiler._get_model()  # presumably makes sure the embedding model is loaded

    added = 0  # embeddings contributed by this episode
    for win_start, win_end in windows:
        # range() excludes its stop value, so the "+ 1" is required to keep the
        # last chunk that ends exactly at win_end (e.g. 640-650 of (600, 650)).
        # Windows shorter than one chunk still yield nothing.
        last_start_exclusive = win_end - chunk_step_s + 1
        for chunk_start in range(win_start, last_start_exclusive, chunk_step_s):
            s = chunk_start * SAMPLE_RATE
            e = s + chunk_samples
            if e > len(audio):
                # The episode audio ends before this chunk does; later chunks
                # of this window cannot fit either.
                break
            try:
                emb = profiler._embed_audio_np(audio[s:e])
            except Exception as ex:
                # Best effort: report the failed chunk and keep going.
                console.print(f"    [red]Failed @ {chunk_start}s: {ex}[/red]")
            else:
                profile.embeddings.append(emb)
                added += 1
                console.print(f"    [dim]+1 embedding @ {chunk_start}s[/dim]")

    # Only list episodes that actually contributed, and never list one twice.
    if added and ep_name not in profile.source_episodes:
        profile.source_episodes.append(ep_name)

# Nothing was embedded: report it and stop with a non-zero exit status
# before the composite is computed or anything is saved.
if not profile.embeddings:
    console.print("[red]No embeddings collected — check episode paths[/red]")
    raise SystemExit(1)

profile.compute_composite()
summary_line = (
    f"\n[green]Tom profile built: {profile.num_samples} embeddings "
    f"from {len(profile.source_episodes)} episodes[/green]"
)
console.print(summary_line)

# Sanity check: cosine similarity between the new composite and the
# "Mike Swanson" composite, reported only when both composites exist.
mike = profiler.profiles.get("Mike Swanson")
if mike and mike.composite_embedding is not None and profile.composite_embedding is not None:
    mike_vec = mike.composite_embedding
    tom_vec = profile.composite_embedding
    # The small epsilon keeps the division defined for zero-length vectors.
    norm_product = np.linalg.norm(mike_vec) * np.linalg.norm(tom_vec) + 1e-8
    sim = float(np.dot(mike_vec, tom_vec) / norm_product)
    console.print(f"Tom vs Mike similarity: {sim:.3f} (lower is better separation)")

profiler.save_profiles()
console.print("[bold green]Profile saved.[/bold green]")