""" Build voice profile for Tara (co-host) from known co-host speech windows. Uses CALLER-labeled windows from the first 60 min of co-host-era episodes, before any real callers would have called in. """ import os, sys os.environ["PYTHONIOENCODING"] = "utf-8" os.environ["TRANSFORMERS_OFFLINE"] = "1" if hasattr(sys.stdout, "reconfigure"): sys.stdout.reconfigure(encoding="utf-8") from pathlib import Path import json import numpy as np from src.gpu import ensure_cuda_libs ensure_cuda_libs() import torch from src.voice_profiler import VoiceProfiler, SpeakerProfile from rich.console import Console console = Console() BASE = Path(__file__).parent PROFILES_DIR = BASE / "voice-profiles" EPISODES_DIR = BASE / "test-data" / "episodes" TRANS_DIR = BASE / "test-data" / "transcripts" device = "cuda" if torch.cuda.is_available() else "cpu" console.print(f"Device: {device}") profiler = VoiceProfiler(PROFILES_DIR, device=device) # Tara's known speech windows per episode # CALLER turns from diarization that are in the first 60 min (before real callers) # Windows at 0-40s excluded (promo/jingle, not Tara's voice) TARA_WINDOWS = { "2014-s6e19.mp3": [ (195, 260), (320, 425), (600, 650), (675, 710), ], "2016-s8e43.mp3": [ (100, 115), (135, 160), (270, 295), (575, 605), (1185, 1235), (1790, 1870), (2020, 2055), ], } COHOST_NAME = "Tara" if COHOST_NAME not in profiler.profiles: profiler.profiles[COHOST_NAME] = SpeakerProfile( name=COHOST_NAME, role="cohost", embeddings=[], source_episodes=[], ) profile = profiler.profiles[COHOST_NAME] console.print(f"\n[bold]Building co-host profile for: {COHOST_NAME}[/bold]") for ep_name, windows in TARA_WINDOWS.items(): ep_path = EPISODES_DIR / ep_name if not ep_path.exists(): console.print(f"[yellow] Skipping {ep_name} — not found[/yellow]") continue console.print(f"\n Loading {ep_name}...") audio = profiler._load_full_audio(ep_path) profiler._get_model() SAMPLE_RATE = 16000 chunk_s = 10.0 chunk_samples = int(chunk_s * SAMPLE_RATE) for win_start, win_end in windows: for chunk_start in range(win_start, win_end - int(chunk_s), int(chunk_s)): chunk_end = chunk_start + int(chunk_s) s = int(chunk_start * SAMPLE_RATE) e = s + chunk_samples if e > len(audio): break try: emb = profiler._embed_audio_np(audio[s:e]) profile.embeddings.append(emb) console.print(f" [dim]+1 embedding @ {chunk_start}s[/dim]") except Exception as ex: console.print(f" [red]Failed @ {chunk_start}s: {ex}[/red]") profile.source_episodes.append(ep_name) if not profile.embeddings: console.print("[red]No embeddings collected — check episode paths[/red]") sys.exit(1) profile.compute_composite() console.print(f"\n[green]Tara profile built: {profile.num_samples} embeddings " f"from {len(profile.source_episodes)} episodes[/green]") # Verify: check cosine similarity vs Mike to ensure separation mike = profiler.profiles.get("Mike Swanson") if mike and mike.composite_embedding is not None and profile.composite_embedding is not None: sim = float(np.dot(mike.composite_embedding, profile.composite_embedding) / (np.linalg.norm(mike.composite_embedding) * np.linalg.norm(profile.composite_embedding) + 1e-8)) console.print(f"Tara vs Mike similarity: {sim:.3f} (lower is better separation)") profiler.save_profiles() console.print("[bold green]Profile saved.[/bold green]")