""" Build voice profile for Clay (Nerd Junkies — fill-in for Tara) from hand-picked windows in 2015-s7e19. Adds a Mike-similarity filter (skip any chunk whose cosine vs Mike's composite is >= 0.85) so Mike's interjections during Clay's monologues don't contaminate Clay's profile. """ import os, sys os.environ["PYTHONIOENCODING"] = "utf-8" os.environ["TRANSFORMERS_OFFLINE"] = "1" if hasattr(sys.stdout, "reconfigure"): sys.stdout.reconfigure(encoding="utf-8") from pathlib import Path import numpy as np from src.gpu import ensure_cuda_libs ensure_cuda_libs() import torch from src.voice_profiler import VoiceProfiler, SpeakerProfile from rich.console import Console console = Console() BASE = Path(__file__).parent PROFILES_DIR = BASE / "voice-profiles" EPISODES_DIR = BASE / "test-data" / "episodes" # Clay windows in 2015-s7e19 (transcript-vetted: Mike+Clay banter, # no callers in these ranges). Chunks matching Mike's profile will # be filtered out at build time. CLAY_WINDOWS = { "2015-s7e19.mp3": [ (90, 150), # 01:30-02:30 — Clay introducing Nerd Junkies team (2520, 2640), # 42:00-44:00 — Clay's 2014 gaming year-in-review (2730, 2820), # 45:30-47:00 — Clay on VR/Oculus ], } COHOST_NAME = "Clay" # Mike-filter would drop everything (Mike's profile matches at 0.92+ on # any chunk in these windows because Mike is interjecting and his profile # is broad). Disabled — relying on cosine comparison at diarization time # to put Mike chunks in Mike's bucket and Clay chunks in Clay's. MIKE_FILTER_THRESHOLD = 1.01 # effectively disabled device = "cuda" if torch.cuda.is_available() else "cpu" console.print(f"Device: {device}") profiler = VoiceProfiler(PROFILES_DIR, device=device) mike = profiler.profiles.get("Mike Swanson") if mike is None or mike.composite_embedding is None: console.print("[red]Mike's profile not loaded — abort.[/red]") sys.exit(1) if COHOST_NAME not in profiler.profiles: profiler.profiles[COHOST_NAME] = SpeakerProfile( name=COHOST_NAME, role="cohost", embeddings=[], source_episodes=[], ) profile = profiler.profiles[COHOST_NAME] console.print(f"\n[bold]Building voice profile: {COHOST_NAME}[/bold]") console.print(f" Mike-similarity filter @ >= {MIKE_FILTER_THRESHOLD}") mike_norm = np.linalg.norm(mike.composite_embedding) kept = 0 skipped_mike = 0 failed = 0 for ep_name, windows in CLAY_WINDOWS.items(): ep_path = EPISODES_DIR / ep_name if not ep_path.exists(): console.print(f"[yellow] Skipping {ep_name} — not found[/yellow]") continue console.print(f"\n Loading {ep_name}...") audio = profiler._load_full_audio(ep_path) profiler._get_model() SAMPLE_RATE = 16000 chunk_s = 10.0 chunk_samples = int(chunk_s * SAMPLE_RATE) for win_start, win_end in windows: for chunk_start in range(win_start, win_end - int(chunk_s), int(chunk_s)): chunk_end = chunk_start + int(chunk_s) s = int(chunk_start * SAMPLE_RATE) e = s + chunk_samples if e > len(audio): break try: emb = profiler._embed_audio_np(audio[s:e]) # Skip chunks that match Mike strongly (Mike interjections) mike_sim = float(np.dot(mike.composite_embedding, emb) / (mike_norm * np.linalg.norm(emb) + 1e-8)) if mike_sim >= MIKE_FILTER_THRESHOLD: skipped_mike += 1 console.print(f" [dim yellow]skip Mike @ {chunk_start}s " f"(sim={mike_sim:.2f})[/dim yellow]") continue profile.embeddings.append(emb) kept += 1 console.print(f" [dim]+1 @ {chunk_start}s (mike={mike_sim:.2f})[/dim]") except Exception as ex: failed += 1 console.print(f" [red]Failed @ {chunk_start}s: {ex}[/red]") profile.source_episodes.append(ep_name) if not profile.embeddings: console.print("[red]No embeddings collected — check windows / Mike threshold[/red]") sys.exit(1) profile.compute_composite() console.print(f"\n[green]{COHOST_NAME} profile built: {profile.num_samples} embeddings, " f"skipped {skipped_mike} as Mike, {failed} failed[/green]") # Diagnostics mike_sim = float(np.dot(mike.composite_embedding, profile.composite_embedding) / (mike_norm * np.linalg.norm(profile.composite_embedding) + 1e-8)) console.print(f"[bold]Clay vs Mike similarity:[/bold] {mike_sim:.3f} (lower is better separation)") tara = profiler.profiles.get("Tara") if tara and tara.composite_embedding is not None: tara_sim = float(np.dot(tara.composite_embedding, profile.composite_embedding) / (np.linalg.norm(tara.composite_embedding) * np.linalg.norm(profile.composite_embedding) + 1e-8)) console.print(f"[bold]Clay vs Tara similarity:[/bold] {tara_sim:.3f}") profiler.save_profiles() console.print("[bold green]Profile saved.[/bold green]")