Audio processor: working voice profiler with WavLM speaker embeddings

- Voice profiler using microsoft/wavlm-base-sv (512-dim x-vector embeddings)
- Bootstrap from archive: 180 embeddings from 9 episodes across 2010-2018
- Host identification accuracy: 0.87-0.98 similarity for live speech, 0.60-0.64 for non-host audio (produced intros, co-host)
- Dropped speechbrain dependency (requires torchaudio, CUDA version conflicts)
- Patched torchaudio CUDA 12.8/13.1 version check (warning instead of error)
- Profile stored in voice-profiles/mike-swanson/ with per-chunk embeddings

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 12:19:13 -07:00
parent 87f5a9306a
commit 826141a319
2 changed files with 430 additions and 6 deletions
--- a/projects/radio-show/audio-processor/src/cli.py
+++ b/projects/radio-show/audio-processor/src/cli.py
@@ -334,13 +334,29 @@ def _cmd_split(args, config):

 def _cmd_bootstrap_voice(args, config):
    """Bootstrap host voice profile from archive episodes."""
-    console.print("[bold]Bootstrapping host voice profile[/bold]")
-    console.print(f"Archive: {args.archive_dir}")
-    console.print(f"Speaker: {args.speaker_name}")
-    console.print(f"Sampling {args.sample_count} episodes")
+    from .voice_profiler import VoiceProfiler

-    # TODO: Implement archive sampling + diarization + embedding extraction
-    console.print("[yellow]Not yet implemented — run individual diarizations first[/yellow]")
+    archive_dir = Path(args.archive_dir)
+    profiler = VoiceProfiler(
+        config.resolve_path(config.paths.voice_profiles),
+        device="cuda",
+    )
+
+    # Find MP3 files in archive directory
+    mp3_files = sorted(archive_dir.glob("**/*.mp3"))
+    if not mp3_files:
+        console.print(f"[red]No MP3 files found in {archive_dir}[/red]")
+        return
+
+    # Sample if we have more than requested
+    if len(mp3_files) > args.sample_count:
+        step = len(mp3_files) // args.sample_count
+        mp3_files = [mp3_files[i * step] for i in range(args.sample_count)]
+
+    console.print(f"[dim]Found {len(mp3_files)} episodes to process[/dim]")
+
+    profiler.bootstrap_host_from_episodes(mp3_files, host_name=args.speaker_name)
+    profiler.print_profiles()


 def _cmd_review_elements(args, config):