radio: diarization pipeline fixes, benchmark setup, test episode set

- Fix voice_profiler threshold bug (HOST label overwrote Unknown unconditionally)
- Audio preload optimization: single ffmpeg per episode, 149.5x realtime on 5070 Ti
- WavLM threshold raised to 0.85 (Mike 0.90-0.99, callers 0.46-0.83)
- Promo/bumper filter: weighted signature scoring, 42->27 clean Q&A pairs
- Text-only Q&A fallback for episodes with no CALLER diarization labels
- TRANSFORMERS_OFFLINE=1 to skip HuggingFace freshness checks
- Add diarize_2018.py for targeted re-run + FTS5 rebuild
- Add benchmark.py + BENCH_SETUP.md for GURU-BEAST-ROG (RTX 4090) comparison
- Commit 9-episode training diarization.json outputs
- Session log: 2026-04-27-diarization-pipeline.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 13:20:10 -07:00
parent 206cd2f929
commit 79abef9dc9
21 changed files with 4720 additions and 202 deletions
--- a/projects/radio-show/audio-processor/src/diarizer.py
+++ b/projects/radio-show/audio-processor/src/diarizer.py
@@ -158,117 +158,86 @@ def diarize(audio_path: str | Path,
            voice_profiles: VoiceProfileStore | None = None,
            min_speakers: int = 1,
            max_speakers: int = 6,
-            host_match_threshold: float = 0.75) -> DiarizationResult:
-    """Run speaker diarization on an audio file."""
-    from pyannote.audio import Pipeline
+            host_match_threshold: float = 0.85) -> DiarizationResult:
+    """Run speaker diarization using WavLM sliding-window speaker identification.
+
+    Uses the built-in VoiceProfiler (WavLM x-vectors) — no HuggingFace token
+    or gated model required. Identifies HOST vs non-HOST speakers using the
+    stored voice profile for Mike Swanson.
+    """
    import torch
+    from .voice_profiler import VoiceProfiler

    audio_path = Path(audio_path)
    console.print(f"[bold]Diarizing:[/bold] {audio_path.name}")

-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
    console.print(f"[dim]Device: {device}[/dim]")

-    pipeline = Pipeline.from_pretrained(
-        "pyannote/speaker-diarization-3.1"
-    ).to(device)
+    # Locate voice profiles directory from the VoiceProfileStore path
+    profiles_dir = voice_profiles.profiles_dir if voice_profiles else Path("voice-profiles")

-    diarization = pipeline(
-        str(audio_path),
-        min_speakers=min_speakers,
-        max_speakers=max_speakers,
+    profiler = VoiceProfiler(profiles_dir, device=device)
+
+    if not profiler.profiles:
+        console.print("[yellow]No voice profiles found — labeling all as HOST[/yellow]")
+        # Return a single HOST turn covering the whole episode
+        from .voice_profiler import VoiceProfiler as VP
+        duration = profiler._get_duration(audio_path)
+        return DiarizationResult(
+            turns=[SpeakerTurn(speaker="HOST", start=0.0, end=duration)],
+            num_speakers=1,
+            speaker_map={"HOST": "HOST"},
+        )
+
+    # Sliding-window identification: 10s windows, 5s hop
+    voice_segs = profiler.identify_speakers(
+        audio_path, window_s=10.0, hop_s=5.0,
+        threshold=host_match_threshold,
    )

-    # Extract turns
+    # Convert VoiceSegment labels to HOST / CALLER
    raw_turns = []
-    for turn, _, speaker in diarization.itertracks(yield_label=True):
+    for seg in voice_segs:
+        label = seg.speaker_label.split(" (")[0]  # strip confidence score
+        if label.startswith("Host:") or label.startswith("Host "):
+            speaker = "HOST"
+        elif label == "[error]":
+            speaker = "UNKNOWN"
+        else:
+            speaker = "CALLER"
+
        raw_turns.append(SpeakerTurn(
            speaker=speaker,
-            start=turn.start,
-            end=turn.end,
+            start=seg.start,
+            end=seg.end,
+            confidence=float(seg.speaker_label.split("(")[-1].rstrip(")"))
+                       if "(" in seg.speaker_label else 0.5,
        ))

-    # Count unique speakers
-    raw_speakers = set(t.speaker for t in raw_turns)
-    console.print(f"[dim]Detected {len(raw_speakers)} speakers[/dim]")
-
-    # Match against voice profiles if available
-    speaker_map = {}
-    if voice_profiles and voice_profiles.embeddings:
-        console.print("[dim]Matching speakers against voice profiles...[/dim]")
-        embedding_model = pipeline.embedding  # pyannote's embedding model
-
-        # Get embeddings for each detected speaker
-        from pyannote.audio import Inference
-        inference = Inference(pipeline.embedding, window="whole")
-
-        for raw_label in raw_speakers:
-            # Get segments for this speaker
-            speaker_segments = [t for t in raw_turns if t.speaker == raw_label]
-            total_time = sum(t.duration for t in speaker_segments)
-
-            # Use the longest segment for embedding
-            longest = max(speaker_segments, key=lambda t: t.duration)
-
-            try:
-                # Extract embedding from audio segment
-                import torchaudio
-                waveform, sr = torchaudio.load(
-                    str(audio_path),
-                    frame_offset=int(longest.start * sr if 'sr' in dir() else longest.start * 16000),
-                    num_frames=int(longest.duration * sr if 'sr' in dir() else longest.duration * 16000),
-                )
-                # This is simplified — proper implementation would use pyannote's
-                # embedding extraction pipeline
-                match_name, score = voice_profiles.match_embedding(
-                    np.zeros(256),  # placeholder
-                    threshold=host_match_threshold,
-                )
-                if match_name:
-                    speaker_map[raw_label] = match_name
-                    console.print(f"  [green]{raw_label} -> {match_name} "
-                                  f"(score: {score:.2f}, {total_time:.0f}s)[/green]")
-            except Exception as e:
-                console.print(f"  [yellow]Could not match {raw_label}: {e}[/yellow]")
-
-        # If no voice profiles matched, use speaking time heuristic
-        # The host almost always has the most speaking time
-        if not speaker_map:
-            ranked = sorted(
-                [(s, sum(t.duration for t in raw_turns if t.speaker == s))
-                 for s in raw_speakers],
-                key=lambda x: x[1],
-                reverse=True,
-            )
-            if ranked:
-                speaker_map[ranked[0][0]] = f"Host: {voice_profiles.metadata.get('host', {}).get('name', 'Unknown')}"
-                console.print(f"  [yellow]Assumed {ranked[0][0]} is host "
-                              f"(most speaking time: {ranked[0][1]:.0f}s)[/yellow]")
-
-    # If no voice profiles at all, label by speaking time
-    if not speaker_map:
-        ranked = sorted(
-            [(s, sum(t.duration for t in raw_turns if t.speaker == s))
-             for s in raw_speakers],
-            key=lambda x: x[1],
-            reverse=True,
-        )
-        for i, (speaker, time) in enumerate(ranked):
-            if i == 0:
-                speaker_map[speaker] = "Host (assumed)"
-            else:
-                speaker_map[speaker] = f"Speaker {i}"
-
-    # Apply friendly names
+    # Merge consecutive same-speaker turns
+    merged: list[SpeakerTurn] = []
    for turn in raw_turns:
-        if turn.speaker in speaker_map:
-            turn.speaker = speaker_map[turn.speaker]
+        if merged and merged[-1].speaker == turn.speaker:
+            merged[-1].end = turn.end
+        else:
+            merged.append(SpeakerTurn(
+                speaker=turn.speaker,
+                start=turn.start,
+                end=turn.end,
+                confidence=turn.confidence,
+            ))

-    console.print(f"[green]Diarization complete: {len(raw_turns)} turns, "
-                  f"{len(raw_speakers)} speakers[/green]")
+    unique_speakers = set(t.speaker for t in merged)
+    speaker_map = {s: s for s in unique_speakers}
+
+    host_time = sum(t.duration for t in merged if t.speaker == "HOST")
+    caller_time = sum(t.duration for t in merged if t.speaker == "CALLER")
+    console.print(f"[green]Diarization complete:[/green] {len(merged)} turns | "
+                  f"HOST {host_time:.0f}s / CALLER {caller_time:.0f}s")

    return DiarizationResult(
-        turns=raw_turns,
-        num_speakers=len(raw_speakers),
+        turns=merged,
+        num_speakers=len(unique_speakers),
        speaker_map=speaker_map,
    )