radio: diarization pipeline fixes, benchmark setup, test episode set

- Fix voice_profiler threshold bug (host label overwrote "Unknown" unconditionally, even when the best score was below threshold)
- Audio preload optimization: single ffmpeg call per episode, 149.5x realtime on 5070 Ti
- WavLM threshold raised to 0.85 (Mike 0.90-0.99, callers 0.46-0.83)
- Promo/bumper filter: weighted signature scoring, 42->27 clean Q&A pairs
- Text-only Q&A fallback for episodes with no CALLER diarization labels
- TRANSFORMERS_OFFLINE=1 to skip HuggingFace freshness checks
- Add diarize_2018.py for targeted re-run + FTS5 rebuild
- Add benchmark.py + BENCH_SETUP.md for GURU-BEAST-ROG (RTX 4090) comparison
- Commit 9-episode training diarization.json outputs
- Session log: 2026-04-27-diarization-pipeline.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 13:20:10 -07:00
parent 206cd2f929
commit 79abef9dc9
21 changed files with 4720 additions and 202 deletions
--- a/projects/radio-show/audio-processor/src/voice_profiler.py
+++ b/projects/radio-show/audio-processor/src/voice_profiler.py
@@ -13,7 +13,6 @@ import numpy as np
 import torch
 import soundfile as sf
 from rich.console import Console
-from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeElapsedColumn
 from rich.table import Table

 console = Console()
@@ -159,36 +158,43 @@ class VoiceProfiler:

    def extract_embedding(self, audio_path: Path, start: float = 0.0,
                          end: float | None = None) -> np.ndarray:
-        """Extract a speaker embedding from an audio segment."""
-        model = self._get_model()
+        """Extract a speaker embedding from an audio segment (file-based, any format)."""
+        self._get_model()
+        waveform, _ = self._load_audio_segment(audio_path, start, end)
+        return self._embed_audio_np(waveform.squeeze(0).numpy())

-        # Load audio segment (already at SAMPLE_RATE via ffmpeg)
-        waveform, sr = self._load_audio_segment(audio_path, start, end)
-
-        # waveform is [1, samples] tensor, need just the numpy array for the extractor
-        audio_np = waveform.squeeze(0).numpy()
-
-        # Extract features
+    def _embed_audio_np(self, audio_np: np.ndarray) -> np.ndarray:
+        """Embed a float32 mono numpy array (already at SAMPLE_RATE). Returns L2-normalized embedding."""
+        self._get_model()
        inputs = self._extractor(
            audio_np, sampling_rate=SAMPLE_RATE,
            return_tensors="pt", padding=True,
        )
-
-        # Get embedding
        with torch.no_grad():
-            outputs = model(**{k: v.to(self.device) for k, v in inputs.items()})
-
+            outputs = self._model(**{k: v.to(self.device) for k, v in inputs.items()})
        embedding = outputs.embeddings.squeeze().cpu().numpy()
-        # L2 normalize
        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding = embedding / norm
-
        return embedding

+    def _load_full_audio(self, audio_path: Path) -> np.ndarray:
+        """Decode entire audio file to float32 mono at SAMPLE_RATE via a single ffmpeg call."""
+        cmd = [
+            "ffmpeg", "-i", str(audio_path),
+            "-f", "wav", "-ac", "1", "-ar", str(SAMPLE_RATE),
+            "-acodec", "pcm_s16le", "pipe:1",
+        ]
+        result = subprocess.run(cmd, capture_output=True, timeout=600)
+        if result.returncode != 0:
+            raise RuntimeError(f"ffmpeg failed: {result.stderr.decode()[:200]}")
+        import io
+        data, _ = sf.read(io.BytesIO(result.stdout), dtype="float32")
+        return data  # shape: (samples,)
+
    def _load_audio_segment(self, audio_path: Path, start: float = 0.0,
                            end: float | None = None) -> tuple[torch.Tensor, int]:
-        """Load an audio segment using ffmpeg (handles any format)."""
+        """Load a single audio segment via ffmpeg (used for one-off extraction)."""
        cmd = ["ffmpeg", "-i", str(audio_path)]
        if start > 0:
            cmd.extend(["-ss", str(start)])
@@ -227,68 +233,39 @@ class VoiceProfiler:

        profile = self.profiles[host_name]

-        with Progress(
-            SpinnerColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            BarColumn(),
-            TextColumn("{task.completed}/{task.total}"),
-            TimeElapsedColumn(),
-            console=console,
-        ) as progress:
-            task = progress.add_task("Processing episodes...",
-                                     total=len(episode_paths))
+        for ep_idx, ep_path in enumerate(episode_paths, 1):
+            console.print(f"[dim]  [{ep_idx}/{len(episode_paths)}] {ep_path.name}[/dim]")

-            for ep_path in episode_paths:
-                progress.update(task, description=f"Processing {ep_path.name}...")
+            try:
+                duration = self._get_duration(ep_path)

-                try:
-                    # Get episode duration
-                    duration = self._get_duration(ep_path)
+                windows = []
+                if duration > 90:
+                    windows.append((30.0, 90.0))
+                if duration > 180:
+                    windows.append((120.0, 180.0))
+                mid = duration / 2
+                if mid > 60:
+                    windows.append((mid, min(mid + 60, duration)))
+                late = duration - 180
+                if late > 300:
+                    windows.append((late, late + 60))

-                    # Strategy: extract embeddings from multiple time windows
-                    # Skip first 30s (likely intro jingle), then sample every 2 min
-                    windows = []
+                chunk_duration = 10.0
+                for start, end in windows:
+                    for chunk_start in np.arange(start, end - chunk_duration, chunk_duration):
+                        try:
+                            emb = self.extract_embedding(
+                                ep_path, chunk_start, chunk_start + chunk_duration
+                            )
+                            profile.embeddings.append(emb)
+                        except Exception as e:
+                            console.print(f"    [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]")

-                    # Window 1: After intro (30s-90s) — usually host monologue
-                    if duration > 90:
-                        windows.append((30.0, 90.0))
+                profile.source_episodes.append(ep_path.name)

-                    # Window 2: Early show (2min-3min)
-                    if duration > 180:
-                        windows.append((120.0, 180.0))
-
-                    # Window 3: Mid show
-                    mid = duration / 2
-                    if mid > 60:
-                        windows.append((mid, min(mid + 60, duration)))
-
-                    # Window 4: Late show (but not last 2 min — likely outro)
-                    late = duration - 180
-                    if late > 300:
-                        windows.append((late, late + 60))
-
-                    for start, end in windows:
-                        # Extract 10-second chunks within each window
-                        # and take the embedding of each chunk
-                        chunk_duration = 10.0
-                        for chunk_start in np.arange(start, end - chunk_duration,
-                                                     chunk_duration):
-                            try:
-                                emb = self.extract_embedding(
-                                    ep_path, chunk_start,
-                                    chunk_start + chunk_duration
-                                )
-                                profile.embeddings.append(emb)
-                            except Exception as e:
-                                console.print(f"    [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]")
-                                continue
-
-                    profile.source_episodes.append(ep_path.name)
-
-                except Exception as e:
-                    console.print(f"  [red]Failed: {ep_path.name}: {e}[/red]")
-
-                progress.update(task, advance=1)
+            except Exception as e:
+                console.print(f"  [red]Failed: {ep_path.name}: {e}[/red]")

        # Compute composite
        profile.compute_composite()
@@ -305,58 +282,66 @@ class VoiceProfiler:
                          threshold: float = 0.70) -> list[VoiceSegment]:
        """Identify speakers throughout an audio file using sliding window.

+        Loads the full audio once then slices in memory — avoids spawning
+        hundreds of ffmpeg subprocesses.
        Returns timestamped segments with speaker labels and embeddings.
        """
        console.print(f"[bold]Identifying speakers:[/bold] {audio_path.name}")

        duration = self._get_duration(audio_path)
+        console.print(f"[dim]Loading audio into memory...[/dim]")
+        audio = self._load_full_audio(audio_path)  # float32 mono array
+        self._get_model()  # ensure model is warm before the loop
+
        segments = []
+        window_samples = int(window_s * SAMPLE_RATE)
+        hop_samples = int(hop_s * SAMPLE_RATE)
+        total_samples = len(audio)

-        with Progress(
-            SpinnerColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            BarColumn(),
-            TextColumn("{task.percentage:>3.0f}%"),
-            TimeElapsedColumn(),
-            console=console,
-        ) as progress:
-            task = progress.add_task("Analyzing speakers...",
-                                     total=int(duration))
+        total_windows = int((duration - window_s) / hop_s) + 1
+        report_every = max(1, total_windows // 10)

-            for start in np.arange(0, duration - window_s, hop_s):
-                end = min(start + window_s, duration)
+        for idx, start in enumerate(np.arange(0, duration - window_s, hop_s)):
+            end = min(start + window_s, duration)
+            s = int(start * SAMPLE_RATE)
+            e = min(s + window_samples, total_samples)

-                try:
-                    emb = self.extract_embedding(audio_path, start, end)
+            try:
+                emb = self._embed_audio_np(audio[s:e])

-                    # Match against known profiles
-                    best_match = None
-                    best_score = 0.0
+                best_match = None
+                best_score = 0.0

-                    for name, profile in self.profiles.items():
-                        score = profile.similarity(emb)
-                        if score > best_score:
-                            best_score = score
-                            best_match = name
+                for name, profile in self.profiles.items():
+                    score = profile.similarity(emb)
+                    if score > best_score:
+                        best_score = score
+                        best_match = name

-                    label = best_match if best_score >= threshold else "Unknown"
+                if best_score >= threshold:
                    if best_match and self.profiles[best_match].role == "host":
                        label = f"Host: {best_match}"
+                    else:
+                        label = best_match
+                else:
+                    label = "Unknown"

-                    segments.append(VoiceSegment(
-                        start=start,
-                        end=end,
-                        embedding=emb,
-                        speaker_label=f"{label} ({best_score:.2f})",
-                    ))
+                segments.append(VoiceSegment(
+                    start=start,
+                    end=end,
+                    embedding=emb,
+                    speaker_label=f"{label} ({best_score:.2f})",
+                ))

-                except Exception:
-                    segments.append(VoiceSegment(
-                        start=start, end=end,
-                        speaker_label="[error]",
-                    ))
+            except Exception:
+                segments.append(VoiceSegment(
+                    start=start, end=end,
+                    speaker_label="[error]",
+                ))

-                progress.update(task, completed=int(end))
+            if idx % report_every == 0:
+                pct = int(end / duration * 100)
+                console.print(f"[dim]  {pct}% ({end:.0f}s / {duration:.0f}s)[/dim]")

        # Print summary
        self._print_speaker_summary(segments, duration)