radio: diarization pipeline fixes, benchmark setup, test episode set

- Fix voice_profiler threshold bug (HOST label overwrote Unknown unconditionally)
- Audio preload optimization: single ffmpeg per episode, 149.5x realtime on 5070 Ti
- WavLM threshold raised to 0.85 (Mike 0.90-0.99, callers 0.46-0.83)
- Promo/bumper filter: weighted signature scoring, 42->27 clean Q&A pairs
- Text-only Q&A fallback for episodes with no CALLER diarization labels
- TRANSFORMERS_OFFLINE=1 to skip HuggingFace freshness checks
- Add diarize_2018.py for targeted re-run + FTS5 rebuild
- Add benchmark.py + BENCH_SETUP.md for GURU-BEAST-ROG (RTX 4090) comparison
- Commit 9-episode training diarization.json outputs
- Session log: 2026-04-27-diarization-pipeline.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-27 13:20:10 -07:00
parent 206cd2f929
commit 79abef9dc9
21 changed files with 4720 additions and 202 deletions

View File

@@ -13,7 +13,6 @@ import numpy as np
import torch
import soundfile as sf
from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeElapsedColumn
from rich.table import Table
console = Console()
@@ -159,36 +158,43 @@ class VoiceProfiler:
def extract_embedding(self, audio_path: Path, start: float = 0.0,
end: float | None = None) -> np.ndarray:
"""Extract a speaker embedding from an audio segment."""
model = self._get_model()
"""Extract a speaker embedding from an audio segment (file-based, any format)."""
self._get_model()
waveform, _ = self._load_audio_segment(audio_path, start, end)
return self._embed_audio_np(waveform.squeeze(0).numpy())
# Load audio segment (already at SAMPLE_RATE via ffmpeg)
waveform, sr = self._load_audio_segment(audio_path, start, end)
# waveform is [1, samples] tensor, need just the numpy array for the extractor
audio_np = waveform.squeeze(0).numpy()
# Extract features
def _embed_audio_np(self, audio_np: np.ndarray) -> np.ndarray:
    """Embed a float32 mono numpy array (already at SAMPLE_RATE).

    Args:
        audio_np: Audio samples; the caller is responsible for having
            resampled them to SAMPLE_RATE.

    Returns:
        The model's embedding as a numpy array, L2-normalized. An all-zero
        embedding is returned as-is rather than divided by zero.
    """
    # Presumably populates self._extractor / self._model on first use; the
    # loader is not visible from here — verify against _get_model.
    self._get_model()
    inputs = self._extractor(
        audio_np, sampling_rate=SAMPLE_RATE,
        return_tensors="pt", padding=True,
    )

    # Inference only: skip autograd bookkeeping and move every extractor
    # output onto the model's device before the forward pass.
    with torch.no_grad():
        outputs = self._model(**{k: v.to(self.device) for k, v in inputs.items()})

    embedding = outputs.embeddings.squeeze().cpu().numpy()

    # Scale to unit length, guarding against a zero vector.
    norm = np.linalg.norm(embedding)
    if norm > 0:
        embedding = embedding / norm
    return embedding
def _load_full_audio(self, audio_path: Path) -> np.ndarray:
    """Decode entire audio file to float32 mono at SAMPLE_RATE via a single ffmpeg call.

    Args:
        audio_path: Audio file in any format ffmpeg can read.

    Returns:
        The decoded samples as read by ``soundfile`` with ``dtype="float32"``
        (mono, so a 1-D array of shape ``(samples,)``).

    Raises:
        RuntimeError: ffmpeg exited with a non-zero status.
        subprocess.TimeoutExpired: decoding took longer than 600 seconds.
    """
    import io

    cmd = [
        "ffmpeg",
        # Keep stderr limited to real errors so the excerpt in the exception
        # below is the failure itself, not the version banner.
        "-hide_banner", "-loglevel", "error",
        # Stop ffmpeg from reading the parent's stdin while it runs.
        "-nostdin",
        "-i", str(audio_path),
        "-f", "wav", "-ac", "1", "-ar", str(SAMPLE_RATE),
        "-acodec", "pcm_s16le", "pipe:1",
    ]
    result = subprocess.run(cmd, capture_output=True, timeout=600)
    if result.returncode != 0:
        # errors="replace": undecodable bytes in stderr must not raise a
        # UnicodeDecodeError that hides the actual ffmpeg failure.
        detail = result.stderr.decode(errors="replace").strip()
        raise RuntimeError(f"ffmpeg failed: {detail[-200:]}")

    # NOTE(review): ffmpeg cannot seek back on a pipe to patch the WAV header
    # sizes; confirm the installed soundfile/libsndfile tolerates that.
    data, _ = sf.read(io.BytesIO(result.stdout), dtype="float32")
    return data  # shape: (samples,)
def _load_audio_segment(self, audio_path: Path, start: float = 0.0,
end: float | None = None) -> tuple[torch.Tensor, int]:
"""Load an audio segment using ffmpeg (handles any format)."""
"""Load a single audio segment via ffmpeg (used for one-off extraction)."""
cmd = ["ffmpeg", "-i", str(audio_path)]
if start > 0:
cmd.extend(["-ss", str(start)])
@@ -227,68 +233,39 @@ class VoiceProfiler:
profile = self.profiles[host_name]
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("{task.completed}/{task.total}"),
TimeElapsedColumn(),
console=console,
) as progress:
task = progress.add_task("Processing episodes...",
total=len(episode_paths))
for ep_idx, ep_path in enumerate(episode_paths, 1):
console.print(f"[dim] [{ep_idx}/{len(episode_paths)}] {ep_path.name}[/dim]")
for ep_path in episode_paths:
progress.update(task, description=f"Processing {ep_path.name}...")
try:
duration = self._get_duration(ep_path)
try:
# Get episode duration
duration = self._get_duration(ep_path)
windows = []
if duration > 90:
windows.append((30.0, 90.0))
if duration > 180:
windows.append((120.0, 180.0))
mid = duration / 2
if mid > 60:
windows.append((mid, min(mid + 60, duration)))
late = duration - 180
if late > 300:
windows.append((late, late + 60))
# Strategy: extract embeddings from multiple time windows
# Skip first 30s (likely intro jingle), then sample every 2 min
windows = []
chunk_duration = 10.0
for start, end in windows:
for chunk_start in np.arange(start, end - chunk_duration, chunk_duration):
try:
emb = self.extract_embedding(
ep_path, chunk_start, chunk_start + chunk_duration
)
profile.embeddings.append(emb)
except Exception as e:
console.print(f" [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]")
# Window 1: After intro (30s-90s) — usually host monologue
if duration > 90:
windows.append((30.0, 90.0))
profile.source_episodes.append(ep_path.name)
# Window 2: Early show (2min-3min)
if duration > 180:
windows.append((120.0, 180.0))
# Window 3: Mid show
mid = duration / 2
if mid > 60:
windows.append((mid, min(mid + 60, duration)))
# Window 4: Late show (but not last 2 min — likely outro)
late = duration - 180
if late > 300:
windows.append((late, late + 60))
for start, end in windows:
# Extract 10-second chunks within each window
# and take the embedding of each chunk
chunk_duration = 10.0
for chunk_start in np.arange(start, end - chunk_duration,
chunk_duration):
try:
emb = self.extract_embedding(
ep_path, chunk_start,
chunk_start + chunk_duration
)
profile.embeddings.append(emb)
except Exception as e:
console.print(f" [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]")
continue
profile.source_episodes.append(ep_path.name)
except Exception as e:
console.print(f" [red]Failed: {ep_path.name}: {e}[/red]")
progress.update(task, advance=1)
except Exception as e:
console.print(f" [red]Failed: {ep_path.name}: {e}[/red]")
# Compute composite
profile.compute_composite()
@@ -305,58 +282,66 @@ class VoiceProfiler:
threshold: float = 0.70) -> list[VoiceSegment]:
"""Identify speakers throughout an audio file using sliding window.
Loads the full audio once then slices in memory — avoids spawning
hundreds of ffmpeg subprocesses.
Returns timestamped segments with speaker labels and embeddings.
"""
console.print(f"[bold]Identifying speakers:[/bold] {audio_path.name}")
duration = self._get_duration(audio_path)
console.print(f"[dim]Loading audio into memory...[/dim]")
audio = self._load_full_audio(audio_path) # float32 mono array
self._get_model() # ensure model is warm before the loop
segments = []
window_samples = int(window_s * SAMPLE_RATE)
hop_samples = int(hop_s * SAMPLE_RATE)
total_samples = len(audio)
with Progress(
SpinnerColumn(),
TextColumn("[progress.description]{task.description}"),
BarColumn(),
TextColumn("{task.percentage:>3.0f}%"),
TimeElapsedColumn(),
console=console,
) as progress:
task = progress.add_task("Analyzing speakers...",
total=int(duration))
total_windows = int((duration - window_s) / hop_s) + 1
report_every = max(1, total_windows // 10)
for start in np.arange(0, duration - window_s, hop_s):
end = min(start + window_s, duration)
for idx, start in enumerate(np.arange(0, duration - window_s, hop_s)):
end = min(start + window_s, duration)
s = int(start * SAMPLE_RATE)
e = min(s + window_samples, total_samples)
try:
emb = self.extract_embedding(audio_path, start, end)
try:
emb = self._embed_audio_np(audio[s:e])
# Match against known profiles
best_match = None
best_score = 0.0
best_match = None
best_score = 0.0
for name, profile in self.profiles.items():
score = profile.similarity(emb)
if score > best_score:
best_score = score
best_match = name
for name, profile in self.profiles.items():
score = profile.similarity(emb)
if score > best_score:
best_score = score
best_match = name
label = best_match if best_score >= threshold else "Unknown"
if best_score >= threshold:
if best_match and self.profiles[best_match].role == "host":
label = f"Host: {best_match}"
else:
label = best_match
else:
label = "Unknown"
segments.append(VoiceSegment(
start=start,
end=end,
embedding=emb,
speaker_label=f"{label} ({best_score:.2f})",
))
segments.append(VoiceSegment(
start=start,
end=end,
embedding=emb,
speaker_label=f"{label} ({best_score:.2f})",
))
except Exception:
segments.append(VoiceSegment(
start=start, end=end,
speaker_label="[error]",
))
except Exception:
segments.append(VoiceSegment(
start=start, end=end,
speaker_label="[error]",
))
progress.update(task, completed=int(end))
if idx % report_every == 0:
pct = int(end / duration * 100)
console.print(f"[dim] {pct}% ({end:.0f}s / {duration:.0f}s)[/dim]")
# Print summary
self._print_speaker_summary(segments, duration)