radio: diarization pipeline fixes, benchmark setup, test episode set
- Fix voice_profiler threshold bug (HOST label overwrote Unknown unconditionally)
- Audio preload optimization: single ffmpeg per episode, 149.5x realtime on 5070 Ti
- WavLM threshold raised to 0.85 (Mike 0.90-0.99, callers 0.46-0.83)
- Promo/bumper filter: weighted signature scoring, 42->27 clean Q&A pairs
- Text-only Q&A fallback for episodes with no CALLER diarization labels
- TRANSFORMERS_OFFLINE=1 to skip HuggingFace freshness checks
- Add diarize_2018.py for targeted re-run + FTS5 rebuild
- Add benchmark.py + BENCH_SETUP.md for GURU-BEAST-ROG (RTX 4090) comparison
- Commit 9-episode training diarization.json outputs
- Session log: 2026-04-27-diarization-pipeline.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -13,7 +13,6 @@ import numpy as np
|
||||
import torch
|
||||
import soundfile as sf
|
||||
from rich.console import Console
|
||||
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeElapsedColumn
|
||||
from rich.table import Table
|
||||
|
||||
console = Console()
|
||||
@@ -159,36 +158,43 @@ class VoiceProfiler:
|
||||
|
||||
def extract_embedding(self, audio_path: Path, start: float = 0.0,
                      end: float | None = None) -> np.ndarray:
    """Extract a speaker embedding from an audio segment (file-based, any format).

    Decodes the requested segment with ``_load_audio_segment`` (one ffmpeg
    call) and embeds it with ``_embed_audio_np``. Intended for one-off
    extraction; code that embeds many windows of the same file should decode
    the file once and call ``_embed_audio_np`` on in-memory slices instead.

    Args:
        audio_path: Audio file to read.
        start: Segment start in seconds.
        end: Segment end in seconds, or ``None`` for no explicit end.

    Returns:
        The L2-normalized embedding produced by ``_embed_audio_np``.
    """
    # Load the model before decoding so a model failure surfaces first.
    self._get_model()

    # The loader returns a [1, samples] tensor already at SAMPLE_RATE; the
    # embedder wants a plain 1-D numpy array.
    waveform, _ = self._load_audio_segment(audio_path, start, end)
    return self._embed_audio_np(waveform.squeeze(0).numpy())
|
||||
def _embed_audio_np(self, audio_np: np.ndarray) -> np.ndarray:
    """Embed a float32 mono numpy array (already at SAMPLE_RATE). Returns L2-normalized embedding.

    Args:
        audio_np: Mono audio samples, float32, sampled at SAMPLE_RATE.

    Returns:
        The model's speaker embedding scaled to unit L2 norm. An all-zero
        embedding is returned unscaled.
    """
    # NOTE(review): presumably populates self._model / self._extractor on
    # first use — defined outside this block, confirm.
    self._get_model()

    inputs = self._extractor(
        audio_np, sampling_rate=SAMPLE_RATE,
        return_tensors="pt", padding=True,
    )

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = self._model(**{k: v.to(self.device) for k, v in inputs.items()})

    embedding = outputs.embeddings.squeeze().cpu().numpy()

    # L2 normalize; guard against dividing by zero on a degenerate embedding.
    norm = np.linalg.norm(embedding)
    if norm > 0:
        embedding = embedding / norm

    return embedding
|
||||
|
||||
def _load_full_audio(self, audio_path: Path) -> np.ndarray:
    """Decode entire audio file to float32 mono at SAMPLE_RATE via a single ffmpeg call.

    ffmpeg writes 16-bit PCM WAV to stdout; the bytes are parsed in memory,
    so no temporary file is created.

    Args:
        audio_path: Audio file to decode (any format ffmpeg can read).

    Returns:
        The decoded samples as a float32 array.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        subprocess.TimeoutExpired: If decoding takes longer than 600 seconds.
    """
    import io

    cmd = [
        # -nostdin: ffmpeg otherwise reads the inherited stdin for
        # interactive commands, which can swallow the caller's input.
        "ffmpeg", "-nostdin", "-i", str(audio_path),
        "-f", "wav", "-ac", "1", "-ar", str(SAMPLE_RATE),
        "-acodec", "pcm_s16le", "pipe:1",
    ]
    result = subprocess.run(cmd, capture_output=True, timeout=600)
    if result.returncode != 0:
        # ffmpeg prints its version banner first and the actual error last,
        # so report the tail. errors="replace" keeps undecodable bytes (e.g.
        # in file names) from raising and hiding the real failure.
        stderr_tail = result.stderr.decode(errors="replace").strip()[-200:]
        raise RuntimeError(f"ffmpeg failed: {stderr_tail}")

    data, _ = sf.read(io.BytesIO(result.stdout), dtype="float32")
    return data  # shape: (samples,)
|
||||
|
||||
def _load_audio_segment(self, audio_path: Path, start: float = 0.0,
|
||||
end: float | None = None) -> tuple[torch.Tensor, int]:
|
||||
"""Load an audio segment using ffmpeg (handles any format)."""
|
||||
"""Load a single audio segment via ffmpeg (used for one-off extraction)."""
|
||||
cmd = ["ffmpeg", "-i", str(audio_path)]
|
||||
if start > 0:
|
||||
cmd.extend(["-ss", str(start)])
|
||||
@@ -227,68 +233,39 @@ class VoiceProfiler:
|
||||
|
||||
profile = self.profiles[host_name]
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TextColumn("{task.completed}/{task.total}"),
|
||||
TimeElapsedColumn(),
|
||||
console=console,
|
||||
) as progress:
|
||||
task = progress.add_task("Processing episodes...",
|
||||
total=len(episode_paths))
|
||||
for ep_idx, ep_path in enumerate(episode_paths, 1):
|
||||
console.print(f"[dim] [{ep_idx}/{len(episode_paths)}] {ep_path.name}[/dim]")
|
||||
|
||||
for ep_path in episode_paths:
|
||||
progress.update(task, description=f"Processing {ep_path.name}...")
|
||||
try:
|
||||
duration = self._get_duration(ep_path)
|
||||
|
||||
try:
|
||||
# Get episode duration
|
||||
duration = self._get_duration(ep_path)
|
||||
windows = []
|
||||
if duration > 90:
|
||||
windows.append((30.0, 90.0))
|
||||
if duration > 180:
|
||||
windows.append((120.0, 180.0))
|
||||
mid = duration / 2
|
||||
if mid > 60:
|
||||
windows.append((mid, min(mid + 60, duration)))
|
||||
late = duration - 180
|
||||
if late > 300:
|
||||
windows.append((late, late + 60))
|
||||
|
||||
# Strategy: extract embeddings from multiple time windows
|
||||
# Skip first 30s (likely intro jingle), then sample every 2 min
|
||||
windows = []
|
||||
chunk_duration = 10.0
|
||||
for start, end in windows:
|
||||
for chunk_start in np.arange(start, end - chunk_duration, chunk_duration):
|
||||
try:
|
||||
emb = self.extract_embedding(
|
||||
ep_path, chunk_start, chunk_start + chunk_duration
|
||||
)
|
||||
profile.embeddings.append(emb)
|
||||
except Exception as e:
|
||||
console.print(f" [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]")
|
||||
|
||||
# Window 1: After intro (30s-90s) — usually host monologue
|
||||
if duration > 90:
|
||||
windows.append((30.0, 90.0))
|
||||
profile.source_episodes.append(ep_path.name)
|
||||
|
||||
# Window 2: Early show (2min-3min)
|
||||
if duration > 180:
|
||||
windows.append((120.0, 180.0))
|
||||
|
||||
# Window 3: Mid show
|
||||
mid = duration / 2
|
||||
if mid > 60:
|
||||
windows.append((mid, min(mid + 60, duration)))
|
||||
|
||||
# Window 4: Late show (but not last 2 min — likely outro)
|
||||
late = duration - 180
|
||||
if late > 300:
|
||||
windows.append((late, late + 60))
|
||||
|
||||
for start, end in windows:
|
||||
# Extract 10-second chunks within each window
|
||||
# and take the embedding of each chunk
|
||||
chunk_duration = 10.0
|
||||
for chunk_start in np.arange(start, end - chunk_duration,
|
||||
chunk_duration):
|
||||
try:
|
||||
emb = self.extract_embedding(
|
||||
ep_path, chunk_start,
|
||||
chunk_start + chunk_duration
|
||||
)
|
||||
profile.embeddings.append(emb)
|
||||
except Exception as e:
|
||||
console.print(f" [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]")
|
||||
continue
|
||||
|
||||
profile.source_episodes.append(ep_path.name)
|
||||
|
||||
except Exception as e:
|
||||
console.print(f" [red]Failed: {ep_path.name}: {e}[/red]")
|
||||
|
||||
progress.update(task, advance=1)
|
||||
except Exception as e:
|
||||
console.print(f" [red]Failed: {ep_path.name}: {e}[/red]")
|
||||
|
||||
# Compute composite
|
||||
profile.compute_composite()
|
||||
@@ -305,58 +282,66 @@ class VoiceProfiler:
|
||||
threshold: float = 0.70) -> list[VoiceSegment]:
|
||||
"""Identify speakers throughout an audio file using sliding window.
|
||||
|
||||
Loads the full audio once then slices in memory — avoids spawning
|
||||
hundreds of ffmpeg subprocesses.
|
||||
Returns timestamped segments with speaker labels and embeddings.
|
||||
"""
|
||||
console.print(f"[bold]Identifying speakers:[/bold] {audio_path.name}")
|
||||
|
||||
duration = self._get_duration(audio_path)
|
||||
console.print(f"[dim]Loading audio into memory...[/dim]")
|
||||
audio = self._load_full_audio(audio_path) # float32 mono array
|
||||
self._get_model() # ensure model is warm before the loop
|
||||
|
||||
segments = []
|
||||
window_samples = int(window_s * SAMPLE_RATE)
|
||||
hop_samples = int(hop_s * SAMPLE_RATE)
|
||||
total_samples = len(audio)
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TextColumn("{task.percentage:>3.0f}%"),
|
||||
TimeElapsedColumn(),
|
||||
console=console,
|
||||
) as progress:
|
||||
task = progress.add_task("Analyzing speakers...",
|
||||
total=int(duration))
|
||||
total_windows = int((duration - window_s) / hop_s) + 1
|
||||
report_every = max(1, total_windows // 10)
|
||||
|
||||
for start in np.arange(0, duration - window_s, hop_s):
|
||||
end = min(start + window_s, duration)
|
||||
for idx, start in enumerate(np.arange(0, duration - window_s, hop_s)):
|
||||
end = min(start + window_s, duration)
|
||||
s = int(start * SAMPLE_RATE)
|
||||
e = min(s + window_samples, total_samples)
|
||||
|
||||
try:
|
||||
emb = self.extract_embedding(audio_path, start, end)
|
||||
try:
|
||||
emb = self._embed_audio_np(audio[s:e])
|
||||
|
||||
# Match against known profiles
|
||||
best_match = None
|
||||
best_score = 0.0
|
||||
best_match = None
|
||||
best_score = 0.0
|
||||
|
||||
for name, profile in self.profiles.items():
|
||||
score = profile.similarity(emb)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_match = name
|
||||
for name, profile in self.profiles.items():
|
||||
score = profile.similarity(emb)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_match = name
|
||||
|
||||
label = best_match if best_score >= threshold else "Unknown"
|
||||
if best_score >= threshold:
|
||||
if best_match and self.profiles[best_match].role == "host":
|
||||
label = f"Host: {best_match}"
|
||||
else:
|
||||
label = best_match
|
||||
else:
|
||||
label = "Unknown"
|
||||
|
||||
segments.append(VoiceSegment(
|
||||
start=start,
|
||||
end=end,
|
||||
embedding=emb,
|
||||
speaker_label=f"{label} ({best_score:.2f})",
|
||||
))
|
||||
segments.append(VoiceSegment(
|
||||
start=start,
|
||||
end=end,
|
||||
embedding=emb,
|
||||
speaker_label=f"{label} ({best_score:.2f})",
|
||||
))
|
||||
|
||||
except Exception:
|
||||
segments.append(VoiceSegment(
|
||||
start=start, end=end,
|
||||
speaker_label="[error]",
|
||||
))
|
||||
except Exception:
|
||||
segments.append(VoiceSegment(
|
||||
start=start, end=end,
|
||||
speaker_label="[error]",
|
||||
))
|
||||
|
||||
progress.update(task, completed=int(end))
|
||||
if idx % report_every == 0:
|
||||
pct = int(end / duration * 100)
|
||||
console.print(f"[dim] {pct}% ({end:.0f}s / {duration:.0f}s)[/dim]")
|
||||
|
||||
# Print summary
|
||||
self._print_speaker_summary(segments, duration)
|
||||
|
||||
Reference in New Issue
Block a user