radio: diarization pipeline fixes, benchmark setup, test episode set

- Fix voice_profiler threshold bug (HOST label overwrote Unknown unconditionally)
- Audio preload optimization: single ffmpeg per episode, 149.5x realtime on 5070 Ti
- WavLM host-match threshold raised from 0.75 to 0.85 (Mike 0.90-0.99, callers 0.46-0.83)
- Promo/bumper filter: weighted signature scoring, 42->27 clean Q&A pairs
- Text-only Q&A fallback for episodes with no CALLER diarization labels
- TRANSFORMERS_OFFLINE=1 to skip HuggingFace freshness checks
- Add diarize_2018.py for targeted re-run + FTS5 rebuild
- Add benchmark.py + BENCH_SETUP.md for GURU-BEAST-ROG (RTX 4090) comparison
- Commit 9-episode training diarization.json outputs
- Session log: 2026-04-27-diarization-pipeline.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-27 13:20:10 -07:00
parent 206cd2f929
commit 79abef9dc9
21 changed files with 4720 additions and 202 deletions

View File

@@ -158,117 +158,86 @@ def diarize(audio_path: str | Path,
voice_profiles: VoiceProfileStore | None = None,
min_speakers: int = 1,
max_speakers: int = 6,
host_match_threshold: float = 0.75) -> DiarizationResult:
"""Run speaker diarization on an audio file."""
from pyannote.audio import Pipeline
host_match_threshold: float = 0.85) -> DiarizationResult:
"""Run speaker diarization using WavLM sliding-window speaker identification.
Uses the built-in VoiceProfiler (WavLM x-vectors) — no HuggingFace token
or gated model required. Identifies HOST vs non-HOST speakers using the
stored voice profile for Mike Swanson.
"""
import torch
from .voice_profiler import VoiceProfiler
audio_path = Path(audio_path)
console.print(f"[bold]Diarizing:[/bold] {audio_path.name}")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cuda" if torch.cuda.is_available() else "cpu"
console.print(f"[dim]Device: {device}[/dim]")
pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.1"
).to(device)
# Locate voice profiles directory from the VoiceProfileStore path
profiles_dir = voice_profiles.profiles_dir if voice_profiles else Path("voice-profiles")
diarization = pipeline(
str(audio_path),
min_speakers=min_speakers,
max_speakers=max_speakers,
profiler = VoiceProfiler(profiles_dir, device=device)
if not profiler.profiles:
console.print("[yellow]No voice profiles found — labeling all as HOST[/yellow]")
# Return a single HOST turn covering the whole episode
from .voice_profiler import VoiceProfiler as VP
duration = profiler._get_duration(audio_path)
return DiarizationResult(
turns=[SpeakerTurn(speaker="HOST", start=0.0, end=duration)],
num_speakers=1,
speaker_map={"HOST": "HOST"},
)
# Sliding-window identification: 10s windows, 5s hop
voice_segs = profiler.identify_speakers(
audio_path, window_s=10.0, hop_s=5.0,
threshold=host_match_threshold,
)
# Extract turns
# Convert VoiceSegment labels to HOST / CALLER
raw_turns = []
for turn, _, speaker in diarization.itertracks(yield_label=True):
for seg in voice_segs:
label = seg.speaker_label.split(" (")[0] # strip confidence score
if label.startswith("Host:") or label.startswith("Host "):
speaker = "HOST"
elif label == "[error]":
speaker = "UNKNOWN"
else:
speaker = "CALLER"
raw_turns.append(SpeakerTurn(
speaker=speaker,
start=turn.start,
end=turn.end,
start=seg.start,
end=seg.end,
confidence=float(seg.speaker_label.split("(")[-1].rstrip(")"))
if "(" in seg.speaker_label else 0.5,
))
# Count unique speakers
raw_speakers = set(t.speaker for t in raw_turns)
console.print(f"[dim]Detected {len(raw_speakers)} speakers[/dim]")
# Match against voice profiles if available
speaker_map = {}
if voice_profiles and voice_profiles.embeddings:
console.print("[dim]Matching speakers against voice profiles...[/dim]")
embedding_model = pipeline.embedding # pyannote's embedding model
# Get embeddings for each detected speaker
from pyannote.audio import Inference
inference = Inference(pipeline.embedding, window="whole")
for raw_label in raw_speakers:
# Get segments for this speaker
speaker_segments = [t for t in raw_turns if t.speaker == raw_label]
total_time = sum(t.duration for t in speaker_segments)
# Use the longest segment for embedding
longest = max(speaker_segments, key=lambda t: t.duration)
try:
# Extract embedding from audio segment
import torchaudio
waveform, sr = torchaudio.load(
str(audio_path),
frame_offset=int(longest.start * sr if 'sr' in dir() else longest.start * 16000),
num_frames=int(longest.duration * sr if 'sr' in dir() else longest.duration * 16000),
)
# This is simplified — proper implementation would use pyannote's
# embedding extraction pipeline
match_name, score = voice_profiles.match_embedding(
np.zeros(256), # placeholder
threshold=host_match_threshold,
)
if match_name:
speaker_map[raw_label] = match_name
console.print(f" [green]{raw_label} -> {match_name} "
f"(score: {score:.2f}, {total_time:.0f}s)[/green]")
except Exception as e:
console.print(f" [yellow]Could not match {raw_label}: {e}[/yellow]")
# If no voice profiles matched, use speaking time heuristic
# The host almost always has the most speaking time
if not speaker_map:
ranked = sorted(
[(s, sum(t.duration for t in raw_turns if t.speaker == s))
for s in raw_speakers],
key=lambda x: x[1],
reverse=True,
)
if ranked:
speaker_map[ranked[0][0]] = f"Host: {voice_profiles.metadata.get('host', {}).get('name', 'Unknown')}"
console.print(f" [yellow]Assumed {ranked[0][0]} is host "
f"(most speaking time: {ranked[0][1]:.0f}s)[/yellow]")
# If no voice profiles at all, label by speaking time
if not speaker_map:
ranked = sorted(
[(s, sum(t.duration for t in raw_turns if t.speaker == s))
for s in raw_speakers],
key=lambda x: x[1],
reverse=True,
)
for i, (speaker, time) in enumerate(ranked):
if i == 0:
speaker_map[speaker] = "Host (assumed)"
else:
speaker_map[speaker] = f"Speaker {i}"
# Apply friendly names
# Merge consecutive same-speaker turns
merged: list[SpeakerTurn] = []
for turn in raw_turns:
if turn.speaker in speaker_map:
turn.speaker = speaker_map[turn.speaker]
if merged and merged[-1].speaker == turn.speaker:
merged[-1].end = turn.end
else:
merged.append(SpeakerTurn(
speaker=turn.speaker,
start=turn.start,
end=turn.end,
confidence=turn.confidence,
))
console.print(f"[green]Diarization complete: {len(raw_turns)} turns, "
f"{len(raw_speakers)} speakers[/green]")
unique_speakers = set(t.speaker for t in merged)
speaker_map = {s: s for s in unique_speakers}
host_time = sum(t.duration for t in merged if t.speaker == "HOST")
caller_time = sum(t.duration for t in merged if t.speaker == "CALLER")
console.print(f"[green]Diarization complete:[/green] {len(merged)} turns | "
f"HOST {host_time:.0f}s / CALLER {caller_time:.0f}s")
return DiarizationResult(
turns=raw_turns,
num_speakers=len(raw_speakers),
turns=merged,
num_speakers=len(unique_speakers),
speaker_map=speaker_map,
)