radio: diarization pipeline fixes, benchmark setup, test episode set
- Fix voice_profiler threshold bug (HOST label overwrote Unknown unconditionally)
- Audio preload optimization: single ffmpeg per episode, 149.5x realtime on 5070 Ti
- WavLM threshold raised to 0.85 (Mike 0.90-0.99, callers 0.46-0.83)
- Promo/bumper filter: weighted signature scoring, 42->27 clean Q&A pairs
- Text-only Q&A fallback for episodes with no CALLER diarization labels
- TRANSFORMERS_OFFLINE=1 to skip HuggingFace freshness checks
- Add diarize_2018.py for targeted re-run + FTS5 rebuild
- Add benchmark.py + BENCH_SETUP.md for GURU-BEAST-ROG (RTX 4090) comparison
- Commit 9-episode training diarization.json outputs
- Session log: 2026-04-27-diarization-pipeline.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -158,117 +158,86 @@ def diarize(audio_path: str | Path,
|
||||
voice_profiles: VoiceProfileStore | None = None,
|
||||
min_speakers: int = 1,
|
||||
max_speakers: int = 6,
|
||||
host_match_threshold: float = 0.75) -> DiarizationResult:
|
||||
"""Run speaker diarization on an audio file."""
|
||||
from pyannote.audio import Pipeline
|
||||
host_match_threshold: float = 0.85) -> DiarizationResult:
|
||||
"""Run speaker diarization using WavLM sliding-window speaker identification.
|
||||
|
||||
Uses the built-in VoiceProfiler (WavLM x-vectors) — no HuggingFace token
|
||||
or gated model required. Identifies HOST vs non-HOST speakers using the
|
||||
stored voice profile for Mike Swanson.
|
||||
"""
|
||||
import torch
|
||||
from .voice_profiler import VoiceProfiler
|
||||
|
||||
audio_path = Path(audio_path)
|
||||
console.print(f"[bold]Diarizing:[/bold] {audio_path.name}")
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
console.print(f"[dim]Device: {device}[/dim]")
|
||||
|
||||
pipeline = Pipeline.from_pretrained(
|
||||
"pyannote/speaker-diarization-3.1"
|
||||
).to(device)
|
||||
# Locate voice profiles directory from the VoiceProfileStore path
|
||||
profiles_dir = voice_profiles.profiles_dir if voice_profiles else Path("voice-profiles")
|
||||
|
||||
diarization = pipeline(
|
||||
str(audio_path),
|
||||
min_speakers=min_speakers,
|
||||
max_speakers=max_speakers,
|
||||
profiler = VoiceProfiler(profiles_dir, device=device)
|
||||
|
||||
if not profiler.profiles:
|
||||
console.print("[yellow]No voice profiles found — labeling all as HOST[/yellow]")
|
||||
# Return a single HOST turn covering the whole episode
|
||||
from .voice_profiler import VoiceProfiler as VP
|
||||
duration = profiler._get_duration(audio_path)
|
||||
return DiarizationResult(
|
||||
turns=[SpeakerTurn(speaker="HOST", start=0.0, end=duration)],
|
||||
num_speakers=1,
|
||||
speaker_map={"HOST": "HOST"},
|
||||
)
|
||||
|
||||
# Sliding-window identification: 10s windows, 5s hop
|
||||
voice_segs = profiler.identify_speakers(
|
||||
audio_path, window_s=10.0, hop_s=5.0,
|
||||
threshold=host_match_threshold,
|
||||
)
|
||||
|
||||
# Extract turns
|
||||
# Convert VoiceSegment labels to HOST / CALLER
|
||||
raw_turns = []
|
||||
for turn, _, speaker in diarization.itertracks(yield_label=True):
|
||||
for seg in voice_segs:
|
||||
label = seg.speaker_label.split(" (")[0] # strip confidence score
|
||||
if label.startswith("Host:") or label.startswith("Host "):
|
||||
speaker = "HOST"
|
||||
elif label == "[error]":
|
||||
speaker = "UNKNOWN"
|
||||
else:
|
||||
speaker = "CALLER"
|
||||
|
||||
raw_turns.append(SpeakerTurn(
|
||||
speaker=speaker,
|
||||
start=turn.start,
|
||||
end=turn.end,
|
||||
start=seg.start,
|
||||
end=seg.end,
|
||||
confidence=float(seg.speaker_label.split("(")[-1].rstrip(")"))
|
||||
if "(" in seg.speaker_label else 0.5,
|
||||
))
|
||||
|
||||
# Count unique speakers
|
||||
raw_speakers = set(t.speaker for t in raw_turns)
|
||||
console.print(f"[dim]Detected {len(raw_speakers)} speakers[/dim]")
|
||||
|
||||
# Match against voice profiles if available
|
||||
speaker_map = {}
|
||||
if voice_profiles and voice_profiles.embeddings:
|
||||
console.print("[dim]Matching speakers against voice profiles...[/dim]")
|
||||
embedding_model = pipeline.embedding # pyannote's embedding model
|
||||
|
||||
# Get embeddings for each detected speaker
|
||||
from pyannote.audio import Inference
|
||||
inference = Inference(pipeline.embedding, window="whole")
|
||||
|
||||
for raw_label in raw_speakers:
|
||||
# Get segments for this speaker
|
||||
speaker_segments = [t for t in raw_turns if t.speaker == raw_label]
|
||||
total_time = sum(t.duration for t in speaker_segments)
|
||||
|
||||
# Use the longest segment for embedding
|
||||
longest = max(speaker_segments, key=lambda t: t.duration)
|
||||
|
||||
try:
|
||||
# Extract embedding from audio segment
|
||||
import torchaudio
|
||||
waveform, sr = torchaudio.load(
|
||||
str(audio_path),
|
||||
frame_offset=int(longest.start * sr if 'sr' in dir() else longest.start * 16000),
|
||||
num_frames=int(longest.duration * sr if 'sr' in dir() else longest.duration * 16000),
|
||||
)
|
||||
# This is simplified — proper implementation would use pyannote's
|
||||
# embedding extraction pipeline
|
||||
match_name, score = voice_profiles.match_embedding(
|
||||
np.zeros(256), # placeholder
|
||||
threshold=host_match_threshold,
|
||||
)
|
||||
if match_name:
|
||||
speaker_map[raw_label] = match_name
|
||||
console.print(f" [green]{raw_label} -> {match_name} "
|
||||
f"(score: {score:.2f}, {total_time:.0f}s)[/green]")
|
||||
except Exception as e:
|
||||
console.print(f" [yellow]Could not match {raw_label}: {e}[/yellow]")
|
||||
|
||||
# If no voice profiles matched, use speaking time heuristic
|
||||
# The host almost always has the most speaking time
|
||||
if not speaker_map:
|
||||
ranked = sorted(
|
||||
[(s, sum(t.duration for t in raw_turns if t.speaker == s))
|
||||
for s in raw_speakers],
|
||||
key=lambda x: x[1],
|
||||
reverse=True,
|
||||
)
|
||||
if ranked:
|
||||
speaker_map[ranked[0][0]] = f"Host: {voice_profiles.metadata.get('host', {}).get('name', 'Unknown')}"
|
||||
console.print(f" [yellow]Assumed {ranked[0][0]} is host "
|
||||
f"(most speaking time: {ranked[0][1]:.0f}s)[/yellow]")
|
||||
|
||||
# If no voice profiles at all, label by speaking time
|
||||
if not speaker_map:
|
||||
ranked = sorted(
|
||||
[(s, sum(t.duration for t in raw_turns if t.speaker == s))
|
||||
for s in raw_speakers],
|
||||
key=lambda x: x[1],
|
||||
reverse=True,
|
||||
)
|
||||
for i, (speaker, time) in enumerate(ranked):
|
||||
if i == 0:
|
||||
speaker_map[speaker] = "Host (assumed)"
|
||||
else:
|
||||
speaker_map[speaker] = f"Speaker {i}"
|
||||
|
||||
# Apply friendly names
|
||||
# Merge consecutive same-speaker turns
|
||||
merged: list[SpeakerTurn] = []
|
||||
for turn in raw_turns:
|
||||
if turn.speaker in speaker_map:
|
||||
turn.speaker = speaker_map[turn.speaker]
|
||||
if merged and merged[-1].speaker == turn.speaker:
|
||||
merged[-1].end = turn.end
|
||||
else:
|
||||
merged.append(SpeakerTurn(
|
||||
speaker=turn.speaker,
|
||||
start=turn.start,
|
||||
end=turn.end,
|
||||
confidence=turn.confidence,
|
||||
))
|
||||
|
||||
console.print(f"[green]Diarization complete: {len(raw_turns)} turns, "
|
||||
f"{len(raw_speakers)} speakers[/green]")
|
||||
unique_speakers = set(t.speaker for t in merged)
|
||||
speaker_map = {s: s for s in unique_speakers}
|
||||
|
||||
host_time = sum(t.duration for t in merged if t.speaker == "HOST")
|
||||
caller_time = sum(t.duration for t in merged if t.speaker == "CALLER")
|
||||
console.print(f"[green]Diarization complete:[/green] {len(merged)} turns | "
|
||||
f"HOST {host_time:.0f}s / CALLER {caller_time:.0f}s")
|
||||
|
||||
return DiarizationResult(
|
||||
turns=raw_turns,
|
||||
num_speakers=len(raw_speakers),
|
||||
turns=merged,
|
||||
num_speakers=len(unique_speakers),
|
||||
speaker_map=speaker_map,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user