Files
claudetools/projects/radio-show/audio-processor/src/voice_profiler.py
Mike Swanson c760e430c0 radio: bumper detection in diarizer + full archive download script
Adds a transcript-driven bumper filter to the diarization pipeline. When
a transcript segment matches qa_extractor's promo/bumper signatures, the
overlapping audio windows are labeled BUMPER and the WavLM cosine match
is skipped. Prevents music/promo from being matched against speaker
profiles (the failure mode Mike caught in 2018-s10e18 @ 09:20-10:05).

Code changes:
- src/voice_profiler.py: identify_speakers() takes optional skip_ranges
  parameter; windows whose midpoint falls in a skip range get labeled
  "[bumper]" and skip cosine match
- src/diarizer.py: diarize() takes optional transcript_path; pre-computes
  bumper time ranges via qa_extractor._is_promo_or_bumper, passes to
  identify_speakers; adds BUMPER speaker label
- benchmark.py: passes transcript_path to diarize()

Aggregate impact across 9-episode test set:
  Tara attribution: 4880s -> 3680s  (-1200s / -25%)
  Q&A pairs: 17 -> 19 (+2)
    (bumper-flagged segments had been disrupting conversation detection
     in 2017-s9e30 and 2018-s10e18)
  CALLER total: 1320s -> 1190s  (bumpers previously labeled CALLER moved)
  Per-episode bumpers caught: 1-8, total ~165 bumper segments across set

Remaining Tara false positives are real callers acoustically similar to
Tara (Christopher in 2018, Kay in 2012, William and Charles in 2015) and
guest Clay in 2015-s7e19 — those need profile rebuild + Clay profile,
not bumper filtering.

Adds download_full_archive.py — resumable mirror-style downloader that
walks IX server's /home/gurushow/public_html/archive/{year}/ and copies
all MP3s to archive-data/episodes/. Run is in progress (~589 files,
~10-15GB). Used to source clean profile windows for the remaining
co-hosts (Tara rebuild, Clay, Tony, Rob, Randall, producers).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-27 16:17:50 -07:00

416 lines
16 KiB
Python

"""Voice profiler: builds and manages speaker embeddings using speechbrain.
Uses ECAPA-TDNN speaker verification model to generate embeddings.
No HuggingFace gated model access required (unlike pyannote).
"""
import json
import subprocess
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import torch
import soundfile as sf
from rich.console import Console
from rich.table import Table
# Shared rich console used for all status/progress output in this module.
console = Console()
# Target sample rate for the embedding model
# (Hz; passed to ffmpeg's -ar and to the feature extractor's sampling_rate).
SAMPLE_RATE = 16000
# Minimum segment length for a usable embedding (seconds)
# NOTE(review): not referenced anywhere in the visible part of this file.
MIN_SEGMENT_S = 3.0
# Maximum segment length to process at once (seconds)
# NOTE(review): not referenced anywhere in the visible part of this file.
MAX_SEGMENT_S = 30.0
@dataclass
class VoiceSegment:
"""A segment of audio attributed to a single speaker."""
start: float
end: float
embedding: np.ndarray | None = None
speaker_label: str = ""
@property
def duration(self) -> float:
return self.end - self.start
@dataclass
class SpeakerProfile:
"""A speaker's voice profile built from multiple embeddings."""
name: str
role: str # "host", "cohost", "guest", "caller"
embeddings: list[np.ndarray]
source_episodes: list[str]
composite_embedding: np.ndarray | None = None
@property
def num_samples(self) -> int:
return len(self.embeddings)
def compute_composite(self):
"""Average all embeddings into a single composite."""
if self.embeddings:
self.composite_embedding = np.mean(self.embeddings, axis=0)
# L2 normalize
norm = np.linalg.norm(self.composite_embedding)
if norm > 0:
self.composite_embedding /= norm
def similarity(self, embedding: np.ndarray) -> float:
"""Cosine similarity between an embedding and this profile's composite."""
if self.composite_embedding is None:
self.compute_composite()
return float(np.dot(self.composite_embedding, embedding) / (
np.linalg.norm(self.composite_embedding) * np.linalg.norm(embedding) + 1e-8
))
class VoiceProfiler:
"""Builds speaker voice profiles from audio using speechbrain ECAPA-TDNN."""
def __init__(self, profiles_dir: str | Path, device: str = "cuda"):
self.profiles_dir = Path(profiles_dir)
self.profiles_dir.mkdir(parents=True, exist_ok=True)
self.device = device
self._model = None
self.profiles: dict[str, SpeakerProfile] = {}
self._load_existing_profiles()
def _get_model(self):
"""Lazy-load the embedding model (WavLM x-vector)."""
if self._model is None:
console.print("[dim]Loading speaker embedding model (WavLM-SV)...[/dim]")
from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector
self._extractor = Wav2Vec2FeatureExtractor.from_pretrained(
"microsoft/wavlm-base-sv"
)
self._model = WavLMForXVector.from_pretrained(
"microsoft/wavlm-base-sv"
).to(self.device)
self._model.eval()
console.print("[dim]Speaker embedding model loaded[/dim]")
return self._model
def _load_existing_profiles(self):
"""Load saved profiles from disk."""
profile_file = self.profiles_dir / "profiles.json"
if not profile_file.exists():
return
with open(profile_file) as f:
data = json.load(f)
for name, pdata in data.items():
embeddings = []
emb_dir = self.profiles_dir / name.lower().replace(" ", "-")
for emb_file in sorted(emb_dir.glob("embedding_*.npy")):
embeddings.append(np.load(emb_file))
composite = None
composite_file = emb_dir / "composite.npy"
if composite_file.exists():
composite = np.load(composite_file)
self.profiles[name] = SpeakerProfile(
name=name,
role=pdata.get("role", "unknown"),
embeddings=embeddings,
source_episodes=pdata.get("source_episodes", []),
composite_embedding=composite,
)
if self.profiles:
console.print(f"[dim]Loaded {len(self.profiles)} voice profiles[/dim]")
def save_profiles(self):
"""Save all profiles to disk."""
metadata = {}
for name, profile in self.profiles.items():
slug = name.lower().replace(" ", "-")
emb_dir = self.profiles_dir / slug
emb_dir.mkdir(parents=True, exist_ok=True)
# Save individual embeddings
for i, emb in enumerate(profile.embeddings):
np.save(emb_dir / f"embedding_{i:04d}.npy", emb)
# Save composite
profile.compute_composite()
if profile.composite_embedding is not None:
np.save(emb_dir / "composite.npy", profile.composite_embedding)
metadata[name] = {
"role": profile.role,
"num_samples": profile.num_samples,
"source_episodes": profile.source_episodes,
}
with open(self.profiles_dir / "profiles.json", "w") as f:
json.dump(metadata, f, indent=2)
console.print(f"[green]Saved {len(self.profiles)} voice profiles[/green]")
def extract_embedding(self, audio_path: Path, start: float = 0.0,
end: float | None = None) -> np.ndarray:
"""Extract a speaker embedding from an audio segment (file-based, any format)."""
self._get_model()
waveform, _ = self._load_audio_segment(audio_path, start, end)
return self._embed_audio_np(waveform.squeeze(0).numpy())
def _embed_audio_np(self, audio_np: np.ndarray) -> np.ndarray:
"""Embed a float32 mono numpy array (already at SAMPLE_RATE). Returns L2-normalized embedding."""
self._get_model()
inputs = self._extractor(
audio_np, sampling_rate=SAMPLE_RATE,
return_tensors="pt", padding=True,
)
with torch.no_grad():
outputs = self._model(**{k: v.to(self.device) for k, v in inputs.items()})
embedding = outputs.embeddings.squeeze().cpu().numpy()
norm = np.linalg.norm(embedding)
if norm > 0:
embedding = embedding / norm
return embedding
def _load_full_audio(self, audio_path: Path) -> np.ndarray:
"""Decode entire audio file to float32 mono at SAMPLE_RATE via a single ffmpeg call."""
cmd = [
"ffmpeg", "-i", str(audio_path),
"-f", "wav", "-ac", "1", "-ar", str(SAMPLE_RATE),
"-acodec", "pcm_s16le", "pipe:1",
]
result = subprocess.run(cmd, capture_output=True, timeout=600)
if result.returncode != 0:
raise RuntimeError(f"ffmpeg failed: {result.stderr.decode()[:200]}")
import io
data, _ = sf.read(io.BytesIO(result.stdout), dtype="float32")
return data # shape: (samples,)
def _load_audio_segment(self, audio_path: Path, start: float = 0.0,
end: float | None = None) -> tuple[torch.Tensor, int]:
"""Load a single audio segment via ffmpeg (used for one-off extraction)."""
cmd = ["ffmpeg", "-i", str(audio_path)]
if start > 0:
cmd.extend(["-ss", str(start)])
if end is not None:
cmd.extend(["-t", str(end - start)])
cmd.extend(["-f", "wav", "-ac", "1", "-ar", str(SAMPLE_RATE),
"-acodec", "pcm_s16le", "pipe:1"])
result = subprocess.run(cmd, capture_output=True, timeout=60)
if result.returncode != 0:
raise RuntimeError(f"ffmpeg failed: {result.stderr.decode()[:200]}")
import io
data, sr = sf.read(io.BytesIO(result.stdout), dtype="float32")
waveform = torch.from_numpy(data).unsqueeze(0) # [1, samples]
return waveform, sr
def bootstrap_host_from_episodes(self, episode_paths: list[Path],
host_name: str = "Mike Swanson"):
"""Build host voice profile by extracting the dominant speaker from episodes.
Strategy: In each episode, the host speaks the most. We extract embeddings
from the first 2-5 minutes (usually the intro/monologue) where the host
is most likely speaking solo.
"""
console.print(f"[bold]Bootstrapping voice profile for {host_name}[/bold]")
console.print(f"[dim]Processing {len(episode_paths)} episodes[/dim]")
if host_name not in self.profiles:
self.profiles[host_name] = SpeakerProfile(
name=host_name,
role="host",
embeddings=[],
source_episodes=[],
)
profile = self.profiles[host_name]
for ep_idx, ep_path in enumerate(episode_paths, 1):
console.print(f"[dim] [{ep_idx}/{len(episode_paths)}] {ep_path.name}[/dim]")
try:
duration = self._get_duration(ep_path)
windows = []
if duration > 90:
windows.append((30.0, 90.0))
if duration > 180:
windows.append((120.0, 180.0))
mid = duration / 2
if mid > 60:
windows.append((mid, min(mid + 60, duration)))
late = duration - 180
if late > 300:
windows.append((late, late + 60))
chunk_duration = 10.0
for start, end in windows:
for chunk_start in np.arange(start, end - chunk_duration, chunk_duration):
try:
emb = self.extract_embedding(
ep_path, chunk_start, chunk_start + chunk_duration
)
profile.embeddings.append(emb)
except Exception as e:
console.print(f" [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]")
profile.source_episodes.append(ep_path.name)
except Exception as e:
console.print(f" [red]Failed: {ep_path.name}: {e}[/red]")
# Compute composite
profile.compute_composite()
console.print(f"\n[green]Host profile built: {profile.num_samples} embeddings "
f"from {len(profile.source_episodes)} episodes[/green]")
# Save
self.save_profiles()
def identify_speakers(self, audio_path: Path,
window_s: float = 10.0,
hop_s: float = 5.0,
threshold: float = 0.70,
skip_ranges: list[tuple[float, float]] | None = None
) -> list[VoiceSegment]:
"""Identify speakers throughout an audio file using sliding window.
Loads the full audio once then slices in memory — avoids spawning
hundreds of ffmpeg subprocesses.
Returns timestamped segments with speaker labels and embeddings.
skip_ranges: list of (start, end) seconds. Windows whose midpoint
falls inside any of these ranges are labeled "[bumper]" and the
speaker cosine match is skipped — used to suppress music/promo
from being matched against speaker profiles.
"""
console.print(f"[bold]Identifying speakers:[/bold] {audio_path.name}")
duration = self._get_duration(audio_path)
console.print(f"[dim]Loading audio into memory...[/dim]")
audio = self._load_full_audio(audio_path) # float32 mono array
self._get_model() # ensure model is warm before the loop
skip_ranges = skip_ranges or []
segments = []
window_samples = int(window_s * SAMPLE_RATE)
hop_samples = int(hop_s * SAMPLE_RATE)
total_samples = len(audio)
total_windows = int((duration - window_s) / hop_s) + 1
report_every = max(1, total_windows // 10)
for idx, start in enumerate(np.arange(0, duration - window_s, hop_s)):
end = min(start + window_s, duration)
s = int(start * SAMPLE_RATE)
e = min(s + window_samples, total_samples)
mid = (start + end) / 2
in_bumper = any(rs <= mid <= re for rs, re in skip_ranges)
if in_bumper:
segments.append(VoiceSegment(
start=start, end=end,
speaker_label="[bumper] (1.00)",
))
continue
try:
emb = self._embed_audio_np(audio[s:e])
best_match = None
best_score = 0.0
for name, profile in self.profiles.items():
score = profile.similarity(emb)
if score > best_score:
best_score = score
best_match = name
if best_score >= threshold:
role = self.profiles[best_match].role if best_match else "unknown"
if role == "host":
label = f"Host: {best_match}"
elif role == "cohost":
label = f"Cohost: {best_match}"
else:
label = best_match
else:
label = "Unknown"
segments.append(VoiceSegment(
start=start,
end=end,
embedding=emb,
speaker_label=f"{label} ({best_score:.2f})",
))
except Exception:
segments.append(VoiceSegment(
start=start, end=end,
speaker_label="[error]",
))
if idx % report_every == 0:
pct = int(end / duration * 100)
console.print(f"[dim] {pct}% ({end:.0f}s / {duration:.0f}s)[/dim]")
# Print summary
self._print_speaker_summary(segments, duration)
return segments
def _print_speaker_summary(self, segments: list[VoiceSegment], duration: float):
"""Print a summary of who spoke and for how long."""
speaker_times: dict[str, float] = {}
for seg in segments:
label = seg.speaker_label.split(" (")[0] # Strip score
speaker_times[label] = speaker_times.get(label, 0) + seg.duration
table = Table(title="Speaker Summary")
table.add_column("Speaker", style="cyan")
table.add_column("Time", style="magenta")
table.add_column("Percentage", style="green")
for speaker, time in sorted(speaker_times.items(), key=lambda x: -x[1]):
pct = (time / duration) * 100
table.add_row(speaker, f"{time:.0f}s", f"{pct:.1f}%")
console.print(table)
def _get_duration(self, audio_path: Path) -> float:
"""Get audio duration in seconds."""
result = subprocess.run(
["ffprobe", "-v", "quiet", "-show_entries", "format=duration",
"-of", "csv=p=0", str(audio_path)],
capture_output=True, text=True,
)
return float(result.stdout.strip())
def print_profiles(self):
"""Print summary of all loaded profiles."""
table = Table(title="Voice Profiles")
table.add_column("Name", style="cyan")
table.add_column("Role", style="green")
table.add_column("Samples", style="magenta")
table.add_column("Episodes", style="yellow")
for name, profile in self.profiles.items():
table.add_row(
name, profile.role,
str(profile.num_samples),
str(len(profile.source_episodes)),
)
console.print(table)