radio: diarization pipeline fixes, benchmark setup, test episode set
- Fix voice_profiler threshold bug (HOST label overwrote Unknown unconditionally) - Audio preload optimization: single ffmpeg per episode, 149.5x realtime on 5070 Ti - WavLM threshold raised to 0.85 (Mike 0.90-0.99, callers 0.46-0.83) - Promo/bumper filter: weighted signature scoring, 42->27 clean Q&A pairs - Text-only Q&A fallback for episodes with no CALLER diarization labels - TRANSFORMERS_OFFLINE=1 to skip HuggingFace freshness checks - Add diarize_2018.py for targeted re-run + FTS5 rebuild - Add benchmark.py + BENCH_SETUP.md for GURU-BEAST-ROG (RTX 4090) comparison - Commit 9-episode training diarization.json outputs - Session log: 2026-04-27-diarization-pipeline.md Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
105
projects/radio-show/audio-processor/src/clip_extractor.py
Normal file
105
projects/radio-show/audio-processor/src/clip_extractor.py
Normal file
@@ -0,0 +1,105 @@
|
||||
"""
|
||||
Audio clip extraction using ffmpeg.
|
||||
Cuts clips from original broadcast MP3s for use in Audition/Audacity.
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from rich.console import Console
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
def extract_clip(
|
||||
source_path: Path,
|
||||
start: float,
|
||||
end: float,
|
||||
output_path: Path,
|
||||
padding: float = 1.5,
|
||||
fade_ms: int = 200,
|
||||
) -> Path:
|
||||
"""
|
||||
Extract a clip from source_path between start and end seconds.
|
||||
Adds padding on both sides and applies fade in/out.
|
||||
Returns the output path.
|
||||
"""
|
||||
source_path = Path(source_path)
|
||||
output_path = Path(output_path)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
clip_start = max(0.0, start - padding)
|
||||
clip_end = end + padding
|
||||
duration = clip_end - clip_start
|
||||
|
||||
fade_s = fade_ms / 1000.0
|
||||
|
||||
cmd = [
|
||||
"ffmpeg", "-y",
|
||||
"-ss", f"{clip_start:.3f}",
|
||||
"-i", str(source_path),
|
||||
"-t", f"{duration:.3f}",
|
||||
"-af", f"afade=t=in:st=0:d={fade_s},afade=t=out:st={duration - fade_s:.3f}:d={fade_s}",
|
||||
"-q:a", "2",
|
||||
str(output_path),
|
||||
]
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"ffmpeg failed: {result.stderr[-500:]}")
|
||||
|
||||
return output_path
|
||||
|
||||
|
||||
def extract_clips_for_results(results, output_dir: Path, padding: float = 1.5) -> dict[int, Path]:
|
||||
"""
|
||||
Extract clips for a list of QAResult or SearchResult objects.
|
||||
Returns {index: clip_path}.
|
||||
"""
|
||||
output_dir = Path(output_dir)
|
||||
clip_paths = {}
|
||||
|
||||
for i, result in enumerate(results):
|
||||
episode = result.episode_id
|
||||
audio_path = Path(result.audio_path)
|
||||
|
||||
if not audio_path.exists():
|
||||
console.print(f"[yellow]Audio not found: {audio_path}[/yellow]")
|
||||
continue
|
||||
|
||||
# Determine time range
|
||||
if hasattr(result, "question_start"):
|
||||
# QAResult
|
||||
start = result.question_start
|
||||
end = result.answer_end
|
||||
else:
|
||||
# SearchResult
|
||||
start = result.start
|
||||
end = result.end
|
||||
|
||||
def fmt(s):
|
||||
m, sec = divmod(int(s), 60)
|
||||
h, m = divmod(m, 60)
|
||||
return f"{h}h{m:02d}m{sec:02d}s" if h else f"{m}m{sec:02d}s"
|
||||
|
||||
clip_name = f"{episode}_{fmt(start)}.mp3"
|
||||
clip_path = output_dir / clip_name
|
||||
|
||||
try:
|
||||
extract_clip(audio_path, start, end, clip_path, padding=padding)
|
||||
clip_paths[i] = clip_path
|
||||
console.print(f"[green]Clip {i+1}:[/green] {clip_name}")
|
||||
except Exception as e:
|
||||
console.print(f"[red]Clip {i+1} failed:[/red] {e}")
|
||||
|
||||
return clip_paths
|
||||
|
||||
|
||||
def format_timestamp(seconds: float) -> str:
|
||||
"""Format seconds as H:MM:SS or M:SS."""
|
||||
h = int(seconds // 3600)
|
||||
m = int((seconds % 3600) // 60)
|
||||
s = int(seconds % 60)
|
||||
if h:
|
||||
return f"{h}:{m:02d}:{s:02d}"
|
||||
return f"{m}:{s:02d}"
|
||||
@@ -158,117 +158,86 @@ def diarize(audio_path: str | Path,
|
||||
voice_profiles: VoiceProfileStore | None = None,
|
||||
min_speakers: int = 1,
|
||||
max_speakers: int = 6,
|
||||
host_match_threshold: float = 0.75) -> DiarizationResult:
|
||||
"""Run speaker diarization on an audio file."""
|
||||
from pyannote.audio import Pipeline
|
||||
host_match_threshold: float = 0.85) -> DiarizationResult:
|
||||
"""Run speaker diarization using WavLM sliding-window speaker identification.
|
||||
|
||||
Uses the built-in VoiceProfiler (WavLM x-vectors) — no HuggingFace token
|
||||
or gated model required. Identifies HOST vs non-HOST speakers using the
|
||||
stored voice profile for Mike Swanson.
|
||||
"""
|
||||
import torch
|
||||
from .voice_profiler import VoiceProfiler
|
||||
|
||||
audio_path = Path(audio_path)
|
||||
console.print(f"[bold]Diarizing:[/bold] {audio_path.name}")
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
console.print(f"[dim]Device: {device}[/dim]")
|
||||
|
||||
pipeline = Pipeline.from_pretrained(
|
||||
"pyannote/speaker-diarization-3.1"
|
||||
).to(device)
|
||||
# Locate voice profiles directory from the VoiceProfileStore path
|
||||
profiles_dir = voice_profiles.profiles_dir if voice_profiles else Path("voice-profiles")
|
||||
|
||||
diarization = pipeline(
|
||||
str(audio_path),
|
||||
min_speakers=min_speakers,
|
||||
max_speakers=max_speakers,
|
||||
profiler = VoiceProfiler(profiles_dir, device=device)
|
||||
|
||||
if not profiler.profiles:
|
||||
console.print("[yellow]No voice profiles found — labeling all as HOST[/yellow]")
|
||||
# Return a single HOST turn covering the whole episode
|
||||
from .voice_profiler import VoiceProfiler as VP
|
||||
duration = profiler._get_duration(audio_path)
|
||||
return DiarizationResult(
|
||||
turns=[SpeakerTurn(speaker="HOST", start=0.0, end=duration)],
|
||||
num_speakers=1,
|
||||
speaker_map={"HOST": "HOST"},
|
||||
)
|
||||
|
||||
# Sliding-window identification: 10s windows, 5s hop
|
||||
voice_segs = profiler.identify_speakers(
|
||||
audio_path, window_s=10.0, hop_s=5.0,
|
||||
threshold=host_match_threshold,
|
||||
)
|
||||
|
||||
# Extract turns
|
||||
# Convert VoiceSegment labels to HOST / CALLER
|
||||
raw_turns = []
|
||||
for turn, _, speaker in diarization.itertracks(yield_label=True):
|
||||
for seg in voice_segs:
|
||||
label = seg.speaker_label.split(" (")[0] # strip confidence score
|
||||
if label.startswith("Host:") or label.startswith("Host "):
|
||||
speaker = "HOST"
|
||||
elif label == "[error]":
|
||||
speaker = "UNKNOWN"
|
||||
else:
|
||||
speaker = "CALLER"
|
||||
|
||||
raw_turns.append(SpeakerTurn(
|
||||
speaker=speaker,
|
||||
start=turn.start,
|
||||
end=turn.end,
|
||||
start=seg.start,
|
||||
end=seg.end,
|
||||
confidence=float(seg.speaker_label.split("(")[-1].rstrip(")"))
|
||||
if "(" in seg.speaker_label else 0.5,
|
||||
))
|
||||
|
||||
# Count unique speakers
|
||||
raw_speakers = set(t.speaker for t in raw_turns)
|
||||
console.print(f"[dim]Detected {len(raw_speakers)} speakers[/dim]")
|
||||
|
||||
# Match against voice profiles if available
|
||||
speaker_map = {}
|
||||
if voice_profiles and voice_profiles.embeddings:
|
||||
console.print("[dim]Matching speakers against voice profiles...[/dim]")
|
||||
embedding_model = pipeline.embedding # pyannote's embedding model
|
||||
|
||||
# Get embeddings for each detected speaker
|
||||
from pyannote.audio import Inference
|
||||
inference = Inference(pipeline.embedding, window="whole")
|
||||
|
||||
for raw_label in raw_speakers:
|
||||
# Get segments for this speaker
|
||||
speaker_segments = [t for t in raw_turns if t.speaker == raw_label]
|
||||
total_time = sum(t.duration for t in speaker_segments)
|
||||
|
||||
# Use the longest segment for embedding
|
||||
longest = max(speaker_segments, key=lambda t: t.duration)
|
||||
|
||||
try:
|
||||
# Extract embedding from audio segment
|
||||
import torchaudio
|
||||
waveform, sr = torchaudio.load(
|
||||
str(audio_path),
|
||||
frame_offset=int(longest.start * sr if 'sr' in dir() else longest.start * 16000),
|
||||
num_frames=int(longest.duration * sr if 'sr' in dir() else longest.duration * 16000),
|
||||
)
|
||||
# This is simplified — proper implementation would use pyannote's
|
||||
# embedding extraction pipeline
|
||||
match_name, score = voice_profiles.match_embedding(
|
||||
np.zeros(256), # placeholder
|
||||
threshold=host_match_threshold,
|
||||
)
|
||||
if match_name:
|
||||
speaker_map[raw_label] = match_name
|
||||
console.print(f" [green]{raw_label} -> {match_name} "
|
||||
f"(score: {score:.2f}, {total_time:.0f}s)[/green]")
|
||||
except Exception as e:
|
||||
console.print(f" [yellow]Could not match {raw_label}: {e}[/yellow]")
|
||||
|
||||
# If no voice profiles matched, use speaking time heuristic
|
||||
# The host almost always has the most speaking time
|
||||
if not speaker_map:
|
||||
ranked = sorted(
|
||||
[(s, sum(t.duration for t in raw_turns if t.speaker == s))
|
||||
for s in raw_speakers],
|
||||
key=lambda x: x[1],
|
||||
reverse=True,
|
||||
)
|
||||
if ranked:
|
||||
speaker_map[ranked[0][0]] = f"Host: {voice_profiles.metadata.get('host', {}).get('name', 'Unknown')}"
|
||||
console.print(f" [yellow]Assumed {ranked[0][0]} is host "
|
||||
f"(most speaking time: {ranked[0][1]:.0f}s)[/yellow]")
|
||||
|
||||
# If no voice profiles at all, label by speaking time
|
||||
if not speaker_map:
|
||||
ranked = sorted(
|
||||
[(s, sum(t.duration for t in raw_turns if t.speaker == s))
|
||||
for s in raw_speakers],
|
||||
key=lambda x: x[1],
|
||||
reverse=True,
|
||||
)
|
||||
for i, (speaker, time) in enumerate(ranked):
|
||||
if i == 0:
|
||||
speaker_map[speaker] = "Host (assumed)"
|
||||
else:
|
||||
speaker_map[speaker] = f"Speaker {i}"
|
||||
|
||||
# Apply friendly names
|
||||
# Merge consecutive same-speaker turns
|
||||
merged: list[SpeakerTurn] = []
|
||||
for turn in raw_turns:
|
||||
if turn.speaker in speaker_map:
|
||||
turn.speaker = speaker_map[turn.speaker]
|
||||
if merged and merged[-1].speaker == turn.speaker:
|
||||
merged[-1].end = turn.end
|
||||
else:
|
||||
merged.append(SpeakerTurn(
|
||||
speaker=turn.speaker,
|
||||
start=turn.start,
|
||||
end=turn.end,
|
||||
confidence=turn.confidence,
|
||||
))
|
||||
|
||||
console.print(f"[green]Diarization complete: {len(raw_turns)} turns, "
|
||||
f"{len(raw_speakers)} speakers[/green]")
|
||||
unique_speakers = set(t.speaker for t in merged)
|
||||
speaker_map = {s: s for s in unique_speakers}
|
||||
|
||||
host_time = sum(t.duration for t in merged if t.speaker == "HOST")
|
||||
caller_time = sum(t.duration for t in merged if t.speaker == "CALLER")
|
||||
console.print(f"[green]Diarization complete:[/green] {len(merged)} turns | "
|
||||
f"HOST {host_time:.0f}s / CALLER {caller_time:.0f}s")
|
||||
|
||||
return DiarizationResult(
|
||||
turns=raw_turns,
|
||||
num_speakers=len(raw_speakers),
|
||||
turns=merged,
|
||||
num_speakers=len(unique_speakers),
|
||||
speaker_map=speaker_map,
|
||||
)
|
||||
|
||||
247
projects/radio-show/audio-processor/src/indexer.py
Normal file
247
projects/radio-show/audio-processor/src/indexer.py
Normal file
@@ -0,0 +1,247 @@
|
||||
"""
|
||||
Archive transcript index using SQLite FTS5.
|
||||
Stores all transcript segments with speaker labels, searchable by keyword or phrase.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
from rich.console import Console
|
||||
|
||||
console = Console()
|
||||
|
||||
DB_SCHEMA = """
|
||||
CREATE TABLE IF NOT EXISTS episodes (
|
||||
id INTEGER PRIMARY KEY,
|
||||
episode_id TEXT UNIQUE NOT NULL, -- e.g. "2016-s8e42"
|
||||
date TEXT, -- "2016-03-15"
|
||||
audio_path TEXT, -- absolute path to original MP3
|
||||
duration REAL,
|
||||
hr INTEGER -- 1 or 2 (for split episodes)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS segments (
|
||||
id INTEGER PRIMARY KEY,
|
||||
episode_id TEXT NOT NULL,
|
||||
seg_index INTEGER NOT NULL,
|
||||
start REAL NOT NULL,
|
||||
end REAL NOT NULL,
|
||||
speaker TEXT, -- "HOST", "CALLER", "UNKNOWN", "COMMERCIAL"
|
||||
text TEXT NOT NULL,
|
||||
FOREIGN KEY (episode_id) REFERENCES episodes(episode_id)
|
||||
);
|
||||
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS segments_fts USING fts5(
|
||||
text,
|
||||
speaker UNINDEXED,
|
||||
episode_id UNINDEXED,
|
||||
seg_index UNINDEXED,
|
||||
content='segments',
|
||||
content_rowid='id'
|
||||
);
|
||||
|
||||
CREATE TRIGGER IF NOT EXISTS segments_ai AFTER INSERT ON segments BEGIN
|
||||
INSERT INTO segments_fts(rowid, text, speaker, episode_id, seg_index)
|
||||
VALUES (new.id, new.text, new.speaker, new.episode_id, new.seg_index);
|
||||
END;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS qa_pairs (
|
||||
id INTEGER PRIMARY KEY,
|
||||
episode_id TEXT NOT NULL,
|
||||
question_start REAL NOT NULL,
|
||||
question_end REAL NOT NULL,
|
||||
answer_start REAL NOT NULL,
|
||||
answer_end REAL NOT NULL,
|
||||
question_text TEXT NOT NULL,
|
||||
answer_text TEXT NOT NULL,
|
||||
topic TEXT, -- Ollama-tagged topic
|
||||
topic_tags TEXT, -- JSON array of tags
|
||||
FOREIGN KEY (episode_id) REFERENCES episodes(episode_id)
|
||||
);
|
||||
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS qa_fts USING fts5(
|
||||
question_text,
|
||||
answer_text,
|
||||
topic,
|
||||
episode_id UNINDEXED,
|
||||
content='qa_pairs',
|
||||
content_rowid='id'
|
||||
);
|
||||
|
||||
CREATE TRIGGER IF NOT EXISTS qa_ai AFTER INSERT ON qa_pairs BEGIN
|
||||
INSERT INTO qa_fts(rowid, question_text, answer_text, topic, episode_id)
|
||||
VALUES (new.id, new.question_text, new.answer_text, new.topic, new.episode_id);
|
||||
END;
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
|
||||
class SearchResult:
|
||||
episode_id: str
|
||||
date: str
|
||||
start: float
|
||||
end: float
|
||||
speaker: str
|
||||
text: str
|
||||
audio_path: str
|
||||
score: float = 0.0
|
||||
|
||||
def timestamp_str(self) -> str:
|
||||
def fmt(s):
|
||||
m, sec = divmod(int(s), 60)
|
||||
h, m = divmod(m, 60)
|
||||
return f"{h}:{m:02d}:{sec:02d}" if h else f"{m}:{sec:02d}"
|
||||
return f"{fmt(self.start)}–{fmt(self.end)}"
|
||||
|
||||
|
||||
@dataclass
|
||||
class QAResult:
|
||||
episode_id: str
|
||||
date: str
|
||||
question_start: float
|
||||
question_end: float
|
||||
answer_start: float
|
||||
answer_end: float
|
||||
question_text: str
|
||||
answer_text: str
|
||||
topic: str
|
||||
audio_path: str
|
||||
|
||||
def clip_start(self, padding: float = 1.0) -> float:
|
||||
return max(0.0, self.question_start - padding)
|
||||
|
||||
def clip_end(self, padding: float = 1.0) -> float:
|
||||
return self.answer_end + padding
|
||||
|
||||
def timestamp_str(self) -> str:
|
||||
def fmt(s):
|
||||
m, sec = divmod(int(s), 60)
|
||||
h, m = divmod(m, 60)
|
||||
return f"{h}:{m:02d}:{sec:02d}" if h else f"{m}:{sec:02d}"
|
||||
return f"{fmt(self.question_start)}–{fmt(self.answer_end)}"
|
||||
|
||||
def duration(self) -> float:
|
||||
return self.answer_end - self.question_start
|
||||
|
||||
|
||||
class ArchiveIndex:
|
||||
def __init__(self, db_path: Path):
|
||||
self.db_path = Path(db_path)
|
||||
self.db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
self._conn = sqlite3.connect(str(self.db_path))
|
||||
self._conn.row_factory = sqlite3.Row
|
||||
self._conn.executescript(DB_SCHEMA)
|
||||
self._conn.commit()
|
||||
|
||||
def close(self):
|
||||
self._conn.close()
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, *_):
|
||||
self.close()
|
||||
|
||||
# ── Ingestion ──────────────────────────────────────────────────────────
|
||||
|
||||
def add_episode(self, episode_id: str, audio_path: Path,
|
||||
date: str = None, duration: float = None, hr: int = None):
|
||||
self._conn.execute(
|
||||
"INSERT OR IGNORE INTO episodes (episode_id, date, audio_path, duration, hr) "
|
||||
"VALUES (?, ?, ?, ?, ?)",
|
||||
(episode_id, date, str(audio_path), duration, hr)
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
def add_segments(self, episode_id: str, segments: list[dict]):
|
||||
"""Add transcript segments. Each dict: {start, end, text, speaker}."""
|
||||
existing = self._conn.execute(
|
||||
"SELECT COUNT(*) FROM segments WHERE episode_id = ?", (episode_id,)
|
||||
).fetchone()[0]
|
||||
if existing:
|
||||
return # already indexed
|
||||
|
||||
self._conn.executemany(
|
||||
"INSERT INTO segments (episode_id, seg_index, start, end, speaker, text) "
|
||||
"VALUES (?, ?, ?, ?, ?, ?)",
|
||||
[
|
||||
(episode_id, i, s["start"], s["end"],
|
||||
s.get("speaker", "UNKNOWN"), s["text"])
|
||||
for i, s in enumerate(segments)
|
||||
]
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
def add_qa_pair(self, episode_id: str, q_start: float, q_end: float,
|
||||
a_start: float, a_end: float, question: str, answer: str,
|
||||
topic: str = None, tags: list[str] = None):
|
||||
self._conn.execute(
|
||||
"INSERT INTO qa_pairs "
|
||||
"(episode_id, question_start, question_end, answer_start, answer_end, "
|
||||
"question_text, answer_text, topic, topic_tags) "
|
||||
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
|
||||
(episode_id, q_start, q_end, a_start, a_end, question, answer,
|
||||
topic, json.dumps(tags or []))
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
# ── Search ─────────────────────────────────────────────────────────────
|
||||
|
||||
def search(self, query: str, speaker_filter: str = None,
|
||||
limit: int = 20) -> list[SearchResult]:
|
||||
"""Full-text search across all transcript segments."""
|
||||
speaker_clause = ""
|
||||
params = [query, limit]
|
||||
if speaker_filter:
|
||||
speaker_clause = "AND s.speaker = ?"
|
||||
params.insert(1, speaker_filter)
|
||||
|
||||
rows = self._conn.execute(f"""
|
||||
SELECT s.episode_id, e.date, s.start, s.end, s.speaker, s.text,
|
||||
e.audio_path, rank
|
||||
FROM segments_fts f
|
||||
JOIN segments s ON s.id = f.rowid
|
||||
JOIN episodes e ON e.episode_id = s.episode_id
|
||||
WHERE segments_fts MATCH ?
|
||||
{speaker_clause}
|
||||
ORDER BY rank
|
||||
LIMIT ?
|
||||
""", params).fetchall()
|
||||
|
||||
return [SearchResult(
|
||||
episode_id=r["episode_id"], date=r["date"] or r["episode_id"],
|
||||
start=r["start"], end=r["end"], speaker=r["speaker"],
|
||||
text=r["text"], audio_path=r["audio_path"], score=r["rank"]
|
||||
) for r in rows]
|
||||
|
||||
def search_qa(self, query: str, limit: int = 20) -> list[QAResult]:
|
||||
"""Search Q&A pairs — matches against question, answer, and topic."""
|
||||
rows = self._conn.execute("""
|
||||
SELECT q.episode_id, e.date, q.question_start, q.question_end,
|
||||
q.answer_start, q.answer_end, q.question_text, q.answer_text,
|
||||
q.topic, e.audio_path, rank
|
||||
FROM qa_fts f
|
||||
JOIN qa_pairs q ON q.id = f.rowid
|
||||
JOIN episodes e ON e.episode_id = q.episode_id
|
||||
WHERE qa_fts MATCH ?
|
||||
ORDER BY rank
|
||||
LIMIT ?
|
||||
""", [query, limit]).fetchall()
|
||||
|
||||
return [QAResult(
|
||||
episode_id=r["episode_id"], date=r["date"] or r["episode_id"],
|
||||
question_start=r["question_start"], question_end=r["question_end"],
|
||||
answer_start=r["answer_start"], answer_end=r["answer_end"],
|
||||
question_text=r["question_text"], answer_text=r["answer_text"],
|
||||
topic=r["topic"] or "", audio_path=r["audio_path"]
|
||||
) for r in rows]
|
||||
|
||||
def stats(self) -> dict:
|
||||
return {
|
||||
"episodes": self._conn.execute("SELECT COUNT(*) FROM episodes").fetchone()[0],
|
||||
"segments": self._conn.execute("SELECT COUNT(*) FROM segments").fetchone()[0],
|
||||
"qa_pairs": self._conn.execute("SELECT COUNT(*) FROM qa_pairs").fetchone()[0],
|
||||
}
|
||||
372
projects/radio-show/audio-processor/src/qa_extractor.py
Normal file
372
projects/radio-show/audio-processor/src/qa_extractor.py
Normal file
@@ -0,0 +1,372 @@
|
||||
"""
|
||||
Q&A pair extraction from diarized transcripts.
|
||||
|
||||
Identifies exchanges where a CALLER asks a question and the HOST answers.
|
||||
Outputs structured Q&A pairs with timestamps for clip extraction and indexing.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from rich.console import Console
|
||||
|
||||
console = Console()
|
||||
|
||||
# Phrases that signal a caller is asking a question
|
||||
QUESTION_SIGNALS = [
|
||||
r"\?",
|
||||
r"\bhow (do|can|should|would|does)\b",
|
||||
r"\bwhat (is|are|should|can|do|does|about)\b",
|
||||
r"\bwhy (is|are|does|do|would|should)\b",
|
||||
r"\bis (it|there|this|that) (true|safe|possible|good|bad|worth)\b",
|
||||
r"\bshould i\b",
|
||||
r"\bcan you\b",
|
||||
r"\bi (was wondering|wanted to ask|have a question)\b",
|
||||
]
|
||||
|
||||
QUESTION_PATTERN = re.compile("|".join(QUESTION_SIGNALS), re.IGNORECASE)
|
||||
|
||||
# Minimum durations for a meaningful exchange
|
||||
MIN_QUESTION_DURATION = 5.0 # seconds
|
||||
MIN_ANSWER_DURATION = 15.0 # seconds
|
||||
MAX_GAP_BETWEEN_QA = 30.0 # seconds between question end and answer start
|
||||
|
||||
# ── Promo / bumper filter ──────────────────────────────────────────────────
|
||||
# Promos evolve across years but preserve signature phrases.
|
||||
# Weight 2 = highly distinctive (one match sufficient to filter).
|
||||
# Weight 1 = semi-generic (need 2+ to filter).
|
||||
# A question turn with total score >= PROMO_SCORE_THRESHOLD is suppressed.
|
||||
PROMO_SCORE_THRESHOLD = 2
|
||||
|
||||
_PROMO_SIGS: list[tuple[re.Pattern, int]] = [
|
||||
# Highly distinctive — score 2 each
|
||||
(re.compile(r"acquired a life of its own", re.I), 2),
|
||||
(re.compile(r"simply desire a deeper", re.I), 2),
|
||||
(re.compile(r"tame that beast", re.I), 2),
|
||||
(re.compile(r"mike swanson will be back after", re.I), 2),
|
||||
(re.compile(r"heaven forbid.{0,20}virus", re.I | re.DOTALL), 2),
|
||||
(re.compile(r"mike swanson is answering all", re.I), 2),
|
||||
# Semi-distinctive — score 1 each, need two to filter
|
||||
(re.compile(r"\bcomputer running slow\b", re.I), 1),
|
||||
(re.compile(r"\bafter these messages\b", re.I), 1),
|
||||
(re.compile(r"\b790.?2040\b", re.I), 1),
|
||||
(re.compile(r"\bgurushow\.com\b", re.I), 1),
|
||||
(re.compile(r"\bcall in now\b", re.I), 1),
|
||||
(re.compile(r"\bcomputer troubles\?", re.I), 1),
|
||||
(re.compile(r"\bhardware installation\b", re.I), 1),
|
||||
]
|
||||
|
||||
|
||||
def _is_promo_or_bumper(text: str) -> bool:
|
||||
"""Return True if text scores above threshold on show promo/bumper signatures."""
|
||||
score = sum(w for pat, w in _PROMO_SIGS if pat.search(text))
|
||||
return score >= PROMO_SCORE_THRESHOLD
|
||||
|
||||
|
||||
@dataclass
|
||||
class QAPair:
|
||||
question_start: float
|
||||
question_end: float
|
||||
answer_start: float
|
||||
answer_end: float
|
||||
question_text: str
|
||||
answer_text: str
|
||||
topic: Optional[str] = None
|
||||
topic_tags: list[str] = field(default_factory=list)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"question_start": self.question_start,
|
||||
"question_end": self.question_end,
|
||||
"answer_start": self.answer_start,
|
||||
"answer_end": self.answer_end,
|
||||
"question_text": self.question_text,
|
||||
"answer_text": self.answer_text,
|
||||
"topic": self.topic,
|
||||
"topic_tags": self.topic_tags,
|
||||
}
|
||||
|
||||
def clip_start(self, padding: float = 1.5) -> float:
|
||||
return max(0.0, self.question_start - padding)
|
||||
|
||||
def clip_end(self, padding: float = 1.5) -> float:
|
||||
return self.answer_end + padding
|
||||
|
||||
def duration(self) -> float:
|
||||
return self.answer_end - self.question_start
|
||||
|
||||
|
||||
def extract_qa_pairs(diarized_segments: list[dict]) -> list[QAPair]:
|
||||
"""
|
||||
Extract caller Q&A pairs from diarized transcript segments.
|
||||
|
||||
Each segment dict: {start, end, text, speaker}
|
||||
Speaker values: "HOST", "CALLER", "UNKNOWN"
|
||||
"""
|
||||
pairs = []
|
||||
|
||||
# Group consecutive segments by speaker into speaker turns
|
||||
turns = _merge_consecutive_speaker_turns(diarized_segments)
|
||||
|
||||
# Check if diarization produced any non-HOST speakers
|
||||
has_caller_labels = any(t["speaker"] in ("CALLER", "UNKNOWN") for t in turns)
|
||||
|
||||
if not has_caller_labels:
|
||||
# Diarization labels are absent or unreliable — fall back to text-pattern detection
|
||||
return _extract_qa_text_only(turns)
|
||||
|
||||
i = 0
|
||||
while i < len(turns):
|
||||
turn = turns[i]
|
||||
|
||||
# Look for a CALLER turn that looks like a question
|
||||
if turn["speaker"] in ("CALLER", "UNKNOWN") and _looks_like_question(turn["text"]):
|
||||
if _is_promo_or_bumper(turn["text"]):
|
||||
i += 1
|
||||
continue
|
||||
q_duration = turn["end"] - turn["start"]
|
||||
if q_duration < MIN_QUESTION_DURATION:
|
||||
i += 1
|
||||
continue
|
||||
|
||||
# Look ahead for HOST answer turn(s)
|
||||
j = i + 1
|
||||
answer_turns = []
|
||||
while j < len(turns):
|
||||
next_turn = turns[j]
|
||||
gap = next_turn["start"] - turns[j - 1]["end"]
|
||||
|
||||
if gap > MAX_GAP_BETWEEN_QA and not answer_turns:
|
||||
break # too big a gap before any answer
|
||||
|
||||
if next_turn["speaker"] == "HOST":
|
||||
answer_turns.append(next_turn)
|
||||
# Keep collecting consecutive HOST turns
|
||||
j += 1
|
||||
while j < len(turns) and turns[j]["speaker"] == "HOST":
|
||||
answer_turns.append(turns[j])
|
||||
j += 1
|
||||
break
|
||||
elif next_turn["speaker"] in ("CALLER", "UNKNOWN"):
|
||||
# Another caller turn before host answered — skip this question
|
||||
break
|
||||
else:
|
||||
j += 1
|
||||
|
||||
if answer_turns:
|
||||
answer_text = " ".join(t["text"] for t in answer_turns)
|
||||
answer_duration = answer_turns[-1]["end"] - answer_turns[0]["start"]
|
||||
|
||||
if answer_duration >= MIN_ANSWER_DURATION:
|
||||
pairs.append(QAPair(
|
||||
question_start=turn["start"],
|
||||
question_end=turn["end"],
|
||||
answer_start=answer_turns[0]["start"],
|
||||
answer_end=answer_turns[-1]["end"],
|
||||
question_text=turn["text"].strip(),
|
||||
answer_text=answer_text.strip(),
|
||||
))
|
||||
i = j
|
||||
continue
|
||||
|
||||
i += 1
|
||||
|
||||
return pairs
|
||||
|
||||
|
||||
# Maximum duration for a question turn in text-only mode — avoids capturing monologues
|
||||
_MAX_QUESTION_S_TEXT_MODE = 90.0
|
||||
|
||||
# Caller introduction phrases Mike uses before taking a call
|
||||
_CALLER_INTRO = re.compile(
|
||||
r"\b(let'?s go to|going to the phones?|you'?re on the air|on the air|"
|
||||
r"first caller|next caller|caller from|go ahead|what'?s (your question|going on)|"
|
||||
r"welcome to the show|thanks for calling|thank you for calling|"
|
||||
r"our (first|next|last) (caller|call)|taking (a |your )?call)\b",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def _extract_qa_text_only(turns: list[dict]) -> list[QAPair]:
|
||||
"""
|
||||
Q&A extraction when speaker labels are unavailable or all HOST.
|
||||
|
||||
Uses text patterns to identify question anchors. Works well for call-in
|
||||
radio format where callers describe problems and the host answers at length.
|
||||
Captures both genuine caller questions and Mike's own rhetorical Q&A segments.
|
||||
"""
|
||||
pairs = []
|
||||
|
||||
i = 0
|
||||
while i < len(turns):
|
||||
turn = turns[i]
|
||||
q_duration = turn["end"] - turn["start"]
|
||||
|
||||
is_q_candidate = (
|
||||
_looks_like_question(turn["text"])
|
||||
and MIN_QUESTION_DURATION <= q_duration <= _MAX_QUESTION_S_TEXT_MODE
|
||||
)
|
||||
|
||||
# Also treat segments immediately after a caller-intro phrase as candidates
|
||||
if not is_q_candidate and i > 0:
|
||||
prev_text = turns[i - 1]["text"]
|
||||
if _CALLER_INTRO.search(prev_text) and q_duration >= MIN_QUESTION_DURATION:
|
||||
is_q_candidate = True
|
||||
|
||||
if is_q_candidate and _is_promo_or_bumper(turn["text"]):
|
||||
i += 1
|
||||
continue
|
||||
|
||||
if is_q_candidate:
|
||||
# Collect following segments as the answer until we hit another question
|
||||
j = i + 1
|
||||
answer_turns = []
|
||||
|
||||
while j < len(turns):
|
||||
next_turn = turns[j]
|
||||
gap = next_turn["start"] - turns[j - 1]["end"]
|
||||
|
||||
if gap > MAX_GAP_BETWEEN_QA and not answer_turns:
|
||||
break
|
||||
|
||||
# Stop collecting if we hit another short question-pattern turn
|
||||
if (
|
||||
_looks_like_question(next_turn["text"])
|
||||
and (next_turn["end"] - next_turn["start"]) <= _MAX_QUESTION_S_TEXT_MODE
|
||||
and answer_turns
|
||||
):
|
||||
break
|
||||
|
||||
answer_turns.append(next_turn)
|
||||
j += 1
|
||||
|
||||
# Stop once we have a substantial answer block
|
||||
if answer_turns:
|
||||
ans_dur = answer_turns[-1]["end"] - answer_turns[0]["start"]
|
||||
if ans_dur >= MIN_ANSWER_DURATION * 3:
|
||||
break
|
||||
|
||||
if answer_turns:
|
||||
answer_text = " ".join(t["text"] for t in answer_turns)
|
||||
answer_duration = answer_turns[-1]["end"] - answer_turns[0]["start"]
|
||||
|
||||
if answer_duration >= MIN_ANSWER_DURATION:
|
||||
pairs.append(QAPair(
|
||||
question_start=turn["start"],
|
||||
question_end=turn["end"],
|
||||
answer_start=answer_turns[0]["start"],
|
||||
answer_end=answer_turns[-1]["end"],
|
||||
question_text=turn["text"].strip(),
|
||||
answer_text=answer_text.strip(),
|
||||
))
|
||||
i = j
|
||||
continue
|
||||
|
||||
i += 1
|
||||
|
||||
return pairs
|
||||
|
||||
|
||||
def tag_qa_pairs_with_ollama(pairs: list[QAPair], ollama_host: str = "http://localhost:11434",
|
||||
model: str = "qwen3:14b") -> list[QAPair]:
|
||||
"""Use Ollama to tag each Q&A pair with a topic and tags."""
|
||||
try:
|
||||
import ollama
|
||||
client = ollama.Client(host=ollama_host)
|
||||
except ImportError:
|
||||
console.print("[yellow]ollama not installed — skipping topic tagging[/yellow]")
|
||||
return pairs
|
||||
|
||||
for i, pair in enumerate(pairs):
|
||||
console.print(f"[dim]Tagging Q&A {i+1}/{len(pairs)}...[/dim]")
|
||||
try:
|
||||
prompt = (
|
||||
f"A radio show caller asked:\n\"{pair.question_text[:300]}\"\n\n"
|
||||
f"The host answered:\n\"{pair.answer_text[:500]}\"\n\n"
|
||||
"Respond with JSON only, no explanation:\n"
|
||||
'{"topic": "short topic name (3-5 words)", "tags": ["tag1", "tag2", "tag3"]}'
|
||||
)
|
||||
resp = client.chat(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
options={"temperature": 0},
|
||||
)
|
||||
raw = resp["message"]["content"].strip()
|
||||
# Extract JSON from response
|
||||
start = raw.find("{")
|
||||
end = raw.rfind("}") + 1
|
||||
if start >= 0 and end > start:
|
||||
data = json.loads(raw[start:end])
|
||||
pair.topic = data.get("topic", "")
|
||||
pair.topic_tags = data.get("tags", [])
|
||||
except Exception as e:
|
||||
console.print(f"[yellow]Tagging failed for pair {i+1}: {e}[/yellow]")
|
||||
|
||||
return pairs
|
||||
|
||||
|
||||
def load_diarized_transcript(transcript_path: Path,
|
||||
diarization_path: Optional[Path]) -> list[dict]:
|
||||
"""
|
||||
Merge transcript and diarization into speaker-labeled segments.
|
||||
Falls back to HOST-only if no diarization available.
|
||||
"""
|
||||
with open(transcript_path) as f:
|
||||
transcript = json.load(f)
|
||||
|
||||
segments = transcript["segments"]
|
||||
|
||||
if diarization_path is None or not diarization_path.exists():
|
||||
return [
|
||||
{"start": s["start"], "end": s["end"],
|
||||
"text": s["text"], "speaker": "HOST"}
|
||||
for s in segments
|
||||
]
|
||||
|
||||
with open(diarization_path) as f:
|
||||
diarization = json.load(f)
|
||||
|
||||
turns = diarization.get("turns", [])
|
||||
|
||||
def speaker_at(t: float) -> str:
|
||||
"""Find which diarization turn covers time t."""
|
||||
for turn in turns:
|
||||
if turn["start"] <= t <= turn["end"]:
|
||||
return turn["speaker"]
|
||||
return "UNKNOWN"
|
||||
|
||||
return [
|
||||
{"start": s["start"], "end": s["end"],
|
||||
"text": s["text"],
|
||||
"speaker": speaker_at((s["start"] + s["end"]) / 2)}
|
||||
for s in segments
|
||||
]
|
||||
|
||||
|
||||
# ── Helpers ────────────────────────────────────────────────────────────────
|
||||
|
||||
def _looks_like_question(text: str) -> bool:
|
||||
return bool(QUESTION_PATTERN.search(text))
|
||||
|
||||
|
||||
def _merge_consecutive_speaker_turns(segments: list[dict]) -> list[dict]:
|
||||
"""Merge adjacent segments from the same speaker into continuous turns."""
|
||||
if not segments:
|
||||
return []
|
||||
|
||||
turns = []
|
||||
current = dict(segments[0])
|
||||
|
||||
for seg in segments[1:]:
|
||||
if seg["speaker"] == current["speaker"]:
|
||||
current["end"] = seg["end"]
|
||||
current["text"] = current["text"].rstrip() + " " + seg["text"].lstrip()
|
||||
else:
|
||||
turns.append(current)
|
||||
current = dict(seg)
|
||||
|
||||
turns.append(current)
|
||||
return turns
|
||||
206
projects/radio-show/audio-processor/src/show_prep.py
Normal file
206
projects/radio-show/audio-processor/src/show_prep.py
Normal file
@@ -0,0 +1,206 @@
|
||||
"""
|
||||
Show prep generator: search the archive index for past caller topics,
|
||||
extract clips, and generate "then vs now" talking points via Ollama.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich import box
|
||||
|
||||
from .indexer import ArchiveIndex, QAResult, SearchResult
|
||||
from .clip_extractor import extract_clips_for_results, format_timestamp
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
def generate_show_prep(
|
||||
index: ArchiveIndex,
|
||||
topic: str,
|
||||
output_dir: Path,
|
||||
extract_clips: bool = True,
|
||||
ollama_host: str = "http://localhost:11434",
|
||||
ollama_model: str = "qwen3:14b",
|
||||
limit: int = 10,
|
||||
) -> Path:
|
||||
"""
|
||||
Search the archive for past discussions of a topic.
|
||||
Extracts audio clips and generates "then vs now" talking points.
|
||||
Returns path to the generated markdown prep file.
|
||||
"""
|
||||
output_dir = Path(output_dir)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
console.print(Panel.fit(f"[bold]Show Prep:[/bold] {topic}", border_style="blue"))
|
||||
|
||||
# Search Q&A pairs first (caller exchanges)
|
||||
qa_results = index.search_qa(topic, limit=limit)
|
||||
# Also search raw segments (for monologue mentions)
|
||||
segment_results = index.search(topic, limit=limit)
|
||||
|
||||
if not qa_results and not segment_results:
|
||||
console.print(f"[yellow]No results found for: {topic}[/yellow]")
|
||||
return None
|
||||
|
||||
# Display results table
|
||||
_print_results_table(qa_results, segment_results, topic)
|
||||
|
||||
# Extract clips
|
||||
clip_paths = {}
|
||||
if extract_clips and qa_results:
|
||||
clips_dir = output_dir / "clips"
|
||||
console.print(f"\n[dim]Extracting {len(qa_results)} clip(s)...[/dim]")
|
||||
clip_paths = extract_clips_for_results(qa_results, clips_dir)
|
||||
|
||||
# Generate then-vs-now content via Ollama
|
||||
then_now = _generate_then_vs_now(topic, qa_results, segment_results,
|
||||
ollama_host, ollama_model)
|
||||
|
||||
# Write markdown prep file
|
||||
safe_topic = topic.lower().replace(" ", "-").replace("/", "-")[:40]
|
||||
date_str = datetime.now().strftime("%Y-%m-%d")
|
||||
prep_path = output_dir / f"{date_str}-{safe_topic}-prep.md"
|
||||
|
||||
_write_prep_file(prep_path, topic, qa_results, segment_results,
|
||||
clip_paths, then_now)
|
||||
|
||||
console.print(f"\n[bold green]Prep file:[/bold green] {prep_path}")
|
||||
return prep_path
|
||||
|
||||
|
||||
def _print_results_table(qa_results: list[QAResult], segment_results: list[SearchResult],
|
||||
topic: str):
|
||||
if qa_results:
|
||||
table = Table(title=f"Caller Q&A — \"{topic}\"", box=box.SIMPLE, show_lines=True)
|
||||
table.add_column("Date", style="cyan", width=12)
|
||||
table.add_column("Timestamps", style="dim", width=14)
|
||||
table.add_column("Duration", style="dim", width=8)
|
||||
table.add_column("Caller asked", width=35)
|
||||
table.add_column("Topic", style="green", width=20)
|
||||
|
||||
for r in qa_results:
|
||||
dur = r.duration()
|
||||
table.add_row(
|
||||
r.date or r.episode_id,
|
||||
r.timestamp_str(),
|
||||
f"{int(dur//60)}m{int(dur%60):02d}s",
|
||||
r.question_text[:80] + ("…" if len(r.question_text) > 80 else ""),
|
||||
r.topic or "—",
|
||||
)
|
||||
console.print(table)
|
||||
|
||||
if segment_results and not qa_results:
|
||||
console.print(f"\n[dim]No structured Q&A found. Showing {len(segment_results)} "
|
||||
f"transcript mentions:[/dim]")
|
||||
for r in segment_results:
|
||||
console.print(f" [cyan]{r.date}[/cyan] [{r.timestamp_str()}] "
|
||||
f"[dim]{r.speaker}[/dim]: {r.text[:100]}…")
|
||||
|
||||
|
||||
def _generate_then_vs_now(topic: str, qa_results: list, segment_results: list,
|
||||
ollama_host: str, model: str) -> str:
|
||||
try:
|
||||
import ollama
|
||||
client = ollama.Client(host=ollama_host)
|
||||
except ImportError:
|
||||
return "_Ollama not available — install with: pip install ollama_"
|
||||
|
||||
# Build context from past discussions
|
||||
past_context = ""
|
||||
for r in qa_results[:5]:
|
||||
date = r.date or r.episode_id
|
||||
past_context += f"\n[{date}] Caller: {r.question_text[:200]}\n"
|
||||
past_context += f"Host answer: {r.answer_text[:400]}\n"
|
||||
|
||||
if not past_context and segment_results:
|
||||
for r in segment_results[:5]:
|
||||
past_context += f"\n[{r.date}] {r.speaker}: {r.text[:300]}\n"
|
||||
|
||||
if not past_context:
|
||||
return ""
|
||||
|
||||
prompt = f"""You are helping prepare talking points for a technology radio show host.
|
||||
The host discussed "{topic}" in past episodes. Here are excerpts:
|
||||
|
||||
{past_context}
|
||||
|
||||
The host wants to do a new segment revisiting this topic.
|
||||
|
||||
Write talking points in this format:
|
||||
## What I Said Then
|
||||
- [2-3 bullets summarizing the past advice/position]
|
||||
|
||||
## What's Changed Since Then
|
||||
- [2-3 bullets on how the technology/situation has evolved]
|
||||
|
||||
## Why My Answer Is Different Now
|
||||
- [2-3 bullets on the updated recommendation/position]
|
||||
|
||||
## Suggested Opening
|
||||
[1-2 sentences the host can use to open the segment, referencing the old clip]
|
||||
|
||||
Keep it conversational, radio-friendly. Be specific about what actually changed."""
|
||||
|
||||
try:
|
||||
resp = client.chat(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
options={"temperature": 0.3},
|
||||
)
|
||||
return resp["message"]["content"]
|
||||
except Exception as e:
|
||||
return f"_Ollama generation failed: {e}_"
|
||||
|
||||
|
||||
def _write_prep_file(path: Path, topic: str, qa_results: list, segment_results: list,
|
||||
clip_paths: dict, then_now: str):
|
||||
lines = [
|
||||
f"# Show Prep: {topic}",
|
||||
f"",
|
||||
f"_Generated {datetime.now().strftime('%Y-%m-%d %H:%M')}_",
|
||||
f"",
|
||||
]
|
||||
|
||||
if qa_results:
|
||||
lines += [f"## Past Caller Exchanges ({len(qa_results)} found)", ""]
|
||||
for i, r in enumerate(qa_results):
|
||||
clip_info = ""
|
||||
if i in clip_paths:
|
||||
clip_info = f" — `{clip_paths[i].name}`"
|
||||
lines += [
|
||||
f"### {r.date or r.episode_id} — [{r.timestamp_str()}]{clip_info}",
|
||||
f"**Caller:** {r.question_text}",
|
||||
f"",
|
||||
f"**Host:** {r.answer_text[:600]}{'…' if len(r.answer_text) > 600 else ''}",
|
||||
f"",
|
||||
]
|
||||
|
||||
elif segment_results:
|
||||
lines += [f"## Transcript Mentions ({len(segment_results)} found)", ""]
|
||||
for r in segment_results:
|
||||
lines += [
|
||||
f"- **{r.date}** [{r.timestamp_str()}] ({r.speaker}): {r.text[:200]}",
|
||||
]
|
||||
lines.append("")
|
||||
|
||||
if then_now:
|
||||
lines += ["## Then vs Now", "", then_now, ""]
|
||||
|
||||
if clip_paths:
|
||||
lines += [
|
||||
"## Clips",
|
||||
"",
|
||||
f"Extracted to `clips/` — drag into Audition/Audacity:",
|
||||
"",
|
||||
]
|
||||
for i, p in clip_paths.items():
|
||||
if i < len(qa_results):
|
||||
r = qa_results[i]
|
||||
lines.append(f"- `{p.name}` — {r.date} [{r.timestamp_str()}]")
|
||||
lines.append("")
|
||||
|
||||
path.write_text("\n".join(lines), encoding="utf-8")
|
||||
@@ -13,7 +13,6 @@ import numpy as np
|
||||
import torch
|
||||
import soundfile as sf
|
||||
from rich.console import Console
|
||||
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeElapsedColumn
|
||||
from rich.table import Table
|
||||
|
||||
console = Console()
|
||||
@@ -159,36 +158,43 @@ class VoiceProfiler:
|
||||
|
||||
def extract_embedding(self, audio_path: Path, start: float = 0.0,
|
||||
end: float | None = None) -> np.ndarray:
|
||||
"""Extract a speaker embedding from an audio segment."""
|
||||
model = self._get_model()
|
||||
"""Extract a speaker embedding from an audio segment (file-based, any format)."""
|
||||
self._get_model()
|
||||
waveform, _ = self._load_audio_segment(audio_path, start, end)
|
||||
return self._embed_audio_np(waveform.squeeze(0).numpy())
|
||||
|
||||
# Load audio segment (already at SAMPLE_RATE via ffmpeg)
|
||||
waveform, sr = self._load_audio_segment(audio_path, start, end)
|
||||
|
||||
# waveform is [1, samples] tensor, need just the numpy array for the extractor
|
||||
audio_np = waveform.squeeze(0).numpy()
|
||||
|
||||
# Extract features
|
||||
def _embed_audio_np(self, audio_np: np.ndarray) -> np.ndarray:
|
||||
"""Embed a float32 mono numpy array (already at SAMPLE_RATE). Returns L2-normalized embedding."""
|
||||
self._get_model()
|
||||
inputs = self._extractor(
|
||||
audio_np, sampling_rate=SAMPLE_RATE,
|
||||
return_tensors="pt", padding=True,
|
||||
)
|
||||
|
||||
# Get embedding
|
||||
with torch.no_grad():
|
||||
outputs = model(**{k: v.to(self.device) for k, v in inputs.items()})
|
||||
|
||||
outputs = self._model(**{k: v.to(self.device) for k, v in inputs.items()})
|
||||
embedding = outputs.embeddings.squeeze().cpu().numpy()
|
||||
# L2 normalize
|
||||
norm = np.linalg.norm(embedding)
|
||||
if norm > 0:
|
||||
embedding = embedding / norm
|
||||
|
||||
return embedding
|
||||
|
||||
def _load_full_audio(self, audio_path: Path) -> np.ndarray:
|
||||
"""Decode entire audio file to float32 mono at SAMPLE_RATE via a single ffmpeg call."""
|
||||
cmd = [
|
||||
"ffmpeg", "-i", str(audio_path),
|
||||
"-f", "wav", "-ac", "1", "-ar", str(SAMPLE_RATE),
|
||||
"-acodec", "pcm_s16le", "pipe:1",
|
||||
]
|
||||
result = subprocess.run(cmd, capture_output=True, timeout=600)
|
||||
if result.returncode != 0:
|
||||
raise RuntimeError(f"ffmpeg failed: {result.stderr.decode()[:200]}")
|
||||
import io
|
||||
data, _ = sf.read(io.BytesIO(result.stdout), dtype="float32")
|
||||
return data # shape: (samples,)
|
||||
|
||||
def _load_audio_segment(self, audio_path: Path, start: float = 0.0,
|
||||
end: float | None = None) -> tuple[torch.Tensor, int]:
|
||||
"""Load an audio segment using ffmpeg (handles any format)."""
|
||||
"""Load a single audio segment via ffmpeg (used for one-off extraction)."""
|
||||
cmd = ["ffmpeg", "-i", str(audio_path)]
|
||||
if start > 0:
|
||||
cmd.extend(["-ss", str(start)])
|
||||
@@ -227,68 +233,39 @@ class VoiceProfiler:
|
||||
|
||||
profile = self.profiles[host_name]
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TextColumn("{task.completed}/{task.total}"),
|
||||
TimeElapsedColumn(),
|
||||
console=console,
|
||||
) as progress:
|
||||
task = progress.add_task("Processing episodes...",
|
||||
total=len(episode_paths))
|
||||
for ep_idx, ep_path in enumerate(episode_paths, 1):
|
||||
console.print(f"[dim] [{ep_idx}/{len(episode_paths)}] {ep_path.name}[/dim]")
|
||||
|
||||
for ep_path in episode_paths:
|
||||
progress.update(task, description=f"Processing {ep_path.name}...")
|
||||
try:
|
||||
duration = self._get_duration(ep_path)
|
||||
|
||||
try:
|
||||
# Get episode duration
|
||||
duration = self._get_duration(ep_path)
|
||||
windows = []
|
||||
if duration > 90:
|
||||
windows.append((30.0, 90.0))
|
||||
if duration > 180:
|
||||
windows.append((120.0, 180.0))
|
||||
mid = duration / 2
|
||||
if mid > 60:
|
||||
windows.append((mid, min(mid + 60, duration)))
|
||||
late = duration - 180
|
||||
if late > 300:
|
||||
windows.append((late, late + 60))
|
||||
|
||||
# Strategy: extract embeddings from multiple time windows
|
||||
# Skip first 30s (likely intro jingle), then sample every 2 min
|
||||
windows = []
|
||||
chunk_duration = 10.0
|
||||
for start, end in windows:
|
||||
for chunk_start in np.arange(start, end - chunk_duration, chunk_duration):
|
||||
try:
|
||||
emb = self.extract_embedding(
|
||||
ep_path, chunk_start, chunk_start + chunk_duration
|
||||
)
|
||||
profile.embeddings.append(emb)
|
||||
except Exception as e:
|
||||
console.print(f" [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]")
|
||||
|
||||
# Window 1: After intro (30s-90s) — usually host monologue
|
||||
if duration > 90:
|
||||
windows.append((30.0, 90.0))
|
||||
profile.source_episodes.append(ep_path.name)
|
||||
|
||||
# Window 2: Early show (2min-3min)
|
||||
if duration > 180:
|
||||
windows.append((120.0, 180.0))
|
||||
|
||||
# Window 3: Mid show
|
||||
mid = duration / 2
|
||||
if mid > 60:
|
||||
windows.append((mid, min(mid + 60, duration)))
|
||||
|
||||
# Window 4: Late show (but not last 2 min — likely outro)
|
||||
late = duration - 180
|
||||
if late > 300:
|
||||
windows.append((late, late + 60))
|
||||
|
||||
for start, end in windows:
|
||||
# Extract 10-second chunks within each window
|
||||
# and take the embedding of each chunk
|
||||
chunk_duration = 10.0
|
||||
for chunk_start in np.arange(start, end - chunk_duration,
|
||||
chunk_duration):
|
||||
try:
|
||||
emb = self.extract_embedding(
|
||||
ep_path, chunk_start,
|
||||
chunk_start + chunk_duration
|
||||
)
|
||||
profile.embeddings.append(emb)
|
||||
except Exception as e:
|
||||
console.print(f" [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]")
|
||||
continue
|
||||
|
||||
profile.source_episodes.append(ep_path.name)
|
||||
|
||||
except Exception as e:
|
||||
console.print(f" [red]Failed: {ep_path.name}: {e}[/red]")
|
||||
|
||||
progress.update(task, advance=1)
|
||||
except Exception as e:
|
||||
console.print(f" [red]Failed: {ep_path.name}: {e}[/red]")
|
||||
|
||||
# Compute composite
|
||||
profile.compute_composite()
|
||||
@@ -305,58 +282,66 @@ class VoiceProfiler:
|
||||
threshold: float = 0.70) -> list[VoiceSegment]:
|
||||
"""Identify speakers throughout an audio file using sliding window.
|
||||
|
||||
Loads the full audio once then slices in memory — avoids spawning
|
||||
hundreds of ffmpeg subprocesses.
|
||||
Returns timestamped segments with speaker labels and embeddings.
|
||||
"""
|
||||
console.print(f"[bold]Identifying speakers:[/bold] {audio_path.name}")
|
||||
|
||||
duration = self._get_duration(audio_path)
|
||||
console.print(f"[dim]Loading audio into memory...[/dim]")
|
||||
audio = self._load_full_audio(audio_path) # float32 mono array
|
||||
self._get_model() # ensure model is warm before the loop
|
||||
|
||||
segments = []
|
||||
window_samples = int(window_s * SAMPLE_RATE)
|
||||
hop_samples = int(hop_s * SAMPLE_RATE)
|
||||
total_samples = len(audio)
|
||||
|
||||
with Progress(
|
||||
SpinnerColumn(),
|
||||
TextColumn("[progress.description]{task.description}"),
|
||||
BarColumn(),
|
||||
TextColumn("{task.percentage:>3.0f}%"),
|
||||
TimeElapsedColumn(),
|
||||
console=console,
|
||||
) as progress:
|
||||
task = progress.add_task("Analyzing speakers...",
|
||||
total=int(duration))
|
||||
total_windows = int((duration - window_s) / hop_s) + 1
|
||||
report_every = max(1, total_windows // 10)
|
||||
|
||||
for start in np.arange(0, duration - window_s, hop_s):
|
||||
end = min(start + window_s, duration)
|
||||
for idx, start in enumerate(np.arange(0, duration - window_s, hop_s)):
|
||||
end = min(start + window_s, duration)
|
||||
s = int(start * SAMPLE_RATE)
|
||||
e = min(s + window_samples, total_samples)
|
||||
|
||||
try:
|
||||
emb = self.extract_embedding(audio_path, start, end)
|
||||
try:
|
||||
emb = self._embed_audio_np(audio[s:e])
|
||||
|
||||
# Match against known profiles
|
||||
best_match = None
|
||||
best_score = 0.0
|
||||
best_match = None
|
||||
best_score = 0.0
|
||||
|
||||
for name, profile in self.profiles.items():
|
||||
score = profile.similarity(emb)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_match = name
|
||||
for name, profile in self.profiles.items():
|
||||
score = profile.similarity(emb)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best_match = name
|
||||
|
||||
label = best_match if best_score >= threshold else "Unknown"
|
||||
if best_score >= threshold:
|
||||
if best_match and self.profiles[best_match].role == "host":
|
||||
label = f"Host: {best_match}"
|
||||
else:
|
||||
label = best_match
|
||||
else:
|
||||
label = "Unknown"
|
||||
|
||||
segments.append(VoiceSegment(
|
||||
start=start,
|
||||
end=end,
|
||||
embedding=emb,
|
||||
speaker_label=f"{label} ({best_score:.2f})",
|
||||
))
|
||||
segments.append(VoiceSegment(
|
||||
start=start,
|
||||
end=end,
|
||||
embedding=emb,
|
||||
speaker_label=f"{label} ({best_score:.2f})",
|
||||
))
|
||||
|
||||
except Exception:
|
||||
segments.append(VoiceSegment(
|
||||
start=start, end=end,
|
||||
speaker_label="[error]",
|
||||
))
|
||||
except Exception:
|
||||
segments.append(VoiceSegment(
|
||||
start=start, end=end,
|
||||
speaker_label="[error]",
|
||||
))
|
||||
|
||||
progress.update(task, completed=int(end))
|
||||
if idx % report_every == 0:
|
||||
pct = int(end / duration * 100)
|
||||
console.print(f"[dim] {pct}% ({end:.0f}s / {duration:.0f}s)[/dim]")
|
||||
|
||||
# Print summary
|
||||
self._print_speaker_summary(segments, duration)
|
||||
|
||||
Reference in New Issue
Block a user