radio: diarization pipeline fixes, benchmark setup, test episode set

- Fix voice_profiler threshold bug (HOST label overwrote Unknown unconditionally) - Audio preload optimization: single ffmpeg per episode, 149.5x realtime on 5070 Ti - WavLM threshold raised to 0.85 (Mike 0.90-0.99, callers 0.46-0.83) - Promo/bumper filter: weighted signature scoring, 42->27 clean Q&A pairs - Text-only Q&A fallback for episodes with no CALLER diarization labels - TRANSFORMERS_OFFLINE=1 to skip HuggingFace freshness checks - Add diarize_2018.py for targeted re-run + FTS5 rebuild - Add benchmark.py + BENCH_SETUP.md for GURU-BEAST-ROG (RTX 4090) comparison - Commit 9-episode training diarization.json outputs - Session log: 2026-04-27-diarization-pipeline.md Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 13:20:10 -07:00
parent 206cd2f929
commit 79abef9dc9
21 changed files with 4720 additions and 202 deletions
--- a/projects/radio-show/audio-processor/src/clip_extractor.py
+++ b/projects/radio-show/audio-processor/src/clip_extractor.py
@@ -0,0 +1,105 @@
+"""
+Audio clip extraction using ffmpeg.
+Cuts clips from original broadcast MP3s for use in Audition/Audacity.
+"""
+
+import subprocess
+from pathlib import Path
+
+from rich.console import Console
+
+console = Console()
+
+
+def extract_clip(
+    source_path: Path,
+    start: float,
+    end: float,
+    output_path: Path,
+    padding: float = 1.5,
+    fade_ms: int = 200,
+) -> Path:
+    """
+    Extract a clip from source_path between start and end seconds.
+    Adds padding on both sides and applies fade in/out.
+    Returns the output path.
+    """
+    source_path = Path(source_path)
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    clip_start = max(0.0, start - padding)
+    clip_end = end + padding
+    duration = clip_end - clip_start
+
+    fade_s = fade_ms / 1000.0
+
+    cmd = [
+        "ffmpeg", "-y",
+        "-ss", f"{clip_start:.3f}",
+        "-i", str(source_path),
+        "-t", f"{duration:.3f}",
+        "-af", f"afade=t=in:st=0:d={fade_s},afade=t=out:st={duration - fade_s:.3f}:d={fade_s}",
+        "-q:a", "2",
+        str(output_path),
+    ]
+
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        raise RuntimeError(f"ffmpeg failed: {result.stderr[-500:]}")
+
+    return output_path
+
+
+def extract_clips_for_results(results, output_dir: Path, padding: float = 1.5) -> dict[int, Path]:
+    """
+    Extract clips for a list of QAResult or SearchResult objects.
+    Returns {index: clip_path}.
+    """
+    output_dir = Path(output_dir)
+    clip_paths = {}
+
+    for i, result in enumerate(results):
+        episode = result.episode_id
+        audio_path = Path(result.audio_path)
+
+        if not audio_path.exists():
+            console.print(f"[yellow]Audio not found: {audio_path}[/yellow]")
+            continue
+
+        # Determine time range
+        if hasattr(result, "question_start"):
+            # QAResult
+            start = result.question_start
+            end = result.answer_end
+        else:
+            # SearchResult
+            start = result.start
+            end = result.end
+
+        def fmt(s):
+            m, sec = divmod(int(s), 60)
+            h, m = divmod(m, 60)
+            return f"{h}h{m:02d}m{sec:02d}s" if h else f"{m}m{sec:02d}s"
+
+        clip_name = f"{episode}_{fmt(start)}.mp3"
+        clip_path = output_dir / clip_name
+
+        try:
+            extract_clip(audio_path, start, end, clip_path, padding=padding)
+            clip_paths[i] = clip_path
+            console.print(f"[green]Clip {i+1}:[/green] {clip_name}")
+        except Exception as e:
+            console.print(f"[red]Clip {i+1} failed:[/red] {e}")
+
+    return clip_paths
+
+
+def format_timestamp(seconds: float) -> str:
+    """Format seconds as H:MM:SS or M:SS."""
+    h = int(seconds // 3600)
+    m = int((seconds % 3600) // 60)
+    s = int(seconds % 60)
+    if h:
+        return f"{h}:{m:02d}:{s:02d}"
+    return f"{m}:{s:02d}"
--- a/projects/radio-show/audio-processor/src/diarizer.py
+++ b/projects/radio-show/audio-processor/src/diarizer.py
@@ -158,117 +158,86 @@ def diarize(audio_path: str | Path,
            voice_profiles: VoiceProfileStore | None = None,
            min_speakers: int = 1,
            max_speakers: int = 6,
-            host_match_threshold: float = 0.75) -> DiarizationResult:
-    """Run speaker diarization on an audio file."""
-    from pyannote.audio import Pipeline
+            host_match_threshold: float = 0.85) -> DiarizationResult:
+    """Run speaker diarization using WavLM sliding-window speaker identification.
+
+    Uses the built-in VoiceProfiler (WavLM x-vectors) — no HuggingFace token
+    or gated model required. Identifies HOST vs non-HOST speakers using the
+    stored voice profile for Mike Swanson.
+    """
    import torch
+    from .voice_profiler import VoiceProfiler

    audio_path = Path(audio_path)
    console.print(f"[bold]Diarizing:[/bold] {audio_path.name}")

-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    device = "cuda" if torch.cuda.is_available() else "cpu"
    console.print(f"[dim]Device: {device}[/dim]")

-    pipeline = Pipeline.from_pretrained(
-        "pyannote/speaker-diarization-3.1"
-    ).to(device)
+    # Locate voice profiles directory from the VoiceProfileStore path
+    profiles_dir = voice_profiles.profiles_dir if voice_profiles else Path("voice-profiles")

-    diarization = pipeline(
-        str(audio_path),
-        min_speakers=min_speakers,
-        max_speakers=max_speakers,
+    profiler = VoiceProfiler(profiles_dir, device=device)
+
+    if not profiler.profiles:
+        console.print("[yellow]No voice profiles found — labeling all as HOST[/yellow]")
+        # Return a single HOST turn covering the whole episode
+        from .voice_profiler import VoiceProfiler as VP
+        duration = profiler._get_duration(audio_path)
+        return DiarizationResult(
+            turns=[SpeakerTurn(speaker="HOST", start=0.0, end=duration)],
+            num_speakers=1,
+            speaker_map={"HOST": "HOST"},
+        )
+
+    # Sliding-window identification: 10s windows, 5s hop
+    voice_segs = profiler.identify_speakers(
+        audio_path, window_s=10.0, hop_s=5.0,
+        threshold=host_match_threshold,
    )

-    # Extract turns
+    # Convert VoiceSegment labels to HOST / CALLER
    raw_turns = []
-    for turn, _, speaker in diarization.itertracks(yield_label=True):
+    for seg in voice_segs:
+        label = seg.speaker_label.split(" (")[0]  # strip confidence score
+        if label.startswith("Host:") or label.startswith("Host "):
+            speaker = "HOST"
+        elif label == "[error]":
+            speaker = "UNKNOWN"
+        else:
+            speaker = "CALLER"
+
        raw_turns.append(SpeakerTurn(
            speaker=speaker,
-            start=turn.start,
-            end=turn.end,
+            start=seg.start,
+            end=seg.end,
+            confidence=float(seg.speaker_label.split("(")[-1].rstrip(")"))
+                       if "(" in seg.speaker_label else 0.5,
        ))

-    # Count unique speakers
-    raw_speakers = set(t.speaker for t in raw_turns)
-    console.print(f"[dim]Detected {len(raw_speakers)} speakers[/dim]")
-
-    # Match against voice profiles if available
-    speaker_map = {}
-    if voice_profiles and voice_profiles.embeddings:
-        console.print("[dim]Matching speakers against voice profiles...[/dim]")
-        embedding_model = pipeline.embedding  # pyannote's embedding model
-
-        # Get embeddings for each detected speaker
-        from pyannote.audio import Inference
-        inference = Inference(pipeline.embedding, window="whole")
-
-        for raw_label in raw_speakers:
-            # Get segments for this speaker
-            speaker_segments = [t for t in raw_turns if t.speaker == raw_label]
-            total_time = sum(t.duration for t in speaker_segments)
-
-            # Use the longest segment for embedding
-            longest = max(speaker_segments, key=lambda t: t.duration)
-
-            try:
-                # Extract embedding from audio segment
-                import torchaudio
-                waveform, sr = torchaudio.load(
-                    str(audio_path),
-                    frame_offset=int(longest.start * sr if 'sr' in dir() else longest.start * 16000),
-                    num_frames=int(longest.duration * sr if 'sr' in dir() else longest.duration * 16000),
-                )
-                # This is simplified — proper implementation would use pyannote's
-                # embedding extraction pipeline
-                match_name, score = voice_profiles.match_embedding(
-                    np.zeros(256),  # placeholder
-                    threshold=host_match_threshold,
-                )
-                if match_name:
-                    speaker_map[raw_label] = match_name
-                    console.print(f"  [green]{raw_label} -> {match_name} "
-                                  f"(score: {score:.2f}, {total_time:.0f}s)[/green]")
-            except Exception as e:
-                console.print(f"  [yellow]Could not match {raw_label}: {e}[/yellow]")
-
-        # If no voice profiles matched, use speaking time heuristic
-        # The host almost always has the most speaking time
-        if not speaker_map:
-            ranked = sorted(
-                [(s, sum(t.duration for t in raw_turns if t.speaker == s))
-                 for s in raw_speakers],
-                key=lambda x: x[1],
-                reverse=True,
-            )
-            if ranked:
-                speaker_map[ranked[0][0]] = f"Host: {voice_profiles.metadata.get('host', {}).get('name', 'Unknown')}"
-                console.print(f"  [yellow]Assumed {ranked[0][0]} is host "
-                              f"(most speaking time: {ranked[0][1]:.0f}s)[/yellow]")
-
-    # If no voice profiles at all, label by speaking time
-    if not speaker_map:
-        ranked = sorted(
-            [(s, sum(t.duration for t in raw_turns if t.speaker == s))
-             for s in raw_speakers],
-            key=lambda x: x[1],
-            reverse=True,
-        )
-        for i, (speaker, time) in enumerate(ranked):
-            if i == 0:
-                speaker_map[speaker] = "Host (assumed)"
-            else:
-                speaker_map[speaker] = f"Speaker {i}"
-
-    # Apply friendly names
+    # Merge consecutive same-speaker turns
+    merged: list[SpeakerTurn] = []
    for turn in raw_turns:
-        if turn.speaker in speaker_map:
-            turn.speaker = speaker_map[turn.speaker]
+        if merged and merged[-1].speaker == turn.speaker:
+            merged[-1].end = turn.end
+        else:
+            merged.append(SpeakerTurn(
+                speaker=turn.speaker,
+                start=turn.start,
+                end=turn.end,
+                confidence=turn.confidence,
+            ))

-    console.print(f"[green]Diarization complete: {len(raw_turns)} turns, "
-                  f"{len(raw_speakers)} speakers[/green]")
+    unique_speakers = set(t.speaker for t in merged)
+    speaker_map = {s: s for s in unique_speakers}
+
+    host_time = sum(t.duration for t in merged if t.speaker == "HOST")
+    caller_time = sum(t.duration for t in merged if t.speaker == "CALLER")
+    console.print(f"[green]Diarization complete:[/green] {len(merged)} turns | "
+                  f"HOST {host_time:.0f}s / CALLER {caller_time:.0f}s")

    return DiarizationResult(
-        turns=raw_turns,
-        num_speakers=len(raw_speakers),
+        turns=merged,
+        num_speakers=len(unique_speakers),
        speaker_map=speaker_map,
    )
--- a/projects/radio-show/audio-processor/src/indexer.py
+++ b/projects/radio-show/audio-processor/src/indexer.py
@@ -0,0 +1,247 @@
+"""
+Archive transcript index using SQLite FTS5.
+Stores all transcript segments with speaker labels, searchable by keyword or phrase.
+"""
+
+import json
+import sqlite3
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterator
+
+from rich.console import Console
+
+console = Console()
+
+DB_SCHEMA = """
+CREATE TABLE IF NOT EXISTS episodes (
+    id          INTEGER PRIMARY KEY,
+    episode_id  TEXT UNIQUE NOT NULL,   -- e.g. "2016-s8e42"
+    date        TEXT,                   -- "2016-03-15"
+    audio_path  TEXT,                   -- absolute path to original MP3
+    duration    REAL,
+    hr          INTEGER                 -- 1 or 2 (for split episodes)
+);
+
+CREATE TABLE IF NOT EXISTS segments (
+    id          INTEGER PRIMARY KEY,
+    episode_id  TEXT NOT NULL,
+    seg_index   INTEGER NOT NULL,
+    start       REAL NOT NULL,
+    end         REAL NOT NULL,
+    speaker     TEXT,                   -- "HOST", "CALLER", "UNKNOWN", "COMMERCIAL"
+    text        TEXT NOT NULL,
+    FOREIGN KEY (episode_id) REFERENCES episodes(episode_id)
+);
+
+CREATE VIRTUAL TABLE IF NOT EXISTS segments_fts USING fts5(
+    text,
+    speaker UNINDEXED,
+    episode_id UNINDEXED,
+    seg_index UNINDEXED,
+    content='segments',
+    content_rowid='id'
+);
+
+CREATE TRIGGER IF NOT EXISTS segments_ai AFTER INSERT ON segments BEGIN
+    INSERT INTO segments_fts(rowid, text, speaker, episode_id, seg_index)
+    VALUES (new.id, new.text, new.speaker, new.episode_id, new.seg_index);
+END;
+
+CREATE TABLE IF NOT EXISTS qa_pairs (
+    id              INTEGER PRIMARY KEY,
+    episode_id      TEXT NOT NULL,
+    question_start  REAL NOT NULL,
+    question_end    REAL NOT NULL,
+    answer_start    REAL NOT NULL,
+    answer_end      REAL NOT NULL,
+    question_text   TEXT NOT NULL,
+    answer_text     TEXT NOT NULL,
+    topic           TEXT,               -- Ollama-tagged topic
+    topic_tags      TEXT,               -- JSON array of tags
+    FOREIGN KEY (episode_id) REFERENCES episodes(episode_id)
+);
+
+CREATE VIRTUAL TABLE IF NOT EXISTS qa_fts USING fts5(
+    question_text,
+    answer_text,
+    topic,
+    episode_id UNINDEXED,
+    content='qa_pairs',
+    content_rowid='id'
+);
+
+CREATE TRIGGER IF NOT EXISTS qa_ai AFTER INSERT ON qa_pairs BEGIN
+    INSERT INTO qa_fts(rowid, question_text, answer_text, topic, episode_id)
+    VALUES (new.id, new.question_text, new.answer_text, new.topic, new.episode_id);
+END;
+"""
+
+
+@dataclass
+class SearchResult:
+    episode_id: str
+    date: str
+    start: float
+    end: float
+    speaker: str
+    text: str
+    audio_path: str
+    score: float = 0.0
+
+    def timestamp_str(self) -> str:
+        def fmt(s):
+            m, sec = divmod(int(s), 60)
+            h, m = divmod(m, 60)
+            return f"{h}:{m:02d}:{sec:02d}" if h else f"{m}:{sec:02d}"
+        return f"{fmt(self.start)}–{fmt(self.end)}"
+
+
+@dataclass
+class QAResult:
+    episode_id: str
+    date: str
+    question_start: float
+    question_end: float
+    answer_start: float
+    answer_end: float
+    question_text: str
+    answer_text: str
+    topic: str
+    audio_path: str
+
+    def clip_start(self, padding: float = 1.0) -> float:
+        return max(0.0, self.question_start - padding)
+
+    def clip_end(self, padding: float = 1.0) -> float:
+        return self.answer_end + padding
+
+    def timestamp_str(self) -> str:
+        def fmt(s):
+            m, sec = divmod(int(s), 60)
+            h, m = divmod(m, 60)
+            return f"{h}:{m:02d}:{sec:02d}" if h else f"{m}:{sec:02d}"
+        return f"{fmt(self.question_start)}–{fmt(self.answer_end)}"
+
+    def duration(self) -> float:
+        return self.answer_end - self.question_start
+
+
+class ArchiveIndex:
+    def __init__(self, db_path: Path):
+        self.db_path = Path(db_path)
+        self.db_path.parent.mkdir(parents=True, exist_ok=True)
+        self._conn = sqlite3.connect(str(self.db_path))
+        self._conn.row_factory = sqlite3.Row
+        self._conn.executescript(DB_SCHEMA)
+        self._conn.commit()
+
+    def close(self):
+        self._conn.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *_):
+        self.close()
+
+    # ── Ingestion ──────────────────────────────────────────────────────────
+
+    def add_episode(self, episode_id: str, audio_path: Path,
+                    date: str = None, duration: float = None, hr: int = None):
+        self._conn.execute(
+            "INSERT OR IGNORE INTO episodes (episode_id, date, audio_path, duration, hr) "
+            "VALUES (?, ?, ?, ?, ?)",
+            (episode_id, date, str(audio_path), duration, hr)
+        )
+        self._conn.commit()
+
+    def add_segments(self, episode_id: str, segments: list[dict]):
+        """Add transcript segments. Each dict: {start, end, text, speaker}."""
+        existing = self._conn.execute(
+            "SELECT COUNT(*) FROM segments WHERE episode_id = ?", (episode_id,)
+        ).fetchone()[0]
+        if existing:
+            return  # already indexed
+
+        self._conn.executemany(
+            "INSERT INTO segments (episode_id, seg_index, start, end, speaker, text) "
+            "VALUES (?, ?, ?, ?, ?, ?)",
+            [
+                (episode_id, i, s["start"], s["end"],
+                 s.get("speaker", "UNKNOWN"), s["text"])
+                for i, s in enumerate(segments)
+            ]
+        )
+        self._conn.commit()
+
+    def add_qa_pair(self, episode_id: str, q_start: float, q_end: float,
+                    a_start: float, a_end: float, question: str, answer: str,
+                    topic: str = None, tags: list[str] = None):
+        self._conn.execute(
+            "INSERT INTO qa_pairs "
+            "(episode_id, question_start, question_end, answer_start, answer_end, "
+            "question_text, answer_text, topic, topic_tags) "
+            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
+            (episode_id, q_start, q_end, a_start, a_end, question, answer,
+             topic, json.dumps(tags or []))
+        )
+        self._conn.commit()
+
+    # ── Search ─────────────────────────────────────────────────────────────
+
+    def search(self, query: str, speaker_filter: str = None,
+               limit: int = 20) -> list[SearchResult]:
+        """Full-text search across all transcript segments."""
+        speaker_clause = ""
+        params = [query, limit]
+        if speaker_filter:
+            speaker_clause = "AND s.speaker = ?"
+            params.insert(1, speaker_filter)
+
+        rows = self._conn.execute(f"""
+            SELECT s.episode_id, e.date, s.start, s.end, s.speaker, s.text,
+                   e.audio_path, rank
+            FROM segments_fts f
+            JOIN segments s ON s.id = f.rowid
+            JOIN episodes e ON e.episode_id = s.episode_id
+            WHERE segments_fts MATCH ?
+            {speaker_clause}
+            ORDER BY rank
+            LIMIT ?
+        """, params).fetchall()
+
+        return [SearchResult(
+            episode_id=r["episode_id"], date=r["date"] or r["episode_id"],
+            start=r["start"], end=r["end"], speaker=r["speaker"],
+            text=r["text"], audio_path=r["audio_path"], score=r["rank"]
+        ) for r in rows]
+
+    def search_qa(self, query: str, limit: int = 20) -> list[QAResult]:
+        """Search Q&A pairs — matches against question, answer, and topic."""
+        rows = self._conn.execute("""
+            SELECT q.episode_id, e.date, q.question_start, q.question_end,
+                   q.answer_start, q.answer_end, q.question_text, q.answer_text,
+                   q.topic, e.audio_path, rank
+            FROM qa_fts f
+            JOIN qa_pairs q ON q.id = f.rowid
+            JOIN episodes e ON e.episode_id = q.episode_id
+            WHERE qa_fts MATCH ?
+            ORDER BY rank
+            LIMIT ?
+        """, [query, limit]).fetchall()
+
+        return [QAResult(
+            episode_id=r["episode_id"], date=r["date"] or r["episode_id"],
+            question_start=r["question_start"], question_end=r["question_end"],
+            answer_start=r["answer_start"], answer_end=r["answer_end"],
+            question_text=r["question_text"], answer_text=r["answer_text"],
+            topic=r["topic"] or "", audio_path=r["audio_path"]
+        ) for r in rows]
+
+    def stats(self) -> dict:
+        return {
+            "episodes": self._conn.execute("SELECT COUNT(*) FROM episodes").fetchone()[0],
+            "segments": self._conn.execute("SELECT COUNT(*) FROM segments").fetchone()[0],
+            "qa_pairs": self._conn.execute("SELECT COUNT(*) FROM qa_pairs").fetchone()[0],
+        }
--- a/projects/radio-show/audio-processor/src/qa_extractor.py
+++ b/projects/radio-show/audio-processor/src/qa_extractor.py
@@ -0,0 +1,372 @@
+"""
+Q&A pair extraction from diarized transcripts.
+
+Identifies exchanges where a CALLER asks a question and the HOST answers.
+Outputs structured Q&A pairs with timestamps for clip extraction and indexing.
+"""
+
+import json
+import re
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+from rich.console import Console
+
+console = Console()
+
+# Phrases that signal a caller is asking a question
+QUESTION_SIGNALS = [
+    r"\?",
+    r"\bhow (do|can|should|would|does)\b",
+    r"\bwhat (is|are|should|can|do|does|about)\b",
+    r"\bwhy (is|are|does|do|would|should)\b",
+    r"\bis (it|there|this|that) (true|safe|possible|good|bad|worth)\b",
+    r"\bshould i\b",
+    r"\bcan you\b",
+    r"\bi (was wondering|wanted to ask|have a question)\b",
+]
+
+QUESTION_PATTERN = re.compile("|".join(QUESTION_SIGNALS), re.IGNORECASE)
+
+# Minimum durations for a meaningful exchange
+MIN_QUESTION_DURATION = 5.0   # seconds
+MIN_ANSWER_DURATION = 15.0    # seconds
+MAX_GAP_BETWEEN_QA = 30.0     # seconds between question end and answer start
+
+# ── Promo / bumper filter ──────────────────────────────────────────────────
+# Promos evolve across years but preserve signature phrases.
+# Weight 2 = highly distinctive (one match sufficient to filter).
+# Weight 1 = semi-generic (need 2+ to filter).
+# A question turn with total score >= PROMO_SCORE_THRESHOLD is suppressed.
+PROMO_SCORE_THRESHOLD = 2
+
+_PROMO_SIGS: list[tuple[re.Pattern, int]] = [
+    # Highly distinctive — score 2 each
+    (re.compile(r"acquired a life of its own",      re.I), 2),
+    (re.compile(r"simply desire a deeper",           re.I), 2),
+    (re.compile(r"tame that beast",                  re.I), 2),
+    (re.compile(r"mike swanson will be back after",  re.I), 2),
+    (re.compile(r"heaven forbid.{0,20}virus",        re.I | re.DOTALL), 2),
+    (re.compile(r"mike swanson is answering all",    re.I), 2),
+    # Semi-distinctive — score 1 each, need two to filter
+    (re.compile(r"\bcomputer running slow\b",        re.I), 1),
+    (re.compile(r"\bafter these messages\b",         re.I), 1),
+    (re.compile(r"\b790.?2040\b",                   re.I), 1),
+    (re.compile(r"\bgurushow\.com\b",               re.I), 1),
+    (re.compile(r"\bcall in now\b",                  re.I), 1),
+    (re.compile(r"\bcomputer troubles\?",            re.I), 1),
+    (re.compile(r"\bhardware installation\b",        re.I), 1),
+]
+
+
+def _is_promo_or_bumper(text: str) -> bool:
+    """Return True if text scores above threshold on show promo/bumper signatures."""
+    score = sum(w for pat, w in _PROMO_SIGS if pat.search(text))
+    return score >= PROMO_SCORE_THRESHOLD
+
+
+@dataclass
+class QAPair:
+    question_start: float
+    question_end: float
+    answer_start: float
+    answer_end: float
+    question_text: str
+    answer_text: str
+    topic: Optional[str] = None
+    topic_tags: list[str] = field(default_factory=list)
+
+    def to_dict(self) -> dict:
+        return {
+            "question_start": self.question_start,
+            "question_end": self.question_end,
+            "answer_start": self.answer_start,
+            "answer_end": self.answer_end,
+            "question_text": self.question_text,
+            "answer_text": self.answer_text,
+            "topic": self.topic,
+            "topic_tags": self.topic_tags,
+        }
+
+    def clip_start(self, padding: float = 1.5) -> float:
+        return max(0.0, self.question_start - padding)
+
+    def clip_end(self, padding: float = 1.5) -> float:
+        return self.answer_end + padding
+
+    def duration(self) -> float:
+        return self.answer_end - self.question_start
+
+
+def extract_qa_pairs(diarized_segments: list[dict]) -> list[QAPair]:
+    """
+    Extract caller Q&A pairs from diarized transcript segments.
+
+    Each segment dict: {start, end, text, speaker}
+    Speaker values: "HOST", "CALLER", "UNKNOWN"
+    """
+    pairs = []
+
+    # Group consecutive segments by speaker into speaker turns
+    turns = _merge_consecutive_speaker_turns(diarized_segments)
+
+    # Check if diarization produced any non-HOST speakers
+    has_caller_labels = any(t["speaker"] in ("CALLER", "UNKNOWN") for t in turns)
+
+    if not has_caller_labels:
+        # Diarization labels are absent or unreliable — fall back to text-pattern detection
+        return _extract_qa_text_only(turns)
+
+    i = 0
+    while i < len(turns):
+        turn = turns[i]
+
+        # Look for a CALLER turn that looks like a question
+        if turn["speaker"] in ("CALLER", "UNKNOWN") and _looks_like_question(turn["text"]):
+            if _is_promo_or_bumper(turn["text"]):
+                i += 1
+                continue
+            q_duration = turn["end"] - turn["start"]
+            if q_duration < MIN_QUESTION_DURATION:
+                i += 1
+                continue
+
+            # Look ahead for HOST answer turn(s)
+            j = i + 1
+            answer_turns = []
+            while j < len(turns):
+                next_turn = turns[j]
+                gap = next_turn["start"] - turns[j - 1]["end"]
+
+                if gap > MAX_GAP_BETWEEN_QA and not answer_turns:
+                    break  # too big a gap before any answer
+
+                if next_turn["speaker"] == "HOST":
+                    answer_turns.append(next_turn)
+                    # Keep collecting consecutive HOST turns
+                    j += 1
+                    while j < len(turns) and turns[j]["speaker"] == "HOST":
+                        answer_turns.append(turns[j])
+                        j += 1
+                    break
+                elif next_turn["speaker"] in ("CALLER", "UNKNOWN"):
+                    # Another caller turn before host answered — skip this question
+                    break
+                else:
+                    j += 1
+
+            if answer_turns:
+                answer_text = " ".join(t["text"] for t in answer_turns)
+                answer_duration = answer_turns[-1]["end"] - answer_turns[0]["start"]
+
+                if answer_duration >= MIN_ANSWER_DURATION:
+                    pairs.append(QAPair(
+                        question_start=turn["start"],
+                        question_end=turn["end"],
+                        answer_start=answer_turns[0]["start"],
+                        answer_end=answer_turns[-1]["end"],
+                        question_text=turn["text"].strip(),
+                        answer_text=answer_text.strip(),
+                    ))
+                    i = j
+                    continue
+
+        i += 1
+
+    return pairs
+
+
+# Maximum duration for a question turn in text-only mode — avoids capturing monologues
+_MAX_QUESTION_S_TEXT_MODE = 90.0
+
+# Caller introduction phrases Mike uses before taking a call
+_CALLER_INTRO = re.compile(
+    r"\b(let'?s go to|going to the phones?|you'?re on the air|on the air|"
+    r"first caller|next caller|caller from|go ahead|what'?s (your question|going on)|"
+    r"welcome to the show|thanks for calling|thank you for calling|"
+    r"our (first|next|last) (caller|call)|taking (a |your )?call)\b",
+    re.IGNORECASE,
+)
+
+
+def _extract_qa_text_only(turns: list[dict]) -> list[QAPair]:
+    """
+    Q&A extraction when speaker labels are unavailable or all HOST.
+
+    Uses text patterns to identify question anchors. Works well for call-in
+    radio format where callers describe problems and the host answers at length.
+    Captures both genuine caller questions and Mike's own rhetorical Q&A segments.
+    """
+    pairs = []
+
+    i = 0
+    while i < len(turns):
+        turn = turns[i]
+        q_duration = turn["end"] - turn["start"]
+
+        is_q_candidate = (
+            _looks_like_question(turn["text"])
+            and MIN_QUESTION_DURATION <= q_duration <= _MAX_QUESTION_S_TEXT_MODE
+        )
+
+        # Also treat segments immediately after a caller-intro phrase as candidates
+        if not is_q_candidate and i > 0:
+            prev_text = turns[i - 1]["text"]
+            if _CALLER_INTRO.search(prev_text) and q_duration >= MIN_QUESTION_DURATION:
+                is_q_candidate = True
+
+        if is_q_candidate and _is_promo_or_bumper(turn["text"]):
+            i += 1
+            continue
+
+        if is_q_candidate:
+            # Collect following segments as the answer until we hit another question
+            j = i + 1
+            answer_turns = []
+
+            while j < len(turns):
+                next_turn = turns[j]
+                gap = next_turn["start"] - turns[j - 1]["end"]
+
+                if gap > MAX_GAP_BETWEEN_QA and not answer_turns:
+                    break
+
+                # Stop collecting if we hit another short question-pattern turn
+                if (
+                    _looks_like_question(next_turn["text"])
+                    and (next_turn["end"] - next_turn["start"]) <= _MAX_QUESTION_S_TEXT_MODE
+                    and answer_turns
+                ):
+                    break
+
+                answer_turns.append(next_turn)
+                j += 1
+
+                # Stop once we have a substantial answer block
+                if answer_turns:
+                    ans_dur = answer_turns[-1]["end"] - answer_turns[0]["start"]
+                    if ans_dur >= MIN_ANSWER_DURATION * 3:
+                        break
+
+            if answer_turns:
+                answer_text = " ".join(t["text"] for t in answer_turns)
+                answer_duration = answer_turns[-1]["end"] - answer_turns[0]["start"]
+
+                if answer_duration >= MIN_ANSWER_DURATION:
+                    pairs.append(QAPair(
+                        question_start=turn["start"],
+                        question_end=turn["end"],
+                        answer_start=answer_turns[0]["start"],
+                        answer_end=answer_turns[-1]["end"],
+                        question_text=turn["text"].strip(),
+                        answer_text=answer_text.strip(),
+                    ))
+                    i = j
+                    continue
+
+        i += 1
+
+    return pairs
+
+
+def tag_qa_pairs_with_ollama(pairs: list[QAPair], ollama_host: str = "http://localhost:11434",
+                              model: str = "qwen3:14b") -> list[QAPair]:
+    """Use Ollama to tag each Q&A pair with a topic and tags."""
+    try:
+        import ollama
+        client = ollama.Client(host=ollama_host)
+    except ImportError:
+        console.print("[yellow]ollama not installed — skipping topic tagging[/yellow]")
+        return pairs
+
+    for i, pair in enumerate(pairs):
+        console.print(f"[dim]Tagging Q&A {i+1}/{len(pairs)}...[/dim]")
+        try:
+            prompt = (
+                f"A radio show caller asked:\n\"{pair.question_text[:300]}\"\n\n"
+                f"The host answered:\n\"{pair.answer_text[:500]}\"\n\n"
+                "Respond with JSON only, no explanation:\n"
+                '{"topic": "short topic name (3-5 words)", "tags": ["tag1", "tag2", "tag3"]}'
+            )
+            resp = client.chat(
+                model=model,
+                messages=[{"role": "user", "content": prompt}],
+                options={"temperature": 0},
+            )
+            raw = resp["message"]["content"].strip()
+            # Extract JSON from response
+            start = raw.find("{")
+            end = raw.rfind("}") + 1
+            if start >= 0 and end > start:
+                data = json.loads(raw[start:end])
+                pair.topic = data.get("topic", "")
+                pair.topic_tags = data.get("tags", [])
+        except Exception as e:
+            console.print(f"[yellow]Tagging failed for pair {i+1}: {e}[/yellow]")
+
+    return pairs
+
+
+def load_diarized_transcript(transcript_path: Path,
+                             diarization_path: Optional[Path]) -> list[dict]:
+    """
+    Merge transcript and diarization into speaker-labeled segments.
+    Falls back to HOST-only if no diarization available.
+    """
+    with open(transcript_path) as f:
+        transcript = json.load(f)
+
+    segments = transcript["segments"]
+
+    if diarization_path is None or not diarization_path.exists():
+        return [
+            {"start": s["start"], "end": s["end"],
+             "text": s["text"], "speaker": "HOST"}
+            for s in segments
+        ]
+
+    with open(diarization_path) as f:
+        diarization = json.load(f)
+
+    turns = diarization.get("turns", [])
+
+    def speaker_at(t: float) -> str:
+        """Find which diarization turn covers time t."""
+        for turn in turns:
+            if turn["start"] <= t <= turn["end"]:
+                return turn["speaker"]
+        return "UNKNOWN"
+
+    return [
+        {"start": s["start"], "end": s["end"],
+         "text": s["text"],
+         "speaker": speaker_at((s["start"] + s["end"]) / 2)}
+        for s in segments
+    ]
+
+
+# ── Helpers ────────────────────────────────────────────────────────────────
+
+def _looks_like_question(text: str) -> bool:
+    return bool(QUESTION_PATTERN.search(text))
+
+
+def _merge_consecutive_speaker_turns(segments: list[dict]) -> list[dict]:
+    """Merge adjacent segments from the same speaker into continuous turns."""
+    if not segments:
+        return []
+
+    turns = []
+    current = dict(segments[0])
+
+    for seg in segments[1:]:
+        if seg["speaker"] == current["speaker"]:
+            current["end"] = seg["end"]
+            current["text"] = current["text"].rstrip() + " " + seg["text"].lstrip()
+        else:
+            turns.append(current)
+            current = dict(seg)
+
+    turns.append(current)
+    return turns
--- a/projects/radio-show/audio-processor/src/show_prep.py
+++ b/projects/radio-show/audio-processor/src/show_prep.py
@@ -0,0 +1,206 @@
+"""
+Show prep generator: search the archive index for past caller topics,
+extract clips, and generate "then vs now" talking points via Ollama.
+"""
+
+import json
+from pathlib import Path
+from datetime import datetime
+
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+from rich import box
+
+from .indexer import ArchiveIndex, QAResult, SearchResult
+from .clip_extractor import extract_clips_for_results, format_timestamp
+
+console = Console()
+
+
+def generate_show_prep(
+    index: ArchiveIndex,
+    topic: str,
+    output_dir: Path,
+    extract_clips: bool = True,
+    ollama_host: str = "http://localhost:11434",
+    ollama_model: str = "qwen3:14b",
+    limit: int = 10,
+) -> Path:
+    """
+    Search the archive for past discussions of a topic.
+    Extracts audio clips and generates "then vs now" talking points.
+    Returns path to the generated markdown prep file.
+    """
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    console.print(Panel.fit(f"[bold]Show Prep:[/bold] {topic}", border_style="blue"))
+
+    # Search Q&A pairs first (caller exchanges)
+    qa_results = index.search_qa(topic, limit=limit)
+    # Also search raw segments (for monologue mentions)
+    segment_results = index.search(topic, limit=limit)
+
+    if not qa_results and not segment_results:
+        console.print(f"[yellow]No results found for: {topic}[/yellow]")
+        return None
+
+    # Display results table
+    _print_results_table(qa_results, segment_results, topic)
+
+    # Extract clips
+    clip_paths = {}
+    if extract_clips and qa_results:
+        clips_dir = output_dir / "clips"
+        console.print(f"\n[dim]Extracting {len(qa_results)} clip(s)...[/dim]")
+        clip_paths = extract_clips_for_results(qa_results, clips_dir)
+
+    # Generate then-vs-now content via Ollama
+    then_now = _generate_then_vs_now(topic, qa_results, segment_results,
+                                      ollama_host, ollama_model)
+
+    # Write markdown prep file
+    safe_topic = topic.lower().replace(" ", "-").replace("/", "-")[:40]
+    date_str = datetime.now().strftime("%Y-%m-%d")
+    prep_path = output_dir / f"{date_str}-{safe_topic}-prep.md"
+
+    _write_prep_file(prep_path, topic, qa_results, segment_results,
+                     clip_paths, then_now)
+
+    console.print(f"\n[bold green]Prep file:[/bold green] {prep_path}")
+    return prep_path
+
+
+def _print_results_table(qa_results: list[QAResult], segment_results: list[SearchResult],
+                          topic: str):
+    if qa_results:
+        table = Table(title=f"Caller Q&A — \"{topic}\"", box=box.SIMPLE, show_lines=True)
+        table.add_column("Date", style="cyan", width=12)
+        table.add_column("Timestamps", style="dim", width=14)
+        table.add_column("Duration", style="dim", width=8)
+        table.add_column("Caller asked", width=35)
+        table.add_column("Topic", style="green", width=20)
+
+        for r in qa_results:
+            dur = r.duration()
+            table.add_row(
+                r.date or r.episode_id,
+                r.timestamp_str(),
+                f"{int(dur//60)}m{int(dur%60):02d}s",
+                r.question_text[:80] + ("…" if len(r.question_text) > 80 else ""),
+                r.topic or "—",
+            )
+        console.print(table)
+
+    if segment_results and not qa_results:
+        console.print(f"\n[dim]No structured Q&A found. Showing {len(segment_results)} "
+                      f"transcript mentions:[/dim]")
+        for r in segment_results:
+            console.print(f"  [cyan]{r.date}[/cyan] [{r.timestamp_str()}] "
+                          f"[dim]{r.speaker}[/dim]: {r.text[:100]}…")
+
+
+def _generate_then_vs_now(topic: str, qa_results: list, segment_results: list,
+                           ollama_host: str, model: str) -> str:
+    try:
+        import ollama
+        client = ollama.Client(host=ollama_host)
+    except ImportError:
+        return "_Ollama not available — install with: pip install ollama_"
+
+    # Build context from past discussions
+    past_context = ""
+    for r in qa_results[:5]:
+        date = r.date or r.episode_id
+        past_context += f"\n[{date}] Caller: {r.question_text[:200]}\n"
+        past_context += f"Host answer: {r.answer_text[:400]}\n"
+
+    if not past_context and segment_results:
+        for r in segment_results[:5]:
+            past_context += f"\n[{r.date}] {r.speaker}: {r.text[:300]}\n"
+
+    if not past_context:
+        return ""
+
+    prompt = f"""You are helping prepare talking points for a technology radio show host.
+The host discussed "{topic}" in past episodes. Here are excerpts:
+
+{past_context}
+
+The host wants to do a new segment revisiting this topic.
+
+Write talking points in this format:
+## What I Said Then
+- [2-3 bullets summarizing the past advice/position]
+
+## What's Changed Since Then
+- [2-3 bullets on how the technology/situation has evolved]
+
+## Why My Answer Is Different Now
+- [2-3 bullets on the updated recommendation/position]
+
+## Suggested Opening
+[1-2 sentences the host can use to open the segment, referencing the old clip]
+
+Keep it conversational, radio-friendly. Be specific about what actually changed."""
+
+    try:
+        resp = client.chat(
+            model=model,
+            messages=[{"role": "user", "content": prompt}],
+            options={"temperature": 0.3},
+        )
+        return resp["message"]["content"]
+    except Exception as e:
+        return f"_Ollama generation failed: {e}_"
+
+
+def _write_prep_file(path: Path, topic: str, qa_results: list, segment_results: list,
+                      clip_paths: dict, then_now: str):
+    lines = [
+        f"# Show Prep: {topic}",
+        f"",
+        f"_Generated {datetime.now().strftime('%Y-%m-%d %H:%M')}_",
+        f"",
+    ]
+
+    if qa_results:
+        lines += [f"## Past Caller Exchanges ({len(qa_results)} found)", ""]
+        for i, r in enumerate(qa_results):
+            clip_info = ""
+            if i in clip_paths:
+                clip_info = f" — `{clip_paths[i].name}`"
+            lines += [
+                f"### {r.date or r.episode_id} — [{r.timestamp_str()}]{clip_info}",
+                f"**Caller:** {r.question_text}",
+                f"",
+                f"**Host:** {r.answer_text[:600]}{'…' if len(r.answer_text) > 600 else ''}",
+                f"",
+            ]
+
+    elif segment_results:
+        lines += [f"## Transcript Mentions ({len(segment_results)} found)", ""]
+        for r in segment_results:
+            lines += [
+                f"- **{r.date}** [{r.timestamp_str()}] ({r.speaker}): {r.text[:200]}",
+            ]
+        lines.append("")
+
+    if then_now:
+        lines += ["## Then vs Now", "", then_now, ""]
+
+    if clip_paths:
+        lines += [
+            "## Clips",
+            "",
+            f"Extracted to `clips/` — drag into Audition/Audacity:",
+            "",
+        ]
+        for i, p in clip_paths.items():
+            if i < len(qa_results):
+                r = qa_results[i]
+                lines.append(f"- `{p.name}` — {r.date} [{r.timestamp_str()}]")
+        lines.append("")
+
+    path.write_text("\n".join(lines), encoding="utf-8")
--- a/projects/radio-show/audio-processor/src/voice_profiler.py
+++ b/projects/radio-show/audio-processor/src/voice_profiler.py
@@ -13,7 +13,6 @@ import numpy as np
 import torch
 import soundfile as sf
 from rich.console import Console
-from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeElapsedColumn
 from rich.table import Table

 console = Console()
@@ -159,36 +158,43 @@ class VoiceProfiler:

    def extract_embedding(self, audio_path: Path, start: float = 0.0,
                          end: float | None = None) -> np.ndarray:
-        """Extract a speaker embedding from an audio segment."""
-        model = self._get_model()
+        """Extract a speaker embedding from an audio segment (file-based, any format)."""
+        self._get_model()
+        waveform, _ = self._load_audio_segment(audio_path, start, end)
+        return self._embed_audio_np(waveform.squeeze(0).numpy())

-        # Load audio segment (already at SAMPLE_RATE via ffmpeg)
-        waveform, sr = self._load_audio_segment(audio_path, start, end)
-
-        # waveform is [1, samples] tensor, need just the numpy array for the extractor
-        audio_np = waveform.squeeze(0).numpy()
-
-        # Extract features
+    def _embed_audio_np(self, audio_np: np.ndarray) -> np.ndarray:
+        """Embed a float32 mono numpy array (already at SAMPLE_RATE). Returns L2-normalized embedding."""
+        self._get_model()
        inputs = self._extractor(
            audio_np, sampling_rate=SAMPLE_RATE,
            return_tensors="pt", padding=True,
        )
-
-        # Get embedding
        with torch.no_grad():
-            outputs = model(**{k: v.to(self.device) for k, v in inputs.items()})
-
+            outputs = self._model(**{k: v.to(self.device) for k, v in inputs.items()})
        embedding = outputs.embeddings.squeeze().cpu().numpy()
-        # L2 normalize
        norm = np.linalg.norm(embedding)
        if norm > 0:
            embedding = embedding / norm
-
        return embedding

+    def _load_full_audio(self, audio_path: Path) -> np.ndarray:
+        """Decode entire audio file to float32 mono at SAMPLE_RATE via a single ffmpeg call."""
+        cmd = [
+            "ffmpeg", "-i", str(audio_path),
+            "-f", "wav", "-ac", "1", "-ar", str(SAMPLE_RATE),
+            "-acodec", "pcm_s16le", "pipe:1",
+        ]
+        result = subprocess.run(cmd, capture_output=True, timeout=600)
+        if result.returncode != 0:
+            raise RuntimeError(f"ffmpeg failed: {result.stderr.decode()[:200]}")
+        import io
+        data, _ = sf.read(io.BytesIO(result.stdout), dtype="float32")
+        return data  # shape: (samples,)
+
    def _load_audio_segment(self, audio_path: Path, start: float = 0.0,
                            end: float | None = None) -> tuple[torch.Tensor, int]:
-        """Load an audio segment using ffmpeg (handles any format)."""
+        """Load a single audio segment via ffmpeg (used for one-off extraction)."""
        cmd = ["ffmpeg", "-i", str(audio_path)]
        if start > 0:
            cmd.extend(["-ss", str(start)])
@@ -227,68 +233,39 @@ class VoiceProfiler:

        profile = self.profiles[host_name]

-        with Progress(
-            SpinnerColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            BarColumn(),
-            TextColumn("{task.completed}/{task.total}"),
-            TimeElapsedColumn(),
-            console=console,
-        ) as progress:
-            task = progress.add_task("Processing episodes...",
-                                     total=len(episode_paths))
+        for ep_idx, ep_path in enumerate(episode_paths, 1):
+            console.print(f"[dim]  [{ep_idx}/{len(episode_paths)}] {ep_path.name}[/dim]")

-            for ep_path in episode_paths:
-                progress.update(task, description=f"Processing {ep_path.name}...")
+            try:
+                duration = self._get_duration(ep_path)

-                try:
-                    # Get episode duration
-                    duration = self._get_duration(ep_path)
+                windows = []
+                if duration > 90:
+                    windows.append((30.0, 90.0))
+                if duration > 180:
+                    windows.append((120.0, 180.0))
+                mid = duration / 2
+                if mid > 60:
+                    windows.append((mid, min(mid + 60, duration)))
+                late = duration - 180
+                if late > 300:
+                    windows.append((late, late + 60))

-                    # Strategy: extract embeddings from multiple time windows
-                    # Skip first 30s (likely intro jingle), then sample every 2 min
-                    windows = []
+                chunk_duration = 10.0
+                for start, end in windows:
+                    for chunk_start in np.arange(start, end - chunk_duration, chunk_duration):
+                        try:
+                            emb = self.extract_embedding(
+                                ep_path, chunk_start, chunk_start + chunk_duration
+                            )
+                            profile.embeddings.append(emb)
+                        except Exception as e:
+                            console.print(f"    [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]")

-                    # Window 1: After intro (30s-90s) — usually host monologue
-                    if duration > 90:
-                        windows.append((30.0, 90.0))
+                profile.source_episodes.append(ep_path.name)

-                    # Window 2: Early show (2min-3min)
-                    if duration > 180:
-                        windows.append((120.0, 180.0))
-
-                    # Window 3: Mid show
-                    mid = duration / 2
-                    if mid > 60:
-                        windows.append((mid, min(mid + 60, duration)))
-
-                    # Window 4: Late show (but not last 2 min — likely outro)
-                    late = duration - 180
-                    if late > 300:
-                        windows.append((late, late + 60))
-
-                    for start, end in windows:
-                        # Extract 10-second chunks within each window
-                        # and take the embedding of each chunk
-                        chunk_duration = 10.0
-                        for chunk_start in np.arange(start, end - chunk_duration,
-                                                     chunk_duration):
-                            try:
-                                emb = self.extract_embedding(
-                                    ep_path, chunk_start,
-                                    chunk_start + chunk_duration
-                                )
-                                profile.embeddings.append(emb)
-                            except Exception as e:
-                                console.print(f"    [dim red]Chunk {chunk_start:.0f}s failed: {e}[/dim red]")
-                                continue
-
-                    profile.source_episodes.append(ep_path.name)
-
-                except Exception as e:
-                    console.print(f"  [red]Failed: {ep_path.name}: {e}[/red]")
-
-                progress.update(task, advance=1)
+            except Exception as e:
+                console.print(f"  [red]Failed: {ep_path.name}: {e}[/red]")

        # Compute composite
        profile.compute_composite()
@@ -305,58 +282,66 @@ class VoiceProfiler:
                          threshold: float = 0.70) -> list[VoiceSegment]:
        """Identify speakers throughout an audio file using sliding window.

+        Loads the full audio once then slices in memory — avoids spawning
+        hundreds of ffmpeg subprocesses.
        Returns timestamped segments with speaker labels and embeddings.
        """
        console.print(f"[bold]Identifying speakers:[/bold] {audio_path.name}")

        duration = self._get_duration(audio_path)
+        console.print(f"[dim]Loading audio into memory...[/dim]")
+        audio = self._load_full_audio(audio_path)  # float32 mono array
+        self._get_model()  # ensure model is warm before the loop
+
        segments = []
+        window_samples = int(window_s * SAMPLE_RATE)
+        hop_samples = int(hop_s * SAMPLE_RATE)
+        total_samples = len(audio)

-        with Progress(
-            SpinnerColumn(),
-            TextColumn("[progress.description]{task.description}"),
-            BarColumn(),
-            TextColumn("{task.percentage:>3.0f}%"),
-            TimeElapsedColumn(),
-            console=console,
-        ) as progress:
-            task = progress.add_task("Analyzing speakers...",
-                                     total=int(duration))
+        total_windows = int((duration - window_s) / hop_s) + 1
+        report_every = max(1, total_windows // 10)

-            for start in np.arange(0, duration - window_s, hop_s):
-                end = min(start + window_s, duration)
+        for idx, start in enumerate(np.arange(0, duration - window_s, hop_s)):
+            end = min(start + window_s, duration)
+            s = int(start * SAMPLE_RATE)
+            e = min(s + window_samples, total_samples)

-                try:
-                    emb = self.extract_embedding(audio_path, start, end)
+            try:
+                emb = self._embed_audio_np(audio[s:e])

-                    # Match against known profiles
-                    best_match = None
-                    best_score = 0.0
+                best_match = None
+                best_score = 0.0

-                    for name, profile in self.profiles.items():
-                        score = profile.similarity(emb)
-                        if score > best_score:
-                            best_score = score
-                            best_match = name
+                for name, profile in self.profiles.items():
+                    score = profile.similarity(emb)
+                    if score > best_score:
+                        best_score = score
+                        best_match = name

-                    label = best_match if best_score >= threshold else "Unknown"
+                if best_score >= threshold:
                    if best_match and self.profiles[best_match].role == "host":
                        label = f"Host: {best_match}"
+                    else:
+                        label = best_match
+                else:
+                    label = "Unknown"

-                    segments.append(VoiceSegment(
-                        start=start,
-                        end=end,
-                        embedding=emb,
-                        speaker_label=f"{label} ({best_score:.2f})",
-                    ))
+                segments.append(VoiceSegment(
+                    start=start,
+                    end=end,
+                    embedding=emb,
+                    speaker_label=f"{label} ({best_score:.2f})",
+                ))

-                except Exception:
-                    segments.append(VoiceSegment(
-                        start=start, end=end,
-                        speaker_label="[error]",
-                    ))
+            except Exception:
+                segments.append(VoiceSegment(
+                    start=start, end=end,
+                    speaker_label="[error]",
+                ))

-                progress.update(task, completed=int(end))
+            if idx % report_every == 0:
+                pct = int(end / duration * 100)
+                console.print(f"[dim]  {pct}% ({end:.0f}s / {duration:.0f}s)[/dim]")

        # Print summary
        self._print_speaker_summary(segments, duration)