radio: diarization pipeline fixes, benchmark setup, test episode set

- Fix voice_profiler threshold bug (HOST label overwrote Unknown unconditionally) - Audio preload optimization: single ffmpeg per episode, 149.5x realtime on 5070 Ti - WavLM threshold raised to 0.85 (Mike 0.90-0.99, callers 0.46-0.83) - Promo/bumper filter: weighted signature scoring, 42->27 clean Q&A pairs - Text-only Q&A fallback for episodes with no CALLER diarization labels - TRANSFORMERS_OFFLINE=1 to skip HuggingFace freshness checks - Add diarize_2018.py for targeted re-run + FTS5 rebuild - Add benchmark.py + BENCH_SETUP.md for GURU-BEAST-ROG (RTX 4090) comparison - Commit 9-episode training diarization.json outputs - Session log: 2026-04-27-diarization-pipeline.md Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-27 13:20:10 -07:00
parent 206cd2f929
commit 79abef9dc9
21 changed files with 4720 additions and 202 deletions
--- a/projects/radio-show/audio-processor/src/show_prep.py
+++ b/projects/radio-show/audio-processor/src/show_prep.py
@@ -0,0 +1,206 @@
+"""
+Show prep generator: search the archive index for past caller topics,
+extract clips, and generate "then vs now" talking points via Ollama.
+"""
+
+import json
+from pathlib import Path
+from datetime import datetime
+
+from rich.console import Console
+from rich.panel import Panel
+from rich.table import Table
+from rich import box
+
+from .indexer import ArchiveIndex, QAResult, SearchResult
+from .clip_extractor import extract_clips_for_results, format_timestamp
+
+console = Console()  # Shared Rich console: every status line and table in this module is printed through it.
+
+
def generate_show_prep(
    index: ArchiveIndex,
    topic: str,
    output_dir: Path,
    extract_clips: bool = True,
    ollama_host: str = "http://localhost:11434",
    ollama_model: str = "qwen3:14b",
    limit: int = 10,
) -> "Path | None":
    """
    Build a show-prep markdown file for ``topic`` from the archive index.

    Searches caller Q&A pairs and raw transcript segments, prints the hits to
    the console, optionally extracts audio clips for the Q&A hits, asks Ollama
    for "then vs now" talking points, and writes everything to a dated
    markdown file inside ``output_dir``.

    Args:
        index: Archive index, queried through ``search_qa`` and ``search``.
        topic: Search phrase; a sanitized form of it becomes part of the
            output file name.
        output_dir: Directory for the prep file and the ``clips/`` subfolder.
            Created (with parents) if it does not exist.
        extract_clips: When true and Q&A hits exist, the hits are handed to
            ``extract_clips_for_results`` with ``output_dir / "clips"``.
        ollama_host: Base URL of the Ollama server.
        ollama_model: Ollama model name used for the talking points.
        limit: Maximum number of hits requested from each search.

    Returns:
        Path of the written markdown file, or ``None`` when neither search
        returned anything (no prep file is written in that case).
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    console.print(Panel.fit(f"[bold]Show Prep:[/bold] {topic}", border_style="blue"))

    # Search Q&A pairs first (caller exchanges)
    qa_results = index.search_qa(topic, limit=limit)
    # Also search raw segments (for monologue mentions)
    segment_results = index.search(topic, limit=limit)

    if not qa_results and not segment_results:
        console.print(f"[yellow]No results found for: {topic}[/yellow]")
        return None

    # Display results table
    _print_results_table(qa_results, segment_results, topic)

    # Extract clips (only Q&A hits are passed to the extractor)
    clip_paths = {}
    if extract_clips and qa_results:
        clips_dir = output_dir / "clips"
        console.print(f"\n[dim]Extracting {len(qa_results)} clip(s)...[/dim]")
        clip_paths = extract_clips_for_results(qa_results, clips_dir)

    # Generate then-vs-now content via Ollama
    then_now = _generate_then_vs_now(topic, qa_results, segment_results,
                                      ollama_host, ollama_model)

    # Build a file-name-safe slug: keep letters, digits, "-" and "_"; every
    # other character (spaces, slashes, colons, quotes, wildcards, ...) becomes
    # "-" so the topic can never inject a path separator or a character that
    # is illegal in file names on some platforms.
    safe_topic = "".join(
        ch if ch.isalnum() or ch in "-_" else "-" for ch in topic.lower()
    )[:40]
    date_str = datetime.now().strftime("%Y-%m-%d")
    prep_path = output_dir / f"{date_str}-{safe_topic}-prep.md"

    # Write markdown prep file
    _write_prep_file(prep_path, topic, qa_results, segment_results,
                     clip_paths, then_now)

    console.print(f"\n[bold green]Prep file:[/bold green] {prep_path}")
    return prep_path
+
+
def _print_results_table(qa_results: list[QAResult], segment_results: list[SearchResult],
                          topic: str):
    """
    Print the search hits for ``topic`` to the shared console.

    When ``qa_results`` is non-empty a table of caller exchanges is printed
    (date or episode id, timestamps, duration, truncated question, topic).
    When there are no Q&A hits but ``segment_results`` is non-empty, each
    transcript mention is printed as a single line instead.

    All transcript-derived text is escaped before printing so that bracketed
    text inside it is shown literally rather than being parsed as Rich markup.
    """
    # Local import keeps this block self-contained; rich is already a
    # dependency of this module.
    from rich.markup import escape

    def shorten(text: str, limit: int) -> str:
        # Truncate first, then escape, so an escape sequence is never cut in half.
        clipped = escape(text[:limit])
        return clipped + "…" if len(text) > limit else clipped

    if qa_results:
        title = f'Caller Q&A — "{escape(topic)}"'
        table = Table(title=title, box=box.SIMPLE, show_lines=True)
        table.add_column("Date", style="cyan", width=12)
        table.add_column("Timestamps", style="dim", width=14)
        table.add_column("Duration", style="dim", width=8)
        table.add_column("Caller asked", width=35)
        table.add_column("Topic", style="green", width=20)

        for r in qa_results:
            dur = r.duration()
            table.add_row(
                r.date or r.episode_id,
                r.timestamp_str(),
                f"{int(dur//60)}m{int(dur%60):02d}s",
                shorten(r.question_text, 80),
                escape(r.topic) if r.topic else "—",
            )
        console.print(table)

    if segment_results and not qa_results:
        console.print(f"\n[dim]No structured Q&A found. Showing {len(segment_results)} "
                      f"transcript mentions:[/dim]")
        for r in segment_results:
            # The literal "[...]" around the timestamp is escaped as well so it
            # can never be mistaken for a markup tag.
            stamp = escape(f"[{r.timestamp_str()}]")
            console.print(f"  [cyan]{r.date}[/cyan] {stamp} "
                          f"[dim]{escape(str(r.speaker))}[/dim]: {shorten(r.text, 100)}")
+
+
def _generate_then_vs_now(topic: str, qa_results: list, segment_results: list,
                           ollama_host: str, model: str) -> str:
    """
    Ask Ollama for "then vs now" talking points based on past discussions.

    Context is built from up to five Q&A hits (question cut to 200 characters,
    answer to 400). If there are no Q&A hits, up to five transcript segments
    (cut to 300 characters) are used instead.

    Args:
        topic: Topic being revisited; quoted in the prompt.
        qa_results: Q&A hits; each needs ``date``, ``episode_id``,
            ``question_text`` and ``answer_text``.
        segment_results: Transcript hits; each needs ``date``, ``speaker``
            and ``text``.
        ollama_host: Base URL passed to ``ollama.Client``.
        model: Model name passed to ``client.chat``.

    Returns:
        The model's reply text; an empty string when there is nothing to
        summarise; or an italic markdown note when the ``ollama`` package is
        missing or the generation call fails.
    """
    # Build context from past discussions
    excerpts = [
        f"\n[{r.date or r.episode_id}] Caller: {r.question_text[:200]}\n"
        f"Host answer: {r.answer_text[:400]}\n"
        for r in qa_results[:5]
    ]
    if not excerpts:
        excerpts = [
            f"\n[{r.date}] {r.speaker}: {r.text[:300]}\n"
            for r in segment_results[:5]
        ]
    past_context = "".join(excerpts)

    # Nothing to summarise: return before touching Ollama at all, so a missing
    # package does not produce a misleading "install ollama" note.
    if not past_context:
        return ""

    try:
        import ollama
        client = ollama.Client(host=ollama_host)
    except ImportError:
        return "_Ollama not available — install with: pip install ollama_"

    prompt = f"""You are helping prepare talking points for a technology radio show host.
The host discussed "{topic}" in past episodes. Here are excerpts:

{past_context}

The host wants to do a new segment revisiting this topic.

Write talking points in this format:
## What I Said Then
- [2-3 bullets summarizing the past advice/position]

## What's Changed Since Then
- [2-3 bullets on how the technology/situation has evolved]

## Why My Answer Is Different Now
- [2-3 bullets on the updated recommendation/position]

## Suggested Opening
[1-2 sentences the host can use to open the segment, referencing the old clip]

Keep it conversational, radio-friendly. Be specific about what actually changed."""

    # Best effort: any failure talking to Ollama is reported inside the prep
    # file instead of aborting the whole prep run.
    try:
        resp = client.chat(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            options={"temperature": 0.3},
        )
        return resp["message"]["content"]
    except Exception as e:
        return f"_Ollama generation failed: {e}_"
+
+
def _write_prep_file(path: Path, topic: str, qa_results: list, segment_results: list,
                      clip_paths: dict, then_now: str):
    """
    Render the prep document as markdown and write it to ``path`` as UTF-8.

    Sections, in order: title and generation time; past caller exchanges (or,
    when there are none, transcript mentions); the "Then vs Now" text when
    non-empty; and a list of extracted clips when ``clip_paths`` is non-empty.

    Args:
        path: Destination markdown file; overwritten if it exists.
        topic: Topic shown in the document title.
        qa_results: Q&A hits; each needs ``date``, ``episode_id``,
            ``question_text``, ``answer_text`` and ``timestamp_str()``.
        segment_results: Transcript hits, used only when ``qa_results`` is
            empty; each needs ``date``, ``speaker``, ``text`` and
            ``timestamp_str()``.
        clip_paths: Mapping from an index into ``qa_results`` to the path of
            the clip extracted for that hit (only ``.name`` is read).
        then_now: Pre-generated talking points, inserted verbatim.
    """
    def episode_label(r) -> str:
        # Undated episodes fall back to their id so "None" is never printed.
        return r.date or r.episode_id

    lines = [
        f"# Show Prep: {topic}",
        "",
        f"_Generated {datetime.now().strftime('%Y-%m-%d %H:%M')}_",
        "",
    ]

    if qa_results:
        lines += [f"## Past Caller Exchanges ({len(qa_results)} found)", ""]
        for i, r in enumerate(qa_results):
            clip_info = f" — `{clip_paths[i].name}`" if i in clip_paths else ""
            answer = r.answer_text[:600] + ("…" if len(r.answer_text) > 600 else "")
            lines += [
                f"### {episode_label(r)} — [{r.timestamp_str()}]{clip_info}",
                f"**Caller:** {r.question_text}",
                "",
                f"**Host:** {answer}",
                "",
            ]

    elif segment_results:
        lines += [f"## Transcript Mentions ({len(segment_results)} found)", ""]
        for r in segment_results:
            # Mark truncated excerpts the same way host answers are marked.
            excerpt = r.text[:200] + ("…" if len(r.text) > 200 else "")
            lines.append(
                f"- **{r.date}** [{r.timestamp_str()}] ({r.speaker}): {excerpt}"
            )
        lines.append("")

    if then_now:
        lines += ["## Then vs Now", "", then_now, ""]

    if clip_paths:
        lines += [
            "## Clips",
            "",
            "Extracted to `clips/` — drag into Audition/Audacity:",
            "",
        ]
        for i, p in clip_paths.items():
            # Ignore clip indices that do not correspond to a listed Q&A hit.
            if i < len(qa_results):
                r = qa_results[i]
                lines.append(f"- `{p.name}` — {episode_label(r)} [{r.timestamp_str()}]")
        lines.append("")

    path.write_text("\n".join(lines), encoding="utf-8")