radio: diarization pipeline fixes, benchmark setup, test episode set
- Fix voice_profiler threshold bug (HOST label overwrote Unknown unconditionally) - Audio preload optimization: single ffmpeg per episode, 149.5x realtime on 5070 Ti - WavLM threshold raised to 0.85 (Mike 0.90-0.99, callers 0.46-0.83) - Promo/bumper filter: weighted signature scoring, 42->27 clean Q&A pairs - Text-only Q&A fallback for episodes with no CALLER diarization labels - TRANSFORMERS_OFFLINE=1 to skip HuggingFace freshness checks - Add diarize_2018.py for targeted re-run + FTS5 rebuild - Add benchmark.py + BENCH_SETUP.md for GURU-BEAST-ROG (RTX 4090) comparison - Commit 9-episode training diarization.json outputs - Session log: 2026-04-27-diarization-pipeline.md Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
206
projects/radio-show/audio-processor/src/show_prep.py
Normal file
206
projects/radio-show/audio-processor/src/show_prep.py
Normal file
@@ -0,0 +1,206 @@
|
||||
"""
|
||||
Show prep generator: search the archive index for past caller topics,
|
||||
extract clips, and generate "then vs now" talking points via Ollama.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
from rich.console import Console
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich import box
|
||||
|
||||
from .indexer import ArchiveIndex, QAResult, SearchResult
|
||||
from .clip_extractor import extract_clips_for_results, format_timestamp
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
def _topic_slug(topic: str, max_len: int = 40) -> str:
    """Return a lowercase, filename-safe slug for *topic*, at most *max_len* chars."""
    # Path separators, characters reserved in Windows file names, and
    # whitespace are each replaced one-for-one with "-", so topics that were
    # already safe produce exactly the same slug as before.
    unsafe = ' /\\:*?"<>|\t\r\n'
    return "".join(
        "-" if (ch in unsafe or ord(ch) < 32) else ch
        for ch in topic.lower()
    )[:max_len]


def generate_show_prep(
    index: ArchiveIndex,
    topic: str,
    output_dir: Path,
    extract_clips: bool = True,
    ollama_host: str = "http://localhost:11434",
    ollama_model: str = "qwen3:14b",
    limit: int = 10,
) -> "Path | None":
    """
    Search the archive for past discussions of a topic.
    Extracts audio clips and generates "then vs now" talking points.

    Args:
        index: Archive index queried through ``search_qa`` and ``search``.
        topic: Search phrase; also used in the output file name.
        output_dir: Directory for the prep file (created if missing); clips
            are extracted into its ``clips`` subdirectory.
        extract_clips: When true and Q&A results exist, extract audio clips.
        ollama_host: Ollama server URL passed to the talking-point generator.
        ollama_model: Ollama model name passed to the talking-point generator.
        limit: Maximum number of results requested from each search.

    Returns:
        Path to the generated markdown prep file, or ``None`` when neither
        search returns any result (nothing is written in that case).
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    console.print(Panel.fit(f"[bold]Show Prep:[/bold] {topic}", border_style="blue"))

    # Search Q&A pairs first (caller exchanges)
    qa_results = index.search_qa(topic, limit=limit)
    # Also search raw segments (for monologue mentions)
    segment_results = index.search(topic, limit=limit)

    if not qa_results and not segment_results:
        console.print(f"[yellow]No results found for: {topic}[/yellow]")
        return None

    # Display results table
    _print_results_table(qa_results, segment_results, topic)

    # Extract clips
    clip_paths = {}
    if extract_clips and qa_results:
        clips_dir = output_dir / "clips"
        console.print(f"\n[dim]Extracting {len(qa_results)} clip(s)...[/dim]")
        clip_paths = extract_clips_for_results(qa_results, clips_dir)

    # Generate then-vs-now content via Ollama
    then_now = _generate_then_vs_now(topic, qa_results, segment_results,
                                     ollama_host, ollama_model)

    # Write markdown prep file; the slug keeps the name valid on every
    # platform even when the topic contains ":", "?", "\\" and similar.
    safe_topic = _topic_slug(topic)
    date_str = datetime.now().strftime("%Y-%m-%d")
    prep_path = output_dir / f"{date_str}-{safe_topic}-prep.md"

    _write_prep_file(prep_path, topic, qa_results, segment_results,
                     clip_paths, then_now)

    console.print(f"\n[bold green]Prep file:[/bold green] {prep_path}")
    return prep_path
|
||||
|
||||
|
||||
def _print_results_table(qa_results: list[QAResult], segment_results: list[SearchResult],
                         topic: str):
    """Print search hits for *topic* to the console.

    When Q&A results exist they are rendered as a table (date, timestamps,
    duration, truncated caller question, topic). Raw transcript mentions are
    listed only when there are no Q&A results at all.
    """
    if qa_results:
        table = Table(title=f"Caller Q&A — \"{topic}\"", box=box.SIMPLE, show_lines=True)
        table.add_column("Date", style="cyan", width=12)
        table.add_column("Timestamps", style="dim", width=14)
        table.add_column("Duration", style="dim", width=8)
        table.add_column("Caller asked", width=35)
        table.add_column("Topic", style="green", width=20)

        for r in qa_results:
            dur = r.duration()
            table.add_row(
                r.date or r.episode_id,
                r.timestamp_str(),
                f"{int(dur//60)}m{int(dur%60):02d}s",
                r.question_text[:80] + ("…" if len(r.question_text) > 80 else ""),
                r.topic or "—",
            )
        console.print(table)

    if segment_results and not qa_results:
        console.print(f"\n[dim]No structured Q&A found. Showing {len(segment_results)} "
                      f"transcript mentions:[/dim]")
        for r in segment_results:
            # Only mark the text as truncated when it actually was cut,
            # matching how the Q&A column above handles its ellipsis.
            ellipsis = "…" if len(r.text) > 100 else ""
            console.print(f"  [cyan]{r.date}[/cyan] [{r.timestamp_str()}] "
                          f"[dim]{r.speaker}[/dim]: {r.text[:100]}{ellipsis}")
|
||||
|
||||
|
||||
def _generate_then_vs_now(topic: str, qa_results: list, segment_results: list,
|
||||
ollama_host: str, model: str) -> str:
|
||||
try:
|
||||
import ollama
|
||||
client = ollama.Client(host=ollama_host)
|
||||
except ImportError:
|
||||
return "_Ollama not available — install with: pip install ollama_"
|
||||
|
||||
# Build context from past discussions
|
||||
past_context = ""
|
||||
for r in qa_results[:5]:
|
||||
date = r.date or r.episode_id
|
||||
past_context += f"\n[{date}] Caller: {r.question_text[:200]}\n"
|
||||
past_context += f"Host answer: {r.answer_text[:400]}\n"
|
||||
|
||||
if not past_context and segment_results:
|
||||
for r in segment_results[:5]:
|
||||
past_context += f"\n[{r.date}] {r.speaker}: {r.text[:300]}\n"
|
||||
|
||||
if not past_context:
|
||||
return ""
|
||||
|
||||
prompt = f"""You are helping prepare talking points for a technology radio show host.
|
||||
The host discussed "{topic}" in past episodes. Here are excerpts:
|
||||
|
||||
{past_context}
|
||||
|
||||
The host wants to do a new segment revisiting this topic.
|
||||
|
||||
Write talking points in this format:
|
||||
## What I Said Then
|
||||
- [2-3 bullets summarizing the past advice/position]
|
||||
|
||||
## What's Changed Since Then
|
||||
- [2-3 bullets on how the technology/situation has evolved]
|
||||
|
||||
## Why My Answer Is Different Now
|
||||
- [2-3 bullets on the updated recommendation/position]
|
||||
|
||||
## Suggested Opening
|
||||
[1-2 sentences the host can use to open the segment, referencing the old clip]
|
||||
|
||||
Keep it conversational, radio-friendly. Be specific about what actually changed."""
|
||||
|
||||
try:
|
||||
resp = client.chat(
|
||||
model=model,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
options={"temperature": 0.3},
|
||||
)
|
||||
return resp["message"]["content"]
|
||||
except Exception as e:
|
||||
return f"_Ollama generation failed: {e}_"
|
||||
|
||||
|
||||
def _write_prep_file(path: Path, topic: str, qa_results: list, segment_results: list,
|
||||
clip_paths: dict, then_now: str):
|
||||
lines = [
|
||||
f"# Show Prep: {topic}",
|
||||
f"",
|
||||
f"_Generated {datetime.now().strftime('%Y-%m-%d %H:%M')}_",
|
||||
f"",
|
||||
]
|
||||
|
||||
if qa_results:
|
||||
lines += [f"## Past Caller Exchanges ({len(qa_results)} found)", ""]
|
||||
for i, r in enumerate(qa_results):
|
||||
clip_info = ""
|
||||
if i in clip_paths:
|
||||
clip_info = f" — `{clip_paths[i].name}`"
|
||||
lines += [
|
||||
f"### {r.date or r.episode_id} — [{r.timestamp_str()}]{clip_info}",
|
||||
f"**Caller:** {r.question_text}",
|
||||
f"",
|
||||
f"**Host:** {r.answer_text[:600]}{'…' if len(r.answer_text) > 600 else ''}",
|
||||
f"",
|
||||
]
|
||||
|
||||
elif segment_results:
|
||||
lines += [f"## Transcript Mentions ({len(segment_results)} found)", ""]
|
||||
for r in segment_results:
|
||||
lines += [
|
||||
f"- **{r.date}** [{r.timestamp_str()}] ({r.speaker}): {r.text[:200]}",
|
||||
]
|
||||
lines.append("")
|
||||
|
||||
if then_now:
|
||||
lines += ["## Then vs Now", "", then_now, ""]
|
||||
|
||||
if clip_paths:
|
||||
lines += [
|
||||
"## Clips",
|
||||
"",
|
||||
f"Extracted to `clips/` — drag into Audition/Audacity:",
|
||||
"",
|
||||
]
|
||||
for i, p in clip_paths.items():
|
||||
if i < len(qa_results):
|
||||
r = qa_results[i]
|
||||
lines.append(f"- `{p.name}` — {r.date} [{r.timestamp_str()}]")
|
||||
lines.append("")
|
||||
|
||||
path.write_text("\n".join(lines), encoding="utf-8")
|
||||
Reference in New Issue
Block a user