- Fix voice_profiler threshold bug (HOST label overwrote Unknown unconditionally)
- Audio preload optimization: single ffmpeg per episode, 149.5x realtime on 5070 Ti
- WavLM threshold raised to 0.85 (Mike 0.90-0.99, callers 0.46-0.83)
- Promo/bumper filter: weighted signature scoring, 42->27 clean Q&A pairs
- Text-only Q&A fallback for episodes with no CALLER diarization labels
- TRANSFORMERS_OFFLINE=1 to skip HuggingFace freshness checks
- Add diarize_2018.py for targeted re-run + FTS5 rebuild
- Add benchmark.py + BENCH_SETUP.md for GURU-BEAST-ROG (RTX 4090) comparison
- Commit 9-episode training diarization.json outputs
- Session log: 2026-04-27-diarization-pipeline.md

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
207 lines
7.0 KiB
Python
207 lines
7.0 KiB
Python
"""
|
|
Show prep generator: search the archive index for past caller topics,
|
|
extract clips, and generate "then vs now" talking points via Ollama.
|
|
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
from rich.console import Console
|
|
from rich.panel import Panel
|
|
from rich.table import Table
|
|
from rich import box
|
|
|
|
from .indexer import ArchiveIndex, QAResult, SearchResult
|
|
from .clip_extractor import extract_clips_for_results, format_timestamp
|
|
|
|
# Module-level Rich console; the functions in this module print their
# status lines and result tables through it.
console = Console()
|
|
|
|
|
|
def generate_show_prep(
    index: ArchiveIndex,
    topic: str,
    output_dir: Path,
    extract_clips: bool = True,
    ollama_host: str = "http://localhost:11434",
    ollama_model: str = "qwen3:14b",
    limit: int = 10,
) -> "Path | None":
    """
    Search the archive for past discussions of a topic.
    Extracts audio clips and generates "then vs now" talking points.

    Args:
        index: Archive index queried via ``search_qa`` and ``search``.
        topic: Search phrase; also used (sanitized) in the output filename.
        output_dir: Directory for the prep file; created if missing. Clips
            are written to its ``clips`` subdirectory.
        extract_clips: When true and Q&A results exist, extract audio clips.
        ollama_host: Base URL of the Ollama server.
        ollama_model: Model name passed to Ollama.
        limit: Maximum number of results requested from each search.

    Returns:
        Path to the generated markdown prep file, or ``None`` when neither
        search returned anything (no file is written in that case).
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    console.print(Panel.fit(f"[bold]Show Prep:[/bold] {topic}", border_style="blue"))

    # Search Q&A pairs first (caller exchanges)
    qa_results = index.search_qa(topic, limit=limit)
    # Also search raw segments (for monologue mentions)
    segment_results = index.search(topic, limit=limit)

    if not qa_results and not segment_results:
        console.print(f"[yellow]No results found for: {topic}[/yellow]")
        return None

    # Display results table
    _print_results_table(qa_results, segment_results, topic)

    # Extract clips
    clip_paths = {}
    if extract_clips and qa_results:
        clips_dir = output_dir / "clips"
        console.print(f"\n[dim]Extracting {len(qa_results)} clip(s)...[/dim]")
        clip_paths = extract_clips_for_results(qa_results, clips_dir)

    # Generate then-vs-now content via Ollama
    then_now = _generate_then_vs_now(topic, qa_results, segment_results,
                                     ollama_host, ollama_model)

    # Build a filename-safe slug. Besides spaces and "/", also neutralise
    # backslashes, the characters Windows forbids in filenames, and control
    # characters, so an arbitrary topic can neither break the write nor
    # point outside output_dir.
    unsafe_chars = set('\\/:*?"<>|')
    safe_topic = "".join(
        "-" if (ch.isspace() or ch in unsafe_chars or ord(ch) < 32) else ch
        for ch in topic.lower()
    )[:40] or "topic"
    date_str = datetime.now().strftime("%Y-%m-%d")
    prep_path = output_dir / f"{date_str}-{safe_topic}-prep.md"

    # Write markdown prep file
    _write_prep_file(prep_path, topic, qa_results, segment_results,
                     clip_paths, then_now)

    console.print(f"\n[bold green]Prep file:[/bold green] {prep_path}")
    return prep_path
|
|
|
|
|
|
def _print_results_table(qa_results: list[QAResult], segment_results: list[SearchResult],
                         topic: str):
    """Print the search hits for ``topic`` to the module console.

    Q&A results are rendered as a table (date, timestamps, duration, a
    question preview of up to 80 characters, and topic). Raw transcript
    segments are listed only when there are no Q&A results at all.

    Args:
        qa_results: Structured caller exchanges to tabulate.
        segment_results: Raw transcript hits, used as a fallback listing.
        topic: The search phrase, shown in the table title.
    """
    # Archive text is interpolated into Rich markup below. Escape it so that
    # bracketed tokens in transcripts (e.g. "[laughter]") are shown literally
    # instead of being parsed as style tags or raising a markup error.
    from rich.markup import escape

    if qa_results:
        table = Table(title=f"Caller Q&A — \"{escape(topic)}\"", box=box.SIMPLE, show_lines=True)
        table.add_column("Date", style="cyan", width=12)
        table.add_column("Timestamps", style="dim", width=14)
        table.add_column("Duration", style="dim", width=8)
        table.add_column("Caller asked", width=35)
        table.add_column("Topic", style="green", width=20)

        for r in qa_results:
            dur = r.duration()
            question = r.question_text[:80] + ("…" if len(r.question_text) > 80 else "")
            table.add_row(
                r.date or r.episode_id,
                r.timestamp_str(),
                f"{int(dur//60)}m{int(dur%60):02d}s",
                escape(question),
                escape(r.topic) if r.topic else "—",
            )
        console.print(table)

    if segment_results and not qa_results:
        console.print(f"\n[dim]No structured Q&A found. Showing {len(segment_results)} "
                      f"transcript mentions:[/dim]")
        for r in segment_results:
            # Only mark the preview as truncated when text was actually cut,
            # matching how the Q&A question preview is handled above.
            preview = r.text[:100] + ("…" if len(r.text) > 100 else "")
            console.print(f"  [cyan]{r.date}[/cyan]  [{r.timestamp_str()}]  "
                          f"[dim]{escape(str(r.speaker))}[/dim]: {escape(preview)}")
|
|
|
|
|
|
def _generate_then_vs_now(topic: str, qa_results: list, segment_results: list,
                          ollama_host: str, model: str) -> str:
    """Ask an Ollama model for "then vs now" talking points about ``topic``.

    Context is built from at most five Q&A results (question cut to 200
    characters, answer to 400). Only if that yields nothing are up to five
    raw segments used instead (text cut to 300 characters).

    Returns:
        The model's reply text; an empty string when there is no past
        context; or an italic markdown notice when the ``ollama`` package
        cannot be imported or the chat request fails.
    """
    try:
        import ollama
        client = ollama.Client(host=ollama_host)
    except ImportError:
        return "_Ollama not available — install with: pip install ollama_"

    # Collect excerpt fragments and join once at the end.
    excerpts = []
    for qa in qa_results[:5]:
        when = qa.date or qa.episode_id
        excerpts.append(f"\n[{when}] Caller: {qa.question_text[:200]}\n")
        excerpts.append(f"Host answer: {qa.answer_text[:400]}\n")

    # Fall back to raw transcript mentions only when no Q&A context exists.
    if not excerpts and segment_results:
        excerpts = [
            f"\n[{seg.date}] {seg.speaker}: {seg.text[:300]}\n"
            for seg in segment_results[:5]
        ]

    if not excerpts:
        return ""

    past_context = "".join(excerpts)

    prompt = f"""You are helping prepare talking points for a technology radio show host.
The host discussed "{topic}" in past episodes. Here are excerpts:

{past_context}

The host wants to do a new segment revisiting this topic.

Write talking points in this format:
## What I Said Then
- [2-3 bullets summarizing the past advice/position]

## What's Changed Since Then
- [2-3 bullets on how the technology/situation has evolved]

## Why My Answer Is Different Now
- [2-3 bullets on the updated recommendation/position]

## Suggested Opening
[1-2 sentences the host can use to open the segment, referencing the old clip]

Keep it conversational, radio-friendly. Be specific about what actually changed."""

    chat_messages = [{"role": "user", "content": prompt}]
    try:
        reply = client.chat(
            model=model,
            messages=chat_messages,
            options={"temperature": 0.3},
        )
        return reply["message"]["content"]
    except Exception as e:
        # Best effort: surface the failure in the prep file rather than abort.
        return f"_Ollama generation failed: {e}_"
|
|
|
|
|
|
def _write_prep_file(path: Path, topic: str, qa_results: list, segment_results: list,
                     clip_paths: dict, then_now: str):
    """Render the show-prep markdown document and write it to ``path``.

    Sections, in order: a title with generation time; either the past
    caller exchanges (answers cut to 600 characters) or, when there are
    none, the transcript mentions (text cut to 200 characters); the
    "Then vs Now" text if non-empty; and a list of extracted clips.

    Args:
        path: Destination file, written as UTF-8 (overwritten if present).
        topic: Topic shown in the document title.
        qa_results: Caller exchanges to list.
        segment_results: Transcript hits, used only when ``qa_results`` is empty.
        clip_paths: Maps a position in ``qa_results`` to that result's clip path.
        then_now: Pre-generated talking points; section omitted when empty.
    """
    lines = [
        f"# Show Prep: {topic}",
        "",
        f"_Generated {datetime.now().strftime('%Y-%m-%d %H:%M')}_",
        "",
    ]

    if qa_results:
        lines += [f"## Past Caller Exchanges ({len(qa_results)} found)", ""]
        for i, r in enumerate(qa_results):
            clip_info = f" — `{clip_paths[i].name}`" if i in clip_paths else ""
            lines += [
                f"### {r.date or r.episode_id} — [{r.timestamp_str()}]{clip_info}",
                f"**Caller:** {r.question_text}",
                "",
                f"**Host:** {r.answer_text[:600]}{'…' if len(r.answer_text) > 600 else ''}",
                "",
            ]

    elif segment_results:
        lines += [f"## Transcript Mentions ({len(segment_results)} found)", ""]
        for r in segment_results:
            lines.append(
                f"- **{r.date}** [{r.timestamp_str()}] ({r.speaker}): {r.text[:200]}"
            )
        lines.append("")

    if then_now:
        lines += ["## Then vs Now", "", then_now, ""]

    if clip_paths:
        lines += [
            "## Clips",
            "",
            "Extracted to `clips/` — drag into Audition/Audacity:",
            "",
        ]
        for i, p in clip_paths.items():
            # Skip clip entries that do not correspond to a listed result.
            if i < len(qa_results):
                r = qa_results[i]
                # Same date fallback as the exchange headings above, so an
                # undated episode shows its id instead of "None".
                lines.append(f"- `{p.name}` — {r.date or r.episode_id} [{r.timestamp_str()}]")
        lines.append("")

    path.write_text("\n".join(lines), encoding="utf-8")
|