Add radio show audio processor and post-show workflow

- Audio processor CLI tool with 6-stage pipeline: transcribe (faster-whisper GPU), diarize (pyannote), detect segments (multi-signal classifier), remove commercials, split segments, analyze content (Ollama)
- Post-show workflow doc for episode posts, forum threads, deep-dive blog posts
- Training plan for using 579-episode archive for voice profiles and commercial detection
- Successful test: 45min episode transcribed in 2:37 on RTX 5070 Ti
- Sample transcript output from S7E30 (March 2015)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 11:51:59 -07:00
parent 98c3ee4225
commit 3cbb1b8aab
14 changed files with 2723 additions and 0 deletions
--- a/projects/radio-show/audio-processor/src/analyzer.py
+++ b/projects/radio-show/audio-processor/src/analyzer.py
@@ -0,0 +1,187 @@
+"""Stage 6: Content analysis using Ollama for summary, topics, and post-show debrief."""
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+from rich.console import Console
+
+# Module-level Rich console used for the status messages printed in this file.
+console = Console()
+
+
+@dataclass
+class EpisodeAnalysis:
+    """Structured result of the LLM content analysis for one episode.
+
+    The list-valued fields hold whatever the model returned under the
+    corresponding JSON keys. The element shapes noted per field are the
+    ones the analysis prompt asks for; they are not validated here.
+    """
+
+    summary: str
+    segment_summaries: list[dict]  # [{title, summary, key_points}]
+    key_quotes: list[dict]         # [{quote, speaker, context}]
+    topics: list[str]
+    tags: list[str]
+    blog_post_candidates: list[dict]  # [{title, angle, why}]
+    debrief_draft: str             # Markdown debrief template
+
+    def to_dict(self) -> dict:
+        """Return the JSON-serialisable fields as a plain dict.
+
+        ``debrief_draft`` is deliberately left out: ``save`` writes it to
+        its own markdown file instead of embedding it in the JSON.
+        """
+        return {
+            "summary": self.summary,
+            "segment_summaries": self.segment_summaries,
+            "key_quotes": self.key_quotes,
+            "topics": self.topics,
+            "tags": self.tags,
+            "blog_post_candidates": self.blog_post_candidates,
+        }
+
+    def save(self, output_dir: Path) -> None:
+        """Write ``analysis.json`` and ``post-show-debrief.md`` to *output_dir*.
+
+        The directory, including any missing parents, is created first.
+        Existing files with the same names are overwritten.
+        """
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        # Encoding is pinned to UTF-8: LLM-written text commonly contains
+        # characters (smart quotes, dashes, emoji) that the platform default
+        # encoding may be unable to represent, which would abort the write.
+        with open(output_dir / "analysis.json", "w", encoding="utf-8") as f:
+            json.dump(self.to_dict(), f, indent=2)
+
+        with open(output_dir / "post-show-debrief.md", "w", encoding="utf-8") as f:
+            f.write(self.debrief_draft)
+
+        console.print(f"[green]Analysis saved to {output_dir}[/green]")
+
+
+# Character budgets for the pieces of context sent to the model.
+_MAX_PREP_CHARS = 3000
+_MAX_TRANSCRIPT_CHARS = 12000
+
+
+def _strip_reply_wrappers(reply: str) -> str:
+    """Remove a leading reasoning block and markdown code fences from a reply."""
+    # Reasoning-style models may put a <think>...</think> section in front of
+    # the actual answer; only the text after the last closing tag is kept.
+    if "</think>" in reply:
+        reply = reply.split("</think>")[-1]
+
+    if "```json" in reply:
+        reply = reply.split("```json", 1)[1].split("```", 1)[0]
+    elif "```" in reply:
+        reply = reply.split("```", 1)[1].split("```", 1)[0]
+    return reply
+
+
+def _load_json_object(text: str) -> dict | None:
+    """Parse *text* as a JSON object, or return None if no object is found."""
+    stripped = text.strip()
+    candidates = [stripped]
+
+    # Second chance: the outermost {...} span, for replies that wrap the
+    # object in explanatory prose.
+    first, last = stripped.find("{"), stripped.rfind("}")
+    if 0 <= first < last:
+        inner = stripped[first:last + 1]
+        if inner != stripped:
+            candidates.append(inner)
+
+    for candidate in candidates:
+        try:
+            parsed = json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+        # Valid JSON that is not an object (list, string, number) has none of
+        # the expected keys, so it is treated the same as unparsable text.
+        if isinstance(parsed, dict):
+            return parsed
+    return None
+
+
+def analyze_episode(transcript_text: str, diarization_data: dict | None = None,
+                    show_prep: str | None = None, segments: list | None = None,
+                    model: str = "qwen3:14b",
+                    ollama_host: str = "http://localhost:11434") -> EpisodeAnalysis:
+    """Analyze a transcribed episode using a local LLM.
+
+    Two chat requests are sent to the Ollama server at *ollama_host*: one
+    asking for a JSON analysis (summary, segments, topics, tags, quotes,
+    blog post candidates) and one asking for a markdown post-show debrief.
+
+    Args:
+        transcript_text: Episode transcript. Only the first 12000 characters
+            are sent to the model.
+        diarization_data: Optional dict; if it has a non-empty
+            ``"speaker_map"`` mapping, its values are listed for the model.
+        show_prep: Optional show-prep notes. Only the first 3000 characters
+            are sent to the model.
+        segments: Accepted for interface compatibility; not used here.
+        model: Ollama model name.
+        ollama_host: Base URL of the Ollama server.
+
+    Returns:
+        An ``EpisodeAnalysis``. If the first reply contains no JSON object,
+        the reply text becomes ``summary`` and the list fields are empty.
+
+    Errors from the ``ollama`` client (import failure, connection or model
+    errors) are not caught and propagate to the caller.
+    """
+    import ollama as ollama_client
+
+    console.print(f"[bold]Analyzing episode with {model}[/bold]")
+
+    client = ollama_client.Client(host=ollama_host)
+
+    # Build context for the LLM
+    context_parts = []
+
+    if show_prep:
+        if len(show_prep) > _MAX_PREP_CHARS:
+            console.print(
+                f"[yellow]Show prep truncated to first {_MAX_PREP_CHARS} "
+                f"of {len(show_prep)} characters[/yellow]"
+            )
+        context_parts.append(
+            f"## Show Prep (planned topics)\n\n{show_prep[:_MAX_PREP_CHARS]}"
+        )
+
+    # Surface the truncation: the model only ever sees the opening part of
+    # a long transcript, which limits what the analysis can cover.
+    if len(transcript_text) > _MAX_TRANSCRIPT_CHARS:
+        console.print(
+            f"[yellow]Transcript truncated to first {_MAX_TRANSCRIPT_CHARS} "
+            f"of {len(transcript_text)} characters[/yellow]"
+        )
+    context_parts.append(
+        f"## Transcript\n\n{transcript_text[:_MAX_TRANSCRIPT_CHARS]}"
+    )
+
+    if diarization_data:
+        speakers = diarization_data.get("speaker_map", {})
+        if speakers:
+            speaker_info = "\n".join(f"- {v}" for v in speakers.values())
+            context_parts.append(f"## Speakers Identified\n\n{speaker_info}")
+
+    context = "\n\n---\n\n".join(context_parts)
+
+    # Query 1: Episode summary and segment summaries
+    summary_prompt = f"""You are analyzing a radio show episode transcript.
+Provide a JSON response with:
+
+1. "summary": A 2-3 paragraph episode summary suitable for a podcast episode page.
+   Write in third person. Be specific about topics discussed.
+
+2. "segment_summaries": An array of objects, each with:
+   - "title": A compelling segment title
+   - "summary": 3-5 sentence summary
+   - "key_points": Array of key takeaway bullet points
+
+3. "topics": Array of main topics discussed (short phrases)
+
+4. "tags": Array of SEO-friendly tags (lowercase, hyphenated)
+
+5. "key_quotes": Array of notable quotes, each with:
+   - "quote": The quote text
+   - "speaker": Who said it (if identifiable)
+   - "context": Brief context
+
+6. "blog_post_candidates": Array of topics worth expanding into blog posts, each with:
+   - "title": Proposed blog post title
+   - "angle": The specific angle or thesis
+   - "why": Why this topic deserves expansion
+
+Respond ONLY with valid JSON, no markdown fencing.
+
+{context}"""
+
+    console.print("[dim]Generating episode analysis...[/dim]")
+
+    response = client.chat(
+        model=model,
+        messages=[{"role": "user", "content": summary_prompt}],
+        options={"temperature": 0.3, "num_ctx": 16384},
+    )
+
+    # Parse LLM response, tolerating fences, reasoning preambles and prose.
+    response_text = _strip_reply_wrappers(response["message"]["content"])
+    analysis_data = _load_json_object(response_text)
+
+    if analysis_data is None:
+        console.print("[yellow]LLM response was not valid JSON, using raw text[/yellow]")
+        analysis_data = {
+            "summary": response_text,
+            "segment_summaries": [],
+            "topics": [],
+            "tags": [],
+            "key_quotes": [],
+            "blog_post_candidates": [],
+        }
+
+    # Query 2: Generate debrief draft
+    debrief_prompt = f"""Based on this radio show transcript, generate a post-show debrief
+in markdown format. Compare what was discussed against the show prep (planned topics)
+to identify what made it in, what was cut, and what was added.
+
+Format:
+
+# Post-Show Debrief
+## Episode: [derive title from content]
+## Air Date: [today's date if not clear]
+
+### What Made It In
+[For each planned segment, note: Used / Modified / Cut]
+
+### What Changed Live
+[Topics expanded, cut short, or reordered vs. prep]
+
+### Caller/Audience Interaction
+[Any caller topics or audience engagement noted in transcript]
+
+### Unplanned Additions
+[Topics not in prep that came up]
+
+### Best Moments
+[Most compelling segments or quotes]
+
+### Topics That Deserve More
+[Topics that were rushed or generated high interest]
+
+### Suggested Blog Posts
+[2-3 specific blog post ideas with proposed titles and angles]
+
+{context}"""
+
+    console.print("[dim]Generating debrief draft...[/dim]")
+
+    debrief_response = client.chat(
+        model=model,
+        messages=[{"role": "user", "content": debrief_prompt}],
+        options={"temperature": 0.4, "num_ctx": 16384},
+    )
+
+    debrief_text = debrief_response["message"]["content"]
+
+    console.print("[green]Analysis complete[/green]")
+
+    # ``or`` also covers keys the model emitted as explicit null, so the
+    # dataclass fields keep their declared str/list types.
+    return EpisodeAnalysis(
+        summary=analysis_data.get("summary") or "",
+        segment_summaries=analysis_data.get("segment_summaries") or [],
+        key_quotes=analysis_data.get("key_quotes") or [],
+        topics=analysis_data.get("topics") or [],
+        tags=analysis_data.get("tags") or [],
+        blog_post_candidates=analysis_data.get("blog_post_candidates") or [],
+        debrief_draft=debrief_text,
+    )