Files
claudetools/projects/radio-show/audio-processor/src/analyzer.py
Mike Swanson a1e0442d8b Add radio show audio processor and post-show workflow
- Audio processor CLI tool with 6-stage pipeline: transcribe (faster-whisper GPU),
  diarize (pyannote), detect segments (multi-signal classifier), remove commercials,
  split segments, analyze content (Ollama)
- Post-show workflow doc for episode posts, forum threads, deep-dive blog posts
- Training plan for using 579-episode archive for voice profiles and commercial detection
- Successful test: 45min episode transcribed in 2:37 on RTX 5070 Ti
- Sample transcript output from S7E30 (March 2015)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 11:51:59 -07:00

188 lines
6.0 KiB
Python

"""Stage 6: Content analysis using Ollama for summary, topics, and post-show debrief."""
import json
from dataclasses import dataclass
from pathlib import Path
from rich.console import Console
# Module-level rich console; the status messages printed by this module go through it.
console = Console()
@dataclass
class EpisodeAnalysis:
    """Result of the content-analysis stage for a single episode.

    The structured fields mirror the keys requested from the LLM in
    ``analyze_episode``; ``debrief_draft`` holds the separately generated
    markdown debrief.
    """

    summary: str
    segment_summaries: list[dict]  # [{title, summary, key_points}]
    key_quotes: list[dict]  # [{quote, speaker, context}]
    topics: list[str]
    tags: list[str]
    blog_post_candidates: list[dict]  # [{title, angle, why}]
    debrief_draft: str  # Markdown debrief text

    def to_dict(self) -> dict:
        """Return the JSON-serializable part of the analysis.

        ``debrief_draft`` is not included here; ``save`` writes it to its
        own markdown file instead.
        """
        return {
            "summary": self.summary,
            "segment_summaries": self.segment_summaries,
            "key_quotes": self.key_quotes,
            "topics": self.topics,
            "tags": self.tags,
            "blog_post_candidates": self.blog_post_candidates,
        }

    def save(self, output_dir: Path) -> None:
        """Write ``analysis.json`` and ``post-show-debrief.md`` into *output_dir*.

        The directory (and any missing parents) is created if needed.
        """
        output_dir.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: the debrief is free-form LLM text and may contain
        # characters the platform's default locale encoding cannot represent.
        with open(output_dir / "analysis.json", "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)
        with open(output_dir / "post-show-debrief.md", "w", encoding="utf-8") as f:
            f.write(self.debrief_draft)
        console.print(f"[green]Analysis saved to {output_dir}[/green]")
def _strip_code_fences(text: str) -> str:
    """Return the body of the first markdown code fence in *text*.

    A ```json fence is preferred over a bare ``` fence; text that contains
    no fence is returned unchanged.
    """
    for opener in ("```json", "```"):
        if opener in text:
            return text.split(opener, 1)[1].split("```", 1)[0]
    return text


def _parse_analysis_json(text: str) -> dict | None:
    """Parse an LLM reply as a JSON object, or return None if none can be recovered."""
    candidates = [_strip_code_fences(text).strip()]
    # Second attempt: the outermost {...} span, so prose placed around the
    # JSON (without code fences) does not defeat parsing.
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (list, string, number) cannot
        # supply the expected keys, so it counts as a parse failure.
        if isinstance(parsed, dict):
            return parsed
    return None


def analyze_episode(transcript_text: str, diarization_data: dict | None = None,
                    show_prep: str | None = None, segments: list | None = None,
                    model: str = "qwen3:14b",
                    ollama_host: str = "http://localhost:11434") -> EpisodeAnalysis:
    """Analyze a transcribed episode using a local LLM served by Ollama.

    Two chat requests are made against the same context: one asking for a
    JSON analysis (summary, segments, topics, tags, quotes, blog ideas) and
    one asking for a markdown post-show debrief.

    Args:
        transcript_text: Episode transcript. Only the first 12000 characters
            are sent to the model; a warning is printed when it is longer.
        diarization_data: Optional dict; the values of its ``"speaker_map"``
            entry, if present and non-empty, are listed in the prompt.
        show_prep: Optional show-prep text; only the first 3000 characters
            are included in the prompt.
        segments: Accepted for interface compatibility; not used by this
            function.
        model: Ollama model name.
        ollama_host: Base URL of the Ollama server.

    Returns:
        An ``EpisodeAnalysis``. If the first reply cannot be parsed as a JSON
        object, ``summary`` holds the raw reply text and the list fields are
        empty.

    Raises:
        ImportError: If the ``ollama`` package is not installed.
        Errors raised by the Ollama client are not caught here and propagate
        to the caller.
    """
    # Function-scope import: ollama is only required when this stage runs.
    import ollama as ollama_client

    prep_limit = 3000
    transcript_limit = 12000

    console.print(f"[bold]Analyzing episode with {model}[/bold]")
    client = ollama_client.Client(host=ollama_host)

    # Build context for the LLM
    context_parts = []
    if show_prep:
        context_parts.append(f"## Show Prep (planned topics)\n\n{show_prep[:prep_limit]}")
    if len(transcript_text) > transcript_limit:
        # Make the truncation visible instead of silently analyzing a prefix.
        console.print(
            f"[yellow]Transcript is {len(transcript_text):,} chars; only the first "
            f"{transcript_limit:,} are sent to the model[/yellow]"
        )
    context_parts.append(f"## Transcript\n\n{transcript_text[:transcript_limit]}")
    if diarization_data:
        speakers = diarization_data.get("speaker_map", {})
        if speakers:
            speaker_info = "\n".join(f"- {v}" for v in speakers.values())
            context_parts.append(f"## Speakers Identified\n\n{speaker_info}")
    context = "\n\n---\n\n".join(context_parts)

    # Query 1: Episode summary and segment summaries
    summary_prompt = f"""You are analyzing a radio show episode transcript.
Provide a JSON response with:
1. "summary": A 2-3 paragraph episode summary suitable for a podcast episode page.
Write in third person. Be specific about topics discussed.
2. "segment_summaries": An array of objects, each with:
- "title": A compelling segment title
- "summary": 3-5 sentence summary
- "key_points": Array of key takeaway bullet points
3. "topics": Array of main topics discussed (short phrases)
4. "tags": Array of SEO-friendly tags (lowercase, hyphenated)
5. "key_quotes": Array of notable quotes, each with:
- "quote": The quote text
- "speaker": Who said it (if identifiable)
- "context": Brief context
6. "blog_post_candidates": Array of topics worth expanding into blog posts, each with:
- "title": Proposed blog post title
- "angle": The specific angle or thesis
- "why": Why this topic deserves expansion
Respond ONLY with valid JSON, no markdown fencing.
{context}"""
    console.print("[dim]Generating episode analysis...[/dim]")
    response = client.chat(
        model=model,
        messages=[{"role": "user", "content": summary_prompt}],
        options={"temperature": 0.3, "num_ctx": 16384},
    )

    # Parse LLM response; fall back to the raw text as the summary.
    response_text = response["message"]["content"]
    analysis_data = _parse_analysis_json(response_text)
    if analysis_data is None:
        console.print("[yellow]LLM response was not valid JSON, using raw text[/yellow]")
        analysis_data = {"summary": _strip_code_fences(response_text)}

    # Query 2: Generate debrief draft
    debrief_prompt = f"""Based on this radio show transcript, generate a post-show debrief
in markdown format. Compare what was discussed against the show prep (planned topics)
to identify what made it in, what was cut, and what was added.
Format:
# Post-Show Debrief
## Episode: [derive title from content]
## Air Date: [today's date if not clear]
### What Made It In
[For each planned segment, note: Used / Modified / Cut]
### What Changed Live
[Topics expanded, cut short, or reordered vs. prep]
### Caller/Audience Interaction
[Any caller topics or audience engagement noted in transcript]
### Unplanned Additions
[Topics not in prep that came up]
### Best Moments
[Most compelling segments or quotes]
### Topics That Deserve More
[Topics that were rushed or generated high interest]
### Suggested Blog Posts
[2-3 specific blog post ideas with proposed titles and angles]
{context}"""
    console.print("[dim]Generating debrief draft...[/dim]")
    debrief_response = client.chat(
        model=model,
        messages=[{"role": "user", "content": debrief_prompt}],
        options={"temperature": 0.4, "num_ctx": 16384},
    )
    debrief_text = debrief_response["message"]["content"]

    console.print("[green]Analysis complete[/green]")
    # `or` supplies the default for keys that are missing or explicitly null.
    return EpisodeAnalysis(
        summary=analysis_data.get("summary") or "",
        segment_summaries=analysis_data.get("segment_summaries") or [],
        key_quotes=analysis_data.get("key_quotes") or [],
        topics=analysis_data.get("topics") or [],
        tags=analysis_data.get("tags") or [],
        blog_post_candidates=analysis_data.get("blog_post_candidates") or [],
        debrief_draft=debrief_text,
    )