Add radio show audio processor and post-show workflow
- Audio processor CLI tool with a 6-stage pipeline: transcribe (faster-whisper, GPU), diarize (pyannote), detect segments (multi-signal classifier), remove commercials, split segments, analyze content (Ollama)
- Post-show workflow doc for episode posts, forum threads, and deep-dive blog posts
- Training plan for using the 579-episode archive for voice profiles and commercial detection
- Successful test: 45-minute episode transcribed in 2:37 on an RTX 5070 Ti
- Sample transcript output from S7E30 (March 2015)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
187
projects/radio-show/audio-processor/src/analyzer.py
Normal file
187
projects/radio-show/audio-processor/src/analyzer.py
Normal file
@@ -0,0 +1,187 @@
|
||||
"""Stage 6: Content analysis using Ollama for summary, topics, and post-show debrief."""
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from rich.console import Console
|
||||
|
||||
console = Console()
|
||||
|
||||
|
||||
@dataclass
class EpisodeAnalysis:
    """Result of the content-analysis stage for one episode.

    Holds the structured fields produced by the LLM plus a Markdown
    post-show debrief draft. ``save`` writes the structured fields to
    ``analysis.json`` and the debrief to ``post-show-debrief.md``.
    """

    summary: str
    segment_summaries: list[dict]  # [{title, summary, key_points}]
    key_quotes: list[dict]  # [{quote, speaker, context}] -- keys requested by the analysis prompt
    topics: list[str]
    tags: list[str]
    blog_post_candidates: list[dict]  # [{title, angle, why}]
    debrief_draft: str  # Markdown debrief template

    def to_dict(self) -> dict:
        """Return the structured fields as a JSON-serializable dict.

        ``debrief_draft`` is not included; ``save`` writes it to its own
        Markdown file.
        """
        return {
            "summary": self.summary,
            "segment_summaries": self.segment_summaries,
            "key_quotes": self.key_quotes,
            "topics": self.topics,
            "tags": self.tags,
            "blog_post_candidates": self.blog_post_candidates,
        }

    def save(self, output_dir: Path) -> None:
        """Write ``analysis.json`` and ``post-show-debrief.md`` into *output_dir*.

        The directory (and any missing parents) is created first. Existing
        files with the same names are overwritten.
        """
        output_dir.mkdir(parents=True, exist_ok=True)

        # Explicit UTF-8: the debrief is free-form LLM text, and the platform
        # default encoding is not guaranteed to be able to encode it.
        with open(output_dir / "analysis.json", "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)

        with open(output_dir / "post-show-debrief.md", "w", encoding="utf-8") as f:
            f.write(self.debrief_draft)

        console.print(f"[green]Analysis saved to {output_dir}[/green]")
|
||||
|
||||
|
||||
def analyze_episode(transcript_text: str, diarization_data: dict | None = None,
                    show_prep: str | None = None, segments: list | None = None,
                    model: str = "qwen3:14b",
                    ollama_host: str = "http://localhost:11434") -> EpisodeAnalysis:
    """Analyze a transcribed episode using a local LLM.

    Two chat requests are sent to the Ollama server at ``ollama_host``: the
    first asks for a JSON analysis (summary, segment summaries, topics, tags,
    key quotes, blog post candidates), the second for a Markdown post-show
    debrief.

    Args:
        transcript_text: Episode transcript. Only the first 12000 characters
            are included in the prompts.
        diarization_data: Optional dict; the values of its ``"speaker_map"``
            entry, if any, are listed for the model as identified speakers.
        show_prep: Optional show-prep text. Only the first 3000 characters
            are included in the prompts.
        segments: Accepted for interface compatibility; not used by this
            function.
        model: Name of the Ollama model to query.
        ollama_host: Base URL of the Ollama server.

    Returns:
        An ``EpisodeAnalysis``. When the first reply cannot be parsed as a
        JSON object, the reply text becomes ``summary`` and every list field
        is empty.
    """
    import ollama as ollama_client

    console.print(f"[bold]Analyzing episode with {model}[/bold]")

    client = ollama_client.Client(host=ollama_host)

    # Build context for the LLM
    context_parts = []

    if show_prep:
        context_parts.append(f"## Show Prep (planned topics)\n\n{show_prep[:3000]}")

    context_parts.append(f"## Transcript\n\n{transcript_text[:12000]}")

    if diarization_data:
        speakers = diarization_data.get("speaker_map", {})
        if speakers:
            speaker_info = "\n".join(f"- {v}" for v in speakers.values())
            context_parts.append(f"## Speakers Identified\n\n{speaker_info}")

    context = "\n\n---\n\n".join(context_parts)

    # Query 1: Episode summary and segment summaries
    summary_prompt = f"""You are analyzing a radio show episode transcript.
Provide a JSON response with:

1. "summary": A 2-3 paragraph episode summary suitable for a podcast episode page.
Write in third person. Be specific about topics discussed.

2. "segment_summaries": An array of objects, each with:
- "title": A compelling segment title
- "summary": 3-5 sentence summary
- "key_points": Array of key takeaway bullet points

3. "topics": Array of main topics discussed (short phrases)

4. "tags": Array of SEO-friendly tags (lowercase, hyphenated)

5. "key_quotes": Array of notable quotes, each with:
- "quote": The quote text
- "speaker": Who said it (if identifiable)
- "context": Brief context

6. "blog_post_candidates": Array of topics worth expanding into blog posts, each with:
- "title": Proposed blog post title
- "angle": The specific angle or thesis
- "why": Why this topic deserves expansion

Respond ONLY with valid JSON, no markdown fencing.

{context}"""

    console.print("[dim]Generating episode analysis...[/dim]")

    response = client.chat(
        model=model,
        messages=[{"role": "user", "content": summary_prompt}],
        options={"temperature": 0.3, "num_ctx": 16384},
    )

    # Parse LLM response: drop inline reasoning, then any markdown code fence.
    response_text = _extract_fenced(_strip_reasoning(response["message"]["content"]))

    analysis_data = _load_json_object(response_text)
    if analysis_data is None:
        # Not a JSON object (invalid JSON, or a bare list/string/number):
        # keep the text so the run still yields something reviewable.
        console.print("[yellow]LLM response was not valid JSON, using raw text[/yellow]")
        analysis_data = {
            "summary": response_text,
            "segment_summaries": [],
            "topics": [],
            "tags": [],
            "key_quotes": [],
            "blog_post_candidates": [],
        }

    # Query 2: Generate debrief draft
    debrief_prompt = f"""Based on this radio show transcript, generate a post-show debrief
in markdown format. Compare what was discussed against the show prep (planned topics)
to identify what made it in, what was cut, and what was added.

Format:

# Post-Show Debrief
## Episode: [derive title from content]
## Air Date: [today's date if not clear]

### What Made It In
[For each planned segment, note: Used / Modified / Cut]

### What Changed Live
[Topics expanded, cut short, or reordered vs. prep]

### Caller/Audience Interaction
[Any caller topics or audience engagement noted in transcript]

### Unplanned Additions
[Topics not in prep that came up]

### Best Moments
[Most compelling segments or quotes]

### Topics That Deserve More
[Topics that were rushed or generated high interest]

### Suggested Blog Posts
[2-3 specific blog post ideas with proposed titles and angles]

{context}"""

    console.print("[dim]Generating debrief draft...[/dim]")

    debrief_response = client.chat(
        model=model,
        messages=[{"role": "user", "content": debrief_prompt}],
        options={"temperature": 0.4, "num_ctx": 16384},
    )

    debrief_text = _strip_reasoning(debrief_response["message"]["content"])

    console.print("[green]Analysis complete[/green]")

    # The model's JSON is untrusted: coerce each field to the type the
    # dataclass declares instead of passing through null or a wrong type.
    return EpisodeAnalysis(
        summary=_as_text(analysis_data.get("summary", "")),
        segment_summaries=_as_list(analysis_data.get("segment_summaries", [])),
        key_quotes=_as_list(analysis_data.get("key_quotes", [])),
        topics=_as_list(analysis_data.get("topics", [])),
        tags=_as_list(analysis_data.get("tags", [])),
        blog_post_candidates=_as_list(analysis_data.get("blog_post_candidates", [])),
        debrief_draft=debrief_text,
    )


def _strip_reasoning(text: str) -> str:
    """Remove inline ``<think>...</think>`` blocks from an LLM reply."""
    import re

    # NOTE(review): reasoning models may emit their chain of thought inline,
    # depending on the Ollama version and settings -- confirm against the
    # deployed server. Text without such a block is returned unchanged.
    return re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL)


def _extract_fenced(text: str) -> str:
    """Return the contents of the first markdown code fence, or *text* as-is."""
    for marker in ("```json", "```"):
        if marker in text:
            return text.split(marker, 1)[1].split("```", 1)[0]
    return text


def _load_json_object(text: str) -> dict | None:
    """Parse *text* as a JSON object, tolerating prose around the braces.

    Returns ``None`` when no JSON object can be recovered.
    """
    candidates = [text.strip()]
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        # Second chance: the outermost {...} span, for replies that wrap the
        # object in explanatory text.
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def _as_list(value) -> list:
    """Return *value* if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


def _as_text(value) -> str:
    """Return *value* as a string; a list is joined as paragraphs, None is empty."""
    if isinstance(value, str):
        return value
    if isinstance(value, list):
        return "\n\n".join(str(part) for part in value)
    return "" if value is None else str(value)
|
||||
Reference in New Issue
Block a user