- Audio processor CLI tool with 6-stage pipeline: transcribe (faster-whisper GPU), diarize (pyannote), detect segments (multi-signal classifier), remove commercials, split segments, analyze content (Ollama) - Post-show workflow doc for episode posts, forum threads, deep-dive blog posts - Training plan for using 579-episode archive for voice profiles and commercial detection - Successful test: 45min episode transcribed in 2:37 on RTX 5070 Ti - Sample transcript output from S7E30 (March 2015) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
188 lines
6.0 KiB
Python
188 lines
6.0 KiB
Python
"""Stage 6: Content analysis using Ollama for summary, topics, and post-show debrief."""
|
|
|
|
import json
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
from rich.console import Console
|
|
|
|
console = Console()
|
|
|
|
|
|
@dataclass
class EpisodeAnalysis:
    """Result of the content-analysis stage for a single episode.

    Holds the fields parsed from the LLM's JSON reply plus the separately
    generated markdown debrief, and knows how to persist both to disk.
    """

    summary: str
    segment_summaries: list[dict]  # [{title, summary, key_points}]
    key_quotes: list[dict]  # [{quote, speaker, context}]
    topics: list[str]
    tags: list[str]
    blog_post_candidates: list[dict]  # [{title, angle, why}]
    debrief_draft: str  # Markdown debrief template

    def to_dict(self) -> dict:
        """Return the fields that are written to ``analysis.json``.

        ``debrief_draft`` is deliberately left out: ``save`` writes it to its
        own markdown file instead of embedding it in the JSON.
        """
        return {
            "summary": self.summary,
            "segment_summaries": self.segment_summaries,
            "key_quotes": self.key_quotes,
            "topics": self.topics,
            "tags": self.tags,
            "blog_post_candidates": self.blog_post_candidates,
        }

    def save(self, output_dir: Path) -> None:
        """Write ``analysis.json`` and ``post-show-debrief.md`` into *output_dir*.

        The directory (and any missing parents) is created if needed. Both
        files are written as UTF-8 so that non-ASCII characters in the
        LLM-generated text do not depend on the platform's default encoding.
        """
        output_dir.mkdir(parents=True, exist_ok=True)

        with open(output_dir / "analysis.json", "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)

        with open(output_dir / "post-show-debrief.md", "w", encoding="utf-8") as f:
            f.write(self.debrief_draft)

        console.print(f"[green]Analysis saved to {output_dir}[/green]")
|
|
|
|
|
|
def _strip_code_fences(text: str) -> str:
    """Return the contents of the first markdown code fence, or *text* unchanged."""
    if "```json" in text:
        text = text.split("```json", 1)[1]
    elif "```" in text:
        text = text.split("```", 1)[1]
    else:
        return text
    return text.split("```", 1)[0]


def _parse_json_object(text: str) -> dict | None:
    """Best-effort decode of a JSON object from an LLM reply; None if impossible."""
    candidates = [text.strip()]

    # Models sometimes surround the JSON with a preamble or trailing chatter
    # despite the instructions; retry on the outermost {...} span.
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (list, string, number) is unusable
        # here because the caller looks fields up by key.
        if isinstance(parsed, dict):
            return parsed
    return None


def analyze_episode(transcript_text: str, diarization_data: dict | None = None,
                    show_prep: str | None = None, segments: list | None = None,
                    model: str = "qwen3:14b",
                    ollama_host: str = "http://localhost:11434") -> EpisodeAnalysis:
    """Analyze a transcribed episode using a local LLM.

    Two chat requests are sent to the Ollama server: one asking for a JSON
    analysis (summary, segments, topics, tags, quotes, blog ideas) and one
    asking for a markdown post-show debrief.

    Args:
        transcript_text: Episode transcript. Only the first 12000 characters
            are included in the prompt; a warning is printed when it is cut.
        diarization_data: Optional dict; the values of its ``"speaker_map"``
            entry, if any, are listed in the prompt as identified speakers.
        show_prep: Optional show-prep text; only the first 3000 characters
            are included in the prompt.
        segments: Accepted for interface compatibility; not used here.
        model: Ollama model name.
        ollama_host: Base URL of the Ollama server.

    Returns:
        An ``EpisodeAnalysis``. If the first reply cannot be decoded as a JSON
        object, its (fence-stripped) text becomes ``summary`` and every other
        analysis field is empty.

    Errors raised by the Ollama client (e.g. server unreachable) are not
    caught here and propagate to the caller.
    """
    import ollama as ollama_client

    console.print(f"[bold]Analyzing episode with {model}[/bold]")

    client = ollama_client.Client(host=ollama_host)

    # Character budgets that keep the prompt inside the num_ctx set below.
    prep_limit = 3000
    transcript_limit = 12000

    # Build context for the LLM
    context_parts = []

    if show_prep:
        if len(show_prep) > prep_limit:
            console.print(
                f"[yellow]Show prep truncated to first {prep_limit:,} of "
                f"{len(show_prep):,} characters[/yellow]"
            )
        context_parts.append(f"## Show Prep (planned topics)\n\n{show_prep[:prep_limit]}")

    if len(transcript_text) > transcript_limit:
        console.print(
            f"[yellow]Transcript truncated to first {transcript_limit:,} of "
            f"{len(transcript_text):,} characters[/yellow]"
        )
    context_parts.append(f"## Transcript\n\n{transcript_text[:transcript_limit]}")

    if diarization_data:
        speakers = diarization_data.get("speaker_map", {})
        if speakers:
            speaker_info = "\n".join(f"- {v}" for v in speakers.values())
            context_parts.append(f"## Speakers Identified\n\n{speaker_info}")

    context = "\n\n---\n\n".join(context_parts)

    # Query 1: Episode summary and segment summaries
    summary_prompt = f"""You are analyzing a radio show episode transcript.
Provide a JSON response with:

1. "summary": A 2-3 paragraph episode summary suitable for a podcast episode page.
Write in third person. Be specific about topics discussed.

2. "segment_summaries": An array of objects, each with:
- "title": A compelling segment title
- "summary": 3-5 sentence summary
- "key_points": Array of key takeaway bullet points

3. "topics": Array of main topics discussed (short phrases)

4. "tags": Array of SEO-friendly tags (lowercase, hyphenated)

5. "key_quotes": Array of notable quotes, each with:
- "quote": The quote text
- "speaker": Who said it (if identifiable)
- "context": Brief context

6. "blog_post_candidates": Array of topics worth expanding into blog posts, each with:
- "title": Proposed blog post title
- "angle": The specific angle or thesis
- "why": Why this topic deserves expansion

Respond ONLY with valid JSON, no markdown fencing.

{context}"""

    console.print("[dim]Generating episode analysis...[/dim]")

    response = client.chat(
        model=model,
        messages=[{"role": "user", "content": summary_prompt}],
        options={"temperature": 0.3, "num_ctx": 16384},
    )

    # Parse LLM response (models often add fences even when told not to)
    response_text = _strip_code_fences(response["message"]["content"])
    analysis_data = _parse_json_object(response_text)
    if analysis_data is None:
        console.print("[yellow]LLM response was not valid JSON, using raw text[/yellow]")
        analysis_data = {"summary": response_text}

    # Query 2: Generate debrief draft
    debrief_prompt = f"""Based on this radio show transcript, generate a post-show debrief
in markdown format. Compare what was discussed against the show prep (planned topics)
to identify what made it in, what was cut, and what was added.

Format:

# Post-Show Debrief
## Episode: [derive title from content]
## Air Date: [today's date if not clear]

### What Made It In
[For each planned segment, note: Used / Modified / Cut]

### What Changed Live
[Topics expanded, cut short, or reordered vs. prep]

### Caller/Audience Interaction
[Any caller topics or audience engagement noted in transcript]

### Unplanned Additions
[Topics not in prep that came up]

### Best Moments
[Most compelling segments or quotes]

### Topics That Deserve More
[Topics that were rushed or generated high interest]

### Suggested Blog Posts
[2-3 specific blog post ideas with proposed titles and angles]

{context}"""

    console.print("[dim]Generating debrief draft...[/dim]")

    debrief_response = client.chat(
        model=model,
        messages=[{"role": "user", "content": debrief_prompt}],
        options={"temperature": 0.4, "num_ctx": 16384},
    )

    debrief_text = debrief_response["message"]["content"]

    console.print("[green]Analysis complete[/green]")

    # `or` rather than a .get() default: the model may emit a key with null,
    # which .get() would pass through as None.
    return EpisodeAnalysis(
        summary=analysis_data.get("summary") or "",
        segment_summaries=analysis_data.get("segment_summaries") or [],
        key_quotes=analysis_data.get("key_quotes") or [],
        topics=analysis_data.get("topics") or [],
        tags=analysis_data.get("tags") or [],
        blog_post_candidates=analysis_data.get("blog_post_candidates") or [],
        debrief_draft=debrief_text,
    )
|