claudetools/projects/radio-show/audio-processor/src/analyzer.py

"""Stage 6: Content analysis using Ollama for summary, topics, and post-show debrief."""

import json
from dataclasses import dataclass
from pathlib import Path

from rich.console import Console

# Module-level rich console used by the code below for status/progress output.
console = Console()


@dataclass
class EpisodeAnalysis:
    """Structured result of the LLM content analysis for one episode.

    The list-of-dict fields hold whatever objects the model returned; the
    shapes noted next to each field are the ones requested in the analysis
    prompt and are not validated here.
    """

    summary: str
    segment_summaries: list[dict]  # [{title, summary, key_points}]
    key_quotes: list[dict]         # [{quote, speaker, context}]
    topics: list[str]
    tags: list[str]
    blog_post_candidates: list[dict]  # [{title, angle, why}]
    debrief_draft: str             # Markdown debrief template

    def to_dict(self) -> dict:
        """Return the JSON-serialisable analysis fields.

        ``debrief_draft`` is intentionally left out: ``save`` writes it to its
        own markdown file instead of embedding it in ``analysis.json``.
        """
        return {
            "summary": self.summary,
            "segment_summaries": self.segment_summaries,
            "key_quotes": self.key_quotes,
            "topics": self.topics,
            "tags": self.tags,
            "blog_post_candidates": self.blog_post_candidates,
        }

    def save(self, output_dir: Path) -> None:
        """Write ``analysis.json`` and ``post-show-debrief.md`` into *output_dir*.

        The directory (including missing parents) is created if needed.
        Existing files with those names are overwritten.
        """
        output_dir.mkdir(parents=True, exist_ok=True)

        # Explicit UTF-8: the platform default encoding (e.g. cp1252 on
        # Windows) can fail on, or mangle, non-ASCII characters that LLM
        # output routinely contains.
        with open(output_dir / "analysis.json", "w", encoding="utf-8") as f:
            json.dump(self.to_dict(), f, indent=2)

        with open(output_dir / "post-show-debrief.md", "w", encoding="utf-8") as f:
            f.write(self.debrief_draft)

        console.print(f"[green]Analysis saved to {output_dir}[/green]")


def _strip_code_fence(text: str) -> str:
    """Return the contents of the first markdown code fence in *text*, or *text* itself."""
    # Prefer an explicit ```json fence; otherwise fall back to any fence.
    for opener in ("```json", "```"):
        if opener in text:
            return text.split(opener, 1)[1].split("```", 1)[0]
    return text


def _parse_analysis_json(text: str) -> dict | None:
    """Decode *text* as a JSON object; return None when no object can be decoded."""
    stripped = text.strip()
    candidates = [stripped]

    # Second chance: the model may wrap the object in extra prose, so also
    # try the outermost {...} span when it differs from the full text.
    start, end = stripped.find("{"), stripped.rfind("}")
    if 0 <= start < end and (start, end) != (0, len(stripped) - 1):
        candidates.append(stripped[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Valid JSON that is not an object (list, string, number) cannot be
        # read with .get(), so it is treated the same as a parse failure.
        if isinstance(parsed, dict):
            return parsed
    return None


def analyze_episode(transcript_text: str, diarization_data: dict | None = None,
                    show_prep: str | None = None, segments: list | None = None,
                    model: str = "qwen3:14b",
                    ollama_host: str = "http://localhost:11434") -> EpisodeAnalysis:
    """Analyze a transcribed episode using a local LLM.

    Two chat requests are sent to the Ollama server: one asking for a JSON
    analysis (summary, segments, topics, tags, quotes, blog ideas) and one
    asking for a markdown post-show debrief.

    Args:
        transcript_text: Episode transcript. Only the first 12000 characters
            are included in the prompts.
        diarization_data: Optional dict; only the values of its
            ``"speaker_map"`` entry are used, listed as identified speakers.
        show_prep: Optional show-prep text. Only the first 3000 characters
            are included in the prompts.
        segments: Accepted for interface compatibility; not used by this
            function.
        model: Ollama model name.
        ollama_host: Base URL of the Ollama server.

    Returns:
        An ``EpisodeAnalysis``. If the first response cannot be decoded as a
        JSON object, its text becomes ``summary`` and the list fields are
        empty.

    Errors raised while importing or calling the ``ollama`` client are not
    caught here.
    """
    import ollama as ollama_client

    max_prep_chars = 3000
    max_transcript_chars = 12000

    console.print(f"[bold]Analyzing episode with {model}[/bold]")

    client = ollama_client.Client(host=ollama_host)

    # Build context for the LLM
    context_parts = []

    if show_prep:
        if len(show_prep) > max_prep_chars:
            console.print(
                f"[yellow]Show prep truncated to {max_prep_chars} of "
                f"{len(show_prep)} characters for analysis[/yellow]"
            )
        context_parts.append(
            f"## Show Prep (planned topics)\n\n{show_prep[:max_prep_chars]}"
        )

    if len(transcript_text) > max_transcript_chars:
        # Make the truncation visible: everything past the cut-off is
        # invisible to the model and will be missing from the analysis.
        console.print(
            f"[yellow]Transcript truncated to {max_transcript_chars} of "
            f"{len(transcript_text)} characters for analysis[/yellow]"
        )
    context_parts.append(f"## Transcript\n\n{transcript_text[:max_transcript_chars]}")

    if diarization_data:
        speakers = diarization_data.get("speaker_map", {})
        if speakers:
            speaker_info = "\n".join(f"- {v}" for v in speakers.values())
            context_parts.append(f"## Speakers Identified\n\n{speaker_info}")

    context = "\n\n---\n\n".join(context_parts)

    # Query 1: Episode summary and segment summaries
    summary_prompt = f"""You are analyzing a radio show episode transcript.
Provide a JSON response with:

1. "summary": A 2-3 paragraph episode summary suitable for a podcast episode page.
   Write in third person. Be specific about topics discussed.

2. "segment_summaries": An array of objects, each with:
   - "title": A compelling segment title
   - "summary": 3-5 sentence summary
   - "key_points": Array of key takeaway bullet points

3. "topics": Array of main topics discussed (short phrases)

4. "tags": Array of SEO-friendly tags (lowercase, hyphenated)

5. "key_quotes": Array of notable quotes, each with:
   - "quote": The quote text
   - "speaker": Who said it (if identifiable)
   - "context": Brief context

6. "blog_post_candidates": Array of topics worth expanding into blog posts, each with:
   - "title": Proposed blog post title
   - "angle": The specific angle or thesis
   - "why": Why this topic deserves expansion

Respond ONLY with valid JSON, no markdown fencing.

{context}"""

    console.print("[dim]Generating episode analysis...[/dim]")

    response = client.chat(
        model=model,
        messages=[{"role": "user", "content": summary_prompt}],
        options={"temperature": 0.3, "num_ctx": 16384},
    )

    # Parse LLM response; models often add a code fence despite the prompt.
    response_text = _strip_code_fence(response["message"]["content"])

    analysis_data = _parse_analysis_json(response_text)
    if analysis_data is None:
        console.print("[yellow]LLM response was not valid JSON, using raw text[/yellow]")
        analysis_data = {"summary": response_text}

    # Query 2: Generate debrief draft
    debrief_prompt = f"""Based on this radio show transcript, generate a post-show debrief
in markdown format. Compare what was discussed against the show prep (planned topics)
to identify what made it in, what was cut, and what was added.

Format:

# Post-Show Debrief
## Episode: [derive title from content]
## Air Date: [today's date if not clear]

### What Made It In
[For each planned segment, note: Used / Modified / Cut]

### What Changed Live
[Topics expanded, cut short, or reordered vs. prep]

### Caller/Audience Interaction
[Any caller topics or audience engagement noted in transcript]

### Unplanned Additions
[Topics not in prep that came up]

### Best Moments
[Most compelling segments or quotes]

### Topics That Deserve More
[Topics that were rushed or generated high interest]

### Suggested Blog Posts
[2-3 specific blog post ideas with proposed titles and angles]

{context}"""

    console.print("[dim]Generating debrief draft...[/dim]")

    debrief_response = client.chat(
        model=model,
        messages=[{"role": "user", "content": debrief_prompt}],
        options={"temperature": 0.4, "num_ctx": 16384},
    )

    debrief_text = debrief_response["message"]["content"]

    console.print("[green]Analysis complete[/green]")

    # `or` (rather than a .get default) also covers keys the model returned
    # explicitly as null.
    return EpisodeAnalysis(
        summary=analysis_data.get("summary") or "",
        segment_summaries=analysis_data.get("segment_summaries") or [],
        key_quotes=analysis_data.get("key_quotes") or [],
        topics=analysis_data.get("topics") or [],
        tags=analysis_data.get("tags") or [],
        blog_post_candidates=analysis_data.get("blog_post_candidates") or [],
        debrief_draft=debrief_text,
    )