# claudetools/projects/radio-show/audio-processor/test_content_generation.py

#!/usr/bin/env python3
"""Test content generation from a transcript using Ollama qwen3:14b.

Generates:
1. Episode analysis (summary, segments, topics, tags, quotes, blog candidates)
2. Sample forum discussion post
3. Sample blog post draft
"""

import json
import sys
import time
from pathlib import Path

import ollama

# Ollama model tag passed to every chat request made by this script.
MODEL = "qwen3:14b"
# Base URL of the Ollama server the shared client talks to.
OLLAMA_HOST = "http://localhost:11434"
# qwen3:14b supports 32k context -- use more of it
# Maximum number of transcript characters embedded in the episode-analysis
# prompt; anything beyond this is cut off before the prompt is sent.
MAX_TRANSCRIPT_CHARS = 40000

# Single client instance, created when the module is imported and shared by
# all queries in this script.
client = ollama.Client(host=OLLAMA_HOST)


def load_transcript(transcript_dir: str) -> str:
    """Return the text of ``transcript.txt`` located in *transcript_dir*.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding (which is not UTF-8 everywhere).

    Args:
        transcript_dir: Directory expected to contain ``transcript.txt``.

    Returns:
        The complete transcript text.

    Raises:
        SystemExit: With exit code 1, after printing an error message, when
            the transcript file does not exist.
    """
    txt_path = Path(transcript_dir) / "transcript.txt"
    if not txt_path.exists():
        print(f"ERROR: {txt_path} not found")
        sys.exit(1)
    return txt_path.read_text(encoding="utf-8")


def timed_query(label: str, prompt: str, temperature: float = 0.3) -> str:
    """Send *prompt* to the model as a single user message and time the call.

    A banner containing *label* is printed before the request, and the
    elapsed time plus the length of the reply are printed afterwards.

    Args:
        label: Human-readable name of the query, shown in the banner.
        prompt: Full prompt text sent as the only chat message.
        temperature: Sampling temperature forwarded in the request options.

    Returns:
        The ``content`` field of the message in the chat response.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"  {label}")
    print(banner)

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # (or go negative) if the system clock is adjusted while the model runs.
    start = time.perf_counter()

    response = client.chat(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        # num_ctx raises the context window used for the request to 32k tokens.
        options={"temperature": temperature, "num_ctx": 32768},
    )

    elapsed = time.perf_counter() - start
    result = response["message"]["content"]
    print(f"  [{elapsed:.1f}s, {len(result)} chars]")
    return result


def _extract_json_text(raw: str) -> str:
    """Return the part of a model reply that should hold the JSON payload."""
    # Remove any reasoning section first. It may itself contain code fences,
    # and stripping fences before it would select text from inside the
    # reasoning instead of the real answer that follows "</think>".
    # rpartition() leaves the text untouched when no closing tag is present.
    text = raw.rpartition("</think>")[2]

    # Then unwrap a markdown code fence if the model added one anyway.
    if "```json" in text:
        text = text.split("```json", 1)[1].split("```", 1)[0]
    elif "```" in text:
        text = text.split("```", 1)[1].split("```", 1)[0]
    return text


def generate_analysis(transcript: str) -> dict:
    """Ask the model for a structured JSON analysis of the episode.

    Only the first ``MAX_TRANSCRIPT_CHARS`` characters of *transcript* are
    included in the prompt.

    Args:
        transcript: Full transcript text of the episode.

    Returns:
        The parsed JSON object on success. If the reply is not valid JSON, or
        is valid JSON but not an object, a warning is printed and a dict of
        the form ``{"raw_response": <cleaned reply text>}`` is returned so
        callers can always treat the result as a dict.
    """
    prompt = f"""You are analyzing a transcript from "The Computer Guru Show", a live call-in
radio show hosted by Mike Swanson on AM1030 KVOI in Tucson, Arizona. The show covers
technology news, tips, and takes listener calls for free tech support.

Analyze this transcript and provide a JSON response with:

1. "summary": A 2-3 paragraph episode summary suitable for a podcast page. Write in third
   person. Be specific about topics and conversations.

2. "segment_summaries": Array of distinct topic segments discussed, each with:
   - "title": Compelling segment title
   - "summary": 3-5 sentence summary
   - "key_points": Array of key takeaway bullet points
   - "approximate_position": "early", "mid", or "late" in the show

3. "topics": Array of main topics discussed (short phrases)

4. "tags": Array of SEO-friendly tags (lowercase, hyphenated)

5. "key_quotes": Array of 3-5 notable/quotable moments, each with:
   - "quote": The exact quote text
   - "speaker": Who said it
   - "context": Brief context for why it's notable

6. "blog_post_candidates": Array of 2-3 topics worth expanding into full blog posts, each with:
   - "title": Proposed blog post title
   - "angle": The specific thesis or angle
   - "why": Why this deserves expansion (audience interest, SEO potential, etc.)
   - "key_points_to_expand": Array of points from the show to develop further

Respond ONLY with valid JSON. No markdown fencing, no explanation outside the JSON.

## Transcript

{transcript[:MAX_TRANSCRIPT_CHARS]}"""

    result = _extract_json_text(timed_query("Episode Analysis (JSON)", prompt))

    try:
        parsed = json.loads(result.strip())
    except json.JSONDecodeError as e:
        print(f"  WARNING: JSON parse failed: {e}")
        print(f"  Raw response (first 500 chars): {result[:500]}")
        return {"raw_response": result}

    # The prompt asks for an object; a list or scalar would break callers
    # that use dict methods on the result, so treat it like a failed parse.
    if not isinstance(parsed, dict):
        print(f"  WARNING: expected a JSON object, got {type(parsed).__name__}")
        return {"raw_response": result}
    return parsed


def generate_forum_post(transcript: str, analysis: dict) -> str:
    """Draft a community forum post for the episode.

    The prompt combines the ``summary`` and ``topics`` entries of *analysis*
    (empty when absent) with the first 8000 characters of *transcript*, and
    is sent with a temperature of 0.5.

    Args:
        transcript: Full transcript text of the episode.
        analysis: Episode analysis dict; only ``summary`` and ``topics`` are read.

    Returns:
        The forum post text produced by the model.
    """
    episode_summary = analysis.get("summary", "")
    topic_line = ', '.join(analysis.get("topics", []))
    # Only the opening part of the transcript is supplied as context.
    excerpt = transcript[:8000]

    prompt = f"""You are writing a forum discussion post for "The Computer Guru Show" community
forum. The tone should be conversational, engaging, and invite discussion. This is NOT a
formal article -- it's a community post that makes people want to comment.

Show info:
- Host: Mike Swanson ("The Computer Guru")
- Station: AM1030 KVOI, Tucson AZ
- Format: Live call-in tech show

Episode summary: {episode_summary}
Topics covered: {topic_line}

Write a forum discussion post with:
1. A brief, engaging hook (2-3 sentences about the most interesting thing from the episode)
2. Bullet list of topics covered (with one-line teasers, not full summaries)
3. 2-3 discussion questions that invite audience participation
4. A "Listen to the full episode" call-to-action at the end

Keep it under 300 words. Use a casual, friendly tone. No emojis.

Key transcript excerpts for context:
{excerpt}"""

    post = timed_query("Forum Discussion Post", prompt, temperature=0.5)
    return post


def generate_blog_post(transcript: str, candidate: dict) -> str:
    """Write a blog post draft for one blog-post candidate.

    The prompt is built from the candidate's ``title``, ``angle`` and
    ``key_points_to_expand`` entries (with fallbacks when missing) plus the
    first 12000 characters of *transcript*, and is sent with a temperature
    of 0.5.

    Args:
        transcript: Full transcript text of the episode.
        candidate: One entry of the analysis' blog post candidates.

    Returns:
        The blog post text produced by the model.
    """
    prompt_title = candidate.get('title', 'Untitled')
    angle = candidate.get('angle', '')
    # The points are embedded as a JSON array literal inside the prompt.
    points_json = json.dumps(candidate.get('key_points_to_expand', []))
    excerpt = transcript[:12000]

    prompt = f"""You are writing a blog post for the "Computer Guru Show" website
(radio.azcomputerguru.com). The author is Mike Swanson, a veteran IT professional and
radio host in Tucson, Arizona. His style is:
- Explains complex tech in plain English
- Uses analogies and humor
- Gives practical, actionable advice
- Takes strong positions on consumer rights and privacy
- Speaks directly to the reader

Write a blog post with this info:
- Title: {prompt_title}
- Angle: {angle}
- Points to expand: {points_json}

Format:
1. Engaging opening paragraph (hook the reader)
2. 3-5 sections with subheadings
3. Practical "what this means for you" section
4. Key Takeaways (bullet points)
5. Closing paragraph that ties back to the show

Target length: 800-1200 words. Write in first person as Mike Swanson.
Include a note at the bottom: "This topic was discussed on The Computer Guru Show.
Listen to the full episode for more."

Relevant transcript excerpts:
{excerpt}"""

    # The progress label uses its own placeholder when the title is missing.
    label = f"Blog Post: {candidate.get('title', '?')}"
    return timed_query(label, prompt, temperature=0.5)


def _slugify(title: str, max_len: int = 50) -> str:
    """Turn *title* into a lowercase, hyphen-separated, filename-safe slug."""
    # Every non-alphanumeric character (spaces, "/", ":", "?", quotes, ...)
    # becomes a word break, so the slug can never contain path separators or
    # characters that are invalid in file names.
    words = "".join(ch if ch.isalnum() else " " for ch in title.lower()).split()
    return "-".join(words)[:max_len].strip("-") or "draft"


def main():
    """Run the full content-generation pipeline for one transcript directory.

    The transcript directory is taken from the first command-line argument,
    falling back to a built-in sample path. Results are written to a
    ``generated`` sub-directory of it: ``analysis.json``, ``forum-post.md``
    and, when the analysis proposes blog post candidates, a
    ``blog-<slug>.md`` draft for the first one. All files are written as
    UTF-8 so model output with non-ASCII characters is saved the same way on
    every platform. Progress and the generated content are also printed.
    """
    transcript_dir = sys.argv[1] if len(sys.argv) > 1 else \
        "training-data/transcripts/2016-s8e42"

    print(f"Loading transcript from: {transcript_dir}")
    transcript = load_transcript(transcript_dir)
    print(f"Transcript length: {len(transcript)} chars ({len(transcript.splitlines())} lines)")
    print(f"Sending first {min(len(transcript), MAX_TRANSCRIPT_CHARS)} chars to LLM")

    # Output directory
    output_dir = Path(transcript_dir) / "generated"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Step 1: Analysis
    analysis = generate_analysis(transcript)
    with open(output_dir / "analysis.json", "w", encoding="utf-8") as f:
        json.dump(analysis, f, indent=2)
    print(f"\n  Saved: {output_dir}/analysis.json")

    # Print whichever parts of the analysis the model actually returned
    if "summary" in analysis:
        print("\n--- EPISODE SUMMARY ---")
        print(analysis["summary"])

    if "topics" in analysis:
        print("\n--- TOPICS ---")
        for t in analysis["topics"]:
            print(f"  - {t}")

    if "tags" in analysis:
        print("\n--- TAGS ---")
        print(f"  {', '.join(analysis['tags'])}")

    if "blog_post_candidates" in analysis:
        print("\n--- BLOG POST CANDIDATES ---")
        for i, c in enumerate(analysis["blog_post_candidates"], 1):
            print(f"  {i}. {c.get('title', '?')}")
            print(f"     Angle: {c.get('angle', '?')}")

    # Step 2: Forum post
    forum_post = generate_forum_post(transcript, analysis)
    (output_dir / "forum-post.md").write_text(forum_post, encoding="utf-8")
    print(f"\n  Saved: {output_dir}/forum-post.md")
    print("\n--- FORUM POST ---")
    print(forum_post)

    # Step 3: Blog post (pick the first candidate)
    candidates = analysis.get("blog_post_candidates", [])
    if candidates:
        blog_post = generate_blog_post(transcript, candidates[0])
        # The title is model-generated text, so sanitise it before using it
        # as part of a file name; a missing or empty title falls back to "draft".
        slug = _slugify(str(candidates[0].get("title") or "draft"))
        (output_dir / f"blog-{slug}.md").write_text(blog_post, encoding="utf-8")
        print(f"\n  Saved: {output_dir}/blog-{slug}.md")
        print("\n--- BLOG POST DRAFT ---")
        print(blog_post)
    else:
        print("\n  No blog post candidates found, skipping blog generation")

    print(f"\n{'='*60}")
    print(f"  All outputs saved to: {output_dir}/")
    print(f"{'='*60}")

if __name__ == "__main__":
    main()