claudetools/projects/radio-show/audio-processor/test_segment_first.py

#!/usr/bin/env python3
"""Segment-first content generation test.

Architecture:
1. Split transcript at break markers (text-based detection)
2. Analyze each segment individually (full context, no truncation)
3. Cross-segment synthesis (callbacks, recurring topics, narrative arc)
4. Generate forum post and blog post from complete analysis
"""

import json
import re
import sys
import time
from pathlib import Path

import ollama

# Model tag passed as ``model=`` on every chat request made by this script.
MODEL = "qwen3:14b"
# Base URL handed to the Ollama client constructed below.
OLLAMA_HOST = "http://localhost:11434"

# Single module-level client, created at import time and shared by all queries.
client = ollama.Client(host=OLLAMA_HOST)

# Break markers — patterns that indicate commercial breaks.
# All patterns below are anchored at the start of the text, ignore case, and
# are applied with ``.match`` to individual stripped transcript lines.

# Phrase that opens a break.
BREAK_START = re.compile(
    r"^(We'll be right back|We will be right back)",
    re.IGNORECASE
)
# Phrases that close a break and resume show content.
# NOTE: the ``[Tt]`` character class is redundant given re.IGNORECASE.
BREAK_END = re.compile(
    r"^(Welcome back to [Tt]he Computer Guru|All right, if you'd like to be a part of the show)",
    re.IGNORECASE
)
# Station IDs and bumper text that appear during breaks.
# Lines matching this pattern are dropped wherever they occur, not only while
# a break is in progress.
BREAK_FILLER = re.compile(
    r"^(This is the Computer Guru Show on|This is a computer guru show|"
    r"Your computer guru|Whether you're dealing with|"
    r"Computer running slow|Has your machine somehow|"
    r"Be one with your operating system|"
    r"Listen in, chat in|Want your voice to be heard)",
    re.IGNORECASE
)


def load_transcript(transcript_dir: str) -> list[str]:
    """Load ``transcript.txt`` from *transcript_dir* as a list of lines.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding. The ``utf-8-sig`` codec is used so
    that a leading byte-order mark, if present, is dropped instead of ending
    up at the start of the first line (where it would defeat the
    start-anchored break patterns).

    Args:
        transcript_dir: Directory expected to contain ``transcript.txt``.

    Returns:
        The lines of the file without their line terminators.

    Raises:
        SystemExit: With exit code 1 when the transcript file does not exist.
    """
    txt_path = Path(transcript_dir) / "transcript.txt"
    if not txt_path.exists():
        print(f"ERROR: {txt_path} not found")
        sys.exit(1)
    return txt_path.read_text(encoding="utf-8-sig").splitlines()


def split_into_segments(lines: list[str]) -> list[dict]:
    """Partition a transcript into show segments, dropping commercial breaks.

    Lines are examined after stripping whitespace; blank lines are ignored.
    A line matching BREAK_START closes the current segment and opens a break.
    While in a break every line is discarded until one matches BREAK_END
    (that line is discarded too). Lines matching BREAK_FILLER are discarded
    wherever they appear.

    Each returned segment is a dict with these keys, in this order:
      - number: 1-based position of the segment
      - start_line: 1 for the first segment, otherwise the line number of the
        BREAK_END line that preceded the segment
      - end_line: line number just before the BREAK_START line that closed
        the segment, or ``len(lines)`` for the trailing segment
      - lines: the kept (stripped) content lines
      - text: the content lines joined with newlines
      - char_count: length of ``text``
    """
    finished: list[dict] = []

    def emit(content: list[str], first: int, last: int) -> None:
        # Segments with no surviving content are never recorded.
        if not content:
            return
        body = "\n".join(content)
        finished.append({
            "number": len(finished) + 1,
            "start_line": first,
            "end_line": last,
            "lines": content,
            "text": body,
            "char_count": len(body),
        })

    pending: list[str] = []
    first_line = 1
    on_break = False

    for line_no, raw in enumerate(lines, start=1):
        content = raw.strip()
        if content == "":
            continue

        if on_break:
            # Inside a break: only a BREAK_END line changes state, and the
            # line itself is transitional, so nothing is kept either way.
            if BREAK_END.match(content):
                on_break = False
                first_line = line_no
            continue

        if BREAK_START.match(content):
            emit(pending, first_line, line_no - 1)
            pending = []
            on_break = True
            continue

        if BREAK_FILLER.match(content):
            continue

        pending.append(content)

    # Whatever is still buffered forms the final segment.
    emit(pending, first_line, len(lines))
    return finished


def timed_query(label: str, prompt: str, temperature: float = 0.3,
                ctx_size: int = 32768) -> str:
    """Send *prompt* to the model as a single user message and report timing.

    A banner containing *label* is printed before the request, and the
    elapsed time and response length are printed afterwards.

    Args:
        label: Human-readable name of the query, shown in the banner.
        prompt: Full prompt text, sent as the only chat message.
        temperature: Passed to the model as the ``temperature`` option.
        ctx_size: Passed to the model as the ``num_ctx`` option.

    Returns:
        The response message content. When the content contains both a
        ``<think>`` tag and a ``</think>`` tag, only the stripped text after
        the last ``</think>`` is returned.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"  {label}")
    print(rule)

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    started = time.perf_counter()
    response = client.chat(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        options={"temperature": temperature, "num_ctx": ctx_size},
    )
    elapsed = time.perf_counter() - started

    result = response["message"]["content"]

    # Strip thinking tags if qwen3 uses them. An unterminated <think> block
    # (no closing tag) is left untouched.
    if "<think>" in result and "</think>" in result:
        result = result.split("</think>")[-1].strip()

    print(f"  [{elapsed:.1f}s, {len(result)} chars]")
    return result


def parse_json_response(text: str) -> dict:
    """Parse a JSON object out of an LLM response.

    Extraction is attempted in this order:
      1. the contents of the first ```` ```json ```` fence, if present;
      2. otherwise the contents of the first generic ```` ``` ```` fence;
      3. otherwise the whole text.
    If that candidate is not valid JSON, the span from its first ``{`` to its
    last ``}`` is tried, which covers responses that wrap the object in prose
    or put a language tag other than ``json`` after the fence.

    Args:
        text: Raw response text from the model.

    Returns:
        The parsed object. An empty dict is returned (after printing a
        warning) when no valid JSON is found or when the parsed value is not
        a JSON object, so callers can always use ``.get`` on the result.
    """
    if "```json" in text:
        text = text.split("```json", 1)[1].split("```", 1)[0]
    elif "```" in text:
        text = text.split("```", 1)[1].split("```", 1)[0]

    candidate = text.strip()
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError as e:
        parsed = None
        # Fallback: the model may have surrounded the object with chatter.
        start = candidate.find("{")
        end = candidate.rfind("}")
        if 0 <= start < end:
            try:
                parsed = json.loads(candidate[start:end + 1])
            except json.JSONDecodeError:
                parsed = None
        if parsed is None:
            print(f"  WARNING: JSON parse failed: {e}")
            print(f"  First 300 chars: {text[:300]}")
            return {}

    if not isinstance(parsed, dict):
        # Valid JSON but not an object (e.g. a bare list or string).
        print(f"  WARNING: expected a JSON object, got {type(parsed).__name__}")
        print(f"  First 300 chars: {text[:300]}")
        return {}
    return parsed


def analyze_segment(segment: dict, segment_count: int) -> dict:
    """Ask the model for a structured analysis of one segment.

    The prompt is assembled from three pieces: an introduction naming the
    segment, a fixed JSON schema the model is asked to fill in, and the full
    segment text. The raw reply is run through ``parse_json_response``.

    Args:
        segment: Segment dict; the ``number``, ``char_count`` and ``text``
            keys are read.
        segment_count: Total number of segments, used in the prompt and label.

    Returns:
        Whatever ``parse_json_response`` yields for the model's reply.
    """
    number = segment['number']

    intro = f"""You are analyzing segment {number} of {segment_count} from
"The Computer Guru Show", a live call-in radio show hosted by Mike Swanson on AM1030
KVOI in Tucson, Arizona. Co-host Rob is often present. The show takes listener calls
for free tech support and discusses tech news.

This is the COMPLETE transcript of this segment (nothing is truncated).
Analyze it and respond with JSON:

"""

    # Plain (non-f) literal, so the braces are written singly here.
    schema = """{
  "title": "Compelling segment title",
  "summary": "3-5 sentence summary of what happened in this segment",
  "key_points": ["array of key takeaway bullet points"],
  "topics": ["array of topics discussed"],
  "speakers": ["array of speakers heard (Mike, Rob, caller names if given)"],
  "caller_questions": ["array of specific questions callers asked, if any"],
  "key_quotes": [
    {"quote": "exact quote text", "speaker": "who said it", "context": "why notable"}
  ],
  "blog_worthy_topics": [
    {"topic": "topic name", "angle": "what makes it worth expanding", "details_from_show": "specific points Mike made that a blog post should include"}
  ],
  "callbacks": ["any references to earlier segments or topics discussed before the break"]
}"""

    outro = f"""

Respond ONLY with valid JSON.

## Segment {number} of {segment_count} — Full Transcript

{segment['text']}"""

    label = f"Segment {number}/{segment_count} ({segment['char_count']} chars)"
    raw_reply = timed_query(label, intro + schema + outro)
    return parse_json_response(raw_reply)


def _as_text_list(value) -> list[str]:
    """Coerce a loosely-typed JSON field into a list of strings.

    A list/tuple has each item converted with ``str``; a bare string becomes a
    one-element list; anything else (including ``None``) becomes ``[]``.
    """
    if isinstance(value, str):
        return [value]
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    return []


def cross_segment_synthesis(segment_analyses: list[dict], segments: list[dict]) -> dict:
    """Synthesize across all segments for episode-level analysis.

    Builds a compact text summary of every non-empty per-segment analysis,
    collects all ``blog_worthy_topics`` entries, and asks the model for an
    episode-level JSON synthesis.

    The per-segment analyses come from model output, so their field types are
    not trusted: list-valued fields that arrive as ``null``, a bare string, or
    a list with non-string items are coerced rather than raising ``TypeError``.

    Args:
        segment_analyses: One analysis dict per segment, in segment order.
            Empty or non-dict entries are skipped, but still count towards the
            segment numbering.
        segments: Unused; kept so existing callers keep working.

    Returns:
        Whatever ``parse_json_response`` yields for the model's reply.
    """
    segment_summaries = []
    all_blog_topics = []
    for i, analysis in enumerate(segment_analyses, 1):
        if not analysis or not isinstance(analysis, dict):
            continue
        # Build a compact summary of each segment for the synthesis prompt
        segment_summaries.append(
            f"### Segment {i}: {analysis.get('title', 'Unknown')}\n"
            f"Summary: {analysis.get('summary', 'N/A')}\n"
            f"Topics: {', '.join(_as_text_list(analysis.get('topics', [])))}\n"
            f"Speakers: {', '.join(_as_text_list(analysis.get('speakers', [])))}\n"
            f"Key points: {json.dumps(analysis.get('key_points', []))}\n"
            f"Callbacks: {json.dumps(analysis.get('callbacks', []))}"
        )
        blog_topics = analysis.get("blog_worthy_topics", [])
        if isinstance(blog_topics, list):
            all_blog_topics.extend(blog_topics)

    prompt = f"""You are producing the final episode analysis for "The Computer Guru Show".
Below are analyses of each individual segment. Your job is to synthesize them into a
cohesive episode-level view.

Respond with JSON:

{{
  "episode_title": "A compelling episode title that captures the main theme",
  "episode_summary": "2-3 paragraph summary of the entire episode. Be specific about topics, callers, and conversations. Write in third person, suitable for a podcast episode page.",
  "narrative_arc": "1 paragraph describing how the show flowed — what opened, how topics evolved, what closed it out",
  "recurring_themes": ["topics or ideas that came up across multiple segments"],
  "cross_segment_connections": ["specific callbacks or topic continuations across segments"],
  "all_topics": ["complete deduplicated list of every topic discussed"],
  "all_tags": ["SEO-friendly lowercase hyphenated tags"],
  "top_quotes": [
    {{"quote": "text", "speaker": "name", "context": "why notable", "segment": 1}}
  ],
  "blog_post_candidates": [
    {{
      "title": "Proposed blog post title",
      "angle": "specific thesis or angle",
      "why": "why this deserves expansion",
      "source_segments": [1, 2],
      "key_details_from_show": ["specific points, quotes, and examples from the show to include"]
    }}
  ]
}}

Respond ONLY with valid JSON.

## Per-Segment Analyses

{chr(10).join(segment_summaries)}

## Blog-Worthy Topics Identified Across All Segments

{json.dumps(all_blog_topics, indent=2)}"""

    result = timed_query("Cross-Segment Synthesis", prompt)
    return parse_json_response(result)


def generate_forum_post(synthesis: dict) -> str:
    """Produce the forum discussion post for an episode.

    Reads ``episode_title``, ``episode_summary``, ``all_topics`` and
    ``narrative_arc`` from *synthesis* (with fallbacks when missing), embeds
    them in the forum-post prompt, and returns the model's reply as-is.
    The query is run at temperature 0.5.
    """
    title = synthesis.get('episode_title', 'Unknown')
    summary = synthesis.get('episode_summary', '')
    topics_json = json.dumps(synthesis.get('all_topics', []))
    arc = synthesis.get('narrative_arc', '')

    prompt = f"""Write a community forum discussion post for "The Computer Guru Show" forum.

Episode title: {title}
Summary: {summary}
Topics: {topics_json}
Narrative arc: {arc}

Rules:
- Conversational, engaging tone that invites discussion
- Brief hook (2-3 sentences about the most interesting thing)
- Bullet list of topics with one-line teasers
- 2-3 discussion questions that invite audience participation
- "Listen to the full episode" call-to-action
- Under 300 words
- Casual, friendly tone
- No emojis
- No markdown headers larger than ###

Write the post now."""

    post = timed_query("Forum Post", prompt, temperature=0.5)
    return post


def generate_blog_post(synthesis: dict, candidate: dict,
                       segments: list[dict]) -> str:
    """Generate a blog post using the full segment transcripts for source material.

    The segments named in ``candidate["source_segments"]`` (1-based) supply
    the transcript excerpt, each truncated to 15000 characters. When none of
    them resolves to an existing segment, the first two segments are used
    instead, each truncated to 10000 characters.

    ``source_segments`` comes from model output, so it is normalised first: a
    scalar is treated as a one-item list, values such as ``"2"`` or ``2.0``
    are converted to ``int``, and anything unconvertible is ignored.

    Args:
        synthesis: Unused; kept so existing callers keep working.
        candidate: Blog candidate dict; ``source_segments``, ``title``,
            ``angle`` and ``key_details_from_show`` are read.
        segments: All segment dicts in order; ``text`` and ``number`` are read.

    Returns:
        The model's reply as-is. The query is run at temperature 0.5.
    """
    referenced_limit = 15000  # per-segment cap when the candidate names segments
    fallback_limit = 10000    # per-segment cap for the first-two fallback

    raw_nums = candidate.get("source_segments", [1])
    if not isinstance(raw_nums, (list, tuple)):
        raw_nums = [raw_nums]

    # Find the source segments referenced by the blog candidate
    parts = []
    for raw in raw_nums:
        try:
            num = int(raw)
        except (TypeError, ValueError):
            continue
        if 0 < num <= len(segments):
            parts.append(
                f"\n--- Segment {num} transcript ---\n"
                f"{segments[num-1]['text'][:referenced_limit]}\n"
            )

    # If no specific segments referenced, use the first two
    if not parts:
        parts = [
            f"\n--- Segment {seg['number']} transcript ---\n"
            f"{seg['text'][:fallback_limit]}\n"
            for seg in segments[:2]
        ]

    source_text = "".join(parts)

    prompt = f"""Write a blog post for the Computer Guru Show website (radio.azcomputerguru.com).
Author: Mike Swanson — veteran IT professional, radio host in Tucson AZ.

His writing style:
- Explains complex tech in plain English using analogies
- Uses humor — dry, self-deprecating, occasionally sarcastic
- Gives practical, actionable advice
- Takes strong positions on consumer rights, privacy, and corporate BS
- Speaks directly to the reader like a friend
- References real conversations from the show

Blog post details:
- Title: {candidate.get('title', 'Untitled')}
- Angle: {candidate.get('angle', '')}
- Key details from show: {json.dumps(candidate.get('key_details_from_show', []))}

Format:
1. Engaging opening paragraph (hook the reader with something from the show)
2. 3-5 sections with ### subheadings
3. "What This Means for You" practical section
4. Key Takeaways (bullet points)
5. Closing that ties back to the show conversation

Target: 800-1200 words. First person as Mike Swanson.
End with: "This topic was discussed on The Computer Guru Show. Listen to the full episode for more."

IMPORTANT: Draw directly from the transcript below. Use Mike's actual words, analogies, and
examples — not generic filler. If Mike made a joke or analogy on air, reference it in the post.

## Source transcript from the show:
{source_text}"""

    return timed_query(f"Blog: {candidate.get('title', '?')}", prompt, temperature=0.5)


def main():
    """Run the segment-first pipeline end to end for one transcript directory.

    The directory is taken from the first command-line argument, falling back
    to a built-in default. Steps: split the transcript into segments, analyze
    each segment, synthesize across segments, then generate a forum post and
    (when the synthesis proposes any) a blog post for the first candidate.
    All outputs are written to ``<transcript_dir>/generated-v2``.

    Output files are written as UTF-8 explicitly, because the generated
    markdown can contain characters the platform default encoding cannot
    represent. Fields read from model output are handled defensively so a
    malformed value does not abort the run after the model calls have been
    made.
    """
    def banner(title: str) -> None:
        # Three-line section header used between pipeline steps.
        print(f"\n{'='*60}")
        print(f"  {title}")
        print(f"{'='*60}")

    def write_json(path: Path, payload) -> None:
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2)

    transcript_dir = sys.argv[1] if len(sys.argv) > 1 else \
        "training-data/transcripts/2016-s8e42"

    print(f"Loading transcript from: {transcript_dir}")
    lines = load_transcript(transcript_dir)
    print(f"Total lines: {len(lines)}")

    # Step 1: Split into segments
    banner("STEP 1: Splitting into segments")
    segments = split_into_segments(lines)
    print(f"  Found {len(segments)} segments:\n")
    for seg in segments:
        print(f"  Segment {seg['number']}: lines {seg['start_line']}-{seg['end_line']}, "
              f"{seg['char_count']} chars, {len(seg['lines'])} lines")
        # Show first line as preview
        preview = seg['lines'][0][:80] if seg['lines'] else "(empty)"
        print(f"    Preview: {preview}")

    output_dir = Path(transcript_dir) / "generated-v2"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save segments for reference (without the per-line list, which
    # duplicates "text")
    segments_meta = [{k: v for k, v in s.items() if k != 'lines'} for s in segments]
    write_json(output_dir / "segments.json", segments_meta)

    # Step 2: Analyze each segment
    banner(f"STEP 2: Analyzing {len(segments)} segments individually")
    segment_analyses = []
    for seg in segments:
        analysis = analyze_segment(seg, len(segments))
        segment_analyses.append(analysis)

        # Save individual segment analysis
        write_json(output_dir / f"segment-{seg['number']}-analysis.json", analysis)

        if analysis:
            topics = analysis.get('topics', [])
            if not isinstance(topics, list):
                # Model output may give null or a bare string here.
                topics = [topics] if topics else []
            print(f"    Title: {analysis.get('title', '?')}")
            print(f"    Topics: {', '.join(str(t) for t in topics)}")

    # Step 3: Cross-segment synthesis
    banner("STEP 3: Cross-segment synthesis")
    synthesis = cross_segment_synthesis(segment_analyses, segments)
    write_json(output_dir / "synthesis.json", synthesis)

    if synthesis:
        print(f"\n  Episode title: {synthesis.get('episode_title', '?')}")
        print(f"  Recurring themes: {synthesis.get('recurring_themes', [])}")
        print("\n  Episode summary:")
        # str() guards the slice against a null or non-string summary.
        print(f"  {str(synthesis.get('episode_summary', 'N/A'))[:500]}")

    # Step 4: Generate forum post
    banner("STEP 4: Generate content")
    forum_post = generate_forum_post(synthesis)
    (output_dir / "forum-post.md").write_text(forum_post, encoding="utf-8")
    print("\n--- FORUM POST ---")
    print(forum_post)

    # Step 5: Generate blog post from best candidate
    candidates = synthesis.get("blog_post_candidates", [])
    if not isinstance(candidates, list):
        candidates = []
    # Only dict entries can be used as candidates.
    candidates = [c for c in candidates if isinstance(c, dict)]
    if candidates:
        best = candidates[0]
        blog_post = generate_blog_post(synthesis, best, segments)
        title = str(best.get("title", "draft"))
        # Trim dashes after truncating so the file name never ends in "-";
        # fall back to "draft" when the title has no usable characters.
        slug = re.sub(r'[^a-z0-9]+', '-', title.lower())[:50].strip('-') or "draft"
        (output_dir / f"blog-{slug}.md").write_text(blog_post, encoding="utf-8")
        print("\n--- BLOG POST ---")
        print(blog_post)

    # Summary
    banner(f"COMPLETE — All outputs in: {output_dir}/")
    print(f"  Segments analyzed: {len(segments)}")
    print(f"  Per-segment analyses: {sum(1 for a in segment_analyses if a)}")
    print(f"  Blog candidates: {len(candidates)}")
    print(f"  Files generated: {sum(1 for _ in output_dir.iterdir())}")


if __name__ == "__main__":
    main()