#!/usr/bin/env python3
"""Segment-first content generation test.

Architecture:
1. Split transcript at break markers (text-based detection)
2. Analyze each segment individually (full context, no truncation)
3. Cross-segment synthesis (callbacks, recurring topics, narrative arc)
4. Generate forum post and blog post from complete analysis
"""

import json
import re
import sys
import time
from pathlib import Path

import ollama

MODEL = "qwen3:14b"
OLLAMA_HOST = "http://localhost:11434"

client = ollama.Client(host=OLLAMA_HOST)

# Closing tag of the reasoning block that qwen3 may emit before its answer.
# Everything up to and including the last occurrence is discarded.
THINK_CLOSE_TAG = "</think>"

# Break markers — patterns that indicate commercial breaks
BREAK_START = re.compile(
    r"^(We'll be right back|We will be right back)",
    re.IGNORECASE
)
BREAK_END = re.compile(
    r"^(Welcome back to [Tt]he Computer Guru|All right, if you'd like to be a part of the show)",
    re.IGNORECASE
)

# Station IDs and bumper text that appear during breaks
BREAK_FILLER = re.compile(
    r"^(This is the Computer Guru Show on|This is a computer guru show|"
    r"Your computer guru|Whether you're dealing with|"
    r"Computer running slow|Has your machine somehow|"
    r"Be one with your operating system|"
    r"Listen in, chat in|Want your voice to be heard)",
    re.IGNORECASE
)


def _join_strs(values: object) -> str:
    """Comma-join *values* for display.

    LLM output is not guaranteed to be a list of strings, so items are passed
    through ``str()`` and a non-list value is rendered as-is instead of raising.
    """
    if isinstance(values, (list, tuple)):
        return ", ".join(str(v) for v in values)
    return "" if values is None else str(values)


def load_transcript(transcript_dir: str) -> list[str]:
    """Load ``transcript.txt`` from *transcript_dir* as a list of lines.

    Prints an error and exits with status 1 if the file does not exist.
    """
    txt_path = Path(transcript_dir) / "transcript.txt"
    if not txt_path.exists():
        print(f"ERROR: {txt_path} not found")
        sys.exit(1)
    # Explicit UTF-8 so the result does not depend on the platform locale;
    # undecodable bytes are replaced rather than aborting the whole run.
    return txt_path.read_text(encoding="utf-8", errors="replace").splitlines()


def _build_segment(number: int, start_line: int, end_line: int,
                   lines: list[str]) -> dict:
    """Assemble one segment record from its content lines."""
    text = "\n".join(lines)
    return {
        "number": number,
        "start_line": start_line,
        "end_line": end_line,
        "lines": lines,
        "text": text,
        "char_count": len(text),
    }


def split_into_segments(lines: list[str]) -> list[dict]:
    """Split transcript lines into show segments, removing commercial breaks.

    Returns list of segments, each with:
    - number: segment number (1-based)
    - start_line: 1 for the first segment, otherwise the line number of the
      break-end marker that opened the segment (the marker itself is dropped)
    - end_line: line number just before the break-start marker that closed the
      segment, or the total line count for the final segment
    - lines: list of stripped text lines (show content only)
    - text: the lines joined with newlines
    - char_count: length of ``text``

    Blank lines, everything between a break-start and the next break-end
    marker, and any line matching BREAK_FILLER are excluded from the content.
    """
    segments = []
    current_segment_lines = []
    current_start = 1
    in_break = False

    for i, line in enumerate(lines, 1):
        stripped = line.strip()
        if not stripped:
            continue

        # Detect break start
        if not in_break and BREAK_START.match(stripped):
            # Save current segment if it has content
            if current_segment_lines:
                segments.append(_build_segment(
                    len(segments) + 1, current_start, i - 1, current_segment_lines
                ))
            in_break = True
            # Fresh list: the one just stored must not be mutated afterwards.
            current_segment_lines = []
            continue

        # Detect break end
        if in_break and BREAK_END.match(stripped):
            in_break = False
            current_start = i
            # Don't include the "welcome back" line itself — it's transitional
            continue

        # Skip break filler (station IDs, bumper text during breaks)
        if in_break or BREAK_FILLER.match(stripped):
            continue

        # Regular show content
        current_segment_lines.append(stripped)

    # Don't forget the last segment
    if current_segment_lines:
        segments.append(_build_segment(
            len(segments) + 1, current_start, len(lines), current_segment_lines
        ))

    return segments


def timed_query(label: str, prompt: str, temperature: float = 0.3,
                ctx_size: int = 32768) -> str:
    """Run an Ollama query with timing.

    Sends *prompt* as a single user message to MODEL through the module-level
    client, prints a banner with *label* before the call and the elapsed time
    and response length after it, and returns the response text.

    If the response contains a ``</think>`` tag, only the text after the last
    one is returned (stripped); otherwise the response is returned unchanged.
    """
    print(f"\n{'='*60}")
    print(f" {label}")
    print(f"{'='*60}")

    start = time.perf_counter()
    response = client.chat(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        options={"temperature": temperature, "num_ctx": ctx_size},
    )
    elapsed = time.perf_counter() - start
    result = response["message"]["content"]

    # Strip thinking tags if qwen3 uses them. The separator must be the
    # non-empty closing tag: str.split("") raises ValueError.
    if THINK_CLOSE_TAG in result:
        result = result.rpartition(THINK_CLOSE_TAG)[2].strip()

    print(f" [{elapsed:.1f}s, {len(result)} chars]")
    return result


def parse_json_response(text: str) -> dict:
    """Parse JSON from LLM response, handling markdown fences.

    If the text contains a fenced block (```json or a bare ```), only the
    content of the first such block is parsed. Returns ``{}`` (after printing
    a warning) when the text is not valid JSON or the parsed value is not a
    JSON object, so callers can always use ``.get`` on the result.
    """
    if "```json" in text:
        text = text.split("```json", 1)[1].split("```", 1)[0]
    elif "```" in text:
        text = text.split("```", 1)[1].split("```", 1)[0]

    try:
        parsed = json.loads(text.strip())
    except json.JSONDecodeError as e:
        print(f" WARNING: JSON parse failed: {e}")
        print(f" First 300 chars: {text[:300]}")
        return {}

    if not isinstance(parsed, dict):
        print(f" WARNING: expected a JSON object, got {type(parsed).__name__}")
        return {}
    return parsed


def analyze_segment(segment: dict, segment_count: int) -> dict:
    """Analyze a single segment with full context.

    Builds a prompt containing the segment's complete text, queries the model,
    and returns the parsed JSON analysis (``{}`` if parsing failed).
    """
    prompt = f"""You are analyzing segment {segment['number']} of {segment_count} from "The Computer Guru Show", a live call-in radio show hosted by Mike Swanson on AM1030 KVOI in Tucson, Arizona. Co-host Rob is often present. The show takes listener calls for free tech support and discusses tech news.

This is the COMPLETE transcript of this segment (nothing is truncated). Analyze it and respond with JSON:

{{
    "title": "Compelling segment title",
    "summary": "3-5 sentence summary of what happened in this segment",
    "key_points": ["array of key takeaway bullet points"],
    "topics": ["array of topics discussed"],
    "speakers": ["array of speakers heard (Mike, Rob, caller names if given)"],
    "caller_questions": ["array of specific questions callers asked, if any"],
    "key_quotes": [
        {{"quote": "exact quote text", "speaker": "who said it", "context": "why notable"}}
    ],
    "blog_worthy_topics": [
        {{"topic": "topic name", "angle": "what makes it worth expanding", "details_from_show": "specific points Mike made that a blog post should include"}}
    ],
    "callbacks": ["any references to earlier segments or topics discussed before the break"]
}}

Respond ONLY with valid JSON.

## Segment {segment['number']} of {segment_count} — Full Transcript

{segment['text']}"""

    result = timed_query(
        f"Segment {segment['number']}/{segment_count} ({segment['char_count']} chars)",
        prompt
    )
    return parse_json_response(result)


def cross_segment_synthesis(segment_analyses: list[dict], segments: list[dict]) -> dict:
    """Synthesize across all segments for episode-level analysis.

    *segment_analyses* is expected in segment order; empty analyses (failed
    parses) are skipped but still count towards the segment numbering.
    *segments* is accepted for interface compatibility and is not used here.
    Returns the parsed JSON synthesis (``{}`` if parsing failed).
    """
    # Build a compact summary of each segment for the synthesis prompt
    segment_summaries = []
    all_blog_topics = []
    for i, analysis in enumerate(segment_analyses, 1):
        if not analysis:
            continue
        segment_summaries.append(
            f"### Segment {i}: {analysis.get('title', 'Unknown')}\n"
            f"Summary: {analysis.get('summary', 'N/A')}\n"
            f"Topics: {_join_strs(analysis.get('topics', []))}\n"
            f"Speakers: {_join_strs(analysis.get('speakers', []))}\n"
            f"Key points: {json.dumps(analysis.get('key_points', []))}\n"
            f"Callbacks: {json.dumps(analysis.get('callbacks', []))}"
        )
        blog_topics = analysis.get("blog_worthy_topics", [])
        # Extending with a string would add it character by character.
        if isinstance(blog_topics, list):
            all_blog_topics.extend(blog_topics)

    prompt = f"""You are producing the final episode analysis for "The Computer Guru Show".

Below are analyses of each individual segment. Your job is to synthesize them into a cohesive episode-level view.

Respond with JSON:

{{
    "episode_title": "A compelling episode title that captures the main theme",
    "episode_summary": "2-3 paragraph summary of the entire episode. Be specific about topics, callers, and conversations. Write in third person, suitable for a podcast episode page.",
    "narrative_arc": "1 paragraph describing how the show flowed — what opened, how topics evolved, what closed it out",
    "recurring_themes": ["topics or ideas that came up across multiple segments"],
    "cross_segment_connections": ["specific callbacks or topic continuations across segments"],
    "all_topics": ["complete deduplicated list of every topic discussed"],
    "all_tags": ["SEO-friendly lowercase hyphenated tags"],
    "top_quotes": [
        {{"quote": "text", "speaker": "name", "context": "why notable", "segment": 1}}
    ],
    "blog_post_candidates": [
        {{
            "title": "Proposed blog post title",
            "angle": "specific thesis or angle",
            "why": "why this deserves expansion",
            "source_segments": [1, 2],
            "key_details_from_show": ["specific points, quotes, and examples from the show to include"]
        }}
    ]
}}

Respond ONLY with valid JSON.

## Per-Segment Analyses

{chr(10).join(segment_summaries)}

## Blog-Worthy Topics Identified Across All Segments

{json.dumps(all_blog_topics, indent=2)}"""

    result = timed_query("Cross-Segment Synthesis", prompt)
    return parse_json_response(result)


def generate_forum_post(synthesis: dict) -> str:
    """Generate forum discussion post from synthesis.

    Missing synthesis keys fall back to placeholder values, so an empty
    synthesis still produces a (generic) prompt.
    """
    prompt = f"""Write a community forum discussion post for "The Computer Guru Show" forum.

Episode title: {synthesis.get('episode_title', 'Unknown')}
Summary: {synthesis.get('episode_summary', '')}
Topics: {json.dumps(synthesis.get('all_topics', []))}
Narrative arc: {synthesis.get('narrative_arc', '')}

Rules:
- Conversational, engaging tone that invites discussion
- Brief hook (2-3 sentences about the most interesting thing)
- Bullet list of topics with one-line teasers
- 2-3 discussion questions that invite audience participation
- "Listen to the full episode" call-to-action
- Under 300 words
- Casual, friendly tone
- No emojis
- No markdown headers larger than ###

Write the post now."""

    return timed_query("Forum Post", prompt, temperature=0.5)


def _valid_segment_numbers(raw: object, segment_count: int) -> list[int]:
    """Return the in-range, de-duplicated segment numbers found in *raw*.

    *raw* comes from model output, so it may be a single value instead of a
    list and may hold numbers encoded as strings; anything that is not a
    whole number in ``1..segment_count`` is ignored.
    """
    items = raw if isinstance(raw, (list, tuple)) else [raw]
    numbers = []
    for item in items:
        if isinstance(item, bool):
            # bool is an int subclass; True must not be read as segment 1.
            continue
        if isinstance(item, str) and item.strip().isdigit():
            item = int(item.strip())
        if isinstance(item, int) and 0 < item <= segment_count and item not in numbers:
            numbers.append(item)
    return numbers


def generate_blog_post(synthesis: dict, candidate: dict, segments: list[dict]) -> str:
    """Generate a blog post using the full segment transcripts for source material.

    Uses the transcripts of the segments listed in the candidate's
    ``source_segments`` (each capped at 15000 chars). If none of them is a
    valid segment number, falls back to the first two segments (each capped
    at 10000 chars). *synthesis* is accepted for interface compatibility and
    is not used in the prompt.
    """
    # Find the source segments referenced by the blog candidate
    source_nums = _valid_segment_numbers(
        candidate.get("source_segments", [1]), len(segments)
    )
    source_parts = [
        f"\n--- Segment {num} transcript ---\n{segments[num-1]['text'][:15000]}\n"
        for num in source_nums
    ]

    # If no specific segments referenced, use the first two
    if not source_parts:
        source_parts = [
            f"\n--- Segment {seg['number']} transcript ---\n{seg['text'][:10000]}\n"
            for seg in segments[:2]
        ]
    source_text = "".join(source_parts)

    prompt = f"""Write a blog post for the Computer Guru Show website (radio.azcomputerguru.com).

Author: Mike Swanson — veteran IT professional, radio host in Tucson AZ.

His writing style:
- Explains complex tech in plain English using analogies
- Uses humor — dry, self-deprecating, occasionally sarcastic
- Gives practical, actionable advice
- Takes strong positions on consumer rights, privacy, and corporate BS
- Speaks directly to the reader like a friend
- References real conversations from the show

Blog post details:
- Title: {candidate.get('title', 'Untitled')}
- Angle: {candidate.get('angle', '')}
- Key details from show: {json.dumps(candidate.get('key_details_from_show', []))}

Format:
1. Engaging opening paragraph (hook the reader with something from the show)
2. 3-5 sections with ### subheadings
3. "What This Means for You" practical section
4. Key Takeaways (bullet points)
5. Closing that ties back to the show conversation

Target: 800-1200 words. First person as Mike Swanson.
End with: "This topic was discussed on The Computer Guru Show. Listen to the full episode for more."

IMPORTANT: Draw directly from the transcript below. Use Mike's actual words, analogies, and examples — not generic filler. If Mike made a joke or analogy on air, reference it in the post.

## Source transcript from the show:
{source_text}"""

    return timed_query(f"Blog: {candidate.get('title', '?')}", prompt, temperature=0.5)


def main():
    """Run the full pipeline for one transcript directory.

    The directory is taken from the first command-line argument, defaulting to
    a bundled sample episode. All outputs are written to a ``generated-v2``
    subdirectory of the transcript directory.
    """
    if len(sys.argv) > 1:
        transcript_dir = sys.argv[1]
    else:
        transcript_dir = "training-data/transcripts/2016-s8e42"

    print(f"Loading transcript from: {transcript_dir}")
    lines = load_transcript(transcript_dir)
    print(f"Total lines: {len(lines)}")

    # Step 1: Split into segments
    print(f"\n{'='*60}")
    print(" STEP 1: Splitting into segments")
    print(f"{'='*60}")

    segments = split_into_segments(lines)
    print(f" Found {len(segments)} segments:\n")
    for seg in segments:
        print(f" Segment {seg['number']}: lines {seg['start_line']}-{seg['end_line']}, "
              f"{seg['char_count']} chars, {len(seg['lines'])} lines")
        # Show first line as preview
        preview = seg['lines'][0][:80] if seg['lines'] else "(empty)"
        print(f" Preview: {preview}")

    output_dir = Path(transcript_dir) / "generated-v2"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save segments for reference ('lines' duplicates 'text', so it is omitted)
    segments_meta = [{k: v for k, v in s.items() if k != 'lines'} for s in segments]
    with open(output_dir / "segments.json", "w", encoding="utf-8") as f:
        json.dump(segments_meta, f, indent=2)

    # Step 2: Analyze each segment
    print(f"\n{'='*60}")
    print(f" STEP 2: Analyzing {len(segments)} segments individually")
    print(f"{'='*60}")

    segment_analyses = []
    for seg in segments:
        analysis = analyze_segment(seg, len(segments))
        segment_analyses.append(analysis)

        # Save individual segment analysis
        analysis_path = output_dir / f"segment-{seg['number']}-analysis.json"
        with open(analysis_path, "w", encoding="utf-8") as f:
            json.dump(analysis, f, indent=2)

        if analysis:
            print(f" Title: {analysis.get('title', '?')}")
            print(f" Topics: {_join_strs(analysis.get('topics', []))}")

    # Step 3: Cross-segment synthesis
    print(f"\n{'='*60}")
    print(" STEP 3: Cross-segment synthesis")
    print(f"{'='*60}")

    synthesis = cross_segment_synthesis(segment_analyses, segments)
    with open(output_dir / "synthesis.json", "w", encoding="utf-8") as f:
        json.dump(synthesis, f, indent=2)

    if synthesis:
        print(f"\n Episode title: {synthesis.get('episode_title', '?')}")
        print(f" Recurring themes: {synthesis.get('recurring_themes', [])}")
        print("\n Episode summary:")
        print(f" {str(synthesis.get('episode_summary', 'N/A'))[:500]}")

    # Step 4: Generate forum post
    print(f"\n{'='*60}")
    print(" STEP 4: Generate content")
    print(f"{'='*60}")

    forum_post = generate_forum_post(synthesis)
    with open(output_dir / "forum-post.md", "w", encoding="utf-8") as f:
        f.write(forum_post)
    print("\n--- FORUM POST ---")
    print(forum_post)

    # Step 5: Generate blog post from best candidate
    raw_candidates = synthesis.get("blog_post_candidates", [])
    # Model output: keep only well-formed (dict) candidates.
    candidates = (
        [c for c in raw_candidates if isinstance(c, dict)]
        if isinstance(raw_candidates, list) else []
    )
    if candidates:
        best = candidates[0]
        blog_post = generate_blog_post(synthesis, best, segments)
        title = str(best.get("title") or "draft")
        # Trim separator hyphens and never produce an empty slug.
        slug = re.sub(r'[^a-z0-9]+', '-', title.lower())[:50].strip('-') or "draft"
        with open(output_dir / f"blog-{slug}.md", "w", encoding="utf-8") as f:
            f.write(blog_post)
        print("\n--- BLOG POST ---")
        print(blog_post)

    # Summary
    print(f"\n{'='*60}")
    print(f" COMPLETE — All outputs in: {output_dir}/")
    print(f"{'='*60}")
    print(f" Segments analyzed: {len(segments)}")
    print(f" Per-segment analyses: {sum(1 for a in segment_analyses if a)}")
    print(f" Blog candidates: {len(candidates)}")
    print(f" Files generated: {sum(1 for _ in output_dir.iterdir())}")


if __name__ == "__main__":
    main()