Synced files: - Session logs updated - Latest context and credentials - Command/directive updates Machine: acg-guru-5070 Timestamp: 2026-03-22 22:31:46 Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
432 lines
15 KiB
Python
432 lines
15 KiB
Python
#!/usr/bin/env python3
|
|
"""Segment-first content generation test.
|
|
|
|
Architecture:
|
|
1. Split transcript at break markers (text-based detection)
|
|
2. Analyze each segment individually (full context, no truncation)
|
|
3. Cross-segment synthesis (callbacks, recurring topics, narrative arc)
|
|
4. Generate forum post and blog post from complete analysis
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import ollama
|
|
|
|
# Model tag passed as ``model=`` on every chat request made by this script.
MODEL = "qwen3:14b"

# Base URL handed to the Ollama client below.
OLLAMA_HOST = "http://localhost:11434"

# Single module-level client; created once at import time and reused.
client = ollama.Client(host=OLLAMA_HOST)
|
|
|
|
# Break markers — patterns that indicate commercial breaks.
#
# Every pattern is anchored at the start of the text and compiled with
# re.IGNORECASE, so no per-letter case classes are needed.  ['\u2019] accepts
# both the straight apostrophe and the typographic one (U+2019), which some
# transcription/export tools emit instead of "'".

# Host sign-off that opens a break.
BREAK_START = re.compile(
    r"^(We['\u2019]ll be right back|We will be right back)",
    re.IGNORECASE,
)

# Host phrases that mark the return from a break.
BREAK_END = re.compile(
    r"^(Welcome back to the Computer Guru|"
    r"All right, if you['\u2019]d like to be a part of the show)",
    re.IGNORECASE,
)

# Station IDs and bumper text that appear during breaks
BREAK_FILLER = re.compile(
    r"^(This is the Computer Guru Show on|This is a computer guru show|"
    r"Your computer guru|Whether you['\u2019]re dealing with|"
    r"Computer running slow|Has your machine somehow|"
    r"Be one with your operating system|"
    r"Listen in, chat in|Want your voice to be heard)",
    re.IGNORECASE,
)
|
|
|
|
|
|
def load_transcript(transcript_dir: str) -> list[str]:
    """Load ``transcript.txt`` from *transcript_dir* as a list of lines.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.

    Args:
        transcript_dir: Directory expected to contain ``transcript.txt``.

    Returns:
        The file's lines without line terminators.

    Raises:
        SystemExit: With exit code 1 (after printing an error) when
            ``transcript.txt`` is missing or is not a regular file.
    """
    txt_path = Path(transcript_dir) / "transcript.txt"
    # is_file() also rejects a directory that happens to carry the name,
    # which would otherwise fail later inside read_text().
    if not txt_path.is_file():
        print(f"ERROR: {txt_path} not found")
        sys.exit(1)
    return txt_path.read_text(encoding="utf-8").splitlines()
|
|
|
|
|
|
def split_into_segments(lines: list[str]) -> list[dict]:
    """Cut a transcript into show segments, dropping commercial-break content.

    Blank lines are ignored.  A line matching ``BREAK_START`` closes the
    current segment and opens a break; everything up to and including the
    next line matching ``BREAK_END`` is discarded.  Lines matching
    ``BREAK_FILLER`` are discarded wherever they occur.

    Each returned dict has the keys:
        - number: segment number (1-based)
        - start_line: 1-based line number where the segment begins (1 for the
          first segment, otherwise the line of the break-end marker)
        - end_line: line before the break-start marker, or ``len(lines)`` for
          the final segment
        - lines: stripped content lines kept for the segment
        - text: ``lines`` joined with newlines
        - char_count: ``len(text)``
    """
    finished: list[dict] = []
    pending: list[str] = []
    segment_start = 1
    inside_break = False

    def flush(last_line: int) -> None:
        """Emit the pending lines as a segment ending at *last_line*, if any."""
        nonlocal pending
        if not pending:
            return
        body = "\n".join(pending)
        finished.append({
            "number": len(finished) + 1,
            "start_line": segment_start,
            "end_line": last_line,
            "lines": pending,
            "text": body,
            "char_count": len(body),
        })
        pending = []

    for line_no, raw in enumerate(lines, 1):
        content = raw.strip()
        if not content:
            continue

        if inside_break:
            # Nothing inside a break is kept; only look for the way out.
            # The "welcome back" line itself is transitional and is dropped.
            if BREAK_END.match(content):
                inside_break = False
                segment_start = line_no
            continue

        if BREAK_START.match(content):
            flush(line_no - 1)
            inside_break = True
            continue

        # Station IDs / bumper text can also show up outside a detected break.
        if BREAK_FILLER.match(content):
            continue

        pending.append(content)

    # Whatever is left after the last break forms the final segment.
    flush(len(lines))
    return finished
|
|
|
|
|
|
def timed_query(label: str, prompt: str, temperature: float = 0.3,
                ctx_size: int = 32768) -> str:
    """Send *prompt* to the model as a single user message and time the call.

    A banner with *label* is printed before the request and the elapsed time
    plus result length afterwards.

    Args:
        label: Text shown in the console banner.
        prompt: Content of the single user message.
        temperature: Passed through as the ``temperature`` option.
        ctx_size: Passed through as the ``num_ctx`` option.

    Returns:
        The response message content.  If the content contains a closing
        ``</think>`` tag, only the (stripped) text after the last one is
        returned.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f" {label}")
    print(rule)

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments during a long generation.
    started = time.perf_counter()
    response = client.chat(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        options={"temperature": temperature, "num_ctx": ctx_size},
    )
    elapsed = time.perf_counter() - started

    # Guard against a missing/None content so len() below cannot fail.
    result = response["message"]["content"] or ""

    # Strip thinking output if qwen3 emits it.  Keyed on the closing tag so a
    # response that carries only "</think>" (opening tag absent) is handled too.
    if "</think>" in result:
        result = result.rsplit("</think>", 1)[1].strip()

    print(f" [{elapsed:.1f}s, {len(result)} chars]")
    return result
|
|
|
|
|
|
def parse_json_response(text: str) -> dict:
    """Parse a JSON object out of an LLM response.

    Handles three response shapes:
      * bare JSON,
      * JSON inside a Markdown code fence (```json ... ``` or ``` ... ```),
      * JSON surrounded by prose (the outermost ``{ ... }`` span is tried).

    Args:
        text: Raw response text.

    Returns:
        The decoded object.  ``{}`` is returned (after printing a warning)
        when nothing parses or when the decoded value is not a JSON object,
        so callers can always use ``.get()`` on the result.
    """
    if "```json" in text:
        text = text.split("```json", 1)[1].split("```", 1)[0]
    elif "```" in text:
        text = text.split("```", 1)[1].split("```", 1)[0]

    candidate = text.strip()
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError as e:
        # Fallback: the model may have wrapped the object in commentary.
        parsed = None
        first, last = candidate.find("{"), candidate.rfind("}")
        if first != -1 and last > first:
            try:
                parsed = json.loads(candidate[first:last + 1])
            except json.JSONDecodeError:
                parsed = None
        if parsed is None:
            print(f" WARNING: JSON parse failed: {e}")
            print(f" First 300 chars: {text[:300]}")
            return {}

    if not isinstance(parsed, dict):
        # Valid JSON but not an object (e.g. a list); callers expect a dict.
        print(f" WARNING: JSON parse failed: expected an object, got {type(parsed).__name__}")
        print(f" First 300 chars: {text[:300]}")
        return {}
    return parsed
|
|
|
|
|
|
def analyze_segment(segment: dict, segment_count: int) -> dict:
    """Ask the model for a structured analysis of one segment.

    The whole segment text is embedded in the prompt.  The model's reply is
    run through ``parse_json_response``.

    Args:
        segment: Segment dict; the ``number``, ``text`` and ``char_count``
            keys are read.
        segment_count: Total number of segments, used for "N of M" wording.

    Returns:
        Whatever ``parse_json_response`` returns for the model's reply.
    """
    seg_no = segment["number"]

    intro = (
        f"You are analyzing segment {seg_no} of {segment_count} from\n"
        '"The Computer Guru Show", a live call-in radio show hosted by Mike Swanson on AM1030\n'
        "KVOI in Tucson, Arizona. Co-host Rob is often present. The show takes listener calls\n"
        "for free tech support and discusses tech news.\n"
        "\n"
        "This is the COMPLETE transcript of this segment (nothing is truncated).\n"
        "Analyze it and respond with JSON:\n"
    )

    # Plain (non-f) string: the braces here are literal JSON braces.
    schema = """{
"title": "Compelling segment title",
"summary": "3-5 sentence summary of what happened in this segment",
"key_points": ["array of key takeaway bullet points"],
"topics": ["array of topics discussed"],
"speakers": ["array of speakers heard (Mike, Rob, caller names if given)"],
"caller_questions": ["array of specific questions callers asked, if any"],
"key_quotes": [
{"quote": "exact quote text", "speaker": "who said it", "context": "why notable"}
],
"blog_worthy_topics": [
{"topic": "topic name", "angle": "what makes it worth expanding", "details_from_show": "specific points Mike made that a blog post should include"}
],
"callbacks": ["any references to earlier segments or topics discussed before the break"]
}"""

    outro = (
        "Respond ONLY with valid JSON.\n"
        "\n"
        f"## Segment {seg_no} of {segment_count} — Full Transcript\n"
        "\n"
        f"{segment['text']}"
    )

    prompt = f"{intro}\n{schema}\n\n{outro}"
    label = f"Segment {seg_no}/{segment_count} ({segment['char_count']} chars)"

    reply = timed_query(label, prompt)
    return parse_json_response(reply)
|
|
|
|
|
|
def cross_segment_synthesis(segment_analyses: list[dict], segments: list[dict]) -> dict:
    """Synthesize across all segments for episode-level analysis.

    A compact text summary of every non-empty per-segment analysis, plus the
    pooled ``blog_worthy_topics``, is sent to the model in one prompt.

    The per-segment analyses come from model output, so their fields are not
    trusted to have the requested shape: null values, scalars, and non-string
    list items are coerced instead of raising.

    Args:
        segment_analyses: One analysis per segment, in segment order; empty
            or non-dict entries are skipped but still count toward numbering.
        segments: Currently unused; kept so existing callers keep working.

    Returns:
        Whatever ``parse_json_response`` returns for the model's reply.
    """
    def as_list(value) -> list:
        """Coerce a model-supplied field to a list (None -> [], scalar -> [x])."""
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    def joined(value) -> str:
        """Comma-join a field, stringifying items so str.join cannot fail."""
        return ", ".join(str(item) for item in as_list(value))

    # Build a compact summary of each segment for the synthesis prompt
    segment_summaries = []
    all_blog_topics = []
    for i, analysis in enumerate(segment_analyses, 1):
        if not analysis or not isinstance(analysis, dict):
            continue
        segment_summaries.append(
            f"### Segment {i}: {analysis.get('title', 'Unknown')}\n"
            f"Summary: {analysis.get('summary', 'N/A')}\n"
            f"Topics: {joined(analysis.get('topics'))}\n"
            f"Speakers: {joined(analysis.get('speakers'))}\n"
            f"Key points: {json.dumps(as_list(analysis.get('key_points')))}\n"
            f"Callbacks: {json.dumps(as_list(analysis.get('callbacks')))}"
        )
        all_blog_topics.extend(as_list(analysis.get("blog_worthy_topics")))

    summaries_block = "\n".join(segment_summaries)
    topics_block = json.dumps(all_blog_topics, indent=2)

    prompt = f"""You are producing the final episode analysis for "The Computer Guru Show".
Below are analyses of each individual segment. Your job is to synthesize them into a
cohesive episode-level view.

Respond with JSON:

{{
"episode_title": "A compelling episode title that captures the main theme",
"episode_summary": "2-3 paragraph summary of the entire episode. Be specific about topics, callers, and conversations. Write in third person, suitable for a podcast episode page.",
"narrative_arc": "1 paragraph describing how the show flowed — what opened, how topics evolved, what closed it out",
"recurring_themes": ["topics or ideas that came up across multiple segments"],
"cross_segment_connections": ["specific callbacks or topic continuations across segments"],
"all_topics": ["complete deduplicated list of every topic discussed"],
"all_tags": ["SEO-friendly lowercase hyphenated tags"],
"top_quotes": [
{{"quote": "text", "speaker": "name", "context": "why notable", "segment": 1}}
],
"blog_post_candidates": [
{{
"title": "Proposed blog post title",
"angle": "specific thesis or angle",
"why": "why this deserves expansion",
"source_segments": [1, 2],
"key_details_from_show": ["specific points, quotes, and examples from the show to include"]
}}
]
}}

Respond ONLY with valid JSON.

## Per-Segment Analyses

{summaries_block}

## Blog-Worthy Topics Identified Across All Segments

{topics_block}"""

    result = timed_query("Cross-Segment Synthesis", prompt)
    return parse_json_response(result)
|
|
|
|
|
|
def generate_forum_post(synthesis: dict) -> str:
    """Produce the forum discussion post text from the episode synthesis.

    Reads ``episode_title``, ``episode_summary``, ``all_topics`` and
    ``narrative_arc`` from *synthesis* (each with a fallback when absent) and
    returns the model's reply, queried at temperature 0.5.
    """
    title = synthesis.get('episode_title', 'Unknown')
    summary = synthesis.get('episode_summary', '')
    topics_json = json.dumps(synthesis.get('all_topics', []))
    arc = synthesis.get('narrative_arc', '')

    prompt_lines = [
        'Write a community forum discussion post for "The Computer Guru Show" forum.',
        "",
        f"Episode title: {title}",
        f"Summary: {summary}",
        f"Topics: {topics_json}",
        f"Narrative arc: {arc}",
        "",
        "Rules:",
        "- Conversational, engaging tone that invites discussion",
        "- Brief hook (2-3 sentences about the most interesting thing)",
        "- Bullet list of topics with one-line teasers",
        "- 2-3 discussion questions that invite audience participation",
        '- "Listen to the full episode" call-to-action',
        "- Under 300 words",
        "- Casual, friendly tone",
        "- No emojis",
        "- No markdown headers larger than ###",
        "",
        "Write the post now.",
    ]
    prompt = "\n".join(prompt_lines)

    return timed_query("Forum Post", prompt, temperature=0.5)
|
|
|
|
|
|
def generate_blog_post(synthesis: dict, candidate: dict,
                       segments: list[dict]) -> str:
    """Generate a blog post using the segment transcripts as source material.

    Args:
        synthesis: Episode synthesis; currently unused, kept for interface
            compatibility.
        candidate: Blog candidate dict; ``title``, ``angle``,
            ``key_details_from_show`` and ``source_segments`` are read.
        segments: All segments; ``number`` and ``text`` are read.

    Returns:
        The model's reply (queried at temperature 0.5).

    ``source_segments`` comes from model output, so entries are coerced to
    int and anything unusable (null, non-numeric strings, out-of-range
    numbers) is skipped rather than raising.  Each referenced transcript is
    capped at 15000 characters.  If no usable reference remains, the first
    two segments are used, capped at 10000 characters each.
    """
    # Find the source segments referenced by the blog candidate
    requested = candidate.get("source_segments", [1])
    if requested is None:
        requested = []
    elif not isinstance(requested, (list, tuple)):
        requested = [requested]

    chunks = []
    for raw in requested:
        try:
            num = int(raw)
        except (TypeError, ValueError):
            continue
        if 0 < num <= len(segments):
            chunks.append(
                f"\n--- Segment {num} transcript ---\n{segments[num-1]['text'][:15000]}\n"
            )

    # If no specific segments referenced, use the first two
    if not chunks:
        chunks = [
            f"\n--- Segment {seg['number']} transcript ---\n{seg['text'][:10000]}\n"
            for seg in segments[:2]
        ]

    source_text = "".join(chunks)

    prompt = f"""Write a blog post for the Computer Guru Show website (radio.azcomputerguru.com).
Author: Mike Swanson — veteran IT professional, radio host in Tucson AZ.

His writing style:
- Explains complex tech in plain English using analogies
- Uses humor — dry, self-deprecating, occasionally sarcastic
- Gives practical, actionable advice
- Takes strong positions on consumer rights, privacy, and corporate BS
- Speaks directly to the reader like a friend
- References real conversations from the show

Blog post details:
- Title: {candidate.get('title', 'Untitled')}
- Angle: {candidate.get('angle', '')}
- Key details from show: {json.dumps(candidate.get('key_details_from_show', []))}

Format:
1. Engaging opening paragraph (hook the reader with something from the show)
2. 3-5 sections with ### subheadings
3. "What This Means for You" practical section
4. Key Takeaways (bullet points)
5. Closing that ties back to the show conversation

Target: 800-1200 words. First person as Mike Swanson.
End with: "This topic was discussed on The Computer Guru Show. Listen to the full episode for more."

IMPORTANT: Draw directly from the transcript below. Use Mike's actual words, analogies, and
examples — not generic filler. If Mike made a joke or analogy on air, reference it in the post.

## Source transcript from the show:
{source_text}"""

    return timed_query(f"Blog: {candidate.get('title', '?')}", prompt, temperature=0.5)
|
|
|
|
|
|
def _banner(title: str) -> None:
    """Print *title* between two 60-character rules, preceded by a blank line."""
    rule = "=" * 60
    print(f"\n{rule}")
    print(title)
    print(rule)


def _write_text(path: Path, text: str) -> None:
    """Write *text* to *path* as UTF-8 regardless of the platform default."""
    path.write_text(text, encoding="utf-8")


def _write_json(path: Path, payload) -> None:
    """Serialize *payload* as indented JSON and write it to *path* as UTF-8."""
    _write_text(path, json.dumps(payload, indent=2))


def _slugify(title, max_len: int = 50) -> str:
    """Turn *title* into a lowercase hyphenated file-name slug.

    Leading/trailing hyphens are removed; an empty result (or a missing
    title) falls back to ``"draft"``.
    """
    slug = re.sub(r'[^a-z0-9]+', '-', str(title or "").lower())[:max_len].strip('-')
    return slug or "draft"


def main():
    """Run the pipeline: split, analyze, synthesize, then generate content.

    The transcript directory is taken from the first command-line argument,
    defaulting to ``training-data/transcripts/2016-s8e42``.  All outputs are
    written, UTF-8 encoded, to a ``generated-v2`` subdirectory of it.
    """
    transcript_dir = (
        sys.argv[1] if len(sys.argv) > 1
        else "training-data/transcripts/2016-s8e42"
    )

    print(f"Loading transcript from: {transcript_dir}")
    lines = load_transcript(transcript_dir)
    print(f"Total lines: {len(lines)}")

    # Step 1: Split into segments
    _banner(" STEP 1: Splitting into segments")
    segments = split_into_segments(lines)
    print(f" Found {len(segments)} segments:\n")
    for seg in segments:
        print(f" Segment {seg['number']}: lines {seg['start_line']}-{seg['end_line']}, "
              f"{seg['char_count']} chars, {len(seg['lines'])} lines")
        # Show first line as preview
        preview = seg['lines'][0][:80] if seg['lines'] else "(empty)"
        print(f" Preview: {preview}")

    output_dir = Path(transcript_dir) / "generated-v2"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save segments for reference ('lines' is dropped; 'text' carries the same content)
    segments_meta = [{k: v for k, v in s.items() if k != 'lines'} for s in segments]
    _write_json(output_dir / "segments.json", segments_meta)

    # Step 2: Analyze each segment
    _banner(f" STEP 2: Analyzing {len(segments)} segments individually")
    segment_analyses = []
    for seg in segments:
        analysis = analyze_segment(seg, len(segments))
        segment_analyses.append(analysis)

        # Save individual segment analysis
        _write_json(output_dir / f"segment-{seg['number']}-analysis.json", analysis)

        if analysis:
            print(f" Title: {analysis.get('title', '?')}")
            # Model output: items may be non-strings or the field may be null.
            topics = analysis.get('topics') or []
            if not isinstance(topics, (list, tuple)):
                topics = [topics]
            print(f" Topics: {', '.join(str(t) for t in topics)}")

    # Step 3: Cross-segment synthesis
    _banner(" STEP 3: Cross-segment synthesis")
    synthesis = cross_segment_synthesis(segment_analyses, segments)
    _write_json(output_dir / "synthesis.json", synthesis)

    if synthesis:
        print(f"\n Episode title: {synthesis.get('episode_title', '?')}")
        print(f" Recurring themes: {synthesis.get('recurring_themes', [])}")
        print("\n Episode summary:")
        # str() so a null or non-string summary cannot break the slice.
        print(f" {str(synthesis.get('episode_summary') or 'N/A')[:500]}")

    # Step 4: Generate forum post
    _banner(" STEP 4: Generate content")
    forum_post = generate_forum_post(synthesis)
    _write_text(output_dir / "forum-post.md", forum_post)
    print("\n--- FORUM POST ---")
    print(forum_post)

    # Step 5: Generate blog post from best candidate
    raw_candidates = synthesis.get("blog_post_candidates", [])
    # Keep only well-formed candidates; the list comes straight from the model.
    candidates = (
        [c for c in raw_candidates if isinstance(c, dict)]
        if isinstance(raw_candidates, list) else []
    )
    if candidates:
        best = candidates[0]
        blog_post = generate_blog_post(synthesis, best, segments)
        slug = _slugify(best.get("title", "draft"))
        _write_text(output_dir / f"blog-{slug}.md", blog_post)
        print("\n--- BLOG POST ---")
        print(blog_post)

    # Summary
    _banner(f" COMPLETE — All outputs in: {output_dir}/")
    print(f" Segments analyzed: {len(segments)}")
    print(f" Per-segment analyses: {sum(1 for a in segment_analyses if a)}")
    print(f" Blog candidates: {len(candidates)}")
    print(f" Files generated: {sum(1 for _ in output_dir.iterdir())}")


if __name__ == "__main__":
    main()
|