From 5e6ec54614f0426a7358e0daf8ad1e508522530a Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Sun, 22 Mar 2026 22:31:46 -0700 Subject: [PATCH] sync: Auto-sync from acg-guru-5070 at 2026-03-22 22:31:46 Synced files: - Session logs updated - Latest context and credentials - Command/directive updates Machine: acg-guru-5070 Timestamp: 2026-03-22 22:31:46 Co-Authored-By: Claude Sonnet 4.5 --- .claude/CLAUDE.md | 13 +- .claude/memory/MEMORY.md | 19 + .claude/memory/feedback_d2testnas_ssh.md | 11 + .../project_audio_processor_architecture.md | 32 ++ .../memory/project_email_routing_neptune.md | 11 + .../project_neptune_sbr_email_routing.md | 49 ++ .claude/memory/reference_community_forum.md | 48 ++ .claude/memory/reference_dataforth_contact.md | 7 + .../memory/reference_ix_access_tailscale.md | 7 + .claude/memory/reference_ix_server_ssh.md | 18 + .claude/memory/reference_matomo_analytics.md | 40 ++ .../reference_neptune_access_d2testnas.md | 7 + .claude/memory/reference_radio_website.md | 23 + .claude/memory/reference_workstation_setup.md | 35 ++ .../test_content_generation.py | 241 ++++++++++ .../audio-processor/test_segment_first.py | 431 ++++++++++++++++++ 16 files changed, 991 insertions(+), 1 deletion(-) create mode 100644 .claude/memory/MEMORY.md create mode 100644 .claude/memory/feedback_d2testnas_ssh.md create mode 100644 .claude/memory/project_audio_processor_architecture.md create mode 100644 .claude/memory/project_email_routing_neptune.md create mode 100644 .claude/memory/project_neptune_sbr_email_routing.md create mode 100644 .claude/memory/reference_community_forum.md create mode 100644 .claude/memory/reference_dataforth_contact.md create mode 100644 .claude/memory/reference_ix_access_tailscale.md create mode 100644 .claude/memory/reference_ix_server_ssh.md create mode 100644 .claude/memory/reference_matomo_analytics.md create mode 100644 .claude/memory/reference_neptune_access_d2testnas.md create mode 100644 .claude/memory/reference_radio_website.md create 
mode 100644 .claude/memory/reference_workstation_setup.md create mode 100644 projects/radio-show/audio-processor/test_content_generation.py create mode 100644 projects/radio-show/audio-processor/test_segment_first.py diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 125f8050..c5b123ee 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -167,13 +167,24 @@ When using Ollama for bulk tasks (e.g., processing N items), review the first 2- --- +## Memory (Shared Across Machines) + +Claude Code's auto-memory is stored **in-repo** at `.claude/memory/` so it syncs via Gitea to all workstations. + +**IMPORTANT for all machines:** Configure Claude Code to use the repo memory path, NOT the default `~/.claude/projects/` path. When the auto-memory system prompts you to write to `~/.claude/projects/-home-guru-ClaudeTools/memory/`, write to `.claude/memory/` (repo-relative) instead. The index file is `.claude/memory/MEMORY.md`. + +This ensures memory created on one workstation (CachyOS, Mac, Windows) is available on all others after a git pull/sync. 
+ +--- + ## Reference (read on-demand, not every session) - **Project structure, endpoints, workflows, troubleshooting:** `.claude/REFERENCE.md` - **Agent definitions:** `.claude/agents/*.md` - **MCP servers:** `MCP_SERVERS.md` - **Coding standards:** `.claude/CODING_GUIDELINES.md` +- **Shared memory:** `.claude/memory/MEMORY.md` (index) + `.claude/memory/*.md` (individual memories) --- -**Last Updated:** 2026-03-20 +**Last Updated:** 2026-03-22 diff --git a/.claude/memory/MEMORY.md b/.claude/memory/MEMORY.md new file mode 100644 index 00000000..37e528f3 --- /dev/null +++ b/.claude/memory/MEMORY.md @@ -0,0 +1,19 @@ +# Memory Index + +## Reference +- [Community Forum (Flarum)](reference_community_forum.md) - Flarum forum at community.azcomputerguru.com, API access, database, posting workflow +- [Radio Show Website](reference_radio_website.md) - Astro static site at radio.azcomputerguru.com on IX server +- [IX Server SSH Access](reference_ix_server_ssh.md) - SSH access notes, no key auth from CachyOS workstation yet +- [IX Access via Tailscale](reference_ix_access_tailscale.md) - IX server accessible with Tailscale on, no VPN needed +- [Neptune Access via D2TESTNAS](reference_neptune_access_d2testnas.md) - Neptune must be routed through D2TESTNAS +- [CachyOS Workstation Setup](reference_workstation_setup.md) - Dual NVMe, autostart apps, key fixes applied, old home location +- [Matomo Analytics](reference_matomo_analytics.md) - Self-hosted analytics at analytics.azcomputerguru.com, site IDs, tracking for all 3 sites +- [Dataforth Contact - AJ](reference_dataforth_contact.md) - AJ at Dataforth, dataforthgit@ email forwarding to him + +## Feedback +- [D2TESTNAS SSH Access](feedback_d2testnas_ssh.md) - Use root@192.168.0.9 with Paper123!@#, not sysadmin + +## Project +- [Audio Processor Architecture](project_audio_processor_architecture.md) - Segment-first pipeline: detect breaks before transcription for complete content capture +- [Neptune Email Routing 
Issues](project_email_routing_neptune.md) - Multiple clients (devcon, Sorensen/rieussetcorp) have email not routing properly from Neptune +- [Neptune SBR Email Routing Setup](project_neptune_sbr_email_routing.md) - Full SBR routing chain, config file locations, MailProtector integration, access methods diff --git a/.claude/memory/feedback_d2testnas_ssh.md b/.claude/memory/feedback_d2testnas_ssh.md new file mode 100644 index 00000000..256039b0 --- /dev/null +++ b/.claude/memory/feedback_d2testnas_ssh.md @@ -0,0 +1,11 @@ +--- +name: D2TESTNAS SSH Access +description: D2TESTNAS SSH is root@192.168.0.9 with Paper123!@#, not sysadmin +type: feedback +--- + +D2TESTNAS SSH: use `root@192.168.0.9` with password `Paper123!@#`. The `sysadmin` user does not work for SSH. CachyOS workstation (acg-guru-5070) now has an ed25519 key authorized on D2TESTNAS for root. + +**Why:** Credentials in credentials.md listed sysadmin as SSH user, which was incorrect and caused multiple failed attempts. + +**How to apply:** When SSHing to D2TESTNAS, always use root@192.168.0.9. The SSH key at ~/.ssh/id_ed25519 (guru@acg-guru-5070) should work without password. diff --git a/.claude/memory/project_audio_processor_architecture.md b/.claude/memory/project_audio_processor_architecture.md new file mode 100644 index 00000000..527a39a7 --- /dev/null +++ b/.claude/memory/project_audio_processor_architecture.md @@ -0,0 +1,32 @@ +--- +name: Audio Processor - Segment-First Architecture +description: Revised pipeline architecture - detect breaks and split into segments BEFORE transcription for complete content capture +type: project +--- + +## Revised Pipeline Architecture (decided 2026-03-22) + +Shows are almost always 4 segments per hour (8 total for a 2-hour show). Extra breaks are rare. 
+ +**Old approach:** Transcribe full episode -> truncate to fit LLM context -> analyze (loses content) + +**New approach:** Detect breaks first (audio-only) -> split into ~8 segments -> transcribe each -> analyze each with full context -> cross-segment synthesis + +### Pipeline Order + +1. **Audio-level break detection** (no transcript needed) — loudness/compression jumps, silence gaps, known bumper fingerprints, HR1/HR2 boundary +2. **Split into segments** — ~7-15 min each, complete audio chunks +3. **Transcribe each segment** — smaller files, complete content, no truncation +4. **Analyze each segment** — full transcript fits in LLM context window easily +5. **Cross-segment synthesis** — detect topics spanning segments, callbacks ("going back to what we said before the break"), narrative arc +6. **Generate content** — blog posts, forum posts, episode summary from complete analysis + +### Key Insights + +- 4 segments/hour is a strong structural prior for break detection — if 12-18 min into a segment and audio signatures appear, almost certainly a break. At 5 min, probably not. +- Each segment transcript is ~5-10K chars — fits in any LLM context with room for detailed prompts +- Cross-segment synthesis pass is new and essential for catching callbacks and recurring topics + +**Why:** Solves the context window truncation problem that loses show content. Each segment gets complete analysis. + +**How to apply:** This is the architecture direction for all future audio processor work. The existing Stage 3 segment detector needs to work without transcript input (audio-only signals). Stage 6 analyzer needs per-segment + synthesis passes. 
diff --git a/.claude/memory/project_email_routing_neptune.md b/.claude/memory/project_email_routing_neptune.md new file mode 100644 index 00000000..767bfd84 --- /dev/null +++ b/.claude/memory/project_email_routing_neptune.md @@ -0,0 +1,11 @@ +--- +name: Neptune Email Routing Issues +description: Multiple clients (devcon, Sorensen/rieussetcorp) have email not routing properly from Neptune +type: project +--- + +Sorensen (rieussetcorp) and devcon both have the same email routing issue from Neptune — emails not routing properly. + +**Why:** Recurring issue affecting multiple clients, likely a shared configuration or Neptune platform problem rather than isolated incidents. + +**How to apply:** When troubleshooting email routing for any client on Neptune, check if the fix applied to one client needs to be replicated for others. Track as a systemic Neptune issue, not individual client problems. diff --git a/.claude/memory/project_neptune_sbr_email_routing.md b/.claude/memory/project_neptune_sbr_email_routing.md new file mode 100644 index 00000000..7598150a --- /dev/null +++ b/.claude/memory/project_neptune_sbr_email_routing.md @@ -0,0 +1,49 @@ +--- +name: Neptune SBR Email Routing Setup +description: How outbound email routing works on Neptune Exchange - SBR agent, MailProtector smarthost, send connectors, and common fix for new clients +type: project +--- + +## Neptune Outbound Email Routing Chain + +1. User sends mail from Exchange mailbox on Neptune (172.16.3.11) +2. **Microsoft.Exchange.SBR** transport agent (Priority 12) fires on OnResolved event +3. SBR reads config files at `C:\Program Files\Microsoft\Exchange Server\V15\TransportRoles\agents\Custom\`: + - `Microsoft.Exchange.SBR.InternalDomains.config` — list of domains SBR handles + - `Microsoft.Exchange.SBR.OverrideSettings.config` — maps `domain.com;domain.sbr` for routing + - `Microsoft.Exchange.SBR.IgnoreAuthAs.config` — exclusions +4. 
SBR rewrites recipient routing to `.sbr` domain (e.g., `rieussetcorp.sbr`) +5. Exchange matches `.sbr` address space to the corresponding Send Connector (e.g., `Outbound.Sorensen`) +6. Send connector smarthosts through MailProtector: `domain-com.outbound.emailservice.io` +7. MailProtector relays to final destination + +There is also a **messageconcept ExSBR** agent at Priority 11 (`C:\Program Files\messageconcept\ExSBR\`). + +## Common Issue: New client or server move + +When Neptune's IP changes or a new domain is added, MailProtector must have the sending server IP authorized. Without this, MailProtector accepts the relay but drops/rejects the message. + +**Fix (2026-03-22 for rieussetcorp.com):** Added 67.206.163.124 and 67.206.163.122 to MailProtector's authorized sender IPs. + +## Neptune Location + +Neptune physically moved from ACG office (72.194.62.7) to Dataforth (67.206.163.124 inbound, 67.206.163.122 outbound). SNAT rule on Dataforth UDM (`/data/on_boot.d/10-neptune-snat.sh`) should force outbound to use .124. + +## Access + +- WinRM: `172.16.3.11`, ACG\administrator, via pywinrm with NTLM +- Exchange PS: Connect via `New-PSSession -ConfigurationName Microsoft.Exchange -ConnectionUri http://neptune.acg.local/PowerShell/ -Authentication Kerberos` +- Requires Tailscale route through D2TESTNAS (192.168.0.9) for 172.16.0.0/22 + +## Known Issues (as of 2026-03-22) + +- 67.206.163.122 has no PTR record and is blacklisted by some providers +- SNAT rule may not be active — outbound was going as .122 not .124 on 3/16. Need to check UDM (192.168.0.254) — couldn't auth via SSH tonight, check in morning +- MAIL transport server still exists in Exchange config but server is decommissioned +- Spam queues with junk domains (wwwyamaha666.ru, bestspatulas.com, etc.) 
+- Tailscale 172.16.0.0/22 route moved from ACG pfSense to D2TESTNAS — may need permanent solution +- UDM SSH password (Paper123!@#-unifi) was rejected — may have changed + +## Resolved (2026-03-22) + +- rieussetcorp.com outbound: Added 67.206.163.124 and .122 to MailProtector authorized IPs — mail now flowing diff --git a/.claude/memory/reference_community_forum.md b/.claude/memory/reference_community_forum.md new file mode 100644 index 00000000..719757e8 --- /dev/null +++ b/.claude/memory/reference_community_forum.md @@ -0,0 +1,48 @@ +--- +name: Community Forum (Flarum) +description: Flarum forum at community.azcomputerguru.com - platform details, API access, database credentials, and posting workflow +type: reference +--- + +## Community Forum - Flarum + +- **URL:** https://community.azcomputerguru.com +- **Platform:** Flarum 1.8.14 +- **Server:** IX server (172.16.3.10), cPanel account `azcomputerguru` +- **Document Root:** `/home/azcomputerguru/public_html/community/public` +- **PHP Version:** 8.1.33 + +### Database +- **Host:** localhost (on IX server) +- **Database:** `azcompu_flarum` +- **User:** `azcompu_flarum` +- **Password:** `Fl@rum2026!CGS` + +### API +- **API Key:** `581b6c8c162a383ba87757f41b4381e9bf8db61d71bd578ee97fe32b7aeac046` (admin user, ID 1) +- **API Base:** `https://community.azcomputerguru.com/api` +- **Note:** Cloudflare blocks external API access. Must either: + 1. Use `--resolve` with `curl -k` from IX server localhost + 2. 
Use direct PHP/database script on IX server (preferred, more reliable) + +### Forum Tags (Categories) +| ID | Name | Slug | +|----|------|------| +| 1 | General | general | +| 2 | Tech News | tech-news | +| 3 | Security & Privacy | security-privacy | +| 4 | Artificial Intelligence | artificial-intelligence | +| 5 | Space Tech | space-tech | +| 6 | Gadgets & Hardware | gadgets-hardware | +| 7 | How-Tos & Tips | how-tos-tips | +| 8 | Show Discussion | show-discussion | +| 9 | Off-Topic | off-topic | + +### Posting Workflow +Cloudflare blocks the Flarum REST API from external requests. To create posts programmatically: +1. Write a PHP script that inserts directly into the database (discussions + posts + discussion_tag tables) +2. SCP the script and JSON payload to IX server `/tmp/` +3. Execute via `php /tmp/script.php` over SSH +4. Clean up temp files + +**How to apply:** Use this when the user asks to create forum posts or manage the community forum. diff --git a/.claude/memory/reference_dataforth_contact.md b/.claude/memory/reference_dataforth_contact.md new file mode 100644 index 00000000..fdd8d278 --- /dev/null +++ b/.claude/memory/reference_dataforth_contact.md @@ -0,0 +1,7 @@ +--- +name: Dataforth Contact - AJ +description: AJ at Dataforth - email forwarding setup needed for dataforthgit@ address +type: reference +--- + +AJ at Dataforth needs messages sent to the dataforthgit@ email address to forward to him. diff --git a/.claude/memory/reference_ix_access_tailscale.md b/.claude/memory/reference_ix_access_tailscale.md new file mode 100644 index 00000000..82d43efa --- /dev/null +++ b/.claude/memory/reference_ix_access_tailscale.md @@ -0,0 +1,7 @@ +--- +name: IX Server Access via Tailscale +description: IX server (ix.azcomputerguru.com) is accessible with Tailscale on, no VPN needed +type: reference +--- + +IX server (ix.azcomputerguru.com / 172.16.3.10) can be accessed directly when Tailscale is on. No separate VPN connection required. 
diff --git a/.claude/memory/reference_ix_server_ssh.md b/.claude/memory/reference_ix_server_ssh.md new file mode 100644 index 00000000..cda6a956 --- /dev/null +++ b/.claude/memory/reference_ix_server_ssh.md @@ -0,0 +1,18 @@ +--- +name: IX Server SSH Access +description: SSH access notes for IX server - key auth not set up on CachyOS workstation, must use sshpass with password +type: reference +--- + +## IX Server SSH from CachyOS Workstation + +- **Host:** 172.16.3.10 (ix.azcomputerguru.com) +- **User:** root +- **Password:** See credentials.md +- **SSH Key Auth:** NOT configured on CachyOS workstation (acg-guru-5070) +- **Must use:** `sshpass -p 'PASSWORD' ssh -o StrictHostKeyChecking=no -o PubkeyAuthentication=no root@172.16.3.10` +- **Suppress warnings:** Pipe through `grep -v WARNING | grep -v 'not using'` or `tail` + +**Why:** The SSH key from this machine hasn't been added to IX server's authorized_keys yet. The old WSL key (guru@wsl) was authorized but this is a new CachyOS install. + +**How to apply:** When running commands on IX server, use sshpass approach. Consider setting up SSH key auth to simplify future access. 
diff --git a/.claude/memory/reference_matomo_analytics.md b/.claude/memory/reference_matomo_analytics.md new file mode 100644 index 00000000..07e48ac3 --- /dev/null +++ b/.claude/memory/reference_matomo_analytics.md @@ -0,0 +1,40 @@ +--- +name: Matomo Analytics +description: Self-hosted Matomo analytics at analytics.azcomputerguru.com - credentials, site IDs, tracking setup for all 3 sites +type: reference +--- + +## Matomo Analytics + +- **URL:** https://analytics.azcomputerguru.com +- **Platform:** Matomo 5.8.0 (PHP) +- **Server:** IX server (172.16.3.10), cPanel account `azcomputerguru` +- **Document Root:** `/home/azcomputerguru/public_html/analytics/` + +### Login +- **User:** MikeSwanson +- **Password:** Mat0mo2026!CGS +- **Email:** mike@azcomputerguru.com + +### Database +- **Host:** localhost (on IX server) +- **Database:** `azcompu_matomo` +- **User:** `azcompu_matomo` +- **Password:** `Mat0mo2026!CGS` + +### Tracked Sites +| Site ID | Name | URL | Tracking Method | +|---------|------|-----|-----------------| +| 1 | AZ Computer Guru | https://azcomputerguru.com | WordPress mu-plugin (`wp-content/mu-plugins/matomo-tracking.php`) | +| 2 | Community Forum | https://community.azcomputerguru.com | Flarum `custom_header` DB setting | +| 3 | Radio Show | https://radio.azcomputerguru.com | Injected into HTML files before `` | + +### Cron +- Archiving cron runs every 5 minutes as `azcomputerguru` user +- Command: `php /home/azcomputerguru/public_html/analytics/console core:archive` + +### Cloudflare +- DNS record points to 72.194.62.5, proxied (orange cloud) +- Was previously pointing to wrong IP (52.52.94.202), fixed 2026-03-20 + +**How to apply:** Use this when managing analytics, adding new sites to track, or troubleshooting tracking code. 
diff --git a/.claude/memory/reference_neptune_access_d2testnas.md b/.claude/memory/reference_neptune_access_d2testnas.md new file mode 100644 index 00000000..692d35ff --- /dev/null +++ b/.claude/memory/reference_neptune_access_d2testnas.md @@ -0,0 +1,7 @@ +--- +name: Neptune Access via D2TESTNAS +description: Neptune Exchange server must be accessed by routing through D2TESTNAS (not direct VPN) +type: reference +--- + +Neptune (neptune.acghosting.com / 172.16.3.11) must be accessed by routing through D2TESTNAS, not via direct VPN connection. diff --git a/.claude/memory/reference_radio_website.md b/.claude/memory/reference_radio_website.md new file mode 100644 index 00000000..f62067a8 --- /dev/null +++ b/.claude/memory/reference_radio_website.md @@ -0,0 +1,23 @@ +--- +name: Radio Show Website +description: The Computer Guru Show website at radio.azcomputerguru.com - Astro static site on IX server cPanel +type: reference +--- + +## Radio Show Website + +- **URL:** https://radio.azcomputerguru.com +- **Platform:** Astro 6.0.4 (static site generator) +- **Server:** IX server (172.16.3.10), cPanel account `azcomputerguru` +- **Document Root:** `/home/azcomputerguru/public_html/radio` +- **Source Code:** `projects/radio-show/website/` in ClaudeTools repo +- **Build:** `cd projects/radio-show/website && npm run build` produces `dist/` folder +- **Deploy:** rsync/SCP `dist/` contents to document root on IX server + +### Community Link +- The community page (`/community`) links to: + - Discord server (placeholder, WidgetBot) + - Flarum forum at https://community.azcomputerguru.com + - Newsletter signup (placeholder) + +**How to apply:** Use when deploying website updates or managing the radio show project. 
diff --git a/.claude/memory/reference_workstation_setup.md b/.claude/memory/reference_workstation_setup.md new file mode 100644 index 00000000..5e270fe9 --- /dev/null +++ b/.claude/memory/reference_workstation_setup.md @@ -0,0 +1,35 @@ +--- +name: CachyOS Workstation Setup +description: Current workstation config - CachyOS on ASUS laptop, dual NVMe, autostart apps, old home btrfs subvolume location +type: reference +--- + +## Workstation: acg-guru-5070 + +- **OS:** CachyOS (Arch-based), kernel 6.19.x +- **DE:** KDE Plasma 6 (Wayland) +- **CPU/GPU:** Intel Arrow Lake-S + NVIDIA RTX 5070 Ti Mobile +- **Tailscale IP:** 100.95.216.79 + +### Storage +- **nvme0n1:** 954GB btrfs - CachyOS install (OS, root) +- **nvme1n1:** 954GB ext4 - `/home` (formatted from old Windows drive) +- **Old home:** btrfs `@home` subvolume on nvme0n1, mount with: `sudo mount -o subvol=@home UUID=8a8b1d34-99fb-470f-82ca-b5d08e43ec32 /mnt/old-home` + +### Autostart Apps (~/.config/autostart/) +- `arch-update-tray.desktop` (pre-existing) +- `cachyos-hello.desktop` (pre-existing) +- `discord.desktop` (added, starts minimized) +- `tailscale-systray.desktop` (added) +- ScreenConnect: autostart removed (on-demand only via URI scheme handler from web UI) + +### Known Issues +- **Warm reboot hangs:** Rebooting (e.g. for GPU issues) causes system to hang with spinning symbol — requires hard power-off. Observed multiple times. Likely NVIDIA driver not unloading cleanly during shutdown. + +### Key Fixes Applied +- **Tailscale:** `--accept-routes`, systemd-resolved + NetworkManager DNS config +- **Brightness:** Hide nvidia_0 backlight via udev rule, KDE controls intel_backlight only +- **ScreenConnect:** dpkg + full JRE + Wayland patch (GDK_BACKEND=x11) +- **Sudo:** NOPASSWD for guru user + +**How to apply:** Reference when troubleshooting workstation issues or setting up additional services. 
diff --git a/projects/radio-show/audio-processor/test_content_generation.py b/projects/radio-show/audio-processor/test_content_generation.py new file mode 100644 index 00000000..99a13268 --- /dev/null +++ b/projects/radio-show/audio-processor/test_content_generation.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +"""Test content generation from a transcript using Ollama qwen3:14b. + +Generates: +1. Episode analysis (summary, segments, topics, tags, quotes, blog candidates) +2. Sample forum discussion post +3. Sample blog post draft +""" + +import json +import sys +import time +from pathlib import Path + +import ollama + +MODEL = "qwen3:14b" +OLLAMA_HOST = "http://localhost:11434" +# qwen3:14b supports 32k context -- use more of it +MAX_TRANSCRIPT_CHARS = 40000 + +client = ollama.Client(host=OLLAMA_HOST) + + +def load_transcript(transcript_dir: str) -> str: + """Load transcript text.""" + txt_path = Path(transcript_dir) / "transcript.txt" + if not txt_path.exists(): + print(f"ERROR: {txt_path} not found") + sys.exit(1) + return txt_path.read_text() + + +def timed_query(label: str, prompt: str, temperature: float = 0.3) -> str: + """Run an Ollama query with timing.""" + print(f"\n{'='*60}") + print(f" {label}") + print(f"{'='*60}") + start = time.time() + + response = client.chat( + model=MODEL, + messages=[{"role": "user", "content": prompt}], + options={"temperature": temperature, "num_ctx": 32768}, + ) + + elapsed = time.time() - start + result = response["message"]["content"] + print(f" [{elapsed:.1f}s, {len(result)} chars]") + return result + + +def generate_analysis(transcript: str) -> dict: + """Generate episode analysis JSON.""" + prompt = f"""You are analyzing a transcript from "The Computer Guru Show", a live call-in +radio show hosted by Mike Swanson on AM1030 KVOI in Tucson, Arizona. The show covers +technology news, tips, and takes listener calls for free tech support. + +Analyze this transcript and provide a JSON response with: + +1. 
"summary": A 2-3 paragraph episode summary suitable for a podcast page. Write in third + person. Be specific about topics and conversations. + +2. "segment_summaries": Array of distinct topic segments discussed, each with: + - "title": Compelling segment title + - "summary": 3-5 sentence summary + - "key_points": Array of key takeaway bullet points + - "approximate_position": "early", "mid", or "late" in the show + +3. "topics": Array of main topics discussed (short phrases) + +4. "tags": Array of SEO-friendly tags (lowercase, hyphenated) + +5. "key_quotes": Array of 3-5 notable/quotable moments, each with: + - "quote": The exact quote text + - "speaker": Who said it + - "context": Brief context for why it's notable + +6. "blog_post_candidates": Array of 2-3 topics worth expanding into full blog posts, each with: + - "title": Proposed blog post title + - "angle": The specific thesis or angle + - "why": Why this deserves expansion (audience interest, SEO potential, etc.) + - "key_points_to_expand": Array of points from the show to develop further + +Respond ONLY with valid JSON. No markdown fencing, no explanation outside the JSON. 
+ +## Transcript + +{transcript[:MAX_TRANSCRIPT_CHARS]}""" + + result = timed_query("Episode Analysis (JSON)", prompt) + + # Strip markdown fences if present + if "```json" in result: + result = result.split("```json", 1)[1].split("```", 1)[0] + elif "```" in result: + result = result.split("```", 1)[1].split("```", 1)[0] + + # Strip thinking tags if qwen3 uses them + if "</think>" in result: + result = result.split("</think>")[-1] + + try: + return json.loads(result.strip()) + except json.JSONDecodeError as e: + print(f" WARNING: JSON parse failed: {e}") + print(f" Raw response (first 500 chars): {result[:500]}") + return {"raw_response": result} + + +def generate_forum_post(transcript: str, analysis: dict) -> str: + """Generate a forum discussion thread post.""" + summary = analysis.get("summary", "") + topics = analysis.get("topics", []) + + prompt = f"""You are writing a forum discussion post for "The Computer Guru Show" community +forum. The tone should be conversational, engaging, and invite discussion. This is NOT a +formal article -- it's a community post that makes people want to comment. + +Show info: +- Host: Mike Swanson ("The Computer Guru") +- Station: AM1030 KVOI, Tucson AZ +- Format: Live call-in tech show + +Episode summary: {summary} +Topics covered: {', '.join(topics)} + +Write a forum discussion post with: +1. A brief, engaging hook (2-3 sentences about the most interesting thing from the episode) +2. Bullet list of topics covered (with one-line teasers, not full summaries) +3. 2-3 discussion questions that invite audience participation +4. A "Listen to the full episode" call-to-action at the end + +Keep it under 300 words. Use a casual, friendly tone. No emojis. 
+ +Key transcript excerpts for context: +{transcript[:8000]}""" + + return timed_query("Forum Discussion Post", prompt, temperature=0.5) + + +def generate_blog_post(transcript: str, candidate: dict) -> str: + """Generate a full blog post draft from a blog candidate.""" + prompt = f"""You are writing a blog post for the "Computer Guru Show" website +(radio.azcomputerguru.com). The author is Mike Swanson, a veteran IT professional and +radio host in Tucson, Arizona. His style is: +- Explains complex tech in plain English +- Uses analogies and humor +- Gives practical, actionable advice +- Takes strong positions on consumer rights and privacy +- Speaks directly to the reader + +Write a blog post with this info: +- Title: {candidate.get('title', 'Untitled')} +- Angle: {candidate.get('angle', '')} +- Points to expand: {json.dumps(candidate.get('key_points_to_expand', []))} + +Format: +1. Engaging opening paragraph (hook the reader) +2. 3-5 sections with subheadings +3. Practical "what this means for you" section +4. Key Takeaways (bullet points) +5. Closing paragraph that ties back to the show + +Target length: 800-1200 words. Write in first person as Mike Swanson. +Include a note at the bottom: "This topic was discussed on The Computer Guru Show. +Listen to the full episode for more." 
+ +Relevant transcript excerpts: +{transcript[:12000]}""" + + return timed_query(f"Blog Post: {candidate.get('title', '?')}", prompt, temperature=0.5) + + +def main(): + transcript_dir = sys.argv[1] if len(sys.argv) > 1 else \ + "training-data/transcripts/2016-s8e42" + + print(f"Loading transcript from: {transcript_dir}") + transcript = load_transcript(transcript_dir) + print(f"Transcript length: {len(transcript)} chars ({len(transcript.splitlines())} lines)") + print(f"Sending first {min(len(transcript), MAX_TRANSCRIPT_CHARS)} chars to LLM") + + # Output directory + output_dir = Path(transcript_dir) / "generated" + output_dir.mkdir(parents=True, exist_ok=True) + + # Step 1: Analysis + analysis = generate_analysis(transcript) + with open(output_dir / "analysis.json", "w") as f: + json.dump(analysis, f, indent=2) + print(f"\n Saved: {output_dir}/analysis.json") + + # Print summary + if "summary" in analysis: + print(f"\n--- EPISODE SUMMARY ---") + print(analysis["summary"]) + + if "topics" in analysis: + print(f"\n--- TOPICS ---") + for t in analysis["topics"]: + print(f" - {t}") + + if "tags" in analysis: + print(f"\n--- TAGS ---") + print(f" {', '.join(analysis['tags'])}") + + if "blog_post_candidates" in analysis: + print(f"\n--- BLOG POST CANDIDATES ---") + for i, c in enumerate(analysis["blog_post_candidates"], 1): + print(f" {i}. 
{c.get('title', '?')}") + print(f" Angle: {c.get('angle', '?')}") + + # Step 2: Forum post + forum_post = generate_forum_post(transcript, analysis) + with open(output_dir / "forum-post.md", "w") as f: + f.write(forum_post) + print(f"\n Saved: {output_dir}/forum-post.md") + print(f"\n--- FORUM POST ---") + print(forum_post) + + # Step 3: Blog post (pick the first candidate) + candidates = analysis.get("blog_post_candidates", []) + if candidates: + blog_post = generate_blog_post(transcript, candidates[0]) + slug = candidates[0].get("title", "draft").lower().replace(" ", "-")[:50] + with open(output_dir / f"blog-{slug}.md", "w") as f: + f.write(blog_post) + print(f"\n Saved: {output_dir}/blog-{slug}.md") + print(f"\n--- BLOG POST DRAFT ---") + print(blog_post) + else: + print("\n No blog post candidates found, skipping blog generation") + + print(f"\n{'='*60}") + print(f" All outputs saved to: {output_dir}/") + print(f"{'='*60}") + + +if __name__ == "__main__": + main() diff --git a/projects/radio-show/audio-processor/test_segment_first.py b/projects/radio-show/audio-processor/test_segment_first.py new file mode 100644 index 00000000..ece158ae --- /dev/null +++ b/projects/radio-show/audio-processor/test_segment_first.py @@ -0,0 +1,431 @@ +#!/usr/bin/env python3 +"""Segment-first content generation test. + +Architecture: +1. Split transcript at break markers (text-based detection) +2. Analyze each segment individually (full context, no truncation) +3. Cross-segment synthesis (callbacks, recurring topics, narrative arc) +4. 
"""Segment-first content generation pipeline for radio show transcripts.

Pipeline:
1. Split the transcript at commercial-break markers (text-based detection)
2. Analyze each segment individually (full context, no truncation)
3. Synthesize across segments (callbacks, recurring topics, narrative arc)
4. Generate a forum post and a blog post from the complete analysis

Usage:
    test_segment_first.py [TRANSCRIPT_DIR]

TRANSCRIPT_DIR must contain ``transcript.txt``; all outputs are written to
``TRANSCRIPT_DIR/generated-v2``.
"""

import json
import re
import sys
import time
from pathlib import Path

try:
    import ollama
except ImportError:
    # Keep the pure helpers (splitting, JSON parsing) importable without the
    # client library installed; timed_query() reports the problem when used.
    ollama = None

MODEL = "qwen3:14b"
OLLAMA_HOST = "http://localhost:11434"
DEFAULT_TRANSCRIPT_DIR = "training-data/transcripts/2016-s8e42"

# Closing tag that ends the model's reasoning block; everything after the
# last occurrence is the actual answer.
THINK_CLOSE = "</think>"

# Per-segment transcript budget (characters) fed into the blog prompt.
BLOG_SOURCE_CHARS = 15000
BLOG_FALLBACK_CHARS = 10000

client = ollama.Client(host=OLLAMA_HOST) if ollama is not None else None

# Break markers — patterns that indicate commercial breaks
BREAK_START = re.compile(
    r"^(We'll be right back|We will be right back)",
    re.IGNORECASE
)
BREAK_END = re.compile(
    r"^(Welcome back to [Tt]he Computer Guru|All right, if you'd like to be a part of the show)",
    re.IGNORECASE
)
# Station IDs and bumper text that appear during breaks
BREAK_FILLER = re.compile(
    r"^(This is the Computer Guru Show on|This is a computer guru show|"
    r"Your computer guru|Whether you're dealing with|"
    r"Computer running slow|Has your machine somehow|"
    r"Be one with your operating system|"
    r"Listen in, chat in|Want your voice to be heard)",
    re.IGNORECASE
)


def _banner(title: str) -> None:
    """Print a section header framed by two 60-character rules."""
    print(f"\n{'=' * 60}")
    print(f" {title}")
    print(f"{'=' * 60}")


def _as_text_list(value) -> list[str]:
    """Coerce a JSON value to a list of strings.

    Model output does not always follow the requested schema: a field that
    should be a list of strings may arrive as a bare string, as ``None``, or
    as a list holding numbers or objects.
    """
    if value is None:
        return []
    if isinstance(value, str):
        return [value]
    if isinstance(value, (list, tuple)):
        return [str(item) for item in value]
    return [str(value)]


def _strip_thinking(text: str) -> str:
    """Return the text after the last thinking block, or *text* unchanged."""
    _, tag, answer = text.rpartition(THINK_CLOSE)
    return answer.strip() if tag else text


def _slugify(title, max_len: int = 50) -> str:
    """Turn a title into a filename-safe slug; fall back to ``draft``."""
    slug = re.sub(r"[^a-z0-9]+", "-", str(title or "").lower())[:max_len]
    return slug.strip("-") or "draft"


def _build_segment(number: int, start_line: int, end_line: int,
                   content: list[str]) -> dict:
    """Assemble the segment record produced by split_into_segments()."""
    text = "\n".join(content)
    return {
        "number": number,
        "start_line": start_line,
        "end_line": end_line,
        "lines": content,
        "text": text,
        "char_count": len(text),
    }


def _segment_numbers(raw, segment_count: int) -> list[int]:
    """Extract valid, de-duplicated 1-based segment numbers from model output.

    Accepts ints, numeric strings and floats; silently drops anything that is
    not a number or falls outside ``1..segment_count``.
    """
    if not isinstance(raw, (list, tuple)):
        raw = [raw]
    numbers: list[int] = []
    for item in raw:
        if isinstance(item, bool):
            continue
        try:
            num = int(item)
        except (TypeError, ValueError):
            continue
        if 0 < num <= segment_count and num not in numbers:
            numbers.append(num)
    return numbers


def load_transcript(transcript_dir: str) -> list[str]:
    """Load ``transcript.txt`` from *transcript_dir* as a list of lines.

    Prints an error and exits with status 1 when the file is missing.
    Undecodable bytes are replaced rather than aborting the run.
    """
    txt_path = Path(transcript_dir) / "transcript.txt"
    if not txt_path.exists():
        print(f"ERROR: {txt_path} not found")
        sys.exit(1)
    return txt_path.read_text(encoding="utf-8", errors="replace").splitlines()


def split_into_segments(lines: list[str]) -> list[dict]:
    """Split transcript lines into show segments, removing commercial breaks.

    Blank lines and lines matching BREAK_FILLER are dropped everywhere.
    Everything between a BREAK_START line and the next BREAK_END line
    (both markers included) is dropped as well.

    NOTE: if a break starts and no BREAK_END line follows, the remainder of
    the transcript is treated as break content and discarded.

    Returns a list of segments, each with:
    - number: segment number (1-based)
    - start_line: 1-based line number where the segment starts (for segments
      after a break this is the line of the "welcome back" marker)
    - end_line: last line number
    - lines: list of stripped text lines (show content only)
    - text: the lines joined with newlines
    - char_count: length of ``text``
    """
    segments: list[dict] = []
    content: list[str] = []
    segment_start = 1
    in_break = False

    for line_no, raw in enumerate(lines, 1):
        stripped = raw.strip()
        if not stripped:
            continue

        # Break start: close the running segment (if it has any content).
        if not in_break and BREAK_START.match(stripped):
            if content:
                segments.append(
                    _build_segment(len(segments) + 1, segment_start, line_no - 1, content)
                )
            in_break = True
            content = []
            continue

        # Break end: the "welcome back" line itself is transitional, skip it.
        if in_break and BREAK_END.match(stripped):
            in_break = False
            segment_start = line_no
            continue

        # Skip everything inside a break plus station IDs / bumper text.
        if in_break or BREAK_FILLER.match(stripped):
            continue

        content.append(stripped)

    # Don't forget the last segment.
    if content:
        segments.append(
            _build_segment(len(segments) + 1, segment_start, len(lines), content)
        )

    return segments


def timed_query(label: str, prompt: str, temperature: float = 0.3,
                ctx_size: int = 32768) -> str:
    """Run a single-turn Ollama chat query, printing a banner and timing.

    Args:
        label: Heading printed before the query runs.
        prompt: User message sent to the model.
        temperature: Sampling temperature passed in the request options.
        ctx_size: Context window size passed as ``num_ctx``.

    Returns:
        The model's reply with any leading thinking block removed.

    Raises:
        RuntimeError: If the ``ollama`` package is not installed.
    """
    if client is None:
        raise RuntimeError(
            "The 'ollama' package is required to run model queries "
            "(pip install ollama)"
        )

    _banner(label)
    started = time.perf_counter()

    response = client.chat(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}],
        options={"temperature": temperature, "num_ctx": ctx_size},
    )

    elapsed = time.perf_counter() - started
    # Strip thinking tags if qwen3 uses them
    result = _strip_thinking(response["message"]["content"])

    print(f" [{elapsed:.1f}s, {len(result)} chars]")
    return result


def parse_json_response(text: str) -> dict:
    """Parse a JSON object from an LLM response.

    Handles markdown code fences and, as a fallback, prose surrounding the
    object (the span from the first ``{`` to the last ``}`` is tried).

    Returns:
        The parsed object, or ``{}`` (after printing a warning) when nothing
        parses or the parsed value is not a JSON object.
    """
    if "```json" in text:
        text = text.split("```json", 1)[1].split("```", 1)[0]
    elif "```" in text:
        text = text.split("```", 1)[1].split("```", 1)[0]

    candidate = text.strip()
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError as first_error:
        parsed = None
        start, end = candidate.find("{"), candidate.rfind("}")
        if 0 <= start < end:
            try:
                parsed = json.loads(candidate[start:end + 1])
            except json.JSONDecodeError:
                parsed = None
        if parsed is None:
            print(f" WARNING: JSON parse failed: {first_error}")
            print(f" First 300 chars: {text[:300]}")
            return {}

    # Callers use .get() on the result, so anything but an object is unusable.
    if not isinstance(parsed, dict):
        print(f" WARNING: expected a JSON object, got {type(parsed).__name__}")
        return {}
    return parsed


def analyze_segment(segment: dict, segment_count: int) -> dict:
    """Analyze a single segment with its full transcript text.

    Args:
        segment: A record from split_into_segments() (uses ``number``,
            ``text`` and ``char_count``).
        segment_count: Total number of segments in the episode.

    Returns:
        The parsed analysis object, or ``{}`` if the reply was not valid JSON.
    """
    prompt = f"""You are analyzing segment {segment['number']} of {segment_count} from
"The Computer Guru Show", a live call-in radio show hosted by Mike Swanson on AM1030
KVOI in Tucson, Arizona. Co-host Rob is often present. The show takes listener calls
for free tech support and discusses tech news.

This is the COMPLETE transcript of this segment (nothing is truncated).
Analyze it and respond with JSON:

{{
  "title": "Compelling segment title",
  "summary": "3-5 sentence summary of what happened in this segment",
  "key_points": ["array of key takeaway bullet points"],
  "topics": ["array of topics discussed"],
  "speakers": ["array of speakers heard (Mike, Rob, caller names if given)"],
  "caller_questions": ["array of specific questions callers asked, if any"],
  "key_quotes": [
    {{"quote": "exact quote text", "speaker": "who said it", "context": "why notable"}}
  ],
  "blog_worthy_topics": [
    {{"topic": "topic name", "angle": "what makes it worth expanding", "details_from_show": "specific points Mike made that a blog post should include"}}
  ],
  "callbacks": ["any references to earlier segments or topics discussed before the break"]
}}

Respond ONLY with valid JSON.

## Segment {segment['number']} of {segment_count} — Full Transcript

{segment['text']}"""

    result = timed_query(
        f"Segment {segment['number']}/{segment_count} ({segment['char_count']} chars)",
        prompt
    )
    return parse_json_response(result)


def cross_segment_synthesis(segment_analyses: list[dict], segments: list[dict]) -> dict:
    """Synthesize across all segments for episode-level analysis.

    Args:
        segment_analyses: Per-segment analysis objects in segment order;
            empty entries (failed analyses) are skipped but keep their number.
        segments: The segment records. Currently unused; kept so existing
            callers keep working.

    Returns:
        The parsed synthesis object, or ``{}`` if the reply was not valid JSON.
    """
    # Build a compact summary of each segment for the synthesis prompt
    segment_summaries = []
    all_blog_topics = []
    for number, analysis in enumerate(segment_analyses, 1):
        if not analysis:
            continue
        segment_summaries.append(
            f"### Segment {number}: {analysis.get('title', 'Unknown')}\n"
            f"Summary: {analysis.get('summary', 'N/A')}\n"
            f"Topics: {', '.join(_as_text_list(analysis.get('topics')))}\n"
            f"Speakers: {', '.join(_as_text_list(analysis.get('speakers')))}\n"
            f"Key points: {json.dumps(analysis.get('key_points', []))}\n"
            f"Callbacks: {json.dumps(analysis.get('callbacks', []))}"
        )
        blog_topics = analysis.get("blog_worthy_topics", [])
        # Guard against a string/None here: extend() would splice characters.
        if isinstance(blog_topics, list):
            all_blog_topics.extend(blog_topics)

    summaries_text = "\n".join(segment_summaries)

    prompt = f"""You are producing the final episode analysis for "The Computer Guru Show".
Below are analyses of each individual segment. Your job is to synthesize them into a
cohesive episode-level view.

Respond with JSON:

{{
  "episode_title": "A compelling episode title that captures the main theme",
  "episode_summary": "2-3 paragraph summary of the entire episode. Be specific about topics, callers, and conversations. Write in third person, suitable for a podcast episode page.",
  "narrative_arc": "1 paragraph describing how the show flowed — what opened, how topics evolved, what closed it out",
  "recurring_themes": ["topics or ideas that came up across multiple segments"],
  "cross_segment_connections": ["specific callbacks or topic continuations across segments"],
  "all_topics": ["complete deduplicated list of every topic discussed"],
  "all_tags": ["SEO-friendly lowercase hyphenated tags"],
  "top_quotes": [
    {{"quote": "text", "speaker": "name", "context": "why notable", "segment": 1}}
  ],
  "blog_post_candidates": [
    {{
      "title": "Proposed blog post title",
      "angle": "specific thesis or angle",
      "why": "why this deserves expansion",
      "source_segments": [1, 2],
      "key_details_from_show": ["specific points, quotes, and examples from the show to include"]
    }}
  ]
}}

Respond ONLY with valid JSON.

## Per-Segment Analyses

{summaries_text}

## Blog-Worthy Topics Identified Across All Segments

{json.dumps(all_blog_topics, indent=2)}"""

    result = timed_query("Cross-Segment Synthesis", prompt)
    return parse_json_response(result)


def generate_forum_post(synthesis: dict) -> str:
    """Generate a forum discussion post from the episode synthesis.

    Missing synthesis fields fall back to placeholder values, so an empty
    synthesis still produces a (generic) post.
    """
    prompt = f"""Write a community forum discussion post for "The Computer Guru Show" forum.

Episode title: {synthesis.get('episode_title', 'Unknown')}
Summary: {synthesis.get('episode_summary', '')}
Topics: {json.dumps(synthesis.get('all_topics', []))}
Narrative arc: {synthesis.get('narrative_arc', '')}

Rules:
- Conversational, engaging tone that invites discussion
- Brief hook (2-3 sentences about the most interesting thing)
- Bullet list of topics with one-line teasers
- 2-3 discussion questions that invite audience participation
- "Listen to the full episode" call-to-action
- Under 300 words
- Casual, friendly tone
- No emojis
- No markdown headers larger than ###

Write the post now."""

    return timed_query("Forum Post", prompt, temperature=0.5)


def generate_blog_post(synthesis: dict, candidate: dict,
                       segments: list[dict]) -> str:
    """Generate a blog post using segment transcripts as source material.

    Args:
        synthesis: Episode synthesis. Currently unused; kept so existing
            callers keep working.
        candidate: One entry of ``blog_post_candidates`` (uses ``title``,
            ``angle``, ``key_details_from_show`` and ``source_segments``).
        segments: Segment records from split_into_segments().

    Each referenced segment contributes at most BLOG_SOURCE_CHARS characters.
    If the candidate references no valid segment, the first two segments are
    used instead, at most BLOG_FALLBACK_CHARS characters each.
    """
    # Find the source segments referenced by the blog candidate
    source_nums = _segment_numbers(candidate.get("source_segments", [1]), len(segments))
    chunks = [
        f"\n--- Segment {num} transcript ---\n"
        f"{segments[num - 1]['text'][:BLOG_SOURCE_CHARS]}\n"
        for num in source_nums
    ]

    # If no specific segments referenced, use the first two
    if not chunks:
        chunks = [
            f"\n--- Segment {seg['number']} transcript ---\n"
            f"{seg['text'][:BLOG_FALLBACK_CHARS]}\n"
            for seg in segments[:2]
        ]

    source_text = "".join(chunks)

    prompt = f"""Write a blog post for the Computer Guru Show website (radio.azcomputerguru.com).
Author: Mike Swanson — veteran IT professional, radio host in Tucson AZ.

His writing style:
- Explains complex tech in plain English using analogies
- Uses humor — dry, self-deprecating, occasionally sarcastic
- Gives practical, actionable advice
- Takes strong positions on consumer rights, privacy, and corporate BS
- Speaks directly to the reader like a friend
- References real conversations from the show

Blog post details:
- Title: {candidate.get('title', 'Untitled')}
- Angle: {candidate.get('angle', '')}
- Key details from show: {json.dumps(candidate.get('key_details_from_show', []))}

Format:
1. Engaging opening paragraph (hook the reader with something from the show)
2. 3-5 sections with ### subheadings
3. "What This Means for You" practical section
4. Key Takeaways (bullet points)
5. Closing that ties back to the show conversation

Target: 800-1200 words. First person as Mike Swanson.
End with: "This topic was discussed on The Computer Guru Show. Listen to the full episode for more."

IMPORTANT: Draw directly from the transcript below. Use Mike's actual words, analogies, and
examples — not generic filler. If Mike made a joke or analogy on air, reference it in the post.

## Source transcript from the show:
{source_text}"""

    return timed_query(f"Blog: {candidate.get('title', '?')}", prompt, temperature=0.5)


def _write_json(path: Path, data) -> None:
    """Write *data* as indented JSON, always UTF-8 regardless of locale."""
    path.write_text(json.dumps(data, indent=2), encoding="utf-8")


def main() -> None:
    """Run the full pipeline on the transcript directory given in argv[1]."""
    transcript_dir = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_TRANSCRIPT_DIR

    print(f"Loading transcript from: {transcript_dir}")
    lines = load_transcript(transcript_dir)
    print(f"Total lines: {len(lines)}")

    # Step 1: Split into segments
    _banner("STEP 1: Splitting into segments")
    segments = split_into_segments(lines)
    if not segments:
        # Nothing to analyze; querying the model would only invent content.
        print(" ERROR: no show content found in transcript")
        sys.exit(1)
    print(f" Found {len(segments)} segments:\n")
    for seg in segments:
        print(f" Segment {seg['number']}: lines {seg['start_line']}-{seg['end_line']}, "
              f"{seg['char_count']} chars, {len(seg['lines'])} lines")
        # Show first line as preview
        preview = seg['lines'][0][:80] if seg['lines'] else "(empty)"
        print(f" Preview: {preview}")

    output_dir = Path(transcript_dir) / "generated-v2"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save segments for reference ('lines' duplicates 'text', so leave it out)
    segments_meta = [{k: v for k, v in s.items() if k != 'lines'} for s in segments]
    _write_json(output_dir / "segments.json", segments_meta)

    # Step 2: Analyze each segment
    _banner(f"STEP 2: Analyzing {len(segments)} segments individually")
    segment_analyses = []
    for seg in segments:
        analysis = analyze_segment(seg, len(segments))
        segment_analyses.append(analysis)

        # Save individual segment analysis
        _write_json(output_dir / f"segment-{seg['number']}-analysis.json", analysis)

        if analysis:
            print(f" Title: {analysis.get('title', '?')}")
            print(f" Topics: {', '.join(_as_text_list(analysis.get('topics')))}")

    # Step 3: Cross-segment synthesis
    _banner("STEP 3: Cross-segment synthesis")
    synthesis = cross_segment_synthesis(segment_analyses, segments)
    _write_json(output_dir / "synthesis.json", synthesis)

    if synthesis:
        print(f"\n Episode title: {synthesis.get('episode_title', '?')}")
        print(f" Recurring themes: {synthesis.get('recurring_themes', [])}")
        print("\n Episode summary:")
        print(f" {str(synthesis.get('episode_summary') or 'N/A')[:500]}")

    # Step 4: Generate forum post
    _banner("STEP 4: Generate content")
    forum_post = generate_forum_post(synthesis)
    (output_dir / "forum-post.md").write_text(forum_post, encoding="utf-8")
    print("\n--- FORUM POST ---")
    print(forum_post)

    # Step 5: Generate blog post from best candidate
    raw_candidates = synthesis.get("blog_post_candidates", [])
    if not isinstance(raw_candidates, list):
        raw_candidates = []
    candidates = [c for c in raw_candidates if isinstance(c, dict)]
    if candidates:
        blog_post = generate_blog_post(synthesis, candidates[0], segments)
        slug = _slugify(candidates[0].get("title", "draft"))
        (output_dir / f"blog-{slug}.md").write_text(blog_post, encoding="utf-8")
        print("\n--- BLOG POST ---")
        print(blog_post)
    else:
        print("\n No blog post candidates found, skipping blog generation")

    # Summary
    _banner(f"COMPLETE — All outputs in: {output_dir}/")
    print(f" Segments analyzed: {len(segments)}")
    print(f" Per-segment analyses: {sum(1 for a in segment_analyses if a)}")
    print(f" Blog candidates: {len(candidates)}")
    print(f" Files generated: {len(list(output_dir.iterdir()))}")


if __name__ == "__main__":
    main()