Files
claudetools/projects/radio-show/website/scripts/import-episodes.mjs
Mike Swanson ee89727662 Radio show website: Full Astro build with 194 episodes imported
Complete website for The Computer Guru Show (radio.azcomputerguru.com):
- Astro 6.0.4 static site with React islands
- 194 episodes imported from gurushow.com RSS feed
- Dark/light mode HSL design system
- Persistent audio player with session persistence
- Episode archive with search and season filtering
- Home page with animated hero, stats, latest episodes
- All pages: About, Subscribe, Community, Live, Contact, Blog, 404
- Podcast RSS feed with iTunes namespace
- Session log updated

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 20:44:42 -07:00

571 lines
15 KiB
JavaScript

/**
* import-episodes.mjs
*
* Fetches all episodes from the Computer Guru Show podcast RSS feed
* and converts them into Astro content collection markdown files.
*
* Usage:
* node scripts/import-episodes.mjs
*
* Dependencies: fast-xml-parser, turndown
*/
import { writeFileSync, mkdirSync, existsSync } from "node:fs";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
import { XMLParser } from "fast-xml-parser";
import TurndownService from "turndown";
// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------
// Podcast RSS feed that main() fetches the episodes from.
const FEED_URL = "https://gurushow.com/feed/podcast";
// ES modules do not define __dirname, so derive this script's directory
// from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Parent of the directory that contains this script.
const PROJECT_ROOT = join(__dirname, "..");
// Directory the generated episode markdown files are written to.
const EPISODES_DIR = join(PROJECT_ROOT, "src", "content", "episodes");
/**
 * Common tech keywords mapped to tag slugs.
 *
 * Each key is a regular-expression SOURCE string, not a literal keyword:
 * extractTags() compiles it with `new RegExp(key, "i")`. Consequences:
 * - "\\b" marks a word boundary, so "\\bai\\b" only matches the standalone
 *   word "ai".
 * - "." matches any single character, so "social.media" matches
 *   "social media" as well as "social-media".
 * - Keys without "\\b" match as substrings (e.g. "hack" also matches inside
 *   longer words).
 *
 * Several keys may map to the same tag slug; extractTags() deduplicates.
 */
const TAG_KEYWORDS = new Map([
["apple", "apple"],
["iphone", "iphone"],
["ipad", "ipad"],
["\\bmac\\b", "mac"],
["macbook", "macbook"],
["google", "google"],
["android", "android"],
["chrome", "chrome"],
["chromebook", "chromebook"],
["microsoft", "microsoft"],
["windows", "windows"],
["security", "security"],
["privacy", "privacy"],
["hacking", "security"],
["hack", "security"],
["malware", "malware"],
["ransomware", "ransomware"],
["virus", "malware"],
["\\bai\\b", "ai"],
["artificial.intelligence", "ai"],
["chatgpt", "ai"],
["openai", "ai"],
["machine.learning", "ai"],
["space", "space"],
["spacex", "space"],
["nasa", "space"],
["bitcoin", "cryptocurrency"],
["crypto", "cryptocurrency"],
["cryptocurrency", "cryptocurrency"],
["blockchain", "cryptocurrency"],
["net.neutrality", "net-neutrality"],
["gaming", "gaming"],
["playstation", "gaming"],
["xbox", "gaming"],
["nintendo", "gaming"],
["social.media", "social-media"],
["facebook", "facebook"],
["twitter", "twitter"],
["instagram", "instagram"],
["tiktok", "social-media"],
["amazon", "amazon"],
["alexa", "amazon"],
["tesla", "tesla"],
["samsung", "samsung"],
["cybersecurity", "security"],
["phishing", "security"],
["data.breach", "security"],
["linux", "linux"],
["ubuntu", "linux"],
["roku", "streaming"],
["netflix", "streaming"],
["streaming", "streaming"],
["5g", "5g"],
["\\bvr\\b", "vr"],
["virtual.reality", "vr"],
["augmented.reality", "ar"],
["drone", "drones"],
["robot", "robotics"],
]);
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Decode common HTML entities that appear in RSS feed titles/descriptions.
 *
 * Handles numeric decimal (&#8211;), numeric hex (&#x2019;), and a small set
 * of named (&amp;) entities. Decoding happens in a single pass, so text
 * produced by one replacement is never decoded a second time: "&amp;lt;"
 * yields the literal text "&lt;", not "<".
 *
 * Unknown named entities and numeric references outside the Unicode range
 * are left untouched instead of throwing.
 *
 * @param {string} text - Raw text that may contain HTML entities.
 * @returns {string} The decoded text, or "" for null/undefined/empty input.
 */
function decodeHtmlEntities(text) {
  if (!text) return "";

  // A Map (rather than an object literal) keeps names such as "constructor"
  // from resolving to inherited prototype members.
  const namedEntities = new Map([
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
    ["quot", '"'],
    ["apos", "'"],
    ["nbsp", " "],
    ["ndash", "\u2013"],
    ["mdash", "\u2014"],
    ["lsquo", "\u2018"],
    ["rsquo", "\u2019"],
    ["ldquo", "\u201C"],
    ["rdquo", "\u201D"],
    ["hellip", "\u2026"],
  ]);

  // String.fromCodePoint throws a RangeError above this value.
  const MAX_CODE_POINT = 0x10ffff;

  return text.replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z]+));/g,
    (entity, dec, hex, name) => {
      if (name !== undefined) {
        return namedEntities.get(name) ?? entity;
      }
      const codePoint =
        dec !== undefined ? Number(dec) : Number.parseInt(hex, 16);
      return codePoint <= MAX_CODE_POINT
        ? String.fromCodePoint(codePoint)
        : entity;
    }
  );
}
/**
 * Pull the season and episode numbers out of an episode title.
 *
 * Two notations are tried in order, both case-insensitive:
 *   1. the compact code "S10E21" (whitespace allowed between the S and E
 *      parts, e.g. "s02 e05"), found anywhere in the title;
 *   2. the long form "Season X Episode Y".
 *
 * @param {string} title - Episode title to inspect.
 * @returns {{season: number, episode: number} | null} Parsed numbers, or
 *   null when neither notation is present.
 */
function parseSeasonEpisode(title) {
  const notations = [
    /S(\d+)\s*E(\d+)/i,
    /Season\s+(\d+)\s+Episode\s+(\d+)/i,
  ];

  for (const notation of notations) {
    const found = title.match(notation);
    if (found) {
      const [, seasonDigits, episodeDigits] = found;
      return {
        season: Number.parseInt(seasonDigits, 10),
        episode: Number.parseInt(episodeDigits, 10),
      };
    }
  }

  return null;
}
/**
 * Produce a display title from a raw feed title: HTML entities are decoded,
 * surrounding whitespace is trimmed, and a leading "Podcast " prefix
 * (matched case-insensitively) is dropped.
 *
 * @param {string} rawTitle - Title text as taken from the feed.
 * @returns {string} The cleaned title.
 */
function cleanTitle(rawTitle) {
  const decoded = decodeHtmlEntities(rawTitle).trim();
  return decoded.replace(/^Podcast\s+/i, "");
}
/**
 * Build a URL-safe slug from an episode title.
 *
 * A leading "S{n}E{n}" code (with an optional "-", en dash or ":" separator)
 * is removed first so the slug describes the content; if nothing is left,
 * the full title is used instead. The text is lowercased, every run of
 * characters outside a-z/0-9 becomes a single hyphen, leading/trailing
 * hyphens are dropped, and the result is capped at 60 characters (without a
 * dangling hyphen at the cut).
 *
 * @param {string} title - Cleaned episode title.
 * @returns {string} The slug, or "untitled" when no usable characters remain.
 */
function slugify(title) {
  const MAX_LENGTH = 60;

  const stripped = title.replace(/^S\d+\s*E\d+\s*[-\u2013:]?\s*/i, "").trim();
  const source = stripped || title;

  // Splitting on non-alphanumeric runs and re-joining with "-" collapses
  // separators and drops leading/trailing ones in a single step.
  const words = source
    .toLowerCase()
    .split(/[^a-z0-9]+/)
    .filter((word) => word.length > 0);
  let slug = words.join("-");

  if (slug.length > MAX_LENGTH) {
    slug = slug.slice(0, MAX_LENGTH);
    if (slug.endsWith("-")) {
      slug = slug.slice(0, -1);
    }
  }

  return slug === "" ? "untitled" : slug;
}
/**
 * Build the markdown filename for an episode.
 *
 * Format: s{SS}e{EE}-{slug}.md, where season and episode are left-padded
 * with zeros to at least two digits and the slug comes from slugify(title).
 *
 * @param {number} season - Season number.
 * @param {number} episode - Episode number.
 * @param {string} title - Episode title used for the slug part.
 * @returns {string} The filename, including the ".md" extension.
 */
function episodeFilename(season, episode, title) {
  const twoDigits = (value) => String(value).padStart(2, "0");
  const seasonPart = twoDigits(season);
  const episodePart = twoDigits(episode);
  return `s${seasonPart}e${episodePart}-${slugify(title)}.md`;
}
/**
 * Derive tag slugs for an episode by keyword matching.
 *
 * The title and description are joined into one lowercased string, and every
 * TAG_KEYWORDS pattern is compiled as a case-insensitive regular expression
 * and tested against it.
 *
 * @param {string} title - Episode title.
 * @param {string} description - Episode description text.
 * @returns {string[]} Deduplicated tag slugs in sorted order.
 */
function extractTags(title, description) {
  const haystack = `${title} ${description}`.toLowerCase();

  const matchedTags = [...TAG_KEYWORDS]
    .filter(([pattern]) => new RegExp(pattern, "i").test(haystack))
    .map(([, tag]) => tag);

  return [...new Set(matchedTags)].sort();
}
/**
 * Turn an ISO date string or RSS pubDate into YYYY-MM-DD.
 *
 * The day is taken from the date's ISO (UTC) representation. Missing or
 * unparseable input yields the placeholder "1970-01-01".
 *
 * @param {string} dateStr - Date text to convert.
 * @returns {string} The date part of the ISO timestamp.
 */
function formatDate(dateStr) {
  const FALLBACK = "1970-01-01";
  if (!dateStr) return FALLBACK;

  const parsed = new Date(dateStr);
  if (Number.isNaN(parsed.getTime())) return FALLBACK;

  const [day] = parsed.toISOString().split("T");
  return day;
}
/**
 * Upgrade an audio URL to HTTPS.
 *
 * Only a leading "http://" scheme (any letter case) is rewritten; every
 * other URL is returned unchanged.
 *
 * @param {string} url - Audio URL from the feed.
 * @returns {string} The HTTPS URL, or "" when no URL was given.
 */
function normalizeAudioUrl(url) {
  return url ? url.replace(/^http:\/\//i, "https://") : "";
}
/**
 * Render a value as a double-quoted YAML scalar.
 *
 * The value is stringified and escaped with JSON string rules, which YAML's
 * double-quoted style accepts. Besides backslashes and double quotes, this
 * also escapes newlines, tabs and other control characters, so a value
 * containing a line break stays on a single front-matter line instead of
 * being folded or breaking the YAML.
 *
 * @param {*} value - Value to render; converted with String().
 * @returns {string} The quoted scalar; '""' for null or undefined.
 */
function yamlString(value) {
  if (value === undefined || value === null) return '""';
  return JSON.stringify(String(value));
}
// ---------------------------------------------------------------------------
// Turndown (HTML -> Markdown) setup
// ---------------------------------------------------------------------------
/**
 * Create the TurndownService instance used for HTML -> Markdown conversion.
 *
 * The service is configured for ATX headings, "-" bullet markers, fenced
 * code blocks and "*" emphasis, and is told to remove <script> and <style>
 * elements.
 *
 * @returns {TurndownService} The configured service.
 */
function createTurndown() {
  const options = {
    headingStyle: "atx",
    bulletListMarker: "-",
    codeBlockStyle: "fenced",
    emDelimiter: "*",
  };
  const service = new TurndownService(options);

  const removedElements = ["script", "style"];
  service.remove(removedElements);

  return service;
}
/**
 * Convert an HTML fragment to tidy markdown.
 *
 * Anything that is not a non-blank string (null, undefined, non-strings,
 * whitespace-only text) yields "". Otherwise the HTML is converted by the
 * given service, runs of three or more newlines are reduced to one blank
 * line, leftover HTML entities are decoded, and the result is trimmed.
 *
 * @param {string} html - HTML to convert.
 * @param {{turndown: function(string): string}} turndown - Converter service.
 * @returns {string} The markdown text.
 */
function htmlToMarkdown(html, turndown) {
  const hasContent = typeof html === "string" && html.trim().length > 0;
  if (!hasContent) return "";

  const converted = turndown.turndown(html);
  const compacted = converted.replace(/\n{3,}/g, "\n\n");
  return decodeHtmlEntities(compacted).trim();
}
// ---------------------------------------------------------------------------
// Feed fetching and parsing
// ---------------------------------------------------------------------------
/**
 * Download the RSS feed and return its body as text.
 *
 * Progress is logged to the console. The size reported after download is
 * the length of the received string.
 *
 * @param {string} url - Feed URL to request.
 * @returns {Promise<string>} The response body.
 * @throws {Error} When the response status is not OK.
 */
async function fetchFeed(url) {
  console.log(`Fetching RSS feed from ${url} ...`);

  const requestHeaders = {
    "User-Agent": "GuruShowImporter/1.0",
    Accept: "application/rss+xml, application/xml, text/xml",
  };
  const response = await fetch(url, { headers: requestHeaders });

  if (response.ok) {
    const xml = await response.text();
    console.log(`Received ${xml.length.toLocaleString()} bytes of XML.`);
    return xml;
  }

  const { status, statusText } = response;
  throw new Error(`Failed to fetch feed: ${status} ${statusText}`);
}
/**
 * Parse RSS XML and return the channel's episode items.
 *
 * Parser settings: attributes are kept under an "@_" prefix, CDATA sections
 * are exposed as "__cdata", text nodes as "#text", values are not trimmed,
 * and "item" elements are always parsed as arrays.
 *
 * @param {string} xml - RSS document text.
 * @returns {object[]} The parsed <item> nodes (never empty).
 * @throws {Error} When the document has no rss/channel element or the
 *   channel contains no items.
 */
function parseFeed(xml) {
  const parserOptions = {
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    cdataPropName: "__cdata",
    textNodeName: "#text",
    trimValues: false,
    isArray: (name) => name === "item",
  };
  const parsed = new XMLParser(parserOptions).parse(xml);

  const channel = parsed?.rss?.channel;
  if (!channel) {
    throw new Error("Invalid RSS feed: no channel element found.");
  }

  const { item: items } = channel;
  const hasItems = Array.isArray(items) && items.length > 0;
  if (!hasItems) {
    throw new Error("No episodes found in feed.");
  }

  console.log(`Parsed ${items.length} episode items from feed.`);
  return items;
}
// ---------------------------------------------------------------------------
// Episode extraction
// ---------------------------------------------------------------------------
/**
 * Read the text value of a parsed XML node.
 *
 * - null/undefined           -> ""
 * - string                   -> trimmed string
 * - number                   -> its string form
 * - object with "__cdata"    -> trimmed CDATA text ("" if it is not a string)
 * - object with "#text"      -> trimmed string form of the text node
 * - anything else            -> ""
 *
 * "__cdata" takes precedence over "#text" when both are present.
 *
 * @param {*} node - Value produced by the XML parser.
 * @returns {string} The extracted text.
 */
function extractText(node) {
  if (node === undefined || node === null) return "";

  switch (typeof node) {
    case "string":
      return node.trim();
    case "number":
      return String(node);
    case "object": {
      const cdata = node.__cdata;
      if (cdata !== undefined) {
        return typeof cdata === "string" ? cdata.trim() : "";
      }
      const text = node["#text"];
      return text === undefined ? "" : String(text).trim();
    }
    default:
      return "";
  }
}
/**
 * Turn one parsed RSS <item> into an episode data object.
 *
 * - Season/episode are parsed from the raw title. When that fails, season 0
 *   and episode `index + 1` are used and a warning is recorded.
 * - The body prefers content:encoded and falls back to description; an
 *   empty body records a warning.
 * - Audio URL and size come from the enclosure (attribute form first, then
 *   child-element form); the URL is normalized to HTTPS.
 * - Tags are derived from the cleaned title and the description field.
 *
 * @param {object} item - Parsed RSS item node.
 * @param {number} index - Position of the item in the feed.
 * @param {string[]} warnings - Collector that warning messages are pushed to.
 * @returns {{title: string, season: number, episode: number, pubDate: string,
 *   bodyHtml: string, audioUrl: string, audioSize: number, duration: string,
 *   link: string, tags: string[]}} The episode data.
 */
function processItem(item, index, warnings) {
  const rawTitle = extractText(item.title);
  const title = cleanTitle(rawTitle);

  // Season / episode numbers, with a positional fallback.
  const parsed = parseSeasonEpisode(rawTitle);
  const { season, episode } = parsed ?? { season: 0, episode: index + 1 };
  if (!parsed) {
    warnings.push(
      `[WARNING] Could not parse S/E from title: "${title}" -- using season=0, episode=${episode}`
    );
  }

  const pubDate = formatDate(extractText(item.pubDate));

  // Body HTML: content:encoded wins over description.
  const contentEncoded = extractText(item["content:encoded"]);
  const description = extractText(item.description);
  const bodyHtml = contentEncoded || description || "";
  const bodyIsEmpty = !bodyHtml || bodyHtml.trim().length === 0;
  if (bodyIsEmpty) {
    warnings.push(`[WARNING] Empty description for: "${title}"`);
  }

  // Audio enclosure: attributes first, child elements as fallback.
  const { enclosure } = item;
  const audioUrl = enclosure
    ? normalizeAudioUrl(enclosure["@_url"] || extractText(enclosure.url) || "")
    : "";
  const audioSize = enclosure
    ? parseInt(
        enclosure["@_length"] || extractText(enclosure.length) || "0",
        10
      ) || 0
    : 0;

  const duration = extractText(item["itunes:duration"]) || "";
  const link = extractText(item.link) || "";
  const tags = extractTags(title, description);

  return {
    title,
    season,
    episode,
    pubDate,
    bodyHtml,
    audioUrl,
    audioSize,
    duration,
    link,
    tags,
  };
}
// ---------------------------------------------------------------------------
// Markdown file generation
// ---------------------------------------------------------------------------
/**
 * Render an episode as a markdown document with YAML front matter.
 *
 * The front matter lists, in order: title, season, episode, pubDate,
 * duration, audioUrl, audioSize, episodeType ("full"), originalUrl,
 * featured (false), classic (false) and tags. It is followed by a blank
 * line, the body converted from the episode's HTML, and a trailing newline.
 *
 * @param {object} ep - Episode data as produced by processItem().
 * @param {object} turndown - HTML -> Markdown converter service.
 * @returns {string} The complete file content.
 */
function generateMarkdown(ep, turndown) {
  const body = htmlToMarkdown(ep.bodyHtml, turndown);
  const quotedTags = ep.tags.map((tag) => `"${tag}"`).join(", ");

  const fields = [
    ["title", yamlString(ep.title)],
    ["season", ep.season],
    ["episode", ep.episode],
    ["pubDate", ep.pubDate],
    ["duration", yamlString(ep.duration)],
    ["audioUrl", yamlString(ep.audioUrl)],
    ["audioSize", ep.audioSize],
    ["episodeType", '"full"'],
    ["originalUrl", yamlString(ep.link)],
    ["featured", "false"],
    ["classic", "false"],
    ["tags", `[${quotedTags}]`],
  ];

  const frontmatter = [
    "---",
    ...fields.map(([key, value]) => `${key}: ${value}`),
    "---",
  ].join("\n");

  return `${frontmatter}\n\n${body}\n`;
}
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
/**
 * Run the import: fetch the feed, convert every item to an episode, and
 * write one markdown file per episode into EPISODES_DIR.
 *
 * Episodes are written in season/episode order. When two episodes produce
 * the same filename, later ones get a numeric suffix ("-2", "-3", ...)
 * chosen so that the name is unique within the run. The suffix is
 * deterministic, so re-running the import overwrites the same files instead
 * of creating new ones. A summary, all collected warnings, and a preview of
 * the first three files written are logged at the end.
 *
 * @returns {Promise<void>} Resolves when all files are written; rejects if
 *   fetching, parsing or writing fails.
 */
async function main() {
  console.log("=== Computer Guru Show Episode Importer ===\n");

  // Ensure output directory exists
  if (!existsSync(EPISODES_DIR)) {
    mkdirSync(EPISODES_DIR, { recursive: true });
    console.log(`Created directory: ${EPISODES_DIR}`);
  }

  // Fetch and parse
  const xml = await fetchFeed(FEED_URL);
  const items = parseFeed(xml);

  // Process episodes
  const turndown = createTurndown();
  const warnings = [];
  const episodes = items.map((item, i) => processItem(item, i, warnings));

  // Sort by season then episode (oldest first)
  episodes.sort((a, b) => {
    if (a.season !== b.season) return a.season - b.season;
    return a.episode - b.episode;
  });

  // Write files
  const SAMPLE_COUNT = 3;
  const usedFilenames = new Set();
  const samples = [];
  let written = 0;
  let resolvedDuplicates = 0;

  for (const ep of episodes) {
    const baseName = episodeFilename(ep.season, ep.episode, ep.title);

    // Resolve collisions with a counter suffix. Looping until the name is
    // unused guarantees no file written in this run is overwritten.
    let filename = baseName;
    for (let n = 2; usedFilenames.has(filename); n++) {
      filename = baseName.replace(/\.md$/, `-${n}.md`);
    }
    if (filename !== baseName) {
      warnings.push(
        `[WARNING] Duplicate filename resolved: ${filename} for "${ep.title}"`
      );
      resolvedDuplicates++;
    }
    usedFilenames.add(filename);

    const markdown = generateMarkdown(ep, turndown);
    writeFileSync(join(EPISODES_DIR, filename), markdown, "utf-8");
    written++;

    // Keep the first few results so the preview shows what was really written.
    if (samples.length < SAMPLE_COUNT) {
      samples.push({ filename, markdown });
    }
  }

  // Report
  console.log("\n=== Import Complete ===");
  console.log(`Total episodes in feed: ${items.length}`);
  console.log(`Files written: ${written}`);
  if (resolvedDuplicates > 0) {
    console.log(`Duplicate filenames resolved: ${resolvedDuplicates}`);
  }
  if (warnings.length > 0) {
    console.log(`\n--- Warnings (${warnings.length}) ---`);
    for (const w of warnings) {
      console.log(w);
    }
  }

  // Show a sample of generated files
  console.log("\n--- Sample Output (first 3 files) ---");
  for (const { filename, markdown } of samples) {
    console.log(`\nFile: ${filename}`);
    // Show first 20 lines
    const lines = markdown.split("\n");
    console.log(lines.slice(0, 20).join("\n"));
    if (lines.length > 20) {
      console.log(" ...(truncated)");
    }
  }

  console.log("\n[SUCCESS] Episode import finished.");
}
// Script entry point: run the import; on failure, log the error message and
// stack trace, then exit with status 1.
main().catch((failure) => {
  const summary = `[ERROR] Import failed: ${failure.message}`;
  console.error(summary);
  console.error(failure.stack);
  process.exit(1);
});