/**
 * import-episodes.mjs
 *
 * Fetches all episodes from the Computer Guru Show podcast RSS feed
 * and converts them into Astro content collection markdown files.
 *
 * Usage:
 *   node scripts/import-episodes.mjs
 *
 * Dependencies: fast-xml-parser, turndown
 */

import { writeFileSync, mkdirSync, existsSync } from "node:fs";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
import { XMLParser } from "fast-xml-parser";
import TurndownService from "turndown";

// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------

const FEED_URL = "https://gurushow.com/feed/podcast";

/** Abort the feed download if it has not completed within this many ms. */
const FETCH_TIMEOUT_MS = 60_000;

const __dirname = dirname(fileURLToPath(import.meta.url));
const PROJECT_ROOT = join(__dirname, "..");
const EPISODES_DIR = join(PROJECT_ROOT, "src", "content", "episodes");

/**
 * Common tech keywords mapped to tag slugs.
 * Keys are regular-expression sources (so "." matches any single character
 * and "\\b" is a word boundary); values are the tag slug to emit on a match.
 */
const TAG_KEYWORDS = new Map([
  ["apple", "apple"],
  ["iphone", "iphone"],
  ["ipad", "ipad"],
  ["\\bmac\\b", "mac"],
  ["macbook", "macbook"],
  ["google", "google"],
  ["android", "android"],
  ["chrome", "chrome"],
  ["chromebook", "chromebook"],
  ["microsoft", "microsoft"],
  ["windows", "windows"],
  ["security", "security"],
  ["privacy", "privacy"],
  ["hacking", "security"],
  ["hack", "security"],
  ["malware", "malware"],
  ["ransomware", "ransomware"],
  ["virus", "malware"],
  ["\\bai\\b", "ai"],
  ["artificial.intelligence", "ai"],
  ["chatgpt", "ai"],
  ["openai", "ai"],
  ["machine.learning", "ai"],
  ["space", "space"],
  ["spacex", "space"],
  ["nasa", "space"],
  ["bitcoin", "cryptocurrency"],
  ["crypto", "cryptocurrency"],
  ["cryptocurrency", "cryptocurrency"],
  ["blockchain", "cryptocurrency"],
  ["net.neutrality", "net-neutrality"],
  ["gaming", "gaming"],
  ["playstation", "gaming"],
  ["xbox", "gaming"],
  ["nintendo", "gaming"],
  ["social.media", "social-media"],
  ["facebook", "facebook"],
  ["twitter", "twitter"],
  ["instagram", "instagram"],
  ["tiktok", "social-media"],
  ["amazon", "amazon"],
  ["alexa", "amazon"],
  ["tesla", "tesla"],
  ["samsung", "samsung"],
  ["cybersecurity", "security"],
  ["phishing", "security"],
  ["data.breach", "security"],
  ["linux", "linux"],
  ["ubuntu", "linux"],
  ["roku", "streaming"],
  ["netflix", "streaming"],
  ["streaming", "streaming"],
  ["5g", "5g"],
  ["\\bvr\\b", "vr"],
  ["virtual.reality", "vr"],
  ["augmented.reality", "ar"],
  ["drone", "drones"],
  ["robot", "robotics"],
]);

/** TAG_KEYWORDS compiled once so extractTags() does not rebuild regexes per call. */
const TAG_MATCHERS = [...TAG_KEYWORDS].map(([pattern, tag]) => [
  new RegExp(pattern, "i"),
  tag,
]);

/** Named HTML entities (without the surrounding "&" and ";") and their characters. */
const NAMED_ENTITIES = Object.freeze({
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  nbsp: " ",
  ndash: "\u2013",
  mdash: "\u2014",
  lsquo: "\u2018",
  rsquo: "\u2019",
  ldquo: "\u201C",
  rdquo: "\u201D",
  hellip: "\u2026",
});

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/**
 * Decode common HTML entities that appear in RSS feed titles/descriptions.
 * Handles numeric (&#8211;), hex (&#x2019;), and named (&amp;) entities.
 *
 * Decoding happens in a single pass, so text produced by one replacement is
 * never decoded again ("&amp;#8211;" becomes "&#8211;", not an en dash).
 * Unknown names and out-of-range code points are left untouched.
 *
 * @param {string} text - Raw text, possibly containing entities.
 * @returns {string} Decoded text; "" for falsy input.
 */
function decodeHtmlEntities(text) {
  if (!text) return "";

  return text.replace(
    /&(#[xX][0-9a-fA-F]+|#\d+|[A-Za-z][A-Za-z0-9]*);/g,
    (match, body) => {
      if (body[0] !== "#") {
        return Object.hasOwn(NAMED_ENTITIES, body) ? NAMED_ENTITIES[body] : match;
      }
      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = isHex
        ? Number.parseInt(body.slice(2), 16)
        : Number.parseInt(body.slice(1), 10);
      // String.fromCodePoint throws RangeError above U+10FFFF; keep the
      // original text instead of aborting the whole import.
      if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) return match;
      return String.fromCodePoint(codePoint);
    }
  );
}

/**
 * Parse season and episode numbers from a title string.
 * Looks for patterns like "S10E21", "S1E3", "s02e05", case-insensitive.
 *
 * @param {string} title - Episode title.
 * @returns {{season: number, episode: number} | null} Parsed numbers, or
 *   null if neither pattern is found.
 */
function parseSeasonEpisode(title) {
  // Pattern: S{n}E{n} anywhere in the title (with optional space between S and E parts)
  const match = title.match(/S(\d+)\s*E(\d+)/i);
  if (match) {
    return {
      season: Number.parseInt(match[1], 10),
      episode: Number.parseInt(match[2], 10),
    };
  }

  // Fallback: "Season X Episode Y" pattern
  const longMatch = title.match(/Season\s+(\d+)\s+Episode\s+(\d+)/i);
  if (longMatch) {
    return {
      season: Number.parseInt(longMatch[1], 10),
      episode: Number.parseInt(longMatch[2], 10),
    };
  }

  return null;
}

/**
 * Decode entities in a raw feed title and strip a leading "Podcast " prefix.
 *
 * @param {string} rawTitle - Title text as extracted from the feed.
 * @returns {string} Cleaned title.
 */
function cleanTitle(rawTitle) {
  const decoded = decodeHtmlEntities(rawTitle).trim();
  // Remove leading "Podcast " (case-insensitive)
  return decoded.replace(/^Podcast\s+/i, "");
}

/**
 * Generate a URL-safe slug from a title string.
 * - Lowercase
 * - Replace non-alphanumeric chars with hyphens
 * - Collapse multiple hyphens
 * - Trim hyphens from ends
 * - Max 60 characters
 *
 * @param {string} title - Cleaned episode title.
 * @returns {string} Slug, or "untitled" when nothing alphanumeric remains.
 */
function slugify(title) {
  // Remove the S{n}E{n} prefix before slugifying to keep the slug about content
  const withoutCode = title
    .replace(/^S\d+\s*E\d+\s*[-\u2013:]?\s*/i, "")
    .trim();
  const base = withoutCode || title;

  let slug = base
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "");

  if (slug.length > 60) {
    // Truncation can leave a trailing hyphen; drop it.
    slug = slug.substring(0, 60).replace(/-$/, "");
  }

  return slug || "untitled";
}

/**
 * Generate a filename for an episode.
 * Format: s{SS}e{EE}-{slug}.md
 *
 * @param {number} season - Season number.
 * @param {number} episode - Episode number.
 * @param {string} title - Cleaned episode title.
 * @returns {string} Markdown filename.
 */
function episodeFilename(season, episode, title) {
  const s = String(season).padStart(2, "0");
  const e = String(episode).padStart(2, "0");
  return `s${s}e${e}-${slugify(title)}.md`;
}

/**
 * Return `filename` if it is not in `taken`, otherwise the first of
 * "name-2.md", "name-3.md", ... that is free. Deterministic across runs.
 *
 * @param {string} filename - Preferred ".md" filename.
 * @param {Set<string>} taken - Filenames already used in this run.
 * @returns {string} A filename not present in `taken`.
 */
function uniqueFilename(filename, taken) {
  if (!taken.has(filename)) return filename;

  const stem = filename.replace(/\.md$/, "");
  let counter = 2;
  let candidate = `${stem}-${counter}.md`;
  while (taken.has(candidate)) {
    counter += 1;
    candidate = `${stem}-${counter}.md`;
  }
  return candidate;
}

/**
 * Extract tags from a title and description based on keyword matching.
 *
 * @param {string} title - Episode title.
 * @param {string} description - Episode description text.
 * @returns {string[]} Deduplicated, sorted array of tag slugs.
 */
function extractTags(title, description) {
  const text = `${title} ${description}`.toLowerCase();
  const tags = new Set();

  for (const [regex, tag] of TAG_MATCHERS) {
    if (regex.test(text)) {
      tags.add(tag);
    }
  }

  return [...tags].sort();
}

/**
 * Convert an ISO date string or RSS pubDate into YYYY-MM-DD format.
 * The date is taken from the UTC representation of the parsed instant.
 *
 * @param {string} dateStr - Date text from the feed.
 * @returns {string} YYYY-MM-DD, or "1970-01-01" for missing/unparseable input.
 */
function formatDate(dateStr) {
  if (!dateStr) return "1970-01-01";
  const d = new Date(dateStr);
  if (Number.isNaN(d.getTime())) return "1970-01-01";
  return d.toISOString().split("T")[0];
}

/**
 * Normalize audio URL to HTTPS.
 *
 * @param {string} url - Enclosure URL.
 * @returns {string} URL with a leading "http://" rewritten to "https://";
 *   "" for falsy input.
 */
function normalizeAudioUrl(url) {
  if (!url) return "";
  return url.replace(/^http:\/\//i, "https://");
}

/**
 * Render a value as a YAML double-quoted scalar.
 * Escapes backslashes, double quotes, and line breaks/tabs so the value
 * always stays on a single frontmatter line.
 *
 * @param {*} value - Value to render; null/undefined become "".
 * @returns {string} Quoted YAML string.
 */
function yamlString(value) {
  if (value === undefined || value === null) return '""';
  const escaped = String(value)
    // Backslashes first so the escapes added below are not re-escaped.
    .replace(/\\/g, "\\\\")
    .replace(/"/g, '\\"')
    .replace(/\n/g, "\\n")
    .replace(/\r/g, "\\r")
    .replace(/\t/g, "\\t");
  return `"${escaped}"`;
}

// ---------------------------------------------------------------------------
// Turndown (HTML -> Markdown) setup
// ---------------------------------------------------------------------------

/**
 * Build the Turndown converter used for every episode body.
 *
 * @returns {TurndownService} Configured converter.
 */
function createTurndown() {
  const td = new TurndownService({
    headingStyle: "atx",
    bulletListMarker: "-",
    codeBlockStyle: "fenced",
    emDelimiter: "*",
  });

  // Remove script/style elements
  td.remove(["script", "style"]);

  return td;
}

/**
 * Convert HTML content to clean markdown.
 * Handles null/empty input gracefully.
 *
 * @param {string} html - HTML fragment.
 * @param {TurndownService} turndown - Converter from createTurndown().
 * @returns {string} Trimmed markdown; "" for missing or blank input.
 */
function htmlToMarkdown(html, turndown) {
  if (!html || typeof html !== "string" || html.trim().length === 0) {
    return "";
  }

  let md = turndown.turndown(html);

  // Clean up excessive blank lines
  md = md.replace(/\n{3,}/g, "\n\n");

  // Decode any leftover HTML entities
  md = decodeHtmlEntities(md);

  return md.trim();
}

// ---------------------------------------------------------------------------
// Feed fetching and parsing
// ---------------------------------------------------------------------------

/**
 * Download the RSS feed as text.
 *
 * @param {string} url - Feed URL.
 * @returns {Promise<string>} Feed XML.
 * @throws {Error} When the response status is not OK; the fetch itself
 *   rejects on network failure or after FETCH_TIMEOUT_MS.
 */
async function fetchFeed(url) {
  console.log(`Fetching RSS feed from ${url} ...`);

  const response = await fetch(url, {
    headers: {
      "User-Agent": "GuruShowImporter/1.0",
      Accept: "application/rss+xml, application/xml, text/xml",
    },
    // Without a signal a stalled server would hang the script indefinitely.
    signal: AbortSignal.timeout(FETCH_TIMEOUT_MS),
  });

  if (!response.ok) {
    throw new Error(
      `Failed to fetch feed: ${response.status} ${response.statusText}`
    );
  }

  const xml = await response.text();
  // xml.length counts UTF-16 code units; report the real UTF-8 byte size.
  const byteCount = Buffer.byteLength(xml, "utf8");
  console.log(`Received ${byteCount.toLocaleString()} bytes of XML.`);
  return xml;
}

/**
 * Parse feed XML and return its <item> elements.
 *
 * @param {string} xml - Feed XML.
 * @returns {object[]} Parsed item nodes (always an array).
 * @throws {Error} When there is no rss/channel element or no items.
 */
function parseFeed(xml) {
  const parser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    // Parse CDATA sections
    cdataPropName: "__cdata",
    // Keep text nodes
    textNodeName: "#text",
    // Do not trim whitespace from values
    trimValues: false,
    // Ensure items are always arrays
    isArray: (name) => name === "item",
  });

  const result = parser.parse(xml);
  const channel = result?.rss?.channel;

  if (!channel) {
    throw new Error("Invalid RSS feed: no channel element found.");
  }

  const items = channel.item;
  if (!items || !Array.isArray(items) || items.length === 0) {
    throw new Error("No episodes found in feed.");
  }

  console.log(`Parsed ${items.length} episode items from feed.`);
  return items;
}

// ---------------------------------------------------------------------------
// Episode extraction
// ---------------------------------------------------------------------------

/**
 * Extract a text value from a parsed XML node, handling CDATA wrappers.
 *
 * @param {*} node - String, number, or object with "__cdata" / "#text".
 * @returns {string} Trimmed text, or "" when no text can be extracted.
 */
function extractText(node) {
  if (node === undefined || node === null) return "";
  if (typeof node === "string") return node.trim();
  if (typeof node === "number") return String(node);
  if (typeof node === "object") {
    // CDATA wrapped
    if (node.__cdata !== undefined) {
      return typeof node.__cdata === "string" ? node.__cdata.trim() : "";
    }
    if (node["#text"] !== undefined) {
      return String(node["#text"]).trim();
    }
  }
  return "";
}

/**
 * Process a single RSS item into an episode data object.
 *
 * @param {object} item - Parsed <item> node.
 * @param {number} index - Position of the item in the feed; used as the
 *   episode number fallback when the title has no S/E code.
 * @param {string[]} warnings - Collector that warning messages are pushed to.
 * @returns {{title: string, season: number, episode: number, pubDate: string,
 *   bodyHtml: string, audioUrl: string, audioSize: number, duration: string,
 *   link: string, tags: string[]}} Episode data.
 */
function processItem(item, index, warnings) {
  const rawTitle = extractText(item.title);
  const title = cleanTitle(rawTitle);

  // Parse season/episode
  const parsed = parseSeasonEpisode(rawTitle);
  let season;
  let episode;
  if (parsed) {
    ({ season, episode } = parsed);
  } else {
    season = 0;
    episode = index + 1;
    warnings.push(
      `[WARNING] Could not parse S/E from title: "${title}" -- using season=0, episode=${episode}`
    );
  }

  // Date
  const pubDate = formatDate(extractText(item.pubDate));

  // Description: prefer content:encoded, fall back to description
  const contentEncoded = extractText(item["content:encoded"]);
  const description = extractText(item.description);
  const bodyHtml = contentEncoded || description || "";

  if (!bodyHtml || bodyHtml.trim().length === 0) {
    warnings.push(`[WARNING] Empty description for: "${title}"`);
  }

  // Audio enclosure. Repeated <enclosure> elements parse to an array;
  // use the first one rather than silently dropping the audio URL.
  let audioUrl = "";
  let audioSize = 0;
  const enclosure = Array.isArray(item.enclosure)
    ? item.enclosure[0]
    : item.enclosure;
  if (enclosure) {
    audioUrl = normalizeAudioUrl(
      enclosure["@_url"] || extractText(enclosure.url) || ""
    );
    const rawSize =
      enclosure["@_length"] || extractText(enclosure.length) || "0";
    audioSize = Number.parseInt(rawSize, 10) || 0;
  }

  // Duration
  const duration = extractText(item["itunes:duration"]) || "";

  // Original URL
  const link = extractText(item.link) || "";

  // Tags
  const tags = extractTags(title, description);

  return {
    title,
    season,
    episode,
    pubDate,
    bodyHtml,
    audioUrl,
    audioSize,
    duration,
    link,
    tags,
  };
}

// ---------------------------------------------------------------------------
// Markdown file generation
// ---------------------------------------------------------------------------

/**
 * Render an episode as a markdown document with YAML frontmatter.
 *
 * @param {object} ep - Episode data from processItem().
 * @param {TurndownService} turndown - Converter from createTurndown().
 * @returns {string} Full file contents, ending with a newline.
 */
function generateMarkdown(ep, turndown) {
  const body = htmlToMarkdown(ep.bodyHtml, turndown);

  const tagList =
    ep.tags.length > 0 ? `[${ep.tags.map((t) => `"${t}"`).join(", ")}]` : "[]";

  const frontmatter = [
    "---",
    `title: ${yamlString(ep.title)}`,
    `season: ${ep.season}`,
    `episode: ${ep.episode}`,
    `pubDate: ${ep.pubDate}`,
    `duration: ${yamlString(ep.duration)}`,
    `audioUrl: ${yamlString(ep.audioUrl)}`,
    `audioSize: ${ep.audioSize}`,
    `episodeType: "full"`,
    `originalUrl: ${yamlString(ep.link)}`,
    `featured: false`,
    `classic: false`,
    `tags: ${tagList}`,
    "---",
  ].join("\n");

  return `${frontmatter}\n\n${body}\n`;
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

/**
 * Fetch the feed, write one markdown file per episode into EPISODES_DIR,
 * and print a summary, any warnings, and a preview of the first files.
 *
 * @returns {Promise<void>}
 */
async function main() {
  console.log("=== Computer Guru Show Episode Importer ===\n");

  // Ensure output directory exists
  if (!existsSync(EPISODES_DIR)) {
    mkdirSync(EPISODES_DIR, { recursive: true });
    console.log(`Created directory: ${EPISODES_DIR}`);
  }

  // Fetch and parse
  const xml = await fetchFeed(FEED_URL);
  const items = parseFeed(xml);

  // Process episodes
  const turndown = createTurndown();
  const warnings = [];
  const episodes = items.map((item, index) =>
    processItem(item, index, warnings)
  );

  // Sort by season then episode (oldest first)
  episodes.sort((a, b) => {
    if (a.season !== b.season) return a.season - b.season;
    return a.episode - b.episode;
  });

  // Write files
  const usedFilenames = new Set();
  const samples = [];
  let written = 0;
  let resolvedDuplicates = 0;

  for (const ep of episodes) {
    const preferred = episodeFilename(ep.season, ep.episode, ep.title);
    // A counter suffix (rather than a timestamp) cannot collide with a name
    // already used and yields the same filenames on every run.
    const filename = uniqueFilename(preferred, usedFilenames);
    if (filename !== preferred) {
      warnings.push(
        `[WARNING] Duplicate filename resolved: ${filename} for "${ep.title}"`
      );
      resolvedDuplicates += 1;
    }
    usedFilenames.add(filename);

    const markdown = generateMarkdown(ep, turndown);
    writeFileSync(join(EPISODES_DIR, filename), markdown, "utf-8");
    written += 1;

    // Remember the first few outputs so the preview shows what was written.
    if (samples.length < 3) {
      samples.push({ filename, markdown });
    }
  }

  // Report
  console.log("\n=== Import Complete ===");
  console.log(`Total episodes in feed: ${items.length}`);
  console.log(`Files written: ${written}`);
  if (resolvedDuplicates > 0) {
    console.log(`Duplicate filenames resolved: ${resolvedDuplicates}`);
  }

  if (warnings.length > 0) {
    console.log(`\n--- Warnings (${warnings.length}) ---`);
    for (const w of warnings) {
      console.log(w);
    }
  }

  // Show a sample of generated files
  console.log("\n--- Sample Output (first 3 files) ---");
  for (const { filename, markdown } of samples) {
    console.log(`\nFile: ${filename}`);
    // Show first 20 lines
    const lines = markdown.split("\n");
    console.log(lines.slice(0, 20).join("\n"));
    if (lines.length > 20) {
      console.log("  ...(truncated)");
    }
  }

  console.log("\n[SUCCESS] Episode import finished.");
}

main().catch((err) => {
  console.error(`[ERROR] Import failed: ${err.message}`);
  console.error(err.stack);
  process.exit(1);
});