Complete website for The Computer Guru Show (radio.azcomputerguru.com):
- Astro 6.0.4 static site with React islands
- 194 episodes imported from gurushow.com RSS feed
- Dark/light mode HSL design system
- Persistent audio player with session persistence
- Episode archive with search and season filtering
- Home page with animated hero, stats, latest episodes
- All pages: About, Subscribe, Community, Live, Contact, Blog, 404
- Podcast RSS feed with iTunes namespace
- Session log updated

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
571 lines
15 KiB
JavaScript
571 lines
15 KiB
JavaScript
/**
 * import-episodes.mjs
 *
 * Fetches all episodes from the Computer Guru Show podcast RSS feed
 * and converts them into Astro content collection markdown files.
 *
 * Usage:
 *   node scripts/import-episodes.mjs
 *
 * Dependencies: fast-xml-parser, turndown
 */
|
|
|
|
import { writeFileSync, mkdirSync, existsSync } from "node:fs";
|
|
import { join, dirname } from "node:path";
|
|
import { fileURLToPath } from "node:url";
|
|
import { XMLParser } from "fast-xml-parser";
|
|
import TurndownService from "turndown";
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Configuration
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/** URL of the podcast RSS feed that episodes are imported from. */
const FEED_URL = "https://gurushow.com/feed/podcast";

/** Directory containing this script (ES modules have no built-in __dirname). */
const __dirname = dirname(fileURLToPath(import.meta.url));

/** Parent directory of the script's directory. */
const PROJECT_ROOT = join(__dirname, "..");

/** Output directory that generated episode markdown files are written to. */
const EPISODES_DIR = join(PROJECT_ROOT, "src", "content", "episodes");
|
|
|
|
/**
 * Common tech keywords mapped to tag slugs.
 *
 * Each key is a regular-expression source string (compiled case-insensitively
 * by the consumer), so "." matches any single character and "\\b" is a word
 * boundary. Several keys may map to the same tag slug.
 */
const TAG_KEYWORDS = new Map([
  ["apple", "apple"],
  ["iphone", "iphone"],
  ["ipad", "ipad"],
  ["\\bmac\\b", "mac"],
  ["macbook", "macbook"],
  ["google", "google"],
  ["android", "android"],
  ["chrome", "chrome"],
  ["chromebook", "chromebook"],
  ["microsoft", "microsoft"],
  ["windows", "windows"],
  ["security", "security"],
  ["privacy", "privacy"],
  ["hacking", "security"],
  // Anchored at a word start so "RadioShack" / "lifehack" are not tagged as
  // security, while "hack", "hacker", "hacked" still are.
  ["\\bhack", "security"],
  ["malware", "malware"],
  ["ransomware", "ransomware"],
  ["virus", "malware"],
  ["\\bai\\b", "ai"],
  ["artificial.intelligence", "ai"],
  ["chatgpt", "ai"],
  ["openai", "ai"],
  ["machine.learning", "ai"],
  ["space", "space"],
  ["spacex", "space"],
  ["nasa", "space"],
  ["bitcoin", "cryptocurrency"],
  ["crypto", "cryptocurrency"],
  ["cryptocurrency", "cryptocurrency"],
  ["blockchain", "cryptocurrency"],
  ["net.neutrality", "net-neutrality"],
  ["gaming", "gaming"],
  ["playstation", "gaming"],
  ["xbox", "gaming"],
  ["nintendo", "gaming"],
  ["social.media", "social-media"],
  ["facebook", "facebook"],
  ["twitter", "twitter"],
  ["instagram", "instagram"],
  ["tiktok", "social-media"],
  ["amazon", "amazon"],
  ["alexa", "amazon"],
  ["tesla", "tesla"],
  ["samsung", "samsung"],
  ["cybersecurity", "security"],
  ["phishing", "security"],
  ["data.breach", "security"],
  ["linux", "linux"],
  ["ubuntu", "linux"],
  ["roku", "streaming"],
  ["netflix", "streaming"],
  ["streaming", "streaming"],
  // Whole word only: an unanchored "5g" also matches "5GHz" and "15GB".
  ["\\b5g\\b", "5g"],
  ["\\bvr\\b", "vr"],
  ["virtual.reality", "vr"],
  ["augmented.reality", "ar"],
  ["drone", "drones"],
  ["robot", "robotics"],
]);
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Decode common HTML entities that appear in RSS feed titles/descriptions.
 * Handles numeric (&#8211;), hex (&#x2019;), and named (&amp;) entities.
 *
 * Decoding happens in a single pass, so each entity is decoded exactly once:
 * "&amp;lt;" becomes "&lt;", not "<". Unknown named entities and numeric
 * references outside the Unicode range are left untouched.
 *
 * @param {string} text - Text that may contain HTML entities.
 * @returns {string} Decoded text, or "" for falsy input.
 */
function decodeHtmlEntities(text) {
  if (!text) return "";

  // Named entities supported by this importer (names are case-sensitive).
  const namedEntities = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
    nbsp: " ",
    ndash: "\u2013",
    mdash: "\u2014",
    lsquo: "\u2018",
    rsquo: "\u2019",
    ldquo: "\u201C",
    rdquo: "\u201D",
    hellip: "\u2026",
  };

  const MAX_CODE_POINT = 0x10ffff;

  return String(text).replace(
    /&(#[xX][0-9a-fA-F]+|#\d+|[A-Za-z][A-Za-z0-9]*);/g,
    (match, body) => {
      if (body[0] !== "#") {
        return Object.hasOwn(namedEntities, body) ? namedEntities[body] : match;
      }

      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = isHex
        ? Number.parseInt(body.slice(2), 16)
        : Number.parseInt(body.slice(1), 10);

      // String.fromCodePoint throws a RangeError above U+10FFFF; keep the
      // original text instead of aborting the whole import.
      if (!Number.isInteger(codePoint) || codePoint > MAX_CODE_POINT) {
        return match;
      }
      return String.fromCodePoint(codePoint);
    }
  );
}
|
|
|
|
/**
 * Parse season and episode numbers from a title string.
 *
 * Recognised forms (case-insensitive, anywhere in the title):
 *   - Compact code: "S10E21", "S1E3", "s02 e05"
 *   - Long form: "Season 3 Episode 4", also with a comma, colon or dash
 *     between the two parts ("Season 3, Episode 4")
 *
 * The compact code is tried first.
 *
 * @param {string} title - Episode title to inspect.
 * @returns {{season: number, episode: number} | null} Parsed numbers, or
 *   null when no pattern matches or the input is not a string.
 */
function parseSeasonEpisode(title) {
  if (typeof title !== "string") return null;

  const patterns = [
    /S(\d+)\s*E(\d+)/i,
    /Season\s+(\d+)[\s,:\u2013\u2014-]+Episode\s+(\d+)/i,
  ];

  for (const pattern of patterns) {
    const found = pattern.exec(title);
    if (found) {
      return {
        season: Number.parseInt(found[1], 10),
        episode: Number.parseInt(found[2], 10),
      };
    }
  }

  return null;
}
|
|
|
|
/**
 * Produce a display title from a raw feed title: decode HTML entities, trim
 * surrounding whitespace, and strip a leading "Podcast " prefix
 * (case-insensitive).
 *
 * @param {string} rawTitle - Title text as extracted from the feed.
 * @returns {string} Cleaned title.
 */
function cleanTitle(rawTitle) {
  const decoded = decodeHtmlEntities(rawTitle).trim();
  return decoded.replace(/^Podcast\s+/i, "");
}
|
|
|
|
/**
 * Generate a URL-safe slug from a title string.
 * - Drop a leading S{n}E{n} code so the slug is about the content
 * - Fold accented letters to their ASCII base ("Pokémon" -> "pokemon")
 * - Lowercase
 * - Replace runs of non-alphanumeric chars with a single hyphen
 * - Trim hyphens from ends
 * - Max 60 characters
 *
 * NOTE: for pure-ASCII titles the result is identical to the previous
 * implementation; only titles containing accented characters slug differently.
 *
 * @param {string} title - Episode title.
 * @returns {string} Slug, or "untitled" when nothing usable remains.
 */
function slugify(title) {
  const MAX_LENGTH = 60;
  const source = String(title ?? "");

  // Remove the S{n}E{n} prefix; fall back to the full title if that leaves nothing.
  const withoutCode = source
    .replace(/^S\d+\s*E\d+\s*[-\u2013:]?\s*/i, "")
    .trim();
  const base = withoutCode || source;

  let slug = base
    // Decompose accented characters, then drop the combining marks.
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")
    .toLowerCase()
    // The "+" already collapses consecutive separators into one hyphen.
    .replace(/[^a-z0-9]+/g, "-")
    .replace(/^-+|-+$/g, "");

  if (slug.length > MAX_LENGTH) {
    // Truncation can leave a dangling hyphen at the cut point.
    slug = slug.slice(0, MAX_LENGTH).replace(/-+$/, "");
  }

  return slug || "untitled";
}
|
|
|
|
/**
 * Generate a filename for an episode.
 * Format: s{SS}e{EE}-{slug}.md, with season and episode zero-padded to at
 * least two digits.
 *
 * @param {number} season - Season number.
 * @param {number} episode - Episode number.
 * @param {string} title - Episode title, used for the slug part.
 * @returns {string} Markdown filename (no directory).
 */
function episodeFilename(season, episode, title) {
  const seasonPart = String(season).padStart(2, "0");
  const episodePart = String(episode).padStart(2, "0");
  return `s${seasonPart}e${episodePart}-${slugify(title)}.md`;
}
|
|
|
|
/**
 * Extract tags by matching every TAG_KEYWORDS pattern against the combined
 * title and description text (case-insensitive).
 *
 * @param {string} title - Episode title.
 * @param {string} description - Episode description; may be empty or missing.
 * @returns {string[]} Deduplicated, sorted array of tag slugs.
 */
function extractTags(title, description) {
  // Nullish parts become "" so the literal words "undefined"/"null" are never
  // part of the searched text.
  const haystack = `${title ?? ""} ${description ?? ""}`.toLowerCase();
  const matched = new Set();

  for (const [pattern, tag] of TAG_KEYWORDS) {
    if (new RegExp(pattern, "i").test(haystack)) {
      matched.add(tag);
    }
  }

  return [...matched].sort();
}
|
|
|
|
/**
 * Convert an ISO date string or RSS pubDate into YYYY-MM-DD format (the UTC
 * calendar date of the parsed instant).
 *
 * ISO datetimes without a timezone designator ("2022-03-05T23:30:00") are
 * read as UTC; JavaScript would otherwise read them as host-local time and
 * the result would depend on the machine's timezone.
 *
 * @param {string} dateStr - Date text to convert.
 * @returns {string} Date as YYYY-MM-DD, or "1970-01-01" when the input is
 *   empty or cannot be parsed.
 */
function formatDate(dateStr) {
  const FALLBACK = "1970-01-01";
  const ZONELESS_ISO = /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?$/;

  if (!dateStr) return FALLBACK;

  let input = dateStr;
  if (typeof input === "string") {
    input = input.trim();
    if (ZONELESS_ISO.test(input)) {
      input = `${input}Z`;
    }
  }

  const parsed = new Date(input);
  if (Number.isNaN(parsed.getTime())) return FALLBACK;

  return parsed.toISOString().split("T")[0];
}
|
|
|
|
/**
 * Normalize audio URL to HTTPS.
 *
 * Surrounding whitespace is removed first so the scheme check sees the real
 * start of the URL. URLs that do not start with "http://" are returned
 * unchanged apart from that trimming.
 *
 * @param {string} url - Audio URL from the feed.
 * @returns {string} URL with an "https://" scheme, or "" for falsy input.
 */
function normalizeAudioUrl(url) {
  if (!url) return "";
  return String(url).trim().replace(/^http:\/\//i, "https://");
}
|
|
|
|
/**
 * Render a value as a YAML double-quoted string.
 *
 * The value is stringified and serialised with JSON.stringify, which escapes
 * backslashes, double quotes, newlines and other control characters. For
 * values containing only quotes/backslashes the output is the same as simple
 * manual escaping.
 *
 * @param {*} value - Value to render; null/undefined become an empty string.
 * @returns {string} Quoted scalar, including the surrounding double quotes.
 */
function yamlString(value) {
  if (value === undefined || value === null) return '""';
  return JSON.stringify(String(value));
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Turndown (HTML -> Markdown) setup
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Build the TurndownService instance used for HTML -> Markdown conversion.
 *
 * Configured for ATX ("#") headings, "-" bullets, fenced code blocks and "*"
 * emphasis; <script> and <style> elements are removed from the output.
 *
 * @returns {TurndownService} Configured converter.
 */
function createTurndown() {
  const service = new TurndownService({
    headingStyle: "atx",
    bulletListMarker: "-",
    codeBlockStyle: "fenced",
    emDelimiter: "*",
  });

  // Remove script/style elements
  service.remove(["script", "style"]);

  return service;
}
|
|
|
|
/**
 * Convert HTML content to clean markdown.
 *
 * @param {string} html - HTML fragment; null, non-string or blank input
 *   yields "".
 * @param {TurndownService} turndown - Converter whose turndown() method is
 *   used.
 * @returns {string} Trimmed markdown with runs of 3+ newlines collapsed to a
 *   single blank line and remaining HTML entities decoded.
 */
function htmlToMarkdown(html, turndown) {
  const isBlank =
    !html || typeof html !== "string" || html.trim().length === 0;
  if (isBlank) return "";

  const converted = turndown.turndown(html);

  // Clean up excessive blank lines
  const compacted = converted.replace(/\n{3,}/g, "\n\n");

  // Decode any leftover HTML entities
  return decodeHtmlEntities(compacted).trim();
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Feed fetching and parsing
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Download the RSS feed and return its body as text.
 *
 * @param {string} url - Feed URL.
 * @returns {Promise<string>} Raw XML text.
 * @throws {Error} When the response status is not OK. A network failure or
 *   timeout rejects with the error raised by fetch().
 */
async function fetchFeed(url) {
  // Abort if the server stalls; without a signal fetch() can wait forever.
  const TIMEOUT_MS = 30_000;

  console.log(`Fetching RSS feed from ${url} ...`);

  const response = await fetch(url, {
    headers: {
      "User-Agent": "GuruShowImporter/1.0",
      Accept: "application/rss+xml, application/xml, text/xml",
    },
    signal: AbortSignal.timeout(TIMEOUT_MS),
  });

  if (!response.ok) {
    throw new Error(
      `Failed to fetch feed: ${response.status} ${response.statusText}`
    );
  }

  const xml = await response.text();
  // String length counts UTF-16 code units; report the actual UTF-8 size.
  const byteCount = Buffer.byteLength(xml, "utf8");
  console.log(`Received ${byteCount.toLocaleString()} bytes of XML.`);
  return xml;
}
|
|
|
|
/**
 * Parse RSS XML and return its list of <item> entries.
 *
 * @param {string} xml - Raw feed XML.
 * @returns {object[]} Parsed item objects. Attributes are exposed with an
 *   "@_" prefix, CDATA under "__cdata" and text nodes under "#text".
 * @throws {Error} When the document has no rss/channel element or the
 *   channel contains no items.
 */
function parseFeed(xml) {
  const parser = new XMLParser({
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    // Parse CDATA sections
    cdataPropName: "__cdata",
    // Keep text nodes
    textNodeName: "#text",
    // Do not trim whitespace from values
    trimValues: false,
    // Keep element text as strings; numeric coercion would mangle values
    // such as "007" or "1.50" before they reach the markdown output.
    parseTagValue: false,
    // Ensure items are always arrays
    isArray: (name) => name === "item",
  });

  const channel = parser.parse(xml)?.rss?.channel;
  if (!channel) {
    throw new Error("Invalid RSS feed: no channel element found.");
  }

  const items = channel.item;
  if (!Array.isArray(items) || items.length === 0) {
    throw new Error("No episodes found in feed.");
  }

  console.log(`Parsed ${items.length} episode items from feed.`);
  return items;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Episode extraction
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Extract a text value from a parsed XML node, handling CDATA wrappers.
 *
 * Accepted shapes:
 *   - string: trimmed
 *   - number / boolean: stringified
 *   - array of nodes: text of the first entry that yields non-empty text
 *   - object with "__cdata": its text (array parts are concatenated,
 *     presumably what the parser emits for several CDATA sections in one
 *     element; verify against the parser version in use)
 *   - object with "#text": its text
 *
 * @param {*} node - Parsed XML value.
 * @returns {string} Trimmed text, or "" when nothing usable is found.
 */
function extractText(node) {
  if (node === undefined || node === null) return "";
  if (typeof node === "string") return node.trim();
  if (typeof node === "number" || typeof node === "boolean") {
    return String(node);
  }

  if (Array.isArray(node)) {
    for (const entry of node) {
      const text = extractText(entry);
      if (text) return text;
    }
    return "";
  }

  if (typeof node === "object") {
    // CDATA wrapped
    const cdata = node.__cdata;
    if (cdata !== undefined) {
      if (Array.isArray(cdata)) {
        return cdata
          .map((part) => (part === null || part === undefined ? "" : String(part)))
          .join("")
          .trim();
      }
      if (typeof cdata === "string") return cdata.trim();
      if (typeof cdata === "number" || typeof cdata === "boolean") {
        return String(cdata);
      }
      return "";
    }

    const text = node["#text"];
    if (text !== undefined) {
      return String(text).trim();
    }
  }

  return "";
}
|
|
|
|
/**
 * Process a single RSS item into an episode data object.
 *
 * @param {object} item - Parsed RSS <item>.
 * @param {number} index - Position of the item in the feed; used to derive a
 *   fallback episode number (index + 1, season 0) when the title carries no
 *   season/episode code.
 * @param {string[]} warnings - Collector that warning messages are appended to.
 * @returns {{title: string, season: number, episode: number, pubDate: string,
 *   bodyHtml: string, audioUrl: string, audioSize: number, duration: string,
 *   link: string, tags: string[]}} Episode data.
 */
function processItem(item, index, warnings) {
  const rawTitle = extractText(item.title);
  const title = cleanTitle(rawTitle);

  // Parse season/episode
  const parsed = parseSeasonEpisode(rawTitle);
  const season = parsed ? parsed.season : 0;
  const episode = parsed ? parsed.episode : index + 1;
  if (!parsed) {
    warnings.push(
      `[WARNING] Could not parse S/E from title: "${title}" -- using season=0, episode=${episode}`
    );
  }

  // Date
  const pubDate = formatDate(extractText(item.pubDate));

  // Description: prefer content:encoded, fall back to description
  const contentEncoded = extractText(item["content:encoded"]);
  const description = extractText(item.description);
  const bodyHtml = contentEncoded || description || "";

  if (!bodyHtml || bodyHtml.trim().length === 0) {
    warnings.push(`[WARNING] Empty description for: "${title}"`);
  }

  // Audio enclosure. An item with several <enclosure> elements is parsed as
  // an array; use the first one that carries a URL (else the first entry)
  // instead of reading attributes off the array itself.
  const rawEnclosure = item.enclosure;
  const enclosure = Array.isArray(rawEnclosure)
    ? rawEnclosure.find((entry) => entry?.["@_url"]) ?? rawEnclosure[0]
    : rawEnclosure;

  let audioUrl = "";
  let audioSize = 0;
  if (enclosure) {
    audioUrl = normalizeAudioUrl(
      enclosure["@_url"] || extractText(enclosure.url) || ""
    );
    const rawSize =
      enclosure["@_length"] || extractText(enclosure.length) || "0";
    audioSize = Number.parseInt(rawSize, 10) || 0;
  }

  // Duration
  const duration = extractText(item["itunes:duration"]) || "";

  // Original URL
  const link = extractText(item.link) || "";

  // Tags
  const tags = extractTags(title, description);

  return {
    title,
    season,
    episode,
    pubDate,
    bodyHtml,
    audioUrl,
    audioSize,
    duration,
    link,
    tags,
  };
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Markdown file generation
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Render an episode as a markdown document: YAML frontmatter followed by the
 * description body converted from HTML.
 *
 * @param {object} ep - Episode data as produced by processItem().
 * @param {TurndownService} turndown - HTML -> Markdown converter.
 * @returns {string} Complete file contents, ending with a newline.
 */
function generateMarkdown(ep, turndown) {
  const body = htmlToMarkdown(ep.bodyHtml, turndown);

  // Quote tags through the same escaper as every other string field.
  const tagList = `[${ep.tags.map((tag) => yamlString(tag)).join(", ")}]`;

  const frontmatter = [
    "---",
    `title: ${yamlString(ep.title)}`,
    `season: ${ep.season}`,
    `episode: ${ep.episode}`,
    `pubDate: ${ep.pubDate}`,
    `duration: ${yamlString(ep.duration)}`,
    `audioUrl: ${yamlString(ep.audioUrl)}`,
    `audioSize: ${ep.audioSize}`,
    `episodeType: "full"`,
    `originalUrl: ${yamlString(ep.link)}`,
    `featured: false`,
    `classic: false`,
    `tags: ${tagList}`,
    "---",
  ].join("\n");

  return `${frontmatter}\n\n${body}\n`;
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Main
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/**
 * Entry point: fetch the feed, convert every item into an episode, write one
 * markdown file per episode into EPISODES_DIR, then print a report with
 * warnings and a preview of the first three files.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const SAMPLE_COUNT = 3;
  const SAMPLE_LINES = 20;

  console.log("=== Computer Guru Show Episode Importer ===\n");

  // Ensure output directory exists
  if (!existsSync(EPISODES_DIR)) {
    mkdirSync(EPISODES_DIR, { recursive: true });
    console.log(`Created directory: ${EPISODES_DIR}`);
  }

  // Fetch and parse
  const xml = await fetchFeed(FEED_URL);
  const items = parseFeed(xml);

  // Process episodes
  const turndown = createTurndown();
  const warnings = [];
  const episodes = items.map((item, i) => processItem(item, i, warnings));

  // Sort by season then episode (oldest first)
  episodes.sort((a, b) => {
    if (a.season !== b.season) return a.season - b.season;
    return a.episode - b.episode;
  });

  // Write files
  const usedFilenames = new Set();
  const samples = [];
  let written = 0;
  let resolvedDuplicates = 0;

  for (const ep of episodes) {
    let filename = episodeFilename(ep.season, ep.episode, ep.title);

    // Handle duplicate filenames with a numeric suffix. A counter is
    // deterministic across runs and cannot collide, unlike a timestamp-based
    // suffix when several duplicates are processed in the same millisecond.
    if (usedFilenames.has(filename)) {
      const stem = filename.replace(/\.md$/, "");
      let counter = 2;
      do {
        filename = `${stem}-${counter}.md`;
        counter += 1;
      } while (usedFilenames.has(filename));

      warnings.push(
        `[WARNING] Duplicate filename resolved: ${filename} for "${ep.title}"`
      );
      resolvedDuplicates += 1;
    }
    usedFilenames.add(filename);

    const markdown = generateMarkdown(ep, turndown);
    writeFileSync(join(EPISODES_DIR, filename), markdown, "utf-8");
    written += 1;

    // Remember the first few outputs so the preview shows the real filenames.
    if (samples.length < SAMPLE_COUNT) {
      samples.push({ filename, markdown });
    }
  }

  // Report
  console.log("\n=== Import Complete ===");
  console.log(`Total episodes in feed: ${items.length}`);
  console.log(`Files written: ${written}`);

  if (resolvedDuplicates > 0) {
    console.log(`Duplicate filenames resolved: ${resolvedDuplicates}`);
  }

  if (warnings.length > 0) {
    console.log(`\n--- Warnings (${warnings.length}) ---`);
    for (const warning of warnings) {
      console.log(warning);
    }
  }

  // Show a sample of generated files
  console.log("\n--- Sample Output (first 3 files) ---");
  for (const { filename, markdown } of samples) {
    console.log(`\nFile: ${filename}`);
    // Show first 20 lines
    const allLines = markdown.split("\n");
    console.log(allLines.slice(0, SAMPLE_LINES).join("\n"));
    if (allLines.length > SAMPLE_LINES) {
      console.log(" ...(truncated)");
    }
  }

  console.log("\n[SUCCESS] Episode import finished.");
}
|
|
|
|
// Run the importer; on any failure print the error and exit with status 1.
main().catch((err) => {
  // A rejection is not guaranteed to be an Error instance.
  const message = err instanceof Error ? err.message : String(err);
  console.error(`[ERROR] Import failed: ${message}`);
  if (err instanceof Error && err.stack) {
    console.error(err.stack);
  }
  process.exit(1);
});
|