// claudetools/projects/radio-show/website/scripts/import-episodes.mjs

/**
 * import-episodes.mjs
 *
 * Fetches all episodes from the Computer Guru Show podcast RSS feed
 * and converts them into Astro content collection markdown files.
 *
 * Usage:
 *   node scripts/import-episodes.mjs
 *
 * Dependencies: fast-xml-parser, turndown
 */

import { writeFileSync, mkdirSync, existsSync } from "node:fs";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
import { XMLParser } from "fast-xml-parser";
import TurndownService from "turndown";

// ---------------------------------------------------------------------------
// Configuration
// ---------------------------------------------------------------------------

// URL of the podcast RSS feed that main() downloads.
const FEED_URL = "https://gurushow.com/feed/podcast";
// ES modules have no built-in __dirname; derive it from this module's URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Parent directory of the directory containing this script.
const PROJECT_ROOT = join(__dirname, "..");
// Output directory for the generated episode markdown files.
const EPISODES_DIR = join(PROJECT_ROOT, "src", "content", "episodes");

/**
 * Common tech keywords mapped to tag slugs.
 *
 * Each key is a regular-expression source string (not a literal keyword) that
 * extractTags compiles with the "i" flag; each value is the tag slug to emit
 * when the pattern matches. Consequences worth knowing when adding entries:
 *   - "." matches any single character, so "social.media" matches both
 *     "social media" and "social-media".
 *   - Patterns are unanchored unless they carry "\\b" word boundaries. Short
 *     tokens that commonly occur inside unrelated words or units are anchored:
 *     "\\b5g\\b" so that "15GB" / "2.5GHz" do not produce the "5g" tag, and
 *     "\\bhack" so that "RadioShack" does not produce the "security" tag.
 *   - Several patterns may map to the same slug; extractTags deduplicates.
 */
const TAG_KEYWORDS = new Map([
  ["apple", "apple"],
  ["iphone", "iphone"],
  ["ipad", "ipad"],
  ["\\bmac\\b", "mac"],
  ["macbook", "macbook"],
  ["google", "google"],
  ["android", "android"],
  ["chrome", "chrome"],
  ["chromebook", "chromebook"],
  ["microsoft", "microsoft"],
  ["windows", "windows"],
  ["security", "security"],
  ["privacy", "privacy"],
  ["hacking", "security"],
  ["\\bhack", "security"],
  ["malware", "malware"],
  ["ransomware", "ransomware"],
  ["virus", "malware"],
  ["\\bai\\b", "ai"],
  ["artificial.intelligence", "ai"],
  ["chatgpt", "ai"],
  ["openai", "ai"],
  ["machine.learning", "ai"],
  ["space", "space"],
  ["spacex", "space"],
  ["nasa", "space"],
  ["bitcoin", "cryptocurrency"],
  ["crypto", "cryptocurrency"],
  ["cryptocurrency", "cryptocurrency"],
  ["blockchain", "cryptocurrency"],
  ["net.neutrality", "net-neutrality"],
  ["gaming", "gaming"],
  ["playstation", "gaming"],
  ["xbox", "gaming"],
  ["nintendo", "gaming"],
  ["social.media", "social-media"],
  ["facebook", "facebook"],
  ["twitter", "twitter"],
  ["instagram", "instagram"],
  ["tiktok", "social-media"],
  ["amazon", "amazon"],
  ["alexa", "amazon"],
  ["tesla", "tesla"],
  ["samsung", "samsung"],
  ["cybersecurity", "security"],
  ["phishing", "security"],
  ["data.breach", "security"],
  ["linux", "linux"],
  ["ubuntu", "linux"],
  ["roku", "streaming"],
  ["netflix", "streaming"],
  ["streaming", "streaming"],
  ["\\b5g\\b", "5g"],
  ["\\bvr\\b", "vr"],
  ["virtual.reality", "vr"],
  ["augmented.reality", "ar"],
  ["drone", "drones"],
  ["robot", "robotics"],
]);

// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/** Named entities recognised by decodeHtmlEntities (lookup is case-sensitive). */
const NAMED_HTML_ENTITIES = Object.freeze({
  amp: "&",
  lt: "<",
  gt: ">",
  quot: '"',
  apos: "'",
  nbsp: " ",
  ndash: "\u2013",
  mdash: "\u2014",
  lsquo: "\u2018",
  rsquo: "\u2019",
  ldquo: "\u201C",
  rdquo: "\u201D",
  hellip: "\u2026",
});

/**
 * Decode common HTML entities that appear in RSS feed titles/descriptions.
 * Handles numeric (&#8211;), hex (&#x2019;), and named (&amp;) entities.
 *
 * Decoding happens in a single pass, so each entity in the input is decoded
 * exactly once: "&amp;lt;" becomes the literal text "&lt;", not "<".
 * Unknown named entities and numeric references outside the Unicode range
 * are left in the output unchanged instead of throwing.
 *
 * @param {string|null|undefined} text - Text possibly containing entities.
 * @returns {string} Decoded text, or "" when `text` is falsy.
 */
function decodeHtmlEntities(text) {
  if (!text) return "";

  return String(text).replace(
    /&(?:#(\d+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));/g,
    (match, dec, hex, name) => {
      if (name !== undefined) {
        return Object.hasOwn(NAMED_HTML_ENTITIES, name)
          ? NAMED_HTML_ENTITIES[name]
          : match;
      }

      const codePoint =
        dec !== undefined ? Number(dec) : Number.parseInt(hex, 16);

      // String.fromCodePoint throws RangeError above U+10FFFF; keep the
      // original text for such malformed references.
      if (!Number.isInteger(codePoint) || codePoint > 0x10ffff) {
        return match;
      }
      return String.fromCodePoint(codePoint);
    }
  );
}

/**
 * Pull the season and episode numbers out of an episode title.
 *
 * Two notations are tried in order, both case-insensitive:
 *   1. The compact code "S10E21" (whitespace allowed between the S and E parts).
 *   2. The spelled-out form "Season 10 Episode 21".
 *
 * @param {string} title - Episode title to inspect.
 * @returns {{season: number, episode: number}|null} Parsed numbers, or null
 *   when neither notation is present.
 */
function parseSeasonEpisode(title) {
  const notations = [
    /S(\d+)\s*E(\d+)/i,
    /Season\s+(\d+)\s+Episode\s+(\d+)/i,
  ];

  for (const notation of notations) {
    const found = title.match(notation);
    if (!found) continue;

    const [, seasonDigits, episodeDigits] = found;
    return {
      season: parseInt(seasonDigits, 10),
      episode: parseInt(episodeDigits, 10),
    };
  }

  return null;
}

/**
 * Produce a display title from the raw feed title: HTML entities are decoded,
 * surrounding whitespace is trimmed, and a leading "Podcast " label
 * (matched case-insensitively) is dropped.
 *
 * @param {string} rawTitle - Title text as extracted from the feed item.
 * @returns {string} The cleaned title.
 */
function cleanTitle(rawTitle) {
  const decoded = decodeHtmlEntities(rawTitle).trim();
  return decoded.replace(/^Podcast\s+/i, "");
}

/**
 * Build a URL-safe slug from an episode title.
 *
 * A leading S{n}E{n} code (plus an optional "-", en dash or ":" separator) is
 * dropped so the slug describes the content; if nothing remains, the full
 * title is used instead. The text is lowercased, every run of characters
 * outside a-z / 0-9 becomes a single hyphen, no hyphen is left at either end,
 * and the result is capped at 60 characters (a hyphen left dangling by the
 * cut is removed).
 *
 * @param {string} title - Cleaned episode title.
 * @returns {string} The slug, or "untitled" when the title has no usable characters.
 */
function slugify(title) {
  const MAX_LENGTH = 60;

  const stripped = title.replace(/^S\d+\s*E\d+\s*[-\u2013:]?\s*/i, "").trim();
  const source = stripped === "" ? title : stripped;

  // Joining the alphanumeric runs with "-" yields single, interior-only hyphens.
  const words = source.toLowerCase().match(/[a-z0-9]+/g) ?? [];
  let slug = words.join("-");

  if (slug.length > MAX_LENGTH) {
    slug = slug.substring(0, MAX_LENGTH);
    if (slug.endsWith("-")) {
      slug = slug.slice(0, -1);
    }
  }

  return slug === "" ? "untitled" : slug;
}

/**
 * Build the markdown filename for an episode.
 * Format: s{SS}e{EE}-{slug}.md, with season and episode zero-padded to at
 * least two digits and the slug derived from the title via slugify.
 *
 * @param {number} season - Season number.
 * @param {number} episode - Episode number.
 * @param {string} title - Episode title used for the slug portion.
 * @returns {string} The filename, including the ".md" extension.
 */
function episodeFilename(season, episode, title) {
  const twoDigits = (value) => String(value).padStart(2, "0");
  const code = `s${twoDigits(season)}e${twoDigits(episode)}`;
  return `${code}-${slugify(title)}.md`;
}

/**
 * Derive tag slugs for an episode by testing every TAG_KEYWORDS pattern
 * against the title and description joined by a space and lowercased.
 *
 * @param {string} title - Episode title.
 * @param {string} description - Episode description text.
 * @returns {string[]} Deduplicated tag slugs in default sort order.
 */
function extractTags(title, description) {
  const haystack = `${title} ${description}`.toLowerCase();

  const matchedSlugs = [...TAG_KEYWORDS]
    .filter(([source]) => new RegExp(source, "i").test(haystack))
    .map(([, slug]) => slug);

  // Several patterns map to the same slug, so collapse repeats before sorting.
  return [...new Set(matchedSlugs)].sort();
}

/**
 * Render a date string (ISO or RSS pubDate) as the date portion of its
 * ISO-8601 UTC representation, i.e. YYYY-MM-DD for ordinary years.
 *
 * @param {string} dateStr - Date text accepted by the Date constructor.
 * @returns {string} The formatted date, or "1970-01-01" when the input is
 *   empty or cannot be parsed.
 */
function formatDate(dateStr) {
  const FALLBACK = "1970-01-01";
  if (!dateStr) return FALLBACK;

  const parsed = new Date(dateStr);
  if (Number.isNaN(parsed.getTime())) return FALLBACK;

  // Keep everything before the "T" that separates date from time.
  const iso = parsed.toISOString();
  return iso.slice(0, iso.indexOf("T"));
}

/**
 * Upgrade an audio URL to HTTPS by rewriting a leading "http://" scheme
 * (matched case-insensitively). Other URLs are returned as given.
 *
 * @param {string} url - Enclosure URL from the feed.
 * @returns {string} The normalized URL, or "" when `url` is falsy.
 */
function normalizeAudioUrl(url) {
  return url ? url.replace(/^http:\/\//i, "https://") : "";
}

/**
 * Serialize a value as a YAML double-quoted scalar.
 *
 * Backslashes and double quotes are backslash-escaped. Line breaks, tabs and
 * other control characters are written as YAML escape sequences (\n, \r, \t,
 * \uXXXX) so that a value containing them stays on a single line and cannot
 * break the surrounding frontmatter.
 *
 * @param {*} value - Value to serialize; coerced with String().
 * @returns {string} The quoted scalar; '""' for undefined or null.
 */
function yamlString(value) {
  if (value === undefined || value === null) return '""';

  const shortEscapes = {
    "\\": "\\\\",
    '"': '\\"',
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
  };

  // Covers C0 controls, DEL, NEL and the Unicode line/paragraph separators,
  // all of which YAML treats as non-printable or as line breaks.
  const escaped = String(value).replace(
    /[\\"\x00-\x1f\x7f\x85\u2028\u2029]/g,
    (ch) =>
      shortEscapes[ch] ??
      `\\u${ch.charCodeAt(0).toString(16).padStart(4, "0")}`
  );

  return `"${escaped}"`;
}

// ---------------------------------------------------------------------------
// Turndown (HTML -> Markdown) setup
// ---------------------------------------------------------------------------

/**
 * Build the HTML-to-Markdown converter used for episode bodies: ATX headings,
 * "-" bullets, fenced code blocks and "*" emphasis, with <script> and <style>
 * elements removed from the output.
 *
 * @returns {TurndownService} The configured converter.
 */
function createTurndown() {
  const options = {
    headingStyle: "atx",
    bulletListMarker: "-",
    codeBlockStyle: "fenced",
    emDelimiter: "*",
  };
  const strippedElements = ["script", "style"];

  const service = new TurndownService(options);
  service.remove(strippedElements);
  return service;
}

/**
 * Turn an HTML fragment into tidy markdown.
 *
 * Anything that is not a string with non-whitespace content yields "" without
 * touching the converter. Otherwise the converter output has runs of three or
 * more newlines collapsed to a single blank line, remaining HTML entities
 * decoded, and surrounding whitespace trimmed.
 *
 * @param {*} html - HTML source; non-strings and blank strings are treated as empty.
 * @param {{turndown: (html: string) => string}} turndown - Converter instance.
 * @returns {string} The markdown text.
 */
function htmlToMarkdown(html, turndown) {
  const hasContent = typeof html === "string" && html.trim().length > 0;
  if (!hasContent) return "";

  const converted = turndown.turndown(html);
  const compacted = converted.replace(/\n{3,}/g, "\n\n");
  return decodeHtmlEntities(compacted).trim();
}

// ---------------------------------------------------------------------------
// Feed fetching and parsing
// ---------------------------------------------------------------------------

/**
 * Download the RSS feed and return its body as text, logging progress to the
 * console.
 *
 * @param {string} url - Feed URL to request.
 * @returns {Promise<string>} The response body.
 * @throws {Error} When the response status is not OK.
 */
async function fetchFeed(url) {
  console.log(`Fetching RSS feed from ${url} ...`);

  const requestHeaders = {
    "User-Agent": "GuruShowImporter/1.0",
    Accept: "application/rss+xml, application/xml, text/xml",
  };
  const response = await fetch(url, { headers: requestHeaders });

  if (response.ok) {
    const xml = await response.text();
    console.log(`Received ${xml.length.toLocaleString()} bytes of XML.`);
    return xml;
  }

  const { status, statusText } = response;
  throw new Error(`Failed to fetch feed: ${status} ${statusText}`);
}

/**
 * Parse RSS XML and return the channel's <item> entries.
 *
 * @param {string} xml - Raw feed XML.
 * @returns {object[]} Parsed item objects, in feed order.
 * @throws {Error} When the document has no rss/channel element or the channel
 *   contains no items.
 */
function parseFeed(xml) {
  const parserOptions = {
    ignoreAttributes: false,
    attributeNamePrefix: "@_",
    // CDATA content is exposed under its own property
    cdataPropName: "__cdata",
    // Property name for plain text nodes
    textNodeName: "#text",
    // Leave whitespace in values untouched
    trimValues: false,
    // A single <item> must still come back as an array
    isArray: (name) => name === "item",
  };

  const document = new XMLParser(parserOptions).parse(xml);
  const channel = document?.rss?.channel;
  if (!channel) {
    throw new Error("Invalid RSS feed: no channel element found.");
  }

  const { item: items } = channel;
  const hasItems = Array.isArray(items) && items.length > 0;
  if (!hasItems) {
    throw new Error("No episodes found in feed.");
  }

  console.log(`Parsed ${items.length} episode items from feed.`);
  return items;
}

// ---------------------------------------------------------------------------
// Episode extraction
// ---------------------------------------------------------------------------

/**
 * Reduce a parsed XML node to its text.
 *
 * Strings are trimmed and numbers stringified. For objects, a `__cdata`
 * property takes precedence (trimmed when it is a string, "" otherwise),
 * followed by a `#text` property (stringified and trimmed). Anything else,
 * including null and undefined, yields "".
 *
 * @param {*} node - Value produced by the XML parser for an element.
 * @returns {string} The extracted text.
 */
function extractText(node) {
  if (node === undefined || node === null) return "";

  switch (typeof node) {
    case "string":
      return node.trim();
    case "number":
      return String(node);
    case "object": {
      const cdata = node.__cdata;
      if (cdata !== undefined) {
        return typeof cdata === "string" ? cdata.trim() : "";
      }
      const text = node["#text"];
      return text !== undefined ? String(text).trim() : "";
    }
    default:
      return "";
  }
}

/**
 * Convert one parsed RSS item into the episode record used for file generation.
 *
 * @param {object} item - Parsed <item> element from the feed.
 * @param {number} index - Position of the item in the feed; when the title
 *   carries no season/episode code, `index + 1` becomes the episode number
 *   and the season is set to 0.
 * @param {string[]} warnings - Collector that receives a message when the
 *   season/episode cannot be parsed or the item has no body text.
 * @returns {{title: string, season: number, episode: number, pubDate: string,
 *   bodyHtml: string, audioUrl: string, audioSize: number, duration: string,
 *   link: string, tags: string[]}} The episode record.
 */
function processItem(item, index, warnings) {
  const rawTitle = extractText(item.title);
  const title = cleanTitle(rawTitle);

  // Season/episode come from the raw title; fall back to feed position.
  const code = parseSeasonEpisode(rawTitle);
  const season = code ? code.season : 0;
  const episode = code ? code.episode : index + 1;
  if (!code) {
    warnings.push(
      `[WARNING] Could not parse S/E from title: "${title}" -- using season=0, episode=${episode}`
    );
  }

  const pubDate = formatDate(extractText(item.pubDate));

  // content:encoded carries the full body; description is the fallback.
  const contentEncoded = extractText(item["content:encoded"]);
  const description = extractText(item.description);
  const bodyHtml = contentEncoded || description || "";
  if (bodyHtml.trim().length === 0) {
    warnings.push(`[WARNING] Empty description for: "${title}"`);
  }

  // Enclosure data may arrive as attributes (@_url/@_length) or child nodes.
  const { enclosure } = item;
  const audioUrl = enclosure
    ? normalizeAudioUrl(enclosure["@_url"] || extractText(enclosure.url) || "")
    : "";
  const audioSize = enclosure
    ? parseInt(
        enclosure["@_length"] || extractText(enclosure.length) || "0",
        10
      ) || 0
    : 0;

  const duration = extractText(item["itunes:duration"]) || "";
  const link = extractText(item.link) || "";
  const tags = extractTags(title, description);

  return {
    title,
    season,
    episode,
    pubDate,
    bodyHtml,
    audioUrl,
    audioSize,
    duration,
    link,
    tags,
  };
}

// ---------------------------------------------------------------------------
// Markdown file generation
// ---------------------------------------------------------------------------

/**
 * Render an episode record as a markdown document: a YAML frontmatter block
 * followed by a blank line, the body converted from HTML, and a final newline.
 *
 * @param {object} ep - Episode record as returned by processItem.
 * @param {object} turndown - HTML-to-markdown converter passed to htmlToMarkdown.
 * @returns {string} Full file contents.
 */
function generateMarkdown(ep, turndown) {
  const body = htmlToMarkdown(ep.bodyHtml, turndown);

  // An empty tag array naturally renders as "[]".
  const quotedTags = ep.tags.map((tag) => `"${tag}"`);
  const tagList = `[${quotedTags.join(", ")}]`;

  const fields = [
    `title: ${yamlString(ep.title)}`,
    `season: ${ep.season}`,
    `episode: ${ep.episode}`,
    `pubDate: ${ep.pubDate}`,
    `duration: ${yamlString(ep.duration)}`,
    `audioUrl: ${yamlString(ep.audioUrl)}`,
    `audioSize: ${ep.audioSize}`,
    `episodeType: "full"`,
    `originalUrl: ${yamlString(ep.link)}`,
    `featured: false`,
    `classic: false`,
    `tags: ${tagList}`,
  ];
  const frontmatter = ["---", ...fields, "---"].join("\n");

  return `${frontmatter}\n\n${body}\n`;
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

/**
 * Entry point: fetch the feed, convert every item into an episode record,
 * write one markdown file per episode into EPISODES_DIR, and print a report
 * (counts, collected warnings, and a preview of the first three files).
 *
 * When two episodes produce the same filename, later ones receive a numeric
 * suffix ("-2", "-3", ...). The suffix is deterministic, so re-running the
 * import against the same feed overwrites the same files instead of creating
 * new ones, and several duplicates of one name can never collide.
 *
 * @returns {Promise<void>}
 * @throws {Error} Propagates fetch, parse and file-system failures.
 */
async function main() {
  console.log("=== Computer Guru Show Episode Importer ===\n");

  // Ensure output directory exists
  if (!existsSync(EPISODES_DIR)) {
    mkdirSync(EPISODES_DIR, { recursive: true });
    console.log(`Created directory: ${EPISODES_DIR}`);
  }

  // Fetch and parse
  const xml = await fetchFeed(FEED_URL);
  const items = parseFeed(xml);

  // Process episodes
  const turndown = createTurndown();
  const warnings = [];
  const episodes = items.map((item, i) => processItem(item, i, warnings));

  // Sort by season then episode (oldest first)
  episodes.sort((a, b) => {
    if (a.season !== b.season) return a.season - b.season;
    return a.episode - b.episode;
  });

  // Write files
  const SAMPLE_COUNT = 3;
  const filenames = new Set();
  const samples = [];
  let written = 0;
  let resolvedDuplicates = 0;

  for (const ep of episodes) {
    const baseName = episodeFilename(ep.season, ep.episode, ep.title);
    let filename = baseName;

    // Handle duplicate filenames: probe "-2", "-3", ... until the name is free.
    if (filenames.has(filename)) {
      let counter = 2;
      do {
        filename = baseName.replace(/\.md$/, `-${counter}.md`);
        counter++;
      } while (filenames.has(filename));

      warnings.push(
        `[WARNING] Duplicate filename resolved: ${filename} for "${ep.title}"`
      );
      resolvedDuplicates++;
    }
    filenames.add(filename);

    const markdown = generateMarkdown(ep, turndown);
    writeFileSync(join(EPISODES_DIR, filename), markdown, "utf-8");
    written++;

    // Remember what was actually written so the preview shows real filenames.
    if (samples.length < SAMPLE_COUNT) {
      samples.push({ filename, markdown });
    }
  }

  // Report
  console.log("\n=== Import Complete ===");
  console.log(`Total episodes in feed: ${items.length}`);
  console.log(`Files written: ${written}`);

  if (resolvedDuplicates > 0) {
    console.log(`Duplicate filenames resolved: ${resolvedDuplicates}`);
  }

  if (warnings.length > 0) {
    console.log(`\n--- Warnings (${warnings.length}) ---`);
    for (const w of warnings) {
      console.log(w);
    }
  }

  // Show a sample of generated files
  console.log("\n--- Sample Output (first 3 files) ---");
  for (const { filename, markdown } of samples) {
    console.log(`\nFile: ${filename}`);
    // Show first 20 lines
    const lines = markdown.split("\n");
    console.log(lines.slice(0, 20).join("\n"));
    if (lines.length > 20) {
      console.log("  ...(truncated)");
    }
  }

  console.log("\n[SUCCESS] Episode import finished.");
}

// Run the importer; on failure, print the message and stack, then exit with status 1.
main().catch((err) => {
  const summary = `[ERROR] Import failed: ${err.message}`;
  console.error(summary);
  console.error(err.stack);
  process.exit(1);
});