# claudetools/projects/discord-bot/scripts/web-fetch-chrome.py

#!/usr/bin/env python
"""Fetch a page with real (headless) Chrome when plain HTTP/WebFetch is bot-blocked.

Drives the installed Chrome 148 via Playwright's channel="chrome" (no bundled
Chromium download). Runs headless in an isolated temp profile, so it never touches
the interactive Chrome session a human may have open on BEAST.

Usage (always invoke with the bot venv's python):
  projects/discord-bot/.venv/Scripts/python.exe \
    projects/discord-bot/scripts/web-fetch-chrome.py "<url>" [options]

Options:
  --html              Output raw rendered HTML instead of readable body text (default: text)
  --selector CSS      Wait for and extract only this element's text/HTML
  --max-chars N       Truncate output to N chars (default 8000; 0 = no limit)
  --settle-ms N       Extra wait after load for JS to render (default 1500)
  --timeout-ms N      Navigation timeout (default 25000)
  --wait-until STATE  domcontentloaded | load | networkidle (default: load)
  --zip CODE          Set delivery/location zip code for supported retailers
                      (Amazon, Best Buy). Defaults to 85715 (Tucson, AZ).
                      Pass empty string to skip: --zip ""

Exit codes: 0 ok, 2 navigation/render error (or Playwright not importable), 3 bad usage.
Errors go to stderr; page content goes to stdout.
"""
from __future__ import annotations

import argparse
import sys


def _host_is(host: str, domain: str) -> bool:
    """Return True if *host* is *domain* itself or one of its subdomains."""
    return host == domain or host.endswith("." + domain)


def _preset_zip(page, url: str, zip_code: str) -> None:
    """Best-effort: set the delivery zip on a supported retailer before the real fetch.

    Only acts when *url*'s host is amazon.com / bestbuy.com (or a subdomain of
    either). Failures inside the retailer preflight are reported on stderr as a
    warning and otherwise ignored, so the caller can still load the target URL.

    Args:
        page: Playwright page that will later be used to load *url*.
        url: The target URL the user asked for (used only to pick the retailer).
        zip_code: Non-empty zip/postal code to apply.
    """
    from urllib.parse import quote, urlparse

    # .hostname drops any port/userinfo and is already lower-cased by urllib.
    host = urlparse(url).hostname or ""
    try:
        if _host_is(host, "amazon.com"):
            # Load homepage so session cookies exist, then use the
            # location picker UI (most reliable — no CSRF tokens needed).
            page.goto("https://www.amazon.com", wait_until="load", timeout=15000)
            page.wait_for_timeout(1200)
            # Click the "Delivering to..." location widget in the nav bar.
            page.click("#glow-ingress-block", timeout=6000)
            page.wait_for_selector("#GLUXZipUpdateInput", timeout=6000)
            page.fill("#GLUXZipUpdateInput", zip_code)
            page.wait_for_timeout(300)
            page.click('[data-action="GLUXPostalUpdateAction"]', timeout=5000)
            page.wait_for_timeout(1000)
        elif _host_is(host, "bestbuy.com"):
            # Best Buy uses a GraphQL-backed zip picker; the query param approach
            # is the most reliable headless method. The zip is percent-encoded so
            # it cannot inject extra query parameters.
            page.goto(
                "https://www.bestbuy.com/site/searchpage.jsp?st=test&postalCode="
                + quote(zip_code, safe=""),
                wait_until="load",
                timeout=10000,
            )
            page.wait_for_timeout(500)
    except Exception as exc:
        # Non-fatal by design: the main fetch still runs. Only the first line of
        # the message is shown to keep the warning to a single line.
        detail = next(iter(str(exc).splitlines()), "")
        print(f"[WARN] could not pre-set zip for {host}: {type(exc).__name__}: {detail}",
              file=sys.stderr)


def main() -> int:
    """Parse the command line, fetch the URL with headless Chrome, print the result.

    Page content (text by default, HTML with --html, optionally narrowed by
    --selector and truncated by --max-chars) is written to stdout; diagnostics
    go to stderr.

    Returns:
        0 on success, 2 if Playwright is not importable or the browser
        launch/navigation/extraction fails, 3 on bad usage (argparse errors or
        a URL that is not http/https).

    Raises:
        SystemExit: only for -h/--help, which argparse handles itself (status 0).
    """
    ap = argparse.ArgumentParser(add_help=True)
    ap.add_argument("url")
    ap.add_argument("--html", action="store_true")
    ap.add_argument("--selector", default=None)
    ap.add_argument("--max-chars", type=int, default=8000)
    ap.add_argument("--settle-ms", type=int, default=1500)
    ap.add_argument("--timeout-ms", type=int, default=25000)
    ap.add_argument("--wait-until", default="load",
                    choices=["domcontentloaded", "load", "networkidle"])
    ap.add_argument("--zip", default="85715",
                    help="Delivery zip for Amazon/Best Buy (default: 85715). Pass empty to skip.")
    try:
        args = ap.parse_args()
    except SystemExit as exc:
        # argparse exits with status 2 on usage errors, which would be
        # indistinguishable from this script's "navigation/render error" code.
        # Map it to 3; let the clean --help exit (status 0) through untouched.
        if not exc.code:
            raise
        return 3

    if not args.url.lower().startswith(("http://", "https://")):
        print("[ERROR] url must start with http:// or https://", file=sys.stderr)
        return 3

    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        print("[ERROR] playwright not installed in this interpreter — "
              "use the bot venv: projects/discord-bot/.venv/Scripts/python.exe", file=sys.stderr)
        return 2

    # The whole Playwright session sits inside the try so that a failed browser
    # launch or UA probe is reported as exit code 2 rather than a raw traceback.
    try:
        with sync_playwright() as p:
            # Drive the installed Chrome (not bundled Chromium). Strip the automation
            # flags so navigator.webdriver isn't a dead giveaway to bot detectors.
            browser = p.chromium.launch(
                channel="chrome",
                headless=True,
                args=["--disable-blink-features=AutomationControlled", "--disable-gpu"],
                ignore_default_args=["--enable-automation"],
            )
            try:
                # Strip the "HeadlessChrome" token from the UA (a common bot-detection
                # tell), derived from the live UA so it tracks the installed version.
                probe = browser.new_context()
                ua = probe.new_page().evaluate("() => navigator.userAgent")
                ua = ua.replace("HeadlessChrome", "Chrome")
                probe.close()

                ctx = browser.new_context(
                    viewport={"width": 1366, "height": 900},
                    locale="en-US",
                    user_agent=ua,
                )
                page = ctx.new_page()

                # Pre-set delivery zip for supported retailers before the target URL.
                if args.zip:
                    _preset_zip(page, args.url, args.zip)

                page.goto(args.url, wait_until=args.wait_until, timeout=args.timeout_ms)
                if args.settle_ms > 0:
                    page.wait_for_timeout(args.settle_ms)
                if args.selector:
                    page.wait_for_selector(args.selector, timeout=args.timeout_ms)
                    target = page.query_selector(args.selector)
                    if target is None:
                        out = ""
                    else:
                        out = target.inner_html() if args.html else target.inner_text()
                else:
                    out = page.content() if args.html else page.inner_text("body")
            finally:
                browser.close()
    except PWTimeout:
        print(f"[ERROR] timed out loading {args.url}", file=sys.stderr)
        return 2
    except Exception as e:  # launch, navigation, DNS, TLS, blocked, etc.
        print(f"[ERROR] {type(e).__name__}: {e}", file=sys.stderr)
        return 2

    out = out or ""
    if args.max_chars > 0 and len(out) > args.max_chars:
        out = out[: args.max_chars] + f"\n...[truncated at {args.max_chars} chars]"

    # Page text routinely contains characters outside a legacy console/pipe code
    # page; substitute them instead of dying with UnicodeEncodeError after the
    # fetch already succeeded. Guarded because stdout may be a replaced stream
    # object without reconfigure().
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(errors="replace")
    sys.stdout.write(out)
    return 0


# Script entry point: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())