#!/usr/bin/env python
"""Fetch a page with real (headless) Chrome when plain HTTP/WebFetch is bot-blocked.

Drives the installed Chrome 148 via Playwright's channel="chrome" (no bundled
Chromium download). Runs headless in an isolated temp profile, so it never
touches the interactive Chrome session a human may have open on BEAST.

Usage (always invoke with the bot venv's python):
    projects/discord-bot/.venv/Scripts/python.exe \
        projects/discord-bot/scripts/web-fetch-chrome.py "<url>" [options]

Options:
    --html              Output raw rendered HTML instead of readable body text
                        (default: text)
    --selector CSS      Wait for and extract only this element's text/HTML
    --max-chars N       Truncate output to N chars (default 8000; 0 = no limit)
    --settle-ms N       Extra wait after load for JS to render (default 1500)
    --timeout-ms N      Navigation timeout (default 25000)
    --wait-until STATE  domcontentloaded | load | networkidle (default: load)
    --zip CODE          Set delivery/location zip code for supported retailers
                        (Amazon, Best Buy). Defaults to 85715 (Tucson, AZ).
                        Pass empty string to skip: --zip ""

Exit codes: 0 ok, 2 navigation/render error, 3 bad usage.
Errors go to stderr; page content goes to stdout.
"""
from __future__ import annotations

import argparse
import sys
from urllib.parse import quote, urlparse

EXIT_OK = 0
EXIT_RENDER_ERROR = 2
EXIT_BAD_USAGE = 3


def _build_parser() -> argparse.ArgumentParser:
    """Return the command-line parser for this script."""
    ap = argparse.ArgumentParser(add_help=True)
    ap.add_argument("url")
    ap.add_argument("--html", action="store_true")
    ap.add_argument("--selector", default=None)
    ap.add_argument("--max-chars", type=int, default=8000)
    ap.add_argument("--settle-ms", type=int, default=1500)
    ap.add_argument("--timeout-ms", type=int, default=25000)
    ap.add_argument(
        "--wait-until",
        default="load",
        choices=["domcontentloaded", "load", "networkidle"],
    )
    ap.add_argument(
        "--zip",
        default="85715",
        help="Delivery zip for Amazon/Best Buy (default: 85715). Pass empty to skip.",
    )
    return ap


def _host_in_domain(host: str, domain: str) -> bool:
    """Return True when *host* is *domain* itself or one of its subdomains."""
    return host == domain or host.endswith("." + domain)


def _preset_delivery_zip(page, url: str, zip_code: str) -> None:
    """Best-effort: set the delivery zip on a supported retailer before the real fetch.

    Failures are never fatal; they are reported as a warning on stderr and the
    caller proceeds to load the target URL regardless.
    """
    # Use the parsed hostname (no port/userinfo) and match on a domain
    # boundary, so look-alike hosts such as "notamazon.com" or
    # "amazon.com.mx" do not trigger the amazon.com flow.
    host = (urlparse(url).hostname or "").lower().rstrip(".")

    if _host_in_domain(host, "amazon.com"):
        try:
            # Load homepage so session cookies exist, then use the
            # location picker UI (most reliable — no CSRF tokens needed).
            page.goto("https://www.amazon.com", wait_until="load", timeout=15000)
            page.wait_for_timeout(1200)
            # Click the "Delivering to..." location widget in the nav bar.
            page.click("#glow-ingress-block", timeout=6000)
            page.wait_for_selector("#GLUXZipUpdateInput", timeout=6000)
            page.fill("#GLUXZipUpdateInput", zip_code)
            page.wait_for_timeout(300)
            page.click('[data-action="GLUXPostalUpdateAction"]', timeout=5000)
            page.wait_for_timeout(1000)
        except Exception as exc:  # non-fatal — continue to main URL
            print(
                f"[WARN] could not pre-set Amazon zip: {type(exc).__name__}",
                file=sys.stderr,
            )
    elif _host_in_domain(host, "bestbuy.com"):
        try:
            # Best Buy uses a GraphQL-backed zip picker; the query param approach
            # is the most reliable headless method. Percent-encode the value so
            # it cannot inject extra query parameters.
            encoded_zip = quote(zip_code, safe="")
            page.goto(
                "https://www.bestbuy.com/site/searchpage.jsp"
                f"?st=test&postalCode={encoded_zip}",
                wait_until="load",
                timeout=10000,
            )
            page.wait_for_timeout(500)
        except Exception as exc:  # non-fatal
            print(
                f"[WARN] could not pre-set Best Buy zip: {type(exc).__name__}",
                file=sys.stderr,
            )


def _render(browser, args: argparse.Namespace) -> str:
    """Load ``args.url`` in *browser* and return the requested text or HTML."""
    # Strip the "HeadlessChrome" token from the UA (a common bot-detection tell),
    # derived from the live UA so it tracks the installed Chrome version.
    probe = browser.new_context()
    try:
        live_ua = probe.new_page().evaluate("() => navigator.userAgent")
    finally:
        probe.close()
    ua = live_ua.replace("HeadlessChrome", "Chrome")

    ctx = browser.new_context(
        viewport={"width": 1366, "height": 900},
        locale="en-US",
        user_agent=ua,
    )
    page = ctx.new_page()

    # Pre-set delivery zip for supported retailers before loading the target URL.
    if args.zip:
        _preset_delivery_zip(page, args.url, args.zip)

    page.goto(args.url, wait_until=args.wait_until, timeout=args.timeout_ms)
    if args.settle_ms > 0:
        page.wait_for_timeout(args.settle_ms)

    if args.selector:
        page.wait_for_selector(args.selector, timeout=args.timeout_ms)
        target = page.query_selector(args.selector)
        if target is None:
            return ""
        return target.inner_html() if args.html else target.inner_text()
    return page.content() if args.html else page.inner_text("body")


def _truncate(text: str, max_chars: int) -> str:
    """Cut *text* to *max_chars* and append a marker; ``max_chars <= 0`` disables it."""
    if max_chars > 0 and len(text) > max_chars:
        return text[:max_chars] + f"\n...[truncated at {max_chars} chars]"
    return text


def _write_stdout(text: str) -> None:
    """Write *text* to stdout, replacing characters the stream cannot encode."""
    try:
        sys.stdout.write(text)
    except UnicodeEncodeError:
        # e.g. a cp1252 pipe on Windows: degrade to "?" instead of crashing
        # after the page has already been fetched.
        encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
        sys.stdout.write(text.encode(encoding, errors="replace").decode(encoding))


def main(argv: list[str] | None = None) -> int:
    """Run the fetcher and return the process exit code.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        0 on success, 2 on a navigation/render error (including Playwright
        not being importable), 3 on bad usage.
    """
    ap = _build_parser()
    try:
        args = ap.parse_args(argv)
    except SystemExit as exc:
        # argparse exits with status 2 on a usage error, which would collide
        # with the "navigation/render error" code; report bad usage as 3.
        # --help exits with 0 and stays a success.
        return EXIT_OK if exc.code in (0, None) else EXIT_BAD_USAGE

    if not args.url.lower().startswith(("http://", "https://")):
        print("[ERROR] url must start with http:// or https://", file=sys.stderr)
        return EXIT_BAD_USAGE

    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        print("[ERROR] playwright not installed in this interpreter — "
              "use the bot venv: projects/discord-bot/.venv/Scripts/python.exe",
              file=sys.stderr)
        return EXIT_RENDER_ERROR

    try:
        with sync_playwright() as p:
            # Drive the installed Chrome (not bundled Chromium). Strip the automation
            # flags so navigator.webdriver isn't a dead giveaway to bot detectors.
            browser = p.chromium.launch(
                channel="chrome",
                headless=True,
                args=["--disable-blink-features=AutomationControlled", "--disable-gpu"],
                ignore_default_args=["--enable-automation"],
            )
            try:
                out = _render(browser, args)
            finally:
                browser.close()
    except PWTimeout:
        print(f"[ERROR] timed out loading {args.url}", file=sys.stderr)
        return EXIT_RENDER_ERROR
    except Exception as e:  # launch failure, navigation, DNS, TLS, blocked, etc.
        print(f"[ERROR] {type(e).__name__}: {e}", file=sys.stderr)
        return EXIT_RENDER_ERROR

    _write_stdout(_truncate(out or "", args.max_chars))
    return EXIT_OK


if __name__ == "__main__":
    raise SystemExit(main())