Files
claudetools/projects/discord-bot/scripts/web-fetch-chrome.py
Mike Swanson 51d55566bf sync: auto-sync from GURU-BEAST-ROG at 2026-05-22 13:13:08
Author: Mike Swanson
Machine: GURU-BEAST-ROG
Timestamp: 2026-05-22 13:13:08
2026-05-22 13:13:09 -07:00

100 lines
4.1 KiB
Python

#!/usr/bin/env python
"""Fetch a page with real (headless) Chrome when plain HTTP/WebFetch is bot-blocked.
Drives the installed Chrome 148 via Playwright's channel="chrome" (no bundled
Chromium download). Runs headless in an isolated temp profile, so it never touches
the interactive Chrome session a human may have open on BEAST.
Usage (always invoke with the bot venv's python):
    projects/discord-bot/.venv/Scripts/python.exe \
        projects/discord-bot/scripts/web-fetch-chrome.py "<url>" [options]

Options:
    --html              Output raw rendered HTML instead of readable body text (default: text)
    --selector CSS      Wait for and extract only this element's text/HTML
    --max-chars N       Truncate output to N chars (default 8000; 0 = no limit)
    --settle-ms N       Extra wait after load for JS to render (default 1500)
    --timeout-ms N      Navigation timeout (default 25000)
    --wait-until STATE  domcontentloaded | load | networkidle (default: load)

Exit codes: 0 ok, 2 navigation/render error (or Playwright not installed), 3 bad usage.
Errors go to stderr; page content goes to stdout.
"""
from __future__ import annotations
import argparse
import sys
def main() -> int:
    """Fetch one URL with headless Chrome and write the rendered page to stdout.

    Arguments are read from ``sys.argv``: a positional ``url`` plus the
    optional ``--html``, ``--selector``, ``--max-chars``, ``--settle-ms``,
    ``--timeout-ms`` and ``--wait-until`` flags.

    Returns:
        The process exit status:
        0 -- page content was written to stdout;
        2 -- Playwright is not importable, the browser could not be launched,
             or navigation/rendering failed or timed out;
        3 -- bad usage (argparse rejected the command line, or the URL does
             not start with http:// or https://).

    Raises:
        SystemExit: with status 0 when ``-h``/``--help`` is requested.
    """
    ap = argparse.ArgumentParser(add_help=True)
    ap.add_argument("url")
    ap.add_argument("--html", action="store_true")
    ap.add_argument("--selector", default=None)
    ap.add_argument("--max-chars", type=int, default=8000)
    ap.add_argument("--settle-ms", type=int, default=1500)
    ap.add_argument("--timeout-ms", type=int, default=25000)
    ap.add_argument("--wait-until", default="load",
                    choices=["domcontentloaded", "load", "networkidle"])
    try:
        args = ap.parse_args()
    except SystemExit as exc:
        # argparse exits with status 2 on a usage error, which would be
        # indistinguishable from the "navigation/render error" code. It has
        # already printed its message to stderr, so only the status is remapped.
        # --help exits with 0 and is passed through untouched.
        if not exc.code:
            raise
        return 3

    if not args.url.lower().startswith(("http://", "https://")):
        print("[ERROR] url must start with http:// or https://", file=sys.stderr)
        return 3

    # Imported lazily so that usage errors are reported even when the script is
    # run with an interpreter that lacks Playwright.
    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        print("[ERROR] playwright not installed in this interpreter — "
              "use the bot venv: projects/discord-bot/.venv/Scripts/python.exe", file=sys.stderr)
        return 2

    with sync_playwright() as p:
        # Drive the installed Chrome (not bundled Chromium). Strip the automation
        # flags so navigator.webdriver isn't a dead giveaway to bot detectors.
        try:
            browser = p.chromium.launch(
                channel="chrome",
                headless=True,
                args=["--disable-blink-features=AutomationControlled", "--disable-gpu"],
                ignore_default_args=["--enable-automation"],
            )
        except Exception as e:  # Chrome not installed, launch timeout, etc.
            print(f"[ERROR] {type(e).__name__}: {e}", file=sys.stderr)
            return 2

        try:
            # Strip the "HeadlessChrome" token from the UA (a common bot-detection
            # tell), derived from the live UA so it tracks the installed Chrome
            # version. Done inside the try so a failure here still closes the
            # browser and maps to exit code 2.
            tmp = browser.new_context()
            ua = tmp.new_page().evaluate("() => navigator.userAgent").replace("HeadlessChrome", "Chrome")
            tmp.close()

            ctx = browser.new_context(
                viewport={"width": 1366, "height": 900},
                locale="en-US",
                user_agent=ua,
            )
            page = ctx.new_page()

            page.goto(args.url, wait_until=args.wait_until, timeout=args.timeout_ms)
            if args.settle_ms > 0:
                page.wait_for_timeout(args.settle_ms)
            if args.selector:
                page.wait_for_selector(args.selector, timeout=args.timeout_ms)
                target = page.query_selector(args.selector)
                out = (target.inner_html() if args.html else target.inner_text()) if target else ""
            else:
                out = page.content() if args.html else page.inner_text("body")
        except PWTimeout:
            print(f"[ERROR] timed out loading {args.url}", file=sys.stderr)
            return 2
        except Exception as e:  # navigation, DNS, TLS, blocked, etc.
            print(f"[ERROR] {type(e).__name__}: {e}", file=sys.stderr)
            return 2
        finally:
            browser.close()

    out = out or ""
    if args.max_chars > 0 and len(out) > args.max_chars:
        out = out[: args.max_chars] + f"\n...[truncated at {args.max_chars} chars]"

    # Page text can contain characters the stdout encoding cannot represent
    # (e.g. a legacy Windows code page when piped). Keep the encoding as-is but
    # substitute unencodable characters instead of raising after the fetch
    # has already succeeded.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(errors="replace")
    sys.stdout.write(out)
    return 0
# Script entry point: hand main()'s integer result to the interpreter as the
# process exit status (sys.exit raises SystemExit with that value).
if __name__ == "__main__":
    sys.exit(main())