137 lines
6.3 KiB
Python
137 lines
6.3 KiB
Python
#!/usr/bin/env python
|
|
"""Fetch a page with real (headless) Chrome when plain HTTP/WebFetch is bot-blocked.
|
|
|
|
Drives the installed Chrome 148 via Playwright's channel="chrome" (no bundled
|
|
Chromium download). Runs headless in an isolated temp profile, so it never touches
|
|
the interactive Chrome session a human may have open on BEAST.
|
|
|
|
Usage (always invoke with the bot venv's python):
|
|
projects/discord-bot/.venv/Scripts/python.exe \
|
|
projects/discord-bot/scripts/web-fetch-chrome.py "<url>" [options]
|
|
|
|
Options:
|
|
--html Output raw rendered HTML instead of readable body text (default: text)
|
|
--selector CSS Wait for and extract only this element's text/HTML
|
|
--max-chars N Truncate output to N chars (default 8000; 0 = no limit)
|
|
--settle-ms N Extra wait after load for JS to render (default 1500)
|
|
--timeout-ms N Navigation timeout (default 25000)
|
|
--wait-until STATE domcontentloaded | load | networkidle (default: load)
|
|
--zip CODE Set delivery/location zip code for supported retailers
|
|
(Amazon, Best Buy). Defaults to 85715 (Tucson, AZ).
|
|
Pass empty string to skip: --zip ""
|
|
|
|
Exit codes: 0 ok, 2 navigation/render error, 3 bad usage.
|
|
Errors go to stderr; page content goes to stdout.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import sys
|
|
|
|
|
|
def _host_matches(host, domain):
    """Return True when *host* is *domain* itself or one of its subdomains."""
    return host == domain or host.endswith("." + domain)


def _preset_delivery_zip(page, url, zip_code):
    """Best-effort: set the delivery zip on a supported retailer before the real fetch.

    Only Amazon and Best Buy are handled. Any failure inside the retailer flow
    is reported on stderr as a warning and otherwise ignored, so the caller can
    still load the target URL.
    """
    from urllib.parse import quote, urlparse

    # .hostname drops port/userinfo and is already lower-cased; match on the
    # registered domain so "notamazon.com" or "amazon.com.evil.example" do not
    # trigger the retailer flow.
    host = urlparse(url).hostname or ""
    try:
        if _host_matches(host, "amazon.com"):
            # Load homepage so session cookies exist, then use the
            # location picker UI (most reliable — no CSRF tokens needed).
            page.goto("https://www.amazon.com", wait_until="load", timeout=15000)
            page.wait_for_timeout(1200)
            # Click the "Delivering to..." location widget in the nav bar.
            page.click("#glow-ingress-block", timeout=6000)
            page.wait_for_selector("#GLUXZipUpdateInput", timeout=6000)
            page.fill("#GLUXZipUpdateInput", zip_code)
            page.wait_for_timeout(300)
            page.click('[data-action="GLUXPostalUpdateAction"]', timeout=5000)
            page.wait_for_timeout(1000)
        elif _host_matches(host, "bestbuy.com"):
            # Best Buy uses a GraphQL-backed zip picker; the query param approach
            # is the most reliable headless method. The zip is percent-encoded so
            # an odd value cannot inject extra query parameters.
            page.goto(
                "https://www.bestbuy.com/site/searchpage.jsp?st=test&postalCode="
                + quote(zip_code, safe=""),
                wait_until="load",
                timeout=10000,
            )
            page.wait_for_timeout(500)
    except Exception as exc:
        # Non-fatal — continue to the main URL, but say why the zip was not set.
        print(
            f"[WARN] could not pre-set zip {zip_code!r} for {host}: "
            f"{type(exc).__name__}: {exc}",
            file=sys.stderr,
        )


def _render_page(browser, args):
    """Load ``args.url`` in a fresh context of *browser* and return text or HTML."""
    # Strip the "HeadlessChrome" token from the UA (a common bot-detection tell),
    # derived from the live UA so it tracks the installed Chrome version.
    probe_ctx = browser.new_context()
    live_ua = probe_ctx.new_page().evaluate("() => navigator.userAgent")
    probe_ctx.close()

    ctx = browser.new_context(
        viewport={"width": 1366, "height": 900},
        locale="en-US",
        user_agent=live_ua.replace("HeadlessChrome", "Chrome"),
    )
    page = ctx.new_page()

    # Pre-set delivery zip for supported retailers before loading the target URL.
    if args.zip:
        _preset_delivery_zip(page, args.url, args.zip)

    page.goto(args.url, wait_until=args.wait_until, timeout=args.timeout_ms)
    if args.settle_ms > 0:
        page.wait_for_timeout(args.settle_ms)

    if args.selector:
        page.wait_for_selector(args.selector, timeout=args.timeout_ms)
        target = page.query_selector(args.selector)
        if not target:
            return ""
        return target.inner_html() if args.html else target.inner_text()
    return page.content() if args.html else page.inner_text("body")


def main(argv: "list[str] | None" = None) -> int:
    """Fetch one URL with headless Chrome and write the result to stdout.

    Args:
        argv: Command-line arguments without the program name. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 on success, 2 when Playwright is missing or the
        browser launch / navigation / render fails, 3 on bad usage.
    """
    ap = argparse.ArgumentParser(add_help=True)
    ap.add_argument("url")
    ap.add_argument("--html", action="store_true")
    ap.add_argument("--selector", default=None)
    ap.add_argument("--max-chars", type=int, default=8000)
    ap.add_argument("--settle-ms", type=int, default=1500)
    ap.add_argument("--timeout-ms", type=int, default=25000)
    ap.add_argument("--wait-until", default="load",
                    choices=["domcontentloaded", "load", "networkidle"])
    ap.add_argument("--zip", default="85715",
                    help="Delivery zip for Amazon/Best Buy (default: 85715). Pass empty to skip.")
    try:
        args = ap.parse_args(argv)
    except SystemExit as exc:
        # argparse exits with status 2 on a usage error, which would be
        # indistinguishable from a navigation error; report bad usage as 3.
        # --help exits with 0 and stays 0.
        return 3 if exc.code else 0

    if not args.url.lower().startswith(("http://", "https://")):
        print("[ERROR] url must start with http:// or https://", file=sys.stderr)
        return 3

    try:
        from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
    except ImportError:
        print("[ERROR] playwright not installed in this interpreter — "
              "use the bot venv: projects/discord-bot/.venv/Scripts/python.exe", file=sys.stderr)
        return 2

    # Playwright start-up and the browser launch are inside the try as well, so
    # a missing Chrome install is reported as exit code 2 rather than a traceback.
    try:
        with sync_playwright() as p:
            # Drive the installed Chrome (not bundled Chromium). Strip the automation
            # flags so navigator.webdriver isn't a dead giveaway to bot detectors.
            browser = p.chromium.launch(
                channel="chrome",
                headless=True,
                args=["--disable-blink-features=AutomationControlled", "--disable-gpu"],
                ignore_default_args=["--enable-automation"],
            )
            try:
                out = _render_page(browser, args)
            finally:
                browser.close()
    except PWTimeout:
        print(f"[ERROR] timed out loading {args.url}", file=sys.stderr)
        return 2
    except Exception as e:  # launch, navigation, DNS, TLS, blocked, etc.
        print(f"[ERROR] {type(e).__name__}: {e}", file=sys.stderr)
        return 2

    out = out or ""
    if args.max_chars > 0 and len(out) > args.max_chars:
        out = out[: args.max_chars] + f"\n...[truncated at {args.max_chars} chars]"

    # Page text can contain characters the stdout encoding cannot represent
    # (e.g. a legacy Windows code page); substitute them instead of crashing
    # after the fetch has already succeeded. The encoding itself is unchanged.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(errors="replace")
    sys.stdout.write(out)
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    sys.exit(main())
|