fix(bitdefender): retry 429/5xx/timeout with backoff + reuse one httpx client

Audit fix H2 (+ M2): the live GravityZone tenant is rate-limited and sweeps fan
out one getManagedEndpointDetails per endpoint across every company, which hit a
real HTTP 429 (errorlog 2026-06-21). _post had zero retry and opened a fresh
httpx.Client (new TLS handshake) per request.
- _post now retries 429/500/502/503/504/timeout up to RETRY_MAX_ATTEMPTS with
  bounded exponential backoff + jitter, honoring Retry-After (numeric or HTTP-date).
  Retry notices go to stderr (don't pollute --json). Terminal errors still raise.
- M2: a single httpx.Client is created lazily and reused (connection pooling),
  closed via client.close() in main()'s finally. Makes the docstring's pooling
  claim true and cuts handshake overhead + 429 pressure during sweeps.
- Verified: compile clean; offline unit tests (persistent 429 -> 4 attempts then
  raise, flaky 503 -> recovers, Retry-After honored); live status read OK.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-25 12:51:08 -07:00
parent d8f0974e0f
commit 51751e6473
2 changed files with 109 additions and 8 deletions

View File

@@ -1310,6 +1310,7 @@ HANDLERS = {
def main(argv=None) -> int:
args = build_parser().parse_args(argv)
handler = HANDLERS[args.command]
client = None
try:
client = GravityZoneClient()
rc = handler(client, args)
@@ -1322,6 +1323,9 @@ def main(argv=None) -> int:
return 1
except KeyboardInterrupt:
return 130
finally:
if client is not None:
client.close()
if __name__ == "__main__":

View File

@@ -20,12 +20,15 @@ from __future__ import annotations
import base64
import json
import os
import random
import subprocess
import sys
import time
import urllib.error
import urllib.request
from dataclasses import dataclass, field
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from pathlib import Path
from typing import Any, Optional
@@ -44,6 +47,52 @@ except ImportError: # pragma: no cover - depends on environment
# other data - bound the blast radius rather than echo full bodies into logs.
ERROR_BODY_MAX_CHARS = 500

# --- transient-failure retry policy -------------------------------------------
# Sweeps issue one getManagedEndpointDetails call per endpoint for every
# company, and the live tenant has answered such bursts with real HTTP 429s.
# Requests that fail with 429, a listed 5xx status, or a timeout are therefore
# retried with bounded exponential backoff, and a Retry-After header is honored.

# HTTP status codes treated as transient (worth retrying).
RETRY_STATUSES = frozenset((429, 500, 502, 503, 504))

# Total number of tries: the initial request plus up to (MAX - 1) retries.
RETRY_MAX_ATTEMPTS = 4

# Backoff starts at the base delay and is capped at the max delay (seconds).
RETRY_BASE_DELAY_SECONDS = 1.0
RETRY_MAX_DELAY_SECONDS = 30.0
class _RetryableHTTP(Exception):
    """Private marker exception: a request failed transiently and may be retried.

    Attributes set on the instance:
      code    -- the HTTP status (int) or the string 'timeout'.
      headers -- the headers passed in, or an empty dict when none (or an
                 empty/falsy value) were given.
      detail  -- free-form description supplied by the raiser; "" by default.
    """

    def __init__(self, code, headers=None, detail=""):
        # The exception text itself only carries the code; detail and headers
        # are kept as attributes for the retry loop to inspect.
        super().__init__(f"transient {code}")
        self.code = code
        self.headers = headers if headers else {}
        self.detail = detail
def _retry_delay(headers, attempt: int, *,
                 base_delay: Optional[float] = None,
                 max_delay: Optional[float] = None) -> float:
    """Return the number of seconds to wait before the next retry.

    A usable Retry-After header wins: either a non-negative number of seconds
    or an HTTP-date in the future, capped at ``max_delay``. Anything else
    (no header, unparsable value, negative or NaN seconds, a date already in
    the past) falls back to exponential backoff plus jitter.

    Args:
        headers: mapping-like object with ``.get`` (response headers); may be
            None or any object without ``.get``, which is treated as "no
            headers".
        attempt: zero-based index of the attempt that just failed; the backoff
            is ``base_delay * 2 ** attempt`` capped at ``max_delay``.
        base_delay: optional override for RETRY_BASE_DELAY_SECONDS.
        max_delay: optional override for RETRY_MAX_DELAY_SECONDS.

    Returns:
        A non-negative float. Retry-After values never exceed ``max_delay``;
        the backoff path adds up to 25% jitter on top of the capped backoff,
        so it can exceed ``max_delay`` by that margin.
    """
    if base_delay is None:
        base_delay = RETRY_BASE_DELAY_SECONDS
    if max_delay is None:
        max_delay = RETRY_MAX_DELAY_SECONDS

    try:
        # Plain dicts are case-sensitive, so try both common spellings.
        retry_after = headers.get("Retry-After") or headers.get("retry-after")
    except AttributeError:
        retry_after = None

    if retry_after:
        try:
            seconds = float(retry_after)
        except (TypeError, ValueError):
            # Not numeric: try the HTTP-date form.
            try:
                when = parsedate_to_datetime(retry_after)
                if when is not None:
                    if when.tzinfo is None:
                        # A naive result is interpreted as UTC.
                        when = when.replace(tzinfo=timezone.utc)
                    remaining = (when - datetime.now(timezone.utc)).total_seconds()
                    if remaining > 0:
                        return min(remaining, max_delay)
            except (TypeError, ValueError):
                pass
        else:
            # Negative or NaN seconds would make time.sleep() raise ValueError
            # and abort the retry loop; `>= 0` is False for NaN, so both are
            # ignored here and the backoff below is used instead.
            if seconds >= 0:
                return min(seconds, max_delay)

    backoff = min(base_delay * (2 ** attempt), max_delay)
    return backoff + random.uniform(0.0, backoff * 0.25)
# --- constants ----------------------------------------------------------------
GRAVITYZONE_API_BASE_URL = os.environ.get(
"GRAVITYZONE_API_BASE_URL",
@@ -169,6 +218,24 @@ class GravityZoneClient:
self._api_key = api_key # lazily loaded if None
self.timeout = timeout
self.connect_timeout = connect_timeout
self._httpx_client = None # reused across calls (pooling) when httpx present
def close(self) -> None:
    """Release the pooled httpx client if one has been created.

    Does nothing when no client was ever opened. The stored reference is
    cleared even if the client's own close() raises, so a later call does not
    try to close the same client again.
    """
    pooled = self._httpx_client
    if pooled is None:
        return
    try:
        pooled.close()
    finally:
        self._httpx_client = None
@property
def _client(self):
    """Return the shared httpx.Client, constructing it on first access.

    The client is built with this instance's ``timeout`` and
    ``connect_timeout`` and then cached, so every later call in a multi-call
    sweep reuses the same client (and its connection pool) rather than
    opening a new one, with a fresh TLS handshake, per request.
    """
    pooled = self._httpx_client
    if pooled is not None:
        return pooled
    limits = httpx.Timeout(self.timeout, connect=self.connect_timeout)
    pooled = httpx.Client(timeout=limits)
    self._httpx_client = pooled
    return pooled
@property
def api_key(self) -> str:
@@ -196,24 +263,49 @@ class GravityZoneClient:
return body
def _post(self, url: str, payload: dict) -> Any:
    """POST ``payload`` as JSON, retrying transient failures with backoff.

    The payload is serialized once and each try is delegated to
    ``_post_once``. When that raises ``_RetryableHTTP`` the call is retried
    after the delay chosen by ``_retry_delay``, for at most
    RETRY_MAX_ATTEMPTS tries in total. Retry notices are printed to stderr
    so they do not mix with output written to stdout.

    Args:
        url: endpoint to POST to.
        payload: JSON-serializable request body.

    Returns:
        Whatever ``_post_once`` returns for the first successful try.

    Raises:
        GravityZoneError: when the last allowed try still fails transiently.
            Any other exception raised by ``_post_once`` propagates
            unchanged and is not retried.
    """
    data = json.dumps(payload).encode("utf-8")
    final_attempt = RETRY_MAX_ATTEMPTS - 1
    for attempt in range(RETRY_MAX_ATTEMPTS):
        try:
            return self._post_once(url, data)
        except _RetryableHTTP as exc:
            if attempt >= final_attempt:
                # Append the detail only when there is one. Trimming with
                # rstrip(": ") would also eat trailing colons/spaces that
                # belong to the detail text itself, because rstrip takes a
                # set of characters rather than a suffix.
                message = (
                    f"GravityZone HTTP {exc.code} after {RETRY_MAX_ATTEMPTS} "
                    f"attempts"
                )
                if exc.detail:
                    message = f"{message}: {exc.detail}"
                raise GravityZoneError(message) from exc
            # time.sleep() rejects negative values; max() also maps NaN to
            # 0.0 here, so an odd delay can never abort the retry loop.
            delay = max(0.0, _retry_delay(exc.headers, attempt))
            print(
                f"[WARNING] GravityZone {exc.code} - retry "
                f"{attempt + 1}/{RETRY_MAX_ATTEMPTS - 1} in {delay:.1f}s",
                file=sys.stderr,
            )
            time.sleep(delay)
    # Only reached when RETRY_MAX_ATTEMPTS < 1 (the loop body never runs);
    # otherwise the loop returns or raises on its final attempt.
    raise GravityZoneError("GravityZone request failed: retries exhausted")
def _post_once(self, url: str, data: bytes) -> Any:
"""One POST. Returns parsed JSON, raises _RetryableHTTP on a transient
failure, or GravityZoneError on a terminal one."""
if _HAS_HTTPX:
try:
timeout = httpx.Timeout(self.timeout, connect=self.connect_timeout)
with httpx.Client(timeout=timeout) as client:
resp = client.post(url, content=data, auth=(self.api_key, ""),
headers={"Content-Type": "application/json"})
resp = self._client.post(
url, content=data, auth=(self.api_key, ""),
headers={"Content-Type": "application/json"})
resp.raise_for_status()
return resp.json()
except httpx.TimeoutException as exc:
raise GravityZoneError(f"GravityZone request timed out: {exc}") from exc
raise _RetryableHTTP("timeout", detail=str(exc)) from exc
except httpx.HTTPStatusError as exc:
code = exc.response.status_code
detail = (exc.response.text or "")[:ERROR_BODY_MAX_CHARS]
if code in RETRY_STATUSES:
raise _RetryableHTTP(code, exc.response.headers, detail) from exc
raise GravityZoneError(
f"GravityZone HTTP {exc.response.status_code}: {detail}"
) from exc
f"GravityZone HTTP {code}: {detail}") from exc
except httpx.HTTPError as exc:
raise GravityZoneError(f"GravityZone request failed: {exc}") from exc
raise GravityZoneError(
f"GravityZone request failed: {exc}") from exc
# stdlib fallback
token = base64.b64encode(f"{self.api_key}:".encode("utf-8")).decode("ascii")
@@ -232,7 +324,12 @@ class GravityZoneClient:
return json.loads(raw.decode("utf-8"))
except urllib.error.HTTPError as exc:
detail = exc.read().decode("utf-8", errors="replace")[:ERROR_BODY_MAX_CHARS]
if exc.code in RETRY_STATUSES:
raise _RetryableHTTP(exc.code, getattr(exc, "headers", None),
detail) from exc
raise GravityZoneError(f"GravityZone HTTP {exc.code}: {detail}") from exc
except TimeoutError as exc:
raise _RetryableHTTP("timeout", detail=str(exc)) from exc
except urllib.error.URLError as exc:
raise GravityZoneError(f"GravityZone request failed: {exc}") from exc