fix(bitdefender): retry 429/5xx/timeout with backoff + reuse one httpx client
Audit fix H2 (+ M2): the live GravityZone tenant is rate-limited and sweeps fan out one getManagedEndpointDetails per endpoint across every company, which hit a real HTTP 429 (errorlog 2026-06-21). _post had zero retry and opened a fresh httpx.Client (new TLS handshake) per request. - _post now retries 429/500/502/503/504/timeout up to RETRY_MAX_ATTEMPTS with bounded exponential backoff + jitter, honoring Retry-After (numeric or HTTP-date). Retry notices go to stderr (don't pollute --json). Terminal errors still raise. - M2: a single httpx.Client is created lazily and reused (connection pooling), closed via client.close() in main()'s finally. Makes the docstring's pooling claim true and cuts handshake overhead + 429 pressure during sweeps. - Verified: compile clean; offline unit tests (persistent 429 -> 4 attempts then raise, flaky 503 -> recovers, Retry-After honored); live status read OK. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1310,6 +1310,7 @@ HANDLERS = {
|
||||
def main(argv=None) -> int:
|
||||
args = build_parser().parse_args(argv)
|
||||
handler = HANDLERS[args.command]
|
||||
client = None
|
||||
try:
|
||||
client = GravityZoneClient()
|
||||
rc = handler(client, args)
|
||||
@@ -1322,6 +1323,9 @@ def main(argv=None) -> int:
|
||||
return 1
|
||||
except KeyboardInterrupt:
|
||||
return 130
|
||||
finally:
|
||||
if client is not None:
|
||||
client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -20,12 +20,15 @@ from __future__ import annotations
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from email.utils import parsedate_to_datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional
|
||||
|
||||
@@ -44,6 +47,52 @@ except ImportError: # pragma: no cover - depends on environment
|
||||
# other data - bound the blast radius rather than echo full bodies into logs.
|
||||
ERROR_BODY_MAX_CHARS = 500
|
||||
|
||||
# --- transient-failure retry policy -------------------------------------------
# The live tenant is rate-limited: sweeps issue one getManagedEndpointDetails
# call per endpoint across every company, and real HTTP 429 responses have been
# observed. Requests that fail with one of the statuses below (or time out) are
# retried with bounded exponential backoff, honoring Retry-After when present.

# HTTP status codes treated as transient (rate limit + server-side errors).
RETRY_STATUSES = frozenset((429, 500, 502, 503, 504))

# Total number of tries: 1 initial request + up to (MAX - 1) retries.
RETRY_MAX_ATTEMPTS = 4

# Backoff before the first retry; doubled for each subsequent retry.
RETRY_BASE_DELAY_SECONDS = 1.0

# Ceiling applied to a Retry-After hint and to the pre-jitter backoff.
RETRY_MAX_DELAY_SECONDS = 30.0
|
||||
|
||||
|
||||
class _RetryableHTTP(Exception):
    """Internal marker for a request that failed transiently and may be retried.

    Attributes:
        code: the HTTP status (int), or the string 'timeout'.
        headers: response headers supplied by the raiser; an empty dict when
            none (or an empty/falsy value) were given.
        detail: free-form description of the failure (may be empty).
    """

    def __init__(self, code, headers=None, detail=""):
        # The exception's own message is deliberately terse; callers read the
        # structured attributes instead.
        super().__init__(f"transient {code}")
        self.code = code
        self.detail = detail
        self.headers = headers if headers else {}
|
||||
|
||||
|
||||
def _retry_delay(headers, attempt: int) -> float:
    """Return the number of seconds to wait before the next retry.

    A usable ``Retry-After`` header wins: either a non-negative number of
    seconds or an HTTP-date in the future, capped at
    ``RETRY_MAX_DELAY_SECONDS``. Anything else (no header, unparsable value,
    negative or NaN seconds, a date that is not in the future) falls back to
    exponential backoff plus jitter.

    Args:
        headers: mapping-like object with ``.get()``; ``None`` or any object
            without ``.get`` is treated as "no headers".
        attempt: zero-based index of the attempt that just failed.

    Returns:
        A non-negative delay in seconds, safe to pass to ``time.sleep``.
    """
    try:
        retry_after = headers.get("Retry-After") or headers.get("retry-after")
    except AttributeError:
        # headers is None or not mapping-like: behave as if no hint was sent.
        retry_after = None

    if retry_after:
        try:
            seconds = float(retry_after)
        except (TypeError, ValueError):
            seconds = None

        if seconds is None:
            # Not numeric: try the HTTP-date form of Retry-After.
            try:
                when = parsedate_to_datetime(retry_after)
                if when is not None:
                    if when.tzinfo is None:
                        # A naive result is interpreted as UTC.
                        when = when.replace(tzinfo=timezone.utc)
                    remaining = (when - datetime.now(timezone.utc)).total_seconds()
                    if remaining > 0:
                        return min(remaining, RETRY_MAX_DELAY_SECONDS)
            except (TypeError, ValueError):
                pass
        elif seconds >= 0:
            # The comparison is False for NaN and for negative values; both
            # would make time.sleep() raise ValueError, so they are ignored
            # and the backoff below is used instead.
            return min(seconds, RETRY_MAX_DELAY_SECONDS)

    backoff = min(RETRY_BASE_DELAY_SECONDS * (2 ** attempt), RETRY_MAX_DELAY_SECONDS)
    # Up to 25% extra jitter is added on top of the (capped) backoff.
    return backoff + random.uniform(0.0, backoff * 0.25)
|
||||
|
||||
# --- constants ----------------------------------------------------------------
|
||||
GRAVITYZONE_API_BASE_URL = os.environ.get(
|
||||
"GRAVITYZONE_API_BASE_URL",
|
||||
@@ -169,6 +218,24 @@ class GravityZoneClient:
|
||||
self._api_key = api_key # lazily loaded if None
|
||||
self.timeout = timeout
|
||||
self.connect_timeout = connect_timeout
|
||||
self._httpx_client = None # reused across calls (pooling) when httpx present
|
||||
|
||||
def close(self) -> None:
    """Release the pooled httpx client if one exists; otherwise do nothing.

    The stored reference is cleared even when the underlying ``close()``
    raises, so a later call does not try to close the same client again.
    """
    pooled = self._httpx_client
    if pooled is None:
        return
    try:
        pooled.close()
    finally:
        self._httpx_client = None
|
||||
|
||||
@property
def _client(self):
    """Return the shared httpx.Client, building it on first access.

    The client is created once, with ``self.timeout`` as the general timeout
    and ``self.connect_timeout`` for connecting, and is then reused by every
    later access so that multiple calls share one connection pool rather than
    each opening a fresh client.
    """
    pooled = self._httpx_client
    if pooled is None:
        pooled = httpx.Client(
            timeout=httpx.Timeout(self.timeout, connect=self.connect_timeout)
        )
        self._httpx_client = pooled
    return pooled
|
||||
|
||||
@property
|
||||
def api_key(self) -> str:
|
||||
@@ -196,24 +263,49 @@ class GravityZoneClient:
|
||||
return body
|
||||
|
||||
def _post(self, url: str, payload: dict) -> Any:
    """POST ``payload`` as JSON, retrying transient failures a bounded number of times.

    The payload is serialized once and handed to ``self._post_once`` for each
    try. A ``_RetryableHTTP`` raised by that call triggers a wait (computed by
    ``_retry_delay``) and another try, up to ``RETRY_MAX_ATTEMPTS`` tries in
    total. Retry notices are printed to stderr.

    Returns:
        Whatever ``self._post_once`` returns on the first successful try.

    Raises:
        GravityZoneError: when the last allowed try also fails transiently.
        Any other exception from ``self._post_once`` propagates unchanged.
    """
    body = json.dumps(payload).encode("utf-8")
    for attempt in range(RETRY_MAX_ATTEMPTS):
        try:
            return self._post_once(url, body)
        except _RetryableHTTP as transient:
            if attempt < RETRY_MAX_ATTEMPTS - 1:
                pause = _retry_delay(transient.headers, attempt)
                notice = (
                    f"[WARNING] GravityZone {transient.code} - retry "
                    f"{attempt + 1}/{RETRY_MAX_ATTEMPTS - 1} in {pause:.1f}s"
                )
                # stderr keeps the notice out of anything written to stdout.
                print(notice, file=sys.stderr)
                time.sleep(pause)
                continue
            # Out of tries: surface the last transient failure as terminal.
            # rstrip drops the dangling ": " when there is no detail text.
            message = (
                f"GravityZone HTTP {transient.code} after {RETRY_MAX_ATTEMPTS} "
                f"attempts: {transient.detail}"
            ).rstrip(": ")
            raise GravityZoneError(message) from transient
    # Only reached when the loop body never runs (RETRY_MAX_ATTEMPTS <= 0);
    # otherwise every iteration returns, continues, or raises.
    raise GravityZoneError("GravityZone request failed: retries exhausted")
|
||||
|
||||
def _post_once(self, url: str, data: bytes) -> Any:
|
||||
"""One POST. Returns parsed JSON, raises _RetryableHTTP on a transient
|
||||
failure, or GravityZoneError on a terminal one."""
|
||||
if _HAS_HTTPX:
|
||||
try:
|
||||
timeout = httpx.Timeout(self.timeout, connect=self.connect_timeout)
|
||||
with httpx.Client(timeout=timeout) as client:
|
||||
resp = client.post(url, content=data, auth=(self.api_key, ""),
|
||||
headers={"Content-Type": "application/json"})
|
||||
resp = self._client.post(
|
||||
url, content=data, auth=(self.api_key, ""),
|
||||
headers={"Content-Type": "application/json"})
|
||||
resp.raise_for_status()
|
||||
return resp.json()
|
||||
except httpx.TimeoutException as exc:
|
||||
raise GravityZoneError(f"GravityZone request timed out: {exc}") from exc
|
||||
raise _RetryableHTTP("timeout", detail=str(exc)) from exc
|
||||
except httpx.HTTPStatusError as exc:
|
||||
code = exc.response.status_code
|
||||
detail = (exc.response.text or "")[:ERROR_BODY_MAX_CHARS]
|
||||
if code in RETRY_STATUSES:
|
||||
raise _RetryableHTTP(code, exc.response.headers, detail) from exc
|
||||
raise GravityZoneError(
|
||||
f"GravityZone HTTP {exc.response.status_code}: {detail}"
|
||||
) from exc
|
||||
f"GravityZone HTTP {code}: {detail}") from exc
|
||||
except httpx.HTTPError as exc:
|
||||
raise GravityZoneError(f"GravityZone request failed: {exc}") from exc
|
||||
raise GravityZoneError(
|
||||
f"GravityZone request failed: {exc}") from exc
|
||||
|
||||
# stdlib fallback
|
||||
token = base64.b64encode(f"{self.api_key}:".encode("utf-8")).decode("ascii")
|
||||
@@ -232,7 +324,12 @@ class GravityZoneClient:
|
||||
return json.loads(raw.decode("utf-8"))
|
||||
except urllib.error.HTTPError as exc:
|
||||
detail = exc.read().decode("utf-8", errors="replace")[:ERROR_BODY_MAX_CHARS]
|
||||
if exc.code in RETRY_STATUSES:
|
||||
raise _RetryableHTTP(exc.code, getattr(exc, "headers", None),
|
||||
detail) from exc
|
||||
raise GravityZoneError(f"GravityZone HTTP {exc.code}: {detail}") from exc
|
||||
except TimeoutError as exc:
|
||||
raise _RetryableHTTP("timeout", detail=str(exc)) from exc
|
||||
except urllib.error.URLError as exc:
|
||||
raise GravityZoneError(f"GravityZone request failed: {exc}") from exc
|
||||
|
||||
|
||||
Reference in New Issue
Block a user