# claudetools/projects/dataforth-dos/datasheet-pipeline/test-upload-two.py

"""Upload two real datasheets, fetch them back, diff byte-for-byte."""
import json
import sys
import urllib.request
import urllib.parse
import hashlib

import os, sys
# Endpoints, OAuth client credentials and scope all come from CF_* environment
# variables; every setting except the two credentials has a usable default.
TOKEN_URL = os.getenv("CF_TOKEN_URL", "https://login.dataforth.com/connect/token")
API_BASE = os.getenv("CF_API_BASE", "https://www.dataforth.com") + "/api/v1"
CLIENT_ID = os.getenv("CF_CLIENT_ID", "")
CLIENT_SECRET = os.getenv("CF_CLIENT_SECRET", "")
SCOPE = os.getenv("CF_SCOPE", "dataforth.web")
if not (CLIENT_ID and CLIENT_SECRET):
    # Stop before any network call when either credential is missing or empty.
    sys.exit("set CF_CLIENT_ID + CF_CLIENT_SECRET (vault: clients/dataforth/api-oauth.sops.yaml)")

# (serial number, local source file) pairs processed by main(). The serial
# number is sent as "SerialNumber" in the upload and reused as the path segment
# of the follow-up GET; the raw strings keep the Windows backslashes literal.
SAMPLES = [
    ("179377-5", r"D:\claudetools\projects\dataforth-dos\datasheet-pipeline\scmvas-hvas-research\samples\backfill-verify\179377-5-source.txt"),
    ("179377-6", r"D:\claudetools\projects\dataforth-dos\datasheet-pipeline\scmvas-hvas-research\samples\backfill-verify\179377-6-source.txt"),
]


def get_token(timeout=30):
    """Request a client-credentials access token from TOKEN_URL.

    Sends CLIENT_ID, CLIENT_SECRET and SCOPE as a form-encoded POST body and
    returns the ``access_token`` field of the JSON response.

    Args:
        timeout: Seconds to wait on the connection before giving up. Without
            a timeout an unresponsive token endpoint would block the script
            indefinitely.

    Returns:
        The value of ``access_token`` in the decoded JSON response.

    Raises:
        urllib.error.HTTPError: The endpoint answered with an error status.
        urllib.error.URLError: The endpoint could not be reached.
        TimeoutError: The endpoint did not answer within ``timeout`` seconds.
        KeyError: The response JSON has no ``access_token`` field.
    """
    data = urllib.parse.urlencode({
        "grant_type": "client_credentials",
        "client_id": CLIENT_ID,
        "client_secret": CLIENT_SECRET,
        "scope": SCOPE,
    }).encode()
    # Supplying ``data`` makes urllib issue a POST.
    req = urllib.request.Request(TOKEN_URL, data=data)
    with urllib.request.urlopen(req, timeout=timeout) as r:
        return json.loads(r.read())["access_token"]


def api(method, path, token, body=None, timeout=60):
    """Call ``API_BASE + path`` with a bearer token and return the raw reply.

    Args:
        method: HTTP method name, e.g. "GET" or "POST".
        path: Path appended verbatim to API_BASE (should start with "/").
        token: Access token placed in the ``Authorization: Bearer`` header.
        body: Optional JSON-serialisable object; when given it is sent as the
            request body with ``Content-Type: application/json``.
        timeout: Seconds to wait on the connection before giving up, so a
            stalled server cannot hang the script forever.

    Returns:
        A ``(status, text)`` tuple. HTTP error statuses are returned the same
        way as successes rather than raised, so callers must check ``status``.

    Raises:
        urllib.error.URLError: The server could not be reached.
        TimeoutError: The server did not answer within ``timeout`` seconds.
    """
    # Imported explicitly: the module only imports urllib.request, and relying
    # on it to pull in urllib.error as a side effect is an implementation detail.
    from urllib.error import HTTPError

    url = API_BASE + path
    headers = {"Authorization": f"Bearer {token}"}
    payload = None
    if body is not None:
        payload = json.dumps(body).encode()
        headers["Content-Type"] = "application/json"
    req = urllib.request.Request(url, data=payload, headers=headers, method=method)
    try:
        with urllib.request.urlopen(req, timeout=timeout) as r:
            return r.status, r.read().decode()
    except HTTPError as e:
        # Error pages are not guaranteed to be UTF-8 (e.g. proxy HTML); decode
        # leniently so the status code is never masked by a UnicodeDecodeError.
        return e.code, e.read().decode(errors="replace")


def main():
    """Upload every file in SAMPLES, fetch it back and report any difference.

    For each (serial number, path) pair the file is read, decoded as UTF-8,
    POSTed to ``/TestReportDataFiles`` and then re-read with a GET on
    ``/TestReportDataFiles/<serial>``. The returned ``Content`` is compared
    with the uploaded text and the first differing character is printed.
    All results are written to stdout; nothing is returned.
    """
    token = get_token()
    print(f"[OK] Got access token (len={len(token)})\n")

    for sn, path in SAMPLES:
        # Binary read so newline translation cannot alter the text.
        with open(path, "rb") as f:
            content_bytes = f.read()
        content = content_bytes.decode("utf-8", errors="replace")
        # Hash the text that is actually uploaded so it is comparable with the
        # hash of the text the server returns.
        uploaded_bytes = content.encode()
        local_hash = hashlib.sha256(uploaded_bytes).hexdigest()[:16]
        print(f"=== {sn} ===")
        print(f"  Local file: {path}")
        print(f"  Local bytes: {len(content_bytes)}  sha256[16]: {local_hash}")
        if uploaded_bytes != content_bytes:
            # errors="replace" substituted U+FFFD, so the uploaded text is not
            # the file's exact bytes; say so instead of hiding it.
            print("  !! File is not valid UTF-8: undecodable bytes were replaced "
                  "before upload; hash and comparison cover the replaced text")

        status, body = api("POST", "/TestReportDataFiles", token,
                           {"SerialNumber": sn, "Content": content})
        print(f"  POST -> HTTP {status}")
        print(f"  Server response: {body}")
        if not 200 <= status < 300:
            # Keep going so the stored state is still shown, but make clear a
            # MATCH below would not prove this upload succeeded.
            print("  !! Upload was not accepted; the GET below may show "
                  "previously stored content")

        status, body = api("GET", f"/TestReportDataFiles/{sn}", token)
        print(f"  GET -> HTTP {status}")
        if status != 200:
            print(f"  !! Fetch failed: {body}")
            continue
        obj = json.loads(body)
        # A JSON null Content would otherwise crash on .encode() below.
        fetched = obj.get("Content") or ""
        fetched_hash = hashlib.sha256(fetched.encode()).hexdigest()[:16]
        print(f"  Server bytes: {len(fetched.encode('utf-8'))}  sha256[16]: {fetched_hash}")
        matches = content == fetched
        print(f"  Content match: {'MATCH' if matches else 'DIFF'}")
        print(f"  CreatedAtUtc: {obj.get('CreatedAtUtc')}")
        print(f"  UpdatedAtUtc: {obj.get('UpdatedAtUtc')}")

        if not matches:
            # Show first diff
            for i, (a, b) in enumerate(zip(content, fetched)):
                if a != b:
                    print(f"  First diff at char {i}: local={a!r} server={b!r}")
                    print(f"    context: ...{content[max(0,i-20):i+20]!r}")
                    break
            else:
                # No differing character in the common prefix: one text is a
                # truncation of the other.
                print(f"  Length diff: local={len(content)} server={len(fetched)}")
        print()


# Run the upload/fetch/compare round trip only when executed as a script.
if __name__ == "__main__":
    main()