sync: auto-sync from DESKTOP-0O8A1RL at 2026-05-16 16:26:04

Author: Mike Swanson Machine: DESKTOP-0O8A1RL Timestamp: 2026-05-16 16:26:04
2026-05-16 16:26:07 -07:00
parent 4aadf16a9f
commit 887f0ae266
3 changed files with 259 additions and 0 deletions
--- a/tmp_bench_8b.py
+++ b/tmp_bench_8b.py
@@ -0,0 +1,108 @@
+import urllib.request, json, time
+
+MODELS = ["qwen3:8b", "qwen3:14b", "qwen3.6:latest"]
+
+TESTS = [
+    ("prose", "List 10 common Windows troubleshooting steps an IT technician uses daily. One sentence each, numbered.", 300),
+    ("classification", "Classify this ticket into ONE of: Hardware, Software, Network, Security, User-Error. Reply with category and one sentence only.\n\nTicket: Client says Outlook keeps asking for password every morning. Started after Windows Update last Tuesday.", 80),
+    ("json", 'Extract from this ticket and return ONLY valid JSON with keys: client_name, issue_summary, urgency (low/medium/high).\n\nTicket: Hi, this is Janet from Cascades Dental. Our front desk PC shows a blue screen every time we open Dentrix. Patients arriving at 9am, need this fixed ASAP.', 150),
+    ("summary", "Summarize in exactly 2 bullet points, under 30 words total:\n\nIncident: GuruRMM watchdog failed to restart service after auto-update due to SCM access denied error and a stuck suppression timer. Service was offline 25 minutes. Fixed by adding sc.exe fallback and clearing suppression on failure.", 100),
+]
+
+def ask(model, prompt, max_tokens):
+    payload = {"model": model, "prompt": prompt, "stream": False,
+               "options": {"num_predict": max_tokens}}
+    start = time.time()
+    req = urllib.request.Request("http://localhost:11434/api/generate",
+        data=json.dumps(payload).encode(), headers={"Content-Type": "application/json"})
+    r = json.loads(urllib.request.urlopen(req, timeout=300).read())
+    elapsed = time.time() - start
+    gen_tok = r.get("eval_count", 0)
+    gen_ns = r.get("eval_duration", 1)
+    tps = gen_tok / (gen_ns / 1e9) if gen_ns else 0
+    raw = r["response"].strip()
+    if "<think>" in raw and "</think>" in raw:
+        visible = raw[raw.index("</think>")+8:].strip()
+    else:
+        visible = raw
+    return visible, elapsed, gen_tok, tps
+
+def get_ps():
+    try:
+        r = urllib.request.urlopen("http://localhost:11434/api/ps", timeout=5)
+        d = json.loads(r.read())
+        models = d.get("models", [])
+        if models:
+            m = models[0]
+            vram = m.get("size_vram", 0) // 1024 // 1024
+            total = m.get("size", 0) // 1024 // 1024
+            pct = int(vram / total * 100) if total else 0
+            return vram, total, pct
+    except:
+        pass
+    return 0, 0, 0
+
+def unload(model):
+    try:
+        payload = {"model": model, "keep_alive": 0}
+        urllib.request.urlopen(urllib.request.Request(
+            "http://localhost:11434/api/generate",
+            data=json.dumps(payload).encode(),
+            headers={"Content-Type": "application/json"}), timeout=10)
+    except:
+        pass
+    time.sleep(2)
+
+results = {}  # model -> {test_name -> (tps, elapsed, visible)}
+
+for model in MODELS:
+    print(f"\n{'='*60}")
+    print(f"MODEL: {model}")
+    print('='*60)
+    model_results = {}
+
+    # Warmup load
+    print("  Loading...", end="", flush=True)
+    ask(model, "hello", 5)
+    vram, total, pct = get_ps()
+    print(f" {vram} MB / {total} MB in VRAM ({pct}%)")
+
+    for test_name, prompt, max_tok in TESTS:
+        visible, elapsed, gen_tok, tps = ask(model, prompt, max_tok)
+        model_results[test_name] = (tps, elapsed, gen_tok, visible)
+        hit_limit = " [HIT LIMIT]" if gen_tok >= max_tok else ""
+        print(f"  [{test_name:14}] {tps:5.1f} tok/s  {elapsed:5.1f}s  {gen_tok} tok{hit_limit}")
+        if test_name == "json":
+            clean = visible.strip().strip("```json").strip("```").strip()
+            try:
+                json.loads(clean)
+                print(f"                   JSON: [OK]")
+            except:
+                print(f"                   JSON: [FAIL] {repr(clean[:60])}")
+
+    results[model] = model_results
+    unload(model)
+
+# Summary table
+print(f"\n{'='*60}")
+print("THROUGHPUT SUMMARY (tok/s)")
+print('='*60)
+print(f"{'Task':<16} {'qwen3:8b':>10} {'qwen3:14b':>10} {'qwen3.6':>10}")
+print("-"*50)
+for test_name, _, _ in TESTS:
+    row = f"{test_name:<16}"
+    for model in MODELS:
+        tps = results[model][test_name][0]
+        row += f" {tps:>10.1f}"
+    print(row)
+
+print()
+print("Reference (full-GPU machine from OLLAMA.md):")
+print("  qwen3:14b ~66 tok/s  |  qwen3.6 ~32 tok/s")
+print()
+
+# VRAM fit analysis
+print("VRAM FIT (12 GB available):")
+for model, size_gb in [("qwen3:8b", 5.2), ("qwen3:14b", 9.3), ("qwen3.6", 23)]:
+    fit = "FITS" if size_gb < 10 else "SPLIT"
+    print(f"  {model:<16} {size_gb:>5.1f} GB  [{fit}]")