"""Benchmark a set of local Ollama models on a few short IT-helpdesk prompts.

For each model in MODELS the script warms the model up, reports the memory
figures returned by /api/ps, runs every prompt in TESTS, prints the
generation throughput, and finally prints a summary table.
"""

import http.client
import json
import time
import urllib.request

OLLAMA_URL = "http://localhost:11434"
GENERATE_URL = OLLAMA_URL + "/api/generate"
PS_URL = OLLAMA_URL + "/api/ps"
JSON_HEADERS = {"Content-Type": "application/json"}

MIB = 1024 * 1024

# Markers that wrap the model's reasoning ahead of its visible answer.
THINK_OPEN = "<think>"
THINK_CLOSE = "</think>"

# Failures the best-effort helpers are allowed to swallow: connection and
# timeout errors (OSError covers URLError and socket timeouts), HTTP protocol
# errors, and malformed JSON (JSONDecodeError is a ValueError).
NETWORK_ERRORS = (OSError, http.client.HTTPException, ValueError)

MODELS = ["qwen3:8b", "qwen3:14b", "qwen3.6:latest"]

# (test name, prompt, max generated tokens)
TESTS = [
    (
        "prose",
        "List 10 common Windows troubleshooting steps an IT technician uses "
        "daily. One sentence each, numbered.",
        300,
    ),
    (
        "classification",
        "Classify this ticket into ONE of: Hardware, Software, Network, "
        "Security, User-Error. Reply with category and one sentence only.\n\n"
        "Ticket: Client says Outlook keeps asking for password every morning. "
        "Started after Windows Update last Tuesday.",
        80,
    ),
    (
        "json",
        "Extract from this ticket and return ONLY valid JSON with keys: "
        "client_name, issue_summary, urgency (low/medium/high).\n\n"
        "Ticket: Hi, this is Janet from Cascades Dental. Our front desk PC "
        "shows a blue screen every time we open Dentrix. Patients arriving at "
        "9am, need this fixed ASAP.",
        150,
    ),
    (
        "summary",
        "Summarize in exactly 2 bullet points, under 30 words total:\n\n"
        "Incident: GuruRMM watchdog failed to restart service after "
        "auto-update due to SCM access denied error and a stuck suppression "
        "timer. Service was offline 25 minutes. Fixed by adding sc.exe "
        "fallback and clearing suppression on failure.",
        100,
    ),
]

# Static model sizes (GB) used by the VRAM fit report at the end of the run.
MODEL_SIZES_GB = [("qwen3:8b", 5.2), ("qwen3:14b", 9.3), ("qwen3.6", 23)]
# Models below this size are reported as fitting. NOTE(review): presumably
# headroom under the 12 GB quoted in the report heading - confirm.
VRAM_FIT_LIMIT_GB = 10


def strip_thinking(raw):
    """Return the visible answer from *raw*, dropping a leading think block.

    When both the opening and closing markers are present, everything up to
    and including the first closing marker is removed. Otherwise the text is
    returned unchanged apart from surrounding whitespace (for example when
    generation stopped before the closing marker was produced).
    """
    text = raw.strip()
    if THINK_OPEN in text and THINK_CLOSE in text:
        cut = text.index(THINK_CLOSE) + len(THINK_CLOSE)
        return text[cut:].strip()
    return text


def strip_code_fence(text):
    """Remove one surrounding Markdown code fence (optionally tagged json).

    Works on whole prefixes/suffixes rather than character sets, so payloads
    whose own first or last characters happen to be in "`json" (for example
    the literal ``null``) are left intact.
    """
    body = text.strip()
    if body.startswith("```"):
        body = body[3:]
        if body[:4].lower() == "json":
            body = body[4:]
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def is_valid_json(text):
    """Return True when *text* parses as JSON."""
    try:
        json.loads(text)
    except ValueError:
        return False
    return True


def summarize_ps(payload):
    """Reduce a decoded /api/ps payload to ``(vram_mb, total_mb, pct)``.

    Only the first entry of ``payload["models"]`` is inspected. Returns
    ``(0, 0, 0)`` when the payload lists no models.
    """
    models = payload.get("models", []) if isinstance(payload, dict) else []
    if not models:
        return 0, 0, 0
    first = models[0]
    vram = first.get("size_vram", 0) // MIB
    total = first.get("size", 0) // MIB
    pct = int(vram / total * 100) if total else 0
    return vram, total, pct


def column_label(model):
    """Return the summary-table heading for *model* (drops a ':latest' tag)."""
    suffix = ":latest"
    if model.endswith(suffix):
        return model[:-len(suffix)]
    return model


def _post_json(url, payload, timeout):
    """POST *payload* as JSON to *url* and return the raw response bytes."""
    request = urllib.request.Request(
        url, data=json.dumps(payload).encode(), headers=JSON_HEADERS
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read()


def ask(model, prompt, max_tokens):
    """Run one non-streaming generation and measure it.

    Args:
        model: Model name sent to the server.
        prompt: Prompt text.
        max_tokens: Value sent as the ``num_predict`` option.

    Returns:
        ``(visible, elapsed, gen_tok, tps)``: the answer with any think block
        removed, wall-clock seconds for the request, the reported
        ``eval_count``, and tokens per second computed from ``eval_count``
        and ``eval_duration`` (treated as nanoseconds).

    Raises:
        OSError: If the request fails (URLError is an OSError subclass).
        KeyError: If the reply has no ``response`` field.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"num_predict": max_tokens},
    }
    # perf_counter is monotonic, so the measurement is immune to clock steps.
    started = time.perf_counter()
    reply = json.loads(_post_json(GENERATE_URL, payload, timeout=300))
    elapsed = time.perf_counter() - started

    gen_tok = reply.get("eval_count", 0)
    gen_ns = reply.get("eval_duration", 1)
    tps = gen_tok / (gen_ns / 1e9) if gen_ns else 0
    visible = strip_thinking(reply["response"])
    return visible, elapsed, gen_tok, tps


def get_ps():
    """Return ``(vram_mb, total_mb, pct)`` for the first model /api/ps lists.

    Best-effort: any request or decoding failure yields ``(0, 0, 0)``.
    """
    try:
        with urllib.request.urlopen(PS_URL, timeout=5) as response:
            payload = json.loads(response.read())
        return summarize_ps(payload)
    except NETWORK_ERRORS + (AttributeError, TypeError):
        # A malformed payload is treated the same as an unreachable server.
        return 0, 0, 0


def unload(model):
    """Post a ``keep_alive: 0`` generate request for *model*, then pause.

    Best-effort: request failures are ignored so one failed call does not
    abort the whole run.
    """
    try:
        _post_json(GENERATE_URL, {"model": model, "keep_alive": 0}, timeout=10)
    except NETWORK_ERRORS:
        pass
    # NOTE(review): assumed to be a settle delay before the next model loads.
    time.sleep(2)


def run_model(model):
    """Benchmark one model and return ``{test_name: (tps, elapsed, gen_tok, visible)}``."""
    print(f"\n{'='*60}")
    print(f"MODEL: {model}")
    print('='*60)
    model_results = {}

    # Warmup load so model load time is not charged to the first test.
    print(" Loading...", end="", flush=True)
    ask(model, "hello", 5)
    vram, total, pct = get_ps()
    print(f" {vram} MB / {total} MB in VRAM ({pct}%)")

    for test_name, prompt, max_tok in TESTS:
        visible, elapsed, gen_tok, tps = ask(model, prompt, max_tok)
        model_results[test_name] = (tps, elapsed, gen_tok, visible)
        hit_limit = " [HIT LIMIT]" if gen_tok >= max_tok else ""
        print(f" [{test_name:14}] {tps:5.1f} tok/s {elapsed:5.1f}s {gen_tok} tok{hit_limit}")
        if test_name == "json":
            clean = strip_code_fence(visible)
            if is_valid_json(clean):
                print(" JSON: [OK]")
            else:
                print(f" JSON: [FAIL] {clean[:60]!r}")

    return model_results


def print_summary(results):
    """Print the throughput table, the reference figures and the VRAM fit list."""
    print(f"\n{'='*60}")
    print("THROUGHPUT SUMMARY (tok/s)")
    print('='*60)
    # Headings are derived from MODELS so the table cannot drift from it.
    header = f"{'Task':<16}" + "".join(
        f" {column_label(model):>10}" for model in MODELS
    )
    print(header)
    print("-"*50)
    for test_name, _, _ in TESTS:
        row = f"{test_name:<16}"
        for model in MODELS:
            tps = results[model][test_name][0]
            row += f" {tps:>10.1f}"
        print(row)

    print()
    print("Reference (full-GPU machine from OLLAMA.md):")
    print(" qwen3:14b ~66 tok/s | qwen3.6 ~32 tok/s")
    print()

    print("VRAM FIT (12 GB available):")
    for model, size_gb in MODEL_SIZES_GB:
        fit = "FITS" if size_gb < VRAM_FIT_LIMIT_GB else "SPLIT"
        print(f" {model:<16} {size_gb:>5.1f} GB [{fit}]")


def main():
    """Run every model through every test, then print the summary."""
    results = {}  # model -> {test_name -> (tps, elapsed, gen_tok, visible)}
    for model in MODELS:
        results[model] = run_model(model)
        unload(model)
    print_summary(results)


if __name__ == "__main__":
    main()