"""Throughput benchmark for DESKTOP-0O8A1RL.

Runs a fixed prompt against a local Ollama server with the server's current
settings, reports generation and prompt-eval throughput for each model, and
queries ``/api/ps`` after each run to show how much of the loaded model is
resident in VRAM.
"""

import http.client
import json
import subprocess  # NOTE(review): unused in this file; kept in case other tooling relies on it
import sys
import threading  # NOTE(review): unused in this file; kept in case other tooling relies on it
import time
import urllib.request

OLLAMA_URL = "http://localhost:11434"

PROMPT = (
    "List 10 common Windows troubleshooting steps an IT technician uses daily. "
    "One sentence each, numbered."
)

# Failures that can come out of an HTTP round trip plus JSON decoding:
# URLError/HTTPError/timeouts are OSError subclasses, JSONDecodeError is a
# ValueError, and truncated bodies surface as http.client.HTTPException.
_REQUEST_ERRORS = (OSError, ValueError, http.client.HTTPException)

_MIB = 1024 * 1024


def _post_json(path, payload, timeout):
    """POST *payload* as JSON to the Ollama endpoint *path* and return the raw body."""
    req = urllib.request.Request(
        OLLAMA_URL + path,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read()


def _rate(count, duration_ns):
    """Return *count* per second given a duration in nanoseconds (0.0 if unknown)."""
    if not count or not duration_ns:
        return 0.0
    return count / (duration_ns / 1e9)


def get_ps():
    """Return the parsed ``/api/ps`` reply describing the loaded models.

    Best effort: returns ``{}`` when the server cannot be reached, the reply
    is not valid JSON, or the reply is not a JSON object.
    """
    try:
        with urllib.request.urlopen(f"{OLLAMA_URL}/api/ps", timeout=5) as resp:
            data = json.loads(resp.read())
    except _REQUEST_ERRORS:
        return {}
    return data if isinstance(data, dict) else {}


def summarize_response(body, elapsed):
    """Turn a decoded ``/api/generate`` reply into the benchmark result dict.

    Args:
        body: Decoded JSON object returned by the generate endpoint.
        elapsed: Wall-clock seconds the request took.

    Returns:
        Dict with ``elapsed``, ``eval_tok``, ``gen_tps``, ``prompt_tok``,
        ``prompt_tps`` and ``response_snippet`` (first 200 characters of the
        stripped response text). Missing counts or durations yield a rate of
        0.0 instead of a division by a placeholder duration.
    """
    eval_tok = body.get("eval_count") or 0
    prompt_tok = body.get("prompt_eval_count") or 0
    return {
        "elapsed": elapsed,
        "eval_tok": eval_tok,
        "gen_tps": _rate(eval_tok, body.get("eval_duration")),
        "prompt_tok": prompt_tok,
        "prompt_tps": _rate(prompt_tok, body.get("prompt_eval_duration")),
        "response_snippet": (body.get("response") or "").strip()[:200],
    }


def run_gen(model, num_predict=300):
    """Run one non-streaming generation of ``PROMPT`` on *model* and time it.

    Args:
        model: Ollama model tag, e.g. ``"qwen3:14b"``.
        num_predict: Value passed as the ``num_predict`` generation option.

    Returns:
        The result dict produced by :func:`summarize_response`.

    Raises:
        OSError: If the request fails (``urllib.error.URLError`` is a subclass).
        ValueError: If the reply is not valid JSON.
    """
    payload = {
        "model": model,
        "prompt": PROMPT,
        "stream": False,
        "options": {"num_predict": num_predict},
    }
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments during a long generation.
    start = time.perf_counter()
    body = json.loads(_post_json("/api/generate", payload, timeout=300))
    elapsed = time.perf_counter() - start
    return summarize_response(body, elapsed)


def print_result(label, res, ps_info=None):
    """Print throughput figures for one run, plus VRAM residency if available.

    Args:
        label: Heading printed above the figures.
        res: Result dict as returned by :func:`run_gen`.
        ps_info: Optional ``/api/ps`` reply; each listed model's ``size_vram``
            and ``size`` (bytes) are reported in MB with the VRAM percentage.
    """
    print(f"\n {label}")
    print(f" Generation: {res['gen_tps']:.1f} tok/s ({res['eval_tok']} tokens in {res['elapsed']:.1f}s)")
    print(f" Prompt eval: {res['prompt_tps']:.1f} tok/s ({res['prompt_tok']} tokens)")
    if not ps_info:
        return
    # "models" may be absent or null; treat both as an empty list.
    for entry in ps_info.get("models") or []:
        vram_bytes = entry.get("size_vram") or 0
        total_bytes = entry.get("size") or 0
        pct = vram_bytes / total_bytes * 100 if total_bytes else 0
        print(
            f" GPU VRAM used: {vram_bytes / _MIB:.0f} MB / {total_bytes / _MIB:.0f} MB total "
            f"({pct:.0f}% in VRAM)"
        )


def unload_model(model):
    """Ask the server to unload *model* by sending ``keep_alive: 0``.

    Best effort: a failed request is reported and the benchmark continues.
    """
    try:
        _post_json("/api/generate", {"model": model, "keep_alive": 0}, timeout=10)
    except _REQUEST_ERRORS as exc:
        print(f" (unload request failed, continuing: {exc})")


def main():
    """Benchmark qwen3:14b and qwen3.6:latest, then print a summary."""
    # ── Check initial Ollama env ────────────────────────────────────────────
    print("=" * 60)
    print("DESKTOP-0O8A1RL — Ollama Throughput Benchmark")
    print("GPU: RTX 5070 Ti Laptop (4 GB VRAM)")
    print("=" * 60)

    # ── Test 1: qwen3:14b current settings ──────────────────────────────────
    print("\n[1] Warming up qwen3:14b...")
    # Short warmup run so model load time is not counted in the measurement.
    run_gen("qwen3:14b", num_predict=20)
    time.sleep(2)
    ps = get_ps()
    print(f" ps after warmup: {ps}")

    print("\n[2] qwen3:14b — current settings (300 tokens)")
    res14 = run_gen("qwen3:14b", num_predict=300)
    ps14 = get_ps()
    print_result("qwen3:14b", res14, ps14)

    # Unload the first model before the second one is loaded.
    print("\n[3] Unloading model...")
    unload_model("qwen3:14b")
    time.sleep(3)

    # ── Test 2: qwen3.6 current settings ────────────────────────────────────
    print("\n[4] qwen3.6:latest — current settings (300 tokens)")
    print(" (This may take a while to load from disk...)")
    run_gen("qwen3.6:latest", num_predict=20)  # warmup load
    time.sleep(2)
    ps36w = get_ps()
    print(f" ps after warmup: {ps36w}")
    res36 = run_gen("qwen3.6:latest", num_predict=300)
    ps36 = get_ps()
    print_result("qwen3.6:latest", res36, ps36)

    # ── Summary ─────────────────────────────────────────────────────────────
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f" qwen3:14b gen: {res14['gen_tps']:.1f} tok/s prompt: {res14['prompt_tps']:.1f} tok/s")
    print(f" qwen3.6 gen: {res36['gen_tps']:.1f} tok/s prompt: {res36['prompt_tps']:.1f} tok/s")
    print()
    print(" Reference (other machine benchmark from OLLAMA.md):")
    print(" qwen3:14b ~66 tok/s")
    print(" qwen3.6 ~32 tok/s")
    print()

    # VRAM analysis: share of each model that could fit in the GPU's memory.
    vram_gb = 4.0
    for label, model_size_gb in [("qwen3:14b", 8.8), ("qwen3.6", 22)]:
        pct_gpu = min(vram_gb / model_size_gb * 100, 100)
        print(f" {label} ({model_size_gb} GB): {pct_gpu:.0f}% fits in 4 GB VRAM → rest on CPU/RAM")
    print()
    print(" Diagnosis: Both models exceed 4 GB VRAM → split CPU/GPU or pure CPU")
    print(" Expected impact: 2-4x slower than a machine with sufficient VRAM")


if __name__ == "__main__":
    try:
        main()
    except (OSError, http.client.HTTPException) as exc:
        # A generation request failed outright; exit with a readable message
        # instead of a traceback.
        sys.exit(f"Benchmark aborted: request to {OLLAMA_URL} failed: {exc}")