sync: auto-sync from DESKTOP-0O8A1RL at 2026-05-16 16:26:04
Author: Mike Swanson Machine: DESKTOP-0O8A1RL Timestamp: 2026-05-16 16:26:04
This commit is contained in:
124
tmp_ollama_bench.py
Normal file
124
tmp_ollama_bench.py
Normal file
@@ -0,0 +1,124 @@
|
||||
"""
|
||||
Throughput benchmark for DESKTOP-0O8A1RL.
|
||||
Tests: current settings, then reads each model's VRAM residency from /api/ps after the run.
|
||||
"""
|
||||
import urllib.request, json, time, subprocess, threading, sys
|
||||
|
||||
# Prompt text sent with every generation request made by run_gen().
PROMPT = " ".join(
    (
        "List 10 common Windows troubleshooting steps an IT technician uses daily.",
        "One sentence each, numbered.",
    )
)
|
||||
|
||||
def get_ps():
    """Return the parsed JSON reply of Ollama's ``/api/ps`` endpoint, or ``{}``.

    The endpoint is queried on ``localhost:11434`` with a 5-second timeout.
    This is a best-effort probe: connection failures, timeouts, broken HTTP
    replies and unparseable bodies all yield an empty dict instead of
    aborting the benchmark.  ``KeyboardInterrupt`` and ``SystemExit`` are
    deliberately not caught, so Ctrl-C still stops the run.

    Returns:
        The decoded JSON document, or ``{}`` when the probe fails.
    """
    import http.client  # local import: only needed for the exception type

    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(
            "http://localhost:11434/api/ps", timeout=5
        ) as resp:
            return json.loads(resp.read())
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError and socket timeouts; ValueError covers
        # json.JSONDecodeError and undecodable bytes.
        return {}
|
||||
|
||||
def run_gen(model, num_predict=300):
    """Run one non-streaming generation and return its throughput figures.

    Sends ``PROMPT`` to ``http://localhost:11434/api/generate`` with
    ``stream`` disabled and waits up to 300 seconds for the reply.  The
    reply's ``*_duration`` fields are treated as nanoseconds.

    Args:
        model: Value sent as the request's ``model`` field.
        num_predict: Value sent as ``options.num_predict``.

    Returns:
        A dict with the keys:
          ``elapsed``          seconds spent on the HTTP round trip,
          ``eval_tok``         the reply's ``eval_count`` (0 if absent),
          ``gen_tps``          ``eval_count`` per second of ``eval_duration``,
          ``prompt_tok``       the reply's ``prompt_eval_count`` (0 if absent),
          ``prompt_tps``       ``prompt_eval_count`` per second of
                               ``prompt_eval_duration``,
          ``response_snippet`` first 200 characters of the stripped
                               ``response`` text ("" if absent).

    Raises:
        urllib.error.URLError: The request could not be completed.
        ValueError: The reply body is not valid JSON.
    """
    request = urllib.request.Request(
        "http://localhost:11434/api/generate",
        data=json.dumps(
            {
                "model": model,
                "prompt": PROMPT,
                "stream": False,
                "options": {"num_predict": num_predict},
            }
        ).encode(),
        headers={"Content-Type": "application/json"},
    )

    # perf_counter() is monotonic, so a system clock adjustment during a
    # long generation cannot distort the measured interval.
    start = time.perf_counter()
    # The context manager closes the connection once the body is read.
    with urllib.request.urlopen(request, timeout=300) as resp:
        raw = resp.read()
    elapsed = time.perf_counter() - start

    reply = json.loads(raw)
    eval_tok = reply.get("eval_count") or 0
    prompt_tok = reply.get("prompt_eval_count") or 0

    return {
        "elapsed": elapsed,
        "eval_tok": eval_tok,
        "gen_tps": _tokens_per_second(eval_tok, reply.get("eval_duration")),
        "prompt_tok": prompt_tok,
        "prompt_tps": _tokens_per_second(
            prompt_tok, reply.get("prompt_eval_duration")
        ),
        # A reply without a "response" field yields an empty snippet rather
        # than a KeyError that would discard the timing data.
        "response_snippet": (reply.get("response") or "").strip()[:200],
    }


def _tokens_per_second(count, duration_ns):
    """Return *count* per second for a duration in nanoseconds (0.0 if none)."""
    if not duration_ns:
        # Missing or zero duration: report no rate instead of a bogus one.
        return 0.0
    return count / (duration_ns / 1e9)
|
||||
|
||||
def print_result(label, res, ps_info=None):
    """Print one benchmark result, optionally followed by VRAM usage lines.

    Args:
        label: Heading printed above the figures.
        res: Mapping providing ``gen_tps``, ``eval_tok``, ``elapsed``,
            ``prompt_tps`` and ``prompt_tok``.
        ps_info: Optional mapping whose ``models`` list holds entries with
            ``size_vram`` and ``size`` values; one line is printed per entry.
            Nothing extra is printed when this is empty or ``None``.
    """
    print(f"\n {label}")
    print(f" Generation: {res['gen_tps']:.1f} tok/s ({res['eval_tok']} tokens in {res['elapsed']:.1f}s)")
    print(f" Prompt eval: {res['prompt_tps']:.1f} tok/s ({res['prompt_tok']} tokens)")

    if not ps_info:
        return

    for entry in ps_info.get("models", []):
        vram_raw = entry.get("size_vram", 0)
        size_raw = entry.get("size", 0)
        # Both sizes are scaled by 1024 * 1024 for display as MB.
        layers = vram_raw / 1024 / 1024
        total = size_raw / 1024 / 1024
        # A missing or zero total size reports 0% instead of dividing by zero.
        if size_raw:
            pct = (vram_raw / size_raw) * 100
        else:
            pct = 0
        print(f" GPU VRAM used: {layers:.0f} MB / {total:.0f} MB total ({pct:.0f}% in VRAM)")
|
||||
|
||||
# ── Banner: name the machine and GPU this benchmark run refers to ──────────
print(
    "=" * 60,
    "DESKTOP-0O8A1RL — Ollama Throughput Benchmark",
    "GPU: RTX 5070 Ti Laptop (4 GB VRAM)",
    "=" * 60,
    sep="\n",
)
|
||||
|
||||
# ── Test 1: qwen3:14b with the current settings ────────────────────────────
# A short 20-token warm-up call comes first; the 300-token run after it is
# the one whose figures are reported.
print("\n[1] Warming up qwen3:14b...")
run_gen("qwen3:14b", 20)
time.sleep(2)
print(f" ps after warmup: {get_ps()}")

print("\n[2] qwen3:14b — current settings (300 tokens)")
res14 = run_gen("qwen3:14b", 300)
# /api/ps is queried right after the measured run so the VRAM figures
# describe the model that produced res14.
print_result("qwen3:14b", res14, get_ps())
|
||||
|
||||
# Unload qwen3:14b before the next model is benchmarked.  This step is
# best-effort: a failure is reported but does not stop the script, while
# Ctrl-C (KeyboardInterrupt) is no longer swallowed.
print("\n[3] Unloading model...")
try:
    unload_req = urllib.request.Request(
        "http://localhost:11434/api/generate",
        # keep_alive=0 with no prompt is the request used here to unload.
        data=json.dumps({"model": "qwen3:14b", "keep_alive": 0}).encode(),
        headers={"Content-Type": "application/json"},
    )
    # The reply body is not needed; the context manager just closes it.
    with urllib.request.urlopen(unload_req, timeout=10):
        pass
except Exception as exc:
    print(f" warning: unload request failed: {exc}")
# Pause before the next model is loaded.
time.sleep(3)
|
||||
|
||||
# ── Test 2: qwen3.6:latest with the current settings ───────────────────────
print("\n[4] qwen3.6:latest — current settings (300 tokens)")
print(" (This may take a while to load from disk...)")
# Warm-up call that loads the model; its result is discarded.
run_gen("qwen3.6:latest", 20)
time.sleep(2)
print(f" ps after warmup: {get_ps()}")

# Measured run, followed immediately by the /api/ps snapshot for its report.
res36 = run_gen("qwen3.6:latest", 300)
print_result("qwen3.6:latest", res36, get_ps())
|
||||
|
||||
# ── Summary: both measured runs side by side, then reference figures ───────
print("\n" + "=" * 60, "SUMMARY", "=" * 60, sep="\n")
print(f" qwen3:14b gen: {res14['gen_tps']:.1f} tok/s prompt: {res14['prompt_tps']:.1f} tok/s")
print(f" qwen3.6 gen: {res36['gen_tps']:.1f} tok/s prompt: {res36['prompt_tps']:.1f} tok/s")
print()
# Fixed reference numbers; the trailing "" produces the closing blank line.
print(
    " Reference (other machine benchmark from OLLAMA.md):",
    " qwen3:14b ~66 tok/s",
    " qwen3.6 ~32 tok/s",
    "",
    sep="\n",
)
|
||||
|
||||
# VRAM analysis: share of each model (by its hard-coded size in GB) that
# would fit into the assumed VRAM budget, capped at 100%.
vram_gb = 4.0
for label, model_size_gb in {"qwen3:14b": 8.8, "qwen3.6": 22}.items():
    pct_gpu = min(vram_gb / model_size_gb * 100, 100)
    print(f" {label} ({model_size_gb} GB): {pct_gpu:.0f}% fits in 4 GB VRAM → rest on CPU/RAM")

# The leading "" reproduces the blank line before the diagnosis text.
print(
    "",
    " Diagnosis: Both models exceed 4 GB VRAM → split CPU/GPU or pure CPU",
    " Expected impact: 2-4x slower than a machine with sufficient VRAM",
    sep="\n",
)
|
||||
Reference in New Issue
Block a user