From 887f0ae2662ebfc0a9228291eba1b1fbba17f6d1 Mon Sep 17 00:00:00 2001
From: Mike Swanson
Date: Sat, 16 May 2026 16:26:07 -0700
Subject: [PATCH] sync: auto-sync from DESKTOP-0O8A1RL at 2026-05-16 16:26:04

Author: Mike Swanson
Machine: DESKTOP-0O8A1RL
Timestamp: 2026-05-16 16:26:04
---
 tmp_bench_8b.py     | 167 ++++++++++++++++++++++++++++++++++++++++++++
 tmp_hw_check.ps1    |  38 ++++++++++
 tmp_ollama_bench.py | 147 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 352 insertions(+)
 create mode 100644 tmp_bench_8b.py
 create mode 100644 tmp_hw_check.ps1
 create mode 100644 tmp_ollama_bench.py

diff --git a/tmp_bench_8b.py b/tmp_bench_8b.py
new file mode 100644
index 0000000..d6684c7
--- /dev/null
+++ b/tmp_bench_8b.py
@@ -0,0 +1,167 @@
+"""Benchmark local Ollama models on four short IT-helpdesk prompts.
+
+For every model in MODELS the script warms the model up, runs each prompt in
+TESTS once, prints generation throughput, and then asks Ollama to unload it.
+"""
+import urllib.request, json, time
+
+OLLAMA_URL = "http://localhost:11434"
+MODELS = ["qwen3:8b", "qwen3:14b", "qwen3.6:latest"]
+
+# Each entry: (test name, prompt, num_predict cap).
+TESTS = [
+    ("prose", "List 10 common Windows troubleshooting steps an IT technician uses daily. One sentence each, numbered.", 300),
+    ("classification", "Classify this ticket into ONE of: Hardware, Software, Network, Security, User-Error. Reply with category and one sentence only.\n\nTicket: Client says Outlook keeps asking for password every morning. Started after Windows Update last Tuesday.", 80),
+    ("json", 'Extract from this ticket and return ONLY valid JSON with keys: client_name, issue_summary, urgency (low/medium/high).\n\nTicket: Hi, this is Janet from Cascades Dental. Our front desk PC shows a blue screen every time we open Dentrix. Patients arriving at 9am, need this fixed ASAP.', 150),
+    ("summary", "Summarize in exactly 2 bullet points, under 30 words total:\n\nIncident: GuruRMM watchdog failed to restart service after auto-update due to SCM access denied error and a stuck suppression timer. Service was offline 25 minutes. Fixed by adding sc.exe fallback and clearing suppression on failure.", 100),
+]
+
+# Closing marker of a model's reasoning section. It is assembled from pieces
+# so that tools which strip angle-bracket markup cannot blank the literal.
+THINK_END = "<" + "/think" + ">"
+
+# Figures used by the VRAM fit table printed at the end of the run.
+VRAM_GB = 12
+# NOTE(review): models below this size are reported as FITS; presumably the
+# gap to VRAM_GB is headroom for context/KV cache - confirm.
+FIT_LIMIT_GB = 10
+
+
+def strip_thinking(raw):
+    """Return the text after the closing think marker, or raw if absent."""
+    _, marker, answer = raw.partition(THINK_END)
+    return answer.strip() if marker else raw
+
+
+def strip_code_fence(text):
+    """Remove one surrounding Markdown code fence, with optional json tag."""
+    text = text.strip()
+    if text.startswith("```"):
+        text = text[3:]
+        if text.startswith("json"):
+            text = text[4:]
+    if text.endswith("```"):
+        text = text[:-3]
+    return text.strip()
+
+
+def ask(model, prompt, max_tokens):
+    """Run one non-streaming generation against Ollama.
+
+    Returns (visible, elapsed, gen_tok, tps): the response text with any
+    reasoning section removed, wall-clock seconds for the request, the
+    eval_count reported by Ollama, and tokens per second computed from
+    eval_count / eval_duration (eval_duration is in nanoseconds).
+    """
+    payload = {"model": model, "prompt": prompt, "stream": False,
+               "options": {"num_predict": max_tokens}}
+    req = urllib.request.Request(OLLAMA_URL + "/api/generate",
+        data=json.dumps(payload).encode(), headers={"Content-Type": "application/json"})
+    start = time.perf_counter()
+    with urllib.request.urlopen(req, timeout=300) as resp:
+        r = json.loads(resp.read())
+    elapsed = time.perf_counter() - start
+    gen_tok = r.get("eval_count", 0)
+    gen_ns = r.get("eval_duration", 1)
+    tps = gen_tok / (gen_ns / 1e9) if gen_ns else 0
+    return strip_thinking(r["response"].strip()), elapsed, gen_tok, tps
+
+
+def get_ps():
+    """Return (vram_mb, total_mb, pct_in_vram) for the first loaded model.
+
+    Gives (0, 0, 0) when no model is loaded or /api/ps cannot be read.
+    """
+    try:
+        with urllib.request.urlopen(OLLAMA_URL + "/api/ps", timeout=5) as resp:
+            d = json.loads(resp.read())
+    except (OSError, ValueError):
+        # URLError and timeouts derive from OSError; bad JSON is ValueError.
+        return 0, 0, 0
+    models = d.get("models", [])
+    if not models:
+        return 0, 0, 0
+    m = models[0]
+    vram = m.get("size_vram", 0) // 1024 // 1024
+    total = m.get("size", 0) // 1024 // 1024
+    pct = int(vram / total * 100) if total else 0
+    return vram, total, pct
+
+
+def unload(model):
+    """Ask Ollama to evict model (keep_alive 0), then wait 2 s. Best-effort."""
+    payload = {"model": model, "keep_alive": 0}
+    req = urllib.request.Request(
+        OLLAMA_URL + "/api/generate",
+        data=json.dumps(payload).encode(),
+        headers={"Content-Type": "application/json"})
+    try:
+        with urllib.request.urlopen(req, timeout=10):
+            pass
+    except OSError as exc:
+        print(f" [warn] could not unload {model}: {exc}")
+    time.sleep(2)
+
+
+results = {}  # model -> {test_name -> (tps, elapsed, gen_tok, visible)}
+
+for model in MODELS:
+    print(f"\n{'='*60}")
+    print(f"MODEL: {model}")
+    print('='*60)
+    model_results = {}
+    # Registered up front so the summary can still show partial results.
+    results[model] = model_results
+
+    try:
+        # Warmup load
+        print(" Loading...", end="", flush=True)
+        ask(model, "hello", 5)
+        vram, total, pct = get_ps()
+        print(f" {vram} MB / {total} MB in VRAM ({pct}%)")
+
+        for test_name, prompt, max_tok in TESTS:
+            visible, elapsed, gen_tok, tps = ask(model, prompt, max_tok)
+            model_results[test_name] = (tps, elapsed, gen_tok, visible)
+            hit_limit = " [HIT LIMIT]" if gen_tok >= max_tok else ""
+            print(f" [{test_name:14}] {tps:5.1f} tok/s {elapsed:5.1f}s {gen_tok} tok{hit_limit}")
+            if test_name == "json":
+                clean = strip_code_fence(visible)
+                try:
+                    json.loads(clean)
+                except ValueError:
+                    print(f" JSON: [FAIL] {repr(clean[:60])}")
+                else:
+                    print(" JSON: [OK]")
+    except OSError as exc:
+        # A missing model or unreachable server must not abort the other models.
+        print(f"\n [ERROR] {model}: {exc}")
+    finally:
+        unload(model)
+
+# Summary table
+print(f"\n{'='*60}")
+print("THROUGHPUT SUMMARY (tok/s)")
+print('='*60)
+# Column labels come from MODELS so the header cannot drift from the rows.
+labels = [m[:-len(":latest")] if m.endswith(":latest") else m for m in MODELS]
+header = f"{'Task':<16}" + "".join(f" {label:>10}" for label in labels)
+print(header)
+print("-" * max(50, len(header)))
+for test_name, _, _ in TESTS:
+    row = f"{test_name:<16}"
+    for model in MODELS:
+        entry = results[model].get(test_name)
+        row += f" {entry[0]:>10.1f}" if entry else f" {'n/a':>10}"
+    print(row)
+
+print()
+print("Reference (full-GPU machine from OLLAMA.md):")
+print(" qwen3:14b ~66 tok/s | qwen3.6 ~32 tok/s")
+print()
+
+# VRAM fit analysis
+print(f"VRAM FIT ({VRAM_GB} GB available):")
+for model, size_gb in [("qwen3:8b", 5.2), ("qwen3:14b", 9.3), ("qwen3.6", 23)]:
+    fit = "FITS" if size_gb < FIT_LIMIT_GB else "SPLIT"
+    print(f" {model:<16} {size_gb:>5.1f} GB [{fit}]")
diff --git a/tmp_hw_check.ps1 b/tmp_hw_check.ps1
new file mode 100644
index 0000000..31b5d57
--- /dev/null
+++ b/tmp_hw_check.ps1
@@ -0,0 +1,38 @@
+# Prints a hardware summary: CPU, RAM, GPU(s), and system-drive space.
+# Get-CimInstance is used because Get-WmiObject is not available in PowerShell 6+.
+$cpu = Get-CimInstance Win32_Processor | Select-Object -First 1
+$ram = Get-CimInstance Win32_ComputerSystem
+$gpu = Get-CimInstance Win32_VideoController | Where-Object { $_.AdapterRAM -gt 0 }
+$os = Get-CimInstance Win32_OperatingSystem
+
+Write-Output "=== CPU ==="
+Write-Output " $($cpu.Name)"
+Write-Output " Cores: $($cpu.NumberOfCores) physical / $($cpu.NumberOfLogicalProcessors) logical"
+Write-Output " Max MHz: $($cpu.MaxClockSpeed)"
+
+Write-Output "`n=== RAM ==="
+$ramGB = [math]::Round($ram.TotalPhysicalMemory / 1GB, 1)
+Write-Output " Total: $ramGB GB"
+
+# AdapterRAM is a 32-bit value, so adapters with 4 GB or more are
+# under-reported. The display-adapter class key in the registry usually holds
+# a 64-bit size; prefer it when an entry matches the adapter name.
+$gpuClassKey = 'HKLM:\SYSTEM\CurrentControlSet\Control\Class\{4d36e968-e325-11ce-bfc1-08002be10318}\0*'
+$gpuReg = Get-ItemProperty -Path $gpuClassKey -ErrorAction SilentlyContinue
+
+Write-Output "`n=== GPU(s) ==="
+foreach ($g in $gpu) {
+    $vramBytes = $g.AdapterRAM
+    $regEntry = $gpuReg | Where-Object { $_.DriverDesc -eq $g.Name -and $_.'HardwareInformation.qwMemorySize' } | Select-Object -First 1
+    if ($regEntry) { $vramBytes = $regEntry.'HardwareInformation.qwMemorySize' }
+    $vramMB = [math]::Round($vramBytes / 1MB, 0)
+    Write-Output " $($g.Name)"
+    Write-Output " VRAM: $vramMB MB"
+    Write-Output " Driver: $($g.DriverVersion)"
+}
+
+Write-Output "`n=== Storage (system drive) ==="
+$disk = Get-CimInstance Win32_LogicalDisk -Filter "DeviceID='C:'"
+$freeGB = [math]::Round($disk.FreeSpace / 1GB, 1)
+$totalGB = [math]::Round($disk.Size / 1GB, 1)
+Write-Output " C: $freeGB GB free / $totalGB GB total"
diff --git a/tmp_ollama_bench.py b/tmp_ollama_bench.py
new file mode 100644
index 0000000..0babaab
--- /dev/null
+++ b/tmp_ollama_bench.py
@@ -0,0 +1,147 @@
+"""
+Throughput benchmark for DESKTOP-0O8A1RL.
+Runs one fixed prompt against qwen3:14b and qwen3.6:latest with the current
+Ollama settings and prints how much of each model /api/ps reports in VRAM.
+"""
+import urllib.request, json, time, subprocess, threading, sys
+
+OLLAMA_URL = "http://localhost:11434"
+
+# NOTE(review): tmp_bench_8b.py in this same patch assumes 12 GB of VRAM, and
+# Win32_VideoController.AdapterRAM is a 32-bit field that tops out near 4 GB,
+# so a 4 GB reading may be a cap rather than the real size - confirm.
+VRAM_GB = 4.0
+
+PROMPT = (
+    "List 10 common Windows troubleshooting steps an IT technician uses daily. "
+    "One sentence each, numbered."
+)
+
+
+def get_ps():
+    """Return the parsed /api/ps payload, or {} when it cannot be read."""
+    try:
+        with urllib.request.urlopen(OLLAMA_URL + "/api/ps", timeout=5) as resp:
+            return json.loads(resp.read())
+    except (OSError, ValueError):
+        # URLError and timeouts derive from OSError; bad JSON is ValueError.
+        return {}
+
+
+def post_generate(payload, timeout):
+    """POST payload to /api/generate and return the decoded JSON reply."""
+    req = urllib.request.Request(
+        OLLAMA_URL + "/api/generate",
+        data=json.dumps(payload).encode(),
+        headers={"Content-Type": "application/json"},
+    )
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read())
+
+
+def run_gen(model, num_predict=300):
+    """Generate from PROMPT once and return timing and throughput figures.
+
+    The result dict has elapsed (wall seconds), eval_tok, gen_tps, prompt_tok,
+    prompt_tps, and response_snippet (first 200 characters of the reply).
+    """
+    payload = {
+        "model": model,
+        "prompt": PROMPT,
+        "stream": False,
+        "options": {"num_predict": num_predict}
+    }
+    start = time.perf_counter()
+    r = post_generate(payload, timeout=300)
+    elapsed = time.perf_counter() - start
+    eval_tok = r.get("eval_count", 0)
+    eval_ns = r.get("eval_duration", 1)
+    prompt_tok = r.get("prompt_eval_count", 0)
+    prompt_ns = r.get("prompt_eval_duration", 1)
+    gen_tps = eval_tok / (eval_ns / 1e9) if eval_ns else 0
+    prompt_tps = prompt_tok / (prompt_ns / 1e9) if prompt_ns else 0
+    return {
+        "elapsed": elapsed,
+        "eval_tok": eval_tok,
+        "gen_tps": gen_tps,
+        "prompt_tok": prompt_tok,
+        "prompt_tps": prompt_tps,
+        "response_snippet": r["response"].strip()[:200],
+    }
+
+
+def unload(model):
+    """Ask Ollama to evict model (keep_alive 0). Best-effort, never raises on I/O."""
+    try:
+        post_generate({"model": model, "keep_alive": 0}, timeout=10)
+    except (OSError, ValueError) as exc:
+        print(f" [warn] could not unload {model}: {exc}")
+
+
+def print_result(label, res, ps_info=None):
+    """Print throughput for one run plus VRAM residency of each loaded model."""
+    print(f"\n {label}")
+    print(f" Generation: {res['gen_tps']:.1f} tok/s ({res['eval_tok']} tokens in {res['elapsed']:.1f}s)")
+    print(f" Prompt eval: {res['prompt_tps']:.1f} tok/s ({res['prompt_tok']} tokens)")
+    if ps_info:
+        for m in ps_info.get("models", []):
+            vram_mb = m.get("size_vram", 0) / 1024 / 1024
+            total_mb = m.get("size", 0) / 1024 / 1024
+            pct = vram_mb / total_mb * 100 if total_mb else 0
+            print(f" GPU VRAM used: {vram_mb:.0f} MB / {total_mb:.0f} MB total ({pct:.0f}% in VRAM)")
+
+# ── Check initial Ollama env ────────────────────────────────────────────────
+print("=" * 60)
+print("DESKTOP-0O8A1RL — Ollama Throughput Benchmark")
+print(f"GPU: RTX 5070 Ti Laptop ({VRAM_GB:g} GB VRAM)")
+print("=" * 60)
+
+# ── Test 1: qwen3:14b current settings ─────────────────────────────────────
+print("\n[1] Warming up qwen3:14b...")
+# warmup
+run_gen("qwen3:14b", num_predict=20)
+time.sleep(2)
+ps = get_ps()
+print(f" ps after warmup: {ps}")
+
+print("\n[2] qwen3:14b — current settings (300 tokens)")
+res14 = run_gen("qwen3:14b", num_predict=300)
+ps14 = get_ps()
+print_result("qwen3:14b", res14, ps14)
+
+# Unload
+print("\n[3] Unloading model...")
+unload("qwen3:14b")
+time.sleep(3)
+
+# ── Test 2: qwen3.6 current settings ───────────────────────────────────────
+print("\n[4] qwen3.6:latest — current settings (300 tokens)")
+print(" (This may take a while to load from disk...)")
+run_gen("qwen3.6:latest", num_predict=20)  # warmup load
+time.sleep(2)
+ps36w = get_ps()
+print(f" ps after warmup: {ps36w}")
+res36 = run_gen("qwen3.6:latest", num_predict=300)
+ps36 = get_ps()
+print_result("qwen3.6:latest", res36, ps36)
+
+# ── Summary ─────────────────────────────────────────────────────────────────
+print("\n" + "=" * 60)
+print("SUMMARY")
+print("=" * 60)
+print(f" qwen3:14b gen: {res14['gen_tps']:.1f} tok/s prompt: {res14['prompt_tps']:.1f} tok/s")
+print(f" qwen3.6 gen: {res36['gen_tps']:.1f} tok/s prompt: {res36['prompt_tps']:.1f} tok/s")
+print()
+print(" Reference (other machine benchmark from OLLAMA.md):")
+print(" qwen3:14b ~66 tok/s")
+print(" qwen3.6 ~32 tok/s")
+print()
+
+# VRAM analysis
+for label, model_size_gb in [("qwen3:14b", 8.8), ("qwen3.6", 22)]:
+    pct_gpu = min(VRAM_GB / model_size_gb * 100, 100)
+    print(f" {label} ({model_size_gb} GB): {pct_gpu:.0f}% fits in {VRAM_GB:g} GB VRAM → rest on CPU/RAM")
+
+print()
+print(f" Diagnosis: Both models exceed {VRAM_GB:g} GB VRAM → split CPU/GPU or pure CPU")
+print(" Expected impact: 2-4x slower than a machine with sufficient VRAM")