diff --git a/tmp_bench_8b.py b/tmp_bench_8b.py deleted file mode 100644 index d6684c7..0000000 --- a/tmp_bench_8b.py +++ /dev/null @@ -1,108 +0,0 @@ -import urllib.request, json, time - -MODELS = ["qwen3:8b", "qwen3:14b", "qwen3.6:latest"] - -TESTS = [ - ("prose", "List 10 common Windows troubleshooting steps an IT technician uses daily. One sentence each, numbered.", 300), - ("classification", "Classify this ticket into ONE of: Hardware, Software, Network, Security, User-Error. Reply with category and one sentence only.\n\nTicket: Client says Outlook keeps asking for password every morning. Started after Windows Update last Tuesday.", 80), - ("json", 'Extract from this ticket and return ONLY valid JSON with keys: client_name, issue_summary, urgency (low/medium/high).\n\nTicket: Hi, this is Janet from Cascades Dental. Our front desk PC shows a blue screen every time we open Dentrix. Patients arriving at 9am, need this fixed ASAP.', 150), - ("summary", "Summarize in exactly 2 bullet points, under 30 words total:\n\nIncident: GuruRMM watchdog failed to restart service after auto-update due to SCM access denied error and a stuck suppression timer. Service was offline 25 minutes. Fixed by adding sc.exe fallback and clearing suppression on failure.", 100), -] - -def ask(model, prompt, max_tokens): - payload = {"model": model, "prompt": prompt, "stream": False, - "options": {"num_predict": max_tokens}} - start = time.time() - req = urllib.request.Request("http://localhost:11434/api/generate", - data=json.dumps(payload).encode(), headers={"Content-Type": "application/json"}) - r = json.loads(urllib.request.urlopen(req, timeout=300).read()) - elapsed = time.time() - start - gen_tok = r.get("eval_count", 0) - gen_ns = r.get("eval_duration", 1) - tps = gen_tok / (gen_ns / 1e9) if gen_ns else 0 - raw = r["response"].strip() - if "" in raw and "" in raw: - visible = raw[raw.index("")+8:].strip() - else: - visible = raw - return visible, elapsed, gen_tok, tps - -def get_ps(): - try: - r = urllib.request.urlopen("http://localhost:11434/api/ps", timeout=5) - d = json.loads(r.read()) - models = d.get("models", []) - if models: - m = models[0] - vram = m.get("size_vram", 0) // 1024 // 1024 - total = m.get("size", 0) // 1024 // 1024 - pct = int(vram / total * 100) if total else 0 - return vram, total, pct - except: - pass - return 0, 0, 0 - -def unload(model): - try: - payload = {"model": model, "keep_alive": 0} - urllib.request.urlopen(urllib.request.Request( - "http://localhost:11434/api/generate", - data=json.dumps(payload).encode(), - headers={"Content-Type": "application/json"}), timeout=10) - except: - pass - time.sleep(2) - -results = {} # model -> {test_name -> (tps, elapsed, visible)} - -for model in MODELS: - print(f"\n{'='*60}") - print(f"MODEL: {model}") - print('='*60) - model_results = {} - - # Warmup load - print(" Loading...", end="", flush=True) - ask(model, "hello", 5) - vram, total, pct = get_ps() - print(f" {vram} MB / {total} MB in VRAM ({pct}%)") - - for test_name, prompt, max_tok in TESTS: - visible, elapsed, gen_tok, tps = ask(model, prompt, max_tok) - model_results[test_name] = (tps, elapsed, gen_tok, visible) - hit_limit = " [HIT LIMIT]" if gen_tok >= max_tok else "" - print(f" [{test_name:14}] {tps:5.1f} tok/s {elapsed:5.1f}s {gen_tok} tok{hit_limit}") - if test_name == "json": - clean = visible.strip().strip("```json").strip("```").strip() - try: - json.loads(clean) - print(f" JSON: [OK]") - except: - print(f" JSON: [FAIL] {repr(clean[:60])}") - - results[model] = model_results - unload(model) - -# Summary table -print(f"\n{'='*60}") -print("THROUGHPUT SUMMARY (tok/s)") -print('='*60) -print(f"{'Task':<16} {'qwen3:8b':>10} {'qwen3:14b':>10} {'qwen3.6':>10}") -print("-"*50) -for test_name, _, _ in TESTS: - row = f"{test_name:<16}" - for model in MODELS: - tps = results[model][test_name][0] - row += f" {tps:>10.1f}" - print(row) - -print() -print("Reference (full-GPU machine from OLLAMA.md):") -print(" qwen3:14b ~66 tok/s | qwen3.6 ~32 tok/s") -print() - -# VRAM fit analysis -print("VRAM FIT (12 GB available):") -for model, size_gb in [("qwen3:8b", 5.2), ("qwen3:14b", 9.3), ("qwen3.6", 23)]: - fit = "FITS" if size_gb < 10 else "SPLIT" - print(f" {model:<16} {size_gb:>5.1f} GB [{fit}]") diff --git a/tmp_hw_check.ps1 b/tmp_hw_check.ps1 deleted file mode 100644 index 31b5d57..0000000 --- a/tmp_hw_check.ps1 +++ /dev/null @@ -1,27 +0,0 @@ -$cpu = Get-WmiObject Win32_Processor | Select-Object -First 1 -$ram = Get-WmiObject Win32_ComputerSystem -$gpu = Get-WmiObject Win32_VideoController | Where-Object { $_.AdapterRAM -gt 0 } -$os = Get-WmiObject Win32_OperatingSystem - -Write-Output "=== CPU ===" -Write-Output " $($cpu.Name)" -Write-Output " Cores: $($cpu.NumberOfCores) physical / $($cpu.NumberOfLogicalProcessors) logical" -Write-Output " Max MHz: $($cpu.MaxClockSpeed)" - -Write-Output "`n=== RAM ===" -$ramGB = [math]::Round($ram.TotalPhysicalMemory / 1GB, 1) -Write-Output " Total: $ramGB GB" - -Write-Output "`n=== GPU(s) ===" -foreach ($g in $gpu) { - $vramMB = [math]::Round($g.AdapterRAM / 1MB, 0) - Write-Output " $($g.Name)" - Write-Output " VRAM: $vramMB MB" - Write-Output " Driver: $($g.DriverVersion)" -} - -Write-Output "`n=== Storage (system drive) ===" -$disk = Get-WmiObject Win32_LogicalDisk -Filter "DeviceID='C:'" -$freeGB = [math]::Round($disk.FreeSpace / 1GB, 1) -$totalGB = [math]::Round($disk.Size / 1GB, 1) -Write-Output " C: $freeGB GB free / $totalGB GB total" diff --git a/tmp_ollama_bench.py b/tmp_ollama_bench.py deleted file mode 100644 index 0babaab..0000000 --- a/tmp_ollama_bench.py +++ /dev/null @@ -1,124 +0,0 @@ -""" -Throughput benchmark for DESKTOP-0O8A1RL. -Tests: current settings, then pulls GPU layer count while running. -""" -import urllib.request, json, time, subprocess, threading, sys - -PROMPT = ( - "List 10 common Windows troubleshooting steps an IT technician uses daily. " - "One sentence each, numbered." -) - -def get_ps(): - """Check which models are loaded and how many GPU layers.""" - try: - r = urllib.request.urlopen("http://localhost:11434/api/ps", timeout=5) - return json.loads(r.read()) - except: - return {} - -def run_gen(model, num_predict=300): - payload = { - "model": model, - "prompt": PROMPT, - "stream": False, - "options": {"num_predict": num_predict} - } - start = time.time() - req = urllib.request.Request( - "http://localhost:11434/api/generate", - data=json.dumps(payload).encode(), - headers={"Content-Type": "application/json"}, - ) - r = json.loads(urllib.request.urlopen(req, timeout=300).read()) - elapsed = time.time() - start - eval_tok = r.get("eval_count", 0) - eval_ns = r.get("eval_duration", 1) - prompt_tok = r.get("prompt_eval_count", 0) - prompt_ns = r.get("prompt_eval_duration", 1) - gen_tps = eval_tok / (eval_ns / 1e9) if eval_ns else 0 - prompt_tps = prompt_tok / (prompt_ns / 1e9) if prompt_ns else 0 - return { - "elapsed": elapsed, - "eval_tok": eval_tok, - "gen_tps": gen_tps, - "prompt_tok": prompt_tok, - "prompt_tps": prompt_tps, - "response_snippet": r["response"].strip()[:200], - } - -def print_result(label, res, ps_info=None): - print(f"\n {label}") - print(f" Generation: {res['gen_tps']:.1f} tok/s ({res['eval_tok']} tokens in {res['elapsed']:.1f}s)") - print(f" Prompt eval: {res['prompt_tps']:.1f} tok/s ({res['prompt_tok']} tokens)") - if ps_info: - for m in ps_info.get("models", []): - layers = m.get("size_vram", 0) / 1024 / 1024 - total = m.get("size", 0) / 1024 / 1024 - pct = (m.get("size_vram", 0) / m.get("size", 1)) * 100 if m.get("size") else 0 - print(f" GPU VRAM used: {layers:.0f} MB / {total:.0f} MB total ({pct:.0f}% in VRAM)") - -# ── Check initial Ollama env ──────────────────────────────────────────────── -print("=" * 60) -print("DESKTOP-0O8A1RL — Ollama Throughput Benchmark") -print("GPU: RTX 5070 Ti Laptop (4 GB VRAM)") -print("=" * 60) - -# ── Test 1: qwen3:14b current settings ───────────────────────────────────── -print("\n[1] Warming up qwen3:14b...") -# warmup -run_gen("qwen3:14b", num_predict=20) -time.sleep(2) -ps = get_ps() -print(f" ps after warmup: {ps}") - -print("\n[2] qwen3:14b — current settings (300 tokens)") -res14 = run_gen("qwen3:14b", num_predict=300) -ps14 = get_ps() -print_result("qwen3:14b", res14, ps14) - -# Unload -print("\n[3] Unloading model...") -try: - payload = {"model": "qwen3:14b", "keep_alive": 0} - urllib.request.urlopen(urllib.request.Request( - "http://localhost:11434/api/generate", - data=json.dumps(payload).encode(), - headers={"Content-Type": "application/json"} - ), timeout=10) -except: - pass -time.sleep(3) - -# ── Test 2: qwen3.6 current settings ─────────────────────────────────────── -print("\n[4] qwen3.6:latest — current settings (300 tokens)") -print(" (This may take a while to load from disk...)") -run_gen("qwen3.6:latest", num_predict=20) # warmup load -time.sleep(2) -ps36w = get_ps() -print(f" ps after warmup: {ps36w}") -res36 = run_gen("qwen3.6:latest", num_predict=300) -ps36 = get_ps() -print_result("qwen3.6:latest", res36, ps36) - -# ── Summary ───────────────────────────────────────────────────────────────── -print("\n" + "=" * 60) -print("SUMMARY") -print("=" * 60) -print(f" qwen3:14b gen: {res14['gen_tps']:.1f} tok/s prompt: {res14['prompt_tps']:.1f} tok/s") -print(f" qwen3.6 gen: {res36['gen_tps']:.1f} tok/s prompt: {res36['prompt_tps']:.1f} tok/s") -print() -print(" Reference (other machine benchmark from OLLAMA.md):") -print(" qwen3:14b ~66 tok/s") -print(" qwen3.6 ~32 tok/s") -print() - -# VRAM analysis -for label, model_size_gb in [("qwen3:14b", 8.8), ("qwen3.6", 22)]: - vram_gb = 4.0 - pct_gpu = min(vram_gb / model_size_gb * 100, 100) - print(f" {label} ({model_size_gb} GB): {pct_gpu:.0f}% fits in 4 GB VRAM → rest on CPU/RAM") - -print() -print(" Diagnosis: Both models exceed 4 GB VRAM → split CPU/GPU or pure CPU") -print(" Expected impact: 2-4x slower than a machine with sufficient VRAM")