chore: remove scratch benchmark files

Removes tmp_bench_8b.py, tmp_hw_check.ps1, and tmp_ollama_bench.py from DESKTOP-0O8A1RL's qwen3:8b benchmark. The routing decisions and numbers are captured in OLLAMA.md; the scripts were one-off scratch work and don't need to live in the repo. Untracked counterparts on GURU-BEAST-ROG (benchmark_qwen_3_6.py, rescore_qwen.py, qwen-benchmark-2026-05-16.{md,json}) were also removed locally.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 16:54:59 -07:00
parent 97f93dd6d7
commit f157795fb7
3 changed files with 0 additions and 259 deletions
--- a/tmp_bench_8b.py
+++ b/tmp_bench_8b.py
@@ -1,108 +0,0 @@
-import urllib.request, json, time
-
-MODELS = ["qwen3:8b", "qwen3:14b", "qwen3.6:latest"]
-
-TESTS = [
-    ("prose", "List 10 common Windows troubleshooting steps an IT technician uses daily. One sentence each, numbered.", 300),
-    ("classification", "Classify this ticket into ONE of: Hardware, Software, Network, Security, User-Error. Reply with category and one sentence only.\n\nTicket: Client says Outlook keeps asking for password every morning. Started after Windows Update last Tuesday.", 80),
-    ("json", 'Extract from this ticket and return ONLY valid JSON with keys: client_name, issue_summary, urgency (low/medium/high).\n\nTicket: Hi, this is Janet from Cascades Dental. Our front desk PC shows a blue screen every time we open Dentrix. Patients arriving at 9am, need this fixed ASAP.', 150),
-    ("summary", "Summarize in exactly 2 bullet points, under 30 words total:\n\nIncident: GuruRMM watchdog failed to restart service after auto-update due to SCM access denied error and a stuck suppression timer. Service was offline 25 minutes. Fixed by adding sc.exe fallback and clearing suppression on failure.", 100),
-]
-
-def ask(model, prompt, max_tokens):
-    payload = {"model": model, "prompt": prompt, "stream": False,
-               "options": {"num_predict": max_tokens}}
-    start = time.time()
-    req = urllib.request.Request("http://localhost:11434/api/generate",
-        data=json.dumps(payload).encode(), headers={"Content-Type": "application/json"})
-    r = json.loads(urllib.request.urlopen(req, timeout=300).read())
-    elapsed = time.time() - start
-    gen_tok = r.get("eval_count", 0)
-    gen_ns = r.get("eval_duration", 1)
-    tps = gen_tok / (gen_ns / 1e9) if gen_ns else 0
-    raw = r["response"].strip()
-    if "<think>" in raw and "</think>" in raw:
-        visible = raw[raw.index("</think>")+8:].strip()
-    else:
-        visible = raw
-    return visible, elapsed, gen_tok, tps
-
-def get_ps():
-    try:
-        r = urllib.request.urlopen("http://localhost:11434/api/ps", timeout=5)
-        d = json.loads(r.read())
-        models = d.get("models", [])
-        if models:
-            m = models[0]
-            vram = m.get("size_vram", 0) // 1024 // 1024
-            total = m.get("size", 0) // 1024 // 1024
-            pct = int(vram / total * 100) if total else 0
-            return vram, total, pct
-    except:
-        pass
-    return 0, 0, 0
-
-def unload(model):
-    try:
-        payload = {"model": model, "keep_alive": 0}
-        urllib.request.urlopen(urllib.request.Request(
-            "http://localhost:11434/api/generate",
-            data=json.dumps(payload).encode(),
-            headers={"Content-Type": "application/json"}), timeout=10)
-    except:
-        pass
-    time.sleep(2)
-
-results = {}  # model -> {test_name -> (tps, elapsed, visible)}
-
-for model in MODELS:
-    print(f"\n{'='*60}")
-    print(f"MODEL: {model}")
-    print('='*60)
-    model_results = {}
-
-    # Warmup load
-    print("  Loading...", end="", flush=True)
-    ask(model, "hello", 5)
-    vram, total, pct = get_ps()
-    print(f" {vram} MB / {total} MB in VRAM ({pct}%)")
-
-    for test_name, prompt, max_tok in TESTS:
-        visible, elapsed, gen_tok, tps = ask(model, prompt, max_tok)
-        model_results[test_name] = (tps, elapsed, gen_tok, visible)
-        hit_limit = " [HIT LIMIT]" if gen_tok >= max_tok else ""
-        print(f"  [{test_name:14}] {tps:5.1f} tok/s  {elapsed:5.1f}s  {gen_tok} tok{hit_limit}")
-        if test_name == "json":
-            clean = visible.strip().strip("```json").strip("```").strip()
-            try:
-                json.loads(clean)
-                print(f"                   JSON: [OK]")
-            except:
-                print(f"                   JSON: [FAIL] {repr(clean[:60])}")
-
-    results[model] = model_results
-    unload(model)
-
-# Summary table
-print(f"\n{'='*60}")
-print("THROUGHPUT SUMMARY (tok/s)")
-print('='*60)
-print(f"{'Task':<16} {'qwen3:8b':>10} {'qwen3:14b':>10} {'qwen3.6':>10}")
-print("-"*50)
-for test_name, _, _ in TESTS:
-    row = f"{test_name:<16}"
-    for model in MODELS:
-        tps = results[model][test_name][0]
-        row += f" {tps:>10.1f}"
-    print(row)
-
-print()
-print("Reference (full-GPU machine from OLLAMA.md):")
-print("  qwen3:14b ~66 tok/s  |  qwen3.6 ~32 tok/s")
-print()
-
-# VRAM fit analysis
-print("VRAM FIT (12 GB available):")
-for model, size_gb in [("qwen3:8b", 5.2), ("qwen3:14b", 9.3), ("qwen3.6", 23)]:
-    fit = "FITS" if size_gb < 10 else "SPLIT"
-    print(f"  {model:<16} {size_gb:>5.1f} GB  [{fit}]")
--- a/tmp_hw_check.ps1
+++ b/tmp_hw_check.ps1
@@ -1,27 +0,0 @@
-$cpu = Get-WmiObject Win32_Processor | Select-Object -First 1
-$ram = Get-WmiObject Win32_ComputerSystem
-$gpu = Get-WmiObject Win32_VideoController | Where-Object { $_.AdapterRAM -gt 0 }
-$os = Get-WmiObject Win32_OperatingSystem
-
-Write-Output "=== CPU ==="
-Write-Output "  $($cpu.Name)"
-Write-Output "  Cores: $($cpu.NumberOfCores) physical / $($cpu.NumberOfLogicalProcessors) logical"
-Write-Output "  Max MHz: $($cpu.MaxClockSpeed)"
-
-Write-Output "`n=== RAM ==="
-$ramGB = [math]::Round($ram.TotalPhysicalMemory / 1GB, 1)
-Write-Output "  Total: $ramGB GB"
-
-Write-Output "`n=== GPU(s) ==="
-foreach ($g in $gpu) {
-    $vramMB = [math]::Round($g.AdapterRAM / 1MB, 0)
-    Write-Output "  $($g.Name)"
-    Write-Output "  VRAM: $vramMB MB"
-    Write-Output "  Driver: $($g.DriverVersion)"
-}
-
-Write-Output "`n=== Storage (system drive) ==="
-$disk = Get-WmiObject Win32_LogicalDisk -Filter "DeviceID='C:'"
-$freeGB = [math]::Round($disk.FreeSpace / 1GB, 1)
-$totalGB = [math]::Round($disk.Size / 1GB, 1)
-Write-Output "  C: $freeGB GB free / $totalGB GB total"
--- a/tmp_ollama_bench.py
+++ b/tmp_ollama_bench.py
@@ -1,124 +0,0 @@
-"""
-Throughput benchmark for DESKTOP-0O8A1RL.
-Tests: current settings, then pulls GPU layer count while running.
-"""
-import urllib.request, json, time, subprocess, threading, sys
-
-PROMPT = (
-    "List 10 common Windows troubleshooting steps an IT technician uses daily. "
-    "One sentence each, numbered."
-)
-
-def get_ps():
-    """Check which models are loaded and how many GPU layers."""
-    try:
-        r = urllib.request.urlopen("http://localhost:11434/api/ps", timeout=5)
-        return json.loads(r.read())
-    except:
-        return {}
-
-def run_gen(model, num_predict=300):
-    payload = {
-        "model": model,
-        "prompt": PROMPT,
-        "stream": False,
-        "options": {"num_predict": num_predict}
-    }
-    start = time.time()
-    req = urllib.request.Request(
-        "http://localhost:11434/api/generate",
-        data=json.dumps(payload).encode(),
-        headers={"Content-Type": "application/json"},
-    )
-    r = json.loads(urllib.request.urlopen(req, timeout=300).read())
-    elapsed = time.time() - start
-    eval_tok = r.get("eval_count", 0)
-    eval_ns = r.get("eval_duration", 1)
-    prompt_tok = r.get("prompt_eval_count", 0)
-    prompt_ns = r.get("prompt_eval_duration", 1)
-    gen_tps = eval_tok / (eval_ns / 1e9) if eval_ns else 0
-    prompt_tps = prompt_tok / (prompt_ns / 1e9) if prompt_ns else 0
-    return {
-        "elapsed": elapsed,
-        "eval_tok": eval_tok,
-        "gen_tps": gen_tps,
-        "prompt_tok": prompt_tok,
-        "prompt_tps": prompt_tps,
-        "response_snippet": r["response"].strip()[:200],
-    }
-
-def print_result(label, res, ps_info=None):
-    print(f"\n  {label}")
-    print(f"    Generation: {res['gen_tps']:.1f} tok/s ({res['eval_tok']} tokens in {res['elapsed']:.1f}s)")
-    print(f"    Prompt eval: {res['prompt_tps']:.1f} tok/s ({res['prompt_tok']} tokens)")
-    if ps_info:
-        for m in ps_info.get("models", []):
-            layers = m.get("size_vram", 0) / 1024 / 1024
-            total = m.get("size", 0) / 1024 / 1024
-            pct = (m.get("size_vram", 0) / m.get("size", 1)) * 100 if m.get("size") else 0
-            print(f"    GPU VRAM used: {layers:.0f} MB / {total:.0f} MB total ({pct:.0f}% in VRAM)")
-
-# ── Check initial Ollama env ────────────────────────────────────────────────
-print("=" * 60)
-print("DESKTOP-0O8A1RL — Ollama Throughput Benchmark")
-print("GPU: RTX 5070 Ti Laptop (4 GB VRAM)")
-print("=" * 60)
-
-# ── Test 1: qwen3:14b current settings ─────────────────────────────────────
-print("\n[1] Warming up qwen3:14b...")
-# warmup
-run_gen("qwen3:14b", num_predict=20)
-time.sleep(2)
-ps = get_ps()
-print(f"    ps after warmup: {ps}")
-
-print("\n[2] qwen3:14b — current settings (300 tokens)")
-res14 = run_gen("qwen3:14b", num_predict=300)
-ps14 = get_ps()
-print_result("qwen3:14b", res14, ps14)
-
-# Unload
-print("\n[3] Unloading model...")
-try:
-    payload = {"model": "qwen3:14b", "keep_alive": 0}
-    urllib.request.urlopen(urllib.request.Request(
-        "http://localhost:11434/api/generate",
-        data=json.dumps(payload).encode(),
-        headers={"Content-Type": "application/json"}
-    ), timeout=10)
-except:
-    pass
-time.sleep(3)
-
-# ── Test 2: qwen3.6 current settings ───────────────────────────────────────
-print("\n[4] qwen3.6:latest — current settings (300 tokens)")
-print("    (This may take a while to load from disk...)")
-run_gen("qwen3.6:latest", num_predict=20)  # warmup load
-time.sleep(2)
-ps36w = get_ps()
-print(f"    ps after warmup: {ps36w}")
-res36 = run_gen("qwen3.6:latest", num_predict=300)
-ps36 = get_ps()
-print_result("qwen3.6:latest", res36, ps36)
-
-# ── Summary ─────────────────────────────────────────────────────────────────
-print("\n" + "=" * 60)
-print("SUMMARY")
-print("=" * 60)
-print(f"  qwen3:14b  gen: {res14['gen_tps']:.1f} tok/s   prompt: {res14['prompt_tps']:.1f} tok/s")
-print(f"  qwen3.6    gen: {res36['gen_tps']:.1f} tok/s   prompt: {res36['prompt_tps']:.1f} tok/s")
-print()
-print("  Reference (other machine benchmark from OLLAMA.md):")
-print("    qwen3:14b  ~66 tok/s")
-print("    qwen3.6    ~32 tok/s")
-print()
-
-# VRAM analysis
-for label, model_size_gb in [("qwen3:14b", 8.8), ("qwen3.6", 22)]:
-    vram_gb = 4.0
-    pct_gpu = min(vram_gb / model_size_gb * 100, 100)
-    print(f"  {label} ({model_size_gb} GB): {pct_gpu:.0f}% fits in 4 GB VRAM → rest on CPU/RAM")
-
-print()
-print("  Diagnosis: Both models exceed 4 GB VRAM → split CPU/GPU or pure CPU")
-print("  Expected impact: 2-4x slower than a machine with sufficient VRAM")