chore: remove remaining qwen scratch files

tmp_qwen_reason.py, tmp_qwen_test.py, tmp_qwen_test2.py — additional local qwen test scratch from today's benchmarking work. The routing decisions live in OLLAMA.md; the throwaway scripts don't need to ship.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 16:56:37 -07:00
parent f157795fb7
commit a6fb8d2ab6
3 changed files with 0 additions and 371 deletions
--- a/tmp_qwen_reason.py
+++ b/tmp_qwen_reason.py
@@ -1,54 +0,0 @@
-import urllib.request, json, time
-
-def ask(model, prompt, max_tokens=6000):
-    payload = {
-        "model": model,
-        "prompt": prompt,
-        "stream": False,
-        "options": {"num_predict": max_tokens}
-    }
-    start = time.time()
-    req = urllib.request.Request(
-        "http://localhost:11434/api/generate",
-        data=json.dumps(payload).encode(),
-        headers={"Content-Type": "application/json"},
-    )
-    r = json.loads(urllib.request.urlopen(req, timeout=600).read())
-    elapsed = time.time() - start
-    tokens = r.get("eval_count", 0)
-    dur_ns = r.get("eval_duration", 1)
-    tps = tokens / (dur_ns / 1e9) if dur_ns else 0
-    raw = r["response"].strip()
-    if "<think>" in raw and "</think>" in raw:
-        think = raw[raw.index("<think>")+7 : raw.index("</think>")]
-        visible = raw[raw.index("</think>")+8:].strip()
-        think_words = len(think.split())
-    else:
-        think = ""
-        visible = raw
-        think_words = 0
-    return visible, elapsed, tokens, tps, think_words, think
-
-prompt = (
-    "An MSP has 3 technicians. Tech A can complete 4 tickets per hour. "
-    "Tech B can complete 3 tickets per hour. Tech C can complete 2 tickets per hour. "
-    "They have 45 tickets in the queue. Tech A works 8 hours, Tech B works 6 hours, "
-    "Tech C works 4 hours. Will they clear the queue? How many tickets will be left or "
-    "how many ahead of schedule will they finish? Show your work."
-)
-
-print("Running qwen3.6 reasoning test at 6000 token budget...")
-print(f"Prompt: {prompt}\n")
-
-for model in ["qwen3.6:latest", "qwen3:14b"]:
-    print(f"\n{'='*60}")
-    print(f"MODEL: {model}")
-    print('='*60)
-    visible, t, tokens, tps, think_words, think = ask(model, prompt)
-    print(f"Time: {t:.1f}s | Total tokens: {tokens} | Speed: {tps:.0f} tok/s")
-    if think_words:
-        print(f"Thinking: ~{think_words} words")
-        print(f"Thinking excerpt (first 300 chars):\n  {think[:300]}...")
-    else:
-        print("Thinking: not exposed in output")
-    print(f"\nResponse:\n{visible}")
--- a/tmp_qwen_test.py
+++ b/tmp_qwen_test.py
@@ -1,159 +0,0 @@
-import urllib.request, json, time, sys
-
-MODEL = "qwen3.6:latest"
-COMPARE = "qwen3:14b"
-
-def ask(model, prompt, system=None, max_tokens=400):
-    payload = {"model": model, "prompt": prompt, "stream": False, "options": {"num_predict": max_tokens}}
-    if system:
-        payload["system"] = system
-    start = time.time()
-    req = urllib.request.Request(
-        "http://localhost:11434/api/generate",
-        data=json.dumps(payload).encode(),
-        headers={"Content-Type": "application/json"},
-    )
-    r = json.loads(urllib.request.urlopen(req, timeout=180).read())
-    elapsed = time.time() - start
-    tokens = r.get("eval_count", 0)
-    dur_ns = r.get("eval_duration", 1)
-    tps = tokens / (dur_ns / 1e9) if dur_ns else 0
-    return r["response"].strip(), elapsed, tokens, tps
-
-def report(label, resp, t, tokens, tps):
-    print(f"\n--- {label} ---")
-    print(f"Response:\n{resp}")
-    print(f"\nTime: {t:.1f}s | Output tokens: {tokens} | Speed: {tps:.0f} tok/s")
-    print()
-
-# ── TEST 1: Ticket Classification ──────────────────────────────────────────
-print("=" * 60)
-print("TEST 1: TICKET CLASSIFICATION")
-print("=" * 60)
-p = (
-    "Classify this IT support ticket into ONE category: "
-    "Hardware, Software, Network, Security, or User-Error.\n\n"
-    "Ticket: Client says Outlook keeps asking for password every morning. "
-    "Rebooting does not help. Started after a Windows Update last Tuesday.\n\n"
-    "Respond with the category name and one sentence of reasoning. No thinking tags."
-)
-resp, t, tok, tps = ask(MODEL, p)
-report(MODEL, resp, t, tok, tps)
-
-# ── TEST 2: JSON Structured Extraction ─────────────────────────────────────
-print("=" * 60)
-print("TEST 2: STRUCTURED JSON EXTRACTION")
-print("=" * 60)
-p2 = (
-    "Extract the following from the ticket and return ONLY valid JSON, no explanation:\n"
-    "Fields: client_name, issue_summary, affected_system, urgency (low/medium/high), suggested_action\n\n"
-    "Ticket: Hi, this is Janet from Cascades Dental. Our front desk computer running Windows 10 "
-    "is showing a blue screen every time we open Dentrix. This started this morning and we have "
-    "patients coming in at 9am. We need this fixed ASAP.\n\n"
-    "Return only the JSON object."
-)
-resp2, t2, tok2, tps2 = ask(MODEL, p2)
-report(MODEL, resp2, t2, tok2, tps2)
-try:
-    # strip thinking tags if present
-    clean = resp2
-    if "</think>" in clean:
-        clean = clean[clean.index("</think>") + 8:].strip()
-    parsed = json.loads(clean)
-    print(f"[OK] Valid JSON: {list(parsed.keys())}")
-except Exception as e:
-    print(f"[FAIL] JSON parse error: {e}")
-
-# ── TEST 3: Summarization ───────────────────────────────────────────────────
-print("\n" + "=" * 60)
-print("TEST 3: TECHNICAL SUMMARIZATION")
-print("=" * 60)
-p3 = (
-    "Summarize the following incident in 3 bullet points for a client-facing email. "
-    "Be professional, non-technical, under 80 words total.\n\n"
-    "Incident: The GuruRMM agent watchdog on Pluto (172.16.3.36) failed to restart the main "
-    "agent service after an auto-update because: (1) SCM service.stop() returned access denied, "
-    "(2) suppress_until was set to a future timestamp instead of being cleared on failure, "
-    "(3) the watchdog then treated the suppression as intentional and skipped all restart attempts "
-    "for 25 minutes. Fix: sc.exe fallback added for stop, suppress_until cleared on error."
-)
-resp3, t3, tok3, tps3 = ask(MODEL, p3)
-report(MODEL, resp3, t3, tok3, tps3)
-
-# ── TEST 4: Code Explanation ────────────────────────────────────────────────
-print("=" * 60)
-print("TEST 4: RUST CODE EXPLANATION")
-print("=" * 60)
-p4 = (
-    "Explain what this Rust code does in 2-3 sentences. Be specific about the error handling strategy.\n\n"
-    "```rust\n"
-    "let stop_result = service.stop();\n"
-    "if let Err(e) = stop_result {\n"
-    "    warn!(\"Watchdog: SCM stop failed ({}), falling back to sc.exe\", e);\n"
-    "    let _ = std::process::Command::new(\"sc.exe\")\n"
-    "        .args([\"stop\", MAIN_SERVICE_NAME])\n"
-    "        .status();\n"
-    "}\n"
-    "```"
-)
-resp4, t4, tok4, tps4 = ask(MODEL, p4)
-report(MODEL, resp4, t4, tok4, tps4)
-
-# ── TEST 5: Roadmap Classification ─────────────────────────────────────────
-print("=" * 60)
-print("TEST 5: FEATURE ROADMAP PLACEMENT (MSP context)")
-print("=" * 60)
-p5 = (
-    "You are helping classify a feature request for GuruRMM, an RMM tool for MSPs. "
-    "The roadmap sections are: Core Agent Features, Server/API Features, Dashboard & UI, "
-    "Platform & Infrastructure, Integrations, Future Considerations.\n\n"
-    "Feature request: Add the ability to remotely enable or disable Windows Defender Real-Time "
-    "Protection on managed endpoints from the dashboard.\n\n"
-    'Respond with JSON only: {"section": "...", "subsection": "...", "priority": "P1|P2|P3", "summary": "..."}'
-)
-resp5, t5, tok5, tps5 = ask(MODEL, p5)
-report(MODEL, resp5, t5, tok5, tps5)
-try:
-    clean5 = resp5
-    if "</think>" in clean5:
-        clean5 = clean5[clean5.index("</think>") + 8:].strip()
-    if clean5.startswith("```"):
-        clean5 = clean5.split("\n", 1)[1].rsplit("```", 1)[0].strip()
-    parsed5 = json.loads(clean5)
-    print(f"[OK] Valid JSON: {parsed5}")
-except Exception as e:
-    print(f"[FAIL] JSON parse error: {e}")
-
-# ── TEST 6: Instruction Following ───────────────────────────────────────────
-print("\n" + "=" * 60)
-print("TEST 6: MULTI-STEP INSTRUCTION FOLLOWING")
-print("=" * 60)
-p6 = (
-    "Do exactly these steps in order:\n"
-    "1. Write the word ALPHA on its own line\n"
-    "2. Count the letters in the word 'authenticate'\n"
-    "3. Write that number doubled on its own line\n"
-    "4. Write the word OMEGA on its own line\n"
-    "No explanation, no thinking, just the three lines of output."
-)
-resp6, t6, tok6, tps6 = ask(MODEL, p6)
-report(MODEL, resp6, t6, tok6, tps6)
-lines = [l.strip() for l in resp6.split("\n") if l.strip()]
-if "</think>" in resp6:
-    lines = [l for l in lines if not l.startswith("<")]
-print(f"Lines output: {lines}")
-expected_num = len("authenticate") * 2  # 12*2=24
-ok = "ALPHA" in lines and "OMEGA" in lines and str(expected_num) in lines
-print(f"[{'OK' if ok else 'FAIL'}] Expected ALPHA, {expected_num}, OMEGA")
-
-# ── TEST 7: Speed comparison ─────────────────────────────────────────────
-print("\n" + "=" * 60)
-print("TEST 7: SPEED COMPARISON (same prompt, both models)")
-print("=" * 60)
-speed_prompt = "List 5 common Windows 10 issues an IT support technician encounters and one fix for each. Be concise."
-print(f"Prompt: {speed_prompt}\n")
-for model in [MODEL, COMPARE]:
-    r, t, tok, tps = ask(model, speed_prompt)
-    print(f"{model}: {t:.1f}s | {tok} tokens | {tps:.0f} tok/s")
-    print(f"  {r[:200]}...")
-    print()
--- a/tmp_qwen_test2.py
+++ b/tmp_qwen_test2.py
@@ -1,158 +0,0 @@
-import urllib.request, json, time
-
-MODEL = "qwen3.6:latest"
-COMPARE = "qwen3:14b"
-
-def ask(model, prompt, system=None, max_tokens=2000, no_think=False):
-    if no_think:
-        prompt = "/no_think\n" + prompt
-    payload = {
-        "model": model,
-        "prompt": prompt,
-        "stream": False,
-        "options": {"num_predict": max_tokens}
-    }
-    if system:
-        payload["system"] = system
-    start = time.time()
-    req = urllib.request.Request(
-        "http://localhost:11434/api/generate",
-        data=json.dumps(payload).encode(),
-        headers={"Content-Type": "application/json"},
-    )
-    r = json.loads(urllib.request.urlopen(req, timeout=300).read())
-    elapsed = time.time() - start
-    tokens = r.get("eval_count", 0)
-    dur_ns = r.get("eval_duration", 1)
-    tps = tokens / (dur_ns / 1e9) if dur_ns else 0
-    raw = r["response"].strip()
-    # Strip thinking block if present
-    if "<think>" in raw and "</think>" in raw:
-        think_content = raw[raw.index("<think>")+7:raw.index("</think>")]
-        visible = raw[raw.index("</think>")+8:].strip()
-        think_tokens = len(think_content.split())
-    else:
-        visible = raw
-        think_tokens = 0
-    return visible, elapsed, tokens, tps, think_tokens
-
-def hdr(title):
-    print("\n" + "=" * 60)
-    print(title)
-    print("=" * 60)
-
-def report(label, resp, t, tokens, tps, think_tok):
-    tag = f"  [thinking: ~{think_tok} words]" if think_tok else "  [no thinking]"
-    print(f"\n{label}{tag}")
-    print(f"Time: {t:.1f}s | Total tokens: {tokens} | Speed: {tps:.0f} tok/s")
-    print(f"Response:\n{resp[:600]}")
-
-# ── TEST 1: Ticket Classification ──────────────────────────────────────────
-hdr("TEST 1: TICKET CLASSIFICATION")
-p = (
-    "Classify this IT support ticket into ONE category: "
-    "Hardware, Software, Network, Security, or User-Error.\n\n"
-    "Ticket: Client says Outlook keeps asking for password every morning. "
-    "Rebooting does not help. Started after a Windows Update last Tuesday.\n\n"
-    "Respond with the category name and one sentence of reasoning."
-)
-for no_think in [False, True]:
-    mode = "no_think" if no_think else "thinking"
-    resp, t, tok, tps, think_tok = ask(MODEL, p, no_think=no_think)
-    report(f"{MODEL} [{mode}]", resp, t, tok, tps, think_tok)
-
-# ── TEST 2: JSON Structured Extraction ─────────────────────────────────────
-hdr("TEST 2: JSON EXTRACTION")
-p2 = (
-    "Extract from this ticket and return ONLY a valid JSON object, no explanation:\n"
-    "Fields: client_name, issue_summary, affected_system, urgency (low/medium/high), suggested_action\n\n"
-    "Ticket: Hi, this is Janet from Cascades Dental. Our front desk computer running Windows 10 "
-    "is showing a blue screen every time we open Dentrix. This started this morning and we have "
-    "patients coming in at 9am. We need this fixed ASAP.\n\n"
-    "Return only the JSON object."
-)
-for no_think in [False, True]:
-    mode = "no_think" if no_think else "thinking"
-    resp, t, tok, tps, think_tok = ask(MODEL, p2, no_think=no_think)
-    report(f"{MODEL} [{mode}]", resp, t, tok, tps, think_tok)
-    clean = resp.strip().strip("```json").strip("```").strip()
-    try:
-        parsed = json.loads(clean)
-        print(f"  [OK] Valid JSON with keys: {list(parsed.keys())}")
-    except Exception as e:
-        print(f"  [FAIL] {e} | raw: {repr(clean[:100])}")
-
-# ── TEST 3: Summarization ───────────────────────────────────────────────────
-hdr("TEST 3: SUMMARIZATION (no_think only — faster)")
-p3 = (
-    "Summarize this incident in 3 bullet points for a client-facing email. "
-    "Professional, non-technical, under 80 words total.\n\n"
-    "Incident: The GuruRMM agent watchdog on a Windows build server failed to restart the main "
-    "agent service after an auto-update because: (1) SCM service.stop() returned access denied, "
-    "(2) suppress_until was set to a future timestamp instead of being cleared on failure, "
-    "causing the watchdog to skip all restart attempts for 25 minutes. "
-    "Fix: sc.exe fallback added for stop, suppress_until cleared on error."
-)
-resp3, t3, tok3, tps3, think3 = ask(MODEL, p3, no_think=True)
-report(f"{MODEL} [no_think]", resp3, t3, tok3, tps3, think3)
-
-# ── TEST 4: Roadmap Classification (JSON) ──────────────────────────────────
-hdr("TEST 4: FEATURE ROADMAP PLACEMENT (no_think)")
-p4 = (
-    "You are classifying a feature request for GuruRMM, an RMM tool for MSPs. "
-    "Roadmap sections: Core Agent Features, Server/API Features, Dashboard & UI, "
-    "Platform & Infrastructure, Integrations, Future Considerations.\n\n"
-    "Feature request: Add ability to remotely enable or disable Windows Defender "
-    "Real-Time Protection on managed endpoints from the dashboard.\n\n"
-    'Return ONLY this JSON: {"section": "...", "subsection": "...", "priority": "P1|P2|P3", "summary": "..."}'
-)
-resp4, t4, tok4, tps4, think4 = ask(MODEL, p4, no_think=True)
-report(f"{MODEL} [no_think]", resp4, t4, tok4, tps4, think4)
-clean4 = resp4.strip().strip("```json").strip("```").strip()
-try:
-    parsed4 = json.loads(clean4)
-    print(f"  [OK] {parsed4}")
-except Exception as e:
-    print(f"  [FAIL] {e}")
-
-# ── TEST 5: Instruction Following ───────────────────────────────────────────
-hdr("TEST 5: INSTRUCTION FOLLOWING (no_think)")
-p5 = (
-    "Do exactly these steps:\n"
-    "1. Write ALPHA on its own line\n"
-    "2. Count letters in 'authenticate' and double that number, write only the result on its own line\n"
-    "3. Write OMEGA on its own line\n"
-    "Output only the three lines."
-)
-resp5, t5, tok5, tps5, think5 = ask(MODEL, p5, no_think=True)
-report(f"{MODEL} [no_think]", resp5, t5, tok5, tps5, think5)
-lines = [l.strip() for l in resp5.split("\n") if l.strip()]
-expected = str(len("authenticate") * 2)
-ok = "ALPHA" in lines and "OMEGA" in lines and expected in lines
-print(f"  Lines: {lines} -> [{'OK' if ok else 'FAIL'}] (expect ALPHA, {expected}, OMEGA)")
-
-# ── TEST 6: Speed head-to-head ──────────────────────────────────────────────
-hdr("TEST 6: SPEED — qwen3.6 vs qwen3:14b (no_think, same prompt)")
-speed_p = (
-    "List 5 common Windows 10 issues an MSP technician sees and one fix for each. Be concise, no intro."
-)
-print(f"Prompt: {speed_p}\n")
-for model in [MODEL, COMPARE]:
-    nt = model == MODEL  # no_think only for 3.6
-    resp, t, tok, tps, think_tok = ask(model, speed_p, no_think=nt, max_tokens=600)
-    label = f"{model} [{'no_think' if nt else 'default'}]"
-    print(f"{label}: {t:.1f}s | {tok} tokens | {tps:.0f} tok/s")
-    print(resp[:400])
-    print()
-
-# ── TEST 7: Thinking mode — where it shines ─────────────────────────────────
-hdr("TEST 7: REASONING — where thinking mode should help")
-p7 = (
-    "An MSP has 3 technicians. Tech A can complete 4 tickets per hour. "
-    "Tech B can complete 3 tickets per hour. Tech C can complete 2 tickets per hour. "
-    "They have 45 tickets in the queue. Tech A works 8 hours, Tech B works 6 hours, "
-    "Tech C works 4 hours. Will they clear the queue? How many tickets will be left or "
-    "how many ahead of schedule will they finish? Show your work."
-)
-resp7, t7, tok7, tps7, think7 = ask(MODEL, p7, no_think=False, max_tokens=1500)
-report(f"{MODEL} [thinking]", resp7, t7, tok7, tps7, think7)