chore: remove remaining qwen scratch files

tmp_qwen_reason.py, tmp_qwen_test.py, and tmp_qwen_test2.py are additional
local qwen test scratch files from today's benchmarking work. The routing
decisions are recorded in OLLAMA.md, so the throwaway scripts don't need to ship.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-16 16:56:37 -07:00
parent f157795fb7
commit a6fb8d2ab6
3 changed files with 0 additions and 371 deletions

View File

@@ -1,54 +0,0 @@
import urllib.request, json, time
def ask(model, prompt, max_tokens=6000):
    """Run one non-streaming generation against the local Ollama endpoint.

    Args:
        model: Model tag placed in the request payload.
        prompt: Prompt text, sent unchanged.
        max_tokens: Value sent as the ``num_predict`` option.

    Returns:
        A 6-tuple ``(visible, elapsed, tokens, tps, think_words, think)``:
        the reply text that follows any ``<think>...</think>`` block, the
        wall-clock seconds spent on the request, the reported ``eval_count``,
        tokens per second derived from ``eval_duration`` (treated as
        nanoseconds), the whitespace-separated word count of the thinking
        text, and the thinking text itself ("" when no block is present).

    Raises:
        KeyError: If the decoded reply has no "response" field.
        Any error raised by ``urllib.request.urlopen`` propagates unchanged.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"num_predict": max_tokens},
    }
    start = time.time()
    req = urllib.request.Request(
        "http://localhost:11434/api/generate",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # Close the HTTP response deterministically instead of leaking the socket.
    with urllib.request.urlopen(req, timeout=600) as http_resp:
        r = json.loads(http_resp.read())
    elapsed = time.time() - start

    tokens = r.get("eval_count", 0)
    dur_ns = r.get("eval_duration", 1)
    tps = tokens / (dur_ns / 1e9) if dur_ns else 0

    raw = r["response"].strip()
    # Partitioning guarantees the closing tag is searched for only AFTER the
    # opening tag, so a stray "</think>" earlier in the text cannot produce a
    # nonsensical slice.
    _, open_tag, after_open = raw.partition("<think>")
    think, close_tag, after_close = after_open.partition("</think>")
    if open_tag and close_tag:
        visible = after_close.strip()
        think_words = len(think.split())
    else:
        think = ""
        visible = raw
        think_words = 0
    return visible, elapsed, tokens, tps, think_words, think
# Word problem with a checkable answer: 4*8 + 3*6 + 2*4 = 58 tickets of
# capacity against a queue of 45.
prompt = " ".join([
    "An MSP has 3 technicians.",
    "Tech A can complete 4 tickets per hour.",
    "Tech B can complete 3 tickets per hour.",
    "Tech C can complete 2 tickets per hour.",
    "They have 45 tickets in the queue.",
    "Tech A works 8 hours, Tech B works 6 hours, Tech C works 4 hours.",
    "Will they clear the queue?",
    "How many tickets will be left or how many ahead of schedule will they finish?",
    "Show your work.",
])

print("Running qwen3.6 reasoning test at 6000 token budget...")
print(f"Prompt: {prompt}\n")

banner = "=" * 60
for model in ("qwen3.6:latest", "qwen3:14b"):
    # Per-model header.
    print("\n" + banner)
    print(f"MODEL: {model}")
    print(banner)

    visible, t, tokens, tps, think_words, think = ask(model, prompt)
    print(f"Time: {t:.1f}s | Total tokens: {tokens} | Speed: {tps:.0f} tok/s")

    # A zero word count means ask() found no <think> block in the reply.
    if not think_words:
        print("Thinking: not exposed in output")
    else:
        print(f"Thinking: ~{think_words} words")
        print(f"Thinking excerpt (first 300 chars):\n {think[:300]}...")

    print(f"\nResponse:\n{visible}")

View File

@@ -1,159 +0,0 @@
import urllib.request, json, time, sys
MODEL = "qwen3.6:latest"
COMPARE = "qwen3:14b"
def ask(model, prompt, system=None, max_tokens=400):
    """Run one non-streaming generation against the local Ollama endpoint.

    Args:
        model: Model tag placed in the request payload.
        prompt: Prompt text, sent unchanged.
        system: Optional system prompt; only added to the payload when truthy.
        max_tokens: Value sent as the ``num_predict`` option.

    Returns:
        A 4-tuple ``(response, elapsed, tokens, tps)``: the stripped reply
        text (any thinking tags are left in place), wall-clock seconds for
        the request, the reported ``eval_count``, and tokens per second
        derived from ``eval_duration`` (treated as nanoseconds).

    Raises:
        KeyError: If the decoded reply has no "response" field.
        Any error raised by ``urllib.request.urlopen`` propagates unchanged.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"num_predict": max_tokens},
    }
    if system:
        payload["system"] = system
    start = time.time()
    req = urllib.request.Request(
        "http://localhost:11434/api/generate",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # Close the HTTP response deterministically instead of leaking the socket.
    with urllib.request.urlopen(req, timeout=180) as http_resp:
        r = json.loads(http_resp.read())
    elapsed = time.time() - start
    tokens = r.get("eval_count", 0)
    dur_ns = r.get("eval_duration", 1)
    tps = tokens / (dur_ns / 1e9) if dur_ns else 0
    return r["response"].strip(), elapsed, tokens, tps
def report(label, resp, t, tokens, tps):
    """Print one result: a labelled header, the response, and timing stats.

    The output ends with one blank line so consecutive reports are separated.
    """
    sections = [
        f"\n--- {label} ---",
        f"Response:\n{resp}",
        f"\nTime: {t:.1f}s | Output tokens: {tokens} | Speed: {tps:.0f} tok/s",
        "",  # trailing blank line
    ]
    print("\n".join(sections))
# ── TEST 1: Ticket Classification ──────────────────────────────────────────
# Single-label classification with a one-sentence justification.
print("=" * 60, "TEST 1: TICKET CLASSIFICATION", "=" * 60, sep="\n")
p = "\n\n".join([
    # Task and allowed labels.
    "Classify this IT support ticket into ONE category: "
    "Hardware, Software, Network, Security, or User-Error.",
    # The ticket under test.
    "Ticket: Client says Outlook keeps asking for password every morning. "
    "Rebooting does not help. Started after a Windows Update last Tuesday.",
    # Output format.
    "Respond with the category name and one sentence of reasoning. No thinking tags.",
])
resp, t, tok, tps = ask(MODEL, p)
report(MODEL, resp, t, tok, tps)
# ── TEST 2: JSON Structured Extraction ─────────────────────────────────────
# Asks for a bare JSON object and checks that the reply actually parses.
print("=" * 60)
print("TEST 2: STRUCTURED JSON EXTRACTION")
print("=" * 60)
p2 = (
    "Extract the following from the ticket and return ONLY valid JSON, no explanation:\n"
    "Fields: client_name, issue_summary, affected_system, urgency (low/medium/high), suggested_action\n\n"
    "Ticket: Hi, this is Janet from Cascades Dental. Our front desk computer running Windows 10 "
    "is showing a blue screen every time we open Dentrix. This started this morning and we have "
    "patients coming in at 9am. We need this fixed ASAP.\n\n"
    "Return only the JSON object."
)
resp2, t2, tok2, tps2 = ask(MODEL, p2)
report(MODEL, resp2, t2, tok2, tps2)

# Strip thinking tags if present.
clean = resp2
if "</think>" in clean:
    clean = clean[clean.index("</think>") + 8:].strip()
# Strip a Markdown code fence if present, so a fenced but otherwise valid
# reply is judged on its JSON rather than on the wrapper.
if clean.startswith("```"):
    opener, newline, rest = clean.partition("\n")
    # Multi-line fence: the first line is only the opener (``` plus optional
    # tag). Single-line fence: there is no newline, so just peel backticks.
    clean = rest if newline else opener.lstrip("`")
    clean = clean.rsplit("```", 1)[0].strip()

try:
    parsed = json.loads(clean)
except json.JSONDecodeError as e:
    print(f"[FAIL] JSON parse error: {e}")
else:
    # The prompt asks for an object; a list or scalar is valid JSON but not
    # what was requested, and has no keys to list.
    if isinstance(parsed, dict):
        print(f"[OK] Valid JSON: {list(parsed.keys())}")
    else:
        print(f"[FAIL] JSON parse error: expected an object, got {type(parsed).__name__}")
# ── TEST 3: Summarization ───────────────────────────────────────────────────
# Client-facing rewrite of a technical incident note; judged by reading the output.
print("\n" + "=" * 60, "TEST 3: TECHNICAL SUMMARIZATION", "=" * 60, sep="\n")
p3 = "\n\n".join([
    # Instructions.
    "Summarize the following incident in 3 bullet points for a client-facing email. "
    "Be professional, non-technical, under 80 words total.",
    # Incident text to summarize.
    "Incident: The GuruRMM agent watchdog on Pluto (172.16.3.36) failed to restart the main "
    "agent service after an auto-update because: (1) SCM service.stop() returned access denied, "
    "(2) suppress_until was set to a future timestamp instead of being cleared on failure, "
    "(3) the watchdog then treated the suppression as intentional and skipped all restart attempts "
    "for 25 minutes. Fix: sc.exe fallback added for stop, suppress_until cleared on error.",
])
resp3, t3, tok3, tps3 = ask(MODEL, p3)
report(MODEL, resp3, t3, tok3, tps3)
# ── TEST 4: Code Explanation ────────────────────────────────────────────────
# Short Rust snippet embedded in a fenced block; judged by reading the output.
print("=" * 60, "TEST 4: RUST CODE EXPLANATION", "=" * 60, sep="\n")
p4 = "\n".join([
    "Explain what this Rust code does in 2-3 sentences. Be specific about the error handling strategy.",
    "",
    "```rust",
    "let stop_result = service.stop();",
    "if let Err(e) = stop_result {",
    ' warn!("Watchdog: SCM stop failed ({}), falling back to sc.exe", e);',
    ' let _ = std::process::Command::new("sc.exe")',
    ' .args(["stop", MAIN_SERVICE_NAME])',
    " .status();",
    "}",
    "```",
])
resp4, t4, tok4, tps4 = ask(MODEL, p4)
report(MODEL, resp4, t4, tok4, tps4)
# ── TEST 5: Roadmap Classification ─────────────────────────────────────────
# Asks for a fixed-shape JSON object and checks that the reply parses.
print("=" * 60)
print("TEST 5: FEATURE ROADMAP PLACEMENT (MSP context)")
print("=" * 60)
p5 = (
    "You are helping classify a feature request for GuruRMM, an RMM tool for MSPs. "
    "The roadmap sections are: Core Agent Features, Server/API Features, Dashboard & UI, "
    "Platform & Infrastructure, Integrations, Future Considerations.\n\n"
    "Feature request: Add the ability to remotely enable or disable Windows Defender Real-Time "
    "Protection on managed endpoints from the dashboard.\n\n"
    'Respond with JSON only: {"section": "...", "subsection": "...", "priority": "P1|P2|P3", "summary": "..."}'
)
resp5, t5, tok5, tps5 = ask(MODEL, p5)
report(MODEL, resp5, t5, tok5, tps5)

# Drop everything up to and including the thinking block, if any.
clean5 = resp5
if "</think>" in clean5:
    clean5 = clean5[clean5.index("</think>") + 8:].strip()
# Drop a Markdown code fence, if any. partition() never raises, so a
# single-line fenced reply is handled instead of dying with IndexError.
if clean5.startswith("```"):
    opener, newline, rest = clean5.partition("\n")
    # Multi-line fence: the first line is only the opener (``` plus optional
    # tag). Single-line fence: there is no newline, so just peel backticks.
    clean5 = rest if newline else opener.lstrip("`")
    clean5 = clean5.rsplit("```", 1)[0].strip()

try:
    parsed5 = json.loads(clean5)
except json.JSONDecodeError as e:
    print(f"[FAIL] JSON parse error: {e}")
else:
    print(f"[OK] Valid JSON: {parsed5}")
# ── TEST 6: Instruction Following ───────────────────────────────────────────
# Expects exactly three output lines: ALPHA, 24, OMEGA.
print("\n" + "=" * 60)
print("TEST 6: MULTI-STEP INSTRUCTION FOLLOWING")
print("=" * 60)
p6 = (
    "Do exactly these steps in order:\n"
    "1. Write the word ALPHA on its own line\n"
    "2. Count the letters in the word 'authenticate'\n"
    "3. Write that number doubled on its own line\n"
    "4. Write the word OMEGA on its own line\n"
    "No explanation, no thinking, just the three lines of output."
)
resp6, t6, tok6, tps6 = ask(MODEL, p6)
report(MODEL, resp6, t6, tok6, tps6)

# Judge only the text after the thinking block. Dropping just the tag lines
# would leave the model's reasoning in `lines`, where a stray "ALPHA" or "24"
# could make the check pass for the wrong reason. rpartition() returns the
# whole reply unchanged when there is no closing tag.
answer = resp6.rpartition("</think>")[2]
lines = [line.strip() for line in answer.split("\n") if line.strip()]
print(f"Lines output: {lines}")
expected_num = len("authenticate") * 2  # 12*2=24
ok = "ALPHA" in lines and "OMEGA" in lines and str(expected_num) in lines
print(f"[{'OK' if ok else 'FAIL'}] Expected ALPHA, {expected_num}, OMEGA")
# ── TEST 7: Speed comparison ─────────────────────────────────────────────
# Same prompt through both models; prints timing plus a 200-character preview.
print("\n" + "=" * 60, "TEST 7: SPEED COMPARISON (same prompt, both models)", "=" * 60, sep="\n")
speed_prompt = "List 5 common Windows 10 issues an IT support technician encounters and one fix for each. Be concise."
print(f"Prompt: {speed_prompt}\n")
for model in (MODEL, COMPARE):
    r, t, tok, tps = ask(model, speed_prompt)
    stats_line = f"{model}: {t:.1f}s | {tok} tokens | {tps:.0f} tok/s"
    preview_line = f" {r[:200]}..."
    # Trailing empty item yields the blank separator line between models.
    print(stats_line, preview_line, "", sep="\n")

View File

@@ -1,158 +0,0 @@
import urllib.request, json, time
MODEL = "qwen3.6:latest"
COMPARE = "qwen3:14b"
def ask(model, prompt, system=None, max_tokens=2000, no_think=False):
    """Run one non-streaming generation against the local Ollama endpoint.

    Args:
        model: Model tag placed in the request payload.
        prompt: Prompt text.
        system: Optional system prompt; only added to the payload when truthy.
        max_tokens: Value sent as the ``num_predict`` option.
        no_think: When true, "/no_think" plus a newline is prepended to the
            prompt before sending.

    Returns:
        A 5-tuple ``(visible, elapsed, tokens, tps, think_tokens)``: the reply
        text that follows any ``<think>...</think>`` block, wall-clock seconds
        for the request, the reported ``eval_count``, tokens per second
        derived from ``eval_duration`` (treated as nanoseconds), and the
        size of the thinking text. Despite its name, ``think_tokens`` is a
        whitespace-separated word count, not a tokenizer count.

    Raises:
        KeyError: If the decoded reply has no "response" field.
        Any error raised by ``urllib.request.urlopen`` propagates unchanged.
    """
    if no_think:
        prompt = "/no_think\n" + prompt
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"num_predict": max_tokens},
    }
    if system:
        payload["system"] = system
    start = time.time()
    req = urllib.request.Request(
        "http://localhost:11434/api/generate",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # Close the HTTP response deterministically instead of leaking the socket.
    with urllib.request.urlopen(req, timeout=300) as http_resp:
        r = json.loads(http_resp.read())
    elapsed = time.time() - start
    tokens = r.get("eval_count", 0)
    dur_ns = r.get("eval_duration", 1)
    tps = tokens / (dur_ns / 1e9) if dur_ns else 0

    raw = r["response"].strip()
    # Strip the thinking block if present. Partitioning searches for the
    # closing tag only AFTER the opening tag, so a stray "</think>" earlier
    # in the text cannot produce a nonsensical slice.
    _, open_tag, after_open = raw.partition("<think>")
    think_content, close_tag, after_close = after_open.partition("</think>")
    if open_tag and close_tag:
        visible = after_close.strip()
        think_tokens = len(think_content.split())
    else:
        visible = raw
        think_tokens = 0
    return visible, elapsed, tokens, tps, think_tokens
def hdr(title):
    """Print a section banner: blank line, 60 '=' signs, the title, 60 '=' signs."""
    rule = "=" * 60
    print("\n" + rule, title, rule, sep="\n")
def report(label, resp, t, tokens, tps, think_tok):
    """Print one result: label with a thinking tag, timing stats, and the response.

    The response is cut to its first 600 characters. The thinking tag shows
    the word count when ``think_tok`` is truthy and "[no thinking]" otherwise.
    """
    if think_tok:
        tag = f" [thinking: ~{think_tok} words]"
    else:
        tag = " [no thinking]"
    sections = [
        f"\n{label}{tag}",
        f"Time: {t:.1f}s | Total tokens: {tokens} | Speed: {tps:.0f} tok/s",
        f"Response:\n{resp[:600]}",
    ]
    print("\n".join(sections))
# ── TEST 1: Ticket Classification ──────────────────────────────────────────
# Runs the same prompt twice: once with thinking, once with /no_think.
hdr("TEST 1: TICKET CLASSIFICATION")
p = "\n\n".join([
    # Task and allowed labels.
    "Classify this IT support ticket into ONE category: "
    "Hardware, Software, Network, Security, or User-Error.",
    # The ticket under test.
    "Ticket: Client says Outlook keeps asking for password every morning. "
    "Rebooting does not help. Started after a Windows Update last Tuesday.",
    # Output format.
    "Respond with the category name and one sentence of reasoning.",
])
for no_think in (False, True):
    if no_think:
        mode = "no_think"
    else:
        mode = "thinking"
    resp, t, tok, tps, think_tok = ask(MODEL, p, no_think=no_think)
    report(f"{MODEL} [{mode}]", resp, t, tok, tps, think_tok)
# ── TEST 2: JSON Structured Extraction ─────────────────────────────────────
# Runs with and without thinking, then checks that each reply parses as a
# JSON object.
hdr("TEST 2: JSON EXTRACTION")
p2 = (
    "Extract from this ticket and return ONLY a valid JSON object, no explanation:\n"
    "Fields: client_name, issue_summary, affected_system, urgency (low/medium/high), suggested_action\n\n"
    "Ticket: Hi, this is Janet from Cascades Dental. Our front desk computer running Windows 10 "
    "is showing a blue screen every time we open Dentrix. This started this morning and we have "
    "patients coming in at 9am. We need this fixed ASAP.\n\n"
    "Return only the JSON object."
)
for no_think in [False, True]:
    mode = "no_think" if no_think else "thinking"
    resp, t, tok, tps, think_tok = ask(MODEL, p2, no_think=no_think)
    report(f"{MODEL} [{mode}]", resp, t, tok, tps, think_tok)

    # Remove a Markdown fence by exact prefix/suffix. str.strip("```json")
    # would treat its argument as a SET of characters and could eat real
    # leading/trailing letters, corrupting both the payload and the
    # diagnostic printed on failure.
    clean = resp.strip()
    if clean.startswith("```"):
        clean = clean[3:]
        if clean.startswith("json"):
            clean = clean[4:]
    if clean.endswith("```"):
        clean = clean[:-3]
    clean = clean.strip()

    try:
        parsed = json.loads(clean)
    except json.JSONDecodeError as e:
        print(f" [FAIL] {e} | raw: {repr(clean[:100])}")
    else:
        # Valid JSON that is not an object has no keys to report.
        if isinstance(parsed, dict):
            print(f" [OK] Valid JSON with keys: {list(parsed.keys())}")
        else:
            print(f" [FAIL] expected a JSON object, got {type(parsed).__name__} | raw: {repr(clean[:100])}")
# ── TEST 3: Summarization ───────────────────────────────────────────────────
# Client-facing rewrite of a technical incident note, run with /no_think only.
hdr("TEST 3: SUMMARIZATION (no_think only — faster)")
p3 = "\n\n".join([
    # Instructions.
    "Summarize this incident in 3 bullet points for a client-facing email. "
    "Professional, non-technical, under 80 words total.",
    # Incident text to summarize.
    "Incident: The GuruRMM agent watchdog on a Windows build server failed to restart the main "
    "agent service after an auto-update because: (1) SCM service.stop() returned access denied, "
    "(2) suppress_until was set to a future timestamp instead of being cleared on failure, "
    "causing the watchdog to skip all restart attempts for 25 minutes. "
    "Fix: sc.exe fallback added for stop, suppress_until cleared on error.",
])
resp3, t3, tok3, tps3, think3 = ask(MODEL, p3, no_think=True)
report(f"{MODEL} [no_think]", resp3, t3, tok3, tps3, think3)
# ── TEST 4: Roadmap Classification (JSON) ──────────────────────────────────
# Asks for a fixed-shape JSON object and checks that the reply parses.
hdr("TEST 4: FEATURE ROADMAP PLACEMENT (no_think)")
p4 = (
    "You are classifying a feature request for GuruRMM, an RMM tool for MSPs. "
    "Roadmap sections: Core Agent Features, Server/API Features, Dashboard & UI, "
    "Platform & Infrastructure, Integrations, Future Considerations.\n\n"
    "Feature request: Add ability to remotely enable or disable Windows Defender "
    "Real-Time Protection on managed endpoints from the dashboard.\n\n"
    'Return ONLY this JSON: {"section": "...", "subsection": "...", "priority": "P1|P2|P3", "summary": "..."}'
)
resp4, t4, tok4, tps4, think4 = ask(MODEL, p4, no_think=True)
report(f"{MODEL} [no_think]", resp4, t4, tok4, tps4, think4)

# Remove a Markdown fence by exact prefix/suffix. str.strip("```json") would
# treat its argument as a SET of characters and could eat real leading or
# trailing letters of the payload.
clean4 = resp4.strip()
if clean4.startswith("```"):
    clean4 = clean4[3:]
    if clean4.startswith("json"):
        clean4 = clean4[4:]
if clean4.endswith("```"):
    clean4 = clean4[:-3]
clean4 = clean4.strip()

try:
    parsed4 = json.loads(clean4)
except json.JSONDecodeError as e:
    print(f" [FAIL] {e}")
else:
    print(f" [OK] {parsed4}")
# ── TEST 5: Instruction Following ───────────────────────────────────────────
# Expects three output lines: ALPHA, the doubled letter count, OMEGA.
hdr("TEST 5: INSTRUCTION FOLLOWING (no_think)")
p5 = "\n".join([
    "Do exactly these steps:",
    "1. Write ALPHA on its own line",
    "2. Count letters in 'authenticate' and double that number, write only the result on its own line",
    "3. Write OMEGA on its own line",
    "Output only the three lines.",
])
resp5, t5, tok5, tps5, think5 = ask(MODEL, p5, no_think=True)
report(f"{MODEL} [no_think]", resp5, t5, tok5, tps5, think5)

# Non-empty lines of the reply, each stripped of surrounding whitespace.
lines = list(filter(None, (chunk.strip() for chunk in resp5.split("\n"))))
expected = str(len("authenticate") * 2)
# Membership only: every required line must appear; order is not checked.
ok = all(needed in lines for needed in ("ALPHA", "OMEGA", expected))
verdict = "OK" if ok else "FAIL"
print(f" Lines: {lines} -> [{verdict}] (expect ALPHA, {expected}, OMEGA)")
# ── TEST 6: Speed head-to-head ──────────────────────────────────────────────
# Same prompt through both models with a 600-token cap; prints timing and a
# 400-character preview of each reply.
hdr("TEST 6: SPEED — qwen3.6 vs qwen3:14b (no_think, same prompt)")
speed_p = "List 5 common Windows 10 issues an MSP technician sees and one fix for each. Be concise, no intro."
print(f"Prompt: {speed_p}\n")
for model in (MODEL, COMPARE):
    # /no_think is applied only to the primary model; the comparison model
    # runs with its defaults.
    nt = (model == MODEL)
    if nt:
        label = f"{model} [no_think]"
    else:
        label = f"{model} [default]"
    resp, t, tok, tps, think_tok = ask(model, speed_p, no_think=nt, max_tokens=600)
    # Trailing empty item yields the blank separator line between models.
    print(f"{label}: {t:.1f}s | {tok} tokens | {tps:.0f} tok/s", resp[:400], "", sep="\n")
# ── TEST 7: Thinking mode — where it shines ─────────────────────────────────
# Multi-step arithmetic word problem run with thinking enabled and a
# 1500-token cap. Capacity is 4*8 + 3*6 + 2*4 = 58 against a queue of 45.
hdr("TEST 7: REASONING — where thinking mode should help")
p7 = " ".join([
    "An MSP has 3 technicians.",
    "Tech A can complete 4 tickets per hour.",
    "Tech B can complete 3 tickets per hour.",
    "Tech C can complete 2 tickets per hour.",
    "They have 45 tickets in the queue.",
    "Tech A works 8 hours, Tech B works 6 hours, Tech C works 4 hours.",
    "Will they clear the queue?",
    "How many tickets will be left or how many ahead of schedule will they finish?",
    "Show your work.",
])
resp7, t7, tok7, tps7, think7 = ask(MODEL, p7, no_think=False, max_tokens=1500)
report(f"{MODEL} [thinking]", resp7, t7, tok7, tps7, think7)