import urllib.request, json, time MODEL = "qwen3.6:latest" COMPARE = "qwen3:14b" def ask(model, prompt, system=None, max_tokens=2000, no_think=False): if no_think: prompt = "/no_think\n" + prompt payload = { "model": model, "prompt": prompt, "stream": False, "options": {"num_predict": max_tokens} } if system: payload["system"] = system start = time.time() req = urllib.request.Request( "http://localhost:11434/api/generate", data=json.dumps(payload).encode(), headers={"Content-Type": "application/json"}, ) r = json.loads(urllib.request.urlopen(req, timeout=300).read()) elapsed = time.time() - start tokens = r.get("eval_count", 0) dur_ns = r.get("eval_duration", 1) tps = tokens / (dur_ns / 1e9) if dur_ns else 0 raw = r["response"].strip() # Strip thinking block if present if "" in raw and "" in raw: think_content = raw[raw.index("")+7:raw.index("")] visible = raw[raw.index("")+8:].strip() think_tokens = len(think_content.split()) else: visible = raw think_tokens = 0 return visible, elapsed, tokens, tps, think_tokens def hdr(title): print("\n" + "=" * 60) print(title) print("=" * 60) def report(label, resp, t, tokens, tps, think_tok): tag = f" [thinking: ~{think_tok} words]" if think_tok else " [no thinking]" print(f"\n{label}{tag}") print(f"Time: {t:.1f}s | Total tokens: {tokens} | Speed: {tps:.0f} tok/s") print(f"Response:\n{resp[:600]}") # ── TEST 1: Ticket Classification ────────────────────────────────────────── hdr("TEST 1: TICKET CLASSIFICATION") p = ( "Classify this IT support ticket into ONE category: " "Hardware, Software, Network, Security, or User-Error.\n\n" "Ticket: Client says Outlook keeps asking for password every morning. " "Rebooting does not help. Started after a Windows Update last Tuesday.\n\n" "Respond with the category name and one sentence of reasoning." ) for no_think in [False, True]: mode = "no_think" if no_think else "thinking" resp, t, tok, tps, think_tok = ask(MODEL, p, no_think=no_think) report(f"{MODEL} [{mode}]", resp, t, tok, tps, think_tok) # ── TEST 2: JSON Structured Extraction ───────────────────────────────────── hdr("TEST 2: JSON EXTRACTION") p2 = ( "Extract from this ticket and return ONLY a valid JSON object, no explanation:\n" "Fields: client_name, issue_summary, affected_system, urgency (low/medium/high), suggested_action\n\n" "Ticket: Hi, this is Janet from Cascades Dental. Our front desk computer running Windows 10 " "is showing a blue screen every time we open Dentrix. This started this morning and we have " "patients coming in at 9am. We need this fixed ASAP.\n\n" "Return only the JSON object." ) for no_think in [False, True]: mode = "no_think" if no_think else "thinking" resp, t, tok, tps, think_tok = ask(MODEL, p2, no_think=no_think) report(f"{MODEL} [{mode}]", resp, t, tok, tps, think_tok) clean = resp.strip().strip("```json").strip("```").strip() try: parsed = json.loads(clean) print(f" [OK] Valid JSON with keys: {list(parsed.keys())}") except Exception as e: print(f" [FAIL] {e} | raw: {repr(clean[:100])}") # ── TEST 3: Summarization ─────────────────────────────────────────────────── hdr("TEST 3: SUMMARIZATION (no_think only — faster)") p3 = ( "Summarize this incident in 3 bullet points for a client-facing email. " "Professional, non-technical, under 80 words total.\n\n" "Incident: The GuruRMM agent watchdog on a Windows build server failed to restart the main " "agent service after an auto-update because: (1) SCM service.stop() returned access denied, " "(2) suppress_until was set to a future timestamp instead of being cleared on failure, " "causing the watchdog to skip all restart attempts for 25 minutes. " "Fix: sc.exe fallback added for stop, suppress_until cleared on error." ) resp3, t3, tok3, tps3, think3 = ask(MODEL, p3, no_think=True) report(f"{MODEL} [no_think]", resp3, t3, tok3, tps3, think3) # ── TEST 4: Roadmap Classification (JSON) ────────────────────────────────── hdr("TEST 4: FEATURE ROADMAP PLACEMENT (no_think)") p4 = ( "You are classifying a feature request for GuruRMM, an RMM tool for MSPs. " "Roadmap sections: Core Agent Features, Server/API Features, Dashboard & UI, " "Platform & Infrastructure, Integrations, Future Considerations.\n\n" "Feature request: Add ability to remotely enable or disable Windows Defender " "Real-Time Protection on managed endpoints from the dashboard.\n\n" 'Return ONLY this JSON: {"section": "...", "subsection": "...", "priority": "P1|P2|P3", "summary": "..."}' ) resp4, t4, tok4, tps4, think4 = ask(MODEL, p4, no_think=True) report(f"{MODEL} [no_think]", resp4, t4, tok4, tps4, think4) clean4 = resp4.strip().strip("```json").strip("```").strip() try: parsed4 = json.loads(clean4) print(f" [OK] {parsed4}") except Exception as e: print(f" [FAIL] {e}") # ── TEST 5: Instruction Following ─────────────────────────────────────────── hdr("TEST 5: INSTRUCTION FOLLOWING (no_think)") p5 = ( "Do exactly these steps:\n" "1. Write ALPHA on its own line\n" "2. Count letters in 'authenticate' and double that number, write only the result on its own line\n" "3. Write OMEGA on its own line\n" "Output only the three lines." ) resp5, t5, tok5, tps5, think5 = ask(MODEL, p5, no_think=True) report(f"{MODEL} [no_think]", resp5, t5, tok5, tps5, think5) lines = [l.strip() for l in resp5.split("\n") if l.strip()] expected = str(len("authenticate") * 2) ok = "ALPHA" in lines and "OMEGA" in lines and expected in lines print(f" Lines: {lines} -> [{'OK' if ok else 'FAIL'}] (expect ALPHA, {expected}, OMEGA)") # ── TEST 6: Speed head-to-head ────────────────────────────────────────────── hdr("TEST 6: SPEED — qwen3.6 vs qwen3:14b (no_think, same prompt)") speed_p = ( "List 5 common Windows 10 issues an MSP technician sees and one fix for each. Be concise, no intro." ) print(f"Prompt: {speed_p}\n") for model in [MODEL, COMPARE]: nt = model == MODEL # no_think only for 3.6 resp, t, tok, tps, think_tok = ask(model, speed_p, no_think=nt, max_tokens=600) label = f"{model} [{'no_think' if nt else 'default'}]" print(f"{label}: {t:.1f}s | {tok} tokens | {tps:.0f} tok/s") print(resp[:400]) print() # ── TEST 7: Thinking mode — where it shines ───────────────────────────────── hdr("TEST 7: REASONING — where thinking mode should help") p7 = ( "An MSP has 3 technicians. Tech A can complete 4 tickets per hour. " "Tech B can complete 3 tickets per hour. Tech C can complete 2 tickets per hour. " "They have 45 tickets in the queue. Tech A works 8 hours, Tech B works 6 hours, " "Tech C works 4 hours. Will they clear the queue? How many tickets will be left or " "how many ahead of schedule will they finish? Show your work." ) resp7, t7, tok7, tps7, think7 = ask(MODEL, p7, no_think=False, max_tokens=1500) report(f"{MODEL} [thinking]", resp7, t7, tok7, tps7, think7)