"""Small benchmark script: send one reasoning prompt to local Ollama models.

For each model the script prints wall-clock time, generated-token count,
generation speed, and (when the model emits a ``<think>...</think>`` section
in its output) the size and an excerpt of that reasoning text.
"""

import json
import time
import urllib.request

# Endpoint of the local generate API that ask() posts to.
GENERATE_URL = "http://localhost:11434/api/generate"

# Markers that delimit the reasoning section inside the raw response text.
THINK_OPEN = "<think>"
THINK_CLOSE = "</think>"


def split_thinking(raw):
    """Split raw model output into ``(think, visible)``.

    Args:
        raw: The response text, possibly containing one
            ``<think>...</think>`` section.

    Returns:
        A ``(think, visible)`` tuple.  ``think`` is the text between the
        first ``<think>`` and the first ``</think>`` that follows it;
        ``visible`` is everything after that closing tag, stripped.  When a
        complete tag pair is not present, ``think`` is ``""`` and ``visible``
        is ``raw`` unchanged.
    """
    _, opener, rest = raw.partition(THINK_OPEN)
    if not opener:
        return "", raw
    # Look for the closer only *after* the opener so a stray closing tag
    # earlier in the text cannot produce a nonsensical slice.
    think, closer, after = rest.partition(THINK_CLOSE)
    if not closer:
        return "", raw
    return think, after.strip()


def ask(model, prompt, max_tokens=6000):
    """Send ``prompt`` to ``model`` via the local generate API and time it.

    Args:
        model: Model name placed in the request payload.
        prompt: Prompt text placed in the request payload.
        max_tokens: Value sent as ``options.num_predict``.

    Returns:
        A tuple ``(visible, elapsed, tokens, tps, think_words, think)``:
        the answer text with any ``<think>`` section removed, wall-clock
        seconds for the request, the response's ``eval_count`` (0 if absent),
        tokens per second derived from ``eval_count`` and ``eval_duration``
        (treated as nanoseconds), the whitespace-separated word count of the
        reasoning text, and the reasoning text itself (``""`` if none).

    Raises:
        urllib.error.URLError: If the request fails or times out.
        KeyError: If the decoded response has no ``"response"`` field.
    """
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"num_predict": max_tokens},
    }
    start = time.time()
    req = urllib.request.Request(
        GENERATE_URL,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # The context manager closes the HTTP response even if decoding fails.
    with urllib.request.urlopen(req, timeout=600) as resp:
        r = json.loads(resp.read())
    elapsed = time.time() - start

    tokens = r.get("eval_count", 0)
    dur_ns = r.get("eval_duration", 1)
    # Guard against a reported duration of 0 to avoid dividing by zero.
    tps = tokens / (dur_ns / 1e9) if dur_ns else 0

    raw = r["response"].strip()
    think, visible = split_thinking(raw)
    think_words = len(think.split())
    return visible, elapsed, tokens, tps, think_words, think


prompt = (
    "An MSP has 3 technicians. Tech A can complete 4 tickets per hour. "
    "Tech B can complete 3 tickets per hour. Tech C can complete 2 tickets per hour. "
    "They have 45 tickets in the queue. Tech A works 8 hours, Tech B works 6 hours, "
    "Tech C works 4 hours. Will they clear the queue? How many tickets will be left or "
    "how many ahead of schedule will they finish? Show your work."
)


def main():
    """Run the prompt against each configured model and print a report."""
    print("Running qwen3.6 reasoning test at 6000 token budget...")
    print(f"Prompt: {prompt}\n")

    for model in ["qwen3.6:latest", "qwen3:14b"]:
        print(f"\n{'='*60}")
        print(f"MODEL: {model}")
        print('='*60)
        visible, t, tokens, tps, think_words, think = ask(model, prompt)
        print(f"Time: {t:.1f}s | Total tokens: {tokens} | Speed: {tps:.0f} tok/s")
        if think_words:
            print(f"Thinking: ~{think_words} words")
            print(f"Thinking excerpt (first 300 chars):\n {think[:300]}...")
        else:
            print("Thinking: not exposed in output")
        print(f"\nResponse:\n{visible}")


# Guarded so importing this module does not fire network requests.
if __name__ == "__main__":
    main()