#!/usr/bin/env python3
import json
import os
import time
from statistics import mean

import requests
# Ollama chat endpoint on the default local port.
URL = "http://127.0.0.1:11434/api/chat"
# Model tag to benchmark; must already be pulled into the local Ollama instance.
MODEL = "glm-4.7-flash-128k"
# Eight chained coding prompts (in Chinese): each round builds on the previous
# answer, so the conversation grows and stresses the context window.
# NOTE(review): prompts are runtime payload strings — do not reword/translate.
SCENARIO = [
"你是资深Python工程师。请实现一个LRUCache类(get/put,O(1)),并附上类型标注。",
"基于上一步代码,补充线程安全支持(使用RLock),并说明性能影响。",
"增加TTL过期机制,要求懒清理+写入时清理,给出关键测试用例。",
"把核心逻辑重构为两个类:Storage与Policy,保证接口不变。",
"增加一个命令行演示入口:支持put/get/dump命令。",
"修复潜在bug:高并发下TTL扫描可能导致长时间持锁,请优化。",
"把项目拆成3个文件并给出最小可运行目录结构。",
"最后给出pytest测试代码(覆盖率目标>90%)。"
]
  18. PROFILES = [
  19. {"name": "ctx32k_temp0.4", "options": {"num_ctx": 32768, "temperature": 0.4, "num_predict": 1024}},
  20. {"name": "ctx64k_temp0.4", "options": {"num_ctx": 65536, "temperature": 0.4, "num_predict": 1024}},
  21. {"name": "ctx128k_temp0.4", "options": {"num_ctx": 131072, "temperature": 0.4, "num_predict": 1024}},
  22. ]
  23. def run_profile(profile):
  24. messages = [{"role": "system", "content": "你是严谨的代码助手,回答要给出可运行代码,避免空话。"}]
  25. rounds = []
  26. for i, prompt in enumerate(SCENARIO, start=1):
  27. messages.append({"role": "user", "content": prompt})
  28. payload = {
  29. "model": MODEL,
  30. "messages": messages,
  31. "stream": False,
  32. "options": profile["options"],
  33. }
  34. t0 = time.time()
  35. ok = True
  36. err = ""
  37. content = ""
  38. eval_count = None
  39. prompt_eval_count = None
  40. done_reason = ""
  41. try:
  42. r = requests.post(URL, json=payload, timeout=300)
  43. r.raise_for_status()
  44. data = r.json()
  45. content = data.get("message", {}).get("content", "")
  46. eval_count = data.get("eval_count")
  47. prompt_eval_count = data.get("prompt_eval_count")
  48. done_reason = data.get("done_reason", "")
  49. except Exception as e:
  50. ok = False
  51. err = str(e)
  52. dt = time.time() - t0
  53. rounds.append({
  54. "round": i,
  55. "ok": ok,
  56. "latency_s": round(dt, 2),
  57. "chars": len(content),
  58. "eval_count": eval_count,
  59. "prompt_eval_count": prompt_eval_count,
  60. "done_reason": done_reason,
  61. "error": err,
  62. })
  63. if not ok:
  64. break
  65. messages.append({"role": "assistant", "content": content})
  66. oks = [r for r in rounds if r["ok"]]
  67. return {
  68. "profile": profile["name"],
  69. "options": profile["options"],
  70. "rounds": rounds,
  71. "success_rounds": len(oks),
  72. "all_success": len(oks) == len(SCENARIO),
  73. "avg_latency_s": round(mean([r["latency_s"] for r in oks]), 2) if oks else None,
  74. "max_latency_s": max([r["latency_s"] for r in oks]) if oks else None,
  75. }
  76. def main():
  77. out = {"model": MODEL, "profiles": []}
  78. for p in PROFILES:
  79. print(f"Running {p['name']} ...", flush=True)
  80. out["profiles"].append(run_profile(p))
  81. ts = time.strftime("%Y%m%d-%H%M%S")
  82. path = f"/home/zhn/.openclaw/workspace/reports/ollama-benchmark-{ts}.json"
  83. import os
  84. os.makedirs("/home/zhn/.openclaw/workspace/reports", exist_ok=True)
  85. with open(path, "w", encoding="utf-8") as f:
  86. json.dump(out, f, ensure_ascii=False, indent=2)
  87. print(path)
# Entry-point guard: allows importing this module without running the benchmark.
if __name__ == "__main__":
    main()