#!/usr/bin/env python3
"""Benchmark a local Ollama chat model across context-window profiles.

Drives a fixed multi-turn coding scenario against each profile in
PROFILES and writes per-round latency/token statistics to a
timestamped JSON report under REPORT_DIR.
"""
import json
import time
from pathlib import Path
from statistics import mean

import requests

URL = "http://127.0.0.1:11434/api/chat"
MODEL = "glm-4.7-flash-128k"
# Directory where JSON reports are written (created on demand in main()).
REPORT_DIR = Path("/home/zhn/.openclaw/workspace/reports")

# Multi-turn conversation: each prompt builds on the previous answer,
# so later rounds progressively stress the model's context window.
SCENARIO = [
    "你是资深Python工程师。请实现一个LRUCache类(get/put,O(1)),并附上类型标注。",
    "基于上一步代码,补充线程安全支持(使用RLock),并说明性能影响。",
    "增加TTL过期机制,要求懒清理+写入时清理,给出关键测试用例。",
    "把核心逻辑重构为两个类:Storage与Policy,保证接口不变。",
    "增加一个命令行演示入口:支持put/get/dump命令。",
    "修复潜在bug:高并发下TTL扫描可能导致长时间持锁,请优化。",
    "把项目拆成3个文件并给出最小可运行目录结构。",
    "最后给出pytest测试代码(覆盖率目标>90%)。",
]

# One entry per benchmark run; only num_ctx varies between profiles.
PROFILES = [
    {"name": "ctx32k_temp0.4", "options": {"num_ctx": 32768, "temperature": 0.4, "num_predict": 1024}},
    {"name": "ctx64k_temp0.4", "options": {"num_ctx": 65536, "temperature": 0.4, "num_predict": 1024}},
    {"name": "ctx128k_temp0.4", "options": {"num_ctx": 131072, "temperature": 0.4, "num_predict": 1024}},
]


def _chat_once(messages, options):
    """POST one non-streaming chat request and return the parsed JSON body.

    Raises a ``requests`` exception on transport failure and ``HTTPError``
    on a non-2xx status; callers are expected to handle both.
    """
    payload = {
        "model": MODEL,
        "messages": messages,
        "stream": False,
        "options": options,
    }
    resp = requests.post(URL, json=payload, timeout=300)
    resp.raise_for_status()
    return resp.json()


def run_profile(profile):
    """Run the full SCENARIO under one profile and return summary stats.

    The conversation history accumulates across rounds (user prompt +
    assistant reply each turn). Stops at the first failed round so a
    broken context size doesn't waste the remaining requests.

    Returns a dict with per-round records plus success/latency summary;
    latency fields are None when no round succeeded.
    """
    messages = [{"role": "system", "content": "你是严谨的代码助手,回答要给出可运行代码,避免空话。"}]
    rounds = []
    for i, prompt in enumerate(SCENARIO, start=1):
        messages.append({"role": "user", "content": prompt})
        ok = True
        err = ""
        content = ""
        eval_count = None
        prompt_eval_count = None
        done_reason = ""
        # monotonic clock: immune to wall-clock (NTP/DST) adjustments,
        # which would corrupt latency measurements taken with time.time().
        t0 = time.monotonic()
        try:
            data = _chat_once(messages, profile["options"])
            content = data.get("message", {}).get("content", "")
            eval_count = data.get("eval_count")
            prompt_eval_count = data.get("prompt_eval_count")
            done_reason = data.get("done_reason", "")
        except Exception as e:  # boundary: record the failure and stop this profile
            ok = False
            err = str(e)
        dt = time.monotonic() - t0
        rounds.append({
            "round": i,
            "ok": ok,
            "latency_s": round(dt, 2),
            "chars": len(content),
            "eval_count": eval_count,
            "prompt_eval_count": prompt_eval_count,
            "done_reason": done_reason,
            "error": err,
        })
        if not ok:
            break
        # Feed the assistant's reply back so the next round builds on it.
        messages.append({"role": "assistant", "content": content})
    oks = [r for r in rounds if r["ok"]]
    return {
        "profile": profile["name"],
        "options": profile["options"],
        "rounds": rounds,
        "success_rounds": len(oks),
        "all_success": len(oks) == len(SCENARIO),
        "avg_latency_s": round(mean([r["latency_s"] for r in oks]), 2) if oks else None,
        "max_latency_s": max([r["latency_s"] for r in oks]) if oks else None,
    }


def main():
    """Benchmark every profile and dump the combined report as JSON."""
    out = {"model": MODEL, "profiles": []}
    for p in PROFILES:
        print(f"Running {p['name']} ...", flush=True)
        out["profiles"].append(run_profile(p))
    ts = time.strftime("%Y%m%d-%H%M%S")
    REPORT_DIR.mkdir(parents=True, exist_ok=True)
    path = REPORT_DIR / f"ollama-benchmark-{ts}.json"
    with open(path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps the Chinese prompts human-readable.
        json.dump(out, f, ensure_ascii=False, indent=2)
    print(path)


if __name__ == "__main__":
    main()