tools: add llama.cpp comparison baseline + standard benchmark suite

Vendor llama.cpp as a submodule pinned to b9371 and add a one-click benchmark driver that compares xserv against it on identical workloads:

- setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh converts the same safetensors to BF16 GGUF for an apples-to-apples baseline.
- tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput (single-stream + concurrent) and response quality on AIME 2025 + GSM8K.
- fetch_datasets.py pulls datasets to local JSON (GPU host has no network); task loaders prefer the local JSON.
- sync-and-build.sh: `bench` subcommand transfers source + datasets to the GPU host via tar-over-ssh (no rsync there), builds, and runs the suite.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 11:18:52 +08:00
parent 9bb5c5c328
commit 49c7653222
20 changed files with 1690 additions and 14 deletions
--- a/tools/bench/tasks/gsm8k.py
+++ b/tools/bench/tasks/gsm8k.py
@@ -0,0 +1,90 @@
+"""GSM8K — 1319 grade-school math problems with integer/decimal answers.
+
+Gold answers in the dataset are in the form `... #### 42`. We score by
+exact-match of the final number, with the same `\\boxed{}` / last-number
+extraction used for AIME, since for instruction-tuned models the response
+follows the prompt instructions, not the dataset's `####` convention.
+"""
+
+from __future__ import annotations
+
+import re
+from decimal import Decimal, InvalidOperation
+from typing import Any
+
+from . import load_local
+
+TASK_NAME = "gsm8k"
+
+
+def load() -> list[dict[str, Any]]:
+    """Return the GSM8K problem records.
+
+    The result of ``load_local(TASK_NAME)`` is returned whenever it is not
+    ``None``; only otherwise is ``load_remote()`` called.
+    """
+    cached = load_local(TASK_NAME)
+    return load_remote() if cached is None else cached
+
+
+def load_remote() -> list[dict[str, Any]]:
+    """Fetch the GSM8K test split from HuggingFace. Requires network.
+
+    Used by fetch_datasets.py. The ``datasets`` package is imported inside the
+    function, so it is only needed when this loader actually runs.
+
+    Each returned record has four keys: ``id`` (the row index as a string),
+    ``problem`` (the question text), ``answer`` (the text after the last
+    "####" marker, stripped, with commas removed) and ``source``.
+    """
+    from datasets import load_dataset  # noqa: PLC0415
+
+    rows = load_dataset("openai/gsm8k", "main", split="test")
+    return [
+        {
+            "id": str(index),
+            "problem": row["question"],
+            # Gold format is "<chain of thought>\n#### 42": keep only what
+            # follows the last "####".
+            "answer": row["answer"].rpartition("####")[2].strip().replace(",", ""),
+            "source": "openai/gsm8k",
+        }
+        for index, row in enumerate(rows)
+    ]
+
+
+# System message sent ahead of every problem. It asks for the final answer in
+# \boxed{}, which is the form extract_answer() looks for first.
+SYSTEM_PROMPT = (
+    "You are a careful math problem solver. "
+    "Solve the problem step by step. "
+    "Put your final numeric answer inside \\boxed{}."
+)
+
+
+def make_messages(problem: str) -> list[dict[str, str]]:
+    """Build the two-message chat for one problem.
+
+    The system message carries ``SYSTEM_PROMPT``; the user message carries
+    *problem* unchanged.
+    """
+    system_msg = {"role": "system", "content": SYSTEM_PROMPT}
+    user_msg = {"role": "user", "content": problem}
+    return [system_msg, user_msg]
+
+
+# Matches "\boxed{...}" (whitespace allowed before the brace) and captures the
+# contents. The contents may not contain braces, so a nested form such as
+# "\boxed{\frac{1}{2}}" is not matched by this pattern.
+_BOXED_RE = re.compile(r"\\boxed\s*\{([^{}]*)\}")
+# Allow comma-grouped thousands (e.g. "3,500"); _normalize_num strips them.
+# A leading "-" and a ".digits" fraction are optional. A match must start with
+# a digit, so ".5" is picked up only as "5".
+_NUM_RE = re.compile(r"-?\d+(?:,\d{3})*(?:\.\d+)?")
+
+
+def _normalize_num(s: str) -> str | None:
+    """Return a canonical decimal string for *s*, or ``None`` if it is not a number.
+
+    Thousands commas and surrounding whitespace are dropped, then the value is
+    parsed exactly with ``Decimal`` and written in plain positional notation
+    with no trailing fractional zeros ("3,500" -> "3500", "42.0" -> "42",
+    "3.50" -> "3.5"). Any zero, including "-0.0", becomes "0".
+
+    Parsing is exact on purpose: going through ``float`` and the ``g`` format
+    keeps only six significant digits, so "1234.567" and "1234.571" would both
+    come out as "1234.57" and be scored as equal.
+
+    Non-finite values (NaN, infinity) are not answers and yield ``None``.
+    """
+    cleaned = s.replace(",", "").strip()
+    try:
+        value = Decimal(cleaned)
+    except (InvalidOperation, ValueError):
+        # InvalidOperation is what Decimal raises for a non-numeric string;
+        # ValueError is caught defensively so no parse failure can escape.
+        return None
+    if not value.is_finite():
+        return None
+    if value.is_zero():
+        # Collapses "0", "0.00" and "-0.0" to a single spelling.
+        return "0"
+    # The "f" presentation never uses an exponent and does not round.
+    digits = format(value, "f")
+    if "." in digits:
+        digits = digits.rstrip("0").rstrip(".")
+    return digits
+
+
+def extract_answer(text: str) -> str | None:
+    """Pull the final numeric answer out of a model response.
+
+    The last number inside the last ``\\boxed{...}`` span wins. If there is no
+    boxed span, or that span holds no number, the last number anywhere in
+    *text* is used instead. The chosen number is passed through
+    ``_normalize_num``. Returns ``None`` for empty text or text without any
+    number.
+    """
+    if not text:
+        return None
+    boxed_spans = _BOXED_RE.findall(text)
+    # Search the last boxed span first, then fall back to the whole response.
+    haystacks = ([boxed_spans[-1]] if boxed_spans else []) + [text]
+    for haystack in haystacks:
+        found = _NUM_RE.findall(haystack)
+        if found:
+            return _normalize_num(found[-1])
+    return None
+
+
+def score(pred: str | None, gold: str) -> bool:
+    """Return True when *pred* equals the normalized gold answer.
+
+    *pred* is compared as-is, by string equality, against
+    ``_normalize_num(gold)``. A ``None`` prediction, or a gold value that
+    cannot be normalized, scores False.
+    """
+    if pred is None:
+        return False
+    expected = _normalize_num(gold)
+    if expected is None:
+        return False
+    return pred == expected