tools: add llama.cpp comparison baseline + standard benchmark suite

Vendor llama.cpp as a submodule pinned to b9371 and add a one-click
benchmark driver that compares xserv against it on identical workloads:

- setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh
  converts the same safetensors to BF16 GGUF for an apples-to-apples baseline.
- tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput
  (single-stream + concurrent) and response quality on AIME 2025 + GSM8K.
- fetch_datasets.py pulls datasets to local JSON (GPU host has no network);
  task loaders prefer the local JSON.
- sync-and-build.sh: `bench` subcommand transfers source + datasets to the
  GPU host via tar-over-ssh (no rsync there), builds, and runs the suite.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-28 11:18:52 +08:00
parent 9bb5c5c328
commit 49c7653222
20 changed files with 1690 additions and 14 deletions

View File

@@ -0,0 +1,90 @@
"""GSM8K — 1319 grade-school math problems with integer/decimal answers.
Gold answers in the dataset are in the form `... #### 42`. We score by
exact-match of the final number, with the same `\\boxed{}` / last-number
extraction used for AIME, since for instruction-tuned models the response
follows the prompt instructions, not the dataset's `####` convention.
"""
from __future__ import annotations
import re
from typing import Any
from . import load_local
# Dataset key passed to load_local() when looking for a local copy of the data.
TASK_NAME = "gsm8k"
def load() -> list[dict[str, Any]]:
    """Return the GSM8K problems, preferring the local copy.

    load_local() is consulted first; load_remote() is only called when it
    returns None.
    """
    cached = load_local(TASK_NAME)
    return load_remote() if cached is None else cached
def load_remote() -> list[dict[str, Any]]:
    """Download the GSM8K test split from HuggingFace (network required).

    Used by fetch_datasets.py. Every row is turned into a dict with the keys
    "id" (row index as a string), "problem", "answer" and "source".
    """
    from datasets import load_dataset  # noqa: PLC0415

    rows = load_dataset("openai/gsm8k", "main", split="test")
    return [
        {
            "id": str(idx),
            "problem": row["question"],
            # Gold format is "<chain of thought>\n#### 42": keep only what
            # follows the last "####" and drop thousands separators.
            "answer": row["answer"].split("####")[-1].strip().replace(",", ""),
            "source": "openai/gsm8k",
        }
        for idx, row in enumerate(rows)
    ]
# System message placed ahead of every problem by make_messages(). It asks for
# the final answer in \boxed{}, which extract_answer() looks for first.
SYSTEM_PROMPT = " ".join((
    "You are a careful math problem solver.",
    "Solve the problem step by step.",
    "Put your final numeric answer inside \\boxed{}.",
))
def make_messages(problem: str) -> list[dict[str, str]]:
    """Build the chat messages for one problem.

    Returns a two-entry list: the shared system prompt, then *problem*
    verbatim as the user turn.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": problem}
    return [system_turn, user_turn]
# Matches "\boxed{...}" (optional whitespace before the brace) and captures the
# contents. The capture excludes braces, so a box with nested braces such as
# \boxed{\frac{1}{2}} does not match as a whole.
_BOXED_RE = re.compile(r"\\boxed\s*\{([^{}]*)\}")
# Allow comma-grouped thousands (e.g. "3,500"); _normalize_num strips them.
# Also accepts an optional leading minus sign and an optional decimal fraction.
_NUM_RE = re.compile(r"-?\d+(?:,\d{3})*(?:\.\d+)?")
def _normalize_num(s: str) -> str | None:
s = s.replace(",", "").strip()
try:
f = float(s)
except ValueError:
return None
return str(int(f)) if f.is_integer() else f"{f:g}"
def extract_answer(text: str) -> str | None:
    """Pull the final numeric answer out of a model response.

    The contents of the last ``\\boxed{...}`` are searched first; if there is
    no box, or the last box holds no number, the whole response is searched
    instead. In either case the last number found is returned in normalized
    form. Returns None for empty input or when no number is present.
    """
    if not text:
        return None
    # Slicing with [-1:] yields the last box if any, or nothing at all, so the
    # full text always serves as the fallback candidate.
    for candidate in (*_BOXED_RE.findall(text)[-1:], text):
        found = _NUM_RE.findall(candidate)
        if found:
            return _normalize_num(found[-1])
    return None
def score(pred: str | None, gold: str) -> bool:
    """Return True when *pred* equals the normalized form of *gold*.

    *pred* is compared as given (it is not normalized here), so it should
    already be in the form produced by extract_answer(). A missing prediction
    or a gold value that does not parse as a number scores False.
    """
    if pred is None:
        return False
    expected = _normalize_num(gold)
    if expected is None:
        return False
    return pred == expected