tools: add llama.cpp comparison baseline + standard benchmark suite
Vendor llama.cpp as a submodule pinned to b9371 and add a one-click benchmark driver that compares xserv against it on identical workloads:

- setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh converts the same safetensors to BF16 GGUF for an apples-to-apples baseline.
- tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput (single-stream + concurrent) and response quality on AIME 2025 + GSM8K.
- fetch_datasets.py pulls datasets to local JSON (GPU host has no network); task loaders prefer the local JSON.
- sync-and-build.sh: `bench` subcommand transfers source + datasets to the GPU host via tar-over-ssh (no rsync there), builds, and runs the suite.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
90
tools/bench/tasks/gsm8k.py
Normal file
90
tools/bench/tasks/gsm8k.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""GSM8K — 1319 grade-school math problems with integer/decimal answers.
|
||||
|
||||
Gold answers in the dataset are in the form `... #### 42`. We score by
|
||||
exact-match of the final number, with the same `\\boxed{}` / last-number
|
||||
extraction used for AIME, since for instruction-tuned models the response
|
||||
follows the prompt instructions, not the dataset's `####` convention.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from . import load_local
|
||||
|
||||
TASK_NAME = "gsm8k"
|
||||
|
||||
|
||||
def load() -> list[dict[str, Any]]:
    """Return the GSM8K items, preferring a locally available copy.

    ``load_local(TASK_NAME)`` is consulted first; whatever it returns is
    used as-is unless it is ``None``, in which case the items are fetched
    with ``load_remote()``.
    """
    cached = load_local(TASK_NAME)
    # Only ``None`` triggers the remote fetch; any other value is returned.
    return load_remote() if cached is None else cached
|
||||
|
||||
|
||||
def load_remote() -> list[dict[str, Any]]:
    """Fetch from HuggingFace. Requires network — used by fetch_datasets.py.

    Loads the ``test`` split of ``openai/gsm8k`` (config ``main``) and
    returns one dict per row with the keys ``id`` (row index as a string),
    ``problem``, ``answer`` and ``source``.
    """
    # Imported lazily so the module can be imported without `datasets`.
    from datasets import load_dataset  # noqa: PLC0415

    def _gold_answer(full_answer: str) -> str:
        # gold format: "<chain of thought>\n#### 42" — keep only the text
        # after the last "####" marker and drop thousands separators.
        return full_answer.rpartition("####")[2].strip().replace(",", "")

    rows = load_dataset("openai/gsm8k", "main", split="test")
    return [
        {
            "id": str(index),
            "problem": record["question"],
            "answer": _gold_answer(record["answer"]),
            "source": "openai/gsm8k",
        }
        for index, record in enumerate(rows)
    ]
|
||||
|
||||
|
||||
# System instruction sent with every problem: asks for step-by-step work
# and a final numeric answer wrapped in \boxed{}.
SYSTEM_PROMPT = (
    "You are a careful math problem solver. "
    "Solve the problem step by step. "
    "Put your final numeric answer inside \\boxed{}."
)
|
||||
|
||||
|
||||
def make_messages(problem: str) -> list[dict[str, str]]:
    """Build the two-message chat for one problem.

    The first message carries ``SYSTEM_PROMPT`` with role ``system``; the
    second carries ``problem`` verbatim with role ``user``.
    """
    system_msg = {"role": "system", "content": SYSTEM_PROMPT}
    user_msg = {"role": "user", "content": problem}
    return [system_msg, user_msg]
|
||||
|
||||
|
||||
_BOXED_RE = re.compile(r"\\boxed\s*\{([^{}]*)\}")
|
||||
# Allow comma-grouped thousands (e.g. "3,500"); _normalize_num strips them.
|
||||
_NUM_RE = re.compile(r"-?\d+(?:,\d{3})*(?:\.\d+)?")
|
||||
|
||||
|
||||
def _normalize_num(s: str) -> str | None:
|
||||
s = s.replace(",", "").strip()
|
||||
try:
|
||||
f = float(s)
|
||||
except ValueError:
|
||||
return None
|
||||
return str(int(f)) if f.is_integer() else f"{f:g}"
|
||||
|
||||
|
||||
def extract_answer(text: str) -> str | None:
    """Pull the final numeric answer out of a model response.

    The last number inside the last ``\\boxed{...}`` group is preferred. If
    there is no boxed group, or the last one holds no number, the last
    number anywhere in *text* is used instead.

    Returns:
        The number normalized by ``_normalize_num``, or ``None`` when
        *text* is empty or contains no number.
    """
    if not text:
        return None

    # Search regions in priority order: last boxed group first, then the
    # whole response as a fallback.
    boxed_groups = _BOXED_RE.findall(text)
    regions = [boxed_groups[-1], text] if boxed_groups else [text]

    for region in regions:
        matches = _NUM_RE.findall(region)
        if matches:
            return _normalize_num(matches[-1])
    return None
|
||||
|
||||
|
||||
def score(pred: str | None, gold: str) -> bool:
    """Return whether *pred* exactly matches the normalized gold answer.

    *gold* is passed through ``_normalize_num``; *pred* is compared as
    given. A missing prediction or an unparseable gold answer scores
    ``False``.
    """
    if pred is None:
        return False

    expected = _normalize_num(gold)
    if expected is None:
        return False
    return expected == pred
|
||||
Reference in New Issue
Block a user