Vendor llama.cpp as a submodule pinned to b9371 and add a one-click benchmark driver that compares xserv against it on identical workloads: - setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh converts the same safetensors to BF16 GGUF for an apples-to-apples baseline. - tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput (single-stream + concurrent) and response quality on AIME 2025 + GSM8K. - fetch_datasets.py pulls datasets to local JSON (GPU host has no network); task loaders prefer the local JSON. - sync-and-build.sh: `bench` subcommand transfers source + datasets to the GPU host via tar-over-ssh (no rsync there), builds, and runs the suite. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
91 lines
2.5 KiB
Python
91 lines
2.5 KiB
Python
"""GSM8K — 1319 grade-school math problems with integer/decimal answers.
|
|
|
|
Gold answers in the dataset are in the form `... #### 42`. We score by
|
|
exact-match of the final number, with the same `\\boxed{}` / last-number
|
|
extraction used for AIME, since for instruction-tuned models the response
|
|
follows the prompt instructions, not the dataset's `####` convention.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Any
|
|
|
|
from . import load_local
|
|
|
|
TASK_NAME = "gsm8k"
|
|
|
|
|
|
def load() -> list[dict[str, Any]]:
    """Return the GSM8K records, preferring a locally stored copy.

    ``load_local(TASK_NAME)`` is consulted first; only when it yields ``None``
    does this fall back to :func:`load_remote`.
    """
    cached = load_local(TASK_NAME)
    return load_remote() if cached is None else cached
|
|
|
|
|
|
def load_remote() -> list[dict[str, Any]]:
    """Fetch from HuggingFace. Requires network — used by fetch_datasets.py.

    Loads the ``test`` split of ``openai/gsm8k`` (config ``main``) and returns
    one dict per row with the keys ``id`` (the row index as a string),
    ``problem``, ``answer`` and ``source``.
    """
    # Imported lazily so the module can be imported without `datasets` installed.
    from datasets import load_dataset  # noqa: PLC0415

    rows = load_dataset("openai/gsm8k", "main", split="test")
    # gold format: "<chain of thought>\n#### 42" -- keep the text after the
    # last "####" marker and drop thousands separators.
    return [
        {
            "id": str(idx),
            "problem": rec["question"],
            "answer": rec["answer"].split("####")[-1].strip().replace(",", ""),
            "source": "openai/gsm8k",
        }
        for idx, rec in enumerate(rows)
    ]
|
|
|
|
|
|
# System message sent with every problem; it asks for the final number inside
# \boxed{}, which is the form extract_answer() looks for first.
SYSTEM_PROMPT = " ".join((
    "You are a careful math problem solver.",
    "Solve the problem step by step.",
    r"Put your final numeric answer inside \boxed{}.",
))
|
|
|
|
|
|
def make_messages(problem: str) -> list[dict[str, str]]:
    """Build the chat payload for one problem.

    Returns a two-element list: the system turn carrying ``SYSTEM_PROMPT``
    followed by a user turn whose content is ``problem`` unchanged.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": problem}
    return [system_turn, user_turn]
|
|
|
|
|
|
_BOXED_RE = re.compile(r"\\boxed\s*\{([^{}]*)\}")
|
|
# Allow comma-grouped thousands (e.g. "3,500"); _normalize_num strips them.
|
|
_NUM_RE = re.compile(r"-?\d+(?:,\d{3})*(?:\.\d+)?")
|
|
|
|
|
|
def _normalize_num(s: str) -> str | None:
|
|
s = s.replace(",", "").strip()
|
|
try:
|
|
f = float(s)
|
|
except ValueError:
|
|
return None
|
|
return str(int(f)) if f.is_integer() else f"{f:g}"
|
|
|
|
|
|
def extract_answer(text: str) -> str | None:
    """Pull the final numeric answer out of a model response.

    The last number inside the last ``\\boxed{...}`` is preferred. If there is
    no boxed span, or the last one contains no number, the last number found
    anywhere in ``text`` is used instead. Returns the normalized number, or
    ``None`` when ``text`` is empty or contains no number at all.
    """
    if not text:
        return None

    # Search regions in order of trust: last boxed payload first, then everything.
    boxed_payloads = _BOXED_RE.findall(text)
    regions = [boxed_payloads[-1], text] if boxed_payloads else [text]

    for region in regions:
        candidates = _NUM_RE.findall(region)
        if candidates:
            return _normalize_num(candidates[-1])
    return None
|
|
|
|
|
|
def score(pred: str | None, gold: str) -> bool:
    """Exact-match scoring of an extracted prediction against the gold answer.

    ``gold`` is normalized with ``_normalize_num`` before comparison; ``pred``
    is compared as given. A missing prediction or a gold value that cannot be
    normalized scores ``False``.
    """
    if pred is None:
        return False
    expected = _normalize_num(gold)
    if expected is None:
        return False
    return pred == expected
|