Files
xserv/tools/bench/tasks/aime.py
Gahow Wang 49c7653222 tools: add llama.cpp comparison baseline + standard benchmark suite
Vendor llama.cpp as a submodule pinned to b9371 and add a one-click
benchmark driver that compares xserv against it on identical workloads:

- setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh
  converts the same safetensors to BF16 GGUF for an apples-to-apples baseline.
- tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput
  (single-stream + concurrent) and response quality on AIME 2025 + GSM8K.
- fetch_datasets.py pulls datasets to local JSON (GPU host has no network);
  task loaders prefer the local JSON.
- sync-and-build.sh: `bench` subcommand transfers source + datasets to the
  GPU host via tar-over-ssh (no rsync there), builds, and runs the suite.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-28 11:18:52 +08:00

115 lines
3.5 KiB
Python

"""AIME 2025 — 30 problems, integer answers 0..999.
Scoring: exact-match of the last integer inside the last `\\boxed{...}` in the
response, falling back to the last run of digits (optionally preceded by `-`)
anywhere in the response when no boxed integer is found. Matches the
convention used by most reasoning-model leaderboards.
"""
from __future__ import annotations
import re
from typing import Any
from . import load_local
TASK_NAME = "aime2025"
# Tried in order; first one to load wins. These are the most-cited HF datasets
# for AIME 2025 at time of writing; we don't depend on any one being present.
DATASET_CANDIDATES = [
("MathArena/aime_2025", None, "test"),
("yentinglin/aime_2025", None, "train"),
("opencompass/AIME2025", "AIME2025-I", "test"),
]
def load() -> list[dict[str, Any]]:
    """Return the AIME 2025 problems, preferring the pre-fetched local copy.

    The GPU host has no network, so the local JSON is consulted first. Any
    non-None result from ``load_local(TASK_NAME)`` is returned as-is; only a
    None result triggers the HuggingFace download via ``load_remote()``.
    """
    cached = load_local(TASK_NAME)
    return load_remote() if cached is None else cached
def _first_present(row: dict[str, Any], keys: tuple[str, ...]) -> Any:
    """Return the first value stored in *row* under *keys* that is not missing.

    Only ``None`` and the empty string count as missing. Unlike an ``or``
    chain, this keeps legitimate falsy values such as the integer ``0``,
    which is both a valid AIME answer and a valid row id.
    """
    for key in keys:
        value = row.get(key)
        if value is not None and value != "":
            return value
    return None


def load_remote() -> list[dict[str, Any]]:
    """Fetch from HuggingFace. Requires network — used by fetch_datasets.py.

    Each entry of ``DATASET_CANDIDATES`` is tried in order. The first dataset
    that both loads and yields at least one usable row is returned; rows that
    lack a problem statement or an answer are skipped.

    Returns:
        A list of dicts with string-valued keys ``id``, ``problem``,
        ``answer`` (stripped of surrounding whitespace) and ``source`` (the
        repo the row came from).

    Raises:
        RuntimeError: if no candidate produced any problems. The message
            carries the most recent failure for diagnosis.
    """
    from datasets import load_dataset  # noqa: PLC0415 — optional dep, see requirements.txt

    last_err: Exception | None = None
    for repo, config, split in DATASET_CANDIDATES:
        try:
            if config:
                ds = load_dataset(repo, config, split=split)
            else:
                ds = load_dataset(repo, split=split)
        except Exception as e:  # noqa: BLE001 — try the next candidate
            last_err = e
            continue

        problems: list[dict[str, Any]] = []
        for i, row in enumerate(ds):
            # Column names differ between the candidate datasets.
            problem = _first_present(row, ("problem", "question", "Problem"))
            answer = _first_present(row, ("answer", "Answer", "solution_int"))
            if problem is None or answer is None:
                continue
            row_id = _first_present(row, ("id", "ID"))
            problems.append({
                # Fall back to the row index only when no id column exists.
                "id": str(i if row_id is None else row_id),
                "problem": problem,
                "answer": str(answer).strip(),
                "source": repo,
            })
        if problems:
            return problems
        # The dataset loaded but was unusable; record why so the final error
        # does not report a stale (or missing) cause.
        last_err = ValueError(f"{repo!r} loaded but had no rows with a problem and an answer")

    raise RuntimeError(
        f"Could not load AIME 2025 from any of {[c[0] for c in DATASET_CANDIDATES]} "
        f"(last error: {last_err!r}). Set HF_HOME / HF_TOKEN if needed."
    )
# System instruction sent with every problem; asks for the final integer in
# \boxed{}, which is the first place extract_answer() looks.
SYSTEM_PROMPT = (
    "You are a careful math problem solver. Solve the problem step by step. "
    "Put your final integer answer (an integer from 0 to 999) inside \\boxed{}."
)


def make_messages(problem: str) -> list[dict[str, str]]:
    """Build the chat messages for one problem.

    Returns a two-element list: the fixed system prompt followed by a user
    turn whose content is *problem*, unmodified.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": problem}
    return [system_turn, user_turn]
_BOXED_RE = re.compile(r"\\boxed\s*\{([^{}]*)\}")
_INT_RE = re.compile(r"-?\d+")
def extract_answer(text: str) -> str | None:
"""Return canonical integer string, or None if nothing parseable."""
if not text:
return None
boxed = _BOXED_RE.findall(text)
candidates: list[str] = []
if boxed:
# Inside the \boxed{} there may be extra latex; grab the last integer.
ints = _INT_RE.findall(boxed[-1])
if ints:
candidates.append(ints[-1])
# Fallback: the last integer anywhere in the response.
if not candidates:
ints = _INT_RE.findall(text)
if ints:
candidates.append(ints[-1])
if not candidates:
return None
try:
return str(int(candidates[-1]))
except ValueError:
return None
def score(pred: str | None, gold: str) -> bool:
if pred is None:
return False
try:
return int(pred) == int(gold)
except ValueError:
return False