Vendor llama.cpp as a submodule pinned to b9371 and add a one-click benchmark driver that compares xserv against it on identical workloads: - setup-llama-cpp.sh: network-optional CUDA build (SM120); convert-to-gguf.sh converts the same safetensors to BF16 GGUF for an apples-to-apples baseline. - tools/bench/: black-box OpenAI-API driver measuring TTFT/TPOT/throughput (single-stream + concurrent) and response quality on AIME 2025 + GSM8K. - fetch_datasets.py pulls datasets to local JSON (GPU host has no network); task loaders prefer the local JSON. - sync-and-build.sh: `bench` subcommand transfers source + datasets to the GPU host via tar-over-ssh (no rsync there), builds, and runs the suite. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
115 lines
3.5 KiB
Python
115 lines
3.5 KiB
Python
"""AIME 2025 — 30 problems, integer answers 0..999.
|
|
|
|
Scoring: exact-match of the integer in the last `\\boxed{...}` in the response,
|
|
falling back to the last standalone integer in the response. Matches the
|
|
convention used by most reasoning-model leaderboards.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Any
|
|
|
|
from . import load_local
|
|
|
|
# Identifier handed to ``load_local`` to find the pre-fetched copy of this task.
TASK_NAME = "aime2025"

# HuggingFace sources as (repo, config, split) triples; a ``None`` config means
# the repo is loaded without a config name. They are attempted top to bottom
# and the first that yields problems is used, so no single repo has to exist.
DATASET_CANDIDATES = [
    ("MathArena/aime_2025", None, "test"),
    ("yentinglin/aime_2025", None, "train"),
    ("opencompass/AIME2025", "AIME2025-I", "test"),
]
|
|
|
|
|
|
def load() -> list[dict[str, Any]]:
    """Return the AIME 2025 problems, favouring the pre-fetched local copy.

    ``load_local(TASK_NAME)`` is consulted first because the GPU host has no
    network; only when it returns ``None`` does this fall back to
    ``load_remote()``.
    """
    cached = load_local(TASK_NAME)
    return load_remote() if cached is None else cached
|
|
|
|
|
|
def _first_present(row: dict[str, Any], *keys: str) -> Any:
    """Return the first value stored in *row* under *keys* that is not None or ""."""
    for key in keys:
        value = row.get(key)
        # Explicit None/"" test instead of truthiness: an integer answer or id
        # of 0 is a real value and must not be treated as missing.
        if value is not None and value != "":
            return value
    return None


def load_remote() -> list[dict[str, Any]]:
    """Fetch from HuggingFace. Requires network — used by fetch_datasets.py.

    Each entry of ``DATASET_CANDIDATES`` is tried in order. The first one that
    loads and yields at least one usable row wins.

    Returns:
        A list of dicts with string fields ``id``, ``answer`` and ``source``
        (the repo the row came from), plus ``problem`` (the statement as
        stored in the dataset).

    Raises:
        RuntimeError: if no candidate could be loaded or none produced a
            usable row; the last underlying failure is attached as the cause.
    """
    from datasets import load_dataset  # noqa: PLC0415 — optional dep, see requirements.txt

    last_err: Exception | None = None
    for repo, config, split in DATASET_CANDIDATES:
        try:
            ds = load_dataset(repo, config, split=split) if config else load_dataset(repo, split=split)
        except Exception as e:  # noqa: BLE001 — try the next candidate
            last_err = e
            continue

        problems: list[dict[str, Any]] = []
        for i, row in enumerate(ds):
            # Column names differ between the candidate repos.
            problem = _first_present(row, "problem", "question", "Problem")
            answer = _first_present(row, "answer", "Answer", "solution_int")
            if problem is None or answer is None:
                continue
            ident = _first_present(row, "id", "ID")
            problems.append({
                "id": str(i if ident is None else ident),
                "problem": problem,
                "answer": str(answer).strip(),
                "source": repo,
            })
        if problems:
            return problems
        # Loaded fine but nothing usable: record why, so the final error is
        # not a stale exception from an earlier candidate (or None).
        last_err = ValueError(f"{repo}: no rows with a recognizable problem/answer field")

    raise RuntimeError(
        f"Could not load AIME 2025 from any of {[c[0] for c in DATASET_CANDIDATES]} "
        f"(last error: {last_err!r}). Set HF_HOME / HF_TOKEN if needed."
    ) from last_err
|
|
|
|
|
|
# System instruction sent with every problem; it asks for the final integer
# inside \boxed{}, which is the form extract_answer looks for first.
SYSTEM_PROMPT = (
    "You are a careful math problem solver. Solve the problem step by step. "
    "Put your final integer answer (an integer from 0 to 999) inside \\boxed{}."
)


def make_messages(problem: str) -> list[dict[str, str]]:
    """Build the two-message chat (system prompt, then *problem* as the user turn)."""
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": problem}
    return [system_turn, user_turn]
|
|
|
|
|
|
_BOXED_RE = re.compile(r"\\boxed\s*\{([^{}]*)\}")
|
|
_INT_RE = re.compile(r"-?\d+")
|
|
|
|
|
|
def extract_answer(text: str) -> str | None:
|
|
"""Return canonical integer string, or None if nothing parseable."""
|
|
if not text:
|
|
return None
|
|
boxed = _BOXED_RE.findall(text)
|
|
candidates: list[str] = []
|
|
if boxed:
|
|
# Inside the \boxed{} there may be extra latex; grab the last integer.
|
|
ints = _INT_RE.findall(boxed[-1])
|
|
if ints:
|
|
candidates.append(ints[-1])
|
|
# Fallback: the last integer anywhere in the response.
|
|
if not candidates:
|
|
ints = _INT_RE.findall(text)
|
|
if ints:
|
|
candidates.append(ints[-1])
|
|
if not candidates:
|
|
return None
|
|
try:
|
|
return str(int(candidates[-1]))
|
|
except ValueError:
|
|
return None
|
|
|
|
|
|
def score(pred: str | None, gold: str) -> bool:
|
|
if pred is None:
|
|
return False
|
|
try:
|
|
return int(pred) == int(gold)
|
|
except ValueError:
|
|
return False
|