"""AIME 2025 — 30 problems, integer answers 0..999. Scoring: exact-match of the integer in the last `\\boxed{...}` in the response, falling back to the last standalone integer in the response. Matches the convention used by most reasoning-model leaderboards. """ from __future__ import annotations import re from typing import Any from . import load_local TASK_NAME = "aime2025" # Tried in order; first one to load wins. These are the most-cited HF datasets # for AIME 2025 at time of writing; we don't depend on any one being present. DATASET_CANDIDATES = [ ("MathArena/aime_2025", None, "test"), ("yentinglin/aime_2025", None, "train"), ("opencompass/AIME2025", "AIME2025-I", "test"), ] def load() -> list[dict[str, Any]]: # Prefer the pre-fetched local JSON (GPU host has no network). local = load_local(TASK_NAME) if local is not None: return local return load_remote() def load_remote() -> list[dict[str, Any]]: """Fetch from HuggingFace. Requires network — used by fetch_datasets.py.""" from datasets import load_dataset # noqa: PLC0415 — optional dep, see requirements.txt last_err: Exception | None = None for repo, config, split in DATASET_CANDIDATES: try: ds = load_dataset(repo, config, split=split) if config else load_dataset(repo, split=split) except Exception as e: # noqa: BLE001 — try the next candidate last_err = e continue problems: list[dict[str, Any]] = [] for i, row in enumerate(ds): problem = row.get("problem") or row.get("question") or row.get("Problem") answer = row.get("answer") or row.get("Answer") or row.get("solution_int") if problem is None or answer is None: continue problems.append({ "id": str(row.get("id") or row.get("ID") or i), "problem": problem, "answer": str(answer).strip(), "source": repo, }) if problems: return problems raise RuntimeError( f"Could not load AIME 2025 from any of {[c[0] for c in DATASET_CANDIDATES]} " f"(last error: {last_err!r}). Set HF_HOME / HF_TOKEN if needed." ) SYSTEM_PROMPT = ( "You are a careful math problem solver. Solve the problem step by step. " "Put your final integer answer (an integer from 0 to 999) inside \\boxed{}." ) def make_messages(problem: str) -> list[dict[str, str]]: return [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": problem}, ] _BOXED_RE = re.compile(r"\\boxed\s*\{([^{}]*)\}") _INT_RE = re.compile(r"-?\d+") def extract_answer(text: str) -> str | None: """Return canonical integer string, or None if nothing parseable.""" if not text: return None boxed = _BOXED_RE.findall(text) candidates: list[str] = [] if boxed: # Inside the \boxed{} there may be extra latex; grab the last integer. ints = _INT_RE.findall(boxed[-1]) if ints: candidates.append(ints[-1]) # Fallback: the last integer anywhere in the response. if not candidates: ints = _INT_RE.findall(text) if ints: candidates.append(ints[-1]) if not candidates: return None try: return str(int(candidates[-1])) except ValueError: return None def score(pred: str | None, gold: str) -> bool: if pred is None: return False try: return int(pred) == int(gold) except ValueError: return False