xserv/tools/bench/tasks/aime.py

"""AIME 2025 — 30 problems, integer answers 0..999.

Scoring: exact-match of the integer in the last `\\boxed{...}` in the response,
falling back to the last standalone integer in the response. Matches the
convention used by most reasoning-model leaderboards.
"""

from __future__ import annotations

import re
from typing import Any

from . import load_local

# Task identifier; `load()` passes it to `load_local` to look up the
# pre-fetched copy of the dataset.
TASK_NAME = "aime2025"

# Tried in order; first one to load wins. These are the most-cited HF datasets
# for AIME 2025 at time of writing; we don't depend on any one being present.
# Each entry is (repo, config-or-None, split), unpacked in that order by
# `load_remote()`; a falsy config means the repo is loaded without one.
# NOTE(review): the last entry requests only the "AIME2025-I" config — if that
# repo splits the exam across several configs, this candidate would yield
# fewer than the 30 problems the module docstring promises. Confirm.
DATASET_CANDIDATES = [
    ("MathArena/aime_2025", None, "test"),
    ("yentinglin/aime_2025", None, "train"),
    ("opencompass/AIME2025", "AIME2025-I", "test"),
]


def load() -> list[dict[str, Any]]:
    """Return the AIME 2025 problems, preferring the local copy.

    The pre-fetched local JSON is consulted first (the GPU host has no
    network); only when `load_local` reports nothing (returns None) do we
    fall through to the remote HuggingFace fetch.
    """
    cached = load_local(TASK_NAME)
    return load_remote() if cached is None else cached


def _first_present(row: Any, keys: tuple[str, ...]) -> Any:
    """Return the first value in *row* under *keys* that is not None or ""."""
    for key in keys:
        value = row.get(key)
        # Explicit checks instead of truthiness: 0 is a legitimate AIME answer
        # (and a legitimate row id) and must not be treated as "missing".
        if value is not None and value != "":
            return value
    return None


def load_remote() -> list[dict[str, Any]]:
    """Fetch from HuggingFace. Requires network — used by fetch_datasets.py.

    Each entry of DATASET_CANDIDATES is tried in order. The first one that
    both loads and yields at least one row with a problem and an answer wins.

    Returns:
        A list of dicts with string-valued keys "id", "problem", "answer"
        and "source" (the repo the rows came from). "id" falls back to the
        row's position in the dataset when the row carries no id field.

    Raises:
        RuntimeError: if no candidate produced any usable problem. The last
            underlying failure is attached as ``__cause__``.
    """
    from datasets import load_dataset  # noqa: PLC0415 — optional dep, see requirements.txt

    last_err: Exception | None = None
    for repo, config, split in DATASET_CANDIDATES:
        try:
            ds = load_dataset(repo, config, split=split) if config else load_dataset(repo, split=split)
        except Exception as e:  # noqa: BLE001 — try the next candidate
            last_err = e
            continue

        problems: list[dict[str, Any]] = []
        for i, row in enumerate(ds):
            # Column names differ between the candidate repos.
            problem = _first_present(row, ("problem", "question", "Problem"))
            answer = _first_present(row, ("answer", "Answer", "solution_int"))
            if problem is None or answer is None:
                continue
            row_id = _first_present(row, ("id", "ID"))
            problems.append({
                "id": str(i if row_id is None else row_id),
                "problem": problem,
                "answer": str(answer).strip(),
                "source": repo,
            })
        if problems:
            return problems

        # The repo loaded but nothing was usable; remember why so the final
        # error message does not report a stale or missing cause.
        last_err = ValueError(
            f"{repo} loaded but no row had a recognised problem/answer field"
        )

    raise RuntimeError(
        f"Could not load AIME 2025 from any of {[c[0] for c in DATASET_CANDIDATES]} "
        f"(last error: {last_err!r}). Set HF_HOME / HF_TOKEN if needed."
    ) from last_err


# System message that `make_messages()` places ahead of every problem. It asks
# for the final answer inside \boxed{}, which is the first place
# `extract_answer()` looks.
SYSTEM_PROMPT = (
    "You are a careful math problem solver. Solve the problem step by step. "
    "Put your final integer answer (an integer from 0 to 999) inside \\boxed{}."
)


def make_messages(problem: str) -> list[dict[str, str]]:
    """Build the chat transcript for one problem: system prompt, then the problem as the user turn."""
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": problem}
    return [system_turn, user_turn]


_BOXED_RE = re.compile(r"\\boxed\s*\{([^{}]*)\}")
_INT_RE = re.compile(r"-?\d+")


def extract_answer(text: str) -> str | None:
    """Return canonical integer string, or None if nothing parseable."""
    if not text:
        return None
    boxed = _BOXED_RE.findall(text)
    candidates: list[str] = []
    if boxed:
        # Inside the \boxed{} there may be extra latex; grab the last integer.
        ints = _INT_RE.findall(boxed[-1])
        if ints:
            candidates.append(ints[-1])
    # Fallback: the last integer anywhere in the response.
    if not candidates:
        ints = _INT_RE.findall(text)
        if ints:
            candidates.append(ints[-1])
    if not candidates:
        return None
    try:
        return str(int(candidates[-1]))
    except ValueError:
        return None


def score(pred: str | None, gold: str) -> bool:
    if pred is None:
        return False
    try:
        return int(pred) == int(gold)
    except ValueError:
        return False