From 950ccf38223d3c35eff1ddaf456c1aeb4710c179 Mon Sep 17 00:00:00 2001
From: Gahow Wang <gahow.wang@gmail.com>
Date: Thu, 28 May 2026 15:06:12 +0800
Subject: [PATCH] bench: fix llama.cpp per-slot context (was 1/parallel of
 intended)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

llama.cpp divides total -c across --parallel slots, so -c 4096 --parallel 4
gave each request only 1024 tokens — truncating long AIME generations before
the boxed answer and making xserv look artificially better (20% vs 3.3%).
Set total -c = max_seq_len * n_parallel so per-slot context equals xserv's
per-sequence max_seq_len. Also drop --log-disable: llama-server's startup log
reports the per-slot n_ctx, which catches exactly this misconfiguration.

After the fix, AIME is at parity (xserv 23.3% vs llama.cpp 20.0%), matching the
GSM8K parity and confirming the gap was a config artifact, not engine quality.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 tools/bench/runner.py  |  2 +-
 tools/bench/servers.py | 13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/tools/bench/runner.py b/tools/bench/runner.py
index 3118f28..7056f16 100644
--- a/tools/bench/runner.py
+++ b/tools/bench/runner.py
@@ -134,7 +134,7 @@ def build_endpoints(args) -> list[SystemEndpoint]:
                 model_id=args.llama_model_id,
                 launch_cmd=llama_cpp_launch_cmd(
                     args.llama_bin, gguf, args.llama_port,
-                    n_parallel=args.max_batch, ctx_size=args.max_seq_len,
+                    n_parallel=args.max_batch, ctx_per_slot=args.max_seq_len,
                 ),
                 # llama-server's health endpoint also returns 200 only when model is loaded.
                 health_path="/health",
diff --git a/tools/bench/servers.py b/tools/bench/servers.py
index 6961feb..993f2ba 100644
--- a/tools/bench/servers.py
+++ b/tools/bench/servers.py
@@ -129,17 +129,22 @@ def llama_cpp_launch_cmd(
     port: int,
     *,
     n_parallel: int,
-    ctx_size: int,
+    ctx_per_slot: int,
     n_gpu_layers: int = 99,
 ) -> list[str]:
+    # llama.cpp DIVIDES total -c across --parallel slots: per-slot context is
+    # n_ctx / n_parallel. xserv gives each sequence the full max_seq_len, so to
+    # match we must set total -c = ctx_per_slot * n_parallel. Getting this wrong
+    # silently truncates long generations (e.g. AIME) on llama.cpp's side.
+    total_ctx = ctx_per_slot * n_parallel
     return [
         bin_path,
         "-m", gguf_path,
         "--port", str(port),
         "--host", "0.0.0.0",
-        "-c", str(ctx_size),
+        "-c", str(total_ctx),
         "-ngl", str(n_gpu_layers),
         "--parallel", str(n_parallel),
-        # Be quiet by default; the log file already captures stderr.
-        "--log-disable",
+        # NOTE: do NOT pass --log-disable; its startup log reports per-slot
+        # n_ctx, which is exactly the diagnostic that catches ctx misconfig.
     ]