From 950ccf38223d3c35eff1ddaf456c1aeb4710c179 Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Thu, 28 May 2026 15:06:12 +0800 Subject: [PATCH] bench: fix llama.cpp per-slot context (was 1/parallel of intended) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit llama.cpp divides total -c across --parallel slots, so -c 4096 --parallel 4 gave each request only 1024 tokens — truncating long AIME generations before the boxed answer and making xserv look artificially better (20% vs 3.3%). Set total -c = max_seq_len * n_parallel so per-slot context equals xserv's per-sequence max_seq_len. Also drop --log-disable; llama-server's startup log reports the per-slot n_ctx, which is what catches exactly this misconfiguration. After the fix, AIME is at parity (xserv 23.3% vs llama.cpp 20.0%), matching the GSM8K parity and confirming the gap was a config artifact, not engine quality. Co-Authored-By: Claude Opus 4.7 --- tools/bench/runner.py | 2 +- tools/bench/servers.py | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/bench/runner.py b/tools/bench/runner.py index 3118f28..7056f16 100644 --- a/tools/bench/runner.py +++ b/tools/bench/runner.py @@ -134,7 +134,7 @@ def build_endpoints(args) -> list[SystemEndpoint]: model_id=args.llama_model_id, launch_cmd=llama_cpp_launch_cmd( args.llama_bin, gguf, args.llama_port, - n_parallel=args.max_batch, ctx_size=args.max_seq_len, + n_parallel=args.max_batch, ctx_per_slot=args.max_seq_len, ), # llama-server's health endpoint also returns 200 only when model is loaded. health_path="/health", diff --git a/tools/bench/servers.py b/tools/bench/servers.py index 6961feb..993f2ba 100644 --- a/tools/bench/servers.py +++ b/tools/bench/servers.py @@ -129,17 +129,22 @@ def llama_cpp_launch_cmd( port: int, *, n_parallel: int, - ctx_size: int, + ctx_per_slot: int, n_gpu_layers: int = 99, ) -> list[str]: + # llama.cpp DIVIDES total -c across --parallel slots: per-slot context is + # n_ctx / n_parallel. 
xserv gives each sequence the full max_seq_len, so to + # match we must set total -c = ctx_per_slot * n_parallel. Getting this wrong + # silently truncates long generations (e.g. AIME) on llama.cpp's side. + total_ctx = ctx_per_slot * n_parallel return [ bin_path, "-m", gguf_path, "--port", str(port), "--host", "0.0.0.0", - "-c", str(ctx_size), + "-c", str(total_ctx), "-ngl", str(n_gpu_layers), "--parallel", str(n_parallel), - # Be quiet by default; the log file already captures stderr. - "--log-disable", + # NOTE: do NOT pass --log-disable; llama-server's startup log reports + # per-slot n_ctx, which is exactly the diagnostic that catches ctx misconfig. ]