bench: fix llama.cpp per-slot context (was 1/parallel of intended)
llama.cpp divides the total -c across --parallel slots, so -c 4096 --parallel 4 gave each request only 1024 tokens — truncating long AIME generations before the boxed answer and making xserv look artificially better (20% vs 3.3%). Set total -c = max_seq_len * n_parallel so that the per-slot context equals xserv's per-sequence max_seq_len. Also drop --log-disable: llama-server's startup log reports the per-slot n_ctx, which is what catches exactly this misconfiguration. After the fix, AIME is at parity (xserv 23.3% vs llama.cpp 20.0%), matching the GSM8K parity and confirming the gap was a config artifact, not engine quality.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -134,7 +134,7 @@ def build_endpoints(args) -> list[SystemEndpoint]:
|
|||||||
model_id=args.llama_model_id,
|
model_id=args.llama_model_id,
|
||||||
launch_cmd=llama_cpp_launch_cmd(
|
launch_cmd=llama_cpp_launch_cmd(
|
||||||
args.llama_bin, gguf, args.llama_port,
|
args.llama_bin, gguf, args.llama_port,
|
||||||
n_parallel=args.max_batch, ctx_size=args.max_seq_len,
|
n_parallel=args.max_batch, ctx_per_slot=args.max_seq_len,
|
||||||
),
|
),
|
||||||
# llama-server's health endpoint also returns 200 only when model is loaded.
|
# llama-server's health endpoint also returns 200 only when model is loaded.
|
||||||
health_path="/health",
|
health_path="/health",
|
||||||
|
|||||||
@@ -129,17 +129,22 @@ def llama_cpp_launch_cmd(
|
|||||||
port: int,
|
port: int,
|
||||||
*,
|
*,
|
||||||
n_parallel: int,
|
n_parallel: int,
|
||||||
ctx_size: int,
|
ctx_per_slot: int,
|
||||||
n_gpu_layers: int = 99,
|
n_gpu_layers: int = 99,
|
||||||
) -> list[str]:
|
) -> list[str]:
|
||||||
|
# llama.cpp DIVIDES total -c across --parallel slots: per-slot context is
|
||||||
|
# n_ctx / n_parallel. xserv gives each sequence the full max_seq_len, so to
|
||||||
|
# match we must set total -c = ctx_per_slot * n_parallel. Getting this wrong
|
||||||
|
# silently truncates long generations (e.g. AIME) on llama.cpp's side.
|
||||||
|
total_ctx = ctx_per_slot * n_parallel
|
||||||
return [
|
return [
|
||||||
bin_path,
|
bin_path,
|
||||||
"-m", gguf_path,
|
"-m", gguf_path,
|
||||||
"--port", str(port),
|
"--port", str(port),
|
||||||
"--host", "0.0.0.0",
|
"--host", "0.0.0.0",
|
||||||
"-c", str(ctx_size),
|
"-c", str(total_ctx),
|
||||||
"-ngl", str(n_gpu_layers),
|
"-ngl", str(n_gpu_layers),
|
||||||
"--parallel", str(n_parallel),
|
"--parallel", str(n_parallel),
|
||||||
# Be quiet by default; the log file already captures stderr.
|
# NOTE: do NOT pass --log-disable; llama-server's startup log reports per-slot
|
||||||
"--log-disable",
|
# n_ctx, which is exactly the diagnostic that catches ctx misconfig.
|
||||||
]
|
]
|
||||||
|
|||||||
Reference in New Issue
Block a user