diff --git a/tools/bench/runner.py b/tools/bench/runner.py index 3118f28..7056f16 100644 --- a/tools/bench/runner.py +++ b/tools/bench/runner.py @@ -134,7 +134,7 @@ def build_endpoints(args) -> list[SystemEndpoint]: model_id=args.llama_model_id, launch_cmd=llama_cpp_launch_cmd( args.llama_bin, gguf, args.llama_port, - n_parallel=args.max_batch, ctx_size=args.max_seq_len, + n_parallel=args.max_batch, ctx_per_slot=args.max_seq_len, ), # llama-server's health endpoint also returns 200 only when model is loaded. health_path="/health", diff --git a/tools/bench/servers.py b/tools/bench/servers.py index 6961feb..993f2ba 100644 --- a/tools/bench/servers.py +++ b/tools/bench/servers.py @@ -129,17 +129,22 @@ def llama_cpp_launch_cmd( port: int, *, n_parallel: int, - ctx_size: int, + ctx_per_slot: int, n_gpu_layers: int = 99, ) -> list[str]: + # llama.cpp DIVIDES total -c across --parallel slots: per-slot context is + # n_ctx / n_parallel. xserv gives each sequence the full max_seq_len, so to + # match we must set total -c = ctx_per_slot * n_parallel. Getting this wrong + # silently truncates long generations (e.g. AIME) on llama.cpp's side. + total_ctx = ctx_per_slot * n_parallel return [ bin_path, "-m", gguf_path, "--port", str(port), "--host", "0.0.0.0", - "-c", str(ctx_size), + "-c", str(total_ctx), "-ngl", str(n_gpu_layers), "--parallel", str(n_parallel), - # Be quiet by default; the log file already captures stderr. - "--log-disable", + # NOTE: do NOT pass --log-disable; its startup log reports per-slot + # n_ctx, which is exactly the diagnostic that catches ctx misconfig. ]