{ "observation": "Increase batching once tensor parallelism (tp=4) is fixed.", "diagnosis": "Throughput should improve if the engine can admit more concurrent prefills without violating TTFT.", "config_patch": { "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "max-num-seqs": 16, "max-num-batched-tokens": 65536, "gpu-memory-utilization": 0.9, "block-size": 64 } }, "expected_effects": [ "Higher feasible sampling_u than the conservative baseline", "Better token throughput if memory headroom is sufficient" ], "why_not_previous_failures": "Raises batching in a controlled step instead of jumping directly to the most aggressive setting." }