{ "observation": "Start from a safe tensor-parallel (tp=4) layout with conservative batching.", "diagnosis": "The first pass should verify that the multi-GPU launch succeeds and avoid queueing collapse from over-batching.", "config_patch": { "env_patch": {}, "flag_patch": { "tensor-parallel-size": 4, "max-num-seqs": 8, "max-num-batched-tokens": 32768, "gpu-memory-utilization": 0.85, "block-size": 64 } }, "expected_effects": [ "Stable startup on 4x H20 GPUs", "Low risk of OOM during the first binary-search probes" ], "why_not_previous_failures": "This is the initial baseline proposal." }