dist: lengthen scaling bench so NCCL init amortizes

The 30-step bench charged the one-time NCCL init and the 4 model builds (present at world=4, absent at world=1) against the wall clock, which understated steady-state scaling (in-loop tok/s already showed ~53k at 4 GPUs). Bump the bench to 150 steps so that this fixed startup cost amortizes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-15 17:18:23 +08:00
parent 818f76a18f
commit ad82e8bf92
1 changed file with 5 additions and 2 deletions
--- a/crates/xtrain-distributed/tests/ddp_correctness.rs
+++ b/crates/xtrain-distributed/tests/ddp_correctness.rs
@@ -189,12 +189,15 @@ fn ddp_throughput_scaling() {
        return;
    }
    // Same PER-GPU workload at each world size (batch scales with world), so the
-    // per-rank cost is fixed and global tok/s should scale ~linearly.
+    // per-rank cost is fixed and global tok/s should scale ~linearly. Use enough
+    // steps that the one-time NCCL init + model-build overhead (which is larger at
+    // world=4 and absent at world=1) amortizes — otherwise the wall-clock ratio
+    // understates steady-state scaling.
    let per_gpu_batch = 8usize;
    let vocab = 256usize;
    let cfg = test_config(vocab);
    let corpus = synth_corpus(vocab, 8192);
-    let steps = 30usize;
+    let steps = 150usize;
    let seq_len = 64usize;

    let worlds: Vec<usize> = [1, 2, 4].into_iter().filter(|&w| w <= max_gpus).collect();