distributed: T21-for-proc — wire --dropout into the process-per-GPU launcher

T21 fixed --dropout under thread-per-GPU (train_ddp): it added the flag, set cfg.dropout, and made train_rank re-assert model.train() each step so the training forward pass stays live across periodic eval flips. The process-per-GPU launcher (train_ddp_mp) was left out: it never parsed --dropout, so cfg.dropout stayed at Config::from_arch's 0.0 default, and the worker's model was built with dropout permanently disabled — silently, regardless of what the user passed. The gap is the same class of launcher-wiring bug that V9-PILOT caught: op-level + single-GPU tests pass, the DDP-thread T21 regression test passes, but the proc-per-GPU launcher path was never exercised end-to-end with dropout>0. Mirror bin/train_ddp exactly: parse --dropout (default 0, bit-identical to the no-dropout path), set cfg.dropout before build_model, print an ON banner on rank 0. train_rank's per-step model.train() from T21 is reused unchanged (proc-per-GPU uses the same train_rank). A follow-up test that exercises this wiring end-to-end (GATE B loss-trace divergence between p=0 and p=0.2 under process-per-GPU) lands in the next commit.
2026-07-01 13:51:17 +08:00
parent 4379868f2d
commit 86de6bfb51
1 changed files with 14 additions and 2 deletions
--- a/crates/xtrain-distributed/src/bin/train_ddp_mp.rs
+++ b/crates/xtrain-distributed/src/bin/train_ddp_mp.rs
@@ -10,7 +10,9 @@
 //!
 //! Versus `train_ddp` (thread-per-GPU, kept as the regression baseline) the ONLY
 //! difference is the launch model + cross-process UniqueId bootstrap. CLI flags
-//! are identical, so it doubles as the before→after throughput driver.
+//! mirror `train_ddp` (incl. `--dropout` — same T21 wiring: `cfg.dropout` set here
+//! and `train_rank` re-asserts `model.train()` each step), so it doubles as the
+//! before→after throughput driver.
 //!
 //! Run on dash5 (pick idle GPUs — dash5 is shared):
 //!   export PATH=/usr/local/cuda/bin:/opt/wjh/.cargo/bin:$PATH
@@ -108,6 +110,11 @@ fn main() {
    let val_tokens: usize = flag(&args, "--val-tokens", 0);
    let eval_every: usize = flag(&args, "--eval-every", 0);
    let eval_batches: usize = flag(&args, "--eval-batches", 64);
+    // Dropout (Phase T18/T21): residual-path dropout prob, active at training time
+    // only (inverted scaling), identity at eval/sampling/export. Default 0 = off
+    // (bit-identical to the no-dropout path). Mirrors bin/train_ddp; propagates into
+    // cfg.dropout (below) and relies on T21's per-step model.train() in train_rank.
+    let dropout: f32 = flag(&args, "--dropout", 0.0f32);
    let opts = ModelOpts {
        bf16: args.iter().any(|a| a == "--bf16"),
        recompute: args.iter().any(|a| a == "--recompute"),
@@ -136,7 +143,9 @@ fn main() {
        (corpus, None)
    };

-    let cfg = Config::from_arch(vocab, n_heads, head_dim, n_layers, ffn).with_kv_heads(kv_heads);
+    let mut cfg =
+        Config::from_arch(vocab, n_heads, head_dim, n_layers, ffn).with_kv_heads(kv_heads);
+    cfg.dropout = dropout;

    if env.rank == 0 {
        println!(
@@ -162,6 +171,9 @@ fn main() {
        if opts.flash {
            println!("flash-attention: ON (fused SDPA kernel, no materialized scores)");
        }
+        if dropout > 0.0 {
+            println!("dropout: ON (p={dropout}, residual-path, train-only inverted scaling)");
+        }
    }

    let dcfg = DdpConfig {