diff --git a/crates/xtrain-distributed/src/bin/train_ddp.rs b/crates/xtrain-distributed/src/bin/train_ddp.rs index 944becc..eb31dd8 100644 --- a/crates/xtrain-distributed/src/bin/train_ddp.rs +++ b/crates/xtrain-distributed/src/bin/train_ddp.rs @@ -88,6 +88,12 @@ fn main() { let val_tokens: usize = flag(&args, "--val-tokens", 0); let eval_every: usize = flag(&args, "--eval-every", 0); let eval_batches: usize = flag(&args, "--eval-batches", 64); + // Dropout (Phase T18/T21): residual-path dropout prob, active at training time + // only (inverted scaling), identity at eval/sampling/export. Default 0 = off + // (forward graph bit-identical to the no-dropout path). Mirrors bin/train; the + // train_rank loop calls model.train() each step so dropout is actually live + // under DDP (T21 wired this — the launcher previously never set training mode). + let dropout: f32 = flag(&args, "--dropout", 0.0f32); // bf16 mixed precision (Phase T12): fp32 master weights, bf16 linears + // activations. Opt-in; default fp32 reproduces v0–v4 numerics. let bf16 = args.iter().any(|a| a == "--bf16"); @@ -139,7 +145,9 @@ fn main() { (corpus, None) }; - let cfg = Config::from_arch(vocab, n_heads, head_dim, n_layers, ffn).with_kv_heads(kv_heads); + let mut cfg = + Config::from_arch(vocab, n_heads, head_dim, n_layers, ffn).with_kv_heads(kv_heads); + cfg.dropout = dropout; println!( "model: dim {} layers {} heads {} kv_heads {} head_dim {} ffn {} → core {:.3}M params \ (+ embed/lm {:.2}M = {:.2}M total)", @@ -189,6 +197,9 @@ fn main() { if flash { println!("flash-attention: ON (fused SDPA kernel, no materialized scores)"); } + if dropout > 0.0 { + println!("dropout: ON (p={dropout}, residual-path, train-only inverted scaling)"); + } let results = launch( &devices, &train_corpus, diff --git a/crates/xtrain-distributed/src/ddp.rs b/crates/xtrain-distributed/src/ddp.rs index 6d3f847..4b442eb 100644 --- a/crates/xtrain-distributed/src/ddp.rs +++ b/crates/xtrain-distributed/src/ddp.rs @@ -124,6 +124,14 @@ pub fn train_rank( // all-reduce fires ONLY after the last micro-step (intermediate micro-steps // are local-only, no NCCL). let mut local_sum = 0.0f32; // Σ over micro of (local_mean · b_local) + // Training mode → dropout active (T18; no-op when cfg.dropout == 0). Set + // each step so it is restored after a periodic eval flips the model to eval + // mode (eval_loss calls model.eval() and does not restore). Mirrors the + // single-GPU loop's train/eval discipline — without this, DDP forwards run + // in the default eval (identity) mode and --dropout is silently ignored + // (the T21 launcher-wiring gap the V9-PILOT caught). Each micro-step's + // forward bumps the per-step seed → fresh masks. + model.train(); for _ in 0..accum { let mut inputs = Vec::with_capacity(batch_local); let mut targets_v = Vec::with_capacity(batch_local);