distributed: T21 — wire dropout into the DDP path (--dropout + model.train())
V9-PILOT caught a launcher-level integration gap: T18 wired dropout into the single-GPU bin/train, but the DDP path was never wired up. train_ddp had no --dropout flag and never set cfg.dropout, and ddp.rs::train_rank never called model.train() — so under DDP every forward ran in the default eval mode and dropout was a silent identity, regardless of config.

Fix, mirroring the single-GPU train/eval discipline:
- train_ddp.rs: add a --dropout <p> flag (default 0 = off, matching the prior behavior) and set cfg.dropout from it; log it when on.
- ddp.rs::train_rank: call model.train() at the start of each step (before the micro-batch loop). eval_loss() flips the model to eval mode and does not restore it, so re-asserting train() each step keeps dropout live across eval boundaries.

--dropout 0 (default) is bit-identical to the prior DDP path: cfg.dropout stays 0 and ops::dropout(p=0) is a clone no-op regardless of training mode.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -88,6 +88,12 @@ fn main() {
|
|||||||
let val_tokens: usize = flag(&args, "--val-tokens", 0);
|
let val_tokens: usize = flag(&args, "--val-tokens", 0);
|
||||||
let eval_every: usize = flag(&args, "--eval-every", 0);
|
let eval_every: usize = flag(&args, "--eval-every", 0);
|
||||||
let eval_batches: usize = flag(&args, "--eval-batches", 64);
|
let eval_batches: usize = flag(&args, "--eval-batches", 64);
|
||||||
|
// Dropout (Phase T18/T21): residual-path dropout prob, active at training time
|
||||||
|
// only (inverted scaling), identity at eval/sampling/export. Default 0 = off
|
||||||
|
// (forward graph bit-identical to the no-dropout path). Mirrors bin/train; the
|
||||||
|
// train_rank loop calls model.train() each step so dropout is actually live
|
||||||
|
// under DDP (T21 wired this — the launcher previously never set training mode).
|
||||||
|
let dropout: f32 = flag(&args, "--dropout", 0.0f32);
|
||||||
// bf16 mixed precision (Phase T12): fp32 master weights, bf16 linears +
|
// bf16 mixed precision (Phase T12): fp32 master weights, bf16 linears +
|
||||||
// activations. Opt-in; default fp32 reproduces v0–v4 numerics.
|
// activations. Opt-in; default fp32 reproduces v0–v4 numerics.
|
||||||
let bf16 = args.iter().any(|a| a == "--bf16");
|
let bf16 = args.iter().any(|a| a == "--bf16");
|
||||||
@@ -139,7 +145,9 @@ fn main() {
|
|||||||
(corpus, None)
|
(corpus, None)
|
||||||
};
|
};
|
||||||
|
|
||||||
let cfg = Config::from_arch(vocab, n_heads, head_dim, n_layers, ffn).with_kv_heads(kv_heads);
|
let mut cfg =
|
||||||
|
Config::from_arch(vocab, n_heads, head_dim, n_layers, ffn).with_kv_heads(kv_heads);
|
||||||
|
cfg.dropout = dropout;
|
||||||
println!(
|
println!(
|
||||||
"model: dim {} layers {} heads {} kv_heads {} head_dim {} ffn {} → core {:.3}M params \
|
"model: dim {} layers {} heads {} kv_heads {} head_dim {} ffn {} → core {:.3}M params \
|
||||||
(+ embed/lm {:.2}M = {:.2}M total)",
|
(+ embed/lm {:.2}M = {:.2}M total)",
|
||||||
@@ -189,6 +197,9 @@ fn main() {
|
|||||||
if flash {
|
if flash {
|
||||||
println!("flash-attention: ON (fused SDPA kernel, no materialized scores)");
|
println!("flash-attention: ON (fused SDPA kernel, no materialized scores)");
|
||||||
}
|
}
|
||||||
|
if dropout > 0.0 {
|
||||||
|
println!("dropout: ON (p={dropout}, residual-path, train-only inverted scaling)");
|
||||||
|
}
|
||||||
let results = launch(
|
let results = launch(
|
||||||
&devices,
|
&devices,
|
||||||
&train_corpus,
|
&train_corpus,
|
||||||
|
|||||||
@@ -124,6 +124,14 @@ pub fn train_rank(
|
|||||||
// all-reduce fires ONLY after the last micro-step (intermediate micro-steps
|
// all-reduce fires ONLY after the last micro-step (intermediate micro-steps
|
||||||
// are local-only, no NCCL).
|
// are local-only, no NCCL).
|
||||||
let mut local_sum = 0.0f32; // Σ over micro of (local_mean · b_local)
|
let mut local_sum = 0.0f32; // Σ over micro of (local_mean · b_local)
|
||||||
|
// Training mode → dropout active (T18; no-op when cfg.dropout == 0). Set
|
||||||
|
// each step so it is restored after a periodic eval flips the model to eval
|
||||||
|
// mode (eval_loss calls model.eval() and does not restore). Mirrors the
|
||||||
|
// single-GPU loop's train/eval discipline — without this, DDP forwards run
|
||||||
|
// in the default eval (identity) mode and --dropout is silently ignored
|
||||||
|
// (the T21 launcher-wiring gap the V9-PILOT caught). Each micro-step's
|
||||||
|
// forward bumps the per-step seed → fresh masks.
|
||||||
|
model.train();
|
||||||
for _ in 0..accum {
|
for _ in 0..accum {
|
||||||
let mut inputs = Vec::with_capacity(batch_local);
|
let mut inputs = Vec::with_capacity(batch_local);
|
||||||
let mut targets_v = Vec::with_capacity(batch_local);
|
let mut targets_v = Vec::with_capacity(batch_local);
|
||||||
|
|||||||
Reference in New Issue
Block a user