distributed: T21-for-proc — wire --dropout into the process-per-GPU launcher
T21 fixed --dropout under thread-per-GPU (train_ddp): it added the flag, set cfg.dropout, and made train_rank re-assert model.train() each step so the training forward pass stays live across periodic eval flips. The process-per-GPU launcher (train_ddp_mp) was left out: it never parsed --dropout, so cfg.dropout stayed at Config::from_arch's 0.0 default, and the worker's model was built with dropout permanently disabled — silently, regardless of what the user passed. This gap belongs to the same launcher-wiring class of bug that V9-PILOT caught: the op-level and single-GPU tests pass, the DDP-thread T21 regression test passes, but the proc-per-GPU launcher path was never exercised end-to-end with dropout>0. This commit mirrors bin/train_ddp exactly: parse --dropout (default 0, which is bit-identical to the no-dropout path), set cfg.dropout before build_model, and print an ON banner on rank 0. train_rank's per-step model.train() from T21 is reused unchanged (proc-per-GPU uses the same train_rank). A follow-up test that exercises this wiring end-to-end (GATE B loss-trace divergence between p=0 and p=0.2 under process-per-GPU) lands in the next commit.
This commit is contained in:
@@ -10,7 +10,9 @@
|
|||||||
//!
|
//!
|
||||||
//! Versus `train_ddp` (thread-per-GPU, kept as the regression baseline) the ONLY
|
//! Versus `train_ddp` (thread-per-GPU, kept as the regression baseline) the ONLY
|
||||||
//! difference is the launch model + cross-process UniqueId bootstrap. CLI flags
|
//! difference is the launch model + cross-process UniqueId bootstrap. CLI flags
|
||||||
//! are identical, so it doubles as the before→after throughput driver.
|
//! mirror `train_ddp` (incl. `--dropout` — same T21 wiring: `cfg.dropout` set here
|
||||||
|
//! and `train_rank` re-asserts `model.train()` each step), so it doubles as the
|
||||||
|
//! before→after throughput driver.
|
||||||
//!
|
//!
|
||||||
//! Run on dash5 (pick idle GPUs — dash5 is shared):
|
//! Run on dash5 (pick idle GPUs — dash5 is shared):
|
||||||
//! export PATH=/usr/local/cuda/bin:/opt/wjh/.cargo/bin:$PATH
|
//! export PATH=/usr/local/cuda/bin:/opt/wjh/.cargo/bin:$PATH
|
||||||
@@ -108,6 +110,11 @@ fn main() {
|
|||||||
let val_tokens: usize = flag(&args, "--val-tokens", 0);
|
let val_tokens: usize = flag(&args, "--val-tokens", 0);
|
||||||
let eval_every: usize = flag(&args, "--eval-every", 0);
|
let eval_every: usize = flag(&args, "--eval-every", 0);
|
||||||
let eval_batches: usize = flag(&args, "--eval-batches", 64);
|
let eval_batches: usize = flag(&args, "--eval-batches", 64);
|
||||||
|
// Dropout (Phase T18/T21): residual-path dropout prob, active at training time
|
||||||
|
// only (inverted scaling), identity at eval/sampling/export. Default 0 = off
|
||||||
|
// (bit-identical to the no-dropout path). Mirrors bin/train_ddp; propagates into
|
||||||
|
// cfg.dropout (below) and relies on T21's per-step model.train() in train_rank.
|
||||||
|
let dropout: f32 = flag(&args, "--dropout", 0.0f32);
|
||||||
let opts = ModelOpts {
|
let opts = ModelOpts {
|
||||||
bf16: args.iter().any(|a| a == "--bf16"),
|
bf16: args.iter().any(|a| a == "--bf16"),
|
||||||
recompute: args.iter().any(|a| a == "--recompute"),
|
recompute: args.iter().any(|a| a == "--recompute"),
|
||||||
@@ -136,7 +143,9 @@ fn main() {
|
|||||||
(corpus, None)
|
(corpus, None)
|
||||||
};
|
};
|
||||||
|
|
||||||
let cfg = Config::from_arch(vocab, n_heads, head_dim, n_layers, ffn).with_kv_heads(kv_heads);
|
let mut cfg =
|
||||||
|
Config::from_arch(vocab, n_heads, head_dim, n_layers, ffn).with_kv_heads(kv_heads);
|
||||||
|
cfg.dropout = dropout;
|
||||||
|
|
||||||
if env.rank == 0 {
|
if env.rank == 0 {
|
||||||
println!(
|
println!(
|
||||||
@@ -162,6 +171,9 @@ fn main() {
|
|||||||
if opts.flash {
|
if opts.flash {
|
||||||
println!("flash-attention: ON (fused SDPA kernel, no materialized scores)");
|
println!("flash-attention: ON (fused SDPA kernel, no materialized scores)");
|
||||||
}
|
}
|
||||||
|
if dropout > 0.0 {
|
||||||
|
println!("dropout: ON (p={dropout}, residual-path, train-only inverted scaling)");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let dcfg = DdpConfig {
|
let dcfg = DdpConfig {
|
||||||
|
|||||||
Reference in New Issue
Block a user