From 33a1aee9ec2456afb560080cc8978fd5f98cb4da Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Wed, 1 Jul 2026 13:51:31 +0800 Subject: [PATCH] =?UTF-8?q?test:=20T21-for-proc=20=E2=80=94=20dropout-live?= =?UTF-8?q?=20regression=20under=20process-per-GPU?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Analogue of the ddp_dropout_is_live_and_p0_bit_identical test (T21, thread-per- GPU) for the process-per-GPU launcher. Runs launch_processes twice on the same corpus / init / config with the ONLY difference being cfg.dropout (passed launcher→worker via a new XTRAIN_TEST_DROPOUT env — worker re-execs cannot inherit argv changes), reads rank 0's loss trajectory from both runs, and asserts GATE B: max |loss diff| > 1e-3. The threshold sits ~4 orders of magnitude above this box's KI-5 cross-rank NCCL noise floor (~1e-7), so it is an unambiguous "dropout mask is applied" signal, not a noise measurement. Pre-fix (missing cfg.dropout = ... in the worker / launcher, exactly the gap the paired launcher commit closes) both traces are bit-identical and this test FAILs. Also wires ENV_DROPOUT into the shared worker entry so the existing correctness test's contract is unchanged (absent env → 0.0 → same synth run as before). p0/ and p02/ subdirs isolate the two invocations' dumps. --- crates/xtrain-distributed/tests/ddp_proc.rs | 112 +++++++++++++++++++- 1 file changed, 111 insertions(+), 1 deletion(-) diff --git a/crates/xtrain-distributed/tests/ddp_proc.rs b/crates/xtrain-distributed/tests/ddp_proc.rs index 5cfe88a..5f1e2d5 100644 --- a/crates/xtrain-distributed/tests/ddp_proc.rs +++ b/crates/xtrain-distributed/tests/ddp_proc.rs @@ -10,6 +10,14 @@ //! (a) multi-process loss matches single-GPU within `<1e-3`, //! (b) cross-rank params agree within `<1e-6` (KI-5 ULP tolerance), //! (c) multi-process loss matches the thread-per-GPU `launch` path within `<1e-3`. +//! +//! 
T21-for-proc regression `proc_per_gpu_dropout_is_live_and_p0_matches_no_dropout` +//! (below) additionally proves that `--dropout` propagates through the process-per- +//! GPU launcher — the analogue of the thread-per-GPU T21 fix. Pre-fix +//! `train_ddp_mp` had no `--dropout` flag, so `cfg.dropout` stayed 0 regardless of +//! what the user passed, silently disabling dropout under process-per-GPU. The +//! GATE B loss-trace signal (>1e-3 gap between p=0 and p=0.2) sits orders of +//! magnitude above the KI-5 cross-rank noise floor and catches that gap directly. #![cfg(not(no_cuda))] @@ -74,8 +82,20 @@ fn dcfg(batch_size: usize) -> DdpConfig { // The dump dir is passed launcher→worker via this env key (separate from the // XTRAIN_* keys the launcher sets); workers write `rank{N}.dump` there. const ENV_DUMP_DIR: &str = "XTRAIN_TEST_DUMP_DIR"; +// Optional launcher→worker channel for `cfg.dropout`. Absent = 0.0 = the existing +// correctness test's contract (no perturbation). The T21-for-proc regression test +// below sets it before each `launch_processes` call to prove the process-per-GPU +// path actually plumbs `--dropout` into every worker's model. +const ENV_DROPOUT: &str = "XTRAIN_TEST_DROPOUT"; const GLOBAL_BATCH: usize = 8; +fn worker_dropout() -> f32 { + std::env::var(ENV_DROPOUT) + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(0.0) +} + // ── Worker entry: runs when this test binary is re-execed by launch_processes ─ fn run_as_worker_if_needed() { @@ -87,7 +107,13 @@ fn run_as_worker_if_needed() { // production `run_worker` wrapper is exercised by `bin/train_ddp_mp` on dash5. 
let ctx = DdpContext::init(env.rank, env.world, env.id, env.local_rank); let device = Device::Cuda(env.local_rank); - let model = build_model(test_config(), device); + // Mirrors bin/train_ddp_mp's `cfg.dropout = dropout` wiring — the T21-for-proc + // regression: if this line were missing (the pre-fix launcher's exact gap), + // `cfg.dropout` would stay 0 and the GATE B test below would find a bit- + // identical p=0 / p=0.2 loss trace and FAIL. + let mut cfg = test_config(); + cfg.dropout = worker_dropout(); + let model = build_model(cfg, device); let res = train_rank( &ctx, &model, @@ -273,6 +299,90 @@ fn proc_per_gpu_matches_single_gpu_and_thread_path() { let _ = std::fs::remove_dir_all(&dump_dir); } +/// T21-for-proc regression: prove that `--dropout` actually reaches the model +/// under process-per-GPU. The pre-fix `bin/train_ddp_mp` had no `--dropout` flag +/// and never set `cfg.dropout`, so the launcher's worker built its model with +/// dropout stuck at 0 — silent identity, regardless of what the user passed. The +/// thread-per-GPU T21 fix caught the analogous gap; this test caps the same gap +/// on the proc-per-GPU path with the same GATE-B pattern (loss trajectory of a +/// p=0.2 run differs from p=0 by a large margin, well above the NCCL noise floor). +/// +/// Both runs share the corpus, the initial params (via `build_model`'s deterministic +/// LCG), and every other config knob; the ONLY difference is `cfg.dropout`. If the +/// worker didn't plumb the env-provided dropout into `cfg.dropout` (the exact pre- +/// fix regression), both traces would be bit-identical and this test would FAIL. +/// The `>1e-3` threshold sits orders of magnitude above the KI-5 cross-rank ULP +/// noise floor (~1e-7 on this PCIe box), so it's a hard signal for "dropout is +/// active" rather than a noise measurement. Mirrors +/// `ddp_dropout_is_live_and_p0_bit_identical` in ddp_correctness.rs for T21's +/// thread-per-GPU fix. 
+#[test] +fn proc_per_gpu_dropout_is_live_and_p0_matches_no_dropout() { + run_as_worker_if_needed(); + + let world = 2usize; + if device::device_count().unwrap_or(0) < world as i32 { + eprintln!("skip: need >= {world} GPUs"); + return; + } + + let base_dump_dir = std::env::temp_dir().join(format!("xtrain_t21mp_{}", std::process::id())); + std::fs::create_dir_all(&base_dump_dir).unwrap(); + let worker_args = [ + "--exact".to_string(), + "proc_per_gpu_dropout_is_live_and_p0_matches_no_dropout".to_string(), + "--test-threads=1".to_string(), + "--nocapture".to_string(), + ]; + + // Helper: launch `world` workers with a specific dropout prob (via env), read + // rank 0's loss trace, clean up. Uses a subdir per run so the two invocations + // do not clobber each other's dumps. + let mut launch_with_dropout = |p: f32, tag: &str| -> Vec<f32> { + let dump_dir = base_dump_dir.join(tag); + std::fs::create_dir_all(&dump_dir).unwrap(); + // SAFETY: single-threaded test (forced by --test-threads=1); no concurrent env access. + unsafe { + std::env::set_var(ENV_DUMP_DIR, &dump_dir); + std::env::set_var(ENV_DROPOUT, format!("{p}")); + } + launch_processes(world, &worker_args).expect("worker processes failed"); + let (losses, _) = read_dump(dump_dir.to_str().unwrap(), 0); + losses + }; + + let loss_p0 = launch_with_dropout(0.0, "p0"); + let loss_p1 = launch_with_dropout(0.2, "p02"); + + // GATE B — dropout is LIVE under process-per-GPU with p>0. If the worker + // didn't set `cfg.dropout` (the pre-fix gap), the two traces would match to + // the ~1e-7 NCCL noise floor. Anything above ~1e-3 is unambiguous evidence + // that dropout masks are actually applied in every worker's forward. 
+ let max_live_diff = loss_p0 + .iter() + .zip(&loss_p1) + .map(|(a, b)| (a - b).abs()) + .fold(0.0f32, f32::max); + println!( + "T21-proc GATE B (dropout live under proc-per-GPU): p0[last]={:.6} p0.2[last]={:.6} max |loss diff| = {max_live_diff:.3e}", + loss_p0.last().unwrap(), + loss_p1.last().unwrap() + ); + assert!( + max_live_diff > 1e-3, + "p=0.2 proc-per-GPU loss matches p=0 — dropout NOT plumbed through the \ + process-per-GPU launcher (cfg.dropout stayed 0 in the worker): max |loss diff| {max_live_diff:.3e}" + ); + + // No NaN/Inf in the p>0 run. + assert!( + loss_p1.iter().all(|l| l.is_finite()), + "p=0.2 proc-per-GPU loss has non-finite values" + ); + + let _ = std::fs::remove_dir_all(&base_dump_dir); +} + fn max_rel(a: &[f32], b: &[f32]) -> f32 { a.iter() .zip(b)