test: T21-for-proc — dropout-live regression under process-per-GPU
Analogue of the ddp_dropout_is_live_and_p0_bit_identical test (T21, thread-per-GPU) for the process-per-GPU launcher. Runs launch_processes twice on the same corpus / init / config with the ONLY difference being cfg.dropout (passed launcher→worker via a new XTRAIN_TEST_DROPOUT env var — worker re-execs cannot inherit argv changes), reads rank 0's loss trajectory from both runs, and asserts GATE B: max |loss diff| > 1e-3. The threshold sits ~4 orders of magnitude above this box's KI-5 cross-rank NCCL noise floor (~1e-7), so it is an unambiguous "dropout mask is applied" signal, not a noise measurement. Pre-fix (missing cfg.dropout = ... in the worker / launcher, exactly the gap the paired launcher commit closes) both traces are bit-identical and this test FAILs. Also wires ENV_DROPOUT into the shared worker entry so the existing correctness test's contract is unchanged (absent env var → 0.0 → same synth run as before). p0/ and p02/ subdirs isolate the two invocations' dumps.
This commit is contained in:
@@ -10,6 +10,14 @@
|
||||
//! (a) multi-process loss matches single-GPU within `<1e-3`,
|
||||
//! (b) cross-rank params agree within `<1e-6` (KI-5 ULP tolerance),
|
||||
//! (c) multi-process loss matches the thread-per-GPU `launch` path within `<1e-3`.
|
||||
//!
|
||||
//! T21-for-proc regression `proc_per_gpu_dropout_is_live_and_p0_matches_no_dropout`
|
||||
//! (below) additionally proves that `--dropout` propagates through the
|
||||
//! process-per-GPU launcher — the analogue of the thread-per-GPU T21 fix. Pre-fix
|
||||
//! `train_ddp_mp` had no `--dropout` flag, so `cfg.dropout` stayed 0 regardless of
|
||||
//! what the user passed, silently disabling dropout under process-per-GPU. The
|
||||
//! GATE B loss-trace signal (>1e-3 gap between p=0 and p=0.2) sits orders of
|
||||
//! magnitude above the KI-5 cross-rank noise floor and catches that gap directly.
|
||||
|
||||
#![cfg(not(no_cuda))]
|
||||
|
||||
@@ -74,8 +82,20 @@ fn dcfg(batch_size: usize) -> DdpConfig {
|
||||
// The dump dir is passed launcher→worker via this env key (separate from the
|
||||
// XTRAIN_* keys the launcher sets); workers write `rank{N}.dump` there.
|
||||
const ENV_DUMP_DIR: &str = "XTRAIN_TEST_DUMP_DIR";
|
||||
// Optional launcher→worker channel for `cfg.dropout`. Absent = 0.0 = the existing
// correctness test's contract (no perturbation). The T21-for-proc regression test
// below sets it before each `launch_processes` call to prove the process-per-GPU
// path actually plumbs `--dropout` into every worker's model.
const ENV_DROPOUT: &str = "XTRAIN_TEST_DROPOUT";
const GLOBAL_BATCH: usize = 8;

/// Returns the dropout probability requested through the `ENV_DROPOUT` env var.
///
/// * Unset, empty, or whitespace-only → `0.0` (the "no perturbation" default).
/// * Otherwise the value is trimmed and parsed as `f32`; it must lie in `[0, 1]`.
///
/// # Panics
///
/// Panics when the variable is set but is not valid Unicode, does not parse as a
/// float, or is NaN / outside `[0, 1]`. Falling back to `0.0` in those cases would
/// silently disable dropout in the worker, which is indistinguishable from the
/// plumbing regression the dropout test exists to catch.
///
/// NOTE(review): `1.0` is accepted here as a probability; whether the model itself
/// accepts `p == 1.0` is not visible from this file — confirm against `build_model`.
fn worker_dropout() -> f32 {
    let raw = match std::env::var(ENV_DROPOUT) {
        Ok(raw) => raw,
        // Not set at all: keep the pre-existing contract of "no dropout".
        Err(std::env::VarError::NotPresent) => return 0.0,
        Err(err) => panic!("{} is set but unreadable: {}", ENV_DROPOUT, err),
    };

    // Tolerate surrounding whitespace/newlines from shells or wrappers.
    let text = raw.trim();
    if text.is_empty() {
        return 0.0;
    }

    match text.parse::<f32>() {
        // `contains` is false for NaN, so NaN is rejected together with out-of-range values.
        Ok(p) if (0.0..=1.0).contains(&p) => p,
        _ => panic!(
            "{}={:?} is not a dropout probability in [0, 1]",
            ENV_DROPOUT, raw
        ),
    }
}
|
||||
|
||||
// ── Worker entry: runs when this test binary is re-execed by launch_processes ─
|
||||
|
||||
fn run_as_worker_if_needed() {
|
||||
@@ -87,7 +107,13 @@ fn run_as_worker_if_needed() {
|
||||
// production `run_worker` wrapper is exercised by `bin/train_ddp_mp` on dash5.
|
||||
let ctx = DdpContext::init(env.rank, env.world, env.id, env.local_rank);
|
||||
let device = Device::Cuda(env.local_rank);
|
||||
let model = build_model(test_config(), device);
|
||||
// Mirrors bin/train_ddp_mp's `cfg.dropout = dropout` wiring — the T21-for-proc
|
||||
// regression: if this line were missing (the pre-fix launcher's exact gap),
|
||||
// `cfg.dropout` would stay 0 and the GATE B test below would find a bit-
|
||||
// identical p=0 / p=0.2 loss trace and FAIL.
|
||||
let mut cfg = test_config();
|
||||
cfg.dropout = worker_dropout();
|
||||
let model = build_model(cfg, device);
|
||||
let res = train_rank(
|
||||
&ctx,
|
||||
&model,
|
||||
@@ -273,6 +299,90 @@ fn proc_per_gpu_matches_single_gpu_and_thread_path() {
|
||||
let _ = std::fs::remove_dir_all(&dump_dir);
|
||||
}
|
||||
|
||||
/// T21-for-proc regression: prove that `--dropout` actually reaches the model
/// under process-per-GPU. The pre-fix `bin/train_ddp_mp` had no `--dropout` flag
/// and never set `cfg.dropout`, so the launcher's worker built its model with
/// dropout stuck at 0 — silent identity, regardless of what the user passed. The
/// thread-per-GPU T21 fix caught the analogous gap; this test caps the same gap
/// on the proc-per-GPU path with the same GATE-B pattern (loss trajectory of a
/// p=0.2 run differs from p=0 by a large margin, well above the NCCL noise floor).
///
/// Both runs share the corpus, the initial params (via `build_model`'s deterministic
/// LCG), and every other config knob; the ONLY difference is `cfg.dropout`. If the
/// worker didn't plumb the env-provided dropout into `cfg.dropout` (the exact
/// pre-fix regression), both traces would be bit-identical and this test would FAIL.
/// The `>1e-3` threshold sits orders of magnitude above the KI-5 cross-rank ULP
/// noise floor (~1e-7 on this PCIe box), so it's a hard signal for "dropout is
/// active" rather than a noise measurement. Mirrors
/// `ddp_dropout_is_live_and_p0_bit_identical` in ddp_correctness.rs for T21's
/// thread-per-GPU fix.
///
/// What is asserted, in order:
/// 1. both rank-0 loss traces are non-empty and of equal length,
/// 2. both traces are finite (checked first, because `f32::max` skips NaN and a
///    NaN trace would otherwise surface as a misleading GATE B failure),
/// 3. GATE B: `max |loss_p0 - loss_p0.2| > 1e-3`.
///
/// Despite the name, no "p=0 matches a no-dropout reference" comparison is made
/// here; only the checks listed above are.
///
/// The launcher→worker env keys are cleared again after every launch so that
/// other tests in the same process still see "absent env → dropout 0.0".
#[test]
fn proc_per_gpu_dropout_is_live_and_p0_matches_no_dropout() {
    // When re-execed by `launch_processes`, this call takes the worker path.
    run_as_worker_if_needed();

    let world = 2usize;
    if device::device_count().unwrap_or(0) < world as i32 {
        eprintln!("skip: need >= {world} GPUs");
        return;
    }

    // Drop guard: removes the env keys this test sets, even if the launch panics,
    // so a p=0.2 setting can never leak into a later test's workers.
    struct EnvReset;
    impl Drop for EnvReset {
        fn drop(&mut self) {
            // SAFETY: same single-threaded-environment assumption as the
            // `set_var` calls below.
            unsafe {
                std::env::remove_var(ENV_DUMP_DIR);
                std::env::remove_var(ENV_DROPOUT);
            }
        }
    }

    let base_dump_dir = std::env::temp_dir().join(format!("xtrain_t21mp_{}", std::process::id()));
    std::fs::create_dir_all(&base_dump_dir).unwrap();

    // Arguments for the re-execed test binary: run exactly this test, serially,
    // with output passed through.
    let worker_args = [
        "--exact".to_string(),
        "proc_per_gpu_dropout_is_live_and_p0_matches_no_dropout".to_string(),
        "--test-threads=1".to_string(),
        "--nocapture".to_string(),
    ];

    // Helper: launch `world` workers with a specific dropout prob (via env) and
    // read rank 0's loss trace. Uses a subdir per run so the two invocations do
    // not clobber each other's dumps.
    let launch_with_dropout = |p: f32, tag: &str| -> Vec<f32> {
        let dump_dir = base_dump_dir.join(tag);
        std::fs::create_dir_all(&dump_dir).unwrap();

        // SAFETY: mutating the process environment is only sound while no other
        // thread reads or writes it.
        // NOTE(review): the `--test-threads=1` in `worker_args` governs the
        // re-execed workers; confirm the parent test run is serialized as well.
        unsafe {
            std::env::set_var(ENV_DUMP_DIR, &dump_dir);
            std::env::set_var(ENV_DROPOUT, p.to_string());
        }
        let _env_reset = EnvReset;

        launch_processes(world, &worker_args).expect("worker processes failed");
        let (losses, _) = read_dump(dump_dir.to_str().unwrap(), 0);
        losses
    };

    let loss_p0 = launch_with_dropout(0.0, "p0");
    let loss_p1 = launch_with_dropout(0.2, "p02");

    // Shape checks: `zip` below would silently truncate to the shorter trace.
    assert!(
        !loss_p0.is_empty() && !loss_p1.is_empty(),
        "rank 0 loss trace is empty (p=0: {} steps, p=0.2: {} steps)",
        loss_p0.len(),
        loss_p1.len()
    );
    assert_eq!(
        loss_p0.len(),
        loss_p1.len(),
        "p=0 and p=0.2 runs recorded a different number of loss steps"
    );

    // No NaN/Inf in either run. Must precede GATE B: `f32::max` ignores NaN, so
    // a non-finite trace would otherwise look like "no difference".
    assert!(
        loss_p0.iter().all(|l| l.is_finite()),
        "p=0 proc-per-GPU loss has non-finite values"
    );
    assert!(
        loss_p1.iter().all(|l| l.is_finite()),
        "p=0.2 proc-per-GPU loss has non-finite values"
    );

    // GATE B — dropout is LIVE under process-per-GPU with p>0. If the worker
    // didn't set `cfg.dropout` (the pre-fix gap), the two traces would match to
    // the ~1e-7 NCCL noise floor. Anything above ~1e-3 is unambiguous evidence
    // that dropout masks are actually applied in every worker's forward.
    let max_live_diff = loss_p0
        .iter()
        .zip(&loss_p1)
        .map(|(a, b)| (a - b).abs())
        .fold(0.0f32, f32::max);
    println!(
        "T21-proc GATE B (dropout live under proc-per-GPU): p0[last]={:.6} p0.2[last]={:.6} max |loss diff| = {max_live_diff:.3e}",
        loss_p0.last().expect("checked non-empty above"),
        loss_p1.last().expect("checked non-empty above")
    );
    assert!(
        max_live_diff > 1e-3,
        "p=0.2 proc-per-GPU loss matches p=0 — dropout NOT plumbed through the \
         process-per-GPU launcher (cfg.dropout stayed 0 in the worker): max |loss diff| {max_live_diff:.3e}"
    );

    // Best-effort cleanup; on an assertion failure above the dumps are left in
    // place for inspection.
    let _ = std::fs::remove_dir_all(&base_dump_dir);
}
|
||||
|
||||
fn max_rel(a: &[f32], b: &[f32]) -> f32 {
|
||||
a.iter()
|
||||
.zip(b)
|
||||
|
||||
Reference in New Issue
Block a user