diff --git a/crates/xtrain-distributed/tests/ddp_proc.rs b/crates/xtrain-distributed/tests/ddp_proc.rs index 5f1e2d5..7c30f9f 100644 --- a/crates/xtrain-distributed/tests/ddp_proc.rs +++ b/crates/xtrain-distributed/tests/ddp_proc.rs @@ -229,8 +229,16 @@ fn proc_per_gpu_matches_single_gpu_and_thread_path() { let dump_dir = std::env::temp_dir().join(format!("xtrain_t17_{}", std::process::id())); std::fs::create_dir_all(&dump_dir).unwrap(); // SAFETY: single-threaded test (forced by --test-threads=1) sets this env - // before spawning workers; no concurrent env access. + // before spawning workers; no concurrent env access. ENV_DROPOUT is cleared + // defensively — libtest orders `--test-threads=1` runs alphabetically, so the + // sibling `proc_per_gpu_dropout_is_live_...` test (starts with 'd') runs BEFORE + // this one (starts with 'm'). If it happened to leak `ENV_DROPOUT=0.2` in this + // process's env, the workers here would inherit it (Command inherits parent + // env by default) and build with dropout=0.2 while the single-GPU baseline + // (run_single_gpu → test_config → dropout=0) stays at 0 — GATE (a) would blow up. + // Explicit remove here severs that ordering coupling. unsafe { + std::env::remove_var(ENV_DROPOUT); std::env::set_var(ENV_DUMP_DIR, &dump_dir); } // Re-exec the test binary but run ONLY this test, single-threaded, so the @@ -380,6 +388,16 @@ fn proc_per_gpu_dropout_is_live_and_p0_matches_no_dropout() { "p=0.2 proc-per-GPU loss has non-finite values" ); + // Clear the launcher→worker env keys so we don't leak state to anything that + // runs later in this process. `proc_per_gpu_matches_single_gpu_and_thread_path` + // clears ENV_DROPOUT defensively too, but keeping the invariant "each test + // leaves the env as it found it" costs nothing. + // SAFETY: single-threaded test (forced by --test-threads=1); no concurrent env access. + unsafe { + std::env::remove_var(ENV_DROPOUT); + std::env::remove_var(ENV_DUMP_DIR); + } + let _ = std::fs::remove_dir_all(&base_dump_dir); }