From 6465a2d5ceb2d72a6e580d05d63ca865a679fc8d Mon Sep 17 00:00:00 2001 From: Gahow Wang Date: Wed, 1 Jul 2026 14:09:42 +0800 Subject: [PATCH] =?UTF-8?q?test:=20T21-for-proc=20=E2=80=94=20clear=20ENV?= =?UTF-8?q?=5FDROPOUT=20across=20tests=20to=20sever=20ordering=20coupling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit libtest with --test-threads=1 (the documented invariant for this file's DDP tests) runs tests alphabetically. The new proc_per_gpu_dropout_is_live_and_p0_matches_no_dropout ('d') runs BEFORE proc_per_gpu_matches_single_gpu_and_thread_path ('m'). It sets ENV_DROPOUT=0.2 via std::env::set_var; if left in place, the correctness test's spawned workers would inherit it (Command inherits parent env by default) and build with cfg.dropout=0.2 while its single-GPU baseline (run_single_gpu → test_config → dropout=0) stays at 0 — GATE (a) `max_rel_single < 1e-3` would blow up by orders of magnitude. Two defenses: - correctness test remove_var(ENV_DROPOUT) before spawn (belt): even if the dropout test forgot to clean up, this test starts from a clean env. - dropout test remove_var(ENV_DROPOUT, ENV_DUMP_DIR) at exit (suspenders): keep the invariant "each test leaves the env as it found it" so any future test added after these two starts clean too. Same --test-threads=1 SAFETY comment applies (no concurrent env access). --- crates/xtrain-distributed/tests/ddp_proc.rs | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/crates/xtrain-distributed/tests/ddp_proc.rs b/crates/xtrain-distributed/tests/ddp_proc.rs index 5f1e2d5..7c30f9f 100644 --- a/crates/xtrain-distributed/tests/ddp_proc.rs +++ b/crates/xtrain-distributed/tests/ddp_proc.rs @@ -229,8 +229,16 @@ fn proc_per_gpu_matches_single_gpu_and_thread_path() { let dump_dir = std::env::temp_dir().join(format!("xtrain_t17_{}", std::process::id())); std::fs::create_dir_all(&dump_dir).unwrap(); // SAFETY: single-threaded test (forced by --test-threads=1) sets this env - // before spawning workers; no concurrent env access. + // before spawning workers; no concurrent env access. ENV_DROPOUT is cleared + // defensively — libtest orders `--test-threads=1` runs alphabetically, so the + // sibling `proc_per_gpu_dropout_is_live_...` test (starts with 'd') runs BEFORE + // this one (starts with 'm'). If it happened to leak `ENV_DROPOUT=0.2` in this + // process's env, the workers here would inherit it (Command inherits parent + // env by default) and build with dropout=0.2 while the single-GPU baseline + // (run_single_gpu → test_config → dropout=0) stays at 0 — GATE (a) would blow up. + // Explicit remove here severs that ordering coupling. unsafe { + std::env::remove_var(ENV_DROPOUT); std::env::set_var(ENV_DUMP_DIR, &dump_dir); } // Re-exec the test binary but run ONLY this test, single-threaded, so the @@ -380,6 +388,16 @@ fn proc_per_gpu_dropout_is_live_and_p0_matches_no_dropout() { "p=0.2 proc-per-GPU loss has non-finite values" ); + // Clear the launcher→worker env keys so we don't leak state to anything that + // runs later in this process. `proc_per_gpu_matches_single_gpu_and_thread_path` + // clears ENV_DROPOUT defensively too, but keeping the invariant "each test + // leaves the env as it found it" costs nothing. + // SAFETY: single-threaded test (forced by --test-threads=1); no concurrent env access. + unsafe { + std::env::remove_var(ENV_DROPOUT); + std::env::remove_var(ENV_DUMP_DIR); + } + let _ = std::fs::remove_dir_all(&base_dump_dir); }