test: T21-for-proc — clear ENV_DROPOUT across tests to sever ordering coupling
libtest with --test-threads=1 (the documented invariant for this file's DDP
tests) runs tests alphabetically. The new
proc_per_gpu_dropout_is_live_and_p0_matches_no_dropout ('d') runs BEFORE
proc_per_gpu_matches_single_gpu_and_thread_path ('m'). It sets ENV_DROPOUT=0.2
via std::env::set_var; if left in place, the correctness test's spawned workers
would inherit it (Command inherits parent env by default) and build with
cfg.dropout=0.2 while its single-GPU baseline (run_single_gpu → test_config →
dropout=0) stays at 0 — GATE (a) `max_rel_single < 1e-3` would blow up by
orders of magnitude.
Two defenses:
- correctness test remove_var(ENV_DROPOUT) before spawn (belt): even if the
dropout test forgot to clean up, this test starts from a clean env.
- dropout test remove_var(ENV_DROPOUT, ENV_DUMP_DIR) at exit (suspenders):
keep the invariant "each test leaves the env as it found it" so any future
test added after these two starts clean too.
Same --test-threads=1 SAFETY comment applies (no concurrent env access).
This commit is contained in:
@@ -229,8 +229,16 @@ fn proc_per_gpu_matches_single_gpu_and_thread_path() {
|
|||||||
let dump_dir = std::env::temp_dir().join(format!("xtrain_t17_{}", std::process::id()));
|
let dump_dir = std::env::temp_dir().join(format!("xtrain_t17_{}", std::process::id()));
|
||||||
std::fs::create_dir_all(&dump_dir).unwrap();
|
std::fs::create_dir_all(&dump_dir).unwrap();
|
||||||
// SAFETY: single-threaded test (forced by --test-threads=1) sets this env
|
// SAFETY: single-threaded test (forced by --test-threads=1) sets this env
|
||||||
// before spawning workers; no concurrent env access.
|
// before spawning workers; no concurrent env access. ENV_DROPOUT is cleared
|
||||||
|
// defensively — libtest orders `--test-threads=1` runs alphabetically, so the
|
||||||
|
// sibling `proc_per_gpu_dropout_is_live_...` test (starts with 'd') runs BEFORE
|
||||||
|
// this one (starts with 'm'). If it happened to leak `ENV_DROPOUT=0.2` in this
|
||||||
|
// process's env, the workers here would inherit it (Command inherits parent
|
||||||
|
// env by default) and build with dropout=0.2 while the single-GPU baseline
|
||||||
|
// (run_single_gpu → test_config → dropout=0) stays at 0 — GATE (a) would blow up.
|
||||||
|
// Explicit remove here severs that ordering coupling.
|
||||||
unsafe {
|
unsafe {
|
||||||
|
std::env::remove_var(ENV_DROPOUT);
|
||||||
std::env::set_var(ENV_DUMP_DIR, &dump_dir);
|
std::env::set_var(ENV_DUMP_DIR, &dump_dir);
|
||||||
}
|
}
|
||||||
// Re-exec the test binary but run ONLY this test, single-threaded, so the
|
// Re-exec the test binary but run ONLY this test, single-threaded, so the
|
||||||
@@ -380,6 +388,16 @@ fn proc_per_gpu_dropout_is_live_and_p0_matches_no_dropout() {
|
|||||||
"p=0.2 proc-per-GPU loss has non-finite values"
|
"p=0.2 proc-per-GPU loss has non-finite values"
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Clear the launcher→worker env keys so we don't leak state to anything that
|
||||||
|
// runs later in this process. `proc_per_gpu_matches_single_gpu_and_thread_path`
|
||||||
|
// clears ENV_DROPOUT defensively too, but keeping the invariant "each test
|
||||||
|
// leaves the env as it found it" costs nothing.
|
||||||
|
// SAFETY: single-threaded test (forced by --test-threads=1); no concurrent env access.
|
||||||
|
unsafe {
|
||||||
|
std::env::remove_var(ENV_DROPOUT);
|
||||||
|
std::env::remove_var(ENV_DUMP_DIR);
|
||||||
|
}
|
||||||
|
|
||||||
let _ = std::fs::remove_dir_all(&base_dump_dir);
|
let _ = std::fs::remove_dir_all(&base_dump_dir);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user