test: resolve real_training corpus default via CARGO_MANIFEST_DIR

cargo runs tests with cwd set to the crate dir, so the bare relative default
`data/tinystories-valid-3mb.txt` didn't resolve. Anchor it to the repo root by
building the path from `CARGO_MANIFEST_DIR` (the crate dir) plus `../../data/...`,
so the test runs out of the box (still overridable with `XTRAIN_CORPUS`).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 16:41:12 +08:00
parent 2f8118fda9
commit 5df1d4d57b

View File

@@ -46,9 +46,15 @@ fn trains_on_tinystories() {
std::env::var("XTRAIN_TOKENIZER") std::env::var("XTRAIN_TOKENIZER")
.unwrap_or_else(|_| "/opt/wjh/models/gpt2/tokenizer.json".into()), .unwrap_or_else(|_| "/opt/wjh/models/gpt2/tokenizer.json".into()),
); );
let corpus_path = PathBuf::from( // Default resolves relative to the repo root (cargo runs tests with cwd =
std::env::var("XTRAIN_CORPUS").unwrap_or_else(|_| "data/tinystories-valid-3mb.txt".into()), // crate dir, so `../../data/...` from crates/xtrain-train); override with
); // XTRAIN_CORPUS for any other location.
let corpus_path = PathBuf::from(std::env::var("XTRAIN_CORPUS").unwrap_or_else(|_| {
format!(
"{}/../../data/tinystories-valid-3mb.txt",
env!("CARGO_MANIFEST_DIR")
)
}));
let corpus = Corpus::load(&tok_path, &corpus_path); let corpus = Corpus::load(&tok_path, &corpus_path);
println!( println!(