train: --eval-ckpt eval-only mode (v0-vs-v1 same-set val loss)

Expose eval_loss() and add a --eval-ckpt <path> branch to bin/train: load an
existing checkpoint into a model of the given arch, score it on the held-out
val split, call sample_some(), then exit. Lets v0 and v1 be measured on the
identical validation set (val loss is the acceptance metric) without a separate
eval binary.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 18:44:40 +08:00
parent e44e50ef78
commit ec8114ecbc
3 changed files with 17 additions and 3 deletions

View File

@@ -167,6 +167,20 @@ fn main() {
}
});
// Eval-only mode: load a checkpoint and score it on the held-out val set, then
// exit. Used to put an EXISTING model (e.g. v0) and a new one on the same
// metric — the v0-vs-v1 val-loss comparison. The arch flags must match the ckpt.
if let Some(p) = args.iter().position(|a| a == "--eval-ckpt") {
let ckpt_path = PathBuf::from(args.get(p + 1).expect("--eval-ckpt <path>"));
xtrain_train::checkpoint::load_into(&ckpt_path, &model.params())
.expect("load eval checkpoint");
let v = valid.expect("--eval-ckpt needs --val-tokens > 0");
let vl = xtrain_train::eval_loss(&model, device, &v, seq_len, eval_batches);
println!("eval-only: {} → val loss {vl:.4}", ckpt_path.display());
sample_some(&model, device, &tok_path);
return;
}
let tcfg = TrainConfig {
seq_len,
batch_size,

View File

@@ -19,4 +19,4 @@ pub mod sample;
mod train_loop;
#[cfg(not(no_cuda))]
pub use train_loop::{TrainConfig, TrainResult, train};
pub use train_loop::{TrainConfig, TrainResult, eval_loss, train};

View File

@@ -153,8 +153,8 @@ pub fn train(
/// Mean cross-entropy over `batches` deterministic, non-overlapping windows of
/// the validation corpus (no backward — eval only). Deterministic so val loss is
/// comparable across steps and runs.
fn eval_loss(
/// comparable across steps and runs (and across models — the v0-vs-v1 metric).
pub fn eval_loss(
model: &TinyTransformer,
device: Device,
valid: &Corpus,