post-train: M1 fix — enlarge arith key space + saturation guard

The default operand ranges (max_add=99, max_mul=12) gave only ~20k unique
problems, so 'gen_arith_task --n 20000 --eval 500' (a) made train dedup
pathologically slow near saturation and (b) made the disjoint-eval loop never
terminate. A background run stalled after ~10k train rows with no eval files.

Fix (root cause, not a workaround):
- enlarge default ranges to max_add=999, max_mul=99 (~2.01M key space) so 20k+
  requests are a tiny fraction and dedup stays trivial;
- add unique_space() + a generator guard that errors clearly when n+eval exceeds
  80% of the key space, instead of looping forever.

Verified: cargo test 10/10; full 20000/500 gen now 0.2s, all 3 files, 0
train/eval leakage; guard panics on an oversized (--max-add 99) request.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-29 23:28:25 +08:00
parent 9c70e99ae4
commit cb64604496
2 changed files with 45 additions and 5 deletions

View File

@@ -19,7 +19,7 @@ use std::fs::{self, File};
use std::io::{BufWriter, Write};
use std::path::PathBuf;
use xtrain_train::task::{GenConfig, Op, gen_problem};
use xtrain_train::task::{GenConfig, Op, gen_problem, unique_space};
fn flag<T: std::str::FromStr>(args: &[String], name: &str, default: T) -> T {
args.iter()
@@ -34,8 +34,8 @@ fn main() {
let n_train: usize = flag(&args, "--n", 20000);
let n_eval: usize = flag(&args, "--eval", 500);
let seed: u64 = flag(&args, "--seed", 1);
let max_add: i64 = flag(&args, "--max-add", 99);
let max_mul: i64 = flag(&args, "--max-mul", 12);
let max_add: i64 = flag(&args, "--max-add", 999);
let max_mul: i64 = flag(&args, "--max-mul", 99);
let out_dir: PathBuf = args
.iter()
.position(|a| a == "--out-dir")
@@ -49,6 +49,18 @@ fn main() {
max_mul,
ops: vec![Op::Add, Op::Sub, Op::Mul],
};
// Guard: train + eval are deduped (and eval is held out from train), so the
// request must fit comfortably inside the unique key space. Cap at 80% to keep
// dedup fast and the disjoint-eval loop terminating.
let space = unique_space(&cfg);
let need = (n_train + n_eval) as u64;
assert!(
need * 5 <= space * 4,
"requested {need} unique problems but the space is only {space} \
(max_add={max_add}, max_mul={max_mul}); raise --max-add/--max-mul or lower --n/--eval"
);
let mut rng = seed.max(1);
// Train: dedup so the same problem is not repeated and so eval can be held out.

View File

@@ -84,13 +84,27 @@ pub struct GenConfig {
/// Default generation config: all three ops (`Add`, `Sub`, `Mul`) enabled.
///
/// NOTE(review): this listing is a diff rendering with the +/- markers stripped, so
/// both the pre-change and the post-change operand bounds appear in the literal below.
impl Default for GenConfig {
fn default() -> Self {
Self {
// Old bounds (99 / 12) — presumably the removed side of the hunk; the commit
// message describes them as yielding only ~20k unique problems.
max_add: 99,
max_mul: 12,
// New bounds (999 / 99) — presumably the added side of the hunk; the commit
// message describes them as yielding a ~2.01M key space.
max_add: 999,
max_mul: 99,
ops: vec![Op::Add, Op::Sub, Op::Mul],
}
}
}
/// Number of distinct problems this config can produce (the key space).
///
/// Every op listed in `cfg.ops` contributes `(max + 1)^2` ordered `(a, b)` operand
/// pairs with both operands in `[0, max]`, where `max` is `cfg.max_mul` for
/// [`Op::Mul`] and `cfg.max_add` for every other op. The per-op counts are summed.
///
/// Used to guard the dedup generator against requesting more unique problems than
/// exist — otherwise train/eval dedup loops near saturation get pathologically slow
/// or, for a disjoint eval, never terminate.
///
/// Edge cases:
/// - A negative bound describes an empty operand range, so that op contributes `0`
///   (instead of wrapping to a huge value through an `as u64` cast).
/// - All arithmetic saturates at `u64::MAX` rather than overflowing, so an absurdly
///   large bound can never wrap around to a misleadingly small space. Callers that
///   scale the result should therefore use saturating or widened arithmetic.
/// - An empty `cfg.ops` yields `0`.
pub fn unique_space(cfg: &GenConfig) -> u64 {
    cfg.ops
        .iter()
        .map(|op| {
            let max = if *op == Op::Mul { cfg.max_mul } else { cfg.max_add };
            if max < 0 {
                // Empty range [0, max]: no operand pairs exist for this op.
                0
            } else {
                // `max >= 0`, so the cast is lossless; ordered (a, b) pairs in [0, max].
                (max as u64).saturating_add(1).saturating_pow(2)
            }
        })
        .fold(0u64, u64::saturating_add)
}
/// Sample one problem deterministically from the LCG state `rng`. Operands are drawn
/// in `[0, max]` per the op; subtraction may yield a negative answer (the checker /
/// parser handle a leading `-`).
@@ -198,6 +212,20 @@ mod tests {
}
}
#[test]
fn unique_space_counts_ordered_pairs_per_op() {
    // Tiny bounds so the expected count can be verified by hand: operands range
    // over [0, 9] for add/sub (10 * 10 pairs each) and [0, 4] for mul (5 * 5 pairs).
    let small = GenConfig {
        max_add: 9,
        max_mul: 4,
        ops: vec![Op::Add, Op::Sub, Op::Mul],
    };
    let expected: u64 = 10 * 10 + 10 * 10 + 5 * 5;
    assert_eq!(unique_space(&small), expected);

    // The shipped defaults must span well over a million problems, so a 20k
    // request is a small fraction of the space and dedup stays cheap.
    let default_space = unique_space(&GenConfig::default());
    assert!(default_space > 1_000_000);
}
#[test]
fn generation_is_deterministic_from_seed() {
let cfg = GenConfig::default();