post-train: M1 fix — enlarge arith key space + saturation guard

The default operand ranges (max_add=99, max_mul=12) gave only ~20k unique problems (2 x 100^2 + 13^2 = 20,169), so 'gen_arith_task --n 20000 --eval 500' (a) made train dedup pathologically slow near saturation and (b) made the disjoint-eval loop never terminate, because fewer than 500 unused problems remained once train was filled. A background run stalled after ~10k train rows with no eval files.

Fix (root cause, not a workaround):
- enlarge the default ranges to max_add=999, max_mul=99 (~2.01M key space), so requests of 20k+ are a tiny fraction of the space and dedup stays trivial;
- add unique_space() plus a generator guard that fails with a clear error when n+eval exceeds 80% of the key space, instead of looping forever.

Verified: cargo test 10/10; the full 20000/500 generation now takes 0.2s and writes all 3 files with 0 train/eval leakage; the guard panics on an oversized (--max-add 99) request.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-29 23:28:25 +08:00
parent 9c70e99ae4
commit cb64604496
2 changed files with 45 additions and 5 deletions
--- a/crates/xtrain-train/src/bin/gen_arith_task.rs
+++ b/crates/xtrain-train/src/bin/gen_arith_task.rs
@@ -19,7 +19,7 @@ use std::fs::{self, File};
 use std::io::{BufWriter, Write};
 use std::path::PathBuf;

-use xtrain_train::task::{GenConfig, Op, gen_problem};
+use xtrain_train::task::{GenConfig, Op, gen_problem, unique_space};

 fn flag<T: std::str::FromStr>(args: &[String], name: &str, default: T) -> T {
    args.iter()
@@ -34,8 +34,8 @@ fn main() {
    let n_train: usize = flag(&args, "--n", 20000);
    let n_eval: usize = flag(&args, "--eval", 500);
    let seed: u64 = flag(&args, "--seed", 1);
-    let max_add: i64 = flag(&args, "--max-add", 99);
-    let max_mul: i64 = flag(&args, "--max-mul", 12);
+    let max_add: i64 = flag(&args, "--max-add", 999);
+    let max_mul: i64 = flag(&args, "--max-mul", 99);
    let out_dir: PathBuf = args
        .iter()
        .position(|a| a == "--out-dir")
@@ -49,6 +49,18 @@ fn main() {
        max_mul,
        ops: vec![Op::Add, Op::Sub, Op::Mul],
    };
+
+    // Guard: train + eval are deduped (and eval is held out from train), so the
+    // request must fit comfortably inside the unique key space. Cap at 80% to keep
+    // dedup fast and the disjoint-eval loop terminating.
+    let space = unique_space(&cfg);
+    let need = (n_train + n_eval) as u64;
+    assert!(
+        need * 5 <= space * 4,
+        "requested {need} unique problems but the space is only {space} \
+         (max_add={max_add}, max_mul={max_mul}); raise --max-add/--max-mul or lower --n/--eval"
+    );
+
    let mut rng = seed.max(1);

    // Train: dedup so the same problem is not repeated and so eval can be held out.
--- a/crates/xtrain-train/src/task.rs
+++ b/crates/xtrain-train/src/task.rs
@@ -84,13 +84,27 @@ pub struct GenConfig {
 impl Default for GenConfig {
    fn default() -> Self {
        Self {
-            max_add: 99,
-            max_mul: 12,
+            max_add: 999,
+            max_mul: 99,
            ops: vec![Op::Add, Op::Sub, Op::Mul],
        }
    }
 }

+/// Number of distinct problems this config can produce (the key space). Used to
+/// guard the dedup generator against requesting more unique problems than exist —
+/// otherwise train/eval dedup loops near saturation get pathologically slow or, for
+/// a disjoint eval, never terminate.
+pub fn unique_space(cfg: &GenConfig) -> u64 {
+    cfg.ops
+        .iter()
+        .map(|op| {
+            let max = if *op == Op::Mul { cfg.max_mul } else { cfg.max_add };
+            ((max as u64) + 1).pow(2) // ordered (a, b) pairs in [0, max]
+        })
+        .sum()
+}
+
 /// Sample one problem deterministically from the LCG state `rng`. Operands are drawn
 /// in `[0, max]` per the op; subtraction may yield a negative answer (the checker /
 /// parser handle a leading `-`).
@@ -198,6 +212,20 @@ mod tests {
        }
    }

+    #[test]
+    fn unique_space_counts_ordered_pairs_per_op() {
+        // add+sub+mul each contribute (max+1)^2 ordered pairs.
+        let cfg = GenConfig {
+            max_add: 9,
+            max_mul: 4,
+            ops: vec![Op::Add, Op::Sub, Op::Mul],
+        };
+        assert_eq!(unique_space(&cfg), 100 + 100 + 25);
+        // The shipped default is comfortably large (millions), so 20k requests are
+        // a tiny fraction and dedup stays fast.
+        assert!(unique_space(&GenConfig::default()) > 1_000_000);
+    }
+
    #[test]
    fn generation_is_deterministic_from_seed() {
        let cfg = GenConfig::default();