dropout: autodiff op + fixed-seed grad-check (T18)

ops::dropout(x,p,seed): fwd runs Tensor::dropout, caches the mask in the backward closure, bwd pushes dx=d⊙mask. p==0 returns x.clone() (no node) so the default graph is unchanged. Tests in autograd.rs: fixed-seed finite-diff grad-check (mask held constant across the ± perturbation — dropout is a fixed elementwise linear map of x); E[out]≈input + keep-rate≈1-p over a seed sweep; p=0 kernel identity. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-18 00:05:32 +08:00
parent 1fdd0c5002
commit 5eb27783f8
2 changed files with 115 additions and 0 deletions
--- a/crates/xtrain-autodiff/src/ops.rs
+++ b/crates/xtrain-autodiff/src/ops.rs
@@ -140,6 +140,31 @@ pub fn swiglu(gate: &Var, up: &Var) -> Var {
    mul(&silu(gate), up)
 }
 /// Inverted dropout as a differentiable op (Phase T18).
 ///
 /// Every element of `x` is zeroed with probability `p`; the survivors are
 /// multiplied by `1/(1-p)` so that `E[out] == x`. The keep/drop decision comes
 /// from a counter-based RNG keyed on `(seed, element index)`, which means the
 /// mask is a pure function of `seed`: re-running with the same `seed` (the T13
 /// recompute pass, or the ± evaluations of a finite-difference grad-check)
 /// reproduces the identical mask.
 ///
 /// The forward pass returns the per-element scale `mask` alongside the output;
 /// the backward closure owns that mask and applies it to the incoming gradient
 /// (`dx = d ⊙ mask`). For a fixed seed the op is therefore a fixed elementwise
 /// linear map of `x`.
 ///
 /// `p == 0` short-circuits to `x.clone()` without creating a graph node, so a
 /// model with dropout disabled builds exactly the same graph as one that never
 /// calls this function. Eval-time identity is the caller's job: simply do not
 /// call `dropout` (the model's train/eval switch).
 pub fn dropout(x: &Var, p: f32, seed: u64) -> Var {
    if p == 0.0 {
        // Disabled: hand back the input handle itself, adding nothing to the graph.
        x.clone()
    } else {
        let (dropped, scale_mask) = x.value().dropout(p, seed);
        Var::from_op(
            dropped,
            vec![x.clone()],
            // The closure takes ownership of `scale_mask` so backward reuses the
            // exact mask that forward drew.
            Box::new(move |upstream, inputs| {
                Var::push_grad(&inputs[0], Tensor::dropout_backward(upstream, &scale_mask));
            }),
        )
    }
 }
 /// RoPE (rotate_half) over `x:[tokens,heads,head_dim]` with per-sequence position
 /// `row % period` (`period` = sequence length; `period == tokens` for a single
 /// sequence). Orthogonal map, so the backward is the inverse rotation of `dy` — no
--- a/crates/xtrain-autodiff/tests/autograd.rs
+++ b/crates/xtrain-autodiff/tests/autograd.rs
@@ -625,6 +625,96 @@ fn attention_batched_bwd() {
    );
 }
 // ---- dropout (Phase T18) ----
 //
 // Gradient check for dropout with a pinned seed. The mask is a function of
 // (seed, index) only — never of x — so with `SEED` held constant the op is the
 // fixed elementwise linear map `out_i = c_i·x_i`. Both the +ε and the -ε
 // evaluation of the central difference therefore see the SAME mask, and the
 // numeric derivative is well defined. The numeric-side closure below rebuilds
 // the op with the same `SEED` on every call, which is what keeps the mask fixed.
 #[test]
 fn dropout_bwd() {
    require_gpu();
    const SEED: u64 = 0xD120_FE5E;
    let p = 0.3f32;
    let rows = 16;
    let cols = 12;
    let numel = rows * cols;
    let x_host = fill(numel, 71);
    let weights = fill(numel, 72);

    // Analytic side: one forward/backward through the op.
    let x = Var::leaf(cuda(&x_host, &[rows, cols]));
    let dropped = ops::dropout(&x, p, SEED);
    scalar_loss(&dropped, &weights).backward();
    let analytic_dx = x.grad().unwrap().to_device(Device::Cpu);

    // Numeric side: scalar loss as a function of the raw input values.
    let weights_fd = weights.clone();
    let loss_of = move |v: &[f32], s: &[usize]| {
        let y = ops::dropout(&Var::leaf(cuda(v, s)), p, SEED);
        weighted_sum(&y.value(), &weights_fd)
    };

    report(
        "dropout dX",
        &grad_check(
            &x_host,
            &[rows, cols],
            &loss_of,
            analytic_dx.as_slice::<f32>(),
            cfg_linear(),
        ),
    );
 }
 // Statistical sanity check of the dropout kernel. Using a large all-ones input
 // and several seeds, the average output value should stay close to the input
 // mean (the 1/(1-p) rescale keeps E[out] ≈ x) and the fraction of non-zero mask
 // entries should stay close to 1-p (the RNG behaves like a Bernoulli draw).
 #[test]
 fn dropout_expectation_and_keep_rate() {
    require_gpu();
    let p = 0.25f32;
    let n = 200_000usize;
    let trials = 8;

    // Constant input: mean(x) = 1, so mean(out) should land near 1 as well.
    let ones = vec![1.0f32; n];
    let x = cuda(&ones, &[n]);

    let mut sum_of_means = 0.0f64;
    let mut sum_of_keep_fracs = 0.0f64;
    for trial in 0..trials {
        // A distinct seed per trial gives an independent mask each time.
        let seed = 0x5EED_0000 + trial as u64;
        let (out, mask) = x.dropout(p, seed);
        let out_cpu = out.to_device(Device::Cpu);
        let mask_cpu = mask.to_device(Device::Cpu);

        let out_total: f64 = out_cpu.as_slice::<f32>().iter().map(|&v| v as f64).sum();
        let survivors = mask_cpu
            .as_slice::<f32>()
            .iter()
            .filter(|&&scale| scale != 0.0)
            .count();

        sum_of_means += out_total / n as f64;
        sum_of_keep_fracs += survivors as f64 / n as f64;
    }

    let mean_out = sum_of_means / trials as f64;
    let keep_rate = sum_of_keep_fracs / trials as f64;
    println!(
        "dropout p={p}: E[out]={mean_out:.5} (input mean 1.0), keep_rate={keep_rate:.5} (1-p={:.3})",
        1.0 - p
    );

    let mean_err = (mean_out - 1.0).abs();
    assert!(
        mean_err < 0.01,
        "E[out] {mean_out} not ≈ input mean 1.0 (inverted scaling broken)"
    );
    let keep_err = (keep_rate - (1.0 - p) as f64).abs();
    assert!(
        keep_err < 0.01,
        "keep_rate {keep_rate} not ≈ 1-p {}",
        1.0 - p
    );
 }
 // p=0 must be a no-op at both levels (the default-graph regression guard at the
 // op level; the model-level bit-identity is in xtrain-model/tests/dropout.rs):
 //   1. kernel: `Tensor::dropout(0.0, seed)` returns an output bit-identical to x;
 //   2. op: `ops::dropout(x, 0.0, seed)` returns x itself (no node), so its value
 //      is bit-identical to x and the gradient of L = sum(W ∘ out) w.r.t. x is W.
 // Lengths are asserted before every zip so a short/empty result cannot pass
 // vacuously, and values are compared by bit pattern (not float `==`) so the
 // "bit-identical" claim is what is actually checked.
 #[test]
 fn dropout_p0_is_identity() {
    require_gpu();
    let (m, n) = (8, 5);
    let x_h = fill(m * n, 91);

    // --- kernel level ---
    let x = cuda(&x_h, &[m, n]);
    let (out, _mask) = x.dropout(0.0, 12345);
    let out_h = out.to_device(Device::Cpu);
    let out_s = out_h.as_slice::<f32>();
    assert_eq!(
        x_h.len(),
        out_s.len(),
        "p=0 dropout must preserve the element count"
    );
    for (a, b) in x_h.iter().zip(out_s) {
        assert_eq!(a.to_bits(), b.to_bits(), "p=0 dropout must be identity");
    }

    // --- op level ---
    let w = fill(m * n, 92);
    let xv = Var::leaf(cuda(&x_h, &[m, n]));
    let y = ops::dropout(&xv, 0.0, 12345);
    let y_h = y.value().to_device(Device::Cpu);
    let y_s = y_h.as_slice::<f32>();
    assert_eq!(
        x_h.len(),
        y_s.len(),
        "p=0 ops::dropout must preserve the element count"
    );
    for (a, b) in x_h.iter().zip(y_s) {
        assert_eq!(a.to_bits(), b.to_bits(), "p=0 ops::dropout must be identity");
    }

    // With out == x, dL/dx for L = sum(W ∘ out) is exactly W. A tiny tolerance
    // absorbs any rounding inside the loss's own backward.
    scalar_loss(&y, &w).backward();
    let dx = xv.grad().unwrap().to_device(Device::Cpu);
    let dx_s = dx.as_slice::<f32>();
    assert_eq!(
        w.len(),
        dx_s.len(),
        "p=0 ops::dropout grad must have the input's element count"
    );
    for (g, wi) in dx_s.iter().zip(w.iter()) {
        assert!(
            (g - wi).abs() <= 1e-6,
            "p=0 ops::dropout grad must pass straight through: got {g}, expected {wi}"
        );
    }
 }
 // --- test helpers ---
 // Scalar loss node L = sum(W ∘ out): wraps a fixed-weight Var and reduces. We