dropout: autodiff op + fixed-seed grad-check (T18)

ops::dropout(x,p,seed): fwd runs Tensor::dropout, caches the mask in the backward closure, bwd pushes dx=d⊙mask. p==0 returns x.clone() (no node) so the default graph is unchanged. Tests in autograd.rs: fixed-seed finite-diff grad-check (mask held constant across the ± perturbation — dropout is a fixed elementwise linear map of x); E[out]≈input and keep-rate≈1-p over a seed sweep; p=0 kernel identity. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-18 00:05:32 +08:00
parent 1fdd0c5002
commit 5eb27783f8
2 changed files with 115 additions and 0 deletions
--- a/crates/xtrain-autodiff/src/ops.rs
+++ b/crates/xtrain-autodiff/src/ops.rs
@@ -140,6 +140,31 @@ pub fn swiglu(gate: &Var, up: &Var) -> Var {
    mul(&silu(gate), up)
 }

+/// Inverted dropout (Phase T18): zeroes each element with probability `p` and
+/// scales the kept ones by `1/(1-p)`, so `E[out] == x`. The keep/drop mask is
+/// drawn by a counter-based RNG from `(seed, element index)`, so it is fully
+/// determined by `seed` (same `seed` ⇒ same mask: stable across the T13 recompute
+/// re-run, and held fixed across the ± perturbation of a finite-diff grad-check).
+/// Forward caches the per-element scale `mask`; **backward applies the same mask**
+/// (`dx = d ⊙ mask`), making dropout a fixed elementwise linear map of `x`.
+///
+/// `p == 0` is a no-op: returns `x.clone()` (no node added) so the default graph
+/// is bit-identical to the no-dropout path. Eval-time identity is handled by the
+/// caller simply not invoking dropout (the model's train/eval switch).
+pub fn dropout(x: &Var, p: f32, seed: u64) -> Var {
+    if p == 0.0 { // NOTE(review): only p == 0 is special-cased; p's range is not checked here
+        return x.clone(); // no-op path: no `from_op` node is built
+    }
+    let (out, mask) = x.value().dropout(p, seed); // forward result + mask for backward
+    Var::from_op(
+        out,
+        vec![x.clone()], // single parent: the input
+        Box::new(move |d, parents| { // `move`: the closure owns `mask`
+            Var::push_grad(&parents[0], Tensor::dropout_backward(d, &mask));
+        }),
+    )
+}
+
 /// RoPE (rotate_half) over `x:[tokens,heads,head_dim]` with per-sequence position
 /// `row % period` (`period` = sequence length; `period == tokens` for a single
 /// sequence). Orthogonal map, so the backward is the inverse rotation of `dy` — no
--- a/crates/xtrain-autodiff/tests/autograd.rs
+++ b/crates/xtrain-autodiff/tests/autograd.rs
@@ -625,6 +625,96 @@ fn attention_batched_bwd() {
    );
 }

+// ---- dropout (Phase T18) ----
+//
+// Fixed-seed finite-diff grad-check. Under a fixed `seed` the mask is constant
+// (it depends only on (seed, index), NOT on x), so dropout is a fixed elementwise
+// linear map `out_i = c_i·x_i` and the central difference of L is well-defined:
+// the ± perturbation of each x_i sees the SAME mask. The forward function in the
+// closure calls `ops::dropout(x, p, SEED)` with the same SEED, so it reproduces
+// the same mask both times.
+#[test]
+fn dropout_bwd() {
+    require_gpu();
+    const SEED: u64 = 0xD120_FE5E;
+    let p = 0.3f32;
+    let (m, n) = (16, 12);
+    let x_h = fill(m * n, 71);
+    let w = fill(m * n, 72);
+
+    let x = Var::leaf(cuda(&x_h, &[m, n]));
+    let out = ops::dropout(&x, p, SEED);
+    scalar_loss(&out, &w).backward(); // analytic gradient via the op's backward closure
+    let dx = x.grad().unwrap().to_device(Device::Cpu); // bring the gradient to the host
+
+    let wf = w.clone();
+    let lx = move |v: &[f32], s: &[usize]| {
+        let o = ops::dropout(&Var::leaf(cuda(v, s)), p, SEED); // same SEED on every call
+        weighted_sum(&o.value(), &wf)
+    };
+    report(
+        "dropout dX",
+        &grad_check(&x_h, &[m, n], &lx, dx.as_slice::<f32>(), cfg_linear()),
+    );
+}
+
+// Inverted-dropout expectation + keep-rate check. Over a large all-ones tensor and
+// a sweep of seeds, mean(dropout(x)) tracks mean(x) (E[out] ≈ x via the inverted
+// 1/(1-p) scaling), and the nonzero-mask fraction tracks 1-p (the RNG is ~Bernoulli).
+#[test]
+fn dropout_expectation_and_keep_rate() {
+    require_gpu();
+    let p = 0.25f32;
+    let n = 200_000usize;
+    let x_h = vec![1.0f32; n]; // mean(x) = 1 → mean(out) should ≈ 1
+    let x = cuda(&x_h, &[n]);
+
+    let trials = 8;
+    let mut mean_out_acc = 0.0f64;
+    let mut keep_acc = 0.0f64;
+    for t in 0..trials { // one distinct seed per trial
+        let (out, mask) = x.dropout(p, 0x5EED_0000 + t as u64);
+        let out_h = out.to_device(Device::Cpu);
+        let mask_h = mask.to_device(Device::Cpu);
+        let mean_out: f64 =
+            out_h.as_slice::<f32>().iter().map(|&v| v as f64).sum::<f64>() / n as f64;
+        let kept = mask_h.as_slice::<f32>().iter().filter(|&&m| m != 0.0).count();
+        mean_out_acc += mean_out;
+        keep_acc += kept as f64 / n as f64; // per-trial fraction of nonzero mask entries
+    }
+    let mean_out = mean_out_acc / trials as f64; // averages over all trials
+    let keep_rate = keep_acc / trials as f64;
+    println!(
+        "dropout p={p}: E[out]={mean_out:.5} (input mean 1.0), keep_rate={keep_rate:.5} (1-p={:.3})",
+        1.0 - p
+    );
+    assert!(
+        (mean_out - 1.0).abs() < 0.01,
+        "E[out] {mean_out} not ≈ input mean 1.0 (inverted scaling broken)"
+    );
+    assert!(
+        (keep_rate - (1.0 - p) as f64).abs() < 0.01,
+        "keep_rate {keep_rate} not ≈ 1-p {}",
+        1.0 - p
+    );
+}
+
+// p=0 kernel identity: the tensor-level `dropout(0.0, seed)` must return x exactly.
+// This calls the kernel directly, not `ops::dropout`, and checks no gradient. (The
+// model-level bit-identity is in xtrain-model/tests/dropout.rs.)
+#[test]
+fn dropout_p0_is_identity() {
+    require_gpu();
+    let (m, n) = (8, 5);
+    let x_h = fill(m * n, 91);
+    let x = cuda(&x_h, &[m, n]);
+    let (out, _mask) = x.dropout(0.0, 12345); // mask is not inspected in this test
+    let out_h = out.to_device(Device::Cpu);
+    for (a, b) in x_h.iter().zip(out_h.as_slice::<f32>()) {
+        assert_eq!(*a, *b, "p=0 dropout must be identity"); // exact f32 equality
+    }
+}
+
 // --- test helpers ---

 // Scalar loss node L = sum(W ∘ out): wraps a fixed-weight Var and reduces. We