dropout: autodiff op + fixed-seed grad-check (T18)

ops::dropout(x,p,seed): fwd runs Tensor::dropout, caches the mask in the backward
closure, bwd pushes dx=d⊙mask. p==0 returns x.clone() (no node) so the default
graph is unchanged. Tests in autograd.rs: fixed-seed finite-diff grad-check (mask
held constant across the ± perturbation — dropout is a fixed elementwise linear
map of x); E[out]≈input + keep-rate≈1-p over a seed sweep; p=0 kernel identity.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-18 00:05:32 +08:00
parent 1fdd0c5002
commit 5eb27783f8
2 changed files with 115 additions and 0 deletions

View File

@@ -140,6 +140,31 @@ pub fn swiglu(gate: &Var, up: &Var) -> Var {
mul(&silu(gate), up) mul(&silu(gate), up)
} }
/// Inverted dropout as an autodiff op (Phase T18).
///
/// Each element of `x` is zeroed with probability `p`; the survivors are scaled
/// by `1/(1-p)` so that `E[out] == x`. The keep/drop pattern comes from a
/// counter-based RNG keyed on `(seed, element index)`, which means the mask is a
/// pure function of `seed`: re-running with the same `seed` (e.g. the T13
/// recompute pass, or both sides of a finite-difference ± perturbation) yields
/// the same mask.
///
/// The forward pass returns the output together with the per-element scale
/// `mask`; the mask is moved into the backward closure, which propagates
/// `dx = d ⊙ mask`. For a fixed seed the op is therefore a fixed elementwise
/// linear map of `x`.
///
/// `p == 0` short-circuits to `x.clone()` without adding a graph node, so the
/// default (no-dropout) graph stays bit-identical. Eval-time identity is the
/// caller's responsibility: it simply does not call this op (the model's
/// train/eval switch).
pub fn dropout(x: &Var, p: f32, seed: u64) -> Var {
    // No-op path: hand back the input itself, no new node.
    if p == 0.0 {
        return x.clone();
    }

    let (out, mask) = x.value().dropout(p, seed);
    let parents = vec![x.clone()];
    Var::from_op(
        out,
        parents,
        // `mask` is owned by the closure so backward reuses exactly the mask
        // that forward produced.
        Box::new(move |upstream, inputs| {
            let dx = Tensor::dropout_backward(upstream, &mask);
            Var::push_grad(&inputs[0], dx);
        }),
    )
}
/// RoPE (rotate_half) over `x:[tokens,heads,head_dim]` with per-sequence position
/// `row % period` (`period` = sequence length; `period == tokens` for a single
/// sequence). Orthogonal map, so the backward is the inverse rotation of `dy` — no

View File

@@ -625,6 +625,96 @@ fn attention_batched_bwd() {
); );
} }
// ---- dropout (Phase T18) ----
//
// Finite-difference gradient check with a fixed seed. The mask is a function of
// (seed, index) only — never of x — so for a fixed `SEED` dropout reduces to the
// fixed elementwise linear map `out_i = c_i·x_i`. Both the +ε and the −ε
// evaluation of every x_i therefore go through the SAME mask, and the central
// difference of the loss is well-defined. The loss closure below re-invokes
// `ops::dropout(x, p, SEED)` with that same SEED, which reproduces the mask on
// every evaluation.
#[test]
fn dropout_bwd() {
    require_gpu();
    const SEED: u64 = 0xD120_FE5E;
    let p = 0.3f32;
    let (rows, cols) = (16, 12);
    let shape = [rows, cols];
    let input = fill(rows * cols, 71);
    let weights = fill(rows * cols, 72);

    // Analytic gradient: one forward/backward pass through the op.
    let x = Var::leaf(cuda(&input, &shape));
    let y = ops::dropout(&x, p, SEED);
    scalar_loss(&y, &weights).backward();
    let analytic = x.grad().unwrap().to_device(Device::Cpu);

    // Scalar loss as a function of the raw input, for the numeric gradient.
    let loss_weights = weights.clone();
    let loss_of_x = move |vals: &[f32], dims: &[usize]| {
        let dropped = ops::dropout(&Var::leaf(cuda(vals, dims)), p, SEED);
        weighted_sum(&dropped.value(), &loss_weights)
    };

    let result = grad_check(
        &input,
        &shape,
        &loss_of_x,
        analytic.as_slice::<f32>(),
        cfg_linear(),
    );
    report("dropout dX", &result);
}
// Statistical sanity check for inverted dropout. On a large all-ones tensor,
// averaged over several seeds:
//   * mean(dropout(x)) should stay close to mean(x) = 1 (the 1/(1-p) rescale
//     keeps the expectation unchanged), and
//   * the fraction of non-zero mask entries should stay close to 1-p (the RNG
//     behaves approximately like a Bernoulli(1-p) draw per element).
#[test]
fn dropout_expectation_and_keep_rate() {
    require_gpu();
    let p = 0.25f32;
    let n = 200_000usize;
    let trials = 8;

    // Constant input of ones: mean(x) = 1, so mean(out) should be ≈ 1.
    let ones = vec![1.0f32; n];
    let x = cuda(&ones, &[n]);

    let mut sum_of_means = 0.0f64;
    let mut sum_of_keep_fracs = 0.0f64;
    for t in 0..trials {
        let seed = 0x5EED_0000 + t as u64;
        let (out, mask) = x.dropout(p, seed);
        let out_cpu = out.to_device(Device::Cpu);
        let mask_cpu = mask.to_device(Device::Cpu);

        // Accumulate in f64 so the sum over 200k f32 values does not lose precision.
        let total: f64 = out_cpu.as_slice::<f32>().iter().map(|&v| f64::from(v)).sum();
        // A non-zero mask entry marks a kept element.
        let survivors = mask_cpu
            .as_slice::<f32>()
            .iter()
            .filter(|scale| **scale != 0.0)
            .count();

        sum_of_means += total / n as f64;
        sum_of_keep_fracs += survivors as f64 / n as f64;
    }

    let mean_out = sum_of_means / trials as f64;
    let keep_rate = sum_of_keep_fracs / trials as f64;
    println!(
        "dropout p={p}: E[out]={mean_out:.5} (input mean 1.0), keep_rate={keep_rate:.5} (1-p={:.3})",
        1.0 - p
    );

    let mean_err = (mean_out - 1.0).abs();
    assert!(
        mean_err < 0.01,
        "E[out] {mean_out} not ≈ input mean 1.0 (inverted scaling broken)"
    );

    let expected_keep = (1.0 - p) as f64;
    assert!(
        (keep_rate - expected_keep).abs() < 0.01,
        "keep_rate {keep_rate} not ≈ 1-p {}",
        1.0 - p
    );
}
// p=0 kernel identity. This test calls the tensor-level `dropout` kernel directly
// (not the autodiff op) with p=0 and requires the output to equal the input
// element for element. The autodiff op `ops::dropout` short-circuits `p == 0` by
// returning `x.clone()` and so never reaches this kernel; the model-level
// bit-identity guard lives in xtrain-model/tests/dropout.rs.
#[test]
fn dropout_p0_is_identity() {
    require_gpu();
    let (m, n) = (8, 5);
    let x_h = fill(m * n, 91);
    let x = cuda(&x_h, &[m, n]);
    let (out, _mask) = x.dropout(0.0, 12345);
    let out_h = out.to_device(Device::Cpu);
    let out_s = out_h.as_slice::<f32>();
    // `zip` stops at the shorter side, so without this check a truncated (or
    // empty) output would make the element-wise loop below pass vacuously.
    assert_eq!(
        out_s.len(),
        x_h.len(),
        "p=0 dropout must preserve the element count"
    );
    for (a, b) in x_h.iter().zip(out_s) {
        assert_eq!(*a, *b, "p=0 dropout must be identity");
    }
}
// --- test helpers ---
// Scalar loss node L = sum(W ∘ out): wraps a fixed-weight Var and reduces. We