dropout: autodiff op + fixed-seed grad-check (T18)
ops::dropout(x,p,seed): fwd runs Tensor::dropout, caches the mask in the backward closure, bwd pushes dx=d⊙mask. p==0 returns x.clone() (no node) so the default graph is unchanged. Tests in autograd.rs: fixed-seed finite-diff grad-check (mask held constant across the ± perturbation — dropout is a fixed elementwise linear map of x); E[out]≈input + keep-rate≈1-p over a seed sweep; p=0 kernel identity. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -140,6 +140,31 @@ pub fn swiglu(gate: &Var, up: &Var) -> Var {
|
||||
mul(&silu(gate), up)
|
||||
}
|
||||
|
||||
/// Dropout (Phase T18). With probability `p` zero each element, scale the kept
|
||||
/// ones by `1/(1-p)` (inverted dropout — `E[out] == x`). The keep/drop mask is
|
||||
/// drawn by a counter-based RNG from `(seed, element index)`, so it is fully
|
||||
/// determined by `seed` (same `seed` ⇒ same mask: stable across the T13 recompute
|
||||
/// re-run, and held fixed across the ± perturbation of a finite-diff grad-check).
|
||||
/// Forward caches the per-element scale `mask`; **backward applies the same mask**
|
||||
/// (`dx = d ⊙ mask`), making dropout a fixed elementwise linear map of `x`.
|
||||
///
|
||||
/// `p == 0` is a no-op: returns `x.clone()` (no node added) so the default graph
|
||||
/// is bit-identical to the no-dropout path. eval-time identity is handled by the
|
||||
/// caller simply not invoking dropout (the model's train/eval switch).
|
||||
pub fn dropout(x: &Var, p: f32, seed: u64) -> Var {
|
||||
if p == 0.0 {
|
||||
return x.clone();
|
||||
}
|
||||
let (out, mask) = x.value().dropout(p, seed);
|
||||
Var::from_op(
|
||||
out,
|
||||
vec![x.clone()],
|
||||
Box::new(move |d, parents| {
|
||||
Var::push_grad(&parents[0], Tensor::dropout_backward(d, &mask));
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
/// RoPE (rotate_half) over `x:[tokens,heads,head_dim]` with per-sequence position
|
||||
/// `row % period` (`period` = sequence length; `period == tokens` for a single
|
||||
/// sequence). Orthogonal map, so the backward is the inverse rotation of `dy` — no
|
||||
|
||||
@@ -625,6 +625,96 @@ fn attention_batched_bwd() {
|
||||
);
|
||||
}
|
||||
|
||||
// ---- dropout (Phase T18) ----
//
// Finite-difference grad-check under a fixed seed. The mask is a function of
// (seed, index) only — never of x — so with `SEED` pinned, dropout is the fixed
// elementwise linear map `out_i = c_i·x_i`. The loss L is therefore smooth in x
// and the central difference is a valid reference: the + and the − perturbation
// of every x_i go through the SAME mask. The reference closure below re-invokes
// `ops::dropout(x, p, SEED)` with that same SEED, so each evaluation reproduces
// the mask.
#[test]
fn dropout_bwd() {
    require_gpu();
    const SEED: u64 = 0xD120_FE5E;
    let p = 0.3f32;
    let (rows, cols) = (16, 12);
    let input = fill(rows * cols, 71);
    let weights = fill(rows * cols, 72);

    // Analytic gradient: run the op through autodiff and read dL/dx off the leaf.
    let leaf = Var::leaf(cuda(&input, &[rows, cols]));
    let dropped = ops::dropout(&leaf, p, SEED);
    scalar_loss(&dropped, &weights).backward();
    let analytic = leaf.grad().unwrap().to_device(Device::Cpu);

    // Reference loss for the finite-difference side: same p, same SEED, so the
    // mask is identical on every call.
    let loss_weights = weights.clone();
    let loss_of = move |vals: &[f32], dims: &[usize]| {
        let y = ops::dropout(&Var::leaf(cuda(vals, dims)), p, SEED);
        weighted_sum(&y.value(), &loss_weights)
    };

    let outcome = grad_check(
        &input,
        &[rows, cols],
        &loss_of,
        analytic.as_slice::<f32>(),
        cfg_linear(),
    );
    report("dropout dX", &outcome);
}
|
||||
|
||||
// Statistical check of inverted dropout. On a large tensor, averaged over a sweep
// of seeds, mean(dropout(x)) must track mean(x) (E[out] ≈ x thanks to the 1/(1-p)
// scaling) and the fraction of kept elements must track 1-p (the RNG behaves
// approximately like a Bernoulli source).
#[test]
fn dropout_expectation_and_keep_rate() {
    require_gpu();
    let p = 0.25f32;
    let n = 200_000usize;
    // All-ones input: mean(x) = 1, so mean(out) should come out ≈ 1.
    let ones = vec![1.0f32; n];
    let x = cuda(&ones, &[n]);

    let trials = 8;
    let elems = n as f64;
    let (mut sum_of_means, mut sum_of_keep_fracs) = (0.0f64, 0.0f64);
    for t in 0..trials {
        // A distinct seed per trial gives an independent mask each time.
        let seed = 0x5EED_0000 + t as u64;
        let (out, mask) = x.dropout(p, seed);
        let out_cpu = out.to_device(Device::Cpu);
        let mask_cpu = mask.to_device(Device::Cpu);

        let total: f64 = out_cpu.as_slice::<f32>().iter().map(|&v| v as f64).sum();
        // A non-zero mask entry marks a kept element.
        let survivors = mask_cpu
            .as_slice::<f32>()
            .iter()
            .filter(|&&m| m != 0.0)
            .count();

        sum_of_means += total / elems;
        sum_of_keep_fracs += survivors as f64 / elems;
    }

    let mean_out = sum_of_means / trials as f64;
    let keep_rate = sum_of_keep_fracs / trials as f64;
    let keep_target = 1.0 - p;
    println!(
        "dropout p={p}: E[out]={mean_out:.5} (input mean 1.0), keep_rate={keep_rate:.5} (1-p={:.3})",
        keep_target
    );

    let mean_err = (mean_out - 1.0).abs();
    assert!(
        mean_err < 0.01,
        "E[out] {mean_out} not ≈ input mean 1.0 (inverted scaling broken)"
    );
    let keep_err = (keep_rate - keep_target as f64).abs();
    assert!(
        keep_err < 0.01,
        "keep_rate {keep_rate} not ≈ 1-p {}",
        keep_target
    );
}
|
||||
|
||||
// p=0 identity at the kernel level: `Tensor::dropout(0.0, seed)` must hand back
// an output that is bit-identical to its input, whatever the seed. This guards
// the kernel itself; the op-level wrapper `ops::dropout` never reaches the kernel
// for p == 0 (it returns `x.clone()` without adding a node), and the model-level
// bit-identity is covered in xtrain-model/tests/dropout.rs.
#[test]
fn dropout_p0_is_identity() {
    require_gpu();
    let (m, n) = (8, 5);
    let x_h = fill(m * n, 91);
    let x = cuda(&x_h, &[m, n]);
    let (out, _mask) = x.dropout(0.0, 12345);
    let out_h = out.to_device(Device::Cpu);
    let out_s = out_h.as_slice::<f32>();

    // `zip` stops at the shorter side, so without this check a truncated (or
    // empty) output would let the element loop pass vacuously.
    assert_eq!(
        out_s.len(),
        x_h.len(),
        "p=0 dropout must preserve the element count"
    );
    for (i, (a, b)) in x_h.iter().zip(out_s.iter()).enumerate() {
        // Compare raw bit patterns: float `==` would accept -0.0 vs 0.0 and
        // reject an unchanged NaN, neither of which matches "bit-identical".
        assert_eq!(
            a.to_bits(),
            b.to_bits(),
            "p=0 dropout must be identity (element {i}: {a} vs {b})"
        );
    }
}
|
||||
|
||||
// --- test helpers ---
|
||||
|
||||
// Scalar loss node L = sum(W ∘ out): wraps a fixed-weight Var and reduces. We
|
||||
|
||||
Reference in New Issue
Block a user