dropout: autodiff op + fixed-seed grad-check (T18)
ops::dropout(x,p,seed): fwd runs Tensor::dropout, caches the mask in the backward closure, bwd pushes dx=d⊙mask. p==0 returns x.clone() (no node) so the default graph is unchanged. Tests in autograd.rs: fixed-seed finite-diff grad-check (mask held constant across the ± perturbation — dropout is a fixed elementwise linear map of x); E[out]≈input + keep-rate≈1-p over a seed sweep; p=0 kernel identity. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -140,6 +140,31 @@ pub fn swiglu(gate: &Var, up: &Var) -> Var {
|
|||||||
mul(&silu(gate), up)
|
mul(&silu(gate), up)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Dropout (Phase T18). With probability `p` zero each element, scale the kept
|
||||||
|
/// ones by `1/(1-p)` (inverted dropout — `E[out] == x`). The keep/drop mask is
|
||||||
|
/// drawn by a counter-based RNG from `(seed, element index)`, so it is fully
|
||||||
|
/// determined by `seed` (same `seed` ⇒ same mask: stable across the T13 recompute
|
||||||
|
/// re-run, and held fixed across the ± perturbation of a finite-diff grad-check).
|
||||||
|
/// Forward caches the per-element scale `mask`; **backward applies the same mask**
|
||||||
|
/// (`dx = d ⊙ mask`), making dropout a fixed elementwise linear map of `x`.
|
||||||
|
///
|
||||||
|
/// `p == 0` is a no-op: returns `x.clone()` (no node added) so the default graph
|
||||||
|
/// is bit-identical to the no-dropout path. eval-time identity is handled by the
|
||||||
|
/// caller simply not invoking dropout (the model's train/eval switch).
|
||||||
|
pub fn dropout(x: &Var, p: f32, seed: u64) -> Var {
|
||||||
|
if p == 0.0 {
|
||||||
|
return x.clone();
|
||||||
|
}
|
||||||
|
let (out, mask) = x.value().dropout(p, seed);
|
||||||
|
Var::from_op(
|
||||||
|
out,
|
||||||
|
vec![x.clone()],
|
||||||
|
Box::new(move |d, parents| {
|
||||||
|
Var::push_grad(&parents[0], Tensor::dropout_backward(d, &mask));
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
/// RoPE (rotate_half) over `x:[tokens,heads,head_dim]` with per-sequence position
|
/// RoPE (rotate_half) over `x:[tokens,heads,head_dim]` with per-sequence position
|
||||||
/// `row % period` (`period` = sequence length; `period == tokens` for a single
|
/// `row % period` (`period` = sequence length; `period == tokens` for a single
|
||||||
/// sequence). Orthogonal map, so the backward is the inverse rotation of `dy` — no
|
/// sequence). Orthogonal map, so the backward is the inverse rotation of `dy` — no
|
||||||
|
|||||||
@@ -625,6 +625,96 @@ fn attention_batched_bwd() {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---- dropout (Phase T18) ----
//
// Finite-difference grad-check under a fixed seed. The mask is a function of
// (seed, index) only and never of x, so with `seed` pinned dropout is the fixed
// elementwise linear map `out_i = c_i·x_i`. That makes the central difference of
// L well-defined: the +eps and -eps evaluations of every x_i go through the SAME
// mask. The forward closure handed to `grad_check` calls
// `ops::dropout(x, p, SEED)` with the same SEED as the analytic pass, so it
// regenerates that mask on every evaluation.
#[test]
fn dropout_bwd() {
    require_gpu();
    const SEED: u64 = 0xD120_FE5E;
    let p = 0.3f32;
    let (rows, cols) = (16, 12);
    let host_x = fill(rows * cols, 71);
    let weights = fill(rows * cols, 72);

    // Analytic gradient: backprop L = sum(W ∘ dropout(x)) down to the leaf.
    let leaf = Var::leaf(cuda(&host_x, &[rows, cols]));
    let dropped = ops::dropout(&leaf, p, SEED);
    scalar_loss(&dropped, &weights).backward();
    let analytic = leaf.grad().unwrap().to_device(Device::Cpu);

    // Numeric reference: the same loss, rebuilt from raw values per evaluation.
    let loss_weights = weights.clone();
    let forward = move |values: &[f32], shape: &[usize]| {
        let y = ops::dropout(&Var::leaf(cuda(values, shape)), p, SEED);
        weighted_sum(&y.value(), &loss_weights)
    };

    let outcome = grad_check(
        &host_x,
        &[rows, cols],
        &forward,
        analytic.as_slice::<f32>(),
        cfg_linear(),
    );
    report("dropout dX", &outcome);
}
|
||||||
|
|
||||||
|
// Statistical sanity check for inverted dropout. On a large all-ones tensor,
// averaged over a sweep of seeds:
//   * mean(dropout(x)) should stay close to mean(x) = 1 (the 1/(1-p) rescale
//     keeps E[out] ≈ x), and
//   * the fraction of non-zero mask entries should stay close to 1-p (the RNG
//     behaves like a Bernoulli draw).
#[test]
fn dropout_expectation_and_keep_rate() {
    require_gpu();
    let p = 0.25f32;
    let n = 200_000usize;
    let trials = 8;

    // mean(x) = 1 → mean(out) should ≈ 1
    let host_ones = vec![1.0f32; n];
    let ones = cuda(&host_ones, &[n]);

    let mut mean_sum = 0.0f64;
    let mut keep_sum = 0.0f64;
    for trial in 0..trials {
        // A distinct seed per trial so the sweep averages independent masks.
        let seed = 0x5EED_0000 + trial as u64;
        let (dropped, scale) = ones.dropout(p, seed);
        let dropped_host = dropped.to_device(Device::Cpu);
        let scale_host = scale.to_device(Device::Cpu);

        let trial_total: f64 = dropped_host
            .as_slice::<f32>()
            .iter()
            .map(|&v| f64::from(v))
            .sum();
        // A kept element carries a non-zero scale in the mask.
        let survivors = scale_host
            .as_slice::<f32>()
            .iter()
            .filter(|s| **s != 0.0)
            .count();

        mean_sum += trial_total / n as f64;
        keep_sum += survivors as f64 / n as f64;
    }

    // These two names are referenced by the inline format arguments below.
    let mean_out = mean_sum / trials as f64;
    let keep_rate = keep_sum / trials as f64;
    println!(
        "dropout p={p}: E[out]={mean_out:.5} (input mean 1.0), keep_rate={keep_rate:.5} (1-p={:.3})",
        1.0 - p
    );

    assert!(
        (mean_out - 1.0).abs() < 0.01,
        "E[out] {mean_out} not ≈ input mean 1.0 (inverted scaling broken)"
    );
    assert!(
        (keep_rate - f64::from(1.0 - p)).abs() < 0.01,
        "keep_rate {keep_rate} not ≈ 1-p {}",
        1.0 - p
    );
}
|
||||||
|
|
||||||
|
// p=0 must be an exact identity, checked at both levels:
//   * kernel: `Tensor::dropout(0.0, _)` hands back the input values unchanged;
//   * op: `ops::dropout(x, 0.0, _)` returns `x.clone()` (no node), so its value
//     equals x and the loss gradient lands on x directly.
// This is the default-graph regression guard at the op level; the model-level
// bit-identity is in xtrain-model/tests/dropout.rs.
#[test]
fn dropout_p0_is_identity() {
    require_gpu();
    let (m, n) = (8, 5);
    let x_h = fill(m * n, 91);

    // --- kernel level ---
    let x = cuda(&x_h, &[m, n]);
    let (out, _mask) = x.dropout(0.0, 12345);
    let out_h = out.to_device(Device::Cpu);
    let out_s = out_h.as_slice::<f32>();
    // `zip` stops at the shorter side, so pin the length first; otherwise a
    // truncated or empty output would pass the loop below vacuously.
    assert_eq!(
        out_s.len(),
        x_h.len(),
        "p=0 dropout must preserve the element count"
    );
    for (a, b) in x_h.iter().zip(out_s) {
        assert_eq!(*a, *b, "p=0 dropout must be identity");
    }

    // --- op level ---
    let w = fill(m * n, 92);
    let xv = Var::leaf(cuda(&x_h, &[m, n]));
    let y = ops::dropout(&xv, 0.0, 12345);
    let y_h = y.value().to_device(Device::Cpu);
    let y_s = y_h.as_slice::<f32>();
    assert_eq!(
        y_s.len(),
        x_h.len(),
        "p=0 dropout op must preserve the element count"
    );
    for (a, b) in x_h.iter().zip(y_s) {
        assert_eq!(*a, *b, "p=0 dropout op must be identity");
    }

    // With L = sum(W ∘ out) and out == x, dL/dx is W itself. A small tolerance
    // is used because the exact arithmetic inside the loss helper is not
    // visible from this test.
    scalar_loss(&y, &w).backward();
    let dx = xv
        .grad()
        .expect("p=0 dropout must let the gradient reach x")
        .to_device(Device::Cpu);
    let dx_s = dx.as_slice::<f32>();
    assert_eq!(
        dx_s.len(),
        w.len(),
        "gradient must have one entry per input element"
    );
    for (g, wi) in dx_s.iter().zip(w.iter()) {
        assert!(
            (*g - *wi).abs() <= 1e-6 * wi.abs().max(1.0),
            "p=0 dropout must pass the gradient straight through: got {g}, expected {wi}"
        );
    }
}
|
||||||
|
|
||||||
// --- test helpers ---
|
// --- test helpers ---
|
||||||
|
|
||||||
// Scalar loss node L = sum(W ∘ out): wraps a fixed-weight Var and reduces. We
|
// Scalar loss node L = sum(W ∘ out): wraps a fixed-weight Var and reduces. We
|
||||||
|
|||||||
Reference in New Issue
Block a user