autograd: tape engine + grad accumulation
Var = Rc<RefCell<VarNode>> on a define-by-run tape: value + optional grad + parents + backward closure. backward() seeds a scalar loss, walks reverse topo order, and pushes grads to parents. push_grad always SUMs into the grad slot — the fan-out accumulation path T3 lacked. Per-crate build.rs emits the no_cuda cfg (does not propagate); engine gated, grad_check stays host-only. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
27
crates/xtrain-autodiff/build.rs
Normal file
27
crates/xtrain-autodiff/build.rs
Normal file
@@ -0,0 +1,27 @@
|
||||
use std::env;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
|
||||
// xtrain-autodiff's `Var` tape calls GPU ops (via xtrain-tensor), so it gates
|
||||
// those call sites behind `not(no_cuda)` — the same per-crate convention as
|
||||
// xtrain-cuda / xtrain-tensor (cfg does not propagate across crates). The
|
||||
// grad_check harness itself is host-only and always compiles. This script only
|
||||
// detects nvcc and emits the cfg; it compiles no CUDA.
|
||||
fn main() {
|
||||
println!("cargo:rustc-check-cfg=cfg(no_cuda)");
|
||||
|
||||
let cuda_path = env::var("CUDA_HOME")
|
||||
.or_else(|_| env::var("CUDA_PATH"))
|
||||
.unwrap_or_else(|_| "/usr/local/cuda".to_string());
|
||||
|
||||
if !nvcc_available(&cuda_path) {
|
||||
println!("cargo:rustc-cfg=no_cuda");
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports whether an `nvcc` compiler can be found.
///
/// Two probes are tried in order: first spawning `nvcc --version` through the
/// normal `PATH` lookup (any successful spawn counts, the exit status is not
/// inspected), then checking that `<cuda_path>/bin/nvcc` exists on disk.
fn nvcc_available(cuda_path: &str) -> bool {
    let spawned_from_path = Command::new("nvcc").arg("--version").output().is_ok();

    // `||` short-circuits, so the filesystem is only consulted when the spawn failed.
    spawned_from_path || Path::new(&format!("{cuda_path}/bin/nvcc")).exists()
}
|
||||
@@ -13,3 +13,14 @@
|
||||
pub mod finite_diff;
|
||||
|
||||
pub use finite_diff::{GradCheckConfig, GradCheckResult, ParamFn, grad_check};
|
||||
|
||||
// Tape-based autograd engine + differentiable ops (Phase T4). These call GPU
|
||||
// kernels via xtrain-tensor, so they are gated behind `not(no_cuda)` (the
|
||||
// per-crate convention); the grad_check harness above stays host-only.
|
||||
#[cfg(not(no_cuda))]
|
||||
pub mod ops;
|
||||
#[cfg(not(no_cuda))]
|
||||
pub mod tape;
|
||||
|
||||
#[cfg(not(no_cuda))]
|
||||
pub use tape::Var;
|
||||
|
||||
164
crates/xtrain-autodiff/src/tape.rs
Normal file
164
crates/xtrain-autodiff/src/tape.rs
Normal file
@@ -0,0 +1,164 @@
|
||||
//! Tape-based reverse-mode autograd (Phase T4).
|
||||
//!
|
||||
//! A [`Var`] is a reference-counted node wrapping a value [`Tensor`], an optional
|
||||
//! accumulated gradient, and — for non-leaf nodes — the parents it was computed
|
||||
//! from plus a backward closure. Forward ops (see [`crate::ops`]) build these
|
||||
//! nodes; [`Var::backward`] walks the graph in reverse topological order and
|
||||
//! pushes gradients to parents, **summing** on fan-out (a tensor consumed by
|
||||
//! several ops).
|
||||
//!
|
||||
//! The graph is dynamic (define-by-run) and the design favours clarity over
|
||||
//! speed: each op synchronizes its own kernels (T3 has no streams yet), and
|
||||
//! gradient accumulation is an explicit elementwise add.
|
||||
|
||||
#![cfg(not(no_cuda))]
|
||||
|
||||
use std::cell::RefCell;
|
||||
use std::rc::Rc;
|
||||
use xtrain_tensor::Tensor;
|
||||
|
||||
/// Backward closure: given this node's accumulated grad and its parents, compute
|
||||
/// and accumulate each parent's gradient contribution.
|
||||
pub type BackwardFn = Box<dyn Fn(&Tensor, &[Var])>;
|
||||
|
||||
pub struct VarNode {
|
||||
pub value: Tensor,
|
||||
pub grad: Option<Tensor>,
|
||||
parents: Vec<Var>,
|
||||
backward: Option<BackwardFn>,
|
||||
}
|
||||
|
||||
/// A node in the autograd tape. Cheap to clone (bumps the `Rc`); clones share
|
||||
/// the same underlying node, which is how fan-out is detected.
|
||||
#[derive(Clone)]
|
||||
pub struct Var(Rc<RefCell<VarNode>>);
|
||||
|
||||
impl Var {
|
||||
/// A leaf node (parameter / input). After `backward`, its `.grad()` holds the
|
||||
/// accumulated gradient of the loss w.r.t. this tensor.
|
||||
pub fn leaf(value: Tensor) -> Self {
|
||||
Var(Rc::new(RefCell::new(VarNode {
|
||||
value,
|
||||
grad: None,
|
||||
parents: Vec::new(),
|
||||
backward: None,
|
||||
})))
|
||||
}
|
||||
|
||||
/// Build a non-leaf node from a forward `value`, its `parents`, and a
|
||||
/// `backward` closure. Used by the op constructors in [`crate::ops`] and by
|
||||
/// callers that compose custom nodes (e.g. a loss reduction or a transpose
|
||||
/// the built-in op set doesn't cover).
|
||||
pub fn from_op(value: Tensor, parents: Vec<Var>, backward: BackwardFn) -> Self {
|
||||
Var(Rc::new(RefCell::new(VarNode {
|
||||
value,
|
||||
grad: None,
|
||||
parents,
|
||||
backward: Some(backward),
|
||||
})))
|
||||
}
|
||||
|
||||
/// Clone of the value tensor (cheap: storage is `Arc`-shared).
|
||||
pub fn value(&self) -> Tensor {
|
||||
self.0.borrow().value.clone()
|
||||
}
|
||||
|
||||
/// The accumulated gradient, if any (populated after `backward`).
|
||||
pub fn grad(&self) -> Option<Tensor> {
|
||||
self.0.borrow().grad.clone()
|
||||
}
|
||||
|
||||
/// Pointer identity, used to dedup nodes during the topological sort.
|
||||
fn id(&self) -> *const RefCell<VarNode> {
|
||||
Rc::as_ptr(&self.0)
|
||||
}
|
||||
|
||||
/// Accumulate `g` into this node's grad slot (SUM — the fan-out rule).
|
||||
fn accumulate(&self, g: Tensor) {
|
||||
let mut node = self.0.borrow_mut();
|
||||
match node.grad.take() {
|
||||
None => node.grad = Some(g),
|
||||
Some(existing) => node.grad = Some(existing.add(&g)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Reverse-mode backward from this node (treated as a scalar loss: its
|
||||
/// upstream grad is seeded to ones). Populates `.grad` on every node that
|
||||
/// transitively feeds it.
|
||||
///
|
||||
/// `loss` must be a scalar (single element) so the seed `dL/dL = 1` is
|
||||
/// unambiguous, matching the `L = sum(W∘out)` grad-check convention.
|
||||
pub fn backward(&self) {
|
||||
assert_eq!(
|
||||
self.value().numel(),
|
||||
1,
|
||||
"backward() expects a scalar loss; got shape {:?}",
|
||||
self.value().shape()
|
||||
);
|
||||
|
||||
// 1. Topological order (post-order DFS), parents before children.
|
||||
let mut topo: Vec<Var> = Vec::new();
|
||||
let mut visited: Vec<*const RefCell<VarNode>> = Vec::new();
|
||||
build_topo(self, &mut topo, &mut visited);
|
||||
|
||||
// 2. Seed the loss gradient with ones.
|
||||
let seed = ones_like(&self.value());
|
||||
self.accumulate(seed);
|
||||
|
||||
// 3. Walk in reverse: each node hands its grad to its parents' closures.
|
||||
for var in topo.iter().rev() {
|
||||
let (grad, parents, backward) = {
|
||||
let node = var.0.borrow();
|
||||
(
|
||||
node.grad.clone(),
|
||||
node.parents.clone(),
|
||||
node.backward.is_some(),
|
||||
)
|
||||
};
|
||||
let grad = match grad {
|
||||
Some(g) => g,
|
||||
None => continue, // node didn't contribute to the loss
|
||||
};
|
||||
if backward {
|
||||
// Borrow the closure out, run it, then drop the borrow.
|
||||
let node = var.0.borrow();
|
||||
let bw = node.backward.as_ref().unwrap();
|
||||
bw(&grad, &parents);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop the parents/closure so the graph can be freed, keeping value+grad.
|
||||
/// (Not needed for tests; provided for completeness of the engine.)
|
||||
pub fn detach_graph(&self) {
|
||||
let mut node = self.0.borrow_mut();
|
||||
node.parents.clear();
|
||||
node.backward = None;
|
||||
}
|
||||
|
||||
/// Distribute `g` to a parent (called from op backward closures). Every node
|
||||
/// accumulates its grad: intermediates need it for the chain rule, leaves
|
||||
/// expose it as the result. This SUM is what makes fan-out correct.
|
||||
pub fn push_grad(parent: &Var, g: Tensor) {
|
||||
parent.accumulate(g);
|
||||
}
|
||||
}
|
||||
|
||||
/// Post-order DFS: append a node only after all its parents, dedup by identity.
|
||||
fn build_topo(var: &Var, topo: &mut Vec<Var>, visited: &mut Vec<*const RefCell<VarNode>>) {
|
||||
if visited.contains(&var.id()) {
|
||||
return;
|
||||
}
|
||||
visited.push(var.id());
|
||||
let parents = var.0.borrow().parents.clone();
|
||||
for p in &parents {
|
||||
build_topo(p, topo, visited);
|
||||
}
|
||||
topo.push(var.clone());
|
||||
}
|
||||
|
||||
/// A ones tensor matching `t`'s shape/device (the backward seed for a scalar).
|
||||
fn ones_like(t: &Tensor) -> Tensor {
|
||||
let host = vec![1.0f32; t.numel()];
|
||||
Tensor::from_slice(&host, t.shape()).to_device(t.device())
|
||||
}
|
||||
Reference in New Issue
Block a user