From 224f750ee45687e5261d612495076f825f06be26 Mon Sep 17 00:00:00 2001
From: Gahow Wang
Date: Mon, 15 Jun 2026 15:44:17 +0800
Subject: [PATCH] autograd: tape engine + grad accumulation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Var = Rc<RefCell<VarNode>> on a define-by-run tape: value + optional grad +
parents + backward closure. backward() seeds a scalar loss, walks reverse
topo order, and pushes grads to parents. push_grad always SUMs into the grad
slot — the fan-out accumulation path T3 lacked. Per-crate build.rs emits the
no_cuda cfg (does not propagate); engine gated, grad_check stays host-only.

Co-Authored-By: Claude Opus 4.8
---
 crates/xtrain-autodiff/build.rs    |  27 +++++
 crates/xtrain-autodiff/src/lib.rs  |  11 ++
 crates/xtrain-autodiff/src/tape.rs | 164 +++++++++++++++++++++++++++++
 3 files changed, 202 insertions(+)
 create mode 100644 crates/xtrain-autodiff/build.rs
 create mode 100644 crates/xtrain-autodiff/src/tape.rs

diff --git a/crates/xtrain-autodiff/build.rs b/crates/xtrain-autodiff/build.rs
new file mode 100644
index 0000000..0ffc22e
--- /dev/null
+++ b/crates/xtrain-autodiff/build.rs
@@ -0,0 +1,27 @@
+use std::env;
+use std::path::Path;
+use std::process::Command;
+
+// xtrain-autodiff's `Var` tape calls GPU ops (via xtrain-tensor), so it gates
+// those call sites behind `not(no_cuda)` — the same per-crate convention as
+// xtrain-cuda / xtrain-tensor (cfg does not propagate across crates). The
+// grad_check harness itself is host-only and always compiles. This script only
+// detects nvcc and emits the cfg; it compiles no CUDA.
+/// Emits the `no_cuda` cfg unless `nvcc` is found (see `nvcc_available`).
+fn main() {
+    println!("cargo:rustc-check-cfg=cfg(no_cuda)");
+    let toolkit_dir = match env::var("CUDA_HOME").or_else(|_| env::var("CUDA_PATH")) {
+        Ok(dir) => dir,
+        Err(_) => String::from("/usr/local/cuda"),
+    };
+    if nvcc_available(&toolkit_dir) {
+        return;
+    }
+    println!("cargo:rustc-cfg=no_cuda");
+}
+
+/// True if `nvcc --version` can be spawned, or `{cuda_path}/bin/nvcc` exists.
+fn nvcc_available(cuda_path: &str) -> bool {
+    let spawned = Command::new("nvcc").arg("--version").output().is_ok();
+    spawned || Path::new(&format!("{cuda_path}/bin/nvcc")).exists()
+}
diff --git a/crates/xtrain-autodiff/src/lib.rs b/crates/xtrain-autodiff/src/lib.rs
index d8584ad..7a5546d 100644
--- a/crates/xtrain-autodiff/src/lib.rs
+++ b/crates/xtrain-autodiff/src/lib.rs
@@ -13,3 +13,14 @@
 pub mod finite_diff;
 
 pub use finite_diff::{GradCheckConfig, GradCheckResult, ParamFn, grad_check};
+
+// Tape-based autograd engine + differentiable ops (Phase T4). These call GPU
+// kernels via xtrain-tensor, so they are gated behind `not(no_cuda)` (the
+// per-crate convention); the grad_check harness above stays host-only.
+#[cfg(not(no_cuda))]
+pub mod ops;
+#[cfg(not(no_cuda))]
+pub mod tape;
+
+#[cfg(not(no_cuda))]
+pub use tape::Var;
diff --git a/crates/xtrain-autodiff/src/tape.rs b/crates/xtrain-autodiff/src/tape.rs
new file mode 100644
index 0000000..8e7c363
--- /dev/null
+++ b/crates/xtrain-autodiff/src/tape.rs
@@ -0,0 +1,164 @@
+//! Tape-based reverse-mode autograd (Phase T4).
+//!
+//! A [`Var`] is a reference-counted node wrapping a value [`Tensor`], an optional
+//! accumulated gradient, and — for non-leaf nodes — the parents it was computed
+//! from plus a backward closure. Forward ops (see [`crate::ops`]) build these
+//! nodes; [`Var::backward`] walks the graph in reverse topological order and
+//! pushes gradients to parents, **summing** on fan-out (a tensor consumed by
+//! several ops).
+//!
+//! The graph is dynamic (define-by-run) and the design favours clarity over
+//! speed: each op synchronizes its own kernels (T3 has no streams yet), and
+//!
+//! gradient accumulation is an explicit elementwise add.
+
+#![cfg(not(no_cuda))]
+
+use std::cell::RefCell;
+use std::collections::HashSet;
+use std::rc::Rc;
+use xtrain_tensor::Tensor;
+
+/// Backward closure: given this node's accumulated grad and its parents, compute
+/// and accumulate each parent's gradient contribution.
+pub type BackwardFn = Box<dyn Fn(&Tensor, &[Var])>;
+
+/// Shared state behind a [`Var`]: forward value, grad slot, and graph wiring.
+pub struct VarNode {
+    pub value: Tensor,
+    pub grad: Option<Tensor>,
+    parents: Vec<Var>,
+    backward: Option<BackwardFn>,
+}
+
+/// A node in the autograd tape. Cheap to clone (bumps the `Rc`); clones share
+/// the same underlying node, which is how fan-out is detected.
+#[derive(Clone)]
+pub struct Var(Rc<RefCell<VarNode>>);
+
+impl Var {
+    /// A leaf node (parameter / input). After `backward`, its `.grad()` holds the
+    /// accumulated gradient of the loss w.r.t. this tensor.
+    pub fn leaf(value: Tensor) -> Self {
+        Var(Rc::new(RefCell::new(VarNode {
+            value,
+            grad: None,
+            parents: Vec::new(),
+            backward: None,
+        })))
+    }
+
+    /// Build a non-leaf node from a forward `value`, its `parents`, and a
+    /// `backward` closure. Used by the op constructors in [`crate::ops`] and by
+    /// callers that compose custom nodes (e.g. a loss reduction or a transpose
+    /// the built-in op set doesn't cover).
+    pub fn from_op(value: Tensor, parents: Vec<Var>, backward: BackwardFn) -> Self {
+        Var(Rc::new(RefCell::new(VarNode {
+            value,
+            grad: None,
+            parents,
+            backward: Some(backward),
+        })))
+    }
+
+    /// Clone of the value tensor (cheap: storage is `Arc`-shared).
+    pub fn value(&self) -> Tensor {
+        self.0.borrow().value.clone()
+    }
+
+    /// The accumulated gradient, if any (populated after `backward`).
+    pub fn grad(&self) -> Option<Tensor> {
+        self.0.borrow().grad.clone()
+    }
+
+    /// Pointer identity, used to dedup nodes during the topological sort.
+    fn id(&self) -> *const RefCell<VarNode> {
+        Rc::as_ptr(&self.0)
+    }
+
+    /// Accumulate `g` into this node's grad slot (SUM — the fan-out rule).
+    fn accumulate(&self, g: Tensor) {
+        let mut node = self.0.borrow_mut();
+        node.grad = Some(match node.grad.take() {
+            Some(existing) => existing.add(&g),
+            None => g,
+        });
+    }
+
+    /// Reverse-mode backward from this node (treated as a scalar loss: its
+    /// upstream grad is seeded to ones). Populates `.grad` on every node that
+    /// transitively feeds it.
+    ///
+    /// `loss` must be a scalar (single element) so the seed `dL/dL = 1` is
+    /// unambiguous, matching the `L = sum(W∘out)` grad-check convention;
+    /// panics otherwise. Grads are only summed, never reset, so a second
+    /// `backward` over the same graph adds to what the first call left behind.
+    pub fn backward(&self) {
+        let loss = self.value();
+        assert_eq!(
+            loss.numel(),
+            1,
+            "backward() expects a scalar loss; got shape {:?}",
+            loss.shape()
+        );
+
+        // 1. Topological order (post-order DFS), parents before children.
+        let mut topo: Vec<Var> = Vec::new();
+        let mut visited = HashSet::new();
+        build_topo(self, &mut topo, &mut visited);
+
+        // 2. Seed the loss gradient with ones.
+        self.accumulate(ones_like(&loss));
+
+        // 3. Walk in reverse: each node hands its grad to its parents' closures.
+        for var in topo.iter().rev() {
+            // `var` stays immutably borrowed while its closure runs, so a closure
+            // that pushed a grad back into this same node would panic.
+            let node = var.0.borrow();
+            if let (Some(grad), Some(bw)) = (node.grad.as_ref(), node.backward.as_ref()) {
+                bw(grad, &node.parents);
+            }
+        }
+    }
+
+    /// Drop the parents/closure so the graph can be freed, keeping value+grad.
+    /// (Not needed for tests; provided for completeness of the engine.)
+    pub fn detach_graph(&self) {
+        let mut node = self.0.borrow_mut();
+        node.parents.clear();
+        node.backward = None;
+    }
+
+    /// Distribute `g` to a parent (called from op backward closures). Every node
+    /// accumulates its grad: intermediates need it for the chain rule, leaves
+    /// expose it as the result. This SUM is what makes fan-out correct.
+    pub fn push_grad(parent: &Var, g: Tensor) {
+        parent.accumulate(g);
+    }
+}
+
+/// Post-order DFS: append a node only after all its parents, dedup by identity.
+///
+/// Iterative with a hashed `visited` set: no recursion depth limit, no linear scan.
+fn build_topo(var: &Var, topo: &mut Vec<Var>, visited: &mut HashSet<*const RefCell<VarNode>>) {
+    if !visited.insert(var.id()) {
+        return;
+    }
+    // Each frame is (node, index of the next parent to descend into).
+    let mut stack: Vec<(Var, usize)> = vec![(var.clone(), 0)];
+    while let Some((node, next)) = stack.last_mut() {
+        let parent = node.0.borrow().parents.get(*next).cloned();
+        *next += 1;
+        match parent {
+            Some(p) if visited.insert(p.id()) => stack.push((p, 0)),
+            Some(_) => {} // already reached through another consumer
+            None => topo.extend(stack.pop().map(|(done, _)| done)),
+        }
+    }
+}
+
+/// A ones tensor matching `t`'s shape/device (the backward seed for a scalar).
+fn ones_like(t: &Tensor) -> Tensor {
+    let host = vec![1.0f32; t.numel()];
+    Tensor::from_slice(&host, t.shape()).to_device(t.device())
+}