autograd: tape engine + grad accumulation

Var = Rc<RefCell<VarNode>> on a define-by-run tape: value + optional grad +
parents + backward closure. backward() seeds a scalar loss, walks reverse
topo order, and pushes grads to parents. push_grad always SUMs into the grad
slot — the fan-out accumulation path T3 lacked. Per-crate build.rs emits the
no_cuda cfg (does not propagate); engine gated, grad_check stays host-only.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-15 15:44:17 +08:00
parent 5aef3742d6
commit 224f750ee4
3 changed files with 202 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
use std::env;
use std::path::Path;
use std::process::Command;
// The `Var` tape in xtrain-autodiff reaches GPU ops through xtrain-tensor, so
// those call sites are compiled only under `not(no_cuda)`. cfg flags do not
// propagate across crates, hence each crate (xtrain-cuda, xtrain-tensor, and
// this one) emits the flag from its own build script. The grad_check harness
// is host-only and is never gated. Nothing is compiled here: the script just
// probes for nvcc and prints cargo directives.
fn main() {
    // Declare the custom cfg so `cfg(no_cuda)` is a known name to rustc.
    println!("cargo:rustc-check-cfg=cfg(no_cuda)");

    // First set variable wins: CUDA_HOME, then CUDA_PATH, then the stock
    // install location.
    let cuda_root = ["CUDA_HOME", "CUDA_PATH"]
        .iter()
        .find_map(|key| env::var(key).ok())
        .unwrap_or_else(|| String::from("/usr/local/cuda"));

    if nvcc_available(&cuda_root) {
        return;
    }
    // No compiler found: switch the GPU-backed modules off.
    println!("cargo:rustc-cfg=no_cuda");
}
/// Reports whether an `nvcc` binary can be located.
///
/// Two probes are tried in order, stopping at the first hit:
/// 1. spawning `nvcc --version` via `PATH` (only the spawn has to succeed; the
///    exit status is not inspected);
/// 2. checking that `{cuda_path}/bin/nvcc` exists on disk.
fn nvcc_available(cuda_path: &str) -> bool {
    let spawnable_from_path = Command::new("nvcc").arg("--version").output().is_ok();
    spawnable_from_path || Path::new(&format!("{cuda_path}/bin/nvcc")).exists()
}

View File

@@ -13,3 +13,14 @@
pub mod finite_diff;
pub use finite_diff::{GradCheckConfig, GradCheckResult, ParamFn, grad_check};
// Tape-based autograd engine + differentiable ops (Phase T4). These call GPU
// kernels via xtrain-tensor, so they are gated behind `not(no_cuda)` (the
// per-crate convention); the grad_check harness above stays host-only.
#[cfg(not(no_cuda))]
pub mod ops;
#[cfg(not(no_cuda))]
pub mod tape;
#[cfg(not(no_cuda))]
pub use tape::Var;

View File

@@ -0,0 +1,164 @@
//! Tape-based reverse-mode autograd (Phase T4).
//!
//! A [`Var`] is a reference-counted node wrapping a value [`Tensor`], an optional
//! accumulated gradient, and — for non-leaf nodes — the parents it was computed
//! from plus a backward closure. Forward ops (see [`crate::ops`]) build these
//! nodes; [`Var::backward`] walks the graph in reverse topological order and
//! pushes gradients to parents, **summing** on fan-out (a tensor consumed by
//! several ops).
//!
//! The graph is dynamic (define-by-run) and the design favours clarity over
//! speed: each op synchronizes its own kernels (T3 has no streams yet), and
//! gradient accumulation is an explicit elementwise add.
#![cfg(not(no_cuda))]
use std::cell::RefCell;
use std::rc::Rc;
use xtrain_tensor::Tensor;
/// Backward closure stored on a non-leaf node.
///
/// [`Var::backward`] calls it with two arguments: the node's accumulated
/// gradient and the node's parents, in the order they were handed to
/// [`Var::from_op`]. The closure is expected to compute each parent's gradient
/// contribution and deliver it with [`Var::push_grad`]; it returns nothing.
pub type BackwardFn = Box<dyn Fn(&Tensor, &[Var])>;
/// Storage behind a [`Var`]: the forward value, the gradient slot, and the
/// edges/closure needed to propagate gradients to the nodes it came from.
pub struct VarNode {
/// Forward value held by this node.
pub value: Tensor,
/// Accumulated gradient; `None` until a gradient has been accumulated.
pub grad: Option<Tensor>,
/// Nodes this one was computed from. Empty for leaves and after
/// `detach_graph`.
parents: Vec<Var>,
/// Closure that distributes this node's gradient to `parents`. `None` for
/// leaves and after `detach_graph`.
backward: Option<BackwardFn>,
}
/// A handle to a node in the autograd tape.
///
/// Cloning only bumps the `Rc`, so every clone refers to the same underlying
/// [`VarNode`]. A tensor consumed by several ops is therefore one node with
/// several handles: gradients pushed through any of them land in the same
/// slot, and the topological sort dedups nodes by that shared pointer.
#[derive(Clone)]
pub struct Var(Rc<RefCell<VarNode>>);
impl Var {
    /// Creates a leaf node (a parameter or an input).
    ///
    /// A leaf has no parents and no backward closure. After
    /// [`Var::backward`], [`Var::grad`] returns the gradient accumulated for
    /// it, if the leaf fed the loss.
    pub fn leaf(value: Tensor) -> Self {
        Var(Rc::new(RefCell::new(VarNode {
            value,
            grad: None,
            parents: Vec::new(),
            backward: None,
        })))
    }

    /// Creates a non-leaf node from a forward `value`, the `parents` it was
    /// computed from, and the `backward` closure that distributes its gradient.
    ///
    /// Used by the op constructors in [`crate::ops`] and by callers composing
    /// custom nodes (e.g. a loss reduction or a transpose the built-in op set
    /// doesn't cover). `backward` later receives `parents` in the same order.
    pub fn from_op(value: Tensor, parents: Vec<Var>, backward: BackwardFn) -> Self {
        Var(Rc::new(RefCell::new(VarNode {
            value,
            grad: None,
            parents,
            backward: Some(backward),
        })))
    }

    /// Returns a clone of the value tensor.
    ///
    /// NOTE(review): the original comment states tensor storage is
    /// `Arc`-shared, which would make this cheap; that is a property of
    /// `xtrain_tensor::Tensor` and is not visible from this file.
    pub fn value(&self) -> Tensor {
        self.0.borrow().value.clone()
    }

    /// Returns a clone of the accumulated gradient, or `None` if nothing has
    /// been accumulated into this node.
    pub fn grad(&self) -> Option<Tensor> {
        self.0.borrow().grad.clone()
    }

    /// Pointer identity of the shared node, used to dedup nodes during the
    /// topological sort.
    fn id(&self) -> *const RefCell<VarNode> {
        Rc::as_ptr(&self.0)
    }

    /// Adds `g` into this node's grad slot (SUM — the fan-out rule).
    ///
    /// The first contribution is stored as-is; later ones are combined with
    /// the stored gradient via `Tensor::add`.
    fn accumulate(&self, g: Tensor) {
        let mut node = self.0.borrow_mut();
        let total = match node.grad.take() {
            Some(previous) => previous.add(&g),
            None => g,
        };
        node.grad = Some(total);
    }

    /// Runs reverse-mode backward from this node, treated as a scalar loss.
    ///
    /// The loss gradient is seeded with ones (`dL/dL = 1`, matching the
    /// `L = sum(W∘out)` grad-check convention), then every node that
    /// transitively feeds the loss is visited in reverse topological order and
    /// its backward closure is invoked with the node's accumulated gradient.
    ///
    /// Gradients are summed into whatever the nodes already hold; nothing here
    /// clears existing gradients, so a second call adds on top of the first.
    ///
    /// # Panics
    ///
    /// Panics if the value does not have exactly one element. A backward
    /// closure that borrows its own output node mutably will also panic,
    /// because that node stays immutably borrowed while its closure runs.
    pub fn backward(&self) {
        let loss_value = self.value();
        assert_eq!(
            loss_value.numel(),
            1,
            "backward() expects a scalar loss; got shape {:?}",
            loss_value.shape()
        );

        // 1. Topological order: every node appears after all of its parents.
        let mut topo: Vec<Var> = Vec::new();
        let mut visited = VisitedSet::new();
        build_topo(self, &mut topo, &mut visited);

        // 2. Seed the loss gradient with ones.
        self.accumulate(ones_like(&loss_value));

        // 3. Walk in reverse so a node's gradient is complete (all consumers
        //    have pushed into it) before it is handed to its own closure.
        for var in topo.iter().rev() {
            let node = var.0.borrow();
            // Nodes without a gradient did not contribute to the loss; nodes
            // without a closure are leaves. Either way there is nothing to do.
            if let (Some(grad), Some(bw)) = (node.grad.as_ref(), node.backward.as_ref()) {
                bw(grad, &node.parents);
            }
        }
    }

    /// Drops the parents and the backward closure so the graph can be freed,
    /// keeping the value and the gradient.
    /// (Not needed for tests; provided for completeness of the engine.)
    pub fn detach_graph(&self) {
        let mut node = self.0.borrow_mut();
        node.parents.clear();
        node.backward = None;
    }

    /// Delivers gradient `g` to `parent` (called from op backward closures).
    ///
    /// Every node accumulates its gradient: intermediates need it for the
    /// chain rule, leaves expose it as the result. The contribution is SUMMED
    /// into the existing slot, which is what makes fan-out correct.
    pub fn push_grad(parent: &Var, g: Tensor) {
        parent.accumulate(g);
    }
}

/// Pointer identities of the nodes already discovered by [`build_topo`].
type VisitedSet = std::collections::HashSet<*const RefCell<VarNode>>;

/// Appends to `topo` every not-yet-visited node reachable from `var`, in
/// post-order: a node is pushed only after all of its parents.
///
/// Nodes are deduplicated by pointer identity through `visited`. The traversal
/// uses an explicit stack instead of recursion so that graph depth is not
/// limited by the call stack, and visits parents in the same left-to-right
/// order a recursive post-order DFS would.
fn build_topo(var: &Var, topo: &mut Vec<Var>, visited: &mut VisitedSet) {
    if !visited.insert(var.id()) {
        return;
    }
    // Each frame is (node, index of the next parent still to be examined).
    let mut stack: Vec<(Var, usize)> = vec![(var.clone(), 0)];
    while let Some((node, cursor)) = stack.last_mut() {
        let next_parent = node.0.borrow().parents.get(*cursor).cloned();
        *cursor += 1;
        match next_parent {
            // Descend into a parent the first time it is seen.
            Some(parent) => {
                if visited.insert(parent.id()) {
                    stack.push((parent, 0));
                }
            }
            // All parents handled: the node itself can be emitted.
            None => {
                if let Some((finished, _)) = stack.pop() {
                    topo.push(finished);
                }
            }
        }
    }
}
/// Builds a tensor of `1.0`s with the same element count, shape, and device as
/// `t`; used as the backward seed for a scalar loss.
///
/// The data is assembled in a host-side `Vec<f32>` and then moved with
/// `to_device`.
fn ones_like(t: &Tensor) -> Tensor {
    let ones: Vec<f32> = std::iter::repeat(1.0f32).take(t.numel()).collect();
    let staged = Tensor::from_slice(&ones, t.shape());
    staged.to_device(t.device())
}