From 224f750ee45687e5261d612495076f825f06be26 Mon Sep 17 00:00:00 2001
From: Gahow Wang <gahow.wang@gmail.com>
Date: Mon, 15 Jun 2026 15:44:17 +0800
Subject: [PATCH] autograd: tape engine + grad accumulation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Var = Rc<RefCell<VarNode>> on a define-by-run tape: value + optional grad +
parents + backward closure. backward() seeds a scalar loss, walks reverse
topo order, and pushes grads to parents. push_grad always SUMs into the grad
slot — the fan-out accumulation path T3 lacked. A per-crate build.rs emits the
no_cuda cfg (cfgs do not propagate across crates); the engine is gated on it,
while grad_check stays host-only.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 crates/xtrain-autodiff/build.rs    |  27 +++++
 crates/xtrain-autodiff/src/lib.rs  |  11 ++
 crates/xtrain-autodiff/src/tape.rs | 164 +++++++++++++++++++++++++++++
 3 files changed, 202 insertions(+)
 create mode 100644 crates/xtrain-autodiff/build.rs
 create mode 100644 crates/xtrain-autodiff/src/tape.rs
diff --git a/crates/xtrain-autodiff/build.rs b/crates/xtrain-autodiff/build.rs
new file mode 100644
index 0000000..0ffc22e
--- /dev/null
+++ b/crates/xtrain-autodiff/build.rs
@@ -0,0 +1,27 @@
+use std::env;
+use std::path::Path;
+use std::process::Command;
+
+// xtrain-autodiff's `Var` tape calls GPU ops (via xtrain-tensor), so it gates
+// those call sites behind `not(no_cuda)` — the same per-crate convention as
+// xtrain-cuda / xtrain-tensor (cfg does not propagate across crates). The
+// grad_check harness itself is host-only and always compiles. This script only
+// detects nvcc and emits the cfg; it compiles no CUDA.
+fn main() {
+    println!("cargo:rustc-check-cfg=cfg(no_cuda)");
+    // Toolkit root: CUDA_HOME wins, then CUDA_PATH, then the stock location.
+    let toolkit_root = env::var("CUDA_HOME")
+        .or_else(|_| env::var("CUDA_PATH"))
+        .unwrap_or_else(|_| String::from("/usr/local/cuda"));
+    // No nvcc found by either probe: emit the cfg that gates the GPU call sites.
+    if !nvcc_available(&toolkit_root) {
+        println!("cargo:rustc-cfg=no_cuda");
+    }
+}
+
+/// True when an `nvcc` on `PATH` can be spawned or `<cuda_path>/bin/nvcc` exists.
+fn nvcc_available(cuda_path: &str) -> bool {
+    // Only the spawn result is checked; nvcc's exit status is not inspected.
+    let on_path = Command::new("nvcc").arg("--version").output().is_ok();
+    on_path || Path::new(&format!("{cuda_path}/bin/nvcc")).exists()
+}
diff --git a/crates/xtrain-autodiff/src/lib.rs b/crates/xtrain-autodiff/src/lib.rs
index d8584ad..7a5546d 100644
--- a/crates/xtrain-autodiff/src/lib.rs
+++ b/crates/xtrain-autodiff/src/lib.rs
@@ -13,3 +13,14 @@
 pub mod finite_diff;
 
 pub use finite_diff::{GradCheckConfig, GradCheckResult, ParamFn, grad_check};
+
+// Tape-based autograd engine + differentiable ops (Phase T4). These call GPU
+// kernels via xtrain-tensor, so they are gated behind `not(no_cuda)` (the
+// per-crate convention); the grad_check harness above stays host-only.
+#[cfg(not(no_cuda))]
+pub mod ops;
+#[cfg(not(no_cuda))]
+pub mod tape;
+
+#[cfg(not(no_cuda))]
+pub use tape::Var;
diff --git a/crates/xtrain-autodiff/src/tape.rs b/crates/xtrain-autodiff/src/tape.rs
new file mode 100644
index 0000000..8e7c363
--- /dev/null
+++ b/crates/xtrain-autodiff/src/tape.rs
@@ -0,0 +1,164 @@
+//! Tape-based reverse-mode autograd (Phase T4).
+//!
+//! A [`Var`] is a reference-counted node wrapping a value [`Tensor`], an optional
+//! accumulated gradient, and — for non-leaf nodes — the parents it was computed
+//! from plus a backward closure. Forward ops (see [`crate::ops`]) build these
+//! nodes; [`Var::backward`] walks the graph in reverse topological order and
+//! pushes gradients to parents, **summing** on fan-out (a tensor consumed by
+//! several ops).
+//!
+//! The graph is dynamic (define-by-run) and the design favours clarity over
+//! speed: each op synchronizes its own kernels (T3 has no streams yet), and
+//! gradient accumulation is an explicit elementwise add.
+
+#![cfg(not(no_cuda))]
+
+use std::cell::RefCell;
+use std::rc::Rc;
+use xtrain_tensor::Tensor;
+
+/// Backward closure, invoked by [`Var::backward`] as `f(&node_grad, &parents)`:
+/// it computes each parent's contribution and hands it over via `push_grad`.
+pub type BackwardFn = Box<dyn Fn(&Tensor, &[Var])>;
+
+pub struct VarNode {
+    pub value: Tensor, // forward value held by this node
+    pub grad: Option<Tensor>, // summed gradient; `None` until something is accumulated
+    parents: Vec<Var>, // nodes this one was computed from; empty for a leaf
+    backward: Option<BackwardFn>, // closure routing grad to `parents`; `None` for a leaf
+}
+
+/// A node handle in the autograd tape. Cloning only bumps the `Rc`, so all
+/// clones alias one `VarNode`; that shared pointer is the node's identity.
+#[derive(Clone)]
+pub struct Var(Rc<RefCell<VarNode>>);
+
+impl Var {
+    /// A leaf node (parameter / input): no parents, no backward closure. After
+    /// `backward`, its `.grad()` holds the accumulated loss gradient w.r.t. it.
+    pub fn leaf(value: Tensor) -> Self {
+        Var(Rc::new(RefCell::new(VarNode {
+            value,
+            grad: None,
+            parents: Vec::new(),
+            backward: None,
+        })))
+    }
+
+    /// Build a non-leaf node from a forward `value`, the `parents` it was computed
+    /// from, and the `backward` closure that routes its grad to them. Used by the
+    /// op constructors in [`crate::ops`] and by callers that compose custom nodes
+    /// (e.g. a loss reduction or a transpose the built-in op set doesn't cover).
+    pub fn from_op(value: Tensor, parents: Vec<Var>, backward: BackwardFn) -> Self {
+        Var(Rc::new(RefCell::new(VarNode {
+            value,
+            grad: None,
+            parents,
+            backward: Some(backward),
+        })))
+    }
+
+    /// Clone of the value tensor (cheap: storage is `Arc`-shared).
+    pub fn value(&self) -> Tensor {
+        self.0.borrow().value.clone()
+    }
+
+    /// The accumulated gradient, if any (`None` until something pushes one).
+    pub fn grad(&self) -> Option<Tensor> {
+        self.0.borrow().grad.clone()
+    }
+
+    /// Pointer identity, used to dedup nodes during the topological sort.
+    fn id(&self) -> *const RefCell<VarNode> {
+        Rc::as_ptr(&self.0)
+    }
+
+    /// Sum `g` into this node's grad slot, or store it if the slot is empty.
+    fn accumulate(&self, g: Tensor) {
+        let mut node = self.0.borrow_mut();
+        node.grad = Some(match node.grad.take() {
+            Some(existing) => existing.add(&g),
+            None => g,
+        });
+    }
+
+    /// Reverse-mode backward from this node, treated as a scalar loss: its own
+    /// grad is seeded with ones, then every node that transitively feeds it has
+    /// its `.grad` populated. Grads are summed into whatever is already stored,
+    /// so a second call on the same nodes accumulates on top of the first.
+    ///
+    /// # Panics
+    /// If the value is not a single element — the seed `dL/dL = 1` is only
+    /// unambiguous for a scalar (the `L = sum(W∘out)` grad-check convention).
+    pub fn backward(&self) {
+        assert_eq!(
+            self.value().numel(),
+            1,
+            "backward() expects a scalar loss; got shape {:?}",
+            self.value().shape()
+        );
+
+        // 1. Post-order DFS: every node appears after all of its parents.
+        let topo = build_topo(self);
+
+        // 2. Seed the loss gradient with ones.
+        self.accumulate(ones_like(&self.value()));
+
+        // 3. Walk in reverse so a node's grad is complete (all consumers have
+        //    pushed into it) before its closure forwards it to the parents.
+        for var in topo.iter().rev() {
+            // Shared borrow only: closures are expected to mutate the parents
+            // (distinct nodes) via `push_grad`, never `var` itself.
+            let node = var.0.borrow();
+            // Skip nodes nothing pushed a grad into, and leaves (no closure).
+            if let (Some(grad), Some(bw)) = (&node.grad, &node.backward) {
+                bw(grad, &node.parents);
+            }
+        }
+    }
+
+    /// Drop the parents/closure so the graph can be freed, keeping value+grad.
+    /// (Not needed for tests; provided for completeness of the engine.)
+    pub fn detach_graph(&self) {
+        let mut node = self.0.borrow_mut();
+        node.parents.clear();
+        node.backward = None;
+    }
+
+    /// Distribute `g` to a parent (called from op backward closures). Every node
+    /// accumulates its grad: intermediates need it for the chain rule, leaves
+    /// expose it as the result. This SUM is what makes fan-out correct.
+    pub fn push_grad(parent: &Var, g: Tensor) {
+        parent.accumulate(g);
+    }
+}
+
+/// Post-order DFS from `root`: a node is appended only after all its parents,
+/// and each node appears once (dedup by pointer identity in a hash set). Uses
+/// an explicit stack rather than recursion, so graph depth is not bounded by
+/// the call stack; the resulting order matches the recursive formulation.
+fn build_topo(root: &Var) -> Vec<Var> {
+    let mut topo: Vec<Var> = Vec::new();
+    let mut visited = std::collections::HashSet::new();
+    // Each frame is (node, index of the next parent still to be explored).
+    let mut stack: Vec<(Var, usize)> = vec![(root.clone(), 0)];
+    visited.insert(root.id());
+    while let Some((var, next)) = stack.pop() {
+        let parent = var.0.borrow().parents.get(next).cloned();
+        let Some(parent) = parent else {
+            topo.push(var); // parents exhausted: post-order position
+            continue;
+        };
+        stack.push((var, next + 1));
+        if visited.insert(parent.id()) {
+            stack.push((parent, 0));
+        }
+    }
+    topo
+}
+
+/// Ones with `t`'s shape, placed on `t`'s device (the seed grad for a scalar).
+fn ones_like(t: &Tensor) -> Tensor {
+    let ones: Vec<f32> = std::iter::repeat(1.0f32).take(t.numel()).collect();
+    Tensor::from_slice(&ones, t.shape()).to_device(t.device())
+}