Implementing Basic Autograd Operations

#26 · Deep Learning · Medium

Problem

Implement basic autograd operations: a simple computational graph that supports forward computation and backward differentiation for addition, multiplication, and power operations.

Solution

class Value:
    def __init__(self, data, children=(), op=''):
        self.data = data
        self.grad = 0.0
        self._backward = lambda: None
        self._children = set(children)
        self._op = op

    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward
        return out

    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out

    def __pow__(self, other):
        out = Value(self.data ** other, (self,), f'**{other}')
        def _backward():
            self.grad += other * (self.data ** (other - 1)) * out.grad
        out._backward = _backward
        return out

    def relu(self):
        out = Value(max(0, self.data), (self,), 'relu')
        def _backward():
            self.grad += (1 if self.data > 0 else 0) * out.grad
        out._backward = _backward
        return out

    def backward(self):
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._children:
                    build_topo(child)
                topo.append(v)
        build_topo(self)
        self.grad = 1.0
        for v in reversed(topo):
            v._backward()

Explanation

Each Value stores data, gradient, children, and a backward function.
Addition: d(a+b)/da = 1, d(a+b)/db = 1, so both parents receive out.grad.
Multiplication: d(ab)/da = b, d(ab)/db = a (product rule).
Power: d(a^n)/da = n * a^(n-1) (power rule).
Backward: Build a topological ordering of the graph, then propagate gradients from output to inputs in reverse order.

Complexity

Time: O(V + E) where V is the number of values and E is the number of operations
Space: O(V) for storing gradients and the topological sort

← #25 #27 →