"""Implement the Adadelta optimizer.

Adadelta is an extension of Adagrad that seeks to reduce its aggressive,
monotonically decreasing learning rate. Instead of accumulating all past
squared gradients, it restricts the window of accumulated past gradients to a
fixed size by using an exponential moving average.
"""
import numpy as np
class Adadelta:
    """Stateful Adadelta optimizer.

    Keeps two exponential moving averages between calls: one of the squared
    gradients and one of the squared parameter updates. Both are created
    lazily on the first call to ``update`` with the shape of the parameters
    passed to that call.
    """

    def __init__(self, rho: float = 0.95, epsilon: float = 1e-6):
        """Store the hyperparameters; accumulators are allocated on first use.

        Args:
            rho: Decay rate shared by both moving averages.
            epsilon: Constant added inside both square roots. It keeps the
                denominator away from zero and makes the very first step
                non-zero even though the update accumulator starts at zero.
        """
        self.rho = rho
        self.epsilon = epsilon
        self.E_g2 = None  # running avg of squared gradients
        self.E_dx2 = None  # running avg of squared updates

    def update(self, params: np.ndarray, grads: np.ndarray) -> np.ndarray:
        """Apply one Adadelta step and return the new parameters.

        The inputs are not modified; the accumulators on ``self`` are
        replaced with their updated values.

        Args:
            params: Current parameter values (array or array-like).
            grads: Gradient of the loss with respect to ``params``
                (array or array-like).

        Returns:
            A new array holding ``params + delta``.
        """
        # Accept lists/scalars as well as arrays; ndarrays pass through as-is.
        params = np.asarray(params)
        grads = np.asarray(grads)

        if self.E_g2 is None:
            self.E_g2 = np.zeros_like(params)
            self.E_dx2 = np.zeros_like(params)

        # Accumulate gradient
        self.E_g2 = self.rho * self.E_g2 + (1 - self.rho) * grads ** 2

        # Compute update (RMS of past updates / RMS of current gradient).
        # E_dx2 is deliberately the value from *before* this step.
        rms_dx = np.sqrt(self.E_dx2 + self.epsilon)
        rms_g = np.sqrt(self.E_g2 + self.epsilon)
        delta = -(rms_dx / rms_g) * grads

        # Accumulate updates
        self.E_dx2 = self.rho * self.E_dx2 + (1 - self.rho) * delta ** 2

        return params + delta
def adadelta_update(
    params: np.ndarray,
    grads: np.ndarray,
    E_g2: np.ndarray,
    E_dx2: np.ndarray,
    rho: float = 0.95,
    epsilon: float = 1e-6,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Perform one stateless Adadelta step.

    The step is scaled by RMS(dx_prev) / RMS(grad) instead of a fixed
    learning rate -- the units match naturally.

    None of the inputs are modified; the caller is expected to feed the
    returned accumulators back in on the next call.

    Args:
        params: Current parameter values (array or array-like).
        grads: Gradient of the loss with respect to ``params``.
        E_g2: Running average of squared gradients from the previous step.
        E_dx2: Running average of squared updates from the previous step.
        rho: Decay rate shared by both running averages.
        epsilon: Constant added inside both square roots; it keeps the
            denominator away from zero and makes the first step non-zero
            when ``E_dx2`` is all zeros.

    Returns:
        A tuple ``(params, E_g2, E_dx2)`` with the updated parameters and
        the two updated running averages.
    """
    # Accept lists/scalars as well as arrays; ndarrays pass through as-is.
    params = np.asarray(params)
    grads = np.asarray(grads)
    E_g2 = np.asarray(E_g2)
    E_dx2 = np.asarray(E_dx2)

    E_g2 = rho * E_g2 + (1 - rho) * grads ** 2

    # The numerator uses E_dx2 from *before* this step (RMS of past updates).
    rms_dx = np.sqrt(E_dx2 + epsilon)
    rms_g = np.sqrt(E_g2 + epsilon)
    delta = -(rms_dx / rms_g) * grads

    E_dx2 = rho * E_dx2 + (1 - rho) * delta ** 2
    params = params + delta
    return params, E_g2, E_dx2