"""Implement the Momentum optimizer.

Momentum accelerates SGD by accumulating an exponentially decaying moving
average of past gradients, which helps it navigate ravines and reduces
oscillation.
"""
import numpy as np
class MomentumOptimizer:
    """Stateful SGD-with-momentum optimizer.

    Each call to :meth:`update` applies::

        velocity = momentum * velocity - learning_rate * grads
        params   = params + velocity

    The velocity is stored on the instance between calls and starts out as
    zeros with the same shape as the first ``params`` passed to
    :meth:`update`.
    """

    def __init__(self, learning_rate: float = 0.01, momentum: float = 0.9):
        """Store the hyperparameters.

        Args:
            learning_rate: Step size that scales the gradient.
            momentum: Decay factor applied to the previous velocity.
        """
        self.lr = learning_rate
        self.momentum = momentum
        # Created lazily on the first update(), once the parameter shape
        # is known.
        self.velocity = None

    def update(self, params: np.ndarray, grads: np.ndarray) -> np.ndarray:
        """Perform one momentum step and return the updated parameters.

        Args:
            params: Current parameter values (array or array-like).
            grads: Gradient of the loss with respect to ``params``.

        Returns:
            A new array holding the updated parameters. The ``params``
            argument itself is not modified.
        """
        # Coerce array-likes up front: a plain Python list would otherwise
        # fail on ``float * list``.
        params = np.asarray(params)
        grads = np.asarray(grads)

        if self.velocity is None:
            self.velocity = np.zeros_like(params)

        self.velocity = self.momentum * self.velocity - self.lr * grads
        return params + self.velocity
def momentum_update(params: np.ndarray, grads: np.ndarray, velocity: np.ndarray,
                    lr: float = 0.01, momentum: float = 0.9) -> tuple[np.ndarray, np.ndarray]:
    """Apply one stateless momentum step.

    The update rule is::

        v      = momentum * v - lr * gradient
        params = params + v

    Args:
        params: Current parameter values (array or array-like).
        grads: Gradient of the loss with respect to ``params``.
        velocity: Velocity carried over from the previous step; pass zeros
            for the first step.
        lr: Step size that scales the gradient.
        momentum: Decay factor applied to the previous velocity.

    Returns:
        A ``(params, velocity)`` tuple with the updated parameters and the
        new velocity. The input arrays are not modified.
    """
    # Coerce array-likes up front: a plain Python list would otherwise fail
    # on ``float * list``.
    params = np.asarray(params)
    grads = np.asarray(grads)
    velocity = np.asarray(velocity)

    velocity = momentum * velocity - lr * grads
    params = params + velocity
    return params, velocity