Implement the Adagrad (Adaptive Gradient) optimizer. Adagrad adapts the learning rate for each parameter based on the historical sum of squared gradients, giving larger updates to infrequent parameters and smaller updates to frequent ones.
import numpy as np
class Adagrad:
    """Stateful Adagrad optimizer.

    Every call to :meth:`update` adds the element-wise squared gradient to a
    running sum kept on the instance, then scales each parameter's step by
    the inverse square root of that sum. Parameters that keep receiving large
    gradients therefore take progressively smaller steps.
    """

    def __init__(self, learning_rate: float = 0.01, epsilon: float = 1e-8):
        """Store the hyper-parameters; the accumulator is created lazily.

        Args:
            learning_rate: Base step size applied to every parameter.
            epsilon: Constant added to the square-rooted accumulator so the
                division is defined when the accumulated value is zero.
        """
        self.lr = learning_rate
        self.epsilon = epsilon
        # Running sum of squared gradients; allocated on the first update()
        # call, once the parameter shape is known.
        self.accumulated = None

    def update(self, params: np.ndarray, grads: np.ndarray) -> np.ndarray:
        """Apply one Adagrad step and return the updated parameters.

        The input arrays are not modified; only ``self.accumulated`` is
        updated in place.

        Args:
            params: Current parameter values.
            grads: Gradient of the loss with respect to ``params``.

        Returns:
            A new array holding the parameters after the step.
        """
        params = np.asarray(params)
        grads = np.asarray(grads)

        if self.accumulated is None:
            # Allocate the accumulator in a floating dtype promoted from both
            # inputs. Copying the dtype of integer `params` would make the
            # in-place accumulation below fail with a casting error as soon
            # as a float gradient arrives.
            state_dtype = np.result_type(params.dtype, grads.dtype, np.float16)
            self.accumulated = np.zeros_like(params, dtype=state_dtype)

        self.accumulated += grads ** 2
        scale = np.sqrt(self.accumulated) + self.epsilon
        return params - self.lr * grads / scale
def adagrad_update(params: np.ndarray, grads: np.ndarray, accumulated_sq_grads: np.ndarray,
                   lr: float = 0.01, epsilon: float = 1e-8) -> tuple[np.ndarray, np.ndarray]:
    """Perform a single stateless Adagrad step.

    The caller owns the optimizer state: pass in the running sum of squared
    gradients and receive the updated sum back alongside the new parameters.
    None of the input arrays are modified.

    Args:
        params: Current parameter values.
        grads: Gradient of the loss with respect to ``params``.
        accumulated_sq_grads: Running sum of squared gradients from earlier
            steps.
        lr: Base step size.
        epsilon: Constant added to the square-rooted accumulator before
            dividing.

    Returns:
        A ``(new_params, new_accumulated_sq_grads)`` tuple.
    """
    history = accumulated_sq_grads + grads ** 2
    # Per-parameter scaling: the larger the gradient history, the smaller
    # the effective step.
    denominator = np.sqrt(history) + epsilon
    scaled_grads = grads / denominator
    return params - lr * scaled_grads, history