Implement the RMSProp optimizer from scratch. RMSProp maintains a running average of squared gradients and divides the gradient by the square root of this average to normalize the update.
import numpy as np
class RMSProp:
    """RMSProp optimizer over a list of NumPy parameter arrays.

    For every parameter the optimizer keeps a running average of squared
    gradients (``cache``) and divides the gradient by the square root of
    that average before applying the learning rate::

        cache = decay_rate * cache + (1 - decay_rate) * grad ** 2
        param = param - lr * grad / (sqrt(cache) + epsilon)

    Args:
        params: Parameter arrays to optimize.
        lr: Learning rate applied to the normalized gradient.
        decay_rate: Weight given to the previous running average.
        epsilon: Small constant added to the denominator to prevent
            division by zero.
    """

    def __init__(self, params: list[np.ndarray], lr: float = 0.001,
                 decay_rate: float = 0.9, epsilon: float = 1e-8):
        self.params = params
        self.lr = lr
        self.decay_rate = decay_rate
        self.epsilon = epsilon
        # One zero-initialised accumulator per parameter.
        self.cache = [np.zeros_like(p) for p in params]

    def step(self, gradients: list[np.ndarray]) -> list[np.ndarray]:
        """Apply one RMSProp update and return the new parameters.

        The stored parameters are replaced by freshly computed arrays; the
        arrays previously held in ``self.params`` are not modified in place.

        Args:
            gradients: One gradient per parameter, in the same order as
                ``self.params``.

        Returns:
            The list of updated parameters (also stored on ``self.params``).

        Raises:
            ValueError: If the number of gradients differs from the number
                of parameters.
        """
        gradients = list(gradients)
        # Validate before touching any state: zip() would otherwise truncate
        # silently and self.params would lose every unmatched parameter.
        if len(gradients) != len(self.params):
            raise ValueError(
                f"expected {len(self.params)} gradients, "
                f"got {len(gradients)}"
            )

        updated_params = []
        for i, (param, grad) in enumerate(zip(self.params, gradients)):
            self.cache[i] = (self.decay_rate * self.cache[i]
                             + (1 - self.decay_rate) * grad ** 2)
            denom = np.sqrt(self.cache[i]) + self.epsilon
            updated_params.append(param - self.lr * grad / denom)

        self.params = updated_params
        return updated_params
def rmsprop_update(
    params: list[np.ndarray],
    gradients: list[np.ndarray],
    cache: list[np.ndarray],
    lr: float = 0.001,
    decay_rate: float = 0.9,
    epsilon: float = 1e-8,
) -> tuple[list[np.ndarray], list[np.ndarray]]:
    """Perform one stateless RMSProp update.

    Update rule, applied element-wise to each parameter::

        cache = decay_rate * cache + (1 - decay_rate) * grad^2
        param = param - lr * grad / (sqrt(cache) + epsilon)

    ``epsilon`` prevents division by zero.

    Args:
        params: Current parameter arrays.
        gradients: One gradient per parameter, in the same order.
        cache: One running average of squared gradients per parameter.
        lr: Learning rate applied to the normalized gradient.
        decay_rate: Weight given to the previous running average.
        epsilon: Small constant added to the denominator.

    Returns:
        A ``(new_params, new_cache)`` tuple of lists. The input lists are
        left untouched; new arrays are computed for every entry.

    Raises:
        ValueError: If ``params``, ``gradients`` and ``cache`` do not all
            have the same length.
    """
    params = list(params)
    gradients = list(gradients)
    cache = list(cache)
    # zip() would silently drop trailing entries on a length mismatch.
    if not (len(params) == len(gradients) == len(cache)):
        raise ValueError(
            "params, gradients and cache must have the same length, got "
            f"{len(params)}, {len(gradients)} and {len(cache)}"
        )

    new_params = []
    new_cache = []
    for p, g, c in zip(params, gradients, cache):
        c = decay_rate * c + (1 - decay_rate) * g ** 2
        new_params.append(p - lr * g / (np.sqrt(c) + epsilon))
        new_cache.append(c)
    return new_params, new_cache