"""Adamax optimizer, a variant of Adam based on the infinity norm.

Instead of tracking the second moment (an exponentially weighted mean of
squared gradients), Adamax tracks an exponentially weighted maximum of
absolute gradients and uses it to scale each update.
"""
import numpy as np
class Adamax:
    """Stateful Adamax optimizer (Adam variant based on the infinity norm).

    Keeps an exponentially weighted mean of the gradients (``m``) and an
    exponentially weighted maximum of the absolute gradients (``u``). Each
    call to :meth:`update` advances the step counter ``t`` and returns the
    updated parameters.
    """

    def __init__(self, learning_rate: float = 0.002, beta1: float = 0.9,
                 beta2: float = 0.999, epsilon: float = 1e-8):
        """Store the hyperparameters and reset the optimizer state.

        Args:
            learning_rate: Step size applied to every update.
            beta1: Decay rate of the first-moment estimate.
            beta2: Decay rate applied to the running infinity norm.
            epsilon: Constant added to the denominator so the update stays
                finite while the running infinity norm is zero.
        """
        self.lr = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = None  # first moment, allocated lazily on the first update
        self.u = None  # running infinity norm, allocated lazily as well
        self.t = 0  # number of updates performed so far

    def update(self, params: np.ndarray, grads: np.ndarray) -> np.ndarray:
        """Perform one Adamax step and return the new parameters.

        Args:
            params: Current parameter values (array or array-like).
            grads: Gradient with respect to ``params``; must broadcast
                against ``params``.

        Returns:
            A new array holding the updated parameters. The input is not
            modified in place.
        """
        # Coerce so plain lists and scalars work too; ndarray inputs pass
        # through unchanged, so their dtype is not altered.
        params = np.asarray(params)
        grads = np.asarray(grads)

        if self.m is None:
            self.m = np.zeros_like(params)
            self.u = np.zeros_like(params)

        self.t += 1

        # Biased first-moment estimate.
        self.m = self.beta1 * self.m + (1 - self.beta1) * grads
        # Exponentially weighted infinity norm: decay the previous maximum,
        # then take the element-wise max with the current |grad|.
        self.u = np.maximum(self.beta2 * self.u, np.abs(grads))

        # Only the first moment is bias-corrected; t >= 1 here, so the
        # correction factor is non-zero for beta1 < 1.
        m_hat = self.m / (1 - self.beta1 ** self.t)

        return params - self.lr * m_hat / (self.u + self.epsilon)
def adamax_update(params: np.ndarray, grads: np.ndarray, m: np.ndarray, u: np.ndarray,
                  t: int, lr: float = 0.002, beta1: float = 0.9, beta2: float = 0.999,
                  epsilon: float = 1e-8) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Perform one stateless Adamax step.

    The running infinity norm is updated as ``u = max(beta2 * u, |grad|)``;
    this is the L-infinity norm version of Adam's second moment. The
    parameters are then updated as ``params -= lr * m_hat / (u + epsilon)``,
    where ``m_hat`` is the bias-corrected first moment.

    Args:
        params: Current parameter values (array or array-like).
        grads: Gradient with respect to ``params``.
        m: First-moment estimate from the previous step.
        u: Running infinity norm from the previous step.
        t: 1-based step counter used for the bias correction.
        lr: Step size.
        beta1: Decay rate of the first-moment estimate.
        beta2: Decay rate applied to the running infinity norm.
        epsilon: Constant added to the denominator so the update stays
            finite while ``u`` is zero.

    Returns:
        A tuple ``(params, m, u)`` of newly computed arrays; the inputs are
        not modified in place.

    Raises:
        ValueError: If ``t`` is less than 1. With ``t == 0`` the bias
            correction ``1 - beta1 ** t`` is zero and the update would
            divide by zero.
    """
    if t < 1:
        raise ValueError(f"t must be >= 1 for bias correction, got {t}")

    # Coerce so plain lists and scalars work too; ndarray inputs pass
    # through unchanged, so their dtype is not altered.
    params = np.asarray(params)
    grads = np.asarray(grads)
    m = np.asarray(m)
    u = np.asarray(u)

    # Biased first-moment estimate.
    m = beta1 * m + (1 - beta1) * grads
    # Exponentially weighted infinity norm.
    u = np.maximum(beta2 * u, np.abs(grads))
    # Only the first moment is bias-corrected.
    m_hat = m / (1 - beta1 ** t)

    params = params - lr * m_hat / (u + epsilon)
    return params, m, u