Implement the Adam (Adaptive Moment Estimation) optimizer from scratch. Given parameters, gradients, and hyperparameters (learning rate, beta1, beta2, epsilon), perform one step of the Adam update rule.
import numpy as np
def adam_optimizer(
    params, grads, m, v, t,
    lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
):
    """Perform one Adam update step over a list of parameter groups.

    Each of ``params``, ``grads``, ``m`` and ``v`` is a sequence with one
    entry per parameter group; every entry is converted to a float NumPy
    array, so an entry may be a scalar or an (arbitrarily nested) list of
    numbers. Entries at the same index must have broadcast-compatible shapes.

    Args:
        params: Current parameter values, one entry per group.
        grads: Gradients matching ``params`` entry by entry.
        m: First-moment (mean of gradients) estimates from the previous step.
        v: Second-moment (mean of squared gradients) estimates from the
            previous step.
        t: 1-based step counter used for bias correction.
        lr: Learning rate.
        beta1: Exponential decay rate for the first moment.
        beta2: Exponential decay rate for the second moment.
        epsilon: Small constant added to the denominator for stability.

    Returns:
        A tuple ``(updated_params, new_m, new_v)`` of lists. Each element is
        the ``tolist()`` form of the corresponding array (a Python float for
        scalar entries, a nested list otherwise). The returned moments are
        the biased estimates, ready to be passed to the next call.

    Raises:
        ValueError: If ``t`` is less than 1, or if ``params``, ``grads``,
            ``m`` and ``v`` do not all have the same length.
    """
    # With t <= 0 the bias-correction denominators are zero or negative,
    # which would silently turn the update into NaN/inf or flip its sign.
    if t < 1:
        raise ValueError(f"t must be >= 1 for bias correction, got {t!r}")
    if not (len(params) == len(grads) == len(m) == len(v)):
        raise ValueError(
            "params, grads, m and v must all have the same length, got "
            f"{len(params)}, {len(grads)}, {len(m)} and {len(v)}"
        )

    # Bias-correction denominators depend only on t, so compute them once.
    m_correction = 1 - beta1 ** t
    v_correction = 1 - beta2 ** t

    updated_params = []
    new_m = []
    new_v = []
    for param, grad, m_prev, v_prev in zip(params, grads, m, v):
        p = np.asarray(param, dtype=float)
        g = np.asarray(grad, dtype=float)
        m_i = np.asarray(m_prev, dtype=float)
        v_i = np.asarray(v_prev, dtype=float)

        # Biased first and second moment estimates.
        m_i = beta1 * m_i + (1 - beta1) * g
        v_i = beta2 * v_i + (1 - beta2) * (g ** 2)

        # Bias-corrected estimates.
        m_hat = m_i / m_correction
        v_hat = v_i / v_correction

        p = p - lr * m_hat / (np.sqrt(v_hat) + epsilon)

        # NumPy arrays and NumPy scalars both provide tolist(), so the
        # results always come back as plain Python floats / nested lists.
        updated_params.append(p.tolist())
        new_m.append(m_i.tolist())
        new_v.append(v_i.tolist())

    return updated_params, new_m, new_v