#170 · Optimization · Medium
# Solve on deep-ml.com
# Implement a single step of the Muon optimizer with matrix preconditioning.
# Muon applies Newton-Schulz iterations to approximate the inverse matrix
# square root of the gradient's covariance, which is used for preconditioning.
import numpy as np
def muon_step(params: np.ndarray, grads: np.ndarray,
              momentum_buffer: np.ndarray, lr: float = 0.02,
              momentum: float = 0.95, ns_steps: int = 5):
    """Perform a single Muon optimizer update.

    The step has three stages:

    1. Momentum: ``buf = momentum * old_buf + grads``.
    2. Preconditioning: ``buf`` is scaled to (almost) unit Frobenius norm and
       refined with ``ns_steps`` Newton-Schulz iterations
       ``X <- X (3I - X^T X) / 2``, which approximately orthogonalize it.
       For wide matrices the equivalent left-multiplied form
       ``X <- (3I - X X^T) X / 2`` is used so the Gram matrix is the smaller
       of the two possible sizes.
    3. Update: the result is rescaled by ``sqrt(max(rows, cols))`` and
       subtracted from ``params`` with step size ``lr``.

    Args:
        params: Current parameters (1-D or 2-D array).
        grads: Gradient of the loss w.r.t. ``params``; same shape as
            ``params``.
        momentum_buffer: Momentum buffer from the previous step; same shape
            as ``grads``.
        lr: Learning rate.
        momentum: Momentum coefficient applied to ``momentum_buffer``.
        ns_steps: Number of Newton-Schulz iterations.

    Returns:
        A tuple ``(params_new, buf)`` holding the updated parameters and the
        updated momentum buffer. The inputs are not modified.
    """
    # Momentum accumulation: buf = momentum * old_buf + grads.
    buf = momentum * momentum_buffer + grads

    # Newton-Schulz needs a matrix; vectors (and scalars) are treated as a
    # single column and restored to their original shape at the end.
    G = buf.reshape(-1, 1) if buf.ndim < 2 else buf
    rows, cols = G.shape[0], G.shape[1]

    # sqrt(trace(G^T G)) is the Frobenius norm. Normalizing first keeps the
    # singular values small enough for the iteration to converge; the small
    # epsilon guards against division by zero for an all-zero buffer.
    X = G / (np.linalg.norm(G) + 1e-7)

    # Work with the smaller Gram matrix: X^T X when tall, X X^T when wide.
    tall = rows >= cols
    three_eye = 3 * np.eye(min(rows, cols))
    for _ in range(ns_steps):
        if tall:
            # X <- X (3I - X^T X) / 2
            X = X @ (three_eye - X.T @ X) / 2
        else:
            # X <- (3I - X X^T) X / 2
            X = (three_eye - X @ X.T) @ X / 2

    preconditioned = X * np.sqrt(max(rows, cols))

    # Undo the column reshape for 1-D inputs (a no-op for matrices).
    params_new = params - lr * preconditioned.reshape(buf.shape)
    return params_new, buf