#177 · Deep Learning · Medium
⊣ Solve on deep-ml.com
Implement MuonClip (qk-clip) for stabilizing attention in transformers. This technique clips the query-key dot products to prevent extreme attention scores that can destabilize training.
from typing import Optional

import numpy as np
def muon_clip_attention(
    Q: np.ndarray,
    K: np.ndarray,
    V: np.ndarray,
    clip_value: float = 1.0,
    temperature: Optional[float] = None,
) -> np.ndarray:
    """Compute attention with clipped query-key scores (MuonClip / qk-clip).

    The raw scores are ``Q K^T / temperature`` (``temperature`` defaults to
    ``sqrt(d_k)``). They are clipped to ``[-clip_value, clip_value]`` to
    prevent extreme values, passed through a softmax over the key axis, and
    used to take a weighted sum of the rows of ``V``.

    Args:
        Q: Queries of shape ``(..., n_queries, d_k)``.
        K: Keys of shape ``(..., n_keys, d_k)``.
        V: Values of shape ``(..., n_keys, d_v)``.
        clip_value: Non-negative bound applied symmetrically to the scaled
            scores before the softmax.
        temperature: Positive divisor for the raw dot products. When
            ``None``, ``sqrt(d_k)`` is used, with ``d_k = Q.shape[-1]``.

    Returns:
        Attention output of shape ``(..., n_queries, d_v)``.

    Raises:
        ValueError: If ``clip_value`` is negative or ``temperature`` is not
            strictly positive.
    """
    if clip_value < 0:
        # np.clip performs no min <= max check; inverted bounds give
        # meaningless scores instead of an error.
        raise ValueError(f"clip_value must be non-negative, got {clip_value}")

    d_k = Q.shape[-1]
    if temperature is None:
        temperature = np.sqrt(d_k)
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    # Swap only the last two axes so leading batch/head dimensions are kept;
    # K.T would reverse every axis and break inputs with more than 2 dims.
    scores = Q @ np.swapaxes(K, -1, -2) / temperature
    scores = np.clip(scores, -clip_value, clip_value)

    # Subtract the row-wise max before exponentiating for numerical stability.
    exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
    return attention_weights @ V