"""Rotary Positional Embeddings (RoPE).

RoPE encodes position information by rotating query and key vectors in pairs
of dimensions. As a result, the dot product between a rotated query and a
rotated key naturally encodes their relative position.
"""
import numpy as np
def precompute_freqs(
    d_model: int, max_len: int, base: float = 10000.0
) -> tuple[np.ndarray, np.ndarray]:
    """Precompute the cosine and sine tables for rotary embeddings.

    Feature pair ``i`` rotates with frequency
    ``theta_i = 1 / base ** (2 * i / d_model)``, and the rotation angle at
    position ``p`` is ``p * theta_i``.

    Args:
        d_model: Size of the feature dimension to be rotated. Must be even,
            because features are rotated in pairs.
        max_len: Number of positions (0 .. max_len - 1) to tabulate.
        base: Base of the geometric frequency progression.

    Returns:
        ``(cos, sin)``, each of shape ``(max_len, d_model // 2)``, holding
        the cosine and sine of every (position, pair) angle.

    Raises:
        ValueError: If ``d_model`` is odd.
    """
    # An odd d_model would yield (d_model + 1) // 2 frequencies, which cannot
    # be matched against two equal halves of a feature vector.
    if d_model % 2 != 0:
        raise ValueError(f"d_model must be even for RoPE, got {d_model}")

    # Exponents 0/d, 2/d, 4/d, ... — one per rotated pair.
    pair_exponents = np.arange(0, d_model, 2) / d_model
    inv_freq = 1.0 / (base ** pair_exponents)

    positions = np.arange(max_len)
    # angles[p, i] = p * theta_i
    angles = np.outer(positions, inv_freq)
    return np.cos(angles), np.sin(angles)
def apply_rope(x: np.ndarray, cos: np.ndarray, sin: np.ndarray) -> np.ndarray:
    """Apply rotary positional embeddings to ``x``.

    The last axis of ``x`` is split into two halves ``x1`` and ``x2``;
    element ``i`` of ``x1`` is paired with element ``i`` of ``x2`` and each
    pair is rotated by the angle tabulated for its position:

        (x1, x2) -> (x1*cos - x2*sin, x1*sin + x2*cos)

    With tables built by ``precompute_freqs`` the angle is
    ``position * theta_i`` where ``theta_i = 1 / base^(2i/d)``.

    Args:
        x: Array of shape ``(seq_len, d_model)`` or
            ``(batch, seq_len, d_model)``; the second-to-last axis is the
            position axis and the last axis is the feature axis.
        cos: Cosine table with at least ``seq_len`` rows and ``d_model // 2``
            columns.
        sin: Sine table with the same layout as ``cos``.

    Returns:
        Array with the same shape as ``x`` holding the rotated features
        (first half, then second half, along the last axis).

    Raises:
        ValueError: If the feature dimension is odd, if a table has fewer
            rows than ``seq_len``, or if a table's width is not
            ``d_model // 2``.
        IndexError: If ``x`` has fewer than two dimensions.
    """
    d = x.shape[-1]
    if d % 2 != 0:
        raise ValueError(f"last dimension of x must be even for RoPE, got {d}")
    half = d // 2
    seq_len = x.shape[-2]

    for name, table in (("cos", cos), ("sin", sin)):
        # Slicing past the end silently truncates; a one-row table would then
        # broadcast position 0 onto every position, so check explicitly.
        if table.shape[0] < seq_len:
            raise ValueError(
                f"{name} table covers {table.shape[0]} positions but x has "
                f"seq_len={seq_len}"
            )
        if table.shape[-1] != half:
            raise ValueError(
                f"{name} table has width {table.shape[-1]}, expected {half}"
            )

    x1 = x[..., :half]
    x2 = x[..., half:]
    c = cos[:seq_len]
    s = sin[:seq_len]

    # Rotate pairs: (x1, x2) -> (x1*cos - x2*sin, x1*sin + x2*cos)
    out1 = x1 * c - x2 * s
    out2 = x1 * s + x2 * c
    return np.concatenate([out1, out2], axis=-1)