#406 · Deep Learning · Hard
# Solve on deep-ml.com
# Implement NoPE (No Positional Embedding) with iRoPE attention. This approach uses no explicit positional embedding in most layers and applies RoPE (Rotary Position Embedding) only in selected attention layers; the remaining layers rely on the inherent causal mask alone for position awareness.
import numpy as np
def apply_rope(
    x: np.ndarray,
    positions: np.ndarray,
    base: float = 10000.0,
) -> np.ndarray:
    """Apply Rotary Position Embedding (RoPE) to the last axis of ``x``.

    The last axis is split into a first half and a second half. Feature ``i``
    of the first half is paired with feature ``i`` of the second half, and
    each pair is rotated by the angle ``positions * base ** (-i / (d_k // 2))``.

    Args:
        x: Array whose last axis is the head dimension ``d_k`` and whose
            second-to-last axis is the sequence axis, e.g.
            ``(batch, num_heads, seq_len, d_k)``.
        positions: One position index per sequence element, shape
            ``(seq_len,)``. Any array-like (list, tuple, ndarray) is accepted.
        base: Base of the geometric frequency schedule.

    Returns:
        Array with the same shape as ``x`` holding the rotated features.

    Raises:
        ValueError: If ``d_k`` is odd, since the features cannot be split
            into two equal halves.
    """
    d_k = x.shape[-1]
    if d_k % 2 != 0:
        # Without this check d_k == 1 silently yields a zero-width result and
        # larger odd sizes fail with an opaque broadcasting error.
        raise ValueError(
            f"apply_rope requires an even head dimension, got d_k={d_k}"
        )
    half_d = d_k // 2

    # Accept plain Python sequences as well as ndarrays.
    positions = np.asarray(positions)

    # One rotation frequency per feature pair.
    freqs = 1.0 / (base ** (np.arange(half_d) / half_d))

    # (seq_len, half_d); broadcasts against the trailing axes of x.
    angles = positions[:, np.newaxis] * freqs[np.newaxis, :]
    cos_vals = np.cos(angles)
    sin_vals = np.sin(angles)

    # Rotate each (first-half, second-half) feature pair.
    x1 = x[..., :half_d]
    x2 = x[..., half_d:]
    out1 = x1 * cos_vals - x2 * sin_vals
    out2 = x1 * sin_vals + x2 * cos_vals
    return np.concatenate([out1, out2], axis=-1)
def irope_attention(
    Q: np.ndarray,
    K: np.ndarray,
    V: np.ndarray,
    positions: np.ndarray,
    use_rope: bool = True,
    causal: bool = True,
) -> np.ndarray:
    """Scaled dot-product attention with optional RoPE and causal masking.

    Args:
        Q: Queries, shape ``(batch, num_heads, seq_len, d_k)``.
        K: Keys, shape ``(batch, num_heads, seq_len, d_k)``.
        V: Values, shape ``(batch, num_heads, seq_len, d_v)``.
        positions: Position indices handed to ``apply_rope`` for both ``Q``
            and ``K``; not read when ``use_rope`` is False.
        use_rope: When True, rotate ``Q`` and ``K`` with ``apply_rope`` before
            computing scores. When False, no positional signal is injected
            (NoPE) and only the causal mask distinguishes positions.
        causal: When True, each query may attend only to keys at the same or
            an earlier sequence index.

    Returns:
        Attention output, shape ``(batch, num_heads, seq_len, d_v)``.
    """
    d_k = Q.shape[-1]
    seq_len = Q.shape[2]

    if use_rope:
        Q = apply_rope(Q, positions)
        K = apply_rope(K, positions)

    scores = np.matmul(Q, K.transpose(0, 1, 3, 2)) / np.sqrt(d_k)

    if causal:
        # Strictly-upper-triangular entries are "future" keys. They are
        # masked with -inf rather than a large finite penalty so that no
        # logit magnitude can let a future token win the softmax.
        future = np.triu(np.ones((seq_len, seq_len), dtype=bool), k=1)
        scores = scores + np.where(future, -np.inf, 0.0)

    # Subtract the row maximum before exponentiating for numerical stability.
    # The diagonal is never masked, so every row has a finite maximum.
    scores = scores - np.max(scores, axis=-1, keepdims=True)
    weights = np.exp(scores)
    weights = weights / np.sum(weights, axis=-1, keepdims=True)
    return np.matmul(weights, V)
class iRoPETransformer:
    """Layer dispatcher that enables RoPE only on a chosen set of layers.

    Layers whose index is in ``rope_layers`` run attention with rotary
    position embedding; every other layer runs without any positional
    embedding. All layers use causal attention.
    """

    def __init__(self, num_layers: int, rope_layers: set[int]):
        """Record the layer count and the indices of the RoPE layers.

        Args:
            num_layers: Number of layers; stored as given.
            rope_layers: Indices of the layers that apply RoPE; stored by
                reference, not copied.
        """
        self.rope_layers = rope_layers
        self.num_layers = num_layers

    def forward_layer(self, Q, K, V, positions, layer_idx):
        """Run causal attention for one layer.

        RoPE is applied exactly when ``layer_idx`` is a member of
        ``self.rope_layers``.

        Returns:
            Whatever ``irope_attention`` returns for the given inputs.
        """
        return irope_attention(
            Q,
            K,
            V,
            positions,
            use_rope=(layer_idx in self.rope_layers),
            causal=True,
        )