"""Implement masked self-attention (causal attention).

Given query, key, and value matrices, compute scaled dot-product attention
with a causal mask that prevents attending to future positions. This is the
core component of decoder-only Transformers like GPT.
"""
import numpy as np
def masked_self_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray) -> np.ndarray:
    """Compute causal (masked) scaled dot-product self-attention.

    Query position ``i`` may attend only to key positions ``j <= i``; all
    later ("future") positions receive zero attention weight.

    Args:
        Q: Queries of shape ``(..., seq_len, d_k)``.
        K: Keys of shape ``(..., seq_len, d_k)``.
        V: Values of shape ``(..., seq_len, d_v)``.
            Any leading batch/head dimensions are broadcast by ``@``.

    Returns:
        Attention output of shape ``(..., seq_len, d_v)``.

    Raises:
        ValueError: If ``Q`` or ``K`` has fewer than two dimensions, or if
            the query and key sequence lengths differ.
    """
    if Q.ndim < 2 or K.ndim < 2:
        raise ValueError(
            "Q and K must have at least 2 dimensions (..., seq_len, d_k); "
            f"got Q.ndim={Q.ndim}, K.ndim={K.ndim}"
        )

    d_k = Q.shape[-1]

    # Q K^T / sqrt(d_k) to get raw attention scores. Scaling by sqrt(d_k)
    # prevents softmax saturation. Only the last two axes of K are swapped
    # (K.T would reverse every axis and break batched inputs).
    scores = Q @ np.swapaxes(K, -1, -2) / np.sqrt(d_k)

    q_len, k_len = scores.shape[-2:]
    if q_len != k_len:
        raise ValueError(
            "self-attention requires equal query and key lengths; "
            f"got {q_len} and {k_len}"
        )

    # Causal mask: True strictly above the diagonal marks future positions.
    # The 2-D mask broadcasts over any leading batch/head dimensions.
    future = np.triu(np.ones((q_len, k_len), dtype=bool), k=1)
    # -inf (rather than a large finite number) is exact in every float dtype.
    # The diagonal is never masked, so each row keeps a finite maximum and
    # the softmax below cannot produce NaN.
    scores = np.where(future, -np.inf, scores)

    # Numerically stable softmax over the key axis.
    scores_shifted = scores - np.max(scores, axis=-1, keepdims=True)
    exp_scores = np.exp(scores_shifted)
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    # Weighted sum of values.
    output = attention_weights @ V
    return output