# Implement a self-attention mechanism from scratch. Given query, key, and value matrices, compute scaled dot-product attention: Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d_k)) @ V.
import numpy as np
def self_attention(Q, K, V):
    """Compute scaled dot-product attention and return it as nested lists.

    Evaluates ``softmax(Q @ K^T / sqrt(d_k)) @ V``, where ``d_k`` is the size
    of the last axis of ``Q``. The scores are scaled by ``1 / sqrt(d_k)`` to
    prevent large dot products that push softmax into saturated regions.

    Args:
        Q: Queries, array-like of shape ``(..., n_q, d_k)``.
        K: Keys, array-like of shape ``(..., n_k, d_k)``.
        V: Values, array-like of shape ``(..., n_k, d_v)``.

    Returns:
        The attention output of shape ``(..., n_q, d_v)``, converted to
        nested Python lists with ``ndarray.tolist()``.

    Raises:
        ValueError: If ``K`` has fewer than two dimensions, or (from NumPy)
            if the operand shapes are incompatible for matrix multiplication.
    """
    Q = np.asarray(Q, dtype=np.float64)
    K = np.asarray(K, dtype=np.float64)
    V = np.asarray(V, dtype=np.float64)

    if K.ndim < 2:
        raise ValueError("K must have at least 2 dimensions (n_k, d_k)")

    d_k = Q.shape[-1]

    # Transpose only the last two axes: ``K.T`` would reverse *every* axis
    # and break batched / multi-head inputs, while for 2-D input the two
    # are identical.
    scores = Q @ np.swapaxes(K, -1, -2) / np.sqrt(d_k)

    # Softmax along the key axis. Subtracting the row-wise maximum leaves the
    # result unchanged mathematically but keeps np.exp from overflowing.
    exp_scores = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    # Each output row is the attention-weighted sum of the value rows.
    output = attention_weights @ V
    return output.tolist()