Implement sliding window attention (as used in models like Longformer). Instead of full quadratic attention over all tokens, each token only attends to a fixed-size local window of neighboring tokens, reducing memory from O(n^2) to O(n * w).
import numpy as np
def sliding_window_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray, window_size: int) -> np.ndarray:
# Q, K, V: shape (seq_len, d_k)
seq_len, d_k = Q.shape
output = np.zeros_like(V)
for i in range(seq_len):
# Define window bounds
start = max(0, i - window_size // 2)
end = min(seq_len, i + window_size // 2 + 1)
# Attend only within window
q_i = Q[i] # (d_k,)
K_window = K[start:end] # (w, d_k)
V_window = V[start:end] # (w, d_k)
scores = K_window @ q_i / np.sqrt(d_k) # (w,)
weights = np.exp(scores - np.max(scores))
weights = weights / weights.sum()
output[i] = weights @ V_window
return outputi, define a local window of size w centered on that position.