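"""Gated attention.

Implement Gated Attention — an attention mechanism that uses a learned gate to modulate the attention output. The gate controls how much of the attended information flows through.

Compute standard scaled dot-product attention (softmax of Q * K^T / sqrt(d)) to obtain the context vector, then apply a sigmoid gate to modulate the output. The gate is a single scalar, sigmoid(gate_weights . query + gate_bias), that multiplies every component of the context.
"""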
import math
def softmax(values: list[float]) -> list[float]:
    """Return the softmax of *values* as a list of probabilities.

    The maximum element is subtracted from every score before
    exponentiation, so ``math.exp`` only ever receives arguments <= 0 and
    cannot overflow. Softmax is invariant to a constant shift, so the
    result is unaffected.

    Args:
        values: Raw scores (logits).

    Returns:
        A list the same length as *values* whose entries sum to 1 (up to
        floating-point rounding), or an empty list when *values* is empty.
    """
    if not values:
        # max() raises ValueError on an empty sequence; the softmax of no
        # scores is simply no probabilities.
        return []
    peak = max(values)
    exps = [math.exp(v - peak) for v in values]
    total = sum(exps)
    return [e / total for e in exps]
def gated_attention(
    query: list[float],
    keys: list[list[float]],
    values: list[list[float]],
    gate_weights: list[float],
    gate_bias: float = 0.0,
) -> dict:
    """Compute scaled dot-product attention for one query, scaled by a gate.

    Attention weights are Q * K^T / sqrt(d), then softmax, where d is the
    length of ``query``. The context vector is the attention-weighted sum
    of the rows of ``values``. A scalar gate,
    ``sigmoid(gate_weights . query + gate_bias)``, then multiplies every
    component of the context to give the output.

    Args:
        query: The query vector.
        keys: One key vector per attended position; only the first
            ``len(query)`` components of each key are read.
        values: One value vector per key. The output width is the length
            of ``values[0]``.
        gate_weights: Weights dotted with ``query`` to form the gate input.
        gate_bias: Bias added to the gate input before the sigmoid.

    Returns:
        A dict with keys ``"attention_weights"``, ``"context"``, ``"gate"``
        and ``"output"``; every number is rounded to 6 decimal places.

    Raises:
        ValueError: If ``keys`` is empty.
        ZeroDivisionError: If ``query`` is empty (the scale factor is 0).
        IndexError: If a key or ``gate_weights`` is shorter than ``query``,
            ``values`` has fewer rows than ``keys``, or a value row is
            shorter than ``values[0]``.
    """
    if not keys:
        raise ValueError("keys must contain at least one entry")

    dim = len(query)
    scale = math.sqrt(dim)

    # Scaled dot-product score of the query against each key.
    scores = [
        sum(query[d] * key[d] for d in range(dim)) / scale
        for key in keys
    ]
    attn_weights = softmax(scores)

    # Context vector: attention-weighted sum of the value rows.
    v_dim = len(values[0])
    context = [0.0] * v_dim
    for i, weight in enumerate(attn_weights):
        row = values[i]
        for d in range(v_dim):
            context[d] += weight * row[d]

    # Gate: sigmoid(W_g . query + b_g).
    gate_input = sum(gate_weights[d] * query[d] for d in range(dim)) + gate_bias
    # Branch on the sign so math.exp only ever sees a non-positive argument;
    # the naive 1 / (1 + exp(-x)) raises OverflowError for x below about -709.
    if gate_input >= 0:
        gate = 1.0 / (1.0 + math.exp(-gate_input))
    else:
        exp_input = math.exp(gate_input)
        gate = exp_input / (1.0 + exp_input)

    # The same scalar gate scales every component of the context.
    gated_output = [gate * c for c in context]

    return {
        "attention_weights": [round(w, 6) for w in attn_weights],
        "context": [round(c, 6) for c in context],
        "gate": round(gate, 6),
        "output": [round(o, 6) for o in gated_output],
    }