Implement a simplified GPT-2 text generation function. Given a vocabulary, initial token context, and model weights (embedding, attention, and feedforward layers), generate text token by token using autoregressive decoding with softmax sampling.
import numpy as np
def softmax(x):
    """Return the softmax of ``x``, normalised over *all* of its elements.

    The largest entry is subtracted before exponentiating so that big
    inputs cannot overflow ``exp``; the shift cancels in the ratio and
    does not change the result.
    """
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    total = np.sum(exps)
    return exps / total
def layer_norm(x, eps=1e-5):
    """Standardise ``x`` along its last axis.

    Every slice along the last axis is shifted by its mean and divided by
    its population standard deviation plus ``eps`` (``eps`` is added to the
    standard deviation itself, which keeps the division finite for constant
    slices).  No gain or bias is applied here; callers scale and shift the
    result themselves.
    """
    centre = np.mean(x, axis=-1, keepdims=True)
    spread = np.std(x, axis=-1, keepdims=True)
    centred = x - centre
    return centred / (spread + eps)
def self_attention(Q, K, V, mask=None):
    """Scaled dot-product attention.

    Args:
        Q: Query array of shape ``(..., n_q, d_k)``.
        K: Key array of shape ``(..., n_k, d_k)``.
        V: Value array of shape ``(..., n_k, d_v)``.
        mask: Optional additive mask that broadcasts against the
            ``(..., n_q, n_k)`` score matrix.  Large negative entries
            (e.g. ``-1e10``) effectively remove a key from a query's view.

    Returns:
        Array of shape ``(..., n_q, d_v)``: for each query, the
        softmax-weighted average of the value rows.
    """
    d_k = Q.shape[-1]
    # Swap only the last two axes: ``K.T`` would reverse *every* axis and
    # silently produce wrong scores for batched / multi-head inputs.
    scores = Q @ np.swapaxes(K, -1, -2) / np.sqrt(d_k)
    if mask is not None:
        scores = scores + mask
    # Softmax over the key axis, done in one vectorised pass.  Subtracting
    # the per-query maximum keeps ``exp`` from overflowing and leaves the
    # normalised weights unchanged.
    scores = scores - np.max(scores, axis=-1, keepdims=True)
    weights = np.exp(scores)
    weights = weights / np.sum(weights, axis=-1, keepdims=True)
    return weights @ V
def gpt2_generate(token_ids, wte, wpe, layers, ln_f_g, ln_f_b,
                  n_tokens=10, temperature=1.0):
    """Autoregressively extend ``token_ids`` with a simplified GPT-2 model.

    Each step runs the current context through every transformer layer
    (pre-layer-norm, single-head causal self-attention, ReLU feed-forward,
    residual connections), applies the final layer norm, projects the last
    position onto the token embedding matrix to get logits, and picks the
    next token.

    Args:
        token_ids: Non-empty iterable of starting token ids.
        wte: Token embedding matrix, indexed by token id; also reused
            (transposed) as the output projection.
        wpe: Positional embedding table, indexed by position.  Its length
            is the maximum context the model can see.
        layers: Iterable of dicts, one per transformer layer, with keys
            ``'attn_weights'`` (``Wq, Wk, Wv, Wo``), ``'attn_biases'``
            (``bq, bk, bv, bo``), ``'ffn_weights'`` (``W1, b1, W2, b2``),
            ``'ln1'`` and ``'ln2'`` (each a ``(gain, bias)`` pair).
        ln_f_g: Gain of the final layer norm.
        ln_f_b: Bias of the final layer norm.
        n_tokens: Number of tokens to generate.
        temperature: Divides the logits before sampling.  ``0`` selects the
            highest-scoring token deterministically (greedy decoding)
            instead of dividing by zero.

    Returns:
        A list holding the original token ids followed by the ``n_tokens``
        generated ids (as Python ints).
    """
    generated = list(token_ids)
    n_ctx = len(wpe)
    for _ in range(n_tokens):
        # Only the most recent ``n_ctx`` tokens have a positional embedding;
        # without this crop the sum below fails to broadcast as soon as the
        # sequence outgrows ``wpe``.
        context = generated[-n_ctx:]
        seq_len = len(context)

        # Token + positional embeddings
        x = wte[context] + wpe[:seq_len]

        # Causal mask: a large negative value above the diagonal keeps each
        # position from attending to later positions.
        mask = np.triu(np.full((seq_len, seq_len), -1e10), k=1)

        # Transformer layers
        for layer in layers:
            Wq, Wk, Wv, Wo = layer['attn_weights']
            bq, bk, bv, bo = layer['attn_biases']
            W1, b1, W2, b2 = layer['ffn_weights']
            ln1_g, ln1_b = layer['ln1']
            ln2_g, ln2_b = layer['ln2']

            # Layer norm + self attention + residual
            h = layer_norm(x) * ln1_g + ln1_b
            Q = h @ Wq + bq
            K = h @ Wk + bk
            V = h @ Wv + bv
            attn_out = self_attention(Q, K, V, mask)
            x = x + (attn_out @ Wo + bo)

            # Layer norm + FFN + residual.  ReLU stands in for GPT-2's GELU
            # activation here as a deliberate simplification.
            h = layer_norm(x) * ln2_g + ln2_b
            ffn_out = np.maximum(0, h @ W1 + b1)
            ffn_out = ffn_out @ W2 + b2
            x = x + ffn_out

        # Final layer norm
        x = layer_norm(x) * ln_f_g + ln_f_b

        # Logits for the last position, using the tied embedding matrix
        logits = x[-1] @ wte.T

        if temperature == 0:
            # Greedy decoding: dividing by zero would turn the logits into
            # inf/NaN and make sampling fail.
            next_token = np.argmax(logits)
        else:
            probs = softmax(logits / temperature)
            next_token = np.random.choice(len(probs), p=probs)
        generated.append(int(next_token))
    return generated