"""EAGLE-style draft model.

Implement an EAGLE-style draft model that predicts the next token using the
hidden states from the target model rather than the token embeddings. Given
the target model's hidden state at the current position, use a lightweight
projection network (one linear layer + activation) to predict draft token
logits.
"""
import math
def _silu(x: float) -> float:
    """Return SiLU(x) = x * sigmoid(x), saturating the sigmoid for |x| >= 500."""
    # math.exp overflows for large arguments, so far from zero the sigmoid is
    # replaced by its limit (1.0 for positive x, 0.0 otherwise).
    if abs(x) < 500:
        gate = 1.0 / (1.0 + math.exp(-x))
    else:
        gate = 1.0 if x > 0 else 0.0
    return x * gate


def eagle_draft_from_hidden(
    hidden_state: list[float],
    projection_weights: list[list[float]],
    projection_bias: list[float],
    lm_head_weights: list[list[float]],
    temperature: float = 1.0,
) -> dict:
    """Predict a draft token from a target-model hidden state (EAGLE style).

    The hidden state goes through one linear layer followed by SiLU, then
    through the LM head to produce vocabulary logits, which are
    temperature-scaled and turned into probabilities with a softmax.

    Args:
        hidden_state: (d_model,) hidden state from the target model.
        projection_weights: (d_proj, d_model) single-layer projection.
        projection_bias: (d_proj,) bias added before the activation.
        lm_head_weights: (vocab_size, d_proj) matrix mapping to logits.
        temperature: Softmax temperature. Logits are divided by it only when
            it is strictly positive; a zero or negative value leaves the
            logits unscaled (i.e. behaves like 1.0).

    Returns:
        A dict with:
            "logits": temperature-scaled logits, rounded to 4 decimals.
            "probs": softmax probabilities, rounded to 6 decimals.
            "predicted_token": index of the highest-probability token
                (the lowest index wins ties).
            "confidence": probability of the predicted token, rounded to
                6 decimals.

    Raises:
        ValueError: If the vocabulary is empty or the argument shapes are
            inconsistent with each other.
    """
    d_model = len(hidden_state)
    d_proj = len(projection_weights)
    vocab_size = len(lm_head_weights)

    # Validate shapes up front: mismatched rows would otherwise be silently
    # truncated (giving wrong logits) or fail with a bare IndexError.
    if vocab_size == 0:
        raise ValueError("lm_head_weights must contain at least one vocabulary row")
    if len(projection_bias) != d_proj:
        raise ValueError(
            f"projection_bias has length {len(projection_bias)}, expected {d_proj}"
        )
    for i, row in enumerate(projection_weights):
        if len(row) != d_model:
            raise ValueError(
                f"projection_weights[{i}] has length {len(row)}, expected {d_model}"
            )
    for v, row in enumerate(lm_head_weights):
        if len(row) != d_proj:
            raise ValueError(
                f"lm_head_weights[{v}] has length {len(row)}, expected {d_proj}"
            )

    # Linear projection + SiLU activation. Accumulation starts from the bias
    # and proceeds left to right so results are reproducible term by term.
    projected = []
    for row, bias in zip(projection_weights, projection_bias):
        pre_activation = bias
        for weight, hidden in zip(row, hidden_state):
            pre_activation += weight * hidden
        projected.append(_silu(pre_activation))

    # LM head: project the activated features onto the vocabulary.
    logits = []
    for row in lm_head_weights:
        logit = 0.0
        for weight, feature in zip(row, projected):
            logit += weight * feature
        logits.append(logit)

    # Temperature scaling (skipped for non-positive temperatures to avoid a
    # division by zero or a sign flip).
    if temperature > 0:
        logits = [logit / temperature for logit in logits]

    # Softmax, shifted by the max logit so math.exp never overflows.
    max_logit = max(logits)
    exps = [math.exp(logit - max_logit) for logit in logits]
    total = sum(exps)
    probs = [e / total for e in exps]

    top_token = max(range(vocab_size), key=probs.__getitem__)
    return {
        "logits": [round(logit, 4) for logit in logits],
        "probs": [round(p, 6) for p in probs],
        "predicted_token": top_token,
        "confidence": round(probs[top_token], 6),
    }