#433 · Machine Learning · Medium
⊣ Solve on deep-ml.com

Analyze how the acceptance rate of speculative decoding varies with sampling temperature. At low temperatures, the draft and target models tend to agree more (both favor high-probability tokens). At high temperatures, the distributions diverge more. Compute the expected acceptance rate given draft and target distributions at various temperatures.
import math
def _softmax(logits: list[float]) -> list[float]:
    """Return the softmax of *logits*, shifting by the max for stability."""
    peak = max(logits)
    exps = [math.exp(value - peak) for value in logits]
    total = sum(exps)
    return [e / total for e in exps]


def _expected_tokens(alpha: float, draft_length: int) -> float:
    """Return 1 + alpha + ... + alpha**draft_length (geometric series)."""
    if alpha < 1.0:
        return (1 - alpha ** (draft_length + 1)) / (1 - alpha)
    # The closed form divides by zero at alpha == 1; the series sums to K + 1.
    return float(draft_length + 1)


def acceptance_rate_vs_temperature(
    base_logits_draft: list[float],
    base_logits_target: list[float],
    temperatures: list[float],
    *,
    draft_length: int = 5,
) -> list[dict]:
    """Compute the speculative-decoding acceptance rate at each temperature.

    For every temperature both logit vectors are divided by the temperature
    and passed through a softmax, giving the draft distribution ``q`` and the
    target distribution ``p``. The expected acceptance rate is
    ``alpha = sum(min(p(x), q(x)))`` over all tokens ``x``.

    Args:
        base_logits_draft: Unscaled logits of the draft model.
        base_logits_target: Unscaled logits of the target model; must have
            the same length as ``base_logits_draft``.
        temperatures: Sampling temperatures to evaluate. A temperature
            ``<= 0`` is treated as greedy decoding: the acceptance rate is
            1.0 when both argmaxes coincide and 0.0 otherwise (ties resolve
            to the lowest index).
        draft_length: Number of draft tokens ``K`` proposed per step.

    Returns:
        One dict per temperature, in input order, with the keys
        ``"temperature"``, ``"acceptance_rate"`` and
        ``"expected_tokens_per_step"``. For positive temperatures the two
        computed values are rounded to 4 decimals.

    Raises:
        ValueError: If the logit lists are empty or differ in length, or if
            ``draft_length`` is negative.

    Notes:
        * ``alpha`` also equals ``1 - TV_distance(p, q)``, i.e.
          ``1 - 0.5 * sum(|p(x) - q(x)|)``.
        * At very high temperatures ``sum(min(p, q))`` approaches 1.0 since
          both distributions are nearly uniform. When the two argmaxes agree
          the rate also approaches 1.0 at very low temperatures, so
          intermediate temperatures can have the lowest acceptance.
        * Expected tokens per step with ``K`` draft tokens is
          ``(1 - alpha^(K+1)) / (1 - alpha)``, which models each draft
          position as accepted with the same probability ``alpha``.
    """
    vocab_size = len(base_logits_draft)
    if vocab_size != len(base_logits_target):
        raise ValueError(
            "base_logits_draft and base_logits_target must have the same length"
        )
    if vocab_size == 0:
        raise ValueError("logit lists must not be empty")
    if draft_length < 0:
        raise ValueError("draft_length must be non-negative")

    # Greedy decoding only depends on the argmaxes, which are temperature
    # independent, so compute them once.
    draft_argmax = max(range(vocab_size), key=base_logits_draft.__getitem__)
    target_argmax = max(range(vocab_size), key=base_logits_target.__getitem__)
    greedy_alpha = 1.0 if draft_argmax == target_argmax else 0.0

    results = []
    for temp in temperatures:
        if temp <= 0:
            # Same key set as the sampling branch so every row has one schema;
            # alpha == 1 yields K + 1 tokens and alpha == 0 yields exactly 1.
            results.append({
                "temperature": temp,
                "acceptance_rate": greedy_alpha,
                "expected_tokens_per_step": _expected_tokens(
                    greedy_alpha, draft_length
                ),
            })
            continue

        p = _softmax([logit / temp for logit in base_logits_target])
        q = _softmax([logit / temp for logit in base_logits_draft])

        # Expected acceptance rate = sum_x min(p(x), q(x)) = 1 - TV(p, q).
        alpha = sum(min(p_x, q_x) for p_x, q_x in zip(p, q))

        results.append({
            "temperature": temp,
            "acceptance_rate": round(alpha, 4),
            "expected_tokens_per_step": round(
                _expected_tokens(alpha, draft_length), 4
            ),
        })
    return results