Implement the Sparse Mixture of Experts (MoE) top-k routing mechanism. Given input tokens and a set of expert networks, route each token to its top-k experts according to the gating scores, then combine the selected experts' outputs as a weighted sum whose weights are the softmax of those k gating scores.
def sparse_moe_top_k(
    inputs: list[list[float]],
    gate_weights: list[list[float]],
    expert_weights: list[list[list[float]]],
    k: int,
) -> list[list[float]]:
    """Route each token to its top-k experts and mix their outputs.

    For token ``t`` the row ``gate_weights[t]`` is used directly as the
    per-expert gate scores; the token itself is not projected through a
    gating matrix here. The ``k`` highest-scoring experts are selected (ties
    go to the lower expert index), their scores are softmax-normalised among
    themselves, and the token is passed through every selected expert as the
    matrix-vector product ``expert_weights[e] @ inputs[t]``. The token's
    output is the softmax-weighted sum of those expert outputs.

    Args:
        inputs: One feature vector per token.
        gate_weights: One row per token holding one gate score per expert.
        expert_weights: One weight matrix per expert, stored as a list of
            rows. Each row needs at least ``len(inputs[t])`` entries; the
            number of rows is the expert's output size.
        k: Number of experts to use per token. A value larger than the
            number of experts selects every expert.

    Returns:
        One output vector per token, each value rounded to 4 decimals. The
        vector length equals the output size of the selected experts. An
        empty ``inputs`` yields an empty list.

    Raises:
        ValueError: If ``k`` is not positive, a token has no gate scores, or
            the experts selected for a token disagree on output size.
    """
    import math

    if not inputs:
        return []
    # A negative k would otherwise slice "all but the last |k|" experts and
    # k == 0 would hit max() on an empty list; reject both explicitly.
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    def softmax(vals):
        # Subtract the maximum before exponentiating to avoid overflow.
        peak = max(vals)
        exps = [math.exp(v - peak) for v in vals]
        total = sum(exps)
        return [e / total for e in exps]

    def matvec(mat, vec):
        # Indexing each row by the vector's positions keeps the failure loud
        # (IndexError) when a row is shorter than the token vector.
        return [sum(row[j] * x for j, x in enumerate(vec)) for row in mat]

    outputs = []
    for t, token in enumerate(inputs):
        gate_scores = gate_weights[t]

        # Stable sort on the negated score: highest first, ties keep the
        # lower expert index in front.
        ranked = sorted(enumerate(gate_scores), key=lambda pair: -pair[1])
        selected = ranked[:k]
        if not selected:
            raise ValueError(f"token {t} has no gate scores to route on")

        # Routing weights are normalised over the selected experts only.
        mix = softmax([score for _, score in selected])

        out = None
        for weight, (expert_id, _) in zip(mix, selected):
            expert_out = matvec(expert_weights[expert_id], token)
            if out is None:
                # Size the accumulator from the expert's output, not from the
                # input, so non-square expert matrices are handled.
                out = [0.0] * len(expert_out)
            elif len(expert_out) != len(out):
                raise ValueError(
                    f"expert {expert_id} produced {len(expert_out)} values, "
                    f"expected {len(out)}"
                )
            for d, value in enumerate(expert_out):
                out[d] += weight * value

        outputs.append([round(v, 4) for v in out])
    return outputs