#458 · Deep Learning · Hard
Solve on deep-ml.com

Implement a Sigmoid Mixture-of-Experts (MoE) router with bias correction. Unlike a softmax router, which produces a probability distribution over experts, a sigmoid router scores each expert independently with a sigmoid and then normalizes the selected scores. Include a learned bias term per expert to correct for load imbalance.
import math
def sigmoid(x: float) -> float:
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of ``x``.

    The input is clamped to ``[-500, 500]`` before exponentiation so that
    ``math.exp`` is never called with a magnitude large enough to overflow.
    NaN is returned unchanged instead of being clamped.

    Args:
        x: The input logit.

    Returns:
        The sigmoid of the clamped input, or NaN if ``x`` is NaN.
    """
    # min()/max() treat NaN comparisons as False, which would silently map a
    # NaN logit to 500.0 and hence to a score of 1.0; propagate it instead.
    if math.isnan(x):
        return x
    x = max(-500.0, min(500.0, x))
    return 1.0 / (1.0 + math.exp(-x))
def sigmoid_moe_router(
    token_embeddings: list[list[float]],
    expert_weights: list[list[float]],
    expert_bias: list[float],
    top_k: int
) -> tuple[list[list[int]], list[list[float]]]:
    """Route each token to the top_k experts with the highest sigmoid scores.

    For every token, each expert's logit is the dot product of the token
    embedding with that expert's weight vector plus the expert's bias. The
    logit is passed through ``sigmoid`` independently per expert, the
    ``top_k`` highest-scoring experts are kept, and their scores are
    divided by their sum so the kept scores add up to 1.

    Args:
        token_embeddings: One embedding vector per token.
        expert_weights: One weight vector per expert, each the same length
            as a token embedding.
        expert_bias: One bias value per expert, added to the logit before
            the sigmoid.
        top_k: Number of experts to keep per token. If it exceeds the number
            of experts, all experts are returned.

    Returns:
        A pair ``(indices, scores)``. ``indices[b]`` lists the selected
        expert indices for token ``b`` in descending score order (ties keep
        the lower expert index first); ``scores[b]`` holds the matching
        normalized scores.

    Raises:
        ValueError: If ``top_k`` is negative, if ``expert_bias`` and
            ``expert_weights`` differ in length, or if a token embedding and
            an expert weight vector differ in length.
    """
    # A negative top_k would silently slice from the end of the ranking.
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")
    if len(expert_bias) != len(expert_weights):
        raise ValueError(
            f"expected one bias per expert: got {len(expert_bias)} biases "
            f"for {len(expert_weights)} experts"
        )

    all_indices: list[list[int]] = []
    all_scores: list[list[float]] = []
    for token in token_embeddings:
        scored = []
        for expert, (weights, bias) in enumerate(zip(expert_weights, expert_bias)):
            if len(weights) != len(token):
                raise ValueError(
                    f"expert {expert} has {len(weights)} weights but the "
                    f"token embedding has {len(token)} dimensions"
                )
            logit = bias + sum(t * w for t, w in zip(token, weights))
            scored.append((sigmoid(logit), expert))

        # list.sort is stable even with reverse=True, so experts with equal
        # scores stay in ascending index order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        top = scored[:top_k]

        # Fall back to 1.0 so an empty selection (top_k == 0) cannot divide
        # by zero.
        total = sum(score for score, _ in top) or 1.0
        all_indices.append([expert for _, expert in top])
        all_scores.append([score / total for score, _ in top])
    return all_indices, all_scores