#439 · Machine Learning · Medium
Solve on deep-ml.com

Simulate expert parallelism token routing in a Mixture-of-Experts (MoE) model. Given the number of tokens, number of experts, top-k routing assignments per token, the expert-to-GPU placement, and inter-GPU bandwidth, compute how many tokens must be communicated between GPUs and the estimated communication time.
def expert_parallel_routing_cost(
    token_assignments: list[list[int]],
    expert_to_gpu: dict[int, int],
    token_to_gpu: list[int],
    token_hidden_bytes: int,
    bandwidth_gbps: float
) -> dict:
    """Estimate the cross-GPU traffic caused by MoE expert-parallel routing.

    Every (token, expert) pair in ``token_assignments`` is classified as
    local when the expert lives on the same GPU as the token, and remote
    otherwise. Each remote pair is charged ``token_hidden_bytes`` bytes,
    so a token routed to two remote experts is counted twice.

    Args:
        token_assignments: For each token index, the expert ids it is
            routed to.
        expert_to_gpu: Maps an expert id to the GPU that hosts it.
        token_to_gpu: For each token index, the GPU the token starts on.
        token_hidden_bytes: Bytes transferred per remote (token, expert)
            pair.
        bandwidth_gbps: Link bandwidth in gigabits per second; converted
            to bytes per second as ``bandwidth_gbps * 1e9 / 8``.

    Returns:
        A dict with keys ``"local_tokens"``, ``"remote_tokens"``,
        ``"total_remote_bytes"`` and ``"comm_time_ms"`` (milliseconds,
        rounded to 4 decimal places).

    Raises:
        IndexError: If ``token_to_gpu`` has fewer entries than
            ``token_assignments``.
        KeyError: If an assigned expert id is missing from
            ``expert_to_gpu``.
        ZeroDivisionError: If ``bandwidth_gbps`` is 0 while there is
            remote traffic to send.
    """
    local_tokens = 0
    remote_tokens = 0
    for t_idx, experts in enumerate(token_assignments):
        src_gpu = token_to_gpu[t_idx]
        for expert_id in experts:
            if expert_to_gpu[expert_id] == src_gpu:
                local_tokens += 1
            else:
                remote_tokens += 1

    total_remote_bytes = remote_tokens * token_hidden_bytes

    if total_remote_bytes == 0:
        # Nothing crosses a link, so the transfer takes no time regardless
        # of the bandwidth value; this also avoids dividing by a zero
        # bandwidth when no communication is needed.
        comm_time_ms = 0.0
    else:
        # Gigabits per second -> bytes per second.
        bandwidth_bytes_per_sec = bandwidth_gbps * 1e9 / 8
        comm_time_ms = (total_remote_bytes / bandwidth_bytes_per_sec) * 1000

    return {
        "local_tokens": local_tokens,
        "remote_tokens": remote_tokens,
        "total_remote_bytes": total_remote_bytes,
        "comm_time_ms": round(comm_time_ms, 4),
    }