#440 · Machine Learning · Hard
Solve on deep-ml.com

Build a disaggregated prefill-decode serving simulator. In this architecture, prefill (prompt processing) and decode (token generation) run on separate GPU pools. Given a stream of requests with prompt lengths and output lengths, pool sizes, prefill throughput (tokens/sec per GPU), and decode throughput (tokens/sec per GPU), simulate the system and return per-request latencies and overall throughput.
def disaggregated_serving_sim(
    requests: list[dict],
    prefill_gpus: int,
    decode_gpus: int,
    prefill_tok_per_sec: float,
    decode_tok_per_sec: float,
    kv_transfer_ms: float,
) -> dict:
    """Simulate a serving system with separate prefill and decode GPU pools.

    Requests are handled one at a time in list order. Each request runs its
    prefill on the earliest-free prefill GPU, waits ``kv_transfer_ms`` for
    the KV cache hand-off, and then runs its decode on the earliest-free
    decode GPU. A GPU serves a single request at a time. Stage durations are:

        prefill time = prompt_length / prefill throughput
        decode time  = output_length / decode throughput

    Args:
        requests: One dict per request with the keys ``"arrival_ms"``,
            ``"prompt_length"`` and ``"output_length"``.
        prefill_gpus: Number of GPUs in the prefill pool (at least 1).
        decode_gpus: Number of GPUs in the decode pool (at least 1).
        prefill_tok_per_sec: Prefill throughput of one GPU, in tokens/sec.
        decode_tok_per_sec: Decode throughput of one GPU, in tokens/sec.
        kv_transfer_ms: Delay between the end of prefill and the earliest
            possible start of decode, in milliseconds.

    Returns:
        A dict with two keys:

        * ``"per_request"``: a list, in input order, of dicts holding
          ``"latency_ms"`` (decode end minus arrival), ``"prefill_ms"``,
          ``"decode_ms"`` (each rounded to 2 decimals) and
          ``"kv_transfer_ms"`` (passed through unchanged).
        * ``"throughput_tok_per_sec"``: total output tokens divided by the
          seconds between the earliest arrival and the last GPU finish time,
          rounded to 2 decimals. When that window is not positive (for
          example, no requests) a 1-second window is used instead.

    Raises:
        ValueError: If either pool size is smaller than 1.
    """
    import heapq

    if prefill_gpus < 1 or decode_gpus < 1:
        raise ValueError("prefill_gpus and decode_gpus must both be at least 1")

    # Each pool is a min-heap of "free at" timestamps (ms). Only the times
    # matter for the result, not which GPU holds them, so the heap root is
    # always the GPU that becomes available first. A list of equal values is
    # already a valid heap.
    prefill_free_at = [0.0] * prefill_gpus
    decode_free_at = [0.0] * decode_gpus

    results = []
    total_output_tokens = 0

    for req in requests:
        arrival = req["arrival_ms"]
        prompt_len = req["prompt_length"]
        output_len = req["output_length"]

        # Prefill stage: start once the request has arrived and a GPU is free.
        prefill_time_ms = (prompt_len / prefill_tok_per_sec) * 1000
        prefill_start = max(arrival, prefill_free_at[0])
        prefill_end = prefill_start + prefill_time_ms
        heapq.heapreplace(prefill_free_at, prefill_end)

        # The KV cache must reach the decode pool before decoding can begin.
        kv_ready = prefill_end + kv_transfer_ms

        # Decode stage: start once the KV cache is ready and a GPU is free.
        decode_time_ms = (output_len / decode_tok_per_sec) * 1000
        decode_start = max(kv_ready, decode_free_at[0])
        decode_end = decode_start + decode_time_ms
        heapq.heapreplace(decode_free_at, decode_end)

        results.append({
            "latency_ms": round(decode_end - arrival, 2),
            "prefill_ms": round(prefill_time_ms, 2),
            "decode_ms": round(decode_time_ms, 2),
            "kv_transfer_ms": kv_transfer_ms,
        })
        total_output_tokens += output_len

    last_finish = max(max(decode_free_at), max(prefill_free_at))
    # Measure from the earliest arrival, not from the first list entry, so
    # the window stays correct when requests are not sorted by arrival time.
    first_arrival = min((req["arrival_ms"] for req in requests), default=0)

    if last_finish > first_arrival:
        total_time_sec = (last_finish - first_arrival) / 1000
    else:
        # Degenerate window (e.g. no requests): avoid dividing by zero.
        total_time_sec = 1
    throughput = total_output_tokens / total_time_sec

    return {
        "per_request": results,
        "throughput_tok_per_sec": round(throughput, 2),
    }