Break down the cold start latency budget for an ML inference service. Given component latencies (container pull, model download, model load into GPU memory, warmup inference, health check registration), compute the total cold start time and identify which components dominate the budget. Support analyzing the impact of optimizations like pre-pulled images or cached model weights.
def cold_start_budget(
components: list[dict],
optimizations: list[dict] | None = None
) -> dict:
opt_map = {}
if optimizations:
for opt in optimizations:
opt_map[opt["component"]] = opt["reduction_ms"]
breakdown = []
total_ms = 0.0
for comp in components:
name = comp["name"]
base_ms = comp["latency_ms"]
reduction = opt_map.get(name, 0)
effective_ms = max(0, base_ms - reduction)
total_ms += effective_ms
breakdown.append({
"name": name,
"base_ms": base_ms,
"reduction_ms": reduction,
"effective_ms": effective_ms
})
for item in breakdown:
item["fraction"] = round(item["effective_ms"] / total_ms, 4) if total_ms > 0 else 0.0
breakdown.sort(key=lambda x: x["effective_ms"], reverse=True)
bottleneck = breakdown[0]["name"] if breakdown else "none"
return {
"breakdown": breakdown,
"total_cold_start_ms": round(total_ms, 2),
"total_cold_start_sec": round(total_ms / 1000, 2),
"bottleneck": bottleneck
}max(0, base - reduction), then sum for the total cold start time.