Implement a cache-aware request routing system across multiple model replicas. Each replica maintains its own KV cache of recently-seen prompt prefixes. Given a list of requests (each with a prompt prefix hash) and replica states, route each request to the replica that already has the best prefix cache match, or to the least-loaded replica if no cache match exists. Return the routing assignments and cache hit ratio.
def cache_aware_routing(
requests: list[dict],
num_replicas: int,
cache_capacity: int
) -> dict:
from collections import OrderedDict
caches: list[OrderedDict] = [OrderedDict() for _ in range(num_replicas)]
loads: list[int] = [0] * num_replicas
assignments: list[int] = []
cache_hits = 0
for req in requests:
prefix_hash = req["prefix_hash"]
best_replica = -1
for r in range(num_replicas):
if prefix_hash in caches[r]:
if best_replica == -1 or loads[r] < loads[best_replica]:
best_replica = r
if best_replica != -1:
cache_hits += 1
caches[best_replica].move_to_end(prefix_hash)
else:
best_replica = min(range(num_replicas), key=lambda r: loads[r])
loads[best_replica] += 1
assignments.append(best_replica)
if prefix_hash not in caches[best_replica]:
if len(caches[best_replica]) >= cache_capacity:
caches[best_replica].popitem(last=False)
caches[best_replica][prefix_hash] = True
hit_ratio = cache_hits / len(requests) if requests else 0.0
return {
"assignments": assignments,
"cache_hits": cache_hits,
"hit_ratio": round(hit_ratio, 4)
}