Implement a KV Cache Tiered Offloading Simulator. Given a list of token access events (each with a token ID and timestamp), a fast-tier (GPU) cache capacity, and an eviction policy, simulate offloading cold KV cache entries from the fast tier to a slow tier (CPU/host memory). Return the number of fast-tier hits, slow-tier hits, and misses.
def kv_cache_tiered_offload(
accesses: list[tuple[int, int]],
fast_capacity: int,
slow_capacity: int
) -> dict[str, int]:
from collections import OrderedDict
fast_cache: OrderedDict[int, int] = OrderedDict()
slow_cache: OrderedDict[int, int] = OrderedDict()
fast_hits = 0
slow_hits = 0
misses = 0
for token_id, timestamp in accesses:
if token_id in fast_cache:
fast_hits += 1
fast_cache.move_to_end(token_id)
fast_cache[token_id] = timestamp
elif token_id in slow_cache:
slow_hits += 1
slow_cache.pop(token_id)
if len(fast_cache) >= fast_capacity:
evicted_key, evicted_ts = fast_cache.popitem(last=False)
if len(slow_cache) >= slow_capacity:
slow_cache.popitem(last=False)
slow_cache[evicted_key] = evicted_ts
fast_cache[token_id] = timestamp
else:
misses += 1
if len(fast_cache) >= fast_capacity:
evicted_key, evicted_ts = fast_cache.popitem(last=False)
if len(slow_cache) >= slow_capacity:
slow_cache.popitem(last=False)
slow_cache[evicted_key] = evicted_ts
fast_cache[token_id] = timestamp
return {
"fast_hits": fast_hits,
"slow_hits": slow_hits,
"misses": misses
}