Compute the prefix cache hit rate for an LLM serving system. Prefix caching stores the KV cache for common prompt prefixes (e.g., system prompts) so they do not need to be recomputed. Given a stream of requests with their prompt token sequences, determine how many prefix tokens can be served from cache versus computed from scratch.
def prefix_cache_hit_rate(
requests: list[list[int]],
cache_capacity: int = 10
) -> dict:
# Trie-based prefix cache
cache = {} # maps prefix tuple -> True
total_tokens = 0
cached_tokens = 0
computed_tokens = 0
# LRU tracking: list of recently used prefixes
lru_order = []
for req in requests:
total_tokens += len(req)
# Find longest cached prefix
longest_match = 0
for length in range(len(req), 0, -1):
prefix = tuple(req[:length])
if prefix in cache:
longest_match = length
# Update LRU
if prefix in lru_order:
lru_order.remove(prefix)
lru_order.append(prefix)
break
cached_tokens += longest_match
computed_tokens += len(req) - longest_match
# Add all prefixes of this request to cache
prefix = tuple(req)
if prefix not in cache:
# Evict if at capacity
while len(cache) >= cache_capacity and lru_order:
evict = lru_order.pop(0)
cache.pop(evict, None)
cache[prefix] = True
lru_order.append(prefix)
hit_rate = cached_tokens / total_tokens if total_tokens > 0 else 0.0
return {
"total_tokens": total_tokens,
"cached_tokens": cached_tokens,
"computed_tokens": computed_tokens,
"hit_rate": round(hit_rate, 4),
"cache_entries": len(cache)
}