Implement a roofline model analysis for GPU operations. Given a list of operations (each with its FLOPs and memory traffic in bytes), along with GPU peak compute (FLOP/s) and peak memory bandwidth (bytes/s), compute the attainable performance for each operation and determine whether each is compute-bound or memory-bound.
def roofline_analysis(
operations: list[dict],
peak_compute: float,
peak_bandwidth: float
) -> list[dict]:
ridge_point = peak_compute / peak_bandwidth
results = []
for op in operations:
flops = op["flops"]
mem_bytes = op["memory_bytes"]
ai = flops / mem_bytes if mem_bytes > 0 else float('inf')
attainable = min(peak_compute, ai * peak_bandwidth)
bound = "compute-bound" if ai >= ridge_point else "memory-bound"
efficiency = attainable / peak_compute * 100
exec_time = flops / attainable if attainable > 0 else float('inf')
results.append({
"name": op.get("name", ""),
"arithmetic_intensity": round(ai, 4),
"attainable_flops": round(attainable, 4),
"bound": bound,
"efficiency": round(efficiency, 2),
"estimated_time_s": round(exec_time, 8)
})
return resultsmin(peak_compute, AI * peak_bandwidth), forming the classic roofline shape.