Given a GPU spec sheet with peak FLOP/s and peak memory bandwidth (bytes/s), compute the ops:byte ratio (also known as the machine balance or ridge point of the roofline model). This ratio tells you the minimum arithmetic intensity an operation needs to be compute-bound on this hardware.
def gpu_ops_byte_ratio(
    peak_flops: float,
    peak_bandwidth_bytes_per_sec: float,
    dtype: str = "fp16",
) -> dict:
    """Return the ops:byte ratio (roofline ridge point) for a GPU.

    The ratio is ``peak_flops / peak_bandwidth_bytes_per_sec``: the minimum
    arithmetic intensity (FLOPs per byte moved) an operation needs in order
    to be compute-bound rather than memory-bound on this hardware.

    Args:
        peak_flops: Peak throughput in FLOP/s.
        peak_bandwidth_bytes_per_sec: Peak memory bandwidth in bytes/s.
        dtype: Element type name used to convert the per-byte ratio into a
            per-element ratio. Matching is case-insensitive and ignores
            surrounding whitespace. Unrecognised names fall back to
            2 bytes per element.

    Returns:
        A dict with keys ``ops_byte_ratio`` (rounded to 2 decimals),
        ``dtype`` (echoed exactly as passed), ``bytes_per_element``,
        ``ops_per_element`` (rounded to 2 decimals) and a human-readable
        ``interpretation`` string.

    Raises:
        ZeroDivisionError: If ``peak_bandwidth_bytes_per_sec`` is zero.
    """
    ops_byte_ratio = peak_flops / peak_bandwidth_bytes_per_sec

    # Storage size in bytes per element. tf32 occupies a full 32-bit
    # container in memory even though its mantissa is reduced.
    dtype_bytes = {
        "fp64": 8,
        "fp32": 4,
        "tf32": 4,
        "fp16": 2,
        "bf16": 2,
        "fp8": 1,
        "int8": 1,
        "int4": 0.5,
    }

    # Normalise so that "FP32" or " fp32 " are not silently treated as the
    # 2-byte fallback. Non-string values are looked up unchanged so they
    # still hit the fallback exactly as before.
    lookup_key = dtype.strip().lower() if isinstance(dtype, str) else dtype
    bpe = dtype_bytes.get(lookup_key, 2)

    # Ops:element ratio = ops:byte * bytes_per_element
    ops_per_element = ops_byte_ratio * bpe

    return {
        "ops_byte_ratio": round(ops_byte_ratio, 2),
        "dtype": dtype,
        "bytes_per_element": bpe,
        "ops_per_element": round(ops_per_element, 2),
        "interpretation": (
            f"An operation must perform at least {round(ops_byte_ratio, 1)} FLOPs "
            f"per byte transferred to be compute-bound. "
            f"For {dtype} ({bpe}B per element), that is {round(ops_per_element, 1)} "
            f"FLOPs per element."
        ),
    }