# Estimate the minimum number of GPUs required to deploy a large language
# model given the model's parameter count, bytes per parameter, and the
# available GPU memory (in GB). Account for an overhead factor (e.g., 1.2x)
# for activations, KV cache, and framework buffers.
import math
def estimate_min_gpus(
    num_params_billion: float,
    bytes_per_param: int = 2,
    gpu_memory_gb: float = 80.0,
    overhead_factor: float = 1.2,
) -> int:
    """Estimate the minimum number of GPUs needed to hold a model in memory.

    The weight footprint is ``num_params_billion * 1e9 * bytes_per_param``
    bytes, converted to binary gigabytes (division by ``1024 ** 3``), then
    scaled by ``overhead_factor`` and divided by the per-GPU capacity. The
    result is rounded up to a whole number of GPUs and is never less than 1.

    Args:
        num_params_billion: Model size in billions of parameters.
        bytes_per_param: Storage per parameter in bytes (e.g. 2 for
            16-bit weights, 4 for 32-bit weights).
        gpu_memory_gb: Memory available on a single GPU, compared directly
            against the footprint computed with the ``1024 ** 3`` divisor.
        overhead_factor: Multiplier applied to the weight footprint to
            leave room for non-weight memory use.

    Returns:
        The estimated GPU count, always at least 1.

    Raises:
        ValueError: If ``gpu_memory_gb`` is not strictly positive.
    """
    # A zero capacity would divide by zero, and a negative one would be
    # silently clamped to a single GPU; reject both explicitly.
    if gpu_memory_gb <= 0:
        raise ValueError(
            f"gpu_memory_gb must be positive, got {gpu_memory_gb!r}"
        )

    model_memory_gb = num_params_billion * 1e9 * bytes_per_param / (1024 ** 3)
    total_memory_gb = model_memory_gb * overhead_factor
    num_gpus = math.ceil(total_memory_gb / gpu_memory_gb)
    # Even a tiny (or zero-parameter) model still occupies one device.
    return max(num_gpus, 1)