#422 · Inference · Medium
Solve on deep-ml.com

Calculate the communication overhead when transferring data between GPUs using different interconnects (NVLink vs InfiniBand). Given the data size to transfer, the bandwidth of each link, and the communication pattern (all-reduce, all-gather, etc.), compute the time required.
def multi_gpu_comm_overhead(
    data_size_bytes: float,
    num_gpus: int,
    nvlink_bw_gbps: float = 900.0,
    ib_bw_gbps: float = 400.0,
    collective: str = "all_reduce",
) -> dict:
    """Estimate collective-communication time over NVLink and InfiniBand.

    Args:
        data_size_bytes: Size of the data being communicated, in bytes.
        num_gpus: Number of GPUs taking part in the collective (>= 1).
        nvlink_bw_gbps: NVLink bandwidth in gigabits per second (> 0).
        ib_bw_gbps: InfiniBand bandwidth in gigabits per second (> 0).
        collective: One of "all_reduce", "all_gather", "reduce_scatter" or
            "broadcast". Any other value is treated like "broadcast", i.e.
            the full data size is transferred once.

    Returns:
        A dict with the keys:
            "effective_data_bytes": bytes moved per GPU, rounded to 2 places.
            "nvlink_time_ms": transfer time over NVLink in milliseconds.
            "ib_time_ms": transfer time over InfiniBand in milliseconds.
            "nvlink_speedup": ib_time / nvlink_time, or 0 when the NVLink
                time is not positive (e.g. nothing has to be transferred).

    Raises:
        ValueError: If ``num_gpus`` is less than 1 or a bandwidth is not
            strictly positive.
    """
    # Reject inputs that would otherwise divide by zero or silently yield
    # meaningless (negative or inflated) results.
    if num_gpus < 1:
        raise ValueError(f"num_gpus must be >= 1, got {num_gpus}")
    if nvlink_bw_gbps <= 0 or ib_bw_gbps <= 0:
        raise ValueError(
            "bandwidths must be positive, got "
            f"nvlink_bw_gbps={nvlink_bw_gbps}, ib_bw_gbps={ib_bw_gbps}"
        )

    nvlink_bw_bytes = nvlink_bw_gbps * 1e9 / 8  # bits to bytes
    ib_bw_bytes = ib_bw_gbps * 1e9 / 8

    if collective == "all_reduce":
        # Ring all-reduce: each GPU sends (n-1)/n of the data twice (once for
        # reduce-scatter, once for all-gather), so the effective data volume
        # is 2 * (n-1)/n * data_size.
        effective_data = data_size_bytes * 2 * (num_gpus - 1) / num_gpus
    elif collective in ("all_gather", "reduce_scatter"):
        # A single ring pass: each GPU sends (n-1)/n of the data once.
        effective_data = data_size_bytes * (num_gpus - 1) / num_gpus
    else:
        # "broadcast" and any unrecognised pattern: the whole buffer is sent.
        effective_data = data_size_bytes

    # bytes / (bytes per second) gives seconds; scale to milliseconds.
    nvlink_time_ms = (effective_data / nvlink_bw_bytes) * 1000
    ib_time_ms = (effective_data / ib_bw_bytes) * 1000
    speedup = ib_time_ms / nvlink_time_ms if nvlink_time_ms > 0 else 0

    return {
        "effective_data_bytes": round(effective_data, 2),
        "nvlink_time_ms": round(nvlink_time_ms, 4),
        "ib_time_ms": round(ib_time_ms, 4),
        "nvlink_speedup": round(speedup, 2),
    }