"""Implement the forward pass of a 3D convolution (3D CNN) layer from scratch.

Given a 3D input volume (e.g., a video clip or volumetric data) and a set of
3D convolutional filters, compute the output feature map.
"""
def _zero_pad_volume(
    volume: list[list[list[list[float]]]],
    padding: int,
) -> list[list[list[list[float]]]]:
    """Return a zero-padded copy of a rectangular [C, D, H, W] volume.

    ``padding`` zeros are added on both sides of the D, H and W axes; the
    channel axis is left alone and ``volume`` itself is not modified.
    """
    depth = len(volume[0])
    height = len(volume[0][0])
    width = len(volume[0][0][0])
    padded_d = depth + 2 * padding
    padded_h = height + 2 * padding
    padded_w = width + 2 * padding
    padded = [
        [[[0.0] * padded_w for _ in range(padded_h)] for _ in range(padded_d)]
        for _ in volume
    ]
    for c, channel in enumerate(volume):
        for d, plane in enumerate(channel):
            for h, row in enumerate(plane):
                # Copy the whole row into the interior of the padded row.
                padded[c][d + padding][h + padding][padding:padding + width] = row
    return padded


def conv3d_forward(
    input_vol: list[list[list[list[float]]]],
    filters: list[list[list[list[list[float]]]]],
    biases: list[float],
    stride: int = 1,
    padding: int = 0,
) -> list[list[list[list[float]]]]:
    """Compute the forward pass of a single 3D convolution layer.

    Each filter is slid over the (optionally zero-padded) input volume and,
    at every position, the element-wise products over all input channels
    are summed and added to that filter's bias. The kernel is applied
    without flipping (i.e. cross-correlation).

    Args:
        input_vol: Input volume shaped [C_in, D, H, W].
        filters: Filters shaped [C_out, C_in, Fd, Fh, Fw].
        biases: One bias per filter; ``biases[co]`` is used for filter ``co``.
        stride: Step between neighbouring windows, shared by all three
            spatial axes. Must be >= 1.
        padding: Number of zeros added on both sides of D, H and W.
            Must be >= 0.

    Returns:
        Output shaped [C_out, out_D, out_H, out_W] where each out dimension
        is ``(in_dim - filter_dim) // stride + 1`` (``in_dim`` measured after
        padding). Every value is rounded to 4 decimal places. If a filter
        dimension exceeds the padded input dimension, the corresponding
        output dimension is empty.

    Raises:
        ValueError: If ``stride`` < 1, ``padding`` < 0, the shape of
            ``input_vol`` or ``filters`` cannot be read because a leading
            dimension is empty, a filter's channel count differs from the
            input's, or there are fewer biases than filters.
    """
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")
    if padding < 0:
        raise ValueError(f"padding must be >= 0, got {padding}")

    try:
        in_channels = len(input_vol)
        depth = len(input_vol[0])
        height = len(input_vol[0][0])
        width = len(input_vol[0][0][0])
        k_depth = len(filters[0][0])
        k_height = len(filters[0][0][0])
        k_width = len(filters[0][0][0][0])
    except IndexError as err:
        raise ValueError(
            "input_vol must be a non-empty [C_in, D, H, W] volume and "
            "filters a non-empty [C_out, C_in, Fd, Fh, Fw] list"
        ) from err

    # Extra filter channels would otherwise be dropped silently, and missing
    # ones would only fail deep inside the accumulation loop.
    for index, kernel in enumerate(filters):
        if len(kernel) != in_channels:
            raise ValueError(
                f"filter {index} has {len(kernel)} input channels, "
                f"expected {in_channels}"
            )
    if len(biases) < len(filters):
        raise ValueError(
            f"expected at least {len(filters)} biases, got {len(biases)}"
        )

    # Apply zero padding (on a copy, so the caller's volume is untouched).
    volume = input_vol
    if padding > 0:
        volume = _zero_pad_volume(input_vol, padding)
        depth += 2 * padding
        height += 2 * padding
        width += 2 * padding

    out_d = (depth - k_depth) // stride + 1
    out_h = (height - k_height) // stride + 1
    out_w = (width - k_width) // stride + 1

    output = []
    for kernel, bias in zip(filters, biases):
        out_channel = []
        for od in range(out_d):
            d0 = od * stride
            out_plane = []
            for oh in range(out_h):
                h0 = oh * stride
                out_row = []
                for ow in range(out_w):
                    w0 = ow * stride
                    # Accumulate in channel -> depth -> height -> width
                    # order so floating-point results stay reproducible.
                    acc = bias
                    for ci in range(in_channels):
                        in_channel = volume[ci]
                        k_channel = kernel[ci]
                        for fd in range(k_depth):
                            in_plane = in_channel[d0 + fd]
                            k_plane = k_channel[fd]
                            for fh in range(k_height):
                                in_row = in_plane[h0 + fh]
                                k_row = k_plane[fh]
                                for fw in range(k_width):
                                    acc += in_row[w0 + fw] * k_row[fw]
                    out_row.append(round(acc, 4))
                out_plane.append(out_row)
            out_channel.append(out_plane)
        output.append(out_channel)
    return output