# Problem #130 · Deep Learning · Hard (deep-ml.com)
#
# Implement a simple CNN training function with forward pass and
# backpropagation. The CNN should have a convolutional layer followed by a
# fully connected layer, trained on a small dataset using MSE loss.
import numpy as np
def conv2d_forward(x, kernel, bias, stride=1, pad=0):
    """Compute a 2-D cross-correlation ("convolution") over a batch of images.

    Args:
        x: Input of shape ``(N, C_in, H, W)``.
        kernel: Filters of shape ``(C_out, C_in, kH, kW)``.
        bias: One bias value per output channel, shape ``(C_out,)``.
        stride: Step between adjacent receptive fields (same for both axes).
        pad: Number of zero rows/columns added on every spatial border.

    Returns:
        A float64 array of shape ``(N, C_out, out_h, out_w)`` where
        ``out_h = (H + 2 * pad - kH) // stride + 1`` (likewise ``out_w``).

    Raises:
        ValueError: If the inputs are not 4-D, ``stride`` is not positive,
            ``pad`` is negative, the channel counts of ``x`` and ``kernel``
            disagree, or the kernel is larger than the (padded) input.
    """
    x = np.asarray(x)
    kernel = np.asarray(kernel)
    bias = np.asarray(bias)

    if x.ndim != 4 or kernel.ndim != 4:
        raise ValueError("x and kernel must both be 4-dimensional")
    if stride < 1:
        raise ValueError("stride must be a positive integer")
    if pad < 0:
        raise ValueError("pad must be non-negative")

    if pad > 0:
        x = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode='constant')

    N, C_in, H, W = x.shape
    C_out, C_in_k, kH, kW = kernel.shape

    # Without this check a 1-channel kernel would silently broadcast across
    # every input channel and produce a plausible-looking but wrong result.
    if C_in_k != C_in:
        raise ValueError(
            f"kernel expects {C_in_k} input channels but x has {C_in}"
        )
    if kH > H or kW > W:
        raise ValueError("kernel is larger than the (padded) input")

    out_h = (H - kH) // stride + 1
    out_w = (W - kW) // stride + 1

    out = np.zeros((N, C_out, out_h, out_w))
    out += bias.reshape(1, C_out, 1, 1)

    # Loop over the (few) kernel offsets instead of every output pixel: for a
    # fixed offset (a, b) the contributing inputs form a strided slice of x,
    # so the channel contraction is done for all outputs at once.
    for a in range(kH):
        h_stop = a + stride * (out_h - 1) + 1
        for b in range(kW):
            w_stop = b + stride * (out_w - 1) + 1
            patch = x[:, :, a:h_stop:stride, b:w_stop:stride]
            out += np.einsum('nchw,oc->nohw', patch, kernel[:, :, a, b])
    return out
def relu(x):
    """Return the element-wise rectified linear unit, ``max(0, x)``."""
    return np.maximum(0, x)
def relu_backward(dout, x):
    """Back-propagate a gradient through ReLU.

    Args:
        dout: Upstream gradient with respect to the ReLU output.
        x: The value that was fed *into* ReLU during the forward pass.

    Returns:
        ``dout`` where ``x > 0`` and zero elsewhere (including at ``x == 0``).
    """
    # asarray lets plain sequences be compared element-wise as well.
    return dout * (np.asarray(x) > 0)
def train_cnn(X, y, epochs=100, lr=0.01):
    """Train a tiny CNN (conv -> ReLU -> fully connected) with MSE loss.

    The network has a single 3x3 convolution filter (stride 1, no padding)
    followed by ReLU and one fully connected layer. Training is plain
    full-batch gradient descent. Weights are drawn from ``np.random.randn``
    scaled by 0.1, so seed NumPy's global RNG for reproducible results.

    Args:
        X: Inputs of shape ``(N, C, H, W)`` with ``H >= 3`` and ``W >= 3``.
        y: Targets of shape ``(N,)`` or ``(N, output_size)``.
        epochs: Number of gradient-descent steps.
        lr: Learning rate.

    Returns:
        ``(kernel, bias_conv, W_fc, b_fc, loss)`` where ``loss`` is the MSE
        measured in the last forward pass, i.e. *before* the final parameter
        update. With ``epochs == 0`` it is the loss of the untrained network.

    Raises:
        ValueError: If ``X`` is not 4-D or is spatially smaller than 3x3.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 4:
        raise ValueError("X must have shape (N, C, H, W)")
    N, C, H, W = X.shape

    # Conv layer: 1 filter of size 3x3
    n_filters = 1
    kH, kW = 3, 3
    if H < kH or W < kW:
        raise ValueError("X must be at least 3x3 in its spatial dimensions")

    kernel = np.random.randn(n_filters, C, kH, kW) * 0.1
    bias_conv = np.zeros(n_filters)

    out_h = H - kH + 1
    out_w = W - kW + 1
    fc_input_size = n_filters * out_h * out_w
    output_size = y.shape[1] if y.ndim > 1 else 1

    W_fc = np.random.randn(fc_input_size, output_size) * 0.1
    b_fc = np.zeros(output_size)

    # The targets never change, so reshape them once instead of every epoch.
    target = y.reshape(N, output_size)

    loss = None
    for _ in range(epochs):
        # Forward
        conv_out_raw = conv2d_forward(X, kernel, bias_conv)
        conv_out = relu(conv_out_raw)
        flat = conv_out.reshape(N, -1)
        logits = flat @ W_fc + b_fc

        # MSE loss
        loss = np.mean((logits - target) ** 2)

        # Backward: gradient of the mean over all N * output_size entries.
        d_logits = 2.0 * (logits - target) / (N * output_size)

        # FC backward
        dW_fc = flat.T @ d_logits
        db_fc = np.sum(d_logits, axis=0)
        d_flat = d_logits @ W_fc.T

        # Reshape and ReLU backward
        d_conv_out = d_flat.reshape(conv_out.shape)
        d_conv_raw = relu_backward(d_conv_out, conv_out_raw)

        # Conv backward. For kernel offset (a, b) the inputs that touched the
        # weight are the window X[..., a:a+out_h, b:b+out_w], so each weight's
        # gradient is one contraction over batch and output positions.
        d_bias_conv = np.sum(d_conv_raw, axis=(0, 2, 3))
        d_kernel = np.zeros_like(kernel)
        for a in range(kH):
            for b in range(kW):
                patch = X[:, :, a:a + out_h, b:b + out_w]
                d_kernel[:, :, a, b] = np.einsum(
                    'nohw,nchw->oc', d_conv_raw, patch
                )

        # Update
        kernel -= lr * d_kernel
        bias_conv -= lr * d_bias_conv
        W_fc -= lr * dW_fc
        b_fc -= lr * db_fc

    if loss is None:
        # No training step ran (epochs == 0); previously this left `loss`
        # unbound. Report the loss of the freshly initialised network.
        conv_out = relu(conv2d_forward(X, kernel, bias_conv))
        logits = conv_out.reshape(N, -1) @ W_fc + b_fc
        loss = np.mean((logits - target) ** 2)

    return kernel, bias_conv, W_fc, b_fc, loss