Implement the sinusoidal time embedding used in diffusion model U-Nets. The timestep is encoded into a high-dimensional vector using sinusoidal position encodings (similar to Transformer position embeddings), then projected through an MLP.
import numpy as np
def sinusoidal_time_embedding(timesteps: np.ndarray, embedding_dim: int) -> np.ndarray:
    """Encode diffusion timesteps as sinusoidal feature vectors.

    Each timestep ``t`` is multiplied by ``half_dim = embedding_dim // 2``
    frequencies ``10000 ** (-i / half_dim)`` for ``i = 0 .. half_dim - 1``.
    The sines of all products come first, followed by the cosines.

    Args:
        timesteps: 1-D array, or array-like such as a list, of shape
            ``(batch_size,)``.
        embedding_dim: Width of the returned embedding; must be >= 0. When it
            is odd, one trailing zero column pads the output to this width.

    Returns:
        Array of shape ``(batch_size, embedding_dim)``.

    Raises:
        ValueError: If ``embedding_dim`` is negative.
    """
    if embedding_dim < 0:
        raise ValueError(
            f"embedding_dim must be non-negative, got {embedding_dim}"
        )

    # Plain sequences do not support the ``[:, np.newaxis]`` indexing used
    # below, so normalise to an ndarray first.
    timesteps = np.asarray(timesteps)

    half_dim = embedding_dim // 2
    # Geometric progression of frequencies starting at 1 and decaying
    # towards 1/10000 (the last value itself is excluded).
    freqs = np.exp(-np.log(10000.0) * np.arange(half_dim) / half_dim)

    # Outer product of timesteps and frequencies: (batch_size, half_dim).
    args = timesteps[:, np.newaxis] * freqs[np.newaxis, :]
    embedding = np.concatenate([np.sin(args), np.cos(args)], axis=-1)

    # sin/cos halves only cover an even width; pad odd widths with a zero.
    if embedding_dim % 2 == 1:
        padding = np.zeros((embedding.shape[0], 1), dtype=embedding.dtype)
        embedding = np.concatenate([embedding, padding], axis=-1)
    return embedding
def time_mlp(timesteps: np.ndarray, embedding_dim: int, hidden_dim: int, W1: np.ndarray, b1: np.ndarray, W2: np.ndarray, b2: np.ndarray) -> np.ndarray:
    """Embed timesteps sinusoidally and project them through a two-layer MLP.

    Computes ``SiLU(emb @ W1 + b1) @ W2 + b2`` where ``emb`` is the result of
    ``sinusoidal_time_embedding(timesteps, embedding_dim)``.

    Args:
        timesteps: Timesteps passed straight to ``sinusoidal_time_embedding``.
        embedding_dim: Width of the sinusoidal embedding; ``W1`` must have
            this many rows for ``emb @ W1`` to be valid.
        hidden_dim: Not read by this function; presumably the number of
            columns of ``W1`` (confirm against callers).
        W1: Weight matrix of the first linear layer.
        b1: Bias added after the first matrix product.
        W2: Weight matrix of the second linear layer.
        b2: Bias added after the second matrix product.

    Returns:
        The output of the second linear layer.
    """
    emb = sinusoidal_time_embedding(timesteps, embedding_dim)

    # MLP: Linear -> SiLU -> Linear
    h = emb @ W1 + b1
    # SiLU(h) = h * sigmoid(h). The sigmoid is written as
    # exp(-log(1 + exp(-h))) so that large negative pre-activations do not
    # overflow inside exp() and trigger a RuntimeWarning.
    h = h * np.exp(-np.logaddexp(0.0, -h))
    return h @ W2 + b2