"""Implement the GeLU (Gaussian Error Linear Unit) activation function.

GeLU smoothly gates each input by weighting it with Phi(x), the cumulative
distribution function of the standard normal distribution evaluated at that
input, so that GeLU(x) = x * Phi(x). It is widely used in transformers like
BERT and GPT.
"""
import numpy as np
def gelu(x: np.ndarray) -> np.ndarray:
    """Compute the GeLU activation ``x * Phi(x)`` to double precision.

    ``Phi`` is the standard normal CDF, evaluated as
    ``0.5 * (1 + erf(x / sqrt(2)))`` using the standard library's
    ``math.erf`` rather than a polynomial approximation.

    Args:
        x: Input values. Anything ``np.asarray`` can convert to float64 is
            accepted: an array of any shape (including empty) or a scalar.

    Returns:
        float64 GeLU values with the same shape as ``x``.
    """
    import math  # local import: NumPy itself does not provide erf

    values = np.asarray(x, dtype=np.float64)
    # math.erf only accepts scalars, so it is applied element-wise. Pinning
    # otypes fixes the output dtype up front, which np.vectorize needs in
    # order to accept zero-size inputs instead of raising ValueError.
    erf = np.vectorize(math.erf, otypes=[np.float64])
    return 0.5 * values * (1.0 + erf(values / math.sqrt(2.0)))
def erf_approx(x):
    """Approximate the error function with the Abramowitz and Stegun formula.

    The formula is evaluated on ``|x|`` and the result is mirrored with the
    sign of ``x``. Works on scalars and, element-wise, on NumPy arrays.
    """
    # Polynomial coefficients, highest power of t first.
    leading, *lower = (
        1.061405429,
        -1.453152027,
        1.421413741,
        -0.284496736,
        0.254829592,
    )
    magnitude = abs(x)
    t = 1.0 / (1.0 + 0.3275911 * magnitude)
    # Horner's scheme: fold the coefficients from the innermost term outwards.
    acc = leading
    for coeff in lower:
        acc = coeff + t * acc
    tail = t * acc * np.exp(-magnitude * magnitude)
    return np.sign(x) * (1.0 - tail)
def gelu_approx(x: np.ndarray) -> np.ndarray:
    """Compute the tanh approximation of GeLU.

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))``
    element-wise.
    """
    scale = np.sqrt(2.0 / np.pi)
    inner = x + 0.044715 * x ** 3
    gate = 1.0 + np.tanh(scale * inner)
    return 0.5 * x * gate
def gelu_sigmoid(x: np.ndarray) -> np.ndarray:
    """Compute the sigmoid approximation of GeLU, ``x * sigmoid(1.702 * x)``.

    This is a simpler but less accurate variant than the tanh approximation.

    Args:
        x: Input values (NumPy array or scalar).

    Returns:
        ``x`` scaled element-wise by ``1 / (1 + exp(-1.702 * x))``.
    """
    sigmoid = 1.0 / (1.0 + np.exp(-1.702 * x))
    return x * sigmoid


# Summary of the GeLU variants in this file:
# - gelu: x * Phi(x) where Phi is the standard normal CDF, computed via the
#   error function.
# - gelu_approx: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) --
#   the most common approximation used in practice.
# - gelu_sigmoid: x * sigmoid(1.702 * x) -- a simpler but less accurate
#   variant.