Implement the gradient calculation step in t-SNE (t-distributed Stochastic Neighbor Embedding). Given high-dimensional pairwise affinities P and low-dimensional embeddings Y, compute the gradient of the KL divergence cost with respect to Y.
import numpy as np
def tsne_gradient(P: np.ndarray, Y: np.ndarray) -> np.ndarray:
n = Y.shape[0]
# Compute pairwise squared distances in low-dim space
sum_Y = np.sum(Y ** 2, axis=1)
D = sum_Y[:, None] + sum_Y[None, :] - 2 * Y @ Y.T
# Compute Student-t kernel (1 degree of freedom)
num = 1.0 / (1.0 + D)
np.fill_diagonal(num, 0.0)
Q = num / num.sum()
Q = np.maximum(Q, 1e-12)
# Compute gradient
PQ_diff = P - Q
grad = np.zeros_like(Y)
for i in range(n):
diff = Y[i] - Y
grad[i] = 4.0 * np.sum((PQ_diff[i, :] * num[i, :])[:, None] * diff, axis=0)
return grad||yi - yj||^2 = ||yi||^2 + ||yj||^2 - 2 yi^T yj.(1 + ||yi - yj||^2)^{-1}, then normalize.4 * sum_j (p_ij - q_ij)(1 + ||yi - yj||^2)^{-1} (yi - yj).