Implement K-Means++ initialization, which selects initial cluster centers in a way that spreads them out, leading to faster convergence and better clustering results than random initialization.
import numpy as np
def kmeans_plus_plus(X: np.ndarray, k: int) -> np.ndarray:
n, d = X.shape
centers = np.zeros((k, d))
# Choose first center uniformly at random
idx = np.random.randint(0, n)
centers[0] = X[idx]
for c in range(1, k):
# Compute squared distances from each point to nearest existing center
dists = np.min(
np.sum((X[:, None] - centers[:c][None, :]) ** 2, axis=2),
axis=1
)
# Choose next center with probability proportional to distance squared
probs = dists / dists.sum()
idx = np.random.choice(n, p=probs)
centers[c] = X[idx]
return centers