#258 · Machine Learning · Medium
⊣ Solve on deep-ml.comCompute the Calinski-Harabasz Index (Variance Ratio Criterion) for evaluating clustering quality. Higher values indicate better-defined clusters.
Compute the ratio of between-cluster dispersion to within-cluster dispersion, adjusted by degrees of freedom.
import math
def calinski_harabasz_index(X: list[list[float]], labels: list[int]) -> float:
n = len(X)
dim = len(X[0])
unique_labels = sorted(set(labels))
k = len(unique_labels)
if k <= 1 or k >= n:
return 0.0
# Global centroid
global_centroid = [sum(X[i][d] for i in range(n)) / n for d in range(dim)]
# Cluster centroids and members
cluster_members: dict[int, list[int]] = {}
for i, lab in enumerate(labels):
cluster_members.setdefault(lab, []).append(i)
centroids: dict[int, list[float]] = {}
for lab in unique_labels:
members = cluster_members[lab]
n_m = len(members)
centroids[lab] = [sum(X[j][d] for j in members) / n_m for d in range(dim)]
# Between-cluster dispersion (BGS)
bgs = 0.0
for lab in unique_labels:
n_m = len(cluster_members[lab])
dist_sq = sum((centroids[lab][d] - global_centroid[d]) ** 2 for d in range(dim))
bgs += n_m * dist_sq
# Within-cluster dispersion (WGS)
wgs = 0.0
for lab in unique_labels:
c = centroids[lab]
for j in cluster_members[lab]:
wgs += sum((X[j][d] - c[d]) ** 2 for d in range(dim))
if wgs == 0:
return 0.0
ch_index = (bgs / (k - 1)) / (wgs / (n - k))
return round(ch_index, 4)(BGS / (k-1)) / (WGS / (n-k)). Higher values indicate denser, better-separated clusters.