Implement the Elo rating system for comparing ML models (or players). Given a sequence of pairwise comparison outcomes, update the ratings of each model according to the Elo formula.
def expected_score(rating_a: float, rating_b: float) -> float:
    """Return the expected score of player A when facing player B.

    Uses the Elo logistic curve with base 10 and a 400-point scale, so the
    result lies between 0 and 1 and equals 0.5 when both ratings are equal.

    Args:
        rating_a: Current rating of player A.
        rating_b: Current rating of player B.

    Returns:
        The expected score of A (1.0 = certain win, 0.0 = certain loss).
    """
    # A positive exponent means B is rated higher, which shrinks A's share.
    exponent = (rating_b - rating_a) / 400
    denominator = 1.0 + 10 ** exponent
    return 1.0 / denominator
def update_elo(rating_a: float, rating_b: float, score_a: float,
               k: float = 32.0) -> tuple:
    """Apply one Elo update for a single game between A and B.

    Args:
        rating_a: Rating of player A before the game.
        rating_b: Rating of player B before the game.
        score_a: Actual result from A's perspective: 1.0 if A wins,
            0.0 if B wins, 0.5 for a draw.
        k: K-factor scaling how far a single result moves each rating.

    Returns:
        A ``(rating_a, rating_b)`` tuple holding the updated ratings.
    """
    expected_a = expected_score(rating_a, rating_b)
    # B's expected and actual scores are the complements of A's.
    expected_b = 1.0 - expected_a
    score_b = 1.0 - score_a

    # Each rating moves by k times (actual score - expected score).
    return (
        rating_a + k * (score_a - expected_a),
        rating_b + k * (score_b - expected_b),
    )
def elo_rating_system(models: list[str], matchups: list[dict],
                      initial_rating: float = 1500.0,
                      k: float = 32.0) -> dict:
    """Replay a sequence of pairwise results and return final Elo ratings.

    Args:
        models: Names of all participants; each starts at ``initial_rating``.
        matchups: Results in chronological order, each a dict of the form
            ``{"model_a": str, "model_b": str, "score_a": float}``.
        initial_rating: Rating assigned to every model before any game.
        k: K-factor passed through to ``update_elo`` for every game.

    Returns:
        A dict mapping each model name to its final rating.

    Raises:
        KeyError: If a matchup lacks one of the expected keys or names a
            model that is not listed in ``models``.
    """
    table = dict.fromkeys(models, initial_rating)

    for game in matchups:
        first, second, outcome = (
            game["model_a"],
            game["model_b"],
            game["score_a"],
        )
        updated_first, updated_second = update_elo(
            table[first], table[second], outcome, k
        )
        # Ratings are stored rounded to two decimals after every game, so
        # later games are computed from the rounded values.
        table[first], table[second] = (
            round(updated_first, 2),
            round(updated_second, 2),
        )

    return table