Implement the BM25 (Best Matching 25) ranking function. Given a corpus of documents and a query, compute the BM25 score for each document with respect to the query. BM25 is a bag-of-words retrieval function that ranks documents based on the query terms appearing in each document.
import math
from collections import Counter
def bm25(corpus: list[list[str]], query: list[str], k1: float = 1.5, b: float = 0.75) -> list[float]:
N = len(corpus)
doc_lens = [len(doc) for doc in corpus]
avgdl = sum(doc_lens) / N
# Document frequency for each term
df = {}
for doc in corpus:
seen = set(doc)
for term in seen:
df[term] = df.get(term, 0) + 1
# Term frequencies per document
doc_tfs = [Counter(doc) for doc in corpus]
scores = []
for i in range(N):
score = 0.0
for term in query:
if term not in df:
continue
tf = doc_tfs[i].get(term, 0)
idf = math.log((N - df[term] + 0.5) / (df[term] + 0.5) + 1)
numerator = tf * (k1 + 1)
denominator = tf + k1 * (1 - b + b * doc_lens[i] / avgdl)
score += idf * numerator / denominator
scores.append(round(score, 4))
return scoreslog((N - df + 0.5) / (df + 0.5) + 1) which gives higher weight to rarer terms.tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl)). Parameter k1 controls term frequency saturation and b controls length normalization.