"""Evaluate translation quality with the METEOR score.

METEOR stands for Metric for Evaluation of Translation with Explicit ORdering.
Given a candidate translation and one or more reference translations, compute
the METEOR score, which considers precision, recall, and alignment with a
penalty for fragmentation.
"""
from collections import Counter
def _count_chunks(cand_tokens, ref_tokens):
    """Count the chunks of a one-to-one unigram alignment.

    A chunk is a maximal run of candidate tokens that are adjacent in the
    candidate AND map to adjacent, identically ordered reference tokens.

    The alignment is built greedily: walking the candidate left to right, each
    unaligned token is mapped to the unused reference position that starts the
    longest contiguous run of matches (earliest position on ties), and the
    whole run is consumed as one chunk. This is a heuristic; it does not
    guarantee the globally minimal number of chunks.
    """
    # Reference positions for every token, in ascending order.
    positions = {}
    for ref_idx, token in enumerate(ref_tokens):
        positions.setdefault(token, []).append(ref_idx)

    # Each reference token may be aligned to at most one candidate token.
    used = [False] * len(ref_tokens)

    def run_length(cand_idx, ref_idx):
        # Length of the contiguous match starting at the given pair of
        # positions; 0 when the reference position is already taken.
        length = 0
        while (
            cand_idx + length < len(cand_tokens)
            and ref_idx + length < len(ref_tokens)
            and not used[ref_idx + length]
            and cand_tokens[cand_idx + length] == ref_tokens[ref_idx + length]
        ):
            length += 1
        return length

    chunks = 0
    cand_idx = 0
    while cand_idx < len(cand_tokens):
        best_len, best_start = 0, -1
        for ref_idx in positions.get(cand_tokens[cand_idx], ()):
            length = run_length(cand_idx, ref_idx)
            if length > best_len:  # strict '>' keeps the earliest on ties
                best_len, best_start = length, ref_idx
        if best_len == 0:
            # Token has no free counterpart in the reference; it also breaks
            # any chunk, because the next run is counted separately.
            cand_idx += 1
            continue
        for offset in range(best_len):
            used[best_start + offset] = True
        chunks += 1
        cand_idx += best_len
    return chunks


def meteor_score(candidate: str, reference: str, alpha: float = 0.9, beta: float = 3.0, gamma: float = 0.5) -> float:
    """Compute a simplified METEOR score for a candidate against a reference.

    Both strings are lower-cased and split on whitespace; only exact unigram
    matches are considered (no stemming or synonym matching).

    The score is ``F * (1 - penalty)`` where

    * ``F = P * R / (alpha * P + (1 - alpha) * R)`` is the weighted harmonic
      mean of unigram precision ``P`` and recall ``R`` (``alpha`` close to 1
      weights recall more heavily), and
    * ``penalty = gamma * (chunks / matches) ** beta`` is the fragmentation
      penalty. Perfectly ordered translations form few chunks and get a low
      penalty; scrambled translations form many chunks and get a high one.

    Args:
        candidate: The candidate (hypothesis) translation.
        reference: The reference translation.
        alpha: Relative weight of recall versus precision in the F-mean.
        beta: Exponent applied to the fragmentation ratio.
        gamma: Maximum fragmentation penalty (reached when every matched
            token is its own chunk).

    Returns:
        The score rounded to 4 decimal places, never below 0.0. Returns 0.0
        when there are no unigram matches (including empty inputs).
    """
    cand_tokens = candidate.lower().split()
    ref_tokens = reference.lower().split()

    # Clipped unigram matches: a word counts at most as often as it occurs
    # in the reference.
    matches = sum((Counter(cand_tokens) & Counter(ref_tokens)).values())
    if matches == 0:
        # Also covers empty candidate/reference, so the divisions below are safe.
        return 0.0

    precision = matches / len(cand_tokens)
    recall = matches / len(ref_tokens)

    # F-mean with alpha weighting recall more heavily.
    f_score = (precision * recall) / (alpha * precision + (1 - alpha) * recall)

    # Chunks come from an order-aware one-to-one alignment, so chunks <= matches
    # and the fragmentation ratio stays within (0, 1].
    chunks = _count_chunks(cand_tokens, ref_tokens)
    frag = chunks / matches

    penalty = gamma * (frag ** beta)
    score = f_score * (1 - penalty)
    return round(max(0.0, score), 4)