Implement the ROUGE score (Recall-Oriented Understudy for Gisting Evaluation) to evaluate the quality of a generated summary against a reference summary. Compute ROUGE-1 (unigram) and ROUGE-2 (bigram) precision, recall, and F1 scores.
from typing import Dict
def rouge_score(reference: str, candidate: str) -> Dict[str, Dict[str, float]]:
def get_ngrams(text: str, n: int):
tokens = text.lower().split()
return [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
def compute_rouge_n(ref: str, cand: str, n: int):
ref_ngrams = get_ngrams(ref, n)
cand_ngrams = get_ngrams(cand, n)
if not ref_ngrams and not cand_ngrams:
return {"precision": 1.0, "recall": 1.0, "f1": 1.0}
if not ref_ngrams or not cand_ngrams:
return {"precision": 0.0, "recall": 0.0, "f1": 0.0}
ref_counts = {}
for ng in ref_ngrams:
ref_counts[ng] = ref_counts.get(ng, 0) + 1
cand_counts = {}
for ng in cand_ngrams:
cand_counts[ng] = cand_counts.get(ng, 0) + 1
overlap = 0
for ng, count in cand_counts.items():
overlap += min(count, ref_counts.get(ng, 0))
precision = overlap / len(cand_ngrams) if cand_ngrams else 0.0
recall = overlap / len(ref_ngrams) if ref_ngrams else 0.0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
return {"precision": precision, "recall": recall, "f1": f1}
return {
"rouge-1": compute_rouge_n(reference, candidate, 1),
"rouge-2": compute_rouge_n(reference, candidate, 2),
}