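Implement a pairwise preference judge that compares two LLM responses and determines which one is better according to the specified criteria (relevance, coherence, completeness). Return the label of the preferred response ("A", "B", or "TIE" when the totals are equal) together with a per-criterion score breakdown and a total score for each response.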
from typing import Dict, List
def pairwise_preference_judge(
    prompt: str,
    response_a: str,
    response_b: str,
    criteria_keywords: Dict[str, List[str]]
) -> Dict:
    """Compare two responses with keyword-coverage and length heuristics.

    For every criterion, a response's score is the fraction of that
    criterion's keywords that occur in the response. Matching is
    case-insensitive and substring-based. A criterion with no keywords
    scores 0 for both responses.

    A ``"length_appropriateness"`` score is then added to each breakdown:
    the response/prompt word-count ratio divided by 10, capped at 1.0 and
    rounded to 4 decimals. An empty prompt is treated as one word.

    Args:
        prompt: The prompt both responses answer; only its word count is used.
        response_a: First candidate response.
        response_b: Second candidate response.
        criteria_keywords: Mapping of criterion name to its keywords.

    Returns:
        A dict with keys ``"preferred"`` (``"A"``, ``"B"`` or ``"TIE"``),
        ``"scores_a"`` / ``"scores_b"`` (per-criterion breakdowns) and
        ``"total_a"`` / ``"total_b"`` (breakdown sums rounded to 4 decimals).
    """
    # Lower-case the keywords once instead of once per response.
    lowered_keywords = {
        criterion: [kw.lower() for kw in keywords]
        for criterion, keywords in criteria_keywords.items()
    }
    # Guard against division by zero when the prompt is empty.
    prompt_words = max(len(prompt.split()), 1)

    def _score(response: str) -> Dict[str, float]:
        # Build the per-criterion breakdown for a single response.
        text = response.lower()
        breakdown = {}
        for criterion, keywords in lowered_keywords.items():
            hits = sum(1 for kw in keywords if kw in text)
            breakdown[criterion] = hits / len(keywords) if keywords else 0
        # Rewards longer answers up to 10x the prompt length, then saturates;
        # overly long responses are NOT penalised by this term.
        ratio = len(response.split()) / prompt_words
        breakdown["length_appropriateness"] = round(min(ratio / 10.0, 1.0), 4)
        return breakdown

    scores_a = _score(response_a)
    scores_b = _score(response_b)

    # Decide on the rounded totals that are actually reported. Comparing the
    # raw float sums lets summation-order noise (e.g. 0.1 + 0.2 + 0.3 versus
    # 0.3 + 0.2 + 0.1) pick a winner while the returned totals are equal.
    total_a = round(sum(scores_a.values()), 4)
    total_b = round(sum(scores_b.values()), 4)

    if total_a > total_b:
        preferred = "A"
    elif total_b > total_a:
        preferred = "B"
    else:
        preferred = "TIE"

    return {
        "preferred": preferred,
        "scores_a": scores_a,
        "scores_b": scores_b,
        "total_a": total_a,
        "total_b": total_b,
    }