Implement a rubric-based LLM judge evaluation system. Given a response text and a rubric of weighted, keyword-scored criteria, evaluate the response against each criterion and produce an overall score.
from typing import Any, Dict, List, Tuple
def rubric_based_evaluation(
    response: str,
    rubric: Dict[str, Dict[str, Any]],
) -> Dict[str, Any]:
    """Score a response against a keyword-based rubric.

    Each criterion is scored by the fraction of its keywords that occur in
    the response (case-insensitive substring match), scaled to the
    criterion's ``max_score`` and rounded to two decimals.

    Args:
        response: The text to evaluate.
        rubric: Maps a criterion name to its settings. Recognised keys:
            ``"keywords"`` (iterable of strings, default empty),
            ``"weight"`` (default ``1.0``) and ``"max_score"`` (default ``5``).

    Returns:
        A dict with:
            ``"criteria_scores"``: per-criterion dict holding ``score``,
                ``max_score``, ``weight``, ``matched_keywords`` (a count) and
                ``total_keywords``.
            ``"overall_score"``: weighted total divided by weighted maximum,
                rounded to four decimals; ``0.0`` when the weighted maximum
                is not positive.
            ``"weighted_total"``: sum of ``score * weight``, rounded to two
                decimals.
            ``"weighted_max"``: sum of ``max_score * weight``, rounded to two
                decimals.
    """
    # The response is the same for every criterion, so lower-case it once.
    response_lower = response.lower()

    results = {}
    total_score = 0.0
    total_weight = 0.0
    for criterion, details in rubric.items():
        # ``or []`` also covers an explicit ``"keywords": None`` entry.
        keywords = details.get("keywords") or []
        weight = details.get("weight", 1.0)
        max_score = details.get("max_score", 5)

        matches = sum(1 for kw in keywords if kw.lower() in response_lower)
        # A criterion without keywords scores 0 but still counts toward the
        # weighted maximum below.
        ratio = matches / len(keywords) if keywords else 0
        score = round(ratio * max_score, 2)

        results[criterion] = {
            "score": score,
            "max_score": max_score,
            "weight": weight,
            "matched_keywords": matches,
            "total_keywords": len(keywords),
        }
        total_score += score * weight
        total_weight += max_score * weight

    overall = round(total_score / total_weight, 4) if total_weight > 0 else 0.0
    return {
        "criteria_scores": results,
        "overall_score": overall,
        "weighted_total": round(total_score, 2),
        "weighted_max": round(total_weight, 2),
    }