#210 · Reinforcement Learning · Medium
⊣ Solve on deep-ml.com
Implement the Dr. GRPO (Group Relative Policy Optimization) complete objective function. This RL objective uses group-relative advantages and includes both a clipped policy ratio term and a KL divergence penalty against a reference model.
import numpy as np
def dr_grpo_objective(log_probs: np.ndarray,
                      log_probs_old: np.ndarray,
                      ref_log_probs: np.ndarray,
                      rewards: np.ndarray,
                      epsilon: float = 0.2,
                      beta: float = 0.04) -> float:
    """Return the clipped-surrogate loss with a KL penalty for one group.

    The returned value is the negated objective::

        mean(min(r * A, clip(r, 1 - epsilon, 1 + epsilon) * A))
            - beta * mean(log_probs - ref_log_probs)

    where ``r = exp(log_probs - log_probs_old)`` and ``A`` is the reward
    standardised within the group (zero when the group's reward standard
    deviation is below 1e-8).

    Args:
        log_probs: log pi_theta(y_i|x) for each response in the group.
        log_probs_old: log pi_old(y_i|x) from the sampling policy.
        ref_log_probs: log pi_ref(y_i|x) from the reference policy.
        rewards: scalar reward for each response.
        epsilon: half-width of the clipping interval around a ratio of 1.
        beta: weight of the KL penalty term.

    Returns:
        The loss (negated objective) as a Python float.

    Array arguments may be NumPy arrays or plain sequences (lists, tuples);
    they are converted with ``np.asarray`` before any arithmetic.
    """
    # Coerce up front: list - list is a TypeError in plain Python, while
    # ndarray inputs pass through np.asarray unchanged.
    log_probs = np.asarray(log_probs)
    log_probs_old = np.asarray(log_probs_old)
    ref_log_probs = np.asarray(ref_log_probs)
    rewards = np.asarray(rewards)

    # Group-relative advantages.
    # NOTE(review): rewards are divided by the group std here; the Dr. GRPO
    # paper is understood to drop that normalisation -- confirm which variant
    # is intended before changing the numerics.
    mean_r = np.mean(rewards)
    std_r = np.std(rewards)
    if std_r < 1e-8:
        # All rewards are (nearly) equal: no response is preferred, and
        # dividing by std would blow up. Keep the shape of `rewards`.
        advantages = np.zeros_like(rewards, dtype=float)
    else:
        advantages = (rewards - mean_r) / std_r

    # Policy ratio: exp(log_pi_new - log_pi_old) measures how much the
    # action probability has changed relative to the sampling policy.
    ratio = np.exp(log_probs - log_probs_old)

    # Clipped surrogate: take the more pessimistic of the raw and clipped
    # terms for every response.
    surr1 = ratio * advantages
    surr2 = np.clip(ratio, 1 - epsilon, 1 + epsilon) * advantages
    clipped_obj = np.minimum(surr1, surr2)

    # KL divergence penalty (approximate): D_KL(pi_theta || pi_ref).
    # beta * E[log pi_theta - log pi_ref] penalizes deviation from the
    # reference policy, preventing reward hacking and mode collapse.
    kl = log_probs - ref_log_probs

    # Objective: maximize clipped surrogate - beta * KL.
    objective = np.mean(clipped_obj) - beta * np.mean(kl)

    # Return as loss (negate for minimization).
    return float(-objective)