"""Implement the Direct Preference Optimization (DPO) loss function.

Given pairs of preferred and dispreferred responses, compute the loss that
directly optimizes the policy to match human preferences without training a
separate reward model.
"""
import numpy as np
def dpo_loss(
    pi_logprobs_chosen: np.ndarray,
    pi_logprobs_rejected: np.ndarray,
    ref_logprobs_chosen: np.ndarray,
    ref_logprobs_rejected: np.ndarray,
    beta: float = 0.1,
) -> float:
    """Return the mean DPO loss over a batch of preference pairs.

    For each pair the loss is ``-log(sigmoid(beta * margin))`` where
    ``margin = (pi_chosen - ref_chosen) - (pi_rejected - ref_rejected)``.

    Args:
        pi_logprobs_chosen: Policy log-probabilities of the chosen responses.
        pi_logprobs_rejected: Policy log-probabilities of the rejected
            responses.
        ref_logprobs_chosen: Reference-model log-probabilities of the chosen
            responses.
        ref_logprobs_rejected: Reference-model log-probabilities of the
            rejected responses.
        beta: Scale applied to the log-ratio margin before the sigmoid.

    All four inputs may be any array-like (arrays, lists, scalars) as long as
    they broadcast against each other.

    Returns:
        The loss averaged over every element, as a Python ``float``. An empty
        batch yields ``nan`` (the mean of an empty array).
    """
    log_ratio_chosen = np.asarray(pi_logprobs_chosen) - np.asarray(ref_logprobs_chosen)
    log_ratio_rejected = np.asarray(pi_logprobs_rejected) - np.asarray(ref_logprobs_rejected)
    logits = beta * (log_ratio_chosen - log_ratio_rejected)

    # -log(sigmoid(x)) == log(1 + exp(-x)) == logaddexp(0, -x).
    # logaddexp never exponentiates a large positive number, so it cannot
    # overflow; a two-branch np.where would still evaluate the overflowing
    # branch for every element before discarding it.
    per_pair_loss = np.logaddexp(0.0, -logits)
    return float(np.mean(per_pair_loss))
def dpo_gradients(
    pi_logprobs_chosen: np.ndarray,
    pi_logprobs_rejected: np.ndarray,
    ref_logprobs_chosen: np.ndarray,
    ref_logprobs_rejected: np.ndarray,
    beta: float = 0.1,
) -> dict:
    """Return the DPO loss and its gradients w.r.t. the policy log-probs.

    Args:
        pi_logprobs_chosen: Policy log-probabilities of the chosen responses.
        pi_logprobs_rejected: Policy log-probabilities of the rejected
            responses.
        ref_logprobs_chosen: Reference-model log-probabilities of the chosen
            responses.
        ref_logprobs_rejected: Reference-model log-probabilities of the
            rejected responses.
        beta: Scale applied to the log-ratio margin before the sigmoid.

    All four inputs may be any array-like as long as they broadcast against
    each other.

    Returns:
        A dict with the keys:

        * ``"loss"``: mean of ``-log(sigmoid(logits))`` as a ``float``.
        * ``"grad_chosen"``: d(loss)/d(pi_logprobs_chosen), element-wise
          (non-positive, so gradient descent raises the chosen log-prob).
        * ``"grad_rejected"``: d(loss)/d(pi_logprobs_rejected), element-wise
          (the negation of ``grad_chosen``).
        * ``"mean_reward_margin"``: mean of the log-ratio margin as a
          ``float``; note that it is *not* scaled by ``beta``.
    """
    log_ratio_chosen = np.asarray(pi_logprobs_chosen) - np.asarray(ref_logprobs_chosen)
    log_ratio_rejected = np.asarray(pi_logprobs_rejected) - np.asarray(ref_logprobs_rejected)
    margin = log_ratio_chosen - log_ratio_rejected
    logits = beta * margin

    # 1 - sigmoid(x) == sigmoid(-x) == exp(-logaddexp(0, x)); this form never
    # exponentiates a large positive number, so it cannot overflow.
    one_minus_sigmoid = np.exp(-np.logaddexp(0.0, logits))

    # Normalise by the total element count so the gradients match the
    # np.mean reduction of the loss for 0-d and multi-dimensional inputs too.
    scale = beta * one_minus_sigmoid / logits.size

    # Same quantity dpo_loss returns, computed from the logits already in hand.
    loss = float(np.mean(np.logaddexp(0.0, -logits)))

    return {
        "loss": loss,
        "grad_chosen": -scale,  # descent pushes the chosen log-prob up
        "grad_rejected": scale,  # descent pushes the rejected log-prob down
        "mean_reward_margin": float(np.mean(margin)),
    }


# The per-pair DPO loss is
# -log(sigmoid(beta * (log_ratio_chosen - log_ratio_rejected))),
# derived from the Bradley-Terry preference model.