Implement the REINFORCE policy gradient algorithm. Given a policy network (parameterized by weights), a set of episode trajectories (states, actions, rewards), and a discount factor gamma, compute the policy gradient update and return updated weights.
import numpy as np
def reinforce(policy_weights: np.ndarray, episodes: list, learning_rate: float = 0.01, gamma: float = 0.99) -> np.ndarray:
weight_update = np.zeros_like(policy_weights)
for episode in episodes:
states, actions, rewards = episode
# Compute discounted returns for each timestep
T = len(rewards)
returns = np.zeros(T)
G = 0.0
for t in reversed(range(T)):
G = rewards[t] + gamma * G
returns[t] = G
# Normalize returns for stability
if len(returns) > 1 and np.std(returns) > 0:
returns = (returns - np.mean(returns)) / (np.std(returns) + 1e-8)
for t in range(T):
state = np.array(states[t])
action = actions[t]
# Softmax policy: pi(a|s) = softmax(s @ W)
logits = state @ policy_weights
logits = logits - np.max(logits) # numerical stability
exp_logits = np.exp(logits)
probs = exp_logits / np.sum(exp_logits)
# Gradient of log-policy: d/dW log pi(a|s)
# = state^T (one_hot(a) - probs)
one_hot = np.zeros_like(probs)
one_hot[action] = 1.0
grad = np.outer(state, one_hot - probs)
weight_update += returns[t] * grad
policy_weights += learning_rate * weight_update / len(episodes)
return policy_weightsstate^T * (one_hot(action) - probs).