"""Gradient Bandit action selection using a softmax preference-based method.

Each action has a numerical preference H(a), and action probabilities are
computed via softmax. Preferences are updated using a gradient ascent rule.
"""
import numpy as np
def gradient_bandit_select(preferences: np.ndarray) -> tuple:
    """Sample an action from the softmax distribution over preferences.

    The probability of action ``a`` is ``exp(H(a)) / sum_b exp(H(b))``.

    Args:
        preferences: 1-D array-like of action preferences ``H(a)``. It is
            not modified.

    Returns:
        A ``(action, probs)`` tuple: ``action`` is the sampled index as a
        plain ``int`` and ``probs`` is the float array of softmax
        probabilities the sample was drawn from.

    Raises:
        ValueError: If ``preferences`` is empty.
    """
    prefs = np.asarray(preferences, dtype=float)
    if prefs.size == 0:
        raise ValueError("preferences must contain at least one action")

    # Softmax is invariant to a constant shift; subtracting the maximum makes
    # the largest exponent exp(0) == 1 so np.exp cannot overflow.
    shifted = prefs - np.max(prefs)
    exp_prefs = np.exp(shifted)
    probs = exp_prefs / np.sum(exp_prefs)

    # Drawn from NumPy's global RNG, so np.random.seed() controls the sample.
    action = np.random.choice(len(probs), p=probs)
    return int(action), probs
def gradient_bandit_update(preferences: np.ndarray, action: int,
                           reward: float, avg_reward: float,
                           probs: np.ndarray, alpha: float) -> np.ndarray:
    """Apply one gradient-ascent step to the action preferences.

    With ``baseline = reward - avg_reward`` (the reward measured against the
    running average), the update is:

    * chosen action:  ``H(a) += alpha * baseline * (1 - pi(a))``
    * other actions:  ``H(a) -= alpha * baseline * pi(a)``

    Args:
        preferences: Action preferences ``H(a)``. Updated in place.
        action: Index of the action that was taken.
        reward: Reward received for ``action``.
        avg_reward: Average reward used as the comparison baseline.
        probs: Softmax probabilities ``pi(a)`` the action was sampled from.
            It is not modified.
        alpha: Step size.

    Returns:
        The same ``preferences`` object, after the in-place update.

    Raises:
        ValueError: If ``probs`` and ``preferences`` differ in length.
        IndexError: If ``action`` is not in ``[0, len(preferences))``.
    """
    n_actions = len(preferences)
    if len(probs) != n_actions:
        raise ValueError(
            f"probs has length {len(probs)} but preferences has length "
            f"{n_actions}"
        )
    # An unmatched index would otherwise push every preference down as if
    # each action were "not taken", so reject it explicitly.
    if not 0 <= action < n_actions:
        raise IndexError(
            f"action {action} is out of range for {n_actions} actions"
        )

    # Reward relative to the average-reward baseline.
    advantage = reward - avg_reward

    # Per-action factor: (1 - pi(a)) for the taken action, -pi(a) otherwise.
    # Negation allocates a new array, so the caller's probs stay untouched.
    direction = -np.asarray(probs, dtype=float)
    direction[action] += 1.0

    # Slice assignment keeps the update in place on the caller's object.
    preferences[:] = preferences + alpha * advantage * direction
    return preferences