#157 · Reinforcement Learning · Medium
Solve on deep-ml.com

Implement the Bellman Equation for Value Iteration in a Markov Decision Process (MDP). Given states, actions, transition probabilities, rewards, and a discount factor, iteratively compute the optimal value function.
import numpy as np
def value_iteration(states: int, actions: int, transitions: np.ndarray,
                    rewards: np.ndarray, gamma: float,
                    theta: float = 1e-6, max_iter: int = 1000) -> np.ndarray:
    """Compute the state-value function of an MDP by value iteration.

    Algorithm:
      1. Initialize V(s) = 0 for all states.
      2. For each state, compute Q(s, a) for every action using the Bellman
         equation:
         Q(s, a) = sum over s' of T(s, a, s') * [R(s, a, s') + gamma * V(s')].
      3. Update V(s) to the maximum Q-value across all actions
         (greedy selection).
      4. Repeat until the largest change in V between two sweeps is below
         theta, or until max_iter sweeps have been performed.

    Args:
        states: Number of states; state indices are 0 .. states - 1.
        actions: Number of actions; action indices are 0 .. actions - 1.
        transitions: Array indexed as [s, a, s_next] giving T(s, a, s').
        rewards: Array indexed as [s, a, s_next] giving R(s, a, s').
        gamma: Discount factor applied to the value of the next state.
        theta: Convergence threshold on the max absolute change in V.
        max_iter: Upper bound on the number of sweeps.

    Returns:
        A 1-D array of length ``states`` holding the most recent value
        estimate (all zeros when ``max_iter`` is 0, empty when ``states``
        is 0).

    Raises:
        ValueError: If there is at least one state but no action to
            maximize over.
    """
    V = np.zeros(states)
    if states == 0:
        # Nothing to evaluate; avoids np.max() failing on an empty array.
        return V
    if actions <= 0:
        raise ValueError("value_iteration requires at least one action")

    # Only the leading [states, actions, states] entries are used, exactly
    # like the index ranges of an explicit triple loop would.
    T = np.asarray(transitions, dtype=float)[:states, :actions, :states]
    R = np.asarray(rewards, dtype=float)[:states, :actions, :states]

    # sum_s' T * R does not depend on V, so compute it once up front.
    expected_reward = np.sum(T * R, axis=2)

    for _ in range(max_iter):
        # Q[s, a] = sum_s' T[s, a, s'] * (R[s, a, s'] + gamma * V[s'])
        q_values = expected_reward + gamma * (T @ V)
        V_new = q_values.max(axis=1)
        delta = np.max(np.abs(V_new - V))
        # Keep the newest backup *before* testing convergence so the value
        # returned is the latest estimate rather than the previous sweep's.
        V = V_new
        if delta < theta:
            break
    return V