Perform policy evaluation on a gridworld MDP. Given a grid, a policy (action for each state), transition dynamics, rewards, and a discount factor, compute the value function for each state using iterative policy evaluation.
import numpy as np
def policy_evaluation(grid_size: tuple, policy: dict, rewards: dict, gamma: float = 0.9,
terminal_states: list = None, theta: float = 1e-6) -> dict:
if terminal_states is None:
terminal_states = []
rows, cols = grid_size
V = {(r, c): 0.0 for r in range(rows) for c in range(cols)}
action_deltas = {
'up': (-1, 0),
'down': (1, 0),
'left': (0, -1),
'right': (0, 1)
}
while True:
delta = 0.0
for r in range(rows):
for c in range(cols):
state = (r, c)
if state in terminal_states:
continue
old_v = V[state]
action = policy.get(state, 'up')
dr, dc = action_deltas.get(action, (0, 0))
next_r, next_c = r + dr, c + dc
# Stay in place if out of bounds
if not (0 <= next_r < rows and 0 <= next_c < cols):
next_r, next_c = r, c
next_state = (next_r, next_c)
reward = rewards.get(state, rewards.get('default', 0))
V[state] = reward + gamma * V[next_state]
delta = max(delta, abs(V[state] - old_v))
if delta < theta:
break
return VV(s) = R(s) + gamma * V(s') where s' is determined by the policy action.