#166 · Reinforcement Learning · Medium
⊣ Solve on deep-ml.comEvaluate the Expected Value of a state in a Markov Decision Process (MDP) given a fixed policy. Compute V^pi(s) = sum_a pi(a|s) * sum_s' T(s,a,s') * [R(s,a,s') + gamma * V^pi(s')] using iterative policy evaluation.
import numpy as np
def policy_evaluation(states: int, actions: int, policy: np.ndarray,
transitions: np.ndarray, rewards: np.ndarray,
gamma: float, theta: float = 1e-6,
max_iter: int = 1000) -> np.ndarray:
V = np.zeros(states)
for _ in range(max_iter):
V_new = np.zeros(states)
for s in range(states):
v = 0.0
for a in range(actions):
if policy[s, a] == 0:
continue
q = 0.0
for s_next in range(states):
q += transitions[s, a, s_next] * (
rewards[s, a, s_next] + gamma * V[s_next]
)
v += policy[s, a] * q
V_new[s] = v
if np.max(np.abs(V_new - V)) < theta:
break
V = V_new
return VV(s) = 0 for all states.pi(a|s).