Implement TD(lambda) with Eligibility Traces for value function estimation. TD(lambda) unifies TD(0) and Monte Carlo methods using a lambda parameter that controls how far into the future the value estimate looks.
Use an eligibility trace for each state. At every step, increment the trace for the current state, compute the TD error, and update all states proportionally to their eligibility trace. Then decay all traces by gamma * lambda.
def td_lambda(
episodes: list[list[tuple]],
lam: float = 0.8,
alpha: float = 0.1,
gamma: float = 0.99,
initial_value: float = 0.0,
) -> dict[str, float]:
V: dict[str, float] = {}
def get_v(state: str) -> float:
return V.get(state, initial_value)
for episode in episodes:
T = len(episode)
if T == 0:
continue
states = [str(episode[t][0]) for t in range(T)]
rewards = [episode[t][1] for t in range(T)]
# Initialize values
for s in states:
if s not in V:
V[s] = initial_value
# Eligibility traces
e: dict[str, float] = {}
for t in range(T):
state = states[t]
reward = rewards[t]
# Next state value (0 if terminal)
if t + 1 < T:
next_v = get_v(states[t + 1])
else:
next_v = 0.0
# TD error
delta = reward + gamma * next_v - get_v(state)
# Update eligibility trace for current state (accumulating traces)
e[state] = e.get(state, 0.0) + 1.0
# Update all states with non-zero traces
for s in list(e.keys()):
V[s] = V[s] + alpha * delta * e[s]
e[s] *= gamma * lam
if e[s] < 1e-10:
del e[s]
return {s: round(v, 6) for s, v in V.items()}delta = r + gamma * V(s') - V(s).V(s) += alpha * delta * e(s).gamma * lambda.