#133 · Reinforcement Learning · Medium
⊣ Solve on deep-ml.comImplement the Q-Learning algorithm for solving Markov Decision Processes. Given a set of states, actions, transition dynamics, and rewards, learn the optimal Q-values and derive the optimal policy.
import numpy as np
def q_learning(
n_states: int,
n_actions: int,
episodes: list,
alpha: float = 0.1,
gamma: float = 0.99,
epsilon: float = 0.1,
n_episodes: int = 1000
) -> tuple[np.ndarray, np.ndarray]:
Q = np.zeros((n_states, n_actions))
for episode in range(n_episodes):
state = np.random.randint(n_states)
for step in range(100): # max steps per episode
# Epsilon-greedy action selection
if np.random.random() < epsilon:
action = np.random.randint(n_actions)
else:
action = np.argmax(Q[state])
# Simulate transition (using provided episodes or environment)
if episode < len(episodes) and step < len(episodes[episode]):
_, action_taken, reward, next_state, done = episodes[episode][step]
action = action_taken
else:
next_state = np.random.randint(n_states)
reward = np.random.randn()
done = step >= 99
# Q-learning update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
best_next = np.max(Q[next_state]) if not done else 0.0
td_target = reward + gamma * best_next
Q[state, action] += alpha * (td_target - Q[state, action])
state = next_state
if done:
break
# Derive policy from Q-values
policy = np.argmax(Q, axis=1)
return Q, policy
def q_learning_from_transitions(
transitions: list[tuple],
n_states: int,
n_actions: int,
alpha: float = 0.1,
gamma: float = 0.99
) -> np.ndarray:
Q = np.zeros((n_states, n_actions))
for state, action, reward, next_state, done in transitions:
best_next = np.max(Q[next_state]) if not done else 0.0
td_target = reward + gamma * best_next
Q[state, action] += alpha * (td_target - Q[state, action])
return QQ(s,a) += alpha * (r + gamma * max Q(s',a') - Q(s,a)).argmax_a Q(s,a) for each state.