Source code for rlberry_scool.agents.tabular_rl.qlearning

from typing import Optional, Literal

import numpy as np
from gymnasium import spaces
from scipy.special import softmax

from rlberry import types
from rlberry.agents import AgentWithSimplePolicy


class QLAgent(AgentWithSimplePolicy):
    """Q-Learning Agent.

    Parameters
    ----------
    env: :class:`~rlberry.types.Env`
        Environment with discrete states and actions.
    gamma: float, default = 0.99
        Discount factor.
    alpha: float, default = 0.1
        Learning rate.
    exploration_type: {"epsilon", "boltzmann"}, default: None
        If "epsilon": Epsilon-Greedy exploration.
        If "boltzmann": Boltzmann exploration.
        If None: No exploration.
    exploration_rate: float, default: None
        epsilon parameter for Epsilon-Greedy exploration or tau parameter for Boltzmann exploration.
    **kwargs : Keyword Arguments
        Arguments to be passed to `AgentWithSimplePolicy.__init__(self, env, **kwargs)` (:class:`~rlberry.agents.AgentWithSimplePolicy`).

    Attributes
    ----------
    Q : ndarray
        2D array that stores the estimates of the expected rewards for state-action pairs.

    Examples
    --------
    >>> from rlberry.envs import GridWorld
    >>>
    >>> env = GridWorld(walls=(), nrows=5, ncols=5)
    >>> agent = QLAgent(env)
    >>> agent.fit(budget=1000)
    >>> agent.policy(env.observation_space.sample())
    >>> agent.reset()
    """
name = "QL"
def __init__(
self,
env: types.Env,
gamma: float = 0.99,
alpha: float = 0.1,
exploration_type: Optional[Literal["epsilon", "boltzmann"]] = None,
exploration_rate: Optional[float] = None,
**kwargs
):
# init base class
AgentWithSimplePolicy.__init__(self, env, **kwargs)
self.gamma = gamma
self.alpha = alpha
self.exploration_type = exploration_type
self.exploration_rate = exploration_rate
# check environment
assert isinstance(self.env.observation_space, spaces.Discrete)
assert isinstance(self.env.action_space, spaces.Discrete)
# check exploration type
if self.exploration_type is not None:
assert (
exploration_type == "epsilon" or "boltzmann"
) and exploration_rate is not None
self.Q = np.zeros((self.env.observation_space.n, self.env.action_space.n))

    def reset(self, **kwargs):
        self.Q.fill(0)

    def policy(self, observation):
        return self.Q[observation].argmax()

    def get_action(self, observation):
        if (
            self.exploration_type == "epsilon"
            and self.seeder.rng.random() <= self.exploration_rate
        ):
            # epsilon-greedy: with probability `exploration_rate`, take a uniformly random action
            return self.seeder.rng.choice(self.env.action_space.n)
        elif self.exploration_type == "boltzmann":
            # Boltzmann exploration: sample an action with probability
            # proportional to exp(exploration_rate * Q(observation, action))
            return self.seeder.rng.choice(
                self.env.action_space.n,
                p=softmax(self.exploration_rate * self.Q[observation]),
            )
        else:
            # no exploration: greedy action w.r.t. the current Q estimates
            return self.Q[observation].argmax()

    def fit(self, budget: int, **kwargs):
        """
        Train the agent using the provided environment.

        Parameters
        ----------
        budget: int
            Number of Q updates (one update per environment step).
        **kwargs : Keyword Arguments
            Extra arguments. Not used for this agent.
        """
        del kwargs
        observation, info = self.env.reset()
        episode_rewards = 0
        for i in range(budget):
            action = self.get_action(observation)
            next_observation, reward, terminated, truncated, info = self.env.step(
                action
            )
            done = terminated or truncated
            episode_rewards += reward
            if self.writer is not None:
                self.writer.add_scalar("episode_rewards", episode_rewards, i)
            if done:
                # terminal transition: no bootstrapping from the next state
                self.Q[observation, action] = reward
            else:
                # Q-learning update:
                # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
                self.Q[observation, action] = self.Q[
                    observation, action
                ] + self.alpha * (
                    reward
                    + self.gamma * np.amax(self.Q[next_observation])
                    - self.Q[observation, action]
                )
            observation = next_observation
            if done:
                observation, info = self.env.reset()
                episode_rewards = 0
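

A minimal usage sketch, assuming the same GridWorld environment as in the class docstring above; the exploration settings below are illustrative values, not defaults:

from rlberry.envs import GridWorld

env = GridWorld(walls=(), nrows=5, ncols=5)
agent = QLAgent(
    env,
    gamma=0.99,
    alpha=0.1,
    exploration_type="epsilon",
    exploration_rate=0.1,  # probability of taking a uniformly random action
)
agent.fit(budget=10000)  # 10000 Q updates, one per environment step
observation, info = env.reset()
action = agent.policy(observation)  # greedy action from the learned Q-table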