Comparison subplots of various index-based bandit algorithms

This script compares several bandit agents and, as a by-product, also shows how to use subplots with plot_writer_data.
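
The main pattern illustrated at the end of the script is passing a matplotlib Axes to plot_writer_data through its ax argument, so that several calls can each draw into one panel of a shared figure. A minimal sketch of that pattern (assuming managers is a list of already-fitted ExperimentManager instances, as built further down in this script) looks like this:

import matplotlib.pyplot as plt
from rlberry.manager import plot_writer_data

fig, axes = plt.subplots(1, 2)
plot_writer_data(managers, tag="reward", ax=axes[0], show=False)  # left panel
plot_writer_data(managers, tag="action", ax=axes[1], show=False)  # right panel
fig.tight_layout()
plt.show()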

  • Cumulative Pseudo-Regret
  • Cumulative Regret
  • Number of pulls Na of each arm (arms 0-2 with mean 0.6, arm 3 with mean 0.9)

import numpy as np
import matplotlib.pyplot as plt
from rlberry_research.envs.bandits import BernoulliBandit
from rlberry.manager import ExperimentManager, plot_writer_data
from rlberry_research.agents.bandits import (
    IndexAgent,
    RandomizedAgent,
    makeBoundedIMEDIndex,
    makeBoundedMOSSIndex,
    makeBoundedNPTSIndex,
    makeBoundedUCBIndex,
    makeBoundedUCBVIndex,
    makeETCIndex,
    makeEXP3Index,
)

# Agent definitions


# Parameters of the problem
means = np.array([0.6, 0.6, 0.6, 0.9])  # means of the arms
A = len(means)
T = 2000  # Horizon
M = 10  # number of Monte Carlo simulations
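# The last arm (index 3, mean 0.9) is the unique optimal arm; each of the
# other arms is suboptimal by a gap of 0.3.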

# Construction of the experiment

env_ctor = BernoulliBandit
env_kwargs = {"p": means}


class UCBAgent(IndexAgent):
    name = "UCB"

    def __init__(self, env, **kwargs):
        index, _ = makeBoundedUCBIndex()
        IndexAgent.__init__(
            self, env, index, writer_extra="action_and_reward", **kwargs
        )


class UCBVAgent(IndexAgent):
    name = "UCBV"

    def __init__(self, env, **kwargs):
        index, params = makeBoundedUCBVIndex()
        IndexAgent.__init__(
            self,
            env,
            index,
            writer_extra="action_and_reward",
            tracker_params=params,
            **kwargs
        )


class ETCAgent(IndexAgent):
    name = "ETC"

    def __init__(self, env, m=20, **kwargs):
        index, _ = makeETCIndex(A, m)
        IndexAgent.__init__(
            self, env, index, writer_extra="action_and_reward", **kwargs
        )


class MOSSAgent(IndexAgent):
    name = "MOSS"

    def __init__(self, env, **kwargs):
        index, _ = makeBoundedMOSSIndex(T, A)
        IndexAgent.__init__(
            self, env, index, writer_extra="action_and_reward", **kwargs
        )


class IMEDAgent(IndexAgent):
    name = "IMED"

    def __init__(self, env, **kwargs):
        index, tracker_params = makeBoundedIMEDIndex()
        IndexAgent.__init__(
            self,
            env,
            index,
            writer_extra="action_and_reward",
            tracker_params=tracker_params,
            **kwargs
        )


class NPTSAgent(IndexAgent):
    name = "NPTS"

    def __init__(self, env, **kwargs):
        index, tracker_params = makeBoundedNPTSIndex()
        IndexAgent.__init__(
            self,
            env,
            index,
            writer_extra="action_and_reward",
            tracker_params=tracker_params,
            **kwargs
        )


class EXP3Agent(RandomizedAgent):
    name = "EXP3"

    def __init__(self, env, **kwargs):
        prob, tracker_params = makeEXP3Index()
        RandomizedAgent.__init__(
            self,
            env,
            prob,
            writer_extra="action_and_reward",
            tracker_params=tracker_params,
            **kwargs
        )


Agents_class = [
    ETCAgent,
    EXP3Agent,
    IMEDAgent,
    MOSSAgent,
    NPTSAgent,
    UCBAgent,
    UCBVAgent,
]

agents = [
    ExperimentManager(
        Agent,
        (env_ctor, env_kwargs),
        fit_budget=T,
        n_fit=M,
        parallelization="process",
        mp_context="fork",
        seed=42,
    )
    for Agent in Agents_class
]

# parallelization="process" together with mp_context="fork" should give parallel computing even in notebooks


# Agent training
for agent in agents:
    agent.fit()


# Compute and plot the cumulative regret, based on the realized rewards
def compute_regret(rewards):
    return np.cumsum(np.max(means) - rewards)


# Compute and plot the cumulative pseudo-regret, based on the means of the selected arms
def compute_pseudo_regret(actions):
    return np.cumsum(np.max(means) - means[actions.astype(int)])
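
# Illustration of the pseudo-regret preprocessing (not part of the experiment):
# with means = [0.6, 0.6, 0.6, 0.9], the action sequence [3, 0, 3] gives per-step
# gaps [0.0, 0.3, 0.0], so compute_pseudo_regret(np.array([3, 0, 3])) returns
# array([0. , 0.3, 0.3]).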


output = plot_writer_data(
    agents,
    tag="action",
    preprocess_func=compute_pseudo_regret,
    title="Cumulative Pseudo-Regret",
    linestyles=True,
)

output = plot_writer_data(
    agents,
    tag="reward",
    preprocess_func=compute_regret,
    title="Cumulative Regret",
    linestyles=True,
)


# Compute and plot the number of times each arm was selected
def compute_na(actions, a):
    return np.cumsum(actions == a)
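
# For example, compute_na(np.array([0, 1, 0, 0]), 0) returns array([1, 1, 2, 3]),
# the running count of the number of pulls of arm 0.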


fig, axes = plt.subplots(2, 2, sharey=True, figsize=(6, 6))
axes = axes.ravel()
for arm in range(A):
    output = plot_writer_data(
        agents,
        tag="action",
        preprocess_func=lambda actions: compute_na(actions, arm),
        title="Na for arm " + str(arm) + ", mean=" + str(means[arm]),
        ax=axes[arm],
        show=False,
        linestyles=True,
    )
fig.tight_layout()
plt.show()
