EXP3 Bandit cumulative regret

This script shows how to define an adversarial bandit environment and an EXP3 randomized algorithm, and compares the cumulative pseudo-regret of EXP3 with a Bernoulli Thompson sampling baseline.

[Figure: Cumulative Pseudo-Regret]
import numpy as np
from rlberry_research.envs.bandits import AdversarialBandit
from rlberry_research.agents.bandits import (
    RandomizedAgent,
    TSAgent,
    makeEXP3Index,
    makeBetaPrior,
)
from rlberry.manager import ExperimentManager, plot_writer_data


# Definition of the agents


class EXP3Agent(RandomizedAgent):
    name = "EXP3"

    def __init__(self, env, **kwargs):
        prob, tracker_params = makeEXP3Index()
        RandomizedAgent.__init__(
            self,
            env,
            prob,
            writer_extra="action",
            tracker_params=tracker_params,
            **kwargs
        )
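

# For intuition, here is a minimal standalone sketch of the EXP3 update
# (exponential weights over importance-weighted reward estimates). This is
# only an illustrative sketch, not the rlberry code behind makeEXP3Index, and
# the fixed learning rate eta is an assumption made for the example.
def exp3_sketch(reward_fn, n_arms=2, horizon=1000, eta=0.05, seed=0):
    rng = np.random.default_rng(seed)
    cum_est = np.zeros(n_arms)  # importance-weighted cumulative reward estimates
    for t in range(horizon):
        # Sampling probabilities proportional to exponential weights
        # (subtracting the max only for numerical stability).
        weights = np.exp(eta * (cum_est - cum_est.max()))
        probs = weights / weights.sum()
        arm = rng.choice(n_arms, p=probs)
        # Importance weighting keeps the reward estimates unbiased.
        cum_est[arm] += reward_fn(t, arm) / probs[arm]
    return probs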


class BernoulliTSAgent(TSAgent):
    """Thompson sampling for Bernoulli bandit"""

    name = "TS"

    def __init__(self, env, **kwargs):
        prior, _ = makeBetaPrior()
        TSAgent.__init__(self, env, prior, writer_extra="action", **kwargs)
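

# For comparison, Bernoulli Thompson sampling can be sketched in a few lines:
# draw a mean for each arm from its Beta posterior and play the argmax. This
# is again only an illustrative sketch with a Beta(1, 1) prior, not the
# rlberry code behind makeBetaPrior.
def thompson_sketch(reward_fn, n_arms=2, horizon=1000, seed=0):
    rng = np.random.default_rng(seed)
    successes = np.ones(n_arms)  # Beta(1, 1) prior pseudo-counts
    failures = np.ones(n_arms)
    for t in range(horizon):
        samples = rng.beta(successes, failures)  # one posterior draw per arm
        arm = int(np.argmax(samples))
        reward = reward_fn(t, arm)  # assumed to be 0 or 1
        successes[arm] += reward
        failures[arm] += 1 - reward
    return successes / (successes + failures)  # posterior mean estimates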


# Parameters of the problem
T = 3000  # Horizon
M = 20  # number of Monte Carlo simulations


def switching_rewards(T, gap=0.1, rate=1.6):
    """Adversarially switching rewards over exponentially long phases.
    Inspired by Zimmert, Julian, and Yevgeny Seldin.
    "Tsallis-INF: An Optimal Algorithm for Stochastic and Adversarial Bandits."
    J. Mach. Learn. Res. 22(28), 2021.
    """
    rewards = np.zeros((T, 2))
    exp = 1
    high_rewards = True
    for t in range(T):
        if t > np.floor(rate**exp):
            high_rewards = not high_rewards
            exp += 1
        if high_rewards:
            rewards[t] = [1.0 - gap, 1.0]
        else:
            rewards[t] = [0.0, gap]
    return rewards


rewards = switching_rewards(T, rate=5.0)
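
# Optional sanity check (not needed for the experiment): print the time steps
# at which the reward level switches between the "high" and "low" phases; the
# phase lengths grow geometrically with `rate`.
phase_switches = np.flatnonzero(np.diff(rewards[:, 1])) + 1
print("Reward level switches at t =", phase_switches)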


# Construction of the experiment

env_ctor = AdversarialBandit
env_kwargs = {"rewards": rewards}

Agents_class = [EXP3Agent, BernoulliTSAgent]

agents = [
    ExperimentManager(
        Agent,
        (env_ctor, env_kwargs),
        init_kwargs={},
        fit_budget=T,
        n_fit=M,
        parallelization="process",
        mp_context="fork",
    )
    for Agent in Agents_class
]

# The parallelization="process" and mp_context="fork" arguments above should
# give parallel computation even when running inside notebooks.
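
# If the "fork" start method is unavailable on your platform (for instance on
# Windows), a single-manager sketch without it could look like the snippet
# below; parallelization="thread" is an assumption to verify against the
# rlberry documentation, and exp3_threaded is a hypothetical name used only
# for this illustration.
exp3_threaded = ExperimentManager(
    EXP3Agent,
    (env_ctor, env_kwargs),
    init_kwargs={},
    fit_budget=T,
    n_fit=M,
    parallelization="thread",
)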


# Agent training
for agent in agents:
    agent.fit()


# Compute and plot (pseudo-)regret
def compute_pseudo_regret(actions):
    """Cumulative pseudo-regret: cumsum over t of max_a rewards[t, a] - rewards[t, a_t]."""
    selected_rewards = np.array(
        [rewards[t, int(action)] for t, action in enumerate(actions)]
    )
    return np.cumsum(np.max(rewards, axis=1) - selected_rewards)
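

# Tiny worked example with made-up actions (unrelated to the experiment): the
# first rounds fall in the "high" phase where the rewards are [0.9, 1.0], so
# each pull of arm 0 adds a gap of 0.1 to the cumulative pseudo-regret.
toy_actions = np.array([1, 0, 1, 0])
print(compute_pseudo_regret(toy_actions))  # approximately [0.0, 0.1, 0.1, 0.2]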


output = plot_writer_data(
    agents,
    tag="action",
    preprocess_func=compute_pseudo_regret,
    title="Cumulative Pseudo-Regret",
)

Total running time of the script: (0 minutes 7.579 seconds)
