reworked multi armed bandit example
julianStreibel committed May 7, 2024
1 parent 0193e51 commit f64296f
Showing 5 changed files with 91 additions and 83 deletions.
2 changes: 1 addition & 1 deletion baybe/kernels/priors/basic.py
@@ -112,4 +112,4 @@ def to_gpytorch(self, *args, **kwargs): # noqa: D102

    def numpy(self) -> np.ndarray:
        """Return alpha and beta as a numpy ndarray."""
-        return np.array([self.alpha, self.beta])
+        return np.array([self.alpha, self.beta]).reshape(-1, 1)
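
Not part of the commit — a minimal illustration of the shape change introduced by the added .reshape(-1, 1), using hypothetical alpha/beta values:

    import numpy as np

    np.array([2.0, 5.0]).shape                  # (2,)  flat vector, as before
    np.array([2.0, 5.0]).reshape(-1, 1).shape   # (2, 1) alpha and beta as a column vector
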
4 changes: 2 additions & 2 deletions baybe/surrogates/multi_armed_bandit.py
@@ -51,12 +51,12 @@ def _posterior_beta_parameters(self) -> np.ndarray[float]:
    @property
    def means(self) -> np.ndarray[float]:
        """Posterior means of the bandit arms."""
-        return beta(*self._posterior_beta_parameters.T).mean()
+        return beta(*self._posterior_beta_parameters).mean()

    @property
    def variances(self) -> np.ndarray[float]:
        """Posterior variances of the bandit arms."""
-        return beta(*self._posterior_beta_parameters.T).var()
+        return beta(*self._posterior_beta_parameters).var()

    def _posterior(self, candidates: Tensor) -> tuple[Tensor, Tensor]:
        # See base class.
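
Not part of the commit — a minimal sketch of the scipy behaviour the updated properties rely on: a frozen beta distribution accepts array-valued alpha/beta parameters, so unpacking a (2, n_arms) parameter array gives per-arm statistics without the former transpose. The layout of _posterior_beta_parameters is assumed here and the values are hypothetical.

    import numpy as np
    from scipy.stats import beta

    # Posterior (alpha, beta) parameters for three hypothetical arms:
    # first row holds the alphas, second row the betas.
    params = np.array([[2.0, 5.0, 1.0],
                       [3.0, 1.0, 4.0]])
    dist = beta(*params)   # unpacks to beta(alphas, betas)
    print(dist.mean())     # per-arm posterior means
    print(dist.var())      # per-arm posterior variances
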
4 changes: 4 additions & 0 deletions examples/Multi_Armed_Bandit/Multi_Armed_Bandit_Header.md
@@ -0,0 +1,4 @@
+# Multi-Armed Bandit
+
+These examples demonstrate BayBE's
+{doc}`Multi-Armed Bandit Capabilities </userguide/multi_armed_bandit>`.
143 changes: 84 additions & 59 deletions examples/Multi_Armed_Bandit/find_maximizing_arm.py
@@ -1,7 +1,13 @@
-import click
+## Example for a Multi Armed Bandit
+
+# This example shows how to use the bernoulli multi armed bandit surrogate.
+
+from collections.abc import Iterable
+from typing import Union

import numpy as np
-from multi_armed_bandit import MultiArmedBandit
-from scipy.stats import bernoulli
+from attrs import define
+from scipy.stats import bernoulli, rv_continuous, rv_discrete
+
from baybe import Campaign
from baybe.acquisition import ProbabilityOfImprovement
@@ -16,64 +22,83 @@
from baybe.surrogates import BernoulliMultiArmedBanditSurrogate
from baybe.targets import BinaryTarget

+### Setup
+# We are using a 5-armed bandit in this example. The bandit has a random win rate for now.
+N_ARMS = 5
+N_ITERATIONS = 300
np.random.seed(0)


-@click.command()
-@click.option("--n_arms", default=2, help="Number of arms")
-@click.option("--n_iter", default=1, help="Number of iterations")
-@click.option(
-    "--log_each_n_steps", default=100, help="Number of iteratins between prints."
+@define
+class MultiArmedBanditModel:
+    """Representation of a multi armed bandit."""
+
+    real_distributions: list[Union[rv_discrete, rv_continuous]]
+    """List of the reward distribution per arm."""
+
+    def sample(self, arm_idxs: Iterable[int]):
+        """Draw reward samples from the arms indexed in arm_idxs."""
+        return [self.real_distributions[arm_idx].rvs() for arm_idx in arm_idxs]
+
+    @property
+    def means(self):
+        """Return the real means of the reward distributions."""
+        return [dist.stats(moments="m") for dist in self.real_distributions]
+
+
+mab = MultiArmedBanditModel(
+    real_distributions=[bernoulli(np.random.rand()) for _ in range(N_ARMS)]
)
-def run_experiment(n_arms, n_iter, log_each_n_steps):
-    """Search for the arm with maximum win rate."""
-    mab = MultiArmedBandit(
-        real_distributions=[bernoulli(np.random.rand()) for _ in range(n_arms)]
-    )
-    print("real means", mab.means)
-    target = BinaryTarget(name="win_rate")
-    objective = SingleTargetObjective(target=target)
-    parameters = [
-        CategoricalParameter(
-            name="arm",
-            values=[str(i) for i in range(n_arms)],
-        )
-    ]
-    searchspace = SearchSpace.from_product(parameters)
-    mabs = BernoulliMultiArmedBanditSurrogate()
-    recommender = TwoPhaseMetaRecommender(
-        initial_recommender=FPSRecommender(
-            allow_repeated_recommendations=True,
-            allow_recommending_already_measured=True,
-        ),
-        recommender=SequentialGreedyRecommender(
-            surrogate_model=mabs,
-            allow_repeated_recommendations=True,
-            allow_recommending_already_measured=True,
-            acquisition_function=ProbabilityOfImprovement(),
-        ),
+print("real means", mab.means)
+
+
+### Campaign
+# We are using the BinaryTarget as we are modeling a bernoulli reward.
+# The searchspace has one categorical parameter to model the arms of the bandit.
+# The probability of improvement acquisition function is not perfect in this setting
+# as it assumes a normal distribution of the win rate.
+
+target = BinaryTarget(name="win_rate")
+objective = SingleTargetObjective(target=target)
+parameters = [
+    CategoricalParameter(
+        name="arm",
+        values=[str(i) for i in range(N_ARMS)],
    )
-    campaign = Campaign(searchspace, objective, recommender)
-
-    total_reward = 0
-    for i in range(n_iter):
-        df = campaign.recommend(batch_size=1)
-        reward = mab.sample(df.index.tolist())
-        total_reward += sum(reward)
-        df["win_rate"] = reward
-        campaign.add_measurements(df)
-
-        if (i + 1) % log_each_n_steps == 0:
-            print("iter", i + 1)
-            print("estimated means", mabs.means)
-            print("-" * 5)
-
-    real_means = mab.means
-    print("real means", real_means)
-    print("optimal expected reward", max(real_means) * n_iter)
-    print("total_reward", total_reward)
-    print("mean reward", total_reward / n_iter)
-
-
-if __name__ == "__main__":
-    run_experiment()
+]
+searchspace = SearchSpace.from_product(parameters)
+mabs = BernoulliMultiArmedBanditSurrogate()
+recommender = TwoPhaseMetaRecommender(
+    initial_recommender=FPSRecommender(
+        allow_repeated_recommendations=True,
+        allow_recommending_already_measured=True,
+    ),
+    recommender=SequentialGreedyRecommender(
+        surrogate_model=mabs,
+        allow_repeated_recommendations=True,
+        allow_recommending_already_measured=True,
+        acquisition_function=ProbabilityOfImprovement(),
+    ),
+)
+campaign = Campaign(searchspace, objective, recommender)
+
+
+### Optimization Loop
+total_reward = 0
+for i in range(N_ITERATIONS):
+    df = campaign.recommend(batch_size=1)
+    reward = mab.sample(df.index.tolist())
+    total_reward += sum(reward)
+    df["win_rate"] = reward
+    campaign.add_measurements(df)
+
+    if (i + 1) % 50 == 0:
+        print("iter", i + 1)
+        print("estimated means", mabs.means)
+        print("-" * 5)
+
+real_means = mab.means
+print("real means", real_means)
+print("optimal expected reward", max(real_means) * N_ITERATIONS)
+print("total_reward", total_reward)
+print("mean reward", total_reward / N_ITERATIONS)
21 changes: 0 additions & 21 deletions examples/Multi_Armed_Bandit/multi_armed_bandit.py

This file was deleted.
