mab

zeus.policy.mab

Multi-Armed Bandit implementations.

GaussianTS

Thompson Sampling policy for Gaussian bandits.

For each arm, the reward is modeled as a Gaussian distribution with known precision. The conjugate priors are also Gaussian distributions.
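
A minimal usage sketch, assuming the module imports as zeus.policy.mab (as this page indicates); the arm labels, rewards, and precision values below are illustrative. The policy is constructed with a fixed set of arms, fit on observed (arm, reward) pairs, and then asked which arm to pull next.

from zeus.policy.mab import GaussianTS

# Three illustrative arms sharing one known reward precision.
policy = GaussianTS(arms=[0, 1, 2], reward_precision=2.0, prior_precision=1.0, verbose=False)

# Illustrative observations: which arm was pulled and the reward it produced.
decisions = [0, 1, 2, 1]
rewards = [0.1, 0.9, 0.4, 1.1]
policy.fit(decisions, rewards, reset=True)

# The arm with the largest sampled expected reward
# (or an under-explored arm while the exploration phase lasts).
next_arm = policy.predict()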

Source code in zeus/policy/mab.py
class GaussianTS:
    """Thompson Sampling policy for Gaussian bandits.

    For each arm, the reward is modeled as a Gaussian distribution with
    known precision. The conjugate priors are also Gaussian distributions.
    """

    def __init__(
        self,
        arms: list[int],
        reward_precision: list[float] | float,
        prior_mean: float = 0.0,
        prior_precision: float = 0.0,
        num_exploration: int = 1,
        seed: int = 123456,
        verbose: bool = True,
    ) -> None:
        """Initialze the object.

        Args:
            arms: Bandit arm values to use.
            reward_precision: Precision (inverse variance) of the reward distribution.
                Pass in a list of `float`s to set the reward precision differently for
                each arm.
            prior_mean: Mean of the belief prior distribution.
            prior_precision: Precision of the belief prior distribution.
            num_exploration: How many static explorations to run when no observations
                are available.
            seed: The random seed to use.
            verbose: Whether to print out what's going on.
        """
        self.name = "GaussianTS"

        self.arms = arms
        self.prior_mean = prior_mean
        self.prior_prec = prior_precision
        self.num_exploration = num_exploration
        self.seed = seed
        self.rng = np.random.default_rng(seed)
        self.verbose = verbose

        # Set the precision of the reward distribution of each arm.
        if isinstance(reward_precision, list):
            self.arm_reward_prec = dict(zip(arms, reward_precision))
        else:
            self.arm_reward_prec = {arm: reward_precision for arm in arms}

        # Initialize the parameter distribution with the prior parameters.
        self.arm_param_mean = dict.fromkeys(arms, prior_mean)
        self.arm_param_prec = dict.fromkeys(arms, prior_precision)

        # Track how many times an arm reward has been observed.
        self.arm_num_observations = dict.fromkeys(arms, 0)

    def fit(
        self,
        decisions: list[int] | np.ndarray,
        rewards: list[float] | np.ndarray,
        reset: bool,
    ) -> None:
        """Fit the bandit on the given list of observations.

        Args:
            decisions: A list of arms chosen.
            rewards: A list of rewards that resulted from choosing the arms in `decisions`.
            reset: Whether to reset all arms.
        """
        decisions_arr = np.array(decisions)
        rewards_arr = np.array(rewards)

        # Fit all arms.
        for arm in self.arms:
            self.fit_arm(arm, rewards_arr[decisions_arr == arm], reset)

    def fit_arm(self, arm: int, rewards: np.ndarray, reset: bool) -> None:
        """Update the parameter distribution for one arm.

        Reference: <https://en.wikipedia.org/wiki/Conjugate_prior>

        Args:
            arm: Arm to fit.
            rewards: Array of rewards observed by pulling that arm.
            reset: Whether to reset the parameters of the arm before fitting.
        """
        if reset:
            self.arm_param_mean[arm] = self.prior_mean
            self.arm_param_prec[arm] = self.prior_prec
            self.arm_num_observations[arm] = 0

        if len(rewards) == 0:
            return

        # Read previous state.
        reward_prec = self.arm_reward_prec[arm]
        mean = self.arm_param_mean[arm]
        prec = self.arm_param_prec[arm]

        # Compute the parameters of the posterior distribution.
        # The reward distribution's precision is given as infinite only when we
        # have exactly one observation for the arm, s.t. sampling yields that
        # exact observation.
        if reward_prec == np.inf:
            new_prec = np.inf
            new_mean = rewards.mean()
        else:
            new_prec = prec + len(rewards) * reward_prec
            new_mean = (prec * mean + reward_prec * rewards.sum()) / new_prec

        # Update state.
        self.arm_param_mean[arm] = new_mean
        self.arm_param_prec[arm] = new_prec
        self.arm_num_observations[arm] += len(rewards)

    def predict(self) -> int:
        """Return the arm with the largest sampled expected reward."""
        # Exploration-only phase.
        # Order is random considering concurrent bandit scenarios.
        arms = np.array(self.arms)
        for arm in self.rng.choice(arms, len(arms), replace=False):
            if self.arm_num_observations[arm] < self.num_exploration:
                if self.verbose:
                    print(f"[{self.name}] Explore arm {arm}.")
                return arm

        # Thompson Sampling phase.
        expectations = self.predict_expectations()
        if self.verbose:
            print(f"[{self.name}] Sampled mean rewards:")
            for arm, sample in expectations.items():
                print(
                    f"[{self.name}] Arm {arm:4d}: mu ~ N({self.arm_param_mean[arm]:.2f}, "
                    f"{1/self.arm_param_prec[arm]:.2f}) -> {sample:.2f}"
                )
        return max(expectations, key=expectations.get)  # type: ignore

    def predict_expectations(self) -> dict[int, float]:
        """Sample the expected reward for each arm.

        Assumes that each arm has been explored at least once. Otherwise,
        a value will be sampled from the prior.

        Returns:
            A mapping from every arm to its sampled expected reward.
        """
        expectations = {}
        for arm in self.arms:
            mean = self.arm_param_mean[arm]
            prec = self.arm_param_prec[arm]
            if prec == self.prior_prec:
                warnings.warn(
                    f"predict_expectations called when arm '{arm}' is cold.",
                    stacklevel=1,
                )
            expectations[arm] = self.rng.normal(mean, np.sqrt(np.reciprocal(prec)))
        return expectations

__init__

__init__(
    arms,
    reward_precision,
    prior_mean=0.0,
    prior_precision=0.0,
    num_exploration=1,
    seed=123456,
    verbose=True,
)

Parameters:

arms (list[int], required):
    Bandit arm values to use.
reward_precision (list[float] | float, required):
    Precision (inverse variance) of the reward distribution. Pass in a list of floats to set the reward precision differently for each arm.
prior_mean (float, default 0.0):
    Mean of the belief prior distribution.
prior_precision (float, default 0.0):
    Precision of the belief prior distribution.
num_exploration (int, default 1):
    How many static explorations to run when no observations are available.
seed (int, default 123456):
    The random seed to use.
verbose (bool, default True):
    Whether to print out what's going on.

Source code in zeus/policy/mab.py
def __init__(
    self,
    arms: list[int],
    reward_precision: list[float] | float,
    prior_mean: float = 0.0,
    prior_precision: float = 0.0,
    num_exploration: int = 1,
    seed: int = 123456,
    verbose: bool = True,
) -> None:
    """Initialze the object.

    Args:
        arms: Bandit arm values to use.
        reward_precision: Precision (inverse variance) of the reward distribution.
            Pass in a list of `float`s to set the reward precision differently for
            each arm.
        prior_mean: Mean of the belief prior distribution.
        prior_precision: Precision of the belief prior distribution.
        num_exploration: How many static explorations to run when no observations
            are available.
        seed: The random seed to use.
        verbose: Whether to print out what's going on.
    """
    self.name = "GaussianTS"

    self.arms = arms
    self.prior_mean = prior_mean
    self.prior_prec = prior_precision
    self.num_exploration = num_exploration
    self.seed = seed
    self.rng = np.random.default_rng(seed)
    self.verbose = verbose

    # Set the precision of the reward distribution of each arm.
    if isinstance(reward_precision, list):
        self.arm_reward_prec = dict(zip(arms, reward_precision))
    else:
        self.arm_reward_prec = {arm: reward_precision for arm in arms}

    # Initialize the parameter distribution with the prior parameters.
    self.arm_param_mean = dict.fromkeys(arms, prior_mean)
    self.arm_param_prec = dict.fromkeys(arms, prior_precision)

    # Track how many times an arm reward has been observed.
    self.arm_num_observations = dict.fromkeys(arms, 0)
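
A small construction sketch (arm values and precisions are illustrative): reward_precision may be a single float shared by every arm or a list with one entry per arm, in the same order as arms. As the fit_arm source further down notes, np.inf marks an arm whose reward is treated as noise-free.

import numpy as np

from zeus.policy.mab import GaussianTS

# One precision shared by all arms.
shared = GaussianTS(arms=[100, 200, 300], reward_precision=4.0)

# One precision per arm, matched to `arms` by position; np.inf makes the
# posterior of that arm collapse onto its observed mean reward.
per_arm = GaussianTS(
    arms=[100, 200, 300],
    reward_precision=[4.0, 1.0, np.inf],
    prior_mean=0.0,
    prior_precision=0.0,
)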

fit

fit(decisions, rewards, reset)

Fit the bandit on the given list of observations.

Parameters:

decisions (list[int] | ndarray, required):
    A list of arms chosen.
rewards (list[float] | ndarray, required):
    A list of rewards that resulted from choosing the arms in decisions.
reset (bool, required):
    Whether to reset all arms.

Source code in zeus/policy/mab.py
def fit(
    self,
    decisions: list[int] | np.ndarray,
    rewards: list[float] | np.ndarray,
    reset: bool,
) -> None:
    """Fit the bandit on the given list of observations.

    Args:
        decisions: A list of arms chosen.
        rewards: A list of rewards that resulted from choosing the arms in `decisions`.
        reset: Whether to reset all arms.
    """
    decisions_arr = np.array(decisions)
    rewards_arr = np.array(rewards)

    # Fit all arms.
    for arm in self.arms:
        self.fit_arm(arm, rewards_arr[decisions_arr == arm], reset)
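
A short sketch of the reset flag (arm values and rewards are illustrative): with reset=True every arm is returned to its prior before the batch is applied, so the bandit is fit on exactly the observations passed in; with reset=False the batch is folded into whatever posterior each arm already carries.

from zeus.policy.mab import GaussianTS

policy = GaussianTS(arms=[0, 1], reward_precision=1.0, prior_precision=1.0, verbose=False)

# Refit from scratch on this batch only.
policy.fit(decisions=[0, 0, 1], rewards=[0.5, 0.7, 0.2], reset=True)

# Incrementally fold a new batch into the existing posteriors.
policy.fit(decisions=[1, 1], rewards=[0.3, 0.1], reset=False)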

fit_arm

fit_arm(arm, rewards, reset)

Update the parameter distribution for one arm.

Reference: https://en.wikipedia.org/wiki/Conjugate_prior

Parameters:

arm (int, required):
    Arm to fit.
rewards (ndarray, required):
    Array of rewards observed by pulling that arm.
reset (bool, required):
    Whether to reset the parameters of the arm before fitting.

Source code in zeus/policy/mab.py
def fit_arm(self, arm: int, rewards: np.ndarray, reset: bool) -> None:
    """Update the parameter distribution for one arm.

    Reference: <https://en.wikipedia.org/wiki/Conjugate_prior>

    Args:
        arm: Arm to fit.
        rewards: Array of rewards observed by pulling that arm.
        reset: Whether to reset the parameters of the arm before fitting.
    """
    if reset:
        self.arm_param_mean[arm] = self.prior_mean
        self.arm_param_prec[arm] = self.prior_prec
        self.arm_num_observations[arm] = 0

    if len(rewards) == 0:
        return

    # Read previous state.
    reward_prec = self.arm_reward_prec[arm]
    mean = self.arm_param_mean[arm]
    prec = self.arm_param_prec[arm]

    # Compute the parameters of the posterior distribution.
    # The reward distribution's precision is given as infinite only when we
    # have exactly one observation for the arm, s.t. sampling yields that
    # exact observation.
    if reward_prec == np.inf:
        new_prec = np.inf
        new_mean = rewards.mean()
    else:
        new_prec = prec + len(rewards) * reward_prec
        new_mean = (prec * mean + reward_prec * rewards.sum()) / new_prec

    # Update state.
    self.arm_param_mean[arm] = new_mean
    self.arm_param_prec[arm] = new_prec
    self.arm_num_observations[arm] += len(rewards)
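
In symbols, the finite-precision branch above is the standard conjugate update for a Gaussian likelihood with known precision $\tau$: starting from a belief $\mathcal{N}(\mu_0, 1/\lambda_0)$ over the arm's mean reward and observing rewards $r_1, \dots, r_n$,

$$
\lambda_n = \lambda_0 + n\tau,
\qquad
\mu_n = \frac{\lambda_0 \mu_0 + \tau \sum_{i=1}^{n} r_i}{\lambda_n},
$$

which is exactly new_prec and new_mean in the code. The reward_prec == np.inf branch is the limit $\tau \to \infty$, where the posterior collapses onto the sample mean of the observed rewards.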

predict

predict()

Return the arm with the largest sampled expected reward.

Source code in zeus/policy/mab.py
def predict(self) -> int:
    """Return the arm with the largest sampled expected reward."""
    # Exploration-only phase.
    # Order is random considering concurrent bandit scenarios.
    arms = np.array(self.arms)
    for arm in self.rng.choice(arms, len(arms), replace=False):
        if self.arm_num_observations[arm] < self.num_exploration:
            if self.verbose:
                print(f"[{self.name}] Explore arm {arm}.")
            return arm

    # Thompson Sampling phase.
    expectations = self.predict_expectations()
    if self.verbose:
        print(f"[{self.name}] Sampled mean rewards:")
        for arm, sample in expectations.items():
            print(
                f"[{self.name}] Arm {arm:4d}: mu ~ N({self.arm_param_mean[arm]:.2f}, "
                f"{1/self.arm_param_prec[arm]:.2f}) -> {sample:.2f}"
            )
    return max(expectations, key=expectations.get)  # type: ignore
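
Once every arm has at least num_exploration observations, the selection rule above is standard Thompson Sampling: draw one sample from each arm's posterior over its mean reward and pull the arm whose sample is largest,

$$
\hat{\mu}_a \sim \mathcal{N}\!\left(\mu_a, \tfrac{1}{\lambda_a}\right) \quad \text{for each arm } a,
\qquad
a^\star = \arg\max_a \hat{\mu}_a .
$$

The sampling itself is delegated to predict_expectations below.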

predict_expectations

predict_expectations()

Sample the expected reward for each arm.

Assumes that each arm has been explored at least once. Otherwise, a value will be sampled from the prior.

Returns:

dict[int, float]:
    A mapping from every arm to its sampled expected reward.

Source code in zeus/policy/mab.py
def predict_expectations(self) -> dict[int, float]:
    """Sample the expected reward for each arm.

    Assumes that each arm has been explored at least once. Otherwise,
    a value will be sampled from the prior.

    Returns:
        A mapping from every arm to its sampled expected reward.
    """
    expectations = {}
    for arm in self.arms:
        mean = self.arm_param_mean[arm]
        prec = self.arm_param_prec[arm]
        if prec == self.prior_prec:
            warnings.warn(
                f"predict_expectations called when arm '{arm}' is cold.",
                stacklevel=1,
            )
        expectations[arm] = self.rng.normal(mean, np.sqrt(np.reciprocal(prec)))
    return expectations
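
A small sketch of the cold-arm warning (arm values, rewards, and the nonzero prior precision are illustrative): an arm whose posterior precision still equals the prior precision has never been fit, so predict_expectations warns about it and samples it from the prior.

import warnings

from zeus.policy.mab import GaussianTS

# A proper (nonzero-precision) prior so sampling a cold arm is well defined.
policy = GaussianTS(arms=[0, 1], reward_precision=1.0, prior_precision=0.5, verbose=False)

# Only arm 0 is fit; arm 1 stays at its prior.
policy.fit(decisions=[0, 0], rewards=[1.0, 1.2], reset=True)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    samples = policy.predict_expectations()

# One warning (for arm 1); `samples` still maps every arm to a drawn value.
print(len(caught), sorted(samples))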