From b890c82237d6932f9285a901125299ce45458391 Mon Sep 17 00:00:00 2001 From: usaito Date: Sat, 3 Dec 2022 04:14:21 -0500 Subject: [PATCH 1/2] add gaussian noise option to SyntheticBanditDataset --- obp/dataset/synthetic.py | 52 +++++++++++++++++++++++----------- obp/dataset/synthetic_multi.py | 18 ++++++++---- 2 files changed, 48 insertions(+), 22 deletions(-) diff --git a/obp/dataset/synthetic.py b/obp/dataset/synthetic.py index f1f02327..2434eb7d 100644 --- a/obp/dataset/synthetic.py +++ b/obp/dataset/synthetic.py @@ -77,6 +77,11 @@ class SyntheticBanditDataset(BaseBanditDataset): A larger value leads to a noisier reward distribution. This argument is valid only when `reward_type="continuous"`. + reward_noise_distribution: str, default='normal' + Distribution from which the noise added to the reward is sampled; must be either 'normal' or 'truncated_normal'. + If 'truncated_normal' is given, no negative reward is realized in the logged dataset. + This argument is valid only when `reward_type="continuous"`. + action_context: np.ndarray, default=None Vector representation of (discrete) actions. If None, one-hot representation will be used. 
@@ -177,6 +182,7 @@ class SyntheticBanditDataset(BaseBanditDataset): reward_type: str = RewardType.BINARY.value reward_function: Optional[Callable[[np.ndarray, np.ndarray], np.ndarray]] = None reward_std: float = 1.0 + reward_noise_distribution: str = "normal" action_context: Optional[np.ndarray] = None behavior_policy_function: Optional[ Callable[[np.ndarray, np.ndarray], np.ndarray] @@ -211,6 +217,12 @@ def __post_init__(self) -> None: f"`reward_type` must be either '{RewardType.BINARY.value}' or '{RewardType.CONTINUOUS.value}'," f"but {self.reward_type} is given.'" ) + if self.reward_noise_distribution not in ["normal", "truncated_normal"]: + raise ValueError( + f"`reward_noise_distribution` must be either 'normal' or 'truncated_normal'," + f"but {self.reward_noise_distribution} is given.'" + ) + check_scalar(self.reward_std, "reward_std", (int, float), min_val=0) if self.reward_function is None: self.expected_reward = self.sample_contextfree_expected_reward() @@ -263,16 +275,23 @@ def sample_reward_given_expected_reward( if RewardType(self.reward_type) == RewardType.BINARY: reward = self.random_.binomial(n=1, p=expected_reward_factual) elif RewardType(self.reward_type) == RewardType.CONTINUOUS: - mean = expected_reward_factual - a = (self.reward_min - mean) / self.reward_std - b = (self.reward_max - mean) / self.reward_std - reward = truncnorm.rvs( - a=a, - b=b, - loc=mean, - scale=self.reward_std, - random_state=self.random_state, - ) + if self.reward_noise_distribution == "normal": + reward = self.random_.normal( + loc=expected_reward_factual, + scale=self.reward_std, + size=action.shape, + ) + elif self.reward_noise_distribution == "truncated_normal": + mean = expected_reward_factual + a = (self.reward_min - mean) / self.reward_std + b = (self.reward_max - mean) / self.reward_std + reward = truncnorm.rvs( + a=a, + b=b, + loc=mean, + scale=self.reward_std, + random_state=self.random_state, + ) else: raise NotImplementedError @@ -329,12 +348,13 @@ def 
obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback: expected_reward_ = self.calc_expected_reward(contexts) if RewardType(self.reward_type) == RewardType.CONTINUOUS: # correct expected_reward_, as we use truncated normal distribution here - mean = expected_reward_ - a = (self.reward_min - mean) / self.reward_std - b = (self.reward_max - mean) / self.reward_std - expected_reward_ = truncnorm.stats( - a=a, b=b, loc=mean, scale=self.reward_std, moments="m" - ) + if self.reward_noise_distribution == "truncated_normal": + mean = expected_reward_ + a = (self.reward_min - mean) / self.reward_std + b = (self.reward_max - mean) / self.reward_std + expected_reward_ = truncnorm.stats( + a=a, b=b, loc=mean, scale=self.reward_std, moments="m" + ) # calculate the action choice probabilities of the behavior policy if self.behavior_policy_function is None: diff --git a/obp/dataset/synthetic_multi.py b/obp/dataset/synthetic_multi.py index 1bf3f8a1..5b3697aa 100644 --- a/obp/dataset/synthetic_multi.py +++ b/obp/dataset/synthetic_multi.py @@ -74,6 +74,11 @@ class SyntheticMultiLoggersBanditDataset(SyntheticBanditDataset): A larger value leads to a noisier reward distribution. This argument is valid only when `reward_type="continuous"`. + reward_noise_distribution: str, default='normal' + Distribution from which the noise added to the reward is sampled; must be either 'normal' or 'truncated_normal'. + If 'truncated_normal' is given, no negative reward is realized in the logged dataset. + This argument is valid only when `reward_type="continuous"`. + action_context: np.ndarray, default=None Vector representation of (discrete) actions. If None, one-hot representation will be used. 
@@ -272,12 +277,13 @@ def obtain_batch_bandit_feedback(self, n_rounds: int) -> BanditFeedback: expected_reward_ = self.calc_expected_reward(contexts) if RewardType(self.reward_type) == RewardType.CONTINUOUS: # correct expected_reward_, as we use truncated normal distribution here - mean = expected_reward_ - a = (self.reward_min - mean) / self.reward_std - b = (self.reward_max - mean) / self.reward_std - expected_reward_ = truncnorm.stats( - a=a, b=b, loc=mean, scale=self.reward_std, moments="m" - ) + if self.reward_noise_distribution == "truncated_normal": + mean = expected_reward_ + a = (self.reward_min - mean) / self.reward_std + b = (self.reward_max - mean) / self.reward_std + expected_reward_ = truncnorm.stats( + a=a, b=b, loc=mean, scale=self.reward_std, moments="m" + ) # calculate the action choice probabilities of the behavior policy pi_b_logits = expected_reward_ From 204777c83c8d9bf7194a6e97408ee4cc6b17747f Mon Sep 17 00:00:00 2001 From: usaito Date: Sat, 3 Dec 2022 04:14:49 -0500 Subject: [PATCH 2/2] fix a bug in QLearner when importance weighting is applied --- obp/policy/offline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/obp/policy/offline.py b/obp/policy/offline.py index 21041acf..b0d0ed04 100644 --- a/obp/policy/offline.py +++ b/obp/policy/offline.py @@ -441,6 +441,7 @@ def fit( raise ValueError("When `self.len_list > 1`, `position` must be given.") unif_action_dist = np.ones((context.shape[0], self.n_actions, self.len_list)) + unif_action_dist /= self.n_actions self.q_estimator.fit( context=context, action=action,