Jiayu generate data #118 (Draft)

Wants to merge 26 commits into base: jiayu_working_branch_for_initial_eval

Commits (26)
6bafe5c
small size experimental design.
jiayu-google Jun 16, 2022
0ddfb8a
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 16, 2022
141fc6c
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 16, 2022
2d85e52
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 16, 2022
4f1a0c8
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 16, 2022
246d10c
update small size test design.
jiayu-google Jun 16, 2022
ff8c728
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 16, 2022
3199325
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 16, 2022
ab0a8fc
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 17, 2022
ae55f42
bug free!
jiayu-google Jun 17, 2022
adf86f1
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 17, 2022
70ecf7d
No longer forces single pub reach agreement, to accelerate the evalua…
jiayu-google Jun 19, 2022
73fd080
Simplify the experimental design to reduce running time.
jiayu-google Jun 20, 2022
7cb200b
Simplify the experimental design to reduce running time.
jiayu-google Jun 20, 2022
c696b65
Round impressions to int.
jiayu-google Jun 22, 2022
cd25538
Add max_frequency = 10 to m3_strategy
jiayu-google Jul 8, 2022
2e4f4e8
add single pub points to dm
jiayu-google Jul 8, 2022
cc0efcf
Correctly handle the single pub training points for curve and surface…
jiayu-google Jul 8, 2022
ee33942
Add a round for impression.
jiayu-google Jul 11, 2022
68424bc
Add more metrics for single pub agreement.
jiayu-google Jul 11, 2022
abb261a
cap the frequency hist by 0.
jiayu-google Jul 12, 2022
1113ec9
Random seed for random dataset name.
jiayu-google Jul 13, 2022
011c1b2
Random seed for random dataset name.
jiayu-google Jul 13, 2022
851c70e
Truncate long dataset name so that id is kept.
jiayu-google Jul 13, 2022
2a741ca
1. Fixed trial name bug. 2. Black everything.
jiayu-google Jul 14, 2022
c94b495
also updated june 2022 design
jiayu-google Jul 14, 2022
8 changes: 4 additions & 4 deletions requirements.txt
@@ -4,12 +4,12 @@ absl-py==0.12.0
 pathos>=0.2.7
 pyfarmhash==0.2.2
 matplotlib>=3.0.3
-numpy==1.20.2
+numpy>=1.20.2
 pyDOE==0.3.8
-scipy==1.6.2
+scipy>=1.6.2
 seaborn==0.9.0
-lxml==4.5.2
-cvxopt==1.2.6
+lxml>=4.5.2
+cvxopt>=1.2.6
 cvxpy>=1.1.18
 dp-accounting==0.0.1
 tqdm>=4.47.0
8 changes: 4 additions & 4 deletions setup.py
@@ -35,11 +35,11 @@
     "absl-py==0.12.0",
     "pathos>=0.2.7",
     "pyfarmhash==0.2.2",
-    "numpy==1.20.2",
+    "numpy>=1.20.2",
     "pyDOE==0.3.8",
-    "scipy==1.6.2",
-    "lxml==4.5.2",
-    "cvxopt==1.2.6",
+    "scipy>=1.6.2",
+    "lxml>=4.5.2",
+    "cvxopt>=1.2.6",
     "cvxpy>=1.1.18",
     "dp-accounting==0.0.1",
     "pandas>=1.2.5",
2 changes: 1 addition & 1 deletion src/data_generators/data_design.py
@@ -87,4 +87,4 @@ def add(self, data_set: DataSet) -> None:
             )
         )
         data_set.write_data_set(self._dirpath, filesystem=self._filesystem)
-        self._data_set_names.add(data_set.name)
\ No newline at end of file
+        self._data_set_names.add(data_set.name)
3 changes: 2 additions & 1 deletion src/data_generators/data_set.py
@@ -241,7 +241,8 @@ def write_data_set(
         dataset_dir = self._name
         ##chenwei##
         if len(dataset_dir) > 255:
-            new_dataset_dir = dataset_dir[:245] + str(np.random.randint(1e10))
+            id = self._name.split("id=")[1].split(",")[0]
+            new_dataset_dir = dataset_dir[:245] + f"...,id={id}"
         else:
             new_dataset_dir = dataset_dir
         full_dir_path = filesystem.joinpath(parent_dir, new_dataset_dir)
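Note: the snippet below is a standalone sketch (not part of this PR) of the renaming logic above. It replaces the old random suffix with a deterministic one, assuming the dataset name embeds an `id=<n>` token followed by a comma; if the token were missing, the `split` would raise an IndexError. The function name is hypothetical.

```python
# Minimal sketch of the id-preserving truncation above; name/signature are
# illustrative, not from the repo.
def truncate_dataset_dir(name: str, max_len: int = 255) -> str:
    """Shortens an over-long dataset directory name, keeping its embedded id."""
    if len(name) <= max_len:
        return name
    # Assumes the name embeds a token like "id=42" followed by a comma.
    dataset_id = name.split("id=")[1].split(",")[0]
    return name[:245] + f"...,id={dataset_id}"

print(truncate_dataset_dir("x" * 300 + ",id=42,extra"))  # ends with "...,id=42"
```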
9 changes: 5 additions & 4 deletions src/data_generators/m3_data_design.py
@@ -51,7 +51,6 @@
 )


-
 # Number of samples that will be taken in the latin hypercube design
 NUM_SAMPLES_FOR_LHS = 200

@@ -305,16 +304,18 @@
     "overlap_generator_params": OVERLAP_GENERATORS,
 }

+
 def generate_data_design_config(
     random_generator: np.random.Generator,
 ) -> Iterable[DataSetParameters]:
     """Generates the data design configuration for evaluating M3 strategy."""
     keys = LEVELS.keys()
     levels = [len(LEVELS[k]) for k in keys]
-    for i, sample in enumerate(
-        lhs(n=len(levels), samples=NUM_SAMPLES_FOR_LHS, criterion="maximin")
+    ids = [16, 41, 151, 103, 69, 98, 50, 91, 73]
+    for id, sample in zip(
+        ids, lhs(n=len(levels), samples=NUM_SAMPLES_FOR_LHS, criterion="maximin")[ids]
     ):
-        design_parameters = {"id": str(i)}
+        design_parameters = {"id": str(id)}
         for key, level in zip(keys, sample):
             design_parameters[key] = LEVELS[key][int(level * len(LEVELS[key]))]
         # Specify the universe size for some datasets
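For context, a toy sketch of the Latin-hypercube sampling pattern above. `pyDOE.lhs` returns a `samples x n` array of coordinates in [0, 1), and each coordinate is scaled to index one discrete level. The `LEVELS` dict and its keys here are made-up stand-ins (the real ones live in m3_data_design.py), and `id` shadowing the builtin mirrors the PR's own style.

```python
# Toy version of the design sampling above; LEVELS and the id list are
# illustrative stand-ins.
from pyDOE import lhs

LEVELS = {"largest_publisher_size": [1000, 10000], "num_publishers": [2, 5, 10]}

samples = lhs(n=len(LEVELS), samples=200, criterion="maximin")  # 200 x 2 in [0, 1)
ids = [16, 41, 151]  # evaluate only hand-picked rows of the design, as in the PR
for id, sample in zip(ids, samples[ids]):
    params = {"id": str(id)}
    for key, level in zip(LEVELS.keys(), sample):
        # Each LHS coordinate in [0, 1) indexes one of the discrete levels.
        params[key] = LEVELS[key][int(level * len(LEVELS[key]))]
    print(params)
```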
8 changes: 1 addition & 7 deletions src/data_generators/single_publisher_design.py
@@ -47,7 +47,6 @@
     GeneratorParameters(
         "Homogeneous", HomogeneousImpressionGenerator, {"poisson_lambda": 0.5}
     ),
-
     # Exponential Poisson: Mean = 1.5, Var = 0.75
     # For the Exponential-Poisson, the mean is beta + 1 and the variance is
     # beta * (beta + 1), where beta is the "gamma_scale" parameter. This can
@@ -58,26 +57,23 @@
         HeterogeneousImpressionGenerator,
         {"gamma_shape": 1.0, "gamma_scale": 0.5},
     ),
-
     # Gamma Poisson: Mean = 1.5, Var = 6
     # For the shifted Gamma-Poisson, the mean is alpha * beta + 1, and the
     # variance is alpha * beta * (beta + 1), where alpha = gamma_shape and
     # beta = gamma_scale. This can be worked out by making
     # use of the equivalence between the Gamma-Poisson and the negative
     # binomial distribution. Using the formulation for the negative binomial
     # given in Wikipedia, the equivalent negative binomial distribution is
-    # obtained by setting p = beta / (1 + beta) and r = alpha. 
+    # obtained by setting p = beta / (1 + beta) and r = alpha.
     GeneratorParameters(
         "Heterogeneous",
         HeterogeneousImpressionGenerator,
         {"gamma_shape": 0.04545, "gamma_scale": 11},
     ),
-
     # Zeta: Mean = 1.5, Var = infinity
     GeneratorParameters(
         "HeavyTailed", HeavyTailedImpressionGenerator, {"zeta_s": 2.8106}
     ),
-
     ## Mean 3
     # Shifted Poisson: Mean = 3, Var = 2
     GeneratorParameters(
@@ -99,7 +95,6 @@
     GeneratorParameters(
         "HeavyTailed", HeavyTailedImpressionGenerator, {"zeta_s": 2.2662}
     ),
-
     ## Mean 5
     # Shifted Poisson: Mean = 5, Var = 4
     GeneratorParameters(
@@ -121,7 +116,6 @@
     GeneratorParameters(
         "HeavyTailed", HeavyTailedImpressionGenerator, {"zeta_s": 2.1416}
     ),
-
     ## Mean 10
     # Shifted Poisson: Mean = 10, Var = 9
     GeneratorParameters(
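The moment formulas quoted in the comments above can be checked numerically. The sketch below (not from the repo) simulates the shifted Gamma-Poisson for the `{"gamma_shape": 0.04545, "gamma_scale": 11}` configuration; the sample moments should land near the advertised Mean = 1.5, Var = 6.

```python
# Quick numerical check of: mean = alpha * beta + 1 and
# variance = alpha * beta * (beta + 1), for alpha = gamma_shape,
# beta = gamma_scale.
import numpy as np

rng = np.random.default_rng(0)
alpha, beta = 0.04545, 11.0  # the "Mean = 1.5, Var = 6" configuration

# Shifted Gamma-Poisson: draw a Poisson rate from Gamma(alpha, beta), add 1.
rates = rng.gamma(shape=alpha, scale=beta, size=1_000_000)
freq = rng.poisson(rates) + 1

print(freq.mean())  # ~ alpha * beta + 1 = 1.49995
print(freq.var())   # ~ alpha * beta * (beta + 1) = 5.9994
```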
10 changes: 5 additions & 5 deletions src/data_generators/tests/copula_data_set_test.py
@@ -176,7 +176,7 @@ def test_uncorrelated_copula_with_more_than_two_pubs(self):
         # Suppose there are 100 * 2^p users in total, where p = #pubs.
         # At each pub, half users have frequency 1, and the other half
         # have frequency 2.
-        half_size = int(100 * 2 ** num_pubs / 2)
+        half_size = int(100 * 2**num_pubs / 2)
         impressions = (
             list(range(half_size)) * 1
             + list(range(half_size, half_size * 2)) * 2
@@ -185,7 +185,7 @@
         dataset = CopulaDataSet(
             unlabeled_publisher_data_list=[pdf] * num_pubs,
             copula_generator=gen,
-            universe_size=100 * 2 ** num_pubs,
+            universe_size=100 * 2**num_pubs,
             random_generator=np.random.default_rng(0),
         )
         res = dataset.frequency_vectors_sampled_distribution
@@ -205,7 +205,7 @@ def test_fully_positively_correlated_copula_with_more_than_two_pubs(self):
         dataset = CopulaDataSet(
             unlabeled_publisher_data_list=[pdf] * num_pubs,
             copula_generator=gen,
-            universe_size=100 * 2 ** num_pubs,
+            universe_size=100 * 2**num_pubs,
             random_generator=np.random.default_rng(0),
         )
         res = dataset.frequency_vectors_sampled_distribution
@@ -289,9 +289,9 @@ def to_polar_coordinates(
     - y = r sin(phi) sin(theta)
     - z = r cos(theta).
     """
-    r = np.sqrt(x ** 2 + y ** 2 + z ** 2)
+    r = np.sqrt(x**2 + y**2 + z**2)
     phi = np.arctan2(y, x)
-    theta = np.arctan2(np.sqrt(x ** 2 + y ** 2), z)
+    theta = np.arctan2(np.sqrt(x**2 + y**2), z)
     return (r, phi, theta)

 correlation_tuples = [
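A quick sanity check (not part of the PR) of the polar-coordinate formulas in the docstring above, using the convention z = r cos(theta); converting a point and round-tripping back to Cartesian recovers the inputs.

```python
# Round-trip check of the to_polar_coordinates formulas.
import numpy as np

def to_polar_coordinates(x, y, z):
    r = np.sqrt(x**2 + y**2 + z**2)
    phi = np.arctan2(y, x)
    theta = np.arctan2(np.sqrt(x**2 + y**2), z)
    return r, phi, theta

r, phi, theta = to_polar_coordinates(1.0, 1.0, np.sqrt(2.0))
print(r * np.sin(theta) * np.cos(phi))  # ~1.0
print(r * np.sin(theta) * np.sin(phi))  # ~1.0
print(r * np.cos(theta))                # ~sqrt(2)
```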
4 changes: 3 additions & 1 deletion src/driver/experiment_driver.py
@@ -193,7 +193,9 @@ def execute(
             use_apache_beam=use_apache_beam,
             pipeline_options=pipeline_options,
         )
-        filesystem.write_text(self._output_file, result.to_csv(na_rep="NaN", index=False))
+        filesystem.write_text(
+            self._output_file, result.to_csv(na_rep="NaN", index=False)
+        )

         return result
2 changes: 1 addition & 1 deletion src/driver/experiment_parameters.py
@@ -82,7 +82,7 @@ def generate_test_points(
                 "Invalid test point strategy: {}".format(self.test_point_strategy)
             )
         test_point_generator = TEST_POINT_STRATEGIES[self.test_point_strategy](
-            data_set, rng, **self.test_point_strategy_kwargs
+            dataset=data_set, rng=rng, **self.test_point_strategy_kwargs
         )

         return test_point_generator.test_points()
80 changes: 55 additions & 25 deletions src/driver/experimental_trial.py
@@ -111,6 +111,7 @@ def __init__(
         self._data_design = data_design
         self._data_set_name = data_set_name
         self._trial_descriptor = trial_descriptor
+        print("\n\n\n======", self._trial_descriptor, "====\n\n\n")
         self._analysis_type = analysis_type

@@ -190,28 +191,46 @@ def evaluate(
                 dataset, rng
             )
         )
-        true_reach = [
-            halo.true_reach_by_spend(
-                t, self._trial_descriptor.experiment_params.max_frequency
-            )
-            for t in test_points
-        ]
-        fitted_reach = [
-            reach_surface.by_spend(
-                t, self._trial_descriptor.experiment_params.max_frequency
-            )
-            for t in test_points
-        ]
-        metrics = aggregate(true_reach, fitted_reach)
-        if hasattr(reach_surface, "evaluate_single_pub_kplus_reach_agreement"):
-            metrics["single_pub_kplus_reach_agreement"] = [
-                reach_surface.evaluate_single_pub_kplus_reach_agreement(
-                    scaling_factor_choices=[0.5, 0.75, 1, 1.5, 2],
-                    max_frequency=max_frequency,
-                )
-            ]
-        else:
-            metrics["single_pub_kplus_reach_agreement"] = [{}]
+        # print(
+        #     '\n',
+        #     'Test points',
+        #     self._trial_descriptor.experiment_params.test_point_strategy,
+        #     self._trial_descriptor.experiment_params.test_point_strategy_kwargs,
+        #     '\n',
+        #     test_points,
+        #     '\n',
+        #     modeling_strategy._multi_pub_model,
+        #     '\n',
+        # )
+        if len(test_points) == 0:
+            true_reach, fitted_reach = [], []
+            metrics = aggregate(true_reach, fitted_reach)
+            metrics["single_pub_kplus_reach_agreement"] = [{}]
+        else:
+            true_reach = [
+                halo.true_reach_by_spend(
+                    t, self._trial_descriptor.experiment_params.max_frequency
+                )
+                for t in test_points
+            ]
+            # print('True: ', true_reach, '\n')
+            fitted_reach = [
+                reach_surface.by_spend(
+                    t, self._trial_descriptor.experiment_params.max_frequency
+                )
+                for t in test_points
+            ]
+            # print('Fitted: ', fitted_reach, '\n\n')
+            metrics = aggregate(true_reach, fitted_reach)
+            if hasattr(reach_surface, "evaluate_single_pub_kplus_reach_agreement"):
+                metrics["single_pub_kplus_reach_agreement"] = [
+                    reach_surface.evaluate_single_pub_kplus_reach_agreement(
+                        scaling_factor_choices=[0.5, 1, 2],
+                        max_frequency=max_frequency,
+                    )
+                ]
+            else:
+                metrics["single_pub_kplus_reach_agreement"] = [{}]
         if self._analysis_type == SINGLE_PUB_ANALYSIS:
             single_publisher_dataframe = (
                 self._compute_single_publisher_fractions_dataframe(
@@ -246,6 +265,15 @@ def evaluate(
             ],
             axis=1,
         )
+        print(
+            "\n\n\n\n",
+            result["multi_pub_model"],
+            "\n",
+            result["privacy_budget_epsilon"],
+            "\n",
+            trial_results_path,
+            "\n\n\n\n",
+        )
         filesystem.mkdir(
             filesystem.parent(trial_results_path), parents=True, exist_ok=True
         )
@@ -259,14 +287,16 @@
     def _compute_trial_results_path(self) -> str:
         """Returns path of file where the results of this trial are stored."""
         dir_name = self._experiment_dir
-        if len(dir_name) > 255:
-            dir_name = dir_name[:241] + str(np.random.randint(1e10))
         dataset_name = self._data_set_name
         if len(dataset_name) > 255:
             dataset_name = dataset_name[:241] + str(np.random.randint(1e10))
         descriptor_name = f"{self._trial_descriptor}"
         if len(descriptor_name) > 255:
-            descriptor_name = descriptor_name[:241] + str(np.random.randint(1e10))
+            parsed = int(descriptor_name.split(",id=")[1])
+            if parsed == -1:
+                id = hashlib.md5(descriptor_name.encode()).hexdigest()[:6]
+            else:
+                id = f"id={parsed}"
+            print("\n\n\n", descriptor_name, "\n\n\n")
+            descriptor_name = descriptor_name[:241] + f"...,{id}"
         name = f"{dir_name}/{dataset_name}/{descriptor_name}.csv"
         return name

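The path-shortening logic above reads as: keep the trial id when one was assigned, otherwise derive a stable six-hex-digit suffix from the descriptor itself so truncated names stay unique and deterministic. A standalone sketch with a made-up descriptor string (it assumes the descriptor ends with an `,id=<int>` token, as the PR's `int(descriptor_name.split(",id=")[1])` does):

```python
# Illustrative sketch of the id-or-md5 shortening above; the function name and
# sample descriptor are made up.
import hashlib

def shorten_descriptor(descriptor_name: str) -> str:
    if len(descriptor_name) <= 255:
        return descriptor_name
    parsed = int(descriptor_name.split(",id=")[1])
    if parsed == -1:  # no real id assigned; derive one from the name itself
        suffix = hashlib.md5(descriptor_name.encode()).hexdigest()[:6]
    else:
        suffix = f"id={parsed}"
    return descriptor_name[:241] + f"...,{suffix}"

print(shorten_descriptor("strategy=m3," * 30 + "id=42"))  # ends with "...,id=42"
```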
38 changes: 15 additions & 23 deletions src/driver/june_2022_experimental_design.py
@@ -50,15 +50,23 @@
         "dirac_mixture_single",
         {},
         "dirac_mixture_multi",
-        {"dilution": 0.3},
+        {
+            "dilution": 0.3,
+            "largest_pub_to_universe_ratio": 0.25,
+            "single_publisher_reach_agreement": False,
+        },
     ),
     ModelingStrategyDescriptor(
         "m3strategy",
         {"use_ground_truth_for_reach_curves": False},
         "dirac_mixture_single",
         {"dilution": 0.3, "largest_pub_to_universe_ratio": 0.25},
         "dirac_mixture_multi",
-        {"dilution": 0.3, "largest_pub_to_universe_ratio": 0.25},
+        {
+            "dilution": 0.3,
+            "largest_pub_to_universe_ratio": 0.25,
+            "single_publisher_reach_agreement": False,
+        },
     ),
     # Independent multi pub
     ModelingStrategyDescriptor(
@@ -69,14 +77,6 @@
         "independent",
         {"largest_pub_to_universe_ratio": 0.25},
     ),
-    ModelingStrategyDescriptor(
-        "m3strategy",
-        {"use_ground_truth_for_reach_curves": True},
-        "dirac_mixture_single",
-        {},
-        "independent",
-        {"largest_pub_to_universe_ratio": 0.75},
-    ),
     ModelingStrategyDescriptor(
         "m3strategy",
         {"use_ground_truth_for_reach_curves": False},
@@ -85,14 +85,6 @@
         "independent",
         {"largest_pub_to_universe_ratio": 0.25},
     ),
-    ModelingStrategyDescriptor(
-        "m3strategy",
-        {"use_ground_truth_for_reach_curves": False},
-        "dirac_mixture_single",
-        {"dilution": 0.3, "largest_pub_to_universe_ratio": 0.25},
-        "independent",
-        {"largest_pub_to_universe_ratio": 0.75},
-    ),
 ]

 CAMPAIGN_SPEND_FRACTIONS_GENERATORS = [
@@ -108,9 +100,7 @@

 PRIVACY_BUDGETS = [
     PrivacyBudget(1.0, 1e-9),
-    PrivacyBudget(0.33, 1e-9),
     PrivacyBudget(0.1, 1e-9),
-    PrivacyBudget(0.033, 1e-9),
     PrivacyBudget(0.01, 1e-9),
 ]

Expand All @@ -133,13 +123,15 @@
"max_frequencies": MAX_FREQUENCIES,
"test_point_strategies": TEST_POINT_STRATEGIES,
}
# A total of 6 * 4 * 5 * 3 * 3 = 960 configs. Will evaluate all of them
# A total of 4 * 4 * 3 * 3 * 3 = 432 configs. Will evaluate all of them
# per dataset.


def generate_experimental_design_config(seed: int = 1) -> Iterable[TrialDescriptor]:
"""Generates a list of TrialDescriptors for the 1st round eval of M3."""
for level_combination in itertools.product(*LEVELS.values()):
for id, level_combination in enumerate(itertools.product(*LEVELS.values())):
print("\n", id, "\n", level_combination)

design_parameters = dict(zip(LEVELS.keys(), level_combination))
mstrategy = design_parameters["modeling_strategies"]
sparams = SystemParameters(
Expand All @@ -159,4 +151,4 @@ def generate_experimental_design_config(seed: int = 1) -> Iterable[TrialDescript
test_point_generator,
test_point_params,
)
yield TrialDescriptor(mstrategy, sparams, eparams)
yield TrialDescriptor(mstrategy, sparams, eparams, id)