Jiayu generate data #118 (Draft)

Wants to merge 26 commits into base: jiayu_working_branch_for_initial_eval

Commits (26)
6bafe5c
small size experimental design.
jiayu-google Jun 16, 2022
0ddfb8a
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 16, 2022
141fc6c
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 16, 2022
2d85e52
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 16, 2022
4f1a0c8
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 16, 2022
246d10c
update small size test design.
jiayu-google Jun 16, 2022
ff8c728
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 16, 2022
3199325
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 16, 2022
ab0a8fc
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 17, 2022
ae55f42
bug free!
jiayu-google Jun 17, 2022
adf86f1
Merge remote-tracking branch 'origin/jiayu_working_branch_for_initial…
jiayu-google Jun 17, 2022
70ecf7d
No longer forces single pub reach agreement, to accelerate the evalua…
jiayu-google Jun 19, 2022
73fd080
Simplify the experimental design to reduce running time.
jiayu-google Jun 20, 2022
7cb200b
Simplify the experimental design to reduce running time.
jiayu-google Jun 20, 2022
c696b65
Round impressions to int.
jiayu-google Jun 22, 2022
cd25538
Add max_frequency = 10 to m3_strategy
jiayu-google Jul 8, 2022
2e4f4e8
add single pub points to dm
jiayu-google Jul 8, 2022
cc0efcf
Correctly handle the single pub training points for curve and surface…
jiayu-google Jul 8, 2022
ee33942
Add a round for impression.
jiayu-google Jul 11, 2022
68424bc
Add more metrics for single pub agreement.
jiayu-google Jul 11, 2022
abb261a
cap the frequency hist by 0.
jiayu-google Jul 12, 2022
1113ec9
Random seed for random dataset name.
jiayu-google Jul 13, 2022
011c1b2
Random seed for random dataset name.
jiayu-google Jul 13, 2022
851c70e
Truncate long dataset name so that id is kept.
jiayu-google Jul 13, 2022
2a741ca
1. Fixed trial name bug. 2. Black everything.
jiayu-google Jul 14, 2022
c94b495
also updated june 2022 design
jiayu-google Jul 14, 2022
8 changes: 4 additions & 4 deletions requirements.txt
@@ -4,12 +4,12 @@ absl-py==0.12.0
 pathos>=0.2.7
 pyfarmhash==0.2.2
 matplotlib>=3.0.3
-numpy==1.20.2
+numpy>=1.20.2
 pyDOE==0.3.8
-scipy==1.6.2
+scipy>=1.6.2
 seaborn==0.9.0
-lxml==4.5.2
-cvxopt==1.2.6
+lxml>=4.5.2
+cvxopt>=1.2.6
 cvxpy>=1.1.18
 dp-accounting==0.0.1
 tqdm>=4.47.0
8 changes: 4 additions & 4 deletions setup.py
@@ -35,11 +35,11 @@
     "absl-py==0.12.0",
     "pathos>=0.2.7",
     "pyfarmhash==0.2.2",
-    "numpy==1.20.2",
+    "numpy>=1.20.2",
     "pyDOE==0.3.8",
-    "scipy==1.6.2",
-    "lxml==4.5.2",
-    "cvxopt==1.2.6",
+    "scipy>=1.6.2",
+    "lxml>=4.5.2",
+    "cvxopt>=1.2.6",
     "cvxpy>=1.1.18",
     "dp-accounting==0.0.1",
     "pandas>=1.2.5",
2 changes: 1 addition & 1 deletion src/data_generators/data_design.py
@@ -87,4 +87,4 @@ def add(self, data_set: DataSet) -> None:
             )
         )
         data_set.write_data_set(self._dirpath, filesystem=self._filesystem)
-        self._data_set_names.add(data_set.name)
\ No newline at end of file
+        self._data_set_names.add(data_set.name)
3 changes: 2 additions & 1 deletion src/data_generators/data_set.py
@@ -241,7 +241,8 @@ def write_data_set(
         dataset_dir = self._name
         ##chenwei##
         if len(dataset_dir) > 255:
-            new_dataset_dir = dataset_dir[:245] + str(np.random.randint(1e10))
+            id = self._name.split("id=")[1].split(",")[0]
+            new_dataset_dir = dataset_dir[:245] + f"...,id={id}"
         else:
             new_dataset_dir = dataset_dir
         full_dir_path = filesystem.joinpath(parent_dir, new_dataset_dir)
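Note: the snippet below is a standalone sketch (not part of this PR) of the renaming logic above. It replaces the old random suffix with a deterministic one, assuming the dataset name embeds an `id=<n>` token followed by a comma; if the token were missing, the `split` would raise an IndexError. The function name is hypothetical.

```python
# Minimal sketch of the id-preserving truncation above; name/signature are
# illustrative, not from the repo.
def truncate_dataset_dir(name: str, max_len: int = 255) -> str:
    """Shortens an over-long dataset directory name, keeping its embedded id."""
    if len(name) <= max_len:
        return name
    # Assumes the name embeds a token like "id=42" followed by a comma.
    dataset_id = name.split("id=")[1].split(",")[0]
    return name[:245] + f"...,id={dataset_id}"

print(truncate_dataset_dir("x" * 300 + ",id=42,extra"))  # ends with "...,id=42"
```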
9 changes: 5 additions & 4 deletions src/data_generators/m3_data_design.py
@@ -51,7 +51,6 @@
 )


-
 # Number of samples that will be taken in the latin hypercube design
 NUM_SAMPLES_FOR_LHS = 200

@@ -305,16 +304,18 @@
     "overlap_generator_params": OVERLAP_GENERATORS,
 }

+
 def generate_data_design_config(
     random_generator: np.random.Generator,
 ) -> Iterable[DataSetParameters]:
     """Generates the data design configuration for evaluating M3 strategy."""
     keys = LEVELS.keys()
     levels = [len(LEVELS[k]) for k in keys]
-    for i, sample in enumerate(
-        lhs(n=len(levels), samples=NUM_SAMPLES_FOR_LHS, criterion="maximin")
+    ids = [16, 41, 151, 103, 69, 98, 50, 91, 73]
+    for id, sample in zip(
+        ids, lhs(n=len(levels), samples=NUM_SAMPLES_FOR_LHS, criterion="maximin")[ids]
     ):
-        design_parameters = {"id": str(i)}
+        design_parameters = {"id": str(id)}
         for key, level in zip(keys, sample):
             design_parameters[key] = LEVELS[key][int(level * len(LEVELS[key]))]
         # Specify the universe size for some datasets
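For context, a toy sketch of the Latin-hypercube sampling pattern above. `pyDOE.lhs` returns a `samples x n` array of coordinates in [0, 1), and each coordinate is scaled to index one discrete level. The `LEVELS` dict and its keys here are made-up stand-ins (the real ones live in m3_data_design.py), and `id` shadowing the builtin mirrors the PR's own style.

```python
# Toy version of the design sampling above; LEVELS and the id list are
# illustrative stand-ins.
from pyDOE import lhs

LEVELS = {"largest_publisher_size": [1000, 10000], "num_publishers": [2, 5, 10]}

samples = lhs(n=len(LEVELS), samples=200, criterion="maximin")  # 200 x 2 in [0, 1)
ids = [16, 41, 151]  # evaluate only hand-picked rows of the design, as in the PR
for id, sample in zip(ids, samples[ids]):
    params = {"id": str(id)}
    for key, level in zip(LEVELS.keys(), sample):
        # Each LHS coordinate in [0, 1) indexes one of the discrete levels.
        params[key] = LEVELS[key][int(level * len(LEVELS[key]))]
    print(params)
```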
8 changes: 1 addition & 7 deletions src/data_generators/single_publisher_design.py
@@ -47,7 +47,6 @@
     GeneratorParameters(
         "Homogeneous", HomogeneousImpressionGenerator, {"poisson_lambda": 0.5}
     ),
-
     # Exponential Poisson: Mean = 1.5, Var = 0.75
     # For the Exponential-Poisson, the mean is beta + 1 and the variance is
     # beta * (beta + 1), where beta is the "gamma_scale" parameter. This can
@@ -58,26 +57,23 @@
         HeterogeneousImpressionGenerator,
         {"gamma_shape": 1.0, "gamma_scale": 0.5},
     ),
-
     # Gamma Poisson: Mean = 1.5, Var = 6
     # For the shifted Gamma-Poisson, the mean is alpha * beta + 1, and the
     # variance is alpha * beta * (beta + 1), where alpha = gamma_shape and
     # beta = gamma_scale. This can be worked out by making
     # use of the equivalence between the Gamma-Poisson and the negative
     # binomial distribution. Using the formulation for the negative binomial
     # given in Wikipedia, the equivalent negative binomial distribution is
-    # obtained by setting p = beta / (1 + beta) and r = alpha. 
+    # obtained by setting p = beta / (1 + beta) and r = alpha.
     GeneratorParameters(
         "Heterogeneous",
         HeterogeneousImpressionGenerator,
         {"gamma_shape": 0.04545, "gamma_scale": 11},
     ),
-
     # Zeta: Mean = 1.5, Var = infinity
     GeneratorParameters(
         "HeavyTailed", HeavyTailedImpressionGenerator, {"zeta_s": 2.8106}
     ),
-
     ## Mean 3
     # Shifted Poisson: Mean = 3, Var = 2
     GeneratorParameters(
@@ -99,7 +95,6 @@
     GeneratorParameters(
         "HeavyTailed", HeavyTailedImpressionGenerator, {"zeta_s": 2.2662}
     ),
-
     ## Mean 5
     # Shifted Poisson: Mean = 5, Var = 4
     GeneratorParameters(
@@ -121,7 +116,6 @@
     GeneratorParameters(
         "HeavyTailed", HeavyTailedImpressionGenerator, {"zeta_s": 2.1416}
     ),
-
     ## Mean 10
     # Shifted Poisson: Mean = 10, Var = 9
     GeneratorParameters(
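The moment formulas quoted in the comments above can be checked numerically. The sketch below (not from the repo) simulates the shifted Gamma-Poisson for the `{"gamma_shape": 0.04545, "gamma_scale": 11}` configuration; the sample moments should land near the advertised Mean = 1.5, Var = 6.

```python
# Quick numerical check of: mean = alpha * beta + 1 and
# variance = alpha * beta * (beta + 1), for alpha = gamma_shape,
# beta = gamma_scale.
import numpy as np

rng = np.random.default_rng(0)
alpha, beta = 0.04545, 11.0  # the "Mean = 1.5, Var = 6" configuration

# Shifted Gamma-Poisson: draw a Poisson rate from Gamma(alpha, beta), add 1.
rates = rng.gamma(shape=alpha, scale=beta, size=1_000_000)
freq = rng.poisson(rates) + 1

print(freq.mean())  # ~ alpha * beta + 1 = 1.49995
print(freq.var())   # ~ alpha * beta * (beta + 1) = 5.9994
```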
10 changes: 5 additions & 5 deletions src/data_generators/tests/copula_data_set_test.py
@@ -176,7 +176,7 @@ def test_uncorrelated_copula_with_more_than_two_pubs(self):
         # Suppose there are 100 * 2^p users in total, where p = #pubs.
         # At each pub, half users have frequency 1, and the other half
         # have frequency 2.
-        half_size = int(100 * 2 ** num_pubs / 2)
+        half_size = int(100 * 2**num_pubs / 2)
         impressions = (
             list(range(half_size)) * 1
             + list(range(half_size, half_size * 2)) * 2
@@ -185,7 +185,7 @@
         dataset = CopulaDataSet(
             unlabeled_publisher_data_list=[pdf] * num_pubs,
             copula_generator=gen,
-            universe_size=100 * 2 ** num_pubs,
+            universe_size=100 * 2**num_pubs,
             random_generator=np.random.default_rng(0),
         )
         res = dataset.frequency_vectors_sampled_distribution
@@ -205,7 +205,7 @@ def test_fully_positively_correlated_copula_with_more_than_two_pubs(self):
         dataset = CopulaDataSet(
             unlabeled_publisher_data_list=[pdf] * num_pubs,
             copula_generator=gen,
-            universe_size=100 * 2 ** num_pubs,
+            universe_size=100 * 2**num_pubs,
             random_generator=np.random.default_rng(0),
         )
         res = dataset.frequency_vectors_sampled_distribution
@@ -289,9 +289,9 @@ def to_polar_coordinates(
     - y = r sin(phi) sin(theta)
     - z = r cos(theta).
     """
-    r = np.sqrt(x ** 2 + y ** 2 + z ** 2)
+    r = np.sqrt(x**2 + y**2 + z**2)
     phi = np.arctan2(y, x)
-    theta = np.arctan2(np.sqrt(x ** 2 + y ** 2), z)
+    theta = np.arctan2(np.sqrt(x**2 + y**2), z)
     return (r, phi, theta)

 correlation_tuples = [
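A quick sanity check (not part of the PR) of the polar-coordinate formulas in the docstring above, using the convention z = r cos(theta); converting a point and round-tripping back to Cartesian recovers the inputs.

```python
# Round-trip check of the to_polar_coordinates formulas.
import numpy as np

def to_polar_coordinates(x, y, z):
    r = np.sqrt(x**2 + y**2 + z**2)
    phi = np.arctan2(y, x)
    theta = np.arctan2(np.sqrt(x**2 + y**2), z)
    return r, phi, theta

r, phi, theta = to_polar_coordinates(1.0, 1.0, np.sqrt(2.0))
print(r * np.sin(theta) * np.cos(phi))  # ~1.0
print(r * np.sin(theta) * np.sin(phi))  # ~1.0
print(r * np.cos(theta))                # ~sqrt(2)
```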
4 changes: 3 additions & 1 deletion src/driver/experiment_driver.py
@@ -193,7 +193,9 @@ def execute(
             use_apache_beam=use_apache_beam,
             pipeline_options=pipeline_options,
         )
-        filesystem.write_text(self._output_file, result.to_csv(na_rep="NaN", index=False))
+        filesystem.write_text(
+            self._output_file, result.to_csv(na_rep="NaN", index=False)
+        )

         return result
2 changes: 1 addition & 1 deletion src/driver/experiment_parameters.py
@@ -82,7 +82,7 @@ def generate_test_points(
                 "Invalid test point strategy: {}".format(self.test_point_strategy)
             )
         test_point_generator = TEST_POINT_STRATEGIES[self.test_point_strategy](
-            data_set, rng, **self.test_point_strategy_kwargs
+            dataset=data_set, rng=rng, **self.test_point_strategy_kwargs
         )

         return test_point_generator.test_points()
80 changes: 55 additions & 25 deletions src/driver/experimental_trial.py
@@ -111,6 +111,7 @@ def __init__(
         self._data_design = data_design
         self._data_set_name = data_set_name
         self._trial_descriptor = trial_descriptor
+        print("\n\n\n======", self._trial_descriptor, "====\n\n\n")
         self._analysis_type = analysis_type

@@ -190,28 +191,46 @@ def evaluate(
                 dataset, rng
             )
         )
-        true_reach = [
-            halo.true_reach_by_spend(
-                t, self._trial_descriptor.experiment_params.max_frequency
-            )
-            for t in test_points
-        ]
-        fitted_reach = [
-            reach_surface.by_spend(
-                t, self._trial_descriptor.experiment_params.max_frequency
-            )
-            for t in test_points
-        ]
-        metrics = aggregate(true_reach, fitted_reach)
-        if hasattr(reach_surface, "evaluate_single_pub_kplus_reach_agreement"):
-            metrics["single_pub_kplus_reach_agreement"] = [
-                reach_surface.evaluate_single_pub_kplus_reach_agreement(
-                    scaling_factor_choices=[0.5, 0.75, 1, 1.5, 2],
-                    max_frequency=max_frequency,
-                )
-            ]
-        else:
-            metrics["single_pub_kplus_reach_agreement"] = [{}]
+        # print(
+        #     '\n',
+        #     'Test points',
+        #     self._trial_descriptor.experiment_params.test_point_strategy,
+        #     self._trial_descriptor.experiment_params.test_point_strategy_kwargs,
+        #     '\n',
+        #     test_points,
+        #     '\n',
+        #     modeling_strategy._multi_pub_model,
+        #     '\n',
+        # )
+        if len(test_points) == 0:
+            true_reach, fitted_reach = [], []
+            metrics = aggregate(true_reach, fitted_reach)
+            metrics["single_pub_kplus_reach_agreement"] = [{}]
+        else:
+            true_reach = [
+                halo.true_reach_by_spend(
+                    t, self._trial_descriptor.experiment_params.max_frequency
+                )
+                for t in test_points
+            ]
+            # print('True: ', true_reach, '\n')
+            fitted_reach = [
+                reach_surface.by_spend(
+                    t, self._trial_descriptor.experiment_params.max_frequency
+                )
+                for t in test_points
+            ]
+            # print('Fitted: ', fitted_reach, '\n\n')
+            metrics = aggregate(true_reach, fitted_reach)
+            if hasattr(reach_surface, "evaluate_single_pub_kplus_reach_agreement"):
+                metrics["single_pub_kplus_reach_agreement"] = [
+                    reach_surface.evaluate_single_pub_kplus_reach_agreement(
+                        scaling_factor_choices=[0.5, 1, 2],
+                        max_frequency=max_frequency,
+                    )
+                ]
+            else:
+                metrics["single_pub_kplus_reach_agreement"] = [{}]
         if self._analysis_type == SINGLE_PUB_ANALYSIS:
             single_publisher_dataframe = (
                 self._compute_single_publisher_fractions_dataframe(
@@ -246,6 +265,15 @@ def evaluate(
             ],
             axis=1,
         )
+        print(
+            "\n\n\n\n",
+            result["multi_pub_model"],
+            "\n",
+            result["privacy_budget_epsilon"],
+            "\n",
+            trial_results_path,
+            "\n\n\n\n",
+        )
         filesystem.mkdir(
             filesystem.parent(trial_results_path), parents=True, exist_ok=True
         )
@@ -259,14 +287,16 @@
     def _compute_trial_results_path(self) -> str:
         """Returns path of file where the results of this trial are stored."""
         dir_name = self._experiment_dir
-        if len(dir_name) > 255:
-            dir_name = dir_name[:241] + str(np.random.randint(1e10))
         dataset_name = self._data_set_name
         if len(dataset_name) > 255:
             dataset_name = dataset_name[:241] + str(np.random.randint(1e10))
         descriptor_name = f"{self._trial_descriptor}"
         if len(descriptor_name) > 255:
-            descriptor_name = descriptor_name[:241] + str(np.random.randint(1e10))
+            parsed = int(descriptor_name.split(",id=")[1])
+            if parsed == -1:
+                id = hashlib.md5(descriptor_name.encode()).hexdigest()[:6]
+            else:
+                id = f"id={parsed}"
+            print("\n\n\n", descriptor_name, "\n\n\n")
+            descriptor_name = descriptor_name[:241] + f"...,{id}"
         name = f"{dir_name}/{dataset_name}/{descriptor_name}.csv"
         return name

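The path-shortening logic above reads as: keep the trial id when one was assigned, otherwise derive a stable six-hex-digit suffix from the descriptor itself so truncated names stay unique and deterministic. A standalone sketch with a made-up descriptor string (it assumes the descriptor ends with an `,id=<int>` token, as the PR's `int(descriptor_name.split(",id=")[1])` does):

```python
# Illustrative sketch of the id-or-md5 shortening above; the function name and
# sample descriptor are made up.
import hashlib

def shorten_descriptor(descriptor_name: str) -> str:
    if len(descriptor_name) <= 255:
        return descriptor_name
    parsed = int(descriptor_name.split(",id=")[1])
    if parsed == -1:  # no real id assigned; derive one from the name itself
        suffix = hashlib.md5(descriptor_name.encode()).hexdigest()[:6]
    else:
        suffix = f"id={parsed}"
    return descriptor_name[:241] + f"...,{suffix}"

print(shorten_descriptor("strategy=m3," * 30 + "id=42"))  # ends with "...,id=42"
```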
38 changes: 15 additions & 23 deletions src/driver/june_2022_experimental_design.py
@@ -50,15 +50,23 @@
         "dirac_mixture_single",
         {},
         "dirac_mixture_multi",
-        {"dilution": 0.3},
+        {
+            "dilution": 0.3,
+            "largest_pub_to_universe_ratio": 0.25,
+            "single_publisher_reach_agreement": False,
+        },
     ),
     ModelingStrategyDescriptor(
         "m3strategy",
         {"use_ground_truth_for_reach_curves": False},
         "dirac_mixture_single",
         {"dilution": 0.3, "largest_pub_to_universe_ratio": 0.25},
         "dirac_mixture_multi",
-        {"dilution": 0.3, "largest_pub_to_universe_ratio": 0.25},
+        {
+            "dilution": 0.3,
+            "largest_pub_to_universe_ratio": 0.25,
+            "single_publisher_reach_agreement": False,
+        },
     ),
     # Independent multi pub
     ModelingStrategyDescriptor(
@@ -69,14 +77,6 @@
         "independent",
         {"largest_pub_to_universe_ratio": 0.25},
     ),
-    ModelingStrategyDescriptor(
-        "m3strategy",
-        {"use_ground_truth_for_reach_curves": True},
-        "dirac_mixture_single",
-        {},
-        "independent",
-        {"largest_pub_to_universe_ratio": 0.75},
-    ),
     ModelingStrategyDescriptor(
         "m3strategy",
         {"use_ground_truth_for_reach_curves": False},
@@ -85,14 +85,6 @@
         "independent",
         {"largest_pub_to_universe_ratio": 0.25},
     ),
-    ModelingStrategyDescriptor(
-        "m3strategy",
-        {"use_ground_truth_for_reach_curves": False},
-        "dirac_mixture_single",
-        {"dilution": 0.3, "largest_pub_to_universe_ratio": 0.25},
-        "independent",
-        {"largest_pub_to_universe_ratio": 0.75},
-    ),
 ]

 CAMPAIGN_SPEND_FRACTIONS_GENERATORS = [
@@ -108,9 +100,7 @@

 PRIVACY_BUDGETS = [
     PrivacyBudget(1.0, 1e-9),
-    PrivacyBudget(0.33, 1e-9),
     PrivacyBudget(0.1, 1e-9),
-    PrivacyBudget(0.033, 1e-9),
     PrivacyBudget(0.01, 1e-9),
 ]

Expand All @@ -133,13 +123,15 @@
"max_frequencies": MAX_FREQUENCIES,
"test_point_strategies": TEST_POINT_STRATEGIES,
}
# A total of 6 * 4 * 5 * 3 * 3 = 960 configs. Will evaluate all of them
# A total of 4 * 4 * 3 * 3 * 3 = 432 configs. Will evaluate all of them
# per dataset.


def generate_experimental_design_config(seed: int = 1) -> Iterable[TrialDescriptor]:
"""Generates a list of TrialDescriptors for the 1st round eval of M3."""
for level_combination in itertools.product(*LEVELS.values()):
for id, level_combination in enumerate(itertools.product(*LEVELS.values())):
print("\n", id, "\n", level_combination)

design_parameters = dict(zip(LEVELS.keys(), level_combination))
mstrategy = design_parameters["modeling_strategies"]
sparams = SystemParameters(
Expand All @@ -159,4 +151,4 @@ def generate_experimental_design_config(seed: int = 1) -> Iterable[TrialDescript
test_point_generator,
test_point_params,
)
yield TrialDescriptor(mstrategy, sparams, eparams)
yield TrialDescriptor(mstrategy, sparams, eparams, id)