Correlated rank similarity metric #59

Open · wants to merge 8 commits into develop-paper
4,001 changes: 4,001 additions & 0 deletions datasets/adult_dataset_small/adult_small.csv

Large diffs are not rendered by default.

60 changes: 60 additions & 0 deletions datasets/adult_dataset_small/adult_small.json
@@ -0,0 +1,60 @@
{
"columns": [
{
"name": "age",
"type": "DiscreteNumerical"
},
{
"name": "workclass",
"type": "Categorical"
},
{
"name": "fnlwgt",
"type": "DiscreteNumerical"
},
{
"name": "education",
"type": "Categorical"
},
{
"name": "education-num",
"type": "DiscreteNumerical"
},
{
"name": "marital-status",
"type": "Categorical"
},
{
"name": "occupation",
"type": "Categorical"
},
{
"name": "relationship",
"type": "Categorical"
},
{
"name": "sex",
"type": "Categorical"
},
{
"name": "capital-gain",
"type": "DiscreteNumerical"
},
{
"name": "capital-loss",
"type": "DiscreteNumerical"
},
{
"name": "hours",
"type": "DiscreteNumerical"
},
{
"name": "native",
"type": "Categorical"
},
{
"name": "label",
"type": "Categorical"
}
]
}
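
The sidecar JSON lists every column of the CSV with one of the pipeline's declared types (here Categorical and DiscreteNumerical). A minimal sketch of pairing such a metadata file with its data file; this reader is an illustration, not the pipeline's own loader:

import json
import pandas as pd

# Load the data file together with its metadata sidecar (paths as in this PR).
with open("datasets/adult_dataset_small/adult_small.json") as f:
    metadata = json.load(f)
df = pd.read_csv("datasets/adult_dataset_small/adult_small.csv")

# Cast declared Categorical columns; DiscreteNumerical columns stay numeric.
for col in metadata["columns"]:
    if col["type"] == "Categorical":
        df[col["name"]] = df[col["name"]].astype("category")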
1 change: 1 addition & 0 deletions env-configuration/requirements.txt
@@ -12,3 +12,4 @@ featuretools
shap
ipython
numpy>=1.20
+pulp
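
pulp is a Python linear-programming library; it is presumably required by the new correlated rank similarity metric, whose implementation does not appear in this excerpt. As an illustration of the kind of problem pulp solves, a small binary assignment LP (all names and cost values invented):

import pulp

# Cost of matching item i of one ranking to item j of another (illustrative).
cost = [[4, 1, 3], [2, 0, 5], [3, 2, 2]]
n = len(cost)

prob = pulp.LpProblem("assignment", pulp.LpMinimize)
x = [[pulp.LpVariable(f"x_{i}_{j}", cat="Binary") for j in range(n)] for i in range(n)]

# Minimise the total matching cost.
prob += pulp.lpSum(cost[i][j] * x[i][j] for i in range(n) for j in range(n))

# Each row is assigned exactly once, and each column is matched exactly once.
for i in range(n):
    prob += pulp.lpSum(x[i][j] for j in range(n)) == 1
for j in range(n):
    prob += pulp.lpSum(x[i][j] for i in range(n)) == 1

prob.solve(pulp.PULP_CBC_CMD(msg=0))
print(pulp.value(prob.objective))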
@@ -48,7 +48,7 @@ def input_json(random_state):
"LogisticRegression": {"mode": "main", "params_main": {"max_iter": 1000}}
},
},
"utility_parameters_correlations": {"enabled": True},
"utility_parameters_correlations": {"enabled": False},
"utility_parameters_feature_importance": {
"enabled": True,
"label_column": "label",
@@ -27,7 +27,7 @@ def input_json(random_state, sample_frac):
"LogisticRegression": {"mode": "main", "params_main": {"max_iter": 1000}}
},
},
"utility_parameters_correlations": {"enabled": True},
"utility_parameters_correlations": {"enabled": False},
"utility_parameters_feature_importance": {
"enabled": True,
"label_column": "label",
@@ -0,0 +1,164 @@
import argparse
import json
import subprocess
import pandas as pd
from pathlib import Path

def input_json(random_state):
return {
"enabled": True,
"dataset": "datasets/adult_dataset_small/adult_small",
"synth-method": "synthpop",
"parameters": {
"enabled": True,
"num_samples_to_fit": -1,
"num_samples_to_synthesize": -1,
"num_datasets_to_synthesize": 1,
"random_state": int(random_state),
"vars_sequence": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
"synthesis_methods": [
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
],
"proper": False,
"tree_minbucket": 1,
},
"privacy_parameters_disclosure_risk": {
"enabled": False,
"num_samples_intruder": 5000,
"vars_intruder": ["gender", "age", "neighborhood"],
},
"utility_parameters_classifiers": {
"enabled": False,
"classifier": {
"LogisticRegression": {"mode": "main", "params_main": {"max_iter": 1000}}
},
},
"utility_parameters_correlations": {"enabled": False},
"utility_parameters_feature_importance": {
"enabled": True,
"label_column": "label",
"normalized_entities": [
{
"new_entity_id": "education",
"index": "education-num",
"additional_variables": ["education"],
"make_time_index": False,
},
{
"new_entity_id": "Workclass",
"index": "workclass",
"additional_variables": [],
"make_time_index": False,
},
{
"new_entity_id": "Occupation",
"index": "occupation",
"additional_variables": [],
"make_time_index": False,
},
],
"max_depth": 2,
"features_to_exclude": ["education-num"],
"drop_na": True,
"categorical_enconding": "labels",
"compute_shapley": True,
"skip_feature_engineering": False
},
}



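# Zero-pad the replica index so that generated filenames sort lexicographically.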
def filename_stem(i):
return f"adult_small-resampling-ensemble-{i:04}"


def input_path(i):
return Path(f"../../run-inputs/{filename_stem(i)}.json")


def feature_importance_path(i):
return Path(
f"../../synth-output/{filename_stem(i)}/utility_feature_importance.json"
)


def write_input_file(i, params, force=False):
fname = input_path(i)
run_input = json.dumps(input_json(**params), indent=4)
if force or not fname.exists():
print(f"Writing {fname}")
with open(fname, "w") as input_file:
input_file.write(run_input)


def read_json(fname):
with open(fname) as f:
return json.load(f)


def handle_cmdline_args():
parser = argparse.ArgumentParser(
description="Generate (optionally run and postprocess) an ensemble of run inputs"
)

parser.add_argument(
"-n",
"--num-replicas",
dest="nreplicas",
required=True,
type=int,
help="The number of replicas to generate",
)

parser.add_argument(
"-r",
"--run",
default=False,
action="store_true",
help="Run (via make) and postprocess?",
)

parser.add_argument(
"-f",
"--force-write",
dest="force",
default=False,
action="store_true",
help="Write out input files, even if they exist",
)

args = parser.parse_args()
return args


if __name__ == "__main__":
args = handle_cmdline_args()

random_states = range(args.nreplicas)

all_params = pd.DataFrame(
data=random_states, columns=["random_state"]
)

for i, params in all_params.iterrows():
print(dict(params))
write_input_file(i, dict(params), force=args.force)

if args.run:
all_targets = [f"run-{filename_stem(i)}" for i, _ in all_params.iterrows()]
subprocess.run(["make", "-j", "-C../.."] + all_targets)
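
With every variable synthesised by "sample", this script produces a pure resampling ensemble whose replicas differ only in random_state. A hypothetical invocation (the script's filename is not shown in this excerpt):

    python generate_resampling_ensemble.py -n 10 --run

This writes run-inputs/adult_small-resampling-ensemble-0000.json through ...-0009.json, then dispatches the matching run-* make targets.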
@@ -0,0 +1,144 @@
import argparse
import json
import subprocess
import pandas as pd
from itertools import product
from pathlib import Path

def input_json(random_state, sample_frac):
return {
"enabled": True,
"dataset": "datasets/adult_dataset_small/adult_small",
"synth-method": "subsample",
"parameters": {
"enabled": True,
"frac_samples_to_synthesize": sample_frac,
"random_state": int(random_state),
},
"privacy_parameters_disclosure_risk": {
"enabled": False,
"num_samples_intruder": 5000,
"vars_intruder": ["gender", "age", "neighborhood"],
},
"utility_parameters_classifiers": {
"enabled": False,
"classifier": {
"LogisticRegression": {"mode": "main", "params_main": {"max_iter": 1000}}
},
},
"utility_parameters_correlations": {"enabled": False},
"utility_parameters_feature_importance": {
"enabled": True,
"label_column": "label",
"normalized_entities": [
{
"new_entity_id": "education",
"index": "education-num",
"additional_variables": ["education"],
"make_time_index": False,
},
{
"new_entity_id": "Workclass",
"index": "workclass",
"additional_variables": [],
"make_time_index": False,
},
{
"new_entity_id": "Occupation",
"index": "occupation",
"additional_variables": [],
"make_time_index": False,
},
],
"max_depth": 2,
"features_to_exclude": ["education-num"],
"drop_na": True,
"categorical_enconding": "labels",
"compute_shapley": True,
"skip_feature_engineering": False
},
}


def filename_stem(i):
return f"adult_small-subsample-ensemble-{i:04}"


def input_path(i):
return Path(f"../../run-inputs/{filename_stem(i)}.json")


def write_input_file(i, params, force=False):
fname = input_path(i)
run_input = json.dumps(input_json(**params), indent=4)
if force or not fname.exists():
print(f"Writing {fname}")
with open(fname, "w") as input_file:
input_file.write(run_input)


def read_json(fname):
with open(fname) as f:
return json.load(f)


def handle_cmdline_args():
parser = argparse.ArgumentParser(
description="Generate (optionally run and postprocess) an ensemble of run inputs"
)

parser.add_argument(
"-n",
"--num-replicas",
dest="nreplicas",
required=True,
type=int,
help="The number of replicas to generate",
)

parser.add_argument(
"-r",
"--run",
default=False,
action="store_true",
help="Run (via make) and postprocess?",
)

parser.add_argument(
"-f",
"--force-write",
dest="force",
default=False,
action="store_true",
help="Write out input files, even if they exist",
)

parser.add_argument(
"-s",
"--sample-fractions",
dest="sample_fracs",
required=True,
help="The list of fraction of samples used",
)

args = parser.parse_args()
return args


if __name__ == "__main__":
args = handle_cmdline_args()

random_states = range(args.nreplicas)

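    # One run per (random_state, sample_frac) pair; the "-s [0.1,0.5]" argument
    # is parsed by stripping the brackets and splitting on commas.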
    all_params = pd.DataFrame(
        data=product(
            random_states,
            map(float, args.sample_fracs.strip('[]').split(',')),
        ),
        columns=["random_state", "sample_frac"],
    )

for i, params in all_params.iterrows():
print(dict(params))
write_input_file(i, dict(params), force=args.force)

if args.run:
all_targets = [f"run-{filename_stem(i)}" for i, _ in all_params.iterrows()]
subprocess.run(["make", "-j72", "-C../.."] + all_targets)
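
Unlike the resampling script, this one sweeps a grid: every replica seed is crossed with every sample fraction. A small sketch of the resulting parameter table (values illustrative):

import pandas as pd
from itertools import product

grid = pd.DataFrame(
    product(range(2), [0.1, 0.5]), columns=["random_state", "sample_frac"]
)
print(grid)
#    random_state  sample_frac
# 0             0          0.1
# 1             0          0.5
# 2             1          0.1
# 3             1          0.5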