Correlated rank similarity metric #59

Open · wants to merge 8 commits into develop-paper
4,001 changes: 4,001 additions & 0 deletions datasets/adult_dataset_small/adult_small.csv

Large diffs are not rendered by default.

60 changes: 60 additions & 0 deletions datasets/adult_dataset_small/adult_small.json
@@ -0,0 +1,60 @@
{
"columns": [
{
"name": "age",
"type": "DiscreteNumerical"
},
{
"name": "workclass",
"type": "Categorical"
},
{
"name": "fnlwgt",
"type": "DiscreteNumerical"
},
{
"name": "education",
"type": "Categorical"
},
{
"name": "education-num",
"type": "DiscreteNumerical"
},
{
"name": "marital-status",
"type": "Categorical"
},
{
"name": "occupation",
"type": "Categorical"
},
{
"name": "relationship",
"type": "Categorical"
},
{
"name": "sex",
"type": "Categorical"
},
{
"name": "capital-gain",
"type": "DiscreteNumerical"
},
{
"name": "capital-loss",
"type": "DiscreteNumerical"
},
{
"name": "hours",
"type": "DiscreteNumerical"
},
{
"name": "native",
"type": "Categorical"
},
{
"name": "label",
"type": "Categorical"
}
]
}
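
The sidecar JSON lists every column of the CSV with one of the pipeline's declared types (here Categorical and DiscreteNumerical). A minimal sketch of pairing such a metadata file with its data file; this reader is an illustration, not the pipeline's own loader:

import json
import pandas as pd

# Load the data file together with its metadata sidecar (paths as in this PR).
with open("datasets/adult_dataset_small/adult_small.json") as f:
    metadata = json.load(f)
df = pd.read_csv("datasets/adult_dataset_small/adult_small.csv")

# Cast declared Categorical columns; DiscreteNumerical columns stay numeric.
for col in metadata["columns"]:
    if col["type"] == "Categorical":
        df[col["name"]] = df[col["name"]].astype("category")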
1 change: 1 addition & 0 deletions env-configuration/requirements.txt
@@ -12,3 +12,4 @@ featuretools
shap
ipython
numpy>=1.20
+pulp
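
pulp is a Python linear-programming library; it is presumably required by the new correlated rank similarity metric, whose implementation does not appear in this excerpt. As an illustration of the kind of problem pulp solves, a small binary assignment LP (all names and cost values invented):

import pulp

# Cost of matching item i of one ranking to item j of another (illustrative).
cost = [[4, 1, 3], [2, 0, 5], [3, 2, 2]]
n = len(cost)

prob = pulp.LpProblem("assignment", pulp.LpMinimize)
x = [[pulp.LpVariable(f"x_{i}_{j}", cat="Binary") for j in range(n)] for i in range(n)]

# Minimise the total matching cost.
prob += pulp.lpSum(cost[i][j] * x[i][j] for i in range(n) for j in range(n))

# Each row is assigned exactly once, and each column is matched exactly once.
for i in range(n):
    prob += pulp.lpSum(x[i][j] for j in range(n)) == 1
for j in range(n):
    prob += pulp.lpSum(x[i][j] for i in range(n)) == 1

prob.solve(pulp.PULP_CBC_CMD(msg=0))
print(pulp.value(prob.objective))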
@@ -48,7 +48,7 @@ def input_json(random_state):
"LogisticRegression": {"mode": "main", "params_main": {"max_iter": 1000}}
},
},
"utility_parameters_correlations": {"enabled": True},
"utility_parameters_correlations": {"enabled": False},
"utility_parameters_feature_importance": {
"enabled": True,
"label_column": "label",
@@ -27,7 +27,7 @@ def input_json(random_state, sample_frac):
"LogisticRegression": {"mode": "main", "params_main": {"max_iter": 1000}}
},
},
"utility_parameters_correlations": {"enabled": True},
"utility_parameters_correlations": {"enabled": False},
"utility_parameters_feature_importance": {
"enabled": True,
"label_column": "label",
@@ -0,0 +1,164 @@
import argparse
import json
import subprocess
import pandas as pd
from pathlib import Path

def input_json(random_state):
return {
"enabled": True,
"dataset": "datasets/adult_dataset_small/adult_small",
"synth-method": "synthpop",
"parameters": {
"enabled": True,
"num_samples_to_fit": -1,
"num_samples_to_synthesize": -1,
"num_datasets_to_synthesize": 1,
"random_state": int(random_state),
"vars_sequence": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
"synthesis_methods": [
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
"sample",
],
"proper": False,
"tree_minbucket": 1,
},
"privacy_parameters_disclosure_risk": {
"enabled": False,
"num_samples_intruder": 5000,
"vars_intruder": ["gender", "age", "neighborhood"],
},
"utility_parameters_classifiers": {
"enabled": False,
"classifier": {
"LogisticRegression": {"mode": "main", "params_main": {"max_iter": 1000}}
},
},
"utility_parameters_correlations": {"enabled": False},
"utility_parameters_feature_importance": {
"enabled": True,
"label_column": "label",
"normalized_entities": [
{
"new_entity_id": "education",
"index": "education-num",
"additional_variables": ["education"],
"make_time_index": False,
},
{
"new_entity_id": "Workclass",
"index": "workclass",
"additional_variables": [],
"make_time_index": False,
},
{
"new_entity_id": "Occupation",
"index": "occupation",
"additional_variables": [],
"make_time_index": False,
},
],
"max_depth": 2,
"features_to_exclude": ["education-num"],
"drop_na": True,
"categorical_enconding": "labels",
"compute_shapley": True,
"skip_feature_engineering": False
},
}



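# Zero-pad the replica index so that generated filenames sort lexicographically.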
def filename_stem(i):
return f"adult_small-resampling-ensemble-{i:04}"


def input_path(i):
return Path(f"../../run-inputs/{filename_stem(i)}.json")


def feature_importance_path(i):
return Path(
f"../../synth-output/{filename_stem(i)}/utility_feature_importance.json"
)


def write_input_file(i, params, force=False):
fname = input_path(i)
run_input = json.dumps(input_json(**params), indent=4)
if force or not fname.exists():
print(f"Writing {fname}")
with open(fname, "w") as input_file:
input_file.write(run_input)


def read_json(fname):
with open(fname) as f:
return json.load(f)


def handle_cmdline_args():
parser = argparse.ArgumentParser(
description="Generate (optionally run and postprocess) an ensemble of run inputs"
)

parser.add_argument(
"-n",
"--num-replicas",
dest="nreplicas",
required=True,
type=int,
help="The number of replicas to generate",
)

parser.add_argument(
"-r",
"--run",
default=False,
action="store_true",
help="Run (via make) and postprocess?",
)

parser.add_argument(
"-f",
"--force-write",
dest="force",
default=False,
action="store_true",
help="Write out input files, even if they exist",
)

args = parser.parse_args()
return args


if __name__ == "__main__":
args = handle_cmdline_args()

random_states = range(args.nreplicas)

all_params = pd.DataFrame(
data=random_states, columns=["random_state"]
)

for i, params in all_params.iterrows():
print(dict(params))
write_input_file(i, dict(params), force=args.force)

if args.run:
all_targets = [f"run-{filename_stem(i)}" for i, _ in all_params.iterrows()]
subprocess.run(["make", "-j", "-C../.."] + all_targets)
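
With every variable synthesised by "sample", this script produces a pure resampling ensemble whose replicas differ only in random_state. A hypothetical invocation (the script's filename is not shown in this excerpt):

    python generate_resampling_ensemble.py -n 10 --run

This writes run-inputs/adult_small-resampling-ensemble-0000.json through ...-0009.json, then dispatches the matching run-* make targets.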
@@ -0,0 +1,144 @@
import argparse
import json
import subprocess
import pandas as pd
from itertools import product
from pathlib import Path

def input_json(random_state, sample_frac):
return {
"enabled": True,
"dataset": "datasets/adult_dataset_small/adult_small",
"synth-method": "subsample",
"parameters": {
"enabled": True,
"frac_samples_to_synthesize": sample_frac,
"random_state": int(random_state),
},
"privacy_parameters_disclosure_risk": {
"enabled": False,
"num_samples_intruder": 5000,
"vars_intruder": ["gender", "age", "neighborhood"],
},
"utility_parameters_classifiers": {
"enabled": False,
"classifier": {
"LogisticRegression": {"mode": "main", "params_main": {"max_iter": 1000}}
},
},
"utility_parameters_correlations": {"enabled": False},
"utility_parameters_feature_importance": {
"enabled": True,
"label_column": "label",
"normalized_entities": [
{
"new_entity_id": "education",
"index": "education-num",
"additional_variables": ["education"],
"make_time_index": False,
},
{
"new_entity_id": "Workclass",
"index": "workclass",
"additional_variables": [],
"make_time_index": False,
},
{
"new_entity_id": "Occupation",
"index": "occupation",
"additional_variables": [],
"make_time_index": False,
},
],
"max_depth": 2,
"features_to_exclude": ["education-num"],
"drop_na": True,
"categorical_enconding": "labels",
"compute_shapley": True,
"skip_feature_engineering": False
},
}


def filename_stem(i):
return f"adult_small-subsample-ensemble-{i:04}"


def input_path(i):
return Path(f"../../run-inputs/{filename_stem(i)}.json")


def write_input_file(i, params, force=False):
fname = input_path(i)
run_input = json.dumps(input_json(**params), indent=4)
if force or not fname.exists():
print(f"Writing {fname}")
with open(fname, "w") as input_file:
input_file.write(run_input)


def read_json(fname):
with open(fname) as f:
return json.load(f)


def handle_cmdline_args():
parser = argparse.ArgumentParser(
description="Generate (optionally run and postprocess) an ensemble of run inputs"
)

parser.add_argument(
"-n",
"--num-replicas",
dest="nreplicas",
required=True,
type=int,
help="The number of replicas to generate",
)

parser.add_argument(
"-r",
"--run",
default=False,
action="store_true",
help="Run (via make) and postprocess?",
)

parser.add_argument(
"-f",
"--force-write",
dest="force",
default=False,
action="store_true",
help="Write out input files, even if they exist",
)

parser.add_argument(
"-s",
"--sample-fractions",
dest="sample_fracs",
required=True,
help="The list of fraction of samples used",
)

args = parser.parse_args()
return args


if __name__ == "__main__":
args = handle_cmdline_args()

random_states = range(args.nreplicas)

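    # One run per (random_state, sample_frac) pair; the "-s [0.1,0.5]" argument
    # is parsed by stripping the brackets and splitting on commas.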
    all_params = pd.DataFrame(
        data=product(
            random_states,
            map(float, args.sample_fracs.strip('[]').split(',')),
        ),
        columns=["random_state", "sample_frac"],
    )

for i, params in all_params.iterrows():
print(dict(params))
write_input_file(i, dict(params), force=args.force)

if args.run:
all_targets = [f"run-{filename_stem(i)}" for i, _ in all_params.iterrows()]
subprocess.run(["make", "-j72", "-C../.."] + all_targets)
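
Unlike the resampling script, this one sweeps a grid: every replica seed is crossed with every sample fraction. A small sketch of the resulting parameter table (values illustrative):

import pandas as pd
from itertools import product

grid = pd.DataFrame(
    product(range(2), [0.1, 0.5]), columns=["random_state", "sample_frac"]
)
print(grid)
#    random_state  sample_frac
# 0             0          0.1
# 1             0          0.5
# 2             1          0.1
# 3             1          0.5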