Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reproducing experiments using docker #71

Open
wants to merge 12 commits into
base: develop-paper
Choose a base branch
from
4 changes: 3 additions & 1 deletion .github/workflows/run-synth-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,7 @@ jobs:
password: ${{ secrets.DOCKER_PAC }}
-
name: Run pipeline with privbayes-adult

run: docker run -v $GITHUB_WORKSPACE:/quipp-pipeline --workdir /quipp-pipeline turinginst/quipp-env:base make run-privbayes-adult
-
name: Run pipeline with Synthpop
run: docker run -v $GITHUB_WORKSPACE:/quipp-pipeline --workdir /quipp-pipeline turinginst/quipp-env:base python examples/artificial_1-resampling-ensemble/artificial_1-resampling-ensemble.py -n 1 -f -r
55 changes: 31 additions & 24 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,26 @@ The current draft QUiPP report can be found in [`doc`](doc/), with a pdf
available
[here](https://github.com/alan-turing-institute/QUIPP-pipeline/releases).

## Installation
## Install

### Docker
The easiest way to use the pipeline is with [Docker](https://www.docker.com/). Get the official image with

```bash
docker pull turinginst/quipp-env:base
```

To run the pipeline, use:
```bash
docker run -v $(pwd):/quipp-pipeline --workdir /quipp-pipeline turinginst/quipp-env:base make
```

### Local

Clone and checkout



- Clone the repository: `git clone git@github.com:alan-turing-institute/QUIPP-pipeline.git`

Expand All @@ -51,7 +70,7 @@ More detail on setting this up can be found
- Various parts of this code and its dependencies are written in
Python, R, C++ and Bash.
- It has been tested with
- python 3.6
- python 3.8
- R 3.6
- gcc 9.3
- bash 3.2
Expand All @@ -62,6 +81,16 @@ More detail on setting this up can be found

### Installing the dependencies

We recommend using [Anaconda](https://docs.anaconda.com/anaconda/install/index.html) with Python 3.8.

```bash
conda create -n quipp python=3.8
```

```bash
conda activate quipp
```

#### R and Python dependencies

To install all of the python and R dependencies, run the following
Expand Down Expand Up @@ -94,29 +123,7 @@ environmental variable `SGFROOT` to point to this location. That is, in bash,
- either ```export PATH=$PATH:/path/to/sgf/bin```,
- or ```export SGFROOT=/path/to/sgf/bin```

#### Forked DataSynthesizer

We use the PrivBayes implementation within the DataSynthesizer fork found [here](https://github.com/gmingas/DataSynthesizer).
In order to install it, clone the above repository locally, go to its root directory and run `pip install .`

#### Forked synthetic_data_release

We use the PATE-GAN implementation within the `synthetic_data_release` fork found [here](https://github.com/kasra-hosseini/synthetic_data_release).
In order to use PATE-GAN in QUIPP:
1. create a new directory:

```bash
cd /path/to/QUIPP-pipeline
mkdir libs
```

2. Clone the above repository inside `libs` directory created in the previous step:

```bash
# from /path/to/QUIPP-pipeline
cd libs
git clone https://github.com/kasra-hosseini/synthetic_data_release.git
```

## Top-level directory contents

Expand Down
19 changes: 0 additions & 19 deletions datasets/framingham/README.md

This file was deleted.

1 change: 0 additions & 1 deletion datasets/framingham/framingham.csv

This file was deleted.

4,241 changes: 0 additions & 4,241 deletions datasets/framingham/framingham_cleaned.csv

This file was deleted.

69 changes: 0 additions & 69 deletions datasets/framingham/framingham_cleaned.json

This file was deleted.

91 changes: 6 additions & 85 deletions examples/adult-resampling-ensemble/adult-resampling-ensemble.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import argparse
import json
import matplotlib.pyplot as plt
import subprocess
import pandas as pd
from itertools import product
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent))
from utils import run, SynthMethod

FILESTEM = 'adult-resampling-ensemble'

def input_json(random_state):
return {
Expand Down Expand Up @@ -81,84 +80,6 @@ def input_json(random_state):
},
}



def filename_stem(i):
    """Return the run-input stem for replica *i*, zero-padded to four digits."""
    return "adult-resampling-ensemble-" + format(i, "04")


def input_path(i):
    """Path (relative to this example dir) of the run-input JSON for replica *i*."""
    return Path("../../run-inputs") / (filename_stem(i) + ".json")


def feature_importance_path(i):
    """Path of the feature-importance utility output produced for replica *i*."""
    stem = filename_stem(i)
    return Path("../../synth-output") / stem / "utility_feature_importance.json"


def write_input_file(i, params, force=False):
    """Serialize the run-input JSON for replica *i* into run-inputs/.

    Skips writing when the target file already exists, unless *force* is set.
    """
    target = input_path(i)
    serialized = json.dumps(input_json(**params), indent=4)
    # Guard clause: nothing to do if the file exists and we are not forcing.
    if not force and target.exists():
        return
    print(f"Writing {target}")
    with open(target, "w") as fh:
        fh.write(serialized)


def read_json(fname):
    """Load and return the JSON document stored at *fname*."""
    with open(fname) as handle:
        return json.load(handle)


def handle_cmdline_args():
    """Parse the ensemble-generation command line.

    Returns:
        argparse.Namespace with ``nreplicas`` (int), ``run`` (bool) and
        ``force`` (bool) attributes.
    """
    parser = argparse.ArgumentParser(
        description="Generate (optionally run and postprocess) an ensemble of run inputs"
    )
    parser.add_argument(
        "-n", "--num-replicas",
        dest="nreplicas", required=True, type=int,
        help="The number of replicas to generate",
    )
    parser.add_argument(
        "-r", "--run",
        default=False, action="store_true",
        help="Run (via make) and postprocess?",
    )
    parser.add_argument(
        "-f", "--force-write",
        dest="force", default=False, action="store_true",
        help="Write out input files, even if they exist",
    )
    return parser.parse_args()


# Script entry point: build one run-input file per replica, then optionally
# run the pipeline over all of them via make.
if __name__ == "__main__":
    args = handle_cmdline_args()

    # One replica per random seed, 0..nreplicas-1.
    random_states = range(args.nreplicas)

    all_params = pd.DataFrame(
        data=random_states, columns=["random_state"]
    )

    # Write a run-input JSON file for each parameter row.
    for i, params in all_params.iterrows():
        print(dict(params))
        write_input_file(i, dict(params), force=args.force)

    if args.run:
        # Invoke the top-level Makefile in parallel, one target per replica.
        all_targets = [f"run-{filename_stem(i)}" for i, _ in all_params.iterrows()]
        subprocess.run(["make", "-j", "-C../.."] + all_targets)
# NOTE(review): this line is the "new" side of the displayed diff — it replaces
# the inline driver above by delegating to the shared utils.run helper.
run(input_json, FILESTEM, SynthMethod.RESAMPLING)
91 changes: 6 additions & 85 deletions examples/adult-subsample-ensemble/adult-subsample-ensemble.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import argparse
import json
import matplotlib.pyplot as plt
import subprocess
import pandas as pd
from itertools import product
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent))
from utils import run, SynthMethod

FILESTEM = 'adult-subsample-ensemble'

def input_json(random_state, sample_frac):
return {
Expand Down Expand Up @@ -61,84 +60,6 @@ def input_json(random_state, sample_frac):
}


def filename_stem(i):
    """Build the zero-padded (width 4) run-input stem for replica *i*."""
    return "adult-subsample-ensemble-" + format(i, "04")


def input_path(i):
    """Location of the run-input JSON for replica *i* under run-inputs/."""
    stem = filename_stem(i)
    return Path("../../run-inputs") / f"{stem}.json"


def write_input_file(i, params, force=False):
    """Write the run-input JSON for replica *i*, unless it already exists.

    Pass ``force=True`` to overwrite an existing file.
    """
    destination = input_path(i)
    if force or not destination.exists():
        print(f"Writing {destination}")
        with open(destination, "w") as out:
            out.write(json.dumps(input_json(**params), indent=4))


def read_json(fname):
    """Parse and return the JSON content of the file at *fname*."""
    with open(fname) as src:
        data = json.load(src)
    return data


def handle_cmdline_args():
    """Parse the subsample-ensemble command line.

    Returns:
        argparse.Namespace with ``nreplicas`` (int), ``run`` (bool),
        ``force`` (bool) and ``sample_fracs`` (str, e.g. "[0.1,0.5]").
    """
    parser = argparse.ArgumentParser(
        description="Generate (optionally run and postprocess) an ensemble of run inputs"
    )
    parser.add_argument(
        "-n", "--num-replicas",
        dest="nreplicas", required=True, type=int,
        help="The number of replicas to generate",
    )
    parser.add_argument(
        "-r", "--run",
        default=False, action="store_true",
        help="Run (via make) and postprocess?",
    )
    parser.add_argument(
        "-f", "--force-write",
        dest="force", default=False, action="store_true",
        help="Write out input files, even if they exist",
    )
    parser.add_argument(
        "-s", "--sample-fractions",
        dest="sample_fracs", required=True,
        help="The list of fraction of samples used",
    )
    return parser.parse_args()


# Script entry point: generate the cartesian product of random seeds and
# subsample fractions, write one run-input per combination, then optionally
# run the pipeline over all of them via make.
if __name__ == "__main__":
    args = handle_cmdline_args()

    # One seed per replica, 0..nreplicas-1.
    random_states = range(args.nreplicas)

    # sample_fracs arrives as a bracketed string like "[0.1,0.5]"; strip the
    # brackets and parse each entry as a float before crossing with the seeds.
    all_params = pd.DataFrame(
        data=product(random_states, map(float, args.sample_fracs.strip('[]').split(','))), columns=["random_state", "sample_frac"]
    )

    # Write a run-input JSON file for each (seed, fraction) combination.
    for i, params in all_params.iterrows():
        print(dict(params))
        write_input_file(i, dict(params), force=args.force)

    if args.run:
        # Invoke the top-level Makefile with 72 parallel jobs, one target per run.
        all_targets = [f"run-{filename_stem(i)}" for i, _ in all_params.iterrows()]
        subprocess.run(["make", "-j72", "-C../.."] + all_targets)
# NOTE(review): this line is the "new" side of the displayed diff — it replaces
# the inline driver above by delegating to the shared utils.run helper.
run(input_json, FILESTEM, SynthMethod.SUBSAMPLING)
Loading