Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reproducing experiments using docker #71

Open
wants to merge 12 commits into
base: develop-paper
Choose a base branch
from
4 changes: 3 additions & 1 deletion .github/workflows/run-synth-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,7 @@ jobs:
password: ${{ secrets.DOCKER_PAC }}
-
name: Run pipeline with privbayes-adult

run: docker run -v $GITHUB_WORKSPACE:/quipp-pipeline --workdir /quipp-pipeline turinginst/quipp-env:base make run-privbayes-adult
-
name: Run pipeline with Synthpop
run: docker run -v $GITHUB_WORKSPACE:/quipp-pipeline --workdir /quipp-pipeline turinginst/quipp-env:base python examples/artificial_1-resampling-ensemble/artificial_1-resampling-ensemble.py -n 1 -f -r
55 changes: 31 additions & 24 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,26 @@ The current draft QUiPP report can be found in [`doc`](doc/), with a pdf
available
[here](https://github.com/alan-turing-institute/QUIPP-pipeline/releases).

## Installation
## Install

### Docker
The easiest way to use the pipeline is with [Docker](https://www.docker.com/). Get the official image with

```bash
docker pull turinginst/quipp-env:base
```

To run the pipeline, use:
```bash
docker run -v $(pwd):/quipp-pipeline --workdir /quipp-pipeline turinginst/quipp-env:base make
```

### Local

Clone and checkout



- Clone the repository: `git clone git@github.com:alan-turing-institute/QUIPP-pipeline.git`

Expand All @@ -51,7 +70,7 @@ More detail on setting this up can be found
- Various parts of this code and its dependencies are written in
Python, R, C++ and Bash.
- It has been tested with
- python 3.6
- python 3.8
- R 3.6
- gcc 9.3
- bash 3.2
Expand All @@ -62,6 +81,16 @@ More detail on setting this up can be found

### Installing the dependencies

We recommend using [Anaconda](https://docs.anaconda.com/anaconda/install/index.html) with Python 3.8.

```bash
conda create -n quipp python=3.8
```

```bash
conda activate quipp
```

#### R and Python dependencies

To install all of the python and R dependencies, run the following
Expand Down Expand Up @@ -94,29 +123,7 @@ environmental variable `SGFROOT` to point to this location. That is, in bash,
- either ```export PATH=$PATH:/path/to/sgf/bin```,
- or ```export SGFROOT=/path/to/sgf/bin```

#### Forked DataSynthesizer

We use the PrivBayes implementation within the DataSynthesizer fork found [here](https://github.com/gmingas/DataSynthesizer).
In order to install it, clone the above repository locally, go to its root directory and run `pip install .`

#### Forked synthetic_data_release

We use the PATE-GAN implementation within the `synthetic_data_release` fork found [here](https://github.com/kasra-hosseini/synthetic_data_release).
In order to use PATE-GAN in QUIPP:
1. create a new directory:

```bash
cd /path/to/QUIPP-pipeline
mkdir libs
```

2. Clone the above repository inside `libs` directory created in the previous step:

```bash
# from /path/to/QUIPP-pipeline
cd libs
git clone https://github.com/kasra-hosseini/synthetic_data_release.git
```

## Top-level directory contents

Expand Down
19 changes: 0 additions & 19 deletions datasets/framingham/README.md

This file was deleted.

1 change: 0 additions & 1 deletion datasets/framingham/framingham.csv

This file was deleted.

4,241 changes: 0 additions & 4,241 deletions datasets/framingham/framingham_cleaned.csv

This file was deleted.

69 changes: 0 additions & 69 deletions datasets/framingham/framingham_cleaned.json

This file was deleted.

91 changes: 6 additions & 85 deletions examples/adult-resampling-ensemble/adult-resampling-ensemble.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import argparse
import json
import matplotlib.pyplot as plt
import subprocess
import pandas as pd
from itertools import product
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent))
from utils import run, SynthMethod

FILESTEM = 'adult-resampling-ensemble'

def input_json(random_state):
return {
Expand Down Expand Up @@ -81,84 +80,6 @@ def input_json(random_state):
},
}



def filename_stem(i):
    """Return the run-input stem for replica *i*, zero-padded to four digits."""
    return "adult-resampling-ensemble-" + format(i, "04")


def input_path(i):
    """Path (relative to this example dir) of the run-input JSON for replica *i*."""
    return Path("../../run-inputs") / (filename_stem(i) + ".json")


def feature_importance_path(i):
    """Path of the feature-importance utility output produced for replica *i*."""
    stem = filename_stem(i)
    return Path("../../synth-output") / stem / "utility_feature_importance.json"


def write_input_file(i, params, force=False):
    """Serialize the run-input JSON for replica *i* into run-inputs/.

    Skips writing when the target file already exists, unless *force* is set.
    """
    target = input_path(i)
    serialized = json.dumps(input_json(**params), indent=4)
    # Guard clause: nothing to do if the file exists and we are not forcing.
    if not force and target.exists():
        return
    print(f"Writing {target}")
    with open(target, "w") as fh:
        fh.write(serialized)


def read_json(fname):
    """Load and return the JSON document stored at *fname*."""
    with open(fname) as handle:
        return json.load(handle)


def handle_cmdline_args():
    """Parse the ensemble-generation command line.

    Returns:
        argparse.Namespace with ``nreplicas`` (int), ``run`` (bool) and
        ``force`` (bool) attributes.
    """
    parser = argparse.ArgumentParser(
        description="Generate (optionally run and postprocess) an ensemble of run inputs"
    )
    parser.add_argument(
        "-n", "--num-replicas",
        dest="nreplicas", required=True, type=int,
        help="The number of replicas to generate",
    )
    parser.add_argument(
        "-r", "--run",
        default=False, action="store_true",
        help="Run (via make) and postprocess?",
    )
    parser.add_argument(
        "-f", "--force-write",
        dest="force", default=False, action="store_true",
        help="Write out input files, even if they exist",
    )
    return parser.parse_args()


# Script entry point: build one run-input file per replica, then optionally
# run the pipeline over all of them via make.
if __name__ == "__main__":
    args = handle_cmdline_args()

    # One replica per random seed, 0..nreplicas-1.
    random_states = range(args.nreplicas)

    all_params = pd.DataFrame(
        data=random_states, columns=["random_state"]
    )

    # Write a run-input JSON file for each parameter row.
    for i, params in all_params.iterrows():
        print(dict(params))
        write_input_file(i, dict(params), force=args.force)

    if args.run:
        # Invoke the top-level Makefile in parallel, one target per replica.
        all_targets = [f"run-{filename_stem(i)}" for i, _ in all_params.iterrows()]
        subprocess.run(["make", "-j", "-C../.."] + all_targets)
# NOTE(review): this line is the "new" side of the displayed diff — it replaces
# the inline driver above by delegating to the shared utils.run helper.
run(input_json, FILESTEM, SynthMethod.RESAMPLING)
91 changes: 6 additions & 85 deletions examples/adult-subsample-ensemble/adult-subsample-ensemble.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import argparse
import json
import matplotlib.pyplot as plt
import subprocess
import pandas as pd
from itertools import product
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).parent.parent))
from utils import run, SynthMethod

FILESTEM = 'adult-subsample-ensemble'

def input_json(random_state, sample_frac):
return {
Expand Down Expand Up @@ -61,84 +60,6 @@ def input_json(random_state, sample_frac):
}


def filename_stem(i):
    """Build the zero-padded (width 4) run-input stem for replica *i*."""
    return "adult-subsample-ensemble-" + format(i, "04")


def input_path(i):
    """Location of the run-input JSON for replica *i* under run-inputs/."""
    stem = filename_stem(i)
    return Path("../../run-inputs") / f"{stem}.json"


def write_input_file(i, params, force=False):
    """Write the run-input JSON for replica *i*, unless it already exists.

    Pass ``force=True`` to overwrite an existing file.
    """
    destination = input_path(i)
    if force or not destination.exists():
        print(f"Writing {destination}")
        with open(destination, "w") as out:
            out.write(json.dumps(input_json(**params), indent=4))


def read_json(fname):
    """Parse and return the JSON content of the file at *fname*."""
    with open(fname) as src:
        data = json.load(src)
    return data


def handle_cmdline_args():
    """Parse the subsample-ensemble command line.

    Returns:
        argparse.Namespace with ``nreplicas`` (int), ``run`` (bool),
        ``force`` (bool) and ``sample_fracs`` (str, e.g. "[0.1,0.5]").
    """
    parser = argparse.ArgumentParser(
        description="Generate (optionally run and postprocess) an ensemble of run inputs"
    )
    parser.add_argument(
        "-n", "--num-replicas",
        dest="nreplicas", required=True, type=int,
        help="The number of replicas to generate",
    )
    parser.add_argument(
        "-r", "--run",
        default=False, action="store_true",
        help="Run (via make) and postprocess?",
    )
    parser.add_argument(
        "-f", "--force-write",
        dest="force", default=False, action="store_true",
        help="Write out input files, even if they exist",
    )
    parser.add_argument(
        "-s", "--sample-fractions",
        dest="sample_fracs", required=True,
        help="The list of fraction of samples used",
    )
    return parser.parse_args()


# Script entry point: generate the cartesian product of random seeds and
# subsample fractions, write one run-input per combination, then optionally
# run the pipeline over all of them via make.
if __name__ == "__main__":
    args = handle_cmdline_args()

    # One seed per replica, 0..nreplicas-1.
    random_states = range(args.nreplicas)

    # sample_fracs arrives as a bracketed string like "[0.1,0.5]"; strip the
    # brackets and parse each entry as a float before crossing with the seeds.
    all_params = pd.DataFrame(
        data=product(random_states, map(float, args.sample_fracs.strip('[]').split(','))), columns=["random_state", "sample_frac"]
    )

    # Write a run-input JSON file for each (seed, fraction) combination.
    for i, params in all_params.iterrows():
        print(dict(params))
        write_input_file(i, dict(params), force=args.force)

    if args.run:
        # Invoke the top-level Makefile with 72 parallel jobs, one target per run.
        all_targets = [f"run-{filename_stem(i)}" for i, _ in all_params.iterrows()]
        subprocess.run(["make", "-j72", "-C../.."] + all_targets)
# NOTE(review): this line is the "new" side of the displayed diff — it replaces
# the inline driver above by delegating to the shared utils.run helper.
run(input_json, FILESTEM, SynthMethod.SUBSAMPLING)
Loading