Commit 682b426

deploy: 33a2752

rcannood committed Jun 6, 2024 (0 parents)

Showing 425 changed files with 209,955 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
resources
work
.nextflow*
.idea
.vscode
.DS_Store
output
trace-*
.ipynb_checkpoints
Empty file added .nojekyll
Empty file.
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,10 @@
# task_perturbation_prediction 1.0.0

Initial release of the Perturbation Prediction task. Initial components:

* `src/process_dataset`: Compute the DGE data from the raw single-cell counts using Limma.
* `src/control_methods`: Baseline control methods: sample, ground_truth, zeros, mean_across_celltypes, mean_across_compounds, mean_outcome.
* `src/methods`: Perturbation prediction methods: jn_ap_op2, lgc_ensemble, nn_retraining_with_pseudolabels, pyboost, scape, transformer_ensemble.
* `src/metrics`: Evaluation metrics: mean_rowwise_error, mean_rowwise_correlation (see the sketch below for the intuition behind these row-wise metrics).
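
To make the metric names in this list concrete, here is a minimal sketch of what a mean row-wise error and a mean row-wise correlation could compute when predictions and ground truth are dense matrices (rows = cell type/compound combinations, columns = genes). This is an illustration only, not the repository's implementation; the variable names and the use of squared error are assumptions.

import numpy as np

def mean_rowwise_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    # average an error per row, then average across rows
    # (squared error is an assumed choice for this sketch)
    return float(np.mean(np.mean((y_true - y_pred) ** 2, axis=1)))

def mean_rowwise_correlation(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    # Pearson correlation of each predicted row with the matching
    # ground-truth row, averaged across rows
    corrs = [np.corrcoef(t, p)[0, 1] for t, p in zip(y_true, y_pred)]
    return float(np.mean(corrs))

# toy example
rng = np.random.default_rng(0)
truth = rng.normal(size=(5, 100))
pred = truth + rng.normal(scale=0.1, size=(5, 100))
print(mean_rowwise_error(truth, pred), mean_rowwise_correlation(truth, pred))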


21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Open Problems in Single-Cell Analysis

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
500 changes: 500 additions & 0 deletions README.md

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions _viash.yaml
@@ -0,0 +1,15 @@
viash_version: 0.8.6

source: src
target: target

config_mods: |
  .functionality.version := 'dev'
  .functionality.arguments[.multiple == true].multiple_sep := ';'
  .platforms[.type == 'docker'].target_registry := 'ghcr.io'
  .platforms[.type == 'docker'].target_organization := 'openproblems-bio/task_perturbation_prediction'
  .platforms[.type == 'docker'].target_image_source := 'https://github.com/openproblems-bio/task_perturbation_prediction'
  .platforms[.type == "nextflow"].directives.tag := "$id"
  .platforms[.type == "nextflow"].auto.simplifyOutput := false
  .platforms[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
  .platforms[.type == "nextflow"].config.script := "process.errorStrategy = 'ignore'"
3 changes: 3 additions & 0 deletions main.nf
@@ -0,0 +1,3 @@
workflow {
print("This is a dummy placeholder for pipeline execution. Please use the corresponding nf files for running pipelines.")
}
1 change: 1 addition & 0 deletions nextflow.config
@@ -0,0 +1 @@
process.container = 'nextflow/bash:latest'
42 changes: 42 additions & 0 deletions scripts/add_a_method.sh
@@ -0,0 +1,42 @@
#!/bin/bash

echo "This script is not supposed to be run directly."
echo "Please run the script step-by-step."
exit 1

# sync resources
scripts/download_resources.sh

# create a new component
method_id="my_method"
method_lang="python" # change this to "r" if need be

viash run src/common/create_component/config.vsh.yaml -- \
--language "$method_lang" \
--name "$method_id"

# TODO: fill in the required fields in src/methods/$method_id/config.vsh.yaml
# TODO: edit src/methods/$method_id/script.py (or script.R)

# test the component
viash test src/methods/$method_id/config.vsh.yaml

# rebuild the container (only needed if you changed something in the docker platform)
# You can reduce the memory and cpus allotted to jobs by modifying .platforms[.type == "nextflow"].config.labels in _viash.yaml
viash run src/methods/$method_id/config.vsh.yaml -- \
---setup cachedbuild ---verbose

# run the method (using h5ad as input)
viash run src/methods/$method_id/config.vsh.yaml -- \
--de_train_h5ad "resources/neurips-2023-kaggle/2023-09-12_de_by_cell_type_train.h5ad" \
--id_map "resources/neurips-2023-kaggle/id_map.csv" \
--output "output/prediction.h5ad"

# run evaluation metric
viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \
--de_test_h5ad "resources/neurips-2023-kaggle/de_test.h5ad" \
--prediction "output/prediction.h5ad" \
--output "output/score.h5ad"

# print score on kaggle test dataset
python -c 'import anndata; print(anndata.read_h5ad("output/score.h5ad").uns)'
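
The one-liner above just dumps the raw `.uns` mapping of the score file. As a slightly more readable alternative, here is a small hedged sketch that prints each entry on its own line; it assumes nothing beyond what the one-liner already does (the scores are stored in `.uns` of output/score.h5ad).

import anndata as ad

score = ad.read_h5ad("output/score.h5ad")

# print every entry stored in .uns, one per line, so individual
# metric values and metadata are easier to scan than a raw dict dump
for key, value in score.uns.items():
    print(f"{key}: {value}")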
3 changes: 3 additions & 0 deletions scripts/build_components.sh
@@ -0,0 +1,3 @@
#!/bin/bash

viash ns build --parallel --setup cachedbuild
14 changes: 14 additions & 0 deletions scripts/download_resources.sh
@@ -0,0 +1,14 @@
#!/bin/bash

set -e

echo ">> Downloading resources"
# aws s3 sync --no-sign-request \
# "s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/" \
# "resources" \
# --delete

viash run src/common/sync_test_resources/config.vsh.yaml -- \
--input "s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/" \
--output "resources" \
--delete
63 changes: 63 additions & 0 deletions scripts/generate_kaggle_resources.sh
@@ -0,0 +1,63 @@
#!/bin/bash

set -e

OUT=resources/neurips-2023-kaggle

[[ ! -d $OUT ]] && mkdir -p $OUT

if [[ ! -f "$OUT/2023-09-12_de_by_cell_type_test.h5ad" ]]; then
echo ">> Downloading data"
aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/2023-09-14_kaggle_upload/2023-09-12_de_by_cell_type_test.h5ad --no-sign-request $OUT/2023-09-12_de_by_cell_type_test.h5ad
aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/2023-09-14_kaggle_upload/2023-09-12_de_by_cell_type_train.h5ad --no-sign-request $OUT/2023-09-12_de_by_cell_type_train.h5ad

# recompress h5ad files
python -c \
"import anndata as ad; ad.read_h5ad('$OUT/2023-09-12_de_by_cell_type_test.h5ad').write_h5ad('$OUT/2023-09-12_de_by_cell_type_test.h5ad', compression='gzip')"
python -c \
"import anndata as ad; ad.read_h5ad('$OUT/2023-09-12_de_by_cell_type_train.h5ad').write_h5ad('$OUT/2023-09-12_de_by_cell_type_train.h5ad', compression='gzip')"
fi

viash run src/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yaml -- \
--input_train "$OUT/2023-09-12_de_by_cell_type_train.h5ad" \
--input_test "$OUT/2023-09-12_de_by_cell_type_test.h5ad" \
--input_single_cell_h5ad "resources/neurips-2023-raw/sc_counts.h5ad" \
--output_train_h5ad "$OUT/de_train.h5ad" \
--output_test_h5ad "$OUT/de_test.h5ad" \
--output_id_map "$OUT/id_map.csv" \
--dataset_id neurips-2023-kaggle \
--dataset_name "NeurIPS2023 scPerturb DGE (Kaggle)" \
--dataset_summary 'Original Kaggle dataset' \
--dataset_description 'Original Kaggle dataset' \
--dataset_url TBD \
--dataset_reference TBD \
--dataset_organism homo_sapiens

echo ">> Run method"
viash run src/control_methods/mean_across_compounds/config.vsh.yaml -- \
--de_train_h5ad "$OUT/de_train.h5ad" \
--de_test_h5ad "$OUT/de_test.h5ad" \
--id_map "$OUT/id_map.csv" \
--output "$OUT/prediction.h5ad"

echo ">> Run metric"
viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \
--prediction "$OUT/prediction.h5ad" \
--de_test_h5ad "$OUT/de_test.h5ad" \
--output "$OUT/score.h5ad"

cat > "$OUT/state.yaml" <<'EOF'
id: neurips-2023-kaggle
de_train_h5ad: !file de_train.h5ad
de_test_h5ad: !file de_test.h5ad
id_map: !file id_map.csv
EOF

echo ">> Uploading results to S3"
aws s3 sync --profile op2 \
--include "*" \
--exclude "neurips-2023-raw/*" \
--exclude "neurips-2023-public/*" \
"resources" \
"s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/" \
--delete --dryrun
54 changes: 54 additions & 0 deletions scripts/generate_resources.sh
@@ -0,0 +1,54 @@
#!/bin/bash

set -e

IN=resources/neurips-2023-raw
OUT=resources/neurips-2023-data

[[ ! -d $IN ]] && mkdir -p $IN

if [[ ! -f "$IN/sc_counts_reannotated_with_counts.h5ad" ]]; then
  echo ">> Downloading 'sc_counts_reannotated_with_counts.h5ad'"
  aws s3 cp --no-sign-request \
    s3://openproblems-bio/public/neurips-2023-competition/sc_counts_reannotated_with_counts.h5ad \
    "$IN/sc_counts_reannotated_with_counts.h5ad"
fi

echo ">> Running 'process_dataset' workflow"
nextflow run \
target/nextflow/workflows/process_dataset/main.nf \
-profile docker \
-resume \
--id neurips-2023-data \
--sc_counts "$IN/sc_counts_reannotated_with_counts.h5ad" \
--dataset_id "neurips-2023-data" \
--dataset_name "NeurIPS2023 scPerturb DGE" \
--dataset_url "TBD" \
--dataset_reference "TBD" \
--dataset_summary "Differential gene expression sign(logFC) * -log10(p-value) values after 24 hours of treatment with 144 compounds in human PBMCs" \
--dataset_description "For this competition, we designed and generated a novel single-cell perturbational dataset in human peripheral blood mononuclear cells (PBMCs). We selected 144 compounds from the Library of Integrated Network-Based Cellular Signatures (LINCS) Connectivity Map dataset (PMID: 29195078) and measured single-cell gene expression profiles after 24 hours of treatment. The experiment was repeated in three healthy human donors, and the compounds were selected based on diverse transcriptional signatures observed in CD34+ hematopoietic stem cells (data not released). We performed this experiment in human PBMCs because the cells are commercially available with pre-obtained consent for public release and PBMCs are a primary, disease-relevant tissue that contains multiple mature cell types (including T-cells, B-cells, myeloid cells, and NK cells) with established markers for annotation of cell types. To supplement this dataset, we also measured cells from each donor at baseline with joint scRNA and single-cell chromatin accessibility measurements using the 10x Multiome assay. We hope that the addition of rich multi-omic data for each donor and cell type at baseline will help establish biological priors that explain the susceptibility of particular genes to exhibit perturbation responses in different biological contexts." \
--dataset_organism "homo_sapiens" \
--output_state "state.yaml" \
--publish_dir "$OUT"

echo ">> Run method"
viash run src/control_methods/mean_across_compounds/config.vsh.yaml -- \
--de_train_h5ad "$OUT/de_train.h5ad" \
--de_test_h5ad "$OUT/de_test.h5ad" \
--id_map "$OUT/id_map.csv" \
--output "$OUT/prediction.h5ad"

echo ">> Run metric"
viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \
--prediction "$OUT/prediction.h5ad" \
--de_test_h5ad "$OUT/de_test.h5ad" \
--output "$OUT/score.h5ad"

echo ">> Uploading results to S3"
aws s3 sync --profile op2 \
--include "*" \
--exclude "neurips-2023-raw/*" \
--exclude "neurips-2023-public/*" \
"resources" \
"s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/" \
--delete --dryrun
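
The dataset summary passed to the workflow above describes the DGE values as sign(logFC) * -log10(p-value). As a rough illustration of that transformation (not the workflow's actual code), the sketch below computes it from log fold changes and p-values, plus a clipped variant along the lines of the `clipped_sign_log10_pval` layer referenced elsewhere in this repository; the clipping threshold of 4.0 is an assumed example value, not taken from the workflow.

import numpy as np

def sign_log10_pval(logfc: np.ndarray, pval: np.ndarray) -> np.ndarray:
    # sign of the log fold change times the negative log10 of the p-value
    return np.sign(logfc) * -np.log10(pval)

def clipped_sign_log10_pval(logfc: np.ndarray, pval: np.ndarray, cap: float = 4.0) -> np.ndarray:
    # same quantity, clipped to [-cap, cap]; cap = 4.0 is an assumption
    return np.clip(sign_log10_pval(logfc, pval), -cap, cap)

logfc = np.array([2.3, -1.1, 0.4])
pval = np.array([1e-8, 0.003, 0.6])
print(sign_log10_pval(logfc, pval))          # approximately [ 8.   -2.52  0.22]
print(clipped_sign_log10_pval(logfc, pval))  # approximately [ 4.   -2.52  0.22]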
11 changes: 11 additions & 0 deletions scripts/render_readme.sh
@@ -0,0 +1,11 @@
#!/bin/bash

set -e

[[ ! -d ../openproblems-v2 ]] && echo "You need to clone the openproblems-v2 repository next to this repository" && exit 1

../openproblems-v2/target/docker/common/create_task_readme/create_task_readme \
--task "perturbation_prediction" \
--task_dir "src" \
--github_url "https://github.com/openproblems-bio/task_perturbation_prediction/tree/main/" \
--output "README.md"
28 changes: 28 additions & 0 deletions scripts/run_benchmark_test.sh
@@ -0,0 +1,28 @@
#!/bin/bash

export NXF_VER=23.04.2

resources_dir="resources"
publish_dir="output/test_run_benchmark"

cat > /tmp/params.yaml << HERE
param_list:
  - id: neurips-2023-data
    de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad"
    de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad"
    id_map: "$resources_dir/neurips-2023-data/id_map.csv"
    layer: clipped_sign_log10_pval
  - id: neurips-2023-kaggle
    de_train_h5ad: "$resources_dir/neurips-2023-kaggle/de_train.h5ad"
    de_test_h5ad: "$resources_dir/neurips-2023-kaggle/de_test.h5ad"
    id_map: "$resources_dir/neurips-2023-kaggle/id_map.csv"
    layer: sign_log10_pval
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

nextflow run . \
-main-script target/nextflow/workflows/run_benchmark/main.nf \
-profile docker \
-resume \
-params-file /tmp/params.yaml
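
Before launching, it can be useful to confirm that the files referenced in /tmp/params.yaml actually exist. The helper below is a hypothetical convenience script, not part of this repository; it only relies on the paths written by the heredoc above and requires PyYAML.

import os
import sys

import yaml  # PyYAML

with open("/tmp/params.yaml") as fh:
    params = yaml.safe_load(fh)

# collect every referenced input file that is missing on disk
missing = []
for entry in params.get("param_list", []):
    for key in ("de_train_h5ad", "de_test_h5ad", "id_map"):
        path = entry.get(key)
        if path and not os.path.exists(path):
            missing.append(f"{entry['id']}: {key} -> {path}")

if missing:
    print("Missing input files:")
    print("\n".join(missing))
    sys.exit(1)
print("All referenced input files exist.")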
30 changes: 30 additions & 0 deletions scripts/run_benchmark_tw.sh
@@ -0,0 +1,30 @@
#!/bin/bash

RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
resources_dir="s3://openproblems-bio/public/neurips-2023-competition/workflow-resources"
publish_dir="s3://openproblems-data/resources/perturbation_prediction/results/${RUN_ID}"

cat > /tmp/params.yaml << HERE
param_list:
  - id: neurips-2023-data
    de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad"
    de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad"
    id_map: "$resources_dir/neurips-2023-data/id_map.csv"
    layer: clipped_sign_log10_pval
  # - id: neurips-2023-kaggle
  #   de_train_h5ad: "$resources_dir/neurips-2023-kaggle/de_train.h5ad"
  #   de_test_h5ad: "$resources_dir/neurips-2023-kaggle/de_test.h5ad"
  #   id_map: "$resources_dir/neurips-2023-kaggle/id_map.csv"
  #   layer: sign_log10_pval
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task_perturbation_prediction.git \
--revision main_build \
--pull-latest \
--main-script target/nextflow/workflows/run_benchmark/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file /tmp/params.yaml \
--config src/common/nextflow_helpers/labels_tw.config
28 changes: 28 additions & 0 deletions scripts/run_benchmark_tw_traens.sh
@@ -0,0 +1,28 @@
#!/bin/bash

# todo: remove this before merging PR #64

RUN_ID="traens_$(date +%Y-%m-%d_%H-%M-%S)"
resources_dir="s3://openproblems-bio/public/neurips-2023-competition/workflow-resources"
publish_dir="s3://openproblems-data/resources/perturbation_prediction/results/${RUN_ID}"

cat > /tmp/params.yaml << HERE
param_list:
  - id: neurips-2023-data
    de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad"
    de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad"
    id_map: "$resources_dir/neurips-2023-data/id_map.csv"
    layer: clipped_sign_log10_pval
    method_ids: [transformer_ensemble]
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task_perturbation_prediction.git \
--revision suggestions_elior_build \
--pull-latest \
--main-script target/nextflow/workflows/run_benchmark/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file /tmp/params.yaml \
--config src/common/nextflow_helpers/labels_tw.config
23 changes: 23 additions & 0 deletions scripts/run_layert_tw.sh
@@ -0,0 +1,23 @@
#!/bin/bash

RUN_ID="layert_$(date +%Y-%m-%d_%H-%M-%S)"
publish_dir="s3://openproblems-data/resources/perturbation_prediction/results/${RUN_ID}"

cat > /tmp/params.yaml << HERE
id: dge_perturbation_task
input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/neurips-2023-data/state.yaml
output_state: "state.yaml"
publish_dir: "$publish_dir"
rename_keys: "de_train_h5ad:de_train_h5ad,de_test_h5ad:de_test_h5ad,id_map:id_map"
settings: '{"layer": "t"}'
HERE

tw launch https://github.com/openproblems-bio/task_perturbation_prediction.git \
--revision main_build \
--pull-latest \
--main-script target/nextflow/workflows/run_benchmark/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file /tmp/params.yaml \
--entry-name auto \
--config src/common/nextflow_helpers/labels_tw.config
19 changes: 19 additions & 0 deletions scripts/run_stability_test.sh
@@ -0,0 +1,19 @@
#!/bin/bash

export NXF_VER=23.04.2

cat > /tmp/params.yaml <<'HERE'
id: neurips-2023-data
sc_counts: resources/neurips-2023-raw/sc_counts_reannotated_with_counts.h5ad
method_ids: ['ground_truth', 'sample', 'mean_across_celltypes', 'mean_across_compounds']
layer: t # test a different layer
bootstrap_num_replicates: 2
publish_dir: "output/test_stability_analysis"
output_state: "state.yaml"
HERE

nextflow run . \
-main-script target/nextflow/workflows/run_stability_analysis/main.nf \
-profile docker \
-resume \
-params-file /tmp/params.yaml
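
The stability analysis above relies on bootstrap replicates (bootstrap_num_replicates: 2). As a rough sketch of the underlying idea, and not the workflow's actual implementation, the snippet below resamples rows with replacement and recomputes a score per replicate; the spread of the resulting scores indicates how stable a metric is on this dataset.

import numpy as np

def bootstrap_scores(y_true, y_pred, metric, num_replicates=2, seed=0):
    # resample rows with replacement and recompute the metric per replicate
    rng = np.random.default_rng(seed)
    n_rows = y_true.shape[0]
    scores = []
    for _ in range(num_replicates):
        idx = rng.integers(0, n_rows, size=n_rows)
        scores.append(metric(y_true[idx], y_pred[idx]))
    return np.array(scores)

# toy example with a simple mean squared error as the metric
mse = lambda t, p: float(np.mean((t - p) ** 2))
rng = np.random.default_rng(1)
truth = rng.normal(size=(20, 50))
pred = truth + rng.normal(scale=0.2, size=(20, 50))
print(bootstrap_scores(truth, pred, mse, num_replicates=2))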
