Commit 682b426

deploy: 33a2752

rcannood committed Jun 6, 2024 (0 parents)

Showing 425 changed files with 209,955 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
resources
work
.nextflow*
.idea
.vscode
.DS_Store
output
trace-*
.ipynb_checkpoints
Empty file added .nojekyll
Empty file.
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,10 @@
# task_perturbation_prediction 1.0.0

Initial release of the Perturbation Prediction task. Initial components:

* `src/process_dataset`: Compute the DGE data from the raw single-cell counts using Limma.
* `src/control_methods`: Baseline control methods: sample, ground_truth, zeros, mean_across_celltypes, mean_across_compounds, mean_outcome.
* `src/methods`: Perturbation prediction methods: jn_ap_op2, lgc_ensemble, nn_retraining_with_pseudolabels, pyboost, scape, transformer_ensemble.
* `src/metrics`: Evaluation metrics: mean_rowwise_error, mean_rowwise_correlation (see the sketch below for the intuition behind these row-wise metrics).
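
To make the metric names in this list concrete, here is a minimal sketch of what a mean row-wise error and a mean row-wise correlation could compute when predictions and ground truth are dense matrices (rows = cell type/compound combinations, columns = genes). This is an illustration only, not the repository's implementation; the variable names and the use of squared error are assumptions.

import numpy as np

def mean_rowwise_error(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    # average an error per row, then average across rows
    # (squared error is an assumed choice for this sketch)
    return float(np.mean(np.mean((y_true - y_pred) ** 2, axis=1)))

def mean_rowwise_correlation(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    # Pearson correlation of each predicted row with the matching
    # ground-truth row, averaged across rows
    corrs = [np.corrcoef(t, p)[0, 1] for t, p in zip(y_true, y_pred)]
    return float(np.mean(corrs))

# toy example
rng = np.random.default_rng(0)
truth = rng.normal(size=(5, 100))
pred = truth + rng.normal(scale=0.1, size=(5, 100))
print(mean_rowwise_error(truth, pred), mean_rowwise_correlation(truth, pred))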


21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Open Problems in Single-Cell Analysis

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
500 changes: 500 additions & 0 deletions README.md

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions _viash.yaml
@@ -0,0 +1,15 @@
viash_version: 0.8.6

source: src
target: target

config_mods: |
  .functionality.version := 'dev'
  .functionality.arguments[.multiple == true].multiple_sep := ';'
  .platforms[.type == 'docker'].target_registry := 'ghcr.io'
  .platforms[.type == 'docker'].target_organization := 'openproblems-bio/task_perturbation_prediction'
  .platforms[.type == 'docker'].target_image_source := 'https://github.com/openproblems-bio/task_perturbation_prediction'
  .platforms[.type == "nextflow"].directives.tag := "$id"
  .platforms[.type == "nextflow"].auto.simplifyOutput := false
  .platforms[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
  .platforms[.type == "nextflow"].config.script := "process.errorStrategy = 'ignore'"
3 changes: 3 additions & 0 deletions main.nf
@@ -0,0 +1,3 @@
workflow {
print("This is a dummy placeholder for pipeline execution. Please use the corresponding nf files for running pipelines.")
}
1 change: 1 addition & 0 deletions nextflow.config
@@ -0,0 +1 @@
process.container = 'nextflow/bash:latest'
42 changes: 42 additions & 0 deletions scripts/add_a_method.sh
@@ -0,0 +1,42 @@
#!/bin/bash

echo "This script is not supposed to be run directly."
echo "Please run the script step-by-step."
exit 1

# sync resources
scripts/download_resources.sh

# create a new component
method_id="my_method"
method_lang="python" # change this to "r" if need be

viash run src/common/create_component/config.vsh.yaml -- \
--language "$method_lang" \
--name "$method_id"

# TODO: fill in the required fields in src/methods/$method_id/config.vsh.yaml
# TODO: edit src/methods/$method_id/script.py (or script.R)

# test the component
viash test src/methods/$method_id/config.vsh.yaml

# rebuild the container (only needed if you changed something in the docker platform)
# You can reduce the memory and cpus allotted to jobs by modifying .platforms[.type == "nextflow"].config.labels in _viash.yaml
viash run src/methods/$method_id/config.vsh.yaml -- \
---setup cachedbuild ---verbose

# run the method (using h5ad as input)
viash run src/methods/$method_id/config.vsh.yaml -- \
--de_train_h5ad "resources/neurips-2023-kaggle/2023-09-12_de_by_cell_type_train.h5ad" \
--id_map "resources/neurips-2023-kaggle/id_map.csv" \
--output "output/prediction.h5ad"

# run evaluation metric
viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \
--de_test_h5ad "resources/neurips-2023-kaggle/de_test.h5ad" \
--prediction "output/prediction.h5ad" \
--output "output/score.h5ad"

# print score on kaggle test dataset
python -c 'import anndata; print(anndata.read_h5ad("output/score.h5ad").uns)'
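
The one-liner above just dumps the raw `.uns` mapping of the score file. As a slightly more readable alternative, here is a small hedged sketch that prints each entry on its own line; it assumes nothing beyond what the one-liner already does (the scores are stored in `.uns` of output/score.h5ad).

import anndata as ad

score = ad.read_h5ad("output/score.h5ad")

# print every entry stored in .uns, one per line, so individual
# metric values and metadata are easier to scan than a raw dict dump
for key, value in score.uns.items():
    print(f"{key}: {value}")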
3 changes: 3 additions & 0 deletions scripts/build_components.sh
@@ -0,0 +1,3 @@
#!/bin/bash

viash ns build --parallel --setup cachedbuild
14 changes: 14 additions & 0 deletions scripts/download_resources.sh
@@ -0,0 +1,14 @@
#!/bin/bash

set -e

echo ">> Downloading resources"
# aws s3 sync --no-sign-request \
# "s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/" \
# "resources" \
# --delete

viash run src/common/sync_test_resources/config.vsh.yaml -- \
--input "s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/" \
--output "resources" \
--delete
63 changes: 63 additions & 0 deletions scripts/generate_kaggle_resources.sh
@@ -0,0 +1,63 @@
#!/bin/bash

set -e

OUT=resources/neurips-2023-kaggle

[[ ! -d $OUT ]] && mkdir -p $OUT

if [[ ! -f "$OUT/2023-09-12_de_by_cell_type_test.h5ad" ]]; then
echo ">> Downloading data"
aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/2023-09-14_kaggle_upload/2023-09-12_de_by_cell_type_test.h5ad --no-sign-request $OUT/2023-09-12_de_by_cell_type_test.h5ad
aws s3 cp s3://openproblems-bio/public/neurips-2023-competition/2023-09-14_kaggle_upload/2023-09-12_de_by_cell_type_train.h5ad --no-sign-request $OUT/2023-09-12_de_by_cell_type_train.h5ad

# recompress h5ad files
python -c \
"import anndata as ad; ad.read_h5ad('$OUT/2023-09-12_de_by_cell_type_test.h5ad').write_h5ad('$OUT/2023-09-12_de_by_cell_type_test.h5ad', compression='gzip')"
python -c \
"import anndata as ad; ad.read_h5ad('$OUT/2023-09-12_de_by_cell_type_train.h5ad').write_h5ad('$OUT/2023-09-12_de_by_cell_type_train.h5ad', compression='gzip')"
fi

viash run src/process_dataset/convert_kaggle_h5ad_to_parquet/config.vsh.yaml -- \
--input_train "$OUT/2023-09-12_de_by_cell_type_train.h5ad" \
--input_test "$OUT/2023-09-12_de_by_cell_type_test.h5ad" \
--input_single_cell_h5ad "resources/neurips-2023-raw/sc_counts.h5ad" \
--output_train_h5ad "$OUT/de_train.h5ad" \
--output_test_h5ad "$OUT/de_test.h5ad" \
--output_id_map "$OUT/id_map.csv" \
--dataset_id neurips-2023-kaggle \
--dataset_name "NeurIPS2023 scPerturb DGE (Kaggle)" \
--dataset_summary 'Original Kaggle dataset' \
--dataset_description 'Original Kaggle dataset' \
--dataset_url TBD \
--dataset_reference TBD \
--dataset_organism homo_sapiens

echo ">> Run method"
viash run src/control_methods/mean_across_compounds/config.vsh.yaml -- \
--de_train_h5ad "$OUT/de_train.h5ad" \
--de_test_h5ad "$OUT/de_test.h5ad" \
--id_map "$OUT/id_map.csv" \
--output "$OUT/prediction.h5ad"

echo ">> Run metric"
viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \
--prediction "$OUT/prediction.h5ad" \
--de_test_h5ad "$OUT/de_test.h5ad" \
--output "$OUT/score.h5ad"

cat > "$OUT/state.yaml" <<'EOF'
id: neurips-2023-kaggle
de_train_h5ad: !file de_train.h5ad
de_test_h5ad: !file de_test.h5ad
id_map: !file id_map.csv
EOF

echo ">> Uploading results to S3"
aws s3 sync --profile op2 \
--include "*" \
--exclude "neurips-2023-raw/*" \
--exclude "neurips-2023-public/*" \
"resources" \
"s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/" \
--delete --dryrun
54 changes: 54 additions & 0 deletions scripts/generate_resources.sh
@@ -0,0 +1,54 @@
#!/bin/bash

set -e

IN=resources/neurips-2023-raw
OUT=resources/neurips-2023-data

[[ ! -d $IN ]] && mkdir -p $IN

if [[ ! -f "$IN/sc_counts_reannotated_with_counts.h5ad" ]]; then
  echo ">> Downloading 'sc_counts_reannotated_with_counts.h5ad'"
  aws s3 cp --no-sign-request \
    s3://openproblems-bio/public/neurips-2023-competition/sc_counts_reannotated_with_counts.h5ad \
    "$IN/sc_counts_reannotated_with_counts.h5ad"
fi

echo ">> Running 'process_dataset' workflow"
nextflow run \
target/nextflow/workflows/process_dataset/main.nf \
-profile docker \
-resume \
--id neurips-2023-data \
--sc_counts "$IN/sc_counts_reannotated_with_counts.h5ad" \
--dataset_id "neurips-2023-data" \
--dataset_name "NeurIPS2023 scPerturb DGE" \
--dataset_url "TBD" \
--dataset_reference "TBD" \
--dataset_summary "Differential gene expression sign(logFC) * -log10(p-value) values after 24 hours of treatment with 144 compounds in human PBMCs" \
--dataset_description "For this competition, we designed and generated a novel single-cell perturbational dataset in human peripheral blood mononuclear cells (PBMCs). We selected 144 compounds from the Library of Integrated Network-Based Cellular Signatures (LINCS) Connectivity Map dataset (PMID: 29195078) and measured single-cell gene expression profiles after 24 hours of treatment. The experiment was repeated in three healthy human donors, and the compounds were selected based on diverse transcriptional signatures observed in CD34+ hematopoietic stem cells (data not released). We performed this experiment in human PBMCs because the cells are commercially available with pre-obtained consent for public release and PBMCs are a primary, disease-relevant tissue that contains multiple mature cell types (including T-cells, B-cells, myeloid cells, and NK cells) with established markers for annotation of cell types. To supplement this dataset, we also measured cells from each donor at baseline with joint scRNA and single-cell chromatin accessibility measurements using the 10x Multiome assay. We hope that the addition of rich multi-omic data for each donor and cell type at baseline will help establish biological priors that explain the susceptibility of particular genes to exhibit perturbation responses in different biological contexts." \
--dataset_organism "homo_sapiens" \
--output_state "state.yaml" \
--publish_dir "$OUT"

echo ">> Run method"
viash run src/control_methods/mean_across_compounds/config.vsh.yaml -- \
--de_train_h5ad "$OUT/de_train.h5ad" \
--de_test_h5ad "$OUT/de_test.h5ad" \
--id_map "$OUT/id_map.csv" \
--output "$OUT/prediction.h5ad"

echo ">> Run metric"
viash run src/metrics/mean_rowwise_error/config.vsh.yaml -- \
--prediction "$OUT/prediction.h5ad" \
--de_test_h5ad "$OUT/de_test.h5ad" \
--output "$OUT/score.h5ad"

echo ">> Uploading results to S3"
aws s3 sync --profile op2 \
--include "*" \
--exclude "neurips-2023-raw/*" \
--exclude "neurips-2023-public/*" \
"resources" \
"s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/" \
--delete --dryrun
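
The dataset summary passed to the workflow above describes the DGE values as sign(logFC) * -log10(p-value). As a rough illustration of that transformation (not the workflow's actual code), the sketch below computes it from log fold changes and p-values, plus a clipped variant along the lines of the `clipped_sign_log10_pval` layer referenced elsewhere in this repository; the clipping threshold of 4.0 is an assumed example value, not taken from the workflow.

import numpy as np

def sign_log10_pval(logfc: np.ndarray, pval: np.ndarray) -> np.ndarray:
    # sign of the log fold change times the negative log10 of the p-value
    return np.sign(logfc) * -np.log10(pval)

def clipped_sign_log10_pval(logfc: np.ndarray, pval: np.ndarray, cap: float = 4.0) -> np.ndarray:
    # same quantity, clipped to [-cap, cap]; cap = 4.0 is an assumption
    return np.clip(sign_log10_pval(logfc, pval), -cap, cap)

logfc = np.array([2.3, -1.1, 0.4])
pval = np.array([1e-8, 0.003, 0.6])
print(sign_log10_pval(logfc, pval))          # approximately [ 8.   -2.52  0.22]
print(clipped_sign_log10_pval(logfc, pval))  # approximately [ 4.   -2.52  0.22]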
11 changes: 11 additions & 0 deletions scripts/render_readme.sh
@@ -0,0 +1,11 @@
#!/bin/bash

set -e

[[ ! -d ../openproblems-v2 ]] && echo "You need to clone the openproblems-v2 repository next to this repository" && exit 1

../openproblems-v2/target/docker/common/create_task_readme/create_task_readme \
--task "perturbation_prediction" \
--task_dir "src" \
--github_url "https://github.com/openproblems-bio/task_perturbation_prediction/tree/main/" \
--output "README.md"
28 changes: 28 additions & 0 deletions scripts/run_benchmark_test.sh
@@ -0,0 +1,28 @@
#!/bin/bash

export NXF_VER=23.04.2

resources_dir="resources"
publish_dir="output/test_run_benchmark"

cat > /tmp/params.yaml << HERE
param_list:
  - id: neurips-2023-data
    de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad"
    de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad"
    id_map: "$resources_dir/neurips-2023-data/id_map.csv"
    layer: clipped_sign_log10_pval
  - id: neurips-2023-kaggle
    de_train_h5ad: "$resources_dir/neurips-2023-kaggle/de_train.h5ad"
    de_test_h5ad: "$resources_dir/neurips-2023-kaggle/de_test.h5ad"
    id_map: "$resources_dir/neurips-2023-kaggle/id_map.csv"
    layer: sign_log10_pval
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

nextflow run . \
-main-script target/nextflow/workflows/run_benchmark/main.nf \
-profile docker \
-resume \
-params-file /tmp/params.yaml
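
Before launching, it can be useful to confirm that the files referenced in /tmp/params.yaml actually exist. The helper below is a hypothetical convenience script, not part of this repository; it only relies on the paths written by the heredoc above and requires PyYAML.

import os
import sys

import yaml  # PyYAML

with open("/tmp/params.yaml") as fh:
    params = yaml.safe_load(fh)

# collect every referenced input file that is missing on disk
missing = []
for entry in params.get("param_list", []):
    for key in ("de_train_h5ad", "de_test_h5ad", "id_map"):
        path = entry.get(key)
        if path and not os.path.exists(path):
            missing.append(f"{entry['id']}: {key} -> {path}")

if missing:
    print("Missing input files:")
    print("\n".join(missing))
    sys.exit(1)
print("All referenced input files exist.")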
30 changes: 30 additions & 0 deletions scripts/run_benchmark_tw.sh
@@ -0,0 +1,30 @@
#!/bin/bash

RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
resources_dir="s3://openproblems-bio/public/neurips-2023-competition/workflow-resources"
publish_dir="s3://openproblems-data/resources/perturbation_prediction/results/${RUN_ID}"

cat > /tmp/params.yaml << HERE
param_list:
  - id: neurips-2023-data
    de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad"
    de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad"
    id_map: "$resources_dir/neurips-2023-data/id_map.csv"
    layer: clipped_sign_log10_pval
  # - id: neurips-2023-kaggle
  #   de_train_h5ad: "$resources_dir/neurips-2023-kaggle/de_train.h5ad"
  #   de_test_h5ad: "$resources_dir/neurips-2023-kaggle/de_test.h5ad"
  #   id_map: "$resources_dir/neurips-2023-kaggle/id_map.csv"
  #   layer: sign_log10_pval
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task_perturbation_prediction.git \
--revision main_build \
--pull-latest \
--main-script target/nextflow/workflows/run_benchmark/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file /tmp/params.yaml \
--config src/common/nextflow_helpers/labels_tw.config
28 changes: 28 additions & 0 deletions scripts/run_benchmark_tw_traens.sh
@@ -0,0 +1,28 @@
#!/bin/bash

# todo: remove this before merging PR #64

RUN_ID="traens_$(date +%Y-%m-%d_%H-%M-%S)"
resources_dir="s3://openproblems-bio/public/neurips-2023-competition/workflow-resources"
publish_dir="s3://openproblems-data/resources/perturbation_prediction/results/${RUN_ID}"

cat > /tmp/params.yaml << HERE
param_list:
  - id: neurips-2023-data
    de_train_h5ad: "$resources_dir/neurips-2023-data/de_train.h5ad"
    de_test_h5ad: "$resources_dir/neurips-2023-data/de_test.h5ad"
    id_map: "$resources_dir/neurips-2023-data/id_map.csv"
    layer: clipped_sign_log10_pval
    method_ids: [transformer_ensemble]
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

tw launch https://github.com/openproblems-bio/task_perturbation_prediction.git \
--revision suggestions_elior_build \
--pull-latest \
--main-script target/nextflow/workflows/run_benchmark/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file /tmp/params.yaml \
--config src/common/nextflow_helpers/labels_tw.config
23 changes: 23 additions & 0 deletions scripts/run_layert_tw.sh
@@ -0,0 +1,23 @@
#!/bin/bash

RUN_ID="layert_$(date +%Y-%m-%d_%H-%M-%S)"
publish_dir="s3://openproblems-data/resources/perturbation_prediction/results/${RUN_ID}"

cat > /tmp/params.yaml << HERE
id: dge_perturbation_task
input_states: s3://openproblems-bio/public/neurips-2023-competition/workflow-resources/neurips-2023-data/state.yaml
output_state: "state.yaml"
publish_dir: "$publish_dir"
rename_keys: "de_train_h5ad:de_train_h5ad,de_test_h5ad:de_test_h5ad,id_map:id_map"
settings: '{"layer": "t"}'
HERE

tw launch https://github.com/openproblems-bio/task_perturbation_prediction.git \
--revision main_build \
--pull-latest \
--main-script target/nextflow/workflows/run_benchmark/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file /tmp/params.yaml \
--entry-name auto \
--config src/common/nextflow_helpers/labels_tw.config
19 changes: 19 additions & 0 deletions scripts/run_stability_test.sh
@@ -0,0 +1,19 @@
#!/bin/bash

export NXF_VER=23.04.2

cat > /tmp/params.yaml <<'HERE'
id: neurips-2023-data
sc_counts: resources/neurips-2023-raw/sc_counts_reannotated_with_counts.h5ad
method_ids: ['ground_truth', 'sample', 'mean_across_celltypes', 'mean_across_compounds']
layer: t # test a different layer
bootstrap_num_replicates: 2
publish_dir: "output/test_stability_analysis"
output_state: "state.yaml"
HERE

nextflow run . \
-main-script target/nextflow/workflows/run_stability_analysis/main.nf \
-profile docker \
-resume \
-params-file /tmp/params.yaml
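
The stability analysis above relies on bootstrap replicates (bootstrap_num_replicates: 2). As a rough sketch of the underlying idea, and not the workflow's actual implementation, the snippet below resamples rows with replacement and recomputes a score per replicate; the spread of the resulting scores indicates how stable a metric is on this dataset.

import numpy as np

def bootstrap_scores(y_true, y_pred, metric, num_replicates=2, seed=0):
    # resample rows with replacement and recompute the metric per replicate
    rng = np.random.default_rng(seed)
    n_rows = y_true.shape[0]
    scores = []
    for _ in range(num_replicates):
        idx = rng.integers(0, n_rows, size=n_rows)
        scores.append(metric(y_true[idx], y_pred[idx]))
    return np.array(scores)

# toy example with a simple mean squared error as the metric
mse = lambda t, p: float(np.mean((t - p) ** 2))
rng = np.random.default_rng(1)
truth = rng.normal(size=(20, 50))
pred = truth + rng.normal(scale=0.2, size=(20, 50))
print(bootstrap_scores(truth, pred, mse, num_replicates=2))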
