From f02ba12fe4c58636f9eb458aab7cbc116b21ffa3 Mon Sep 17 00:00:00 2001 From: Sai Nirmayi Yasa <92786623+sainirmayi@users.noreply.github.com> Date: Wed, 4 Sep 2024 20:19:14 +0200 Subject: [PATCH] Add spatially variable genes task (#462) * add spatially variable genes task * add file_output.ymal * add control method * add metrics * update file format * fix control method * add task info and generate readme * update example path * fix slot name * add preferred normalization * update input data path * add author info * fix path * update api * update path in script * add dataset processor and simple workflows * update path * Update readme * update path * Update api * Minor updates to scripts * Drop feature_name column and group by index feature_name * Update arguments * Fix workflow * Update test scripts * Update task readme * add moranI for spatially variable gene detection task * add SpatialDE * Add 10x visium dataset loader * add comment * add wip simulate svg * extend wf * update output options * add Sepal * Add example * Add initial 10x visium workflow * Add resource scripts for 10x visium * Remove gene name from method output * wip upgrade script * Add subsampling and processing * make script executable * set n_obs to 600 * add SPARK and SPARK-X * update api * update workflow and components * add nnSVG * update resource test script * rename resource test script * update resource script * rename .var columns * fix nnSVG * fix typo * add Spanve * add SOMDE * add SpaGFT * add SpaGCN * add scGCO * add scGCO * add BOOST-GP * fix BOOST-GP * add spatialDE2 * add GPcounts * update library.bib * add spot or gene filters * remove subsetting * update test resources * fix dataset id * update svg simulator * add feature id to .var in dataset * optionally output simulated dataset * sort svg method references * add info * fix .var slot * add doi * fix .var slot name * fix metric * fix positive control * fix methods * add feature id as index * add methods * compute svg for 
reference * update common dataset loader * minor fixes * select top variable genes as reference for simulation * update test value * update dataset processor to select reference genes * Add info * use k 50 for test resource generation * update test value * fix output file destination * update dataset description * add spatial_10x_xenium.sh * fix import error in GPcounts * update GPcounts to all genes * update spatialDE2 config * Using normalized data in Sepal * update SpaGFT to use normalized data * rename component * add filters * fix command * rename components to match funcionality name * minor fix * update benchmark workflow * minor fix * update component name * add slideseq v2 datasets * update dataset url * minor updates * add metadata * add DBiT-seq datasets * update DBiT-seq and slideseq v2 datasets * add MERFISH datasets * add data loader for DBiT-seq * add loaders for MERFISH datasets * add slide-tags datasets * add stereo-seq datasets * add STARmap datasets * add seqFISH dataset * add visium and xenium dataset references * add dataset reference * minor fix * update docker setup * minor updates * update test value * add dataset reference to .uns slot * update docker setup * add dataset reference * add dataset references * create single component for all datasets from zenodo * update dataset loader and processor for slide tags datasets * add feature name to .var * use feature name as var names when feature id not available * use feature name instead of feature id * add resource scripts * add feature name to .var * revert back to using feature id * add datasets * update nextflow directives * update .var slots * update input arguments * fix typo * change to hightime * update readme * add negative control * update test default * fix column name * add time label veryhightime * add negative control * revert back to label hightime * update time labels * add veryhightime label to test * minor updates * check if matrix is sparse before converting to dense matrix * 
minor updates * minor update * Add task description * update metric description * get author info * add doi to the preprint and update list of authors * add protocol to dataset name * add two 10x visium datasets * update datasets * update simulate script * update simulation script * rename gene id to feature id * update api * update dataset id * include boostgp * minor fix * update api and task readme * update dataset resource * update scripts for data resource * update normalization function to allow None as a parameter * update resource scripts * update parameters for each dataset * update parameter for sepal * add arguments * fix random seed for simulation * add coord_type and max_neighs arguments * update arguments * fix workflow * update intro * update links * update task info * update dataset ids and dataset loader scripts * rename dataset loader --------- Co-authored-by: lzj1769 Co-authored-by: Robrecht Cannoodt --- _viash.yaml | 2 +- src/common/comp_tests/check_method_config.py | 2 +- src/common/comp_tests/run_and_check_adata.py | 12 +- src/common/library.bib | 469 ++++++++++++++++++ .../get_task_info/script.R | 5 +- .../loaders/tenx_visium/config.vsh.yaml | 96 ++++ src/datasets/loaders/tenx_visium/script.py | 82 +++ src/datasets/loaders/tenx_visium/test.py | 57 +++ .../loaders/zenodo_spatial/config.vsh.yaml | 87 ++++ src/datasets/loaders/zenodo_spatial/script.py | 85 ++++ src/datasets/loaders/zenodo_spatial/test.py | 55 ++ .../zenodo_spatial_slidetags/config.vsh.yaml | 88 ++++ .../zenodo_spatial_slidetags/script.py | 103 ++++ .../loaders/zenodo_spatial_slidetags/test.py | 55 ++ .../normalization/log_cp/config.vsh.yaml | 2 +- src/datasets/normalization/log_cp/script.py | 20 +- src/datasets/resource_scripts/tenx_visium.sh | 316 ++++++++++++ .../resource_scripts/zenodo_spatial.sh.sh | 414 ++++++++++++++++ .../zenodo_spatial_slidetags.sh | 82 +++ .../mouse_brain_coronal_section1.sh | 37 ++ .../resource_test_scripts/slideseq_test.sh | 36 ++ 
.../process_tenx_visium/config.vsh.yaml | 142 ++++++ .../workflows/process_tenx_visium/main.nf | 133 +++++ .../process_zenodo_spatial/config.vsh.yaml | 138 ++++++ .../workflows/process_zenodo_spatial/main.nf | 132 +++++ .../config.vsh.yaml | 138 ++++++ .../process_zenodo_spatial_slidetags/main.nf | 132 +++++ src/tasks/spatially_variable_genes/README.md | 335 +++++++++++++ .../api/comp_control_method.yaml | 34 ++ .../api/comp_method.yaml | 25 + .../api/comp_metric.yaml | 31 ++ .../api/comp_process_dataset.yaml | 27 + .../api/file_common_dataset.yaml | 58 +++ .../api/file_dataset.yaml | 40 ++ .../api/file_output.yaml | 30 ++ .../api/file_score.yaml | 25 + .../api/file_simulated_dataset.yaml | 66 +++ .../api/file_solution.yaml | 57 +++ .../api/task_info.yaml | 47 ++ .../random_ranking/config.vsh.yaml | 25 + .../control_methods/random_ranking/script.py | 28 ++ .../true_ranking/config.vsh.yaml | 25 + .../control_methods/true_ranking/script.py | 25 + .../methods/boostgp/config.vsh.yaml | 48 ++ .../methods/boostgp/script.R | 50 ++ .../methods/gpcounts/config.vsh.yaml | 56 +++ .../methods/gpcounts/script.py | 92 ++++ .../methods/moran_i/config.vsh.yaml | 40 ++ .../methods/moran_i/script.py | 44 ++ .../methods/nnsvg/config.vsh.yaml | 31 ++ .../methods/nnsvg/script.R | 71 +++ .../methods/scgco/config.vsh.yaml | 69 +++ .../methods/scgco/script.py | 63 +++ .../methods/sepal/config.vsh.yaml | 46 ++ .../methods/sepal/script.py | 40 ++ .../methods/somde/config.vsh.yaml | 37 ++ .../methods/somde/script.py | 53 ++ .../methods/spagcn/config.vsh.yaml | 47 ++ .../methods/spagcn/script.py | 132 +++++ .../methods/spagft/config.vsh.yaml | 59 +++ .../methods/spagft/script.py | 44 ++ .../methods/spanve/config.vsh.yaml | 45 ++ .../methods/spanve/script.py | 33 ++ .../methods/spark/config.vsh.yaml | 30 ++ .../methods/spark/script.R | 75 +++ .../methods/spark_x/config.vsh.yaml | 35 ++ .../methods/spark_x/script.R | 57 +++ .../methods/spatialde/config.vsh.yaml | 39 ++ 
.../methods/spatialde/script.py | 53 ++ .../methods/spatialde2/config.vsh.yaml | 53 ++ .../methods/spatialde2/script.py | 51 ++ .../metrics/correlation/config.vsh.yaml | 32 ++ .../metrics/correlation/script.py | 37 ++ .../select_reference/config.vsh.yaml | 51 ++ .../select_reference/script.py | 36 ++ .../simulate_svg/config.vsh.yaml | 46 ++ .../process_dataset/simulate_svg/script.R | 196 ++++++++ .../split_dataset/config.vsh.yaml | 38 ++ .../process_dataset/split_dataset/script.py | 34 ++ .../resources_scripts/process_datasets.sh | 110 ++++ .../resources_scripts/run_benchmark.sh | 64 +++ .../mouse_brain_coronal_section1.sh | 43 ++ .../process_datasets/config.vsh.yaml | 67 +++ .../workflows/process_datasets/main.nf | 86 ++++ .../workflows/process_datasets/run_test.sh | 32 ++ .../workflows/run_benchmark/config.vsh.yaml | 87 ++++ .../workflows/run_benchmark/main.nf | 197 ++++++++ .../workflows/run_benchmark/run_test.sh | 28 ++ 88 files changed, 6461 insertions(+), 14 deletions(-) create mode 100644 src/datasets/loaders/tenx_visium/config.vsh.yaml create mode 100644 src/datasets/loaders/tenx_visium/script.py create mode 100644 src/datasets/loaders/tenx_visium/test.py create mode 100644 src/datasets/loaders/zenodo_spatial/config.vsh.yaml create mode 100644 src/datasets/loaders/zenodo_spatial/script.py create mode 100644 src/datasets/loaders/zenodo_spatial/test.py create mode 100644 src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml create mode 100644 src/datasets/loaders/zenodo_spatial_slidetags/script.py create mode 100644 src/datasets/loaders/zenodo_spatial_slidetags/test.py create mode 100755 src/datasets/resource_scripts/tenx_visium.sh create mode 100755 src/datasets/resource_scripts/zenodo_spatial.sh.sh create mode 100755 src/datasets/resource_scripts/zenodo_spatial_slidetags.sh create mode 100755 src/datasets/resource_test_scripts/mouse_brain_coronal_section1.sh create mode 100755 src/datasets/resource_test_scripts/slideseq_test.sh create mode 100644 
src/datasets/workflows/process_tenx_visium/config.vsh.yaml create mode 100644 src/datasets/workflows/process_tenx_visium/main.nf create mode 100644 src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml create mode 100644 src/datasets/workflows/process_zenodo_spatial/main.nf create mode 100644 src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml create mode 100644 src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf create mode 100644 src/tasks/spatially_variable_genes/README.md create mode 100644 src/tasks/spatially_variable_genes/api/comp_control_method.yaml create mode 100644 src/tasks/spatially_variable_genes/api/comp_method.yaml create mode 100644 src/tasks/spatially_variable_genes/api/comp_metric.yaml create mode 100644 src/tasks/spatially_variable_genes/api/comp_process_dataset.yaml create mode 100644 src/tasks/spatially_variable_genes/api/file_common_dataset.yaml create mode 100644 src/tasks/spatially_variable_genes/api/file_dataset.yaml create mode 100644 src/tasks/spatially_variable_genes/api/file_output.yaml create mode 100644 src/tasks/spatially_variable_genes/api/file_score.yaml create mode 100644 src/tasks/spatially_variable_genes/api/file_simulated_dataset.yaml create mode 100644 src/tasks/spatially_variable_genes/api/file_solution.yaml create mode 100644 src/tasks/spatially_variable_genes/api/task_info.yaml create mode 100644 src/tasks/spatially_variable_genes/control_methods/random_ranking/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/control_methods/random_ranking/script.py create mode 100644 src/tasks/spatially_variable_genes/control_methods/true_ranking/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/control_methods/true_ranking/script.py create mode 100644 src/tasks/spatially_variable_genes/methods/boostgp/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/boostgp/script.R create mode 100644 
src/tasks/spatially_variable_genes/methods/gpcounts/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/gpcounts/script.py create mode 100644 src/tasks/spatially_variable_genes/methods/moran_i/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/moran_i/script.py create mode 100644 src/tasks/spatially_variable_genes/methods/nnsvg/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/nnsvg/script.R create mode 100644 src/tasks/spatially_variable_genes/methods/scgco/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/scgco/script.py create mode 100644 src/tasks/spatially_variable_genes/methods/sepal/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/sepal/script.py create mode 100644 src/tasks/spatially_variable_genes/methods/somde/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/somde/script.py create mode 100644 src/tasks/spatially_variable_genes/methods/spagcn/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/spagcn/script.py create mode 100644 src/tasks/spatially_variable_genes/methods/spagft/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/spagft/script.py create mode 100644 src/tasks/spatially_variable_genes/methods/spanve/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/spanve/script.py create mode 100644 src/tasks/spatially_variable_genes/methods/spark/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/spark/script.R create mode 100644 src/tasks/spatially_variable_genes/methods/spark_x/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/spark_x/script.R create mode 100644 src/tasks/spatially_variable_genes/methods/spatialde/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/spatialde/script.py create mode 100644 
src/tasks/spatially_variable_genes/methods/spatialde2/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/methods/spatialde2/script.py create mode 100644 src/tasks/spatially_variable_genes/metrics/correlation/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/metrics/correlation/script.py create mode 100644 src/tasks/spatially_variable_genes/process_dataset/select_reference/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/process_dataset/select_reference/script.py create mode 100644 src/tasks/spatially_variable_genes/process_dataset/simulate_svg/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/process_dataset/simulate_svg/script.R create mode 100644 src/tasks/spatially_variable_genes/process_dataset/split_dataset/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/process_dataset/split_dataset/script.py create mode 100755 src/tasks/spatially_variable_genes/resources_scripts/process_datasets.sh create mode 100755 src/tasks/spatially_variable_genes/resources_scripts/run_benchmark.sh create mode 100755 src/tasks/spatially_variable_genes/resources_test_scripts/mouse_brain_coronal_section1.sh create mode 100644 src/tasks/spatially_variable_genes/workflows/process_datasets/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/workflows/process_datasets/main.nf create mode 100644 src/tasks/spatially_variable_genes/workflows/process_datasets/run_test.sh create mode 100644 src/tasks/spatially_variable_genes/workflows/run_benchmark/config.vsh.yaml create mode 100644 src/tasks/spatially_variable_genes/workflows/run_benchmark/main.nf create mode 100755 src/tasks/spatially_variable_genes/workflows/run_benchmark/run_test.sh diff --git a/_viash.yaml b/_viash.yaml index b4621a2b92..0f0a8fa8f6 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -10,5 +10,5 @@ config_mods: | .platforms[.type == 'docker'].target_image_source := 
'https://github.com/openproblems-bio/openproblems-v2' .platforms[.type == "nextflow"].directives.tag := "$id" .platforms[.type == "nextflow"].auto.simplifyOutput := false - .platforms[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h" } + .platforms[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } .platforms[.type == "nextflow"].config.script := "process.errorStrategy = 'ignore'" \ No newline at end of file diff --git a/src/common/comp_tests/check_method_config.py b/src/common/comp_tests/check_method_config.py index 04f3962bf3..a30111d648 100644 --- a/src/common/comp_tests/check_method_config.py +++ b/src/common/comp_tests/check_method_config.py @@ -15,7 +15,7 @@ _MISSING_DOIS = ["vandermaaten2008visualizing", "hosmer2013applied"] -TIME_LABELS = ["lowtime", "midtime", "hightime"] +TIME_LABELS = ["lowtime", "midtime", "hightime", "veryhightime"] MEM_LABELS = ["lowmem", "midmem", "highmem"] CPU_LABELS = ["lowcpu", "midcpu", "highcpu"] diff --git a/src/common/comp_tests/run_and_check_adata.py b/src/common/comp_tests/run_and_check_adata.py index f076cd3b19..d2cda5af94 100644 --- a/src/common/comp_tests/run_and_check_adata.py +++ b/src/common/comp_tests/run_and_check_adata.py @@ -77,6 +77,7 @@ def run_and_check(arguments, cmd): for arg in config["functionality"]["arguments"]: new_arg = arg.copy() + arg_info = new_arg.get("info") or {} # set clean name clean_name = re.sub("^--", "", arg["name"]) @@ -89,7 +90,9 @@ def run_and_check(arguments, cmd): else: value = f"{clean_name}.h5ad" new_arg["value"] = value - + elif 
"test_default" in arg_info: + new_arg["value"] = arg_info["test_default"] + arguments.append(new_arg) @@ -115,7 +118,10 @@ def run_and_check(arguments, cmd): # construct command cmd = [ meta["executable"] ] for arg in argset_args: - if arg["type"] == "file": - cmd.extend([arg["name"], arg["value"]]) + if "value" in arg: + value = arg["value"] + if arg["multiple"] and isinstance(value, list): + value = arg["multiple_sep"].join(value) + cmd.extend([arg["name"], str(value)]) run_and_check(argset_args, cmd) \ No newline at end of file diff --git a/src/common/library.bib b/src/common/library.bib index 313bfff56d..af730fe8cd 100644 --- a/src/common/library.bib +++ b/src/common/library.bib @@ -6,6 +6,22 @@ @misc{10x2018pbmc } +@misc{10x2019heart, + title = {Human Heart}, + author = {{10x Genomics}}, + year = {2019}, + url = {https://www.10xgenomics.com/datasets/human-heart-1-standard-1-0-0} +} + + +@misc{10x2019lymph, + title = {Human Lymph Node}, + author = {{10x Genomics}}, + year = {2019}, + url = {https://www.10xgenomics.com/datasets/human-lymph-node-1-standard-1-0-0} +} + + @misc{10x2019pbmc, title = {5k Peripheral Blood Mononuclear Cells (PBMCs) from a Healthy Donor with a Panel of TotalSeq-B Antibodies (v3 chemistry)}, author = {{10x Genomics}}, @@ -14,6 +30,150 @@ @misc{10x2019pbmc } +@misc{10x2020breast, + title = {Human Breast Cancer: Whole Transcriptome Analysis}, + author = {{10x Genomics}}, + year = {2020}, + url = {https://www.10xgenomics.com/datasets/human-breast-cancer-whole-transcriptome-analysis-1-standard-1-2-0} +} + + +@misc{10x2020cerebellum, + title = {Human Cerebellum: Whole Transcriptome Analysis}, + author = {{10x Genomics}}, + year = {2020}, + url = {https://www.10xgenomics.com/datasets/human-cerebellum-whole-transcriptome-analysis-1-standard-1-2-0} +} + + +@misc{10x2020kidney, + title = {Mouse Kidney Section (Coronal)}, + author = {{10x Genomics}}, + year = {2020}, + url = 
{https://www.10xgenomics.com/datasets/mouse-kidney-section-coronal-1-standard-1-1-0} +} + + +@misc{10x2021breast, + title = {Human Breast Cancer: Ductal Carcinoma In Situ, Invasive Carcinoma (FFPE)}, + author = {{10x Genomics}}, + year = {2021}, + url = {https://www.10xgenomics.com/datasets/human-breast-cancer-ductal-carcinoma-in-situ-invasive-carcinoma-ffpe-1-standard-1-3-0} +} + + +@misc{10x2021prostate, + title = {Normal Human Prostate (FFPE)}, + author = {{10x Genomics}}, + year = {2021}, + url = {https://www.10xgenomics.com/datasets/normal-human-prostate-ffpe-1-standard-1-3-0} +} + + +@misc{10x2022brain, + title = {Mouse Brain Coronal Section 1 (FFPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/mouse-brain-coronal-section-1-ffpe-2-standard} +} + + +@misc{10x2022cervical, + title = {Human Cervical Cancer (FFPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/human-cervical-cancer-1-standard} +} + + +@misc{10x2022olfactory, + title = {Adult Mouse Olfactory Bulb}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/adult-mouse-olfactory-bulb-1-standard-1} +} + + +@misc{10x2022intestine, + title = {Human Intestine Cancer (FPPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/human-intestine-cancer-1-standard} +} + + +@misc{10x2022melanoma, + title = {Human Melanoma, IF Stained (FFPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/human-melanoma-if-stained-ffpe-2-standard} +} + + +@misc{10x2022prostate, + title = {Human Prostate Cancer, Adjacent Normal Section with IF Staining (FFPE)}, + author = {{10x Genomics}}, + year = {2022}, + url = {https://www.10xgenomics.com/datasets/human-prostate-cancer-adjacent-normal-section-with-if-staining-ffpe-1-standard} +} + + +@misc{10x2023brain, + title = {Human Brain Cancer, 11 mm Capture Area (FFPE)}, 
+ author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/human-brain-cancer-11-mm-capture-area-ffpe-2-standard} +} + + +@misc{10x2023colon, + title = {Visium CytAssist Gene Expression Libraries of Post-Xenium Human Colon Cancer (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/visium-cytassist-gene-expression-libraries-of-post-xenium-human-colon-cancer-ffpe-using-the-human-whole-transcriptome-probe-set-2-standard} +} + + +@misc{10x2023colorectal, + title = {Human Colorectal Cancer, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/human-colorectal-cancer-11-mm-capture-area-ffpe-2-standard} +} + + +@misc{10x2023embryo, + title = {Visium CytAssist, Mouse Embryo, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/visium-cytassist-mouse-embryo-11-mm-capture-area-ffpe-2-standard} +} + + +@misc{10x2023kidney, + title = {Human Kidney, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/human-kidney-11-mm-capture-area-ffpe-2-standard} +} + + +@misc{10x2023lung, + title = {Human Lung Cancer, 11 mm Capture Area (FFPE)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/human-lung-cancer-11-mm-capture-area-ffpe-2-standard} +} + + +@misc{10x2023mousebrain, + title = {Visium CytAssist Gene Expression Libraries of Post-Xenium Mouse Brain (FF)}, + author = {{10x Genomics}}, + year = {2023}, + url = {https://www.10xgenomics.com/datasets/visium-cytassist-gene-expression-libraries-of-post-xenium-mouse-brain-ff-using-the-mouse-whole-transcriptome-probe-set-2-standard} +} + + @article{agostinis2022newwave, doi = {10.1093/bioinformatics/btac149}, url = {https://doi.org/10.1093/bioinformatics/btac149}, @@ -85,6 +245,19 @@ @article{andersson2020single } 
+@article{andersson2021sepal, + title={sepal: Identifying transcript profiles with spatial patterns by diffusion-based modeling}, + author={Andersson, Alma and Lundeberg, Joakim}, + journal={Bioinformatics}, + volume={37}, + number={17}, + pages={2644--2650}, + year={2021}, + publisher={Oxford University Press}, + doi={10.1093/bioinformatics/btab164} +} + + @string{apr = {Apr.}} @@ -119,6 +292,22 @@ @article{biancalani2021deep } +@article{bintayyash2021non, + author = {BinTayyash, Nuha and Georgaka, Sokratia and John, S T and Ahmed, Sumon and Boukouvalas, Alexis and Hensman, James and Rattray, Magnus}, + title = "{Non-parametric modelling of temporal and spatial counts data from RNA-seq experiments}", + journal = {Bioinformatics}, + volume = {37}, + number = {21}, + pages = {3788-3795}, + year = {2021}, + month = {07}, + issn = {1367-4803}, + doi = {10.1093/bioinformatics/btab486}, + url = {https://doi.org/10.1093/bioinformatics/btab486}, + eprint = {https://academic.oup.com/bioinformatics/article-pdf/37/21/3788/50336570/btab486.pdf}, +} + + @article{bland2000odds, title = {Statistics Notes: The odds ratio}, author = {J. M. Bland}, @@ -204,6 +393,17 @@ @misc{cannoodt2021viashfromscripts } +@article{cai2023spanve, + title={Spanve: an Statistical Method to Detect Clustering-friendly Spatially Variable Genes in Large-scale Spatial Transcriptomics Data}, + author={Cai, Guoxin and Chen, Yichang and Chen, Shuqing and Gu, Xun and Zhou, Zhan}, + journal={bioRxiv}, + pages={2023--02}, + year={2023}, + publisher={Cold Spring Harbor Laboratory}, + doi={10.1101/2023.02.08.527623} +} + + @article{cao2018joint, title = {Joint profiling of chromatin accessibility and gene expression in thousands of single cells}, author = {Junyue Cao and Darren A. Cusanovich and Vijay Ramani and Delasa Aghamirzaie and Hannah A. Pliner and Andrew J. Hill and Riza M. Daza and Jose L. McFaline-Figueroa and Jonathan S. Packer and Lena Christiansen and Frank J. Steemers and Andrew C. 
Adey and Cole Trapnell and Jay Shendure}, @@ -244,6 +444,17 @@ @article{chai2014root } +@article{chang2022spatial, + title={Spatial omics representation and functional tissue module inference using graph Fourier transform}, + author={Chang, Yuzhou and Liu, Jixin and Ma, Anjun and Jiang, Sizun and Krull, Jordan and Yeo, Yao Yu and Liu, Yang and Rodig, Scott J and Barouch, Dan H and Fan, Rong and others}, + journal={bioRxiv}, + pages={2022--12}, + year={2022}, + publisher={Cold Spring Harbor Laboratory}, + doi={10.1101/2022.12.10.519929} +} + + @article{chazarragil2021flexible, doi = {10.1093/nar/gkab004}, url = {https://doi.org/10.1093/nar/gkab004}, @@ -431,6 +642,22 @@ @article{eraslan2019single } +@article{fang2022conservation, + title = {Conservation and divergence of cortical cell organization in human and mouse revealed by MERFISH}, + volume = {377}, + ISSN = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.abm1741}, + DOI = {10.1126/science.abm1741}, + number = {6601}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Fang, Rongxin and Xia, Chenglong and Close, Jennie L. and Zhang, Meng and He, Jiang and Huang, Zhengkai and Halpern, Aaron R. and Long, Brian and Miller, Jeremy A. and Lein, Ed S. 
and Zhuang, Xiaowei}, + year = {2022}, + month = jul, + pages = {56-62} +} + + @string{feb = {Feb.}} @@ -567,6 +794,19 @@ @article{hao2021integrated } +@article{hao2021somde, + title={SOMDE: a scalable method for identifying spatially variable genes with self-organizing map}, + author={Hao, Minsheng and Hua, Kui and Zhang, Xuegong}, + journal={Bioinformatics}, + volume={37}, + number={23}, + pages={4392--4398}, + year={2021}, + publisher={Oxford University Press}, + doi={10.1093/bioinformatics/btab471} +} + + @article{hie2019efficient, title = {Efficient integration of heterogeneous single-cell transcriptomes using Scanorama}, author = {Brian Hie and Bryan Bryson and Bonnie Berger}, @@ -665,6 +905,19 @@ @article{hubert1985comparing } +@article{hu2021spagcn, + title={SpaGCN: Integrating gene expression, spatial location and histology to identify spatial domains and spatially variable genes by graph convolutional network}, + author={Hu, Jian and Li, Xiangjie and Coleman, Kyle and Schroeder, Amelia and Ma, Nan and Irwin, David J and Lee, Edward B and Shinohara, Russell T and Li, Mingyao}, + journal={Nature methods}, + volume={18}, + number={11}, + pages={1342--1351}, + year={2021}, + publisher={Nature Publishing Group US New York}, + doi={10.1038/s41592-021-01255-8} +} + + @string{jan = {Jan}} @@ -674,6 +927,17 @@ @string{jul @string{jun = {Jun.}} +@article{kats2021spatialde2, + title={SpatialDE2: fast and localized variance component analysis of spatial transcriptomics}, + author={Kats, Ilia and Vento-Tormo, Roser and Stegle, Oliver}, + journal={Biorxiv}, + pages={2021--10}, + year={2021}, + publisher={Cold Spring Harbor Laboratory}, + doi={10.1101/2021.10.27.466045} +} + + @article{kendall1938new, doi = {10.1093/biomet/30.1-2.81}, url = {https://doi.org/10.1093/biomet/30.1-2.81}, @@ -763,6 +1027,18 @@ @article{kruskal1964mds } +@article{kuppe2022spatial, + title={Spatial multi-omic map of human myocardial infarction}, + author={Kuppe, Christoph and Ramirez Flores, 
Ricardo O and Li, Zhijian and Hayat, Sikander and Levinson, Rebecca T and Liao, Xian and Hannani, Monica T and Tanevski, Jovan and W{\"u}nnemann, Florian and Nagai, James S and others}, + journal={Nature}, + volume={608}, + number={7924}, + pages={766--777}, + year={2022}, + publisher={Nature Publishing Group UK London} +} + + @article{lance2022multimodal, title = {Multimodal single cell data integration challenge: results and lessons learned}, author = {Lance, Christopher and Luecken, Malte D. and Burkhardt, Daniel B. and Cannoodt, Robrecht and Rautenstrauch, Pia and Laddach, Anna and Ubingazhibov, Aidyn and Cao, Zhi-Jie and Deng, Kaiwen and Khan, Sumeer and Liu, Qiao and Russkikh, Nikolay and Ryazantsev, Gleb and Ohler, Uwe and , and Pisco, Angela Oliveira and Bloom, Jonathan and Krishnaswamy, Smita and Theis, Fabian J.}, @@ -810,6 +1086,23 @@ @article{lee2009quality } +@article{li2021bayesian, + author = {Li, Qiwei and Zhang, Minzhe and Xie, Yang and Xiao, Guanghua}, + title = "{Bayesian modeling of spatial molecular profiling data via Gaussian process}", + journal = {Bioinformatics}, + volume = {37}, + number = {22}, + pages = {4129-4136}, + year = {2021}, + month = {06}, + abstract = "{The location, timing and abundance of gene expression (both mRNA and proteins) within a tissue define the molecular mechanisms of cell functions. Recent technology breakthroughs in spatial molecular profiling, including imaging-based technologies and sequencing-based technologies, have enabled the comprehensive molecular characterization of single cells while preserving their spatial and morphological contexts. This new bioinformatics scenario calls for effective and robust computational methods to identify genes with spatial patterns. We represent a novel Bayesian hierarchical model to analyze spatial transcriptomics data, with several unique characteristics. 
It models the zero-inflated and over-dispersed counts by deploying a zero-inflated negative binomial model that greatly increases model stability and robustness. Besides, the Bayesian inference framework allows us to borrow strength in parameter estimation in a de novo fashion. As a result, the proposed model shows competitive performances in accuracy and robustness over existing methods in both simulation studies and two real data applications. The related R/C++ source code is available at https://github.com/Minzhe/BOOST-GP. Supplementary data are available at Bioinformatics online. }", + issn = {1367-4803}, + doi = {10.1093/bioinformatics/btab455}, + url = {https://doi.org/10.1093/bioinformatics/btab455}, + eprint = {https://academic.oup.com/bioinformatics/article-pdf/37/22/4129/50335106/btab455.pdf}, +} + + @article{linderman2018zero, title = {Zero-preserving imputation of scRNA-seq data using low-rank approximation}, author = {Linderman, George C. and Zhao, Jun and Kluger, Yuval}, @@ -823,6 +1116,38 @@ @article{linderman2018zero } +@article{liu2020high, + title = {High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue}, + volume = {183}, + ISSN = {0092-8674}, + url = {http://dx.doi.org/10.1016/j.cell.2020.10.026}, + DOI = {10.1016/j.cell.2020.10.026}, + number = {6}, + journal = {Cell}, + publisher = {Elsevier BV}, + author = {Liu, Yang and Yang, Mingyu and Deng, Yanxiang and Su, Graham and Enninful, Archibald and Guo, Cindy C. 
and Tebaldi, Toma and Zhang, Di and Kim, Dongjoo and Bai, Zhiliang and Norris, Eileen and Pan, Alisia and Li, Jiatong and Xiao, Yang and Halene, Stephanie and Fan, Rong}, + year = {2020}, + month = dec, + pages = {1665--1681.e18} +} + + +@article{lohoff2021integration, + title = {Integration of spatial and single-cell transcriptomic data elucidates mouse organogenesis}, + volume = {40}, + ISSN = {1546-1696}, + url = {http://dx.doi.org/10.1038/s41587-021-01006-2}, + DOI = {10.1038/s41587-021-01006-2}, + number = {1}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media LLC}, + author = {Lohoff, T. and Ghazanfar, S. and Missarova, A. and Koulena, N. and Pierson, N. and Griffiths, J. A. and Bardot, E. S. and Eng, C.-H. L. and Tyser, R. C. V. and Argelaguet, R. and Guibentif, C. and Srinivas, S. and Briscoe, J. and Simons, B. D. and Hadjantonakis, A.-K. and G\"{o}ttgens, B. and Reik, W. and Nichols, J. and Cai, L. and Marioni, J. C.}, + year = {2021}, + month = sep, + pages = {74-85} +} + + @article{lopez2018deep, title = {Deep generative modeling for single-cell transcriptomics}, author = {Romain Lopez and Jeffrey Regier and Michael B. Cole and Michael I. 
Jordan and Nir Yosef}, @@ -989,6 +1314,7 @@ @article{nestorowa2016single url = {https://doi.org/10.1182/blood-2016-05-716480} } + @inproceedings{luecken2021neurips, author = {Luecken, Malte and Burkhardt, Daniel and Cannoodt, Robrecht and Lance, Christopher and Agrawal, Aditi and Aliee, Hananeh and Chen, Ann and Deconinck, Louise and Detweiler, Angela and Granados, Alejandro and Huynh, Shelly and Isacco, Laura and Kim, Yang and Klein, Dominik and DE KUMAR, BONY and Kuppasani, Sunil and Lickert, Heiko and McGeever, Aaron and Melgarejo, Joaquin and Mekonen, Honey and Morri, Maurizio and M\"{u}ller, Michaela and Neff, Norma and Paul, Sheryl and Rieck, Bastian and Schneider, Kaylie and Steelman, Scott and Sterr, Michael and Treacy, Daniel and Tong, Alexander and Villani, Alexandra-Chloe and Wang, Guilin and Yan, Jia and Zhang, Ce and Pisco, Angela and Krishnaswamy, Smita and Theis, Fabian and Bloom, Jonathan M}, booktitle = {Proceedings of the Neural Information Processing Systems Track on Datasets and Benchmarks}, @@ -1031,6 +1357,19 @@ @misc{openproblems } +@article{palla2022squidpy, + title={Squidpy: a scalable framework for spatial omics analysis}, + author={Palla, Giovanni and Spitzer, Hannah and Klein, Michal and Fischer, David and Schaar, Anna Christina and Kuemmerle, Louis Benedikt and Rybakov, Sergei and Ibarra, Ignacio L and Holmberg, Olle and Virshup, Isaac and others}, + journal={Nature methods}, + volume={19}, + number={2}, + pages={171--178}, + year={2022}, + publisher={Nature Publishing Group US New York}, + doi={10.1038/s41592-021-01358-2} +} + + @article{pearson1895regression, doi = {10.1098/rspl.1895.0041}, title = {VII. 
Note on regression and inheritance in the case of two parents}, @@ -1116,6 +1455,22 @@ @article{rodriques2019slide } +@article{russell2023slide, + title = {Slide-tags enables single-nucleus barcoding for multimodal spatial genomics}, + volume = {625}, + ISSN = {1476-4687}, + url = {http://dx.doi.org/10.1038/s41586-023-06837-4}, + DOI = {10.1038/s41586-023-06837-4}, + number = {7993}, + journal = {Nature}, + publisher = {Springer Science and Business Media LLC}, + author = {Russell, Andrew J. C. and Weir, Jackson A. and Nadaf, Naeem M. and Shabet, Matthew and Kumar, Vipin and Kambhampati, Sandeep and Raichur, Ruth and Marrero, Giovanni J. and Liu, Sophia and Balderrama, Karol S. and Vanderburg, Charles R. and Shanmugam, Vignesh and Tian, Luyi and Iorgulescu, J. Bryan and Yoon, Charles H. and Wu, Catherine J. and Macosko, Evan Z. and Chen, Fei}, + year = {2023}, + month = dec, + pages = {101--109} +} + + @InProceedings{santos2009on, author = {Santos, Jorge M. and Embrechts, Mark"}, editor = {Alippi, Cesare and Polycarpou, Marios and Panayiotou, Christos and Ellinas, Georgios}, @@ -1177,6 +1532,22 @@ @inproceedings{stanley2020harmonic } +@article{stickels2020highly, + title = {Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2}, + volume = {39}, + ISSN = {1546-1696}, + url = {http://dx.doi.org/10.1038/s41587-020-0739-1}, + DOI = {10.1038/s41587-020-0739-1}, + number = {3}, + journal = {Nature Biotechnology}, + publisher = {Springer Science and Business Media LLC}, + author = {Stickels, Robert R. and Murray, Evan and Kumar, Pawan and Li, Jilong and Marshall, Jamie L. and Di Bella, Daniela J. and Arlotta, Paola and Macosko, Evan Z. 
and Chen, Fei}, + year = {2020}, + month = dec, + pages = {313--319} +} + + @article{stoeckius2017simultaneous, title = {Simultaneous epitope and transcriptome measurement in single cells}, author = {Marlon Stoeckius and Christoph Hafemeister and William Stephenson and Brian Houck-Loomis and Pratip K Chattopadhyay and Harold Swerdlow and Rahul Satija and Peter Smibert}, @@ -1204,6 +1575,32 @@ @article{stuart2019comprehensive } +@article{sun2020statistical, + title={Statistical analysis of spatial expression patterns for spatially resolved transcriptomic studies}, + author={Sun, Shiquan and Zhu, Jiaqiang and Zhou, Xiang}, + journal={Nature methods}, + volume={17}, + number={2}, + pages={193--200}, + year={2020}, + publisher={Nature Publishing Group US New York}, + doi={10.1038/s41592-019-0701-7} +} + + +@article{svensson2018spatialde, + title={SpatialDE: identification of spatially variable genes}, + author={Svensson, Valentine and Teichmann, Sarah A and Stegle, Oliver}, + journal={Nature methods}, + volume={15}, + number={5}, + pages={343--346}, + year={2018}, + publisher={Nature Publishing Group}, + doi={10.1038/nmeth.4636} +} + + @article{szubert2019structurepreserving, title = {Structure-preserving visualisation of high dimensional single-cell datasets}, author = {Benjamin Szubert and Jennifer E. Cole and Claudia Monaco and Ignat Drozdov}, @@ -1420,6 +1817,50 @@ @article{wang2017visualization } +@article{wang2018three, + title = {Three-dimensional intact-tissue sequencing of single-cell transcriptional states}, + volume = {361}, + ISSN = {1095-9203}, + url = {http://dx.doi.org/10.1126/science.aat5691}, + DOI = {10.1126/science.aat5691}, + number = {6400}, + journal = {Science}, + publisher = {American Association for the Advancement of Science (AAAS)}, + author = {Wang, Xiao and Allen, William E. and Wright, Matthew A. and Sylwestrak, Emily L. 
and Samusik, Nikolay and Vesuna, Sam and Evans, Kathryn and Liu, Cindy and Ramakrishnan, Charu and Liu, Jia and Nolan, Garry P. and Bava, Felice-Alessio and Deisseroth, Karl}, + year = {2018}, + month = jul +} + + +@article{wang2022high, + title = {High-resolution 3D spatiotemporal transcriptomic maps of developing Drosophila embryos and larvae}, + volume = {57}, + ISSN = {1534-5807}, + url = {http://dx.doi.org/10.1016/j.devcel.2022.04.006}, + DOI = {10.1016/j.devcel.2022.04.006}, + number = {10}, + journal = {Developmental Cell}, + publisher = {Elsevier BV}, + author = {Wang, Mingyue and Hu, Qinan and Lv, Tianhang and Wang, Yuhang and Lan, Qing and Xiang, Rong and Tu, Zhencheng and Wei, Yanrong and Han, Kai and Shi, Chang and Guo, Junfu and Liu, Chao and Yang, Tao and Du, Wensi and An, Yanru and Cheng, Mengnan and Xu, Jiangshan and Lu, Haorong and Li, Wangsheng and Zhang, Shaofang and Chen, Ao and Chen, Wei and Li, Yuxiang and Wang, Xiaoshan and Xu, Xun and Hu, Yuhui and Liu, Longqi}, + year = {2022}, + month = may, + pages = {1271--1283.e4} +} + + +@article{weber2023nnsvg, + title={nnSVG for the scalable identification of spatially variable genes using nearest-neighbor Gaussian processes}, + author={Weber, Lukas M and Saha, Arkajyoti and Datta, Abhirup and Hansen, Kasper D and Hicks, Stephanie C}, + journal={Nature communications}, + volume={14}, + number={1}, + pages={4059}, + year={2023}, + publisher={Nature Publishing Group UK London}, + doi={10.1038/s41467-023-39748-z} +} + + @article{welch2019single, title = {Single-Cell Multi-omic Integration Compares and Contrasts Features of Brain Cell Identity}, author = {Joshua D. Welch and Velina Kozareva and Ashley Ferreira and Charles Vanderburg and Carly Martin and Evan Z. 
Macosko}, @@ -1535,6 +1976,33 @@ @article{zhang2021pydrmetrics url = {https://doi.org/10.1016/j.heliyon.2021.e06199} } + +@article{zhang2022identification, + title={Identification of spatially variable genes with graph cuts}, + author={Zhang, Ke and Feng, Wanwan and Wang, Peng}, + journal={Nature Communications}, + volume={13}, + number={1}, + pages={5488}, + year={2022}, + publisher={Nature Publishing Group UK London}, + doi={10.1038/s41467-022-33182-3} +} + + +@article{zhu2021spark, + title={SPARK-X: non-parametric modeling enables scalable and robust detection of spatial expression patterns for large spatial transcriptomic studies}, + author={Zhu, Jiaqiang and Sun, Shiquan and Zhou, Xiang}, + journal={Genome biology}, + volume={22}, + number={1}, + pages={184}, + year={2021}, + publisher={Springer}, + doi={10.1186/s13059-021-02404-0} +} + + @article {hrovatin2023delineating, author = {Karin Hrovatin and Aim{\'e}e Bastidas-Ponce and Mostafa Bakhti and Luke Zappia and Maren B{\"u}ttner and Ciro Sallino and Michael Sterr and Anika B{\"o}ttcher and Adriana Migliorini and Heiko Lickert and Fabian J. Theis}, title = {Delineating mouse β-cell identity during lifetime and in diabetes with a single cell atlas}, @@ -1720,3 +2188,4 @@ @article{chari2023speciousart month = aug, pages = {e1011288} } + diff --git a/src/common/process_task_results/get_task_info/script.R b/src/common/process_task_results/get_task_info/script.R index 16137707fb..cfe529edfc 100644 --- a/src/common/process_task_results/get_task_info/script.R +++ b/src/common/process_task_results/get_task_info/script.R @@ -20,7 +20,8 @@ out <- list( task_name = info$label, task_summary = info$summary, task_description = paste0(info$motivation, "\n\n", info$description), - repo = "openproblems-bio/openproblems-v2" + repo = "openproblems-bio/openproblems-v2", + authors = info$authors ) # show warning when certain data is missing and return null? 
@@ -36,4 +37,4 @@ jsonlite::write_json( par$output, auto_unbox = TRUE, pretty = TRUE -) \ No newline at end of file +) diff --git a/src/datasets/loaders/tenx_visium/config.vsh.yaml b/src/datasets/loaders/tenx_visium/config.vsh.yaml new file mode 100644 index 0000000000..ba28b32b89 --- /dev/null +++ b/src/datasets/loaders/tenx_visium/config.vsh.yaml @@ -0,0 +1,96 @@ +functionality: + name: tenx_visium + namespace: datasets/loaders + description: | + Download a SpaceRanger h5 gene expression file and spatial imaging data from the 10x genomics website (or someplace else). + + argument_groups: + - name: Inputs + arguments: + - name: "--input_expression" + type: string + description: URL to the feature / barcode matrix HDF5 of the 10x dataset. + required: true + - name: "--input_spatial" + type: string + description: URL to the Spatial imaging data of the 10x dataset. + required: true + - name: Outputs + arguments: + - name: "--dataset" + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. 
+ arguments: + - name: "--spot_filter_min_genes" + type: integer + description: Remove spots with fewer than this number of genes. + required: false + example: 200 + - name: "--spot_filter_min_counts" + type: integer + description: Remove spots with fewer than this number of counts. + required: false + - name: "--gene_filter_min_spots" + type: integer + description: Remove genes expressed in fewer than this number of spots. + required: false + example: 50 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with fewer than this number of counts. + required: false + - name: "--remove_mitochondrial" + type: boolean + description: Remove mitochondrial genes? + required: false + + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - squidpy + - type: nextflow diff --git a/src/datasets/loaders/tenx_visium/script.py b/src/datasets/loaders/tenx_visium/script.py new file mode 100644 index 0000000000..7de04e6b5e --- /dev/null +++ b/src/datasets/loaders/tenx_visium/script.py @@ -0,0 +1,82 @@ +import subprocess +import squidpy as sq +import tempfile +import scanpy as sc + +## VIASH START +par = { + "input_expression": "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_filtered_feature_bc_matrix.h5", + "input_spatial": "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_spatial.tar.gz", + "dataset_id": "tenx_visium/mouse_brain_coronal_section1_visium", + "dataset_name": "Mouse Brain Coronal Section 1 (FFPE)", + "dataset_url": "https://www.10xgenomics.com/datasets/mouse-brain-coronal-section-1-ffpe-2-standard", + "dataset_summary": "Gene expression library of Mouse Brain (CytAssist FFPE) using the Mouse Whole Transcriptome Probe Set", + 
"dataset_organism": "Mus musculus", + "dataset": "dataset.h5ad", + "spot_filter_min_genes": 200, + "gene_filter_min_spots": 50, + "remove_mitochondrial": True +} +meta = { + "functionality_name": "tenx_visium" +} +## VIASH END + +print(f"Downloading data", flush=True) +with tempfile.TemporaryDirectory() as tempdir: + input_exp = "feature_bc_matrix.h5" + input_sp = "image_data.tar.gz" + epx_data = subprocess.run(["wget", "-O", f"{tempdir}/{input_exp}", par['input_expression']], stderr=subprocess.STDOUT) + sp_data = subprocess.run(["wget", "-O", f"{tempdir}/{input_sp}", par['input_spatial']], stderr=subprocess.STDOUT) + extract_spatial = subprocess.run(["tar", "-xzf", f"{tempdir}/{input_sp}", "-C", tempdir], stderr=subprocess.STDOUT) + + # Read visium data and create anndata object + adata = sq.read.visium(path=tempdir, counts_file=input_exp) + +# Make variable names unique +adata.var_names_make_unique() + +sc.pp.calculate_qc_metrics(adata, inplace=True) + +print("Filtering spots or genes") +t0 = adata.shape +# remove cells with few counts +if par["spot_filter_min_counts"]: + sc.pp.filter_cells(adata, min_counts=par["spot_filter_min_counts"], inplace=True) +# remove cells with few genes +if par["spot_filter_min_genes"]: + sc.pp.filter_cells(adata, min_genes=par["spot_filter_min_genes"], inplace=True) +# remove genes that have few counts +if par["gene_filter_min_counts"]: + sc.pp.filter_genes(adata, min_counts=par["gene_filter_min_counts"], inplace=True) +# remove genes that are found in few cells +if par["gene_filter_min_spots"]: + sc.pp.filter_genes(adata, min_cells=par["gene_filter_min_spots"], inplace=True) +t1 = adata.shape +print(f"Removed {t0[0] - t1[0]} cells and {(t0[1] - t1[1])} genes.") + +if par["remove_mitochondrial"]: + print("Removing mitochondrial genes") + non_mito_genes_list = [name for name in adata.var_names if not (name.startswith('MT-') or name.startswith('mt-'))] + adata = adata[:, non_mito_genes_list] + +# Rename .var columns 
+adata.var['feature_name'] = adata.var_names +adata.var.set_index(adata.var['gene_ids'], inplace=True) +adata.var.rename(columns={"gene_ids": "feature_id"}, inplace=True) + +# Move counts to .layers +print("Add metadata to uns", flush=True) +adata.layers["counts"] = adata.X +adata.X = None + +# Add metadata +print("Add metadata to uns", flush=True) +metadata_fields = ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"] +for key in metadata_fields: + if key in par: + print(f"Setting .uns['{key}']", flush=True) + adata.uns[key] = par[key] + +print("Writing adata to file", flush=True) +adata.write_h5ad(par["dataset"], compression="gzip") \ No newline at end of file diff --git a/src/datasets/loaders/tenx_visium/test.py b/src/datasets/loaders/tenx_visium/test.py new file mode 100644 index 0000000000..a559ae1d3d --- /dev/null +++ b/src/datasets/loaders/tenx_visium/test.py @@ -0,0 +1,57 @@ +import os +import subprocess +import anndata as ad + +input_expression ="https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_filtered_feature_bc_matrix.h5" +input_spatial = "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_spatial.tar.gz" +dataset_id = "10x_visium/mouse_brain_coronal_section1" +dataset_name = "Mouse Brain Coronal Section 1 (FFPE)" +dataset_url = "https://www.10xgenomics.com/datasets/mouse-brain-coronal-section-1-ffpe-2-standard" +dataset_summary = "Gene expression library of Mouse Brain (CytAssist FFPE) using the Mouse Whole Transcriptome Probe Set" +dataset_description = "CytAssist_FFPE_Mouse_Brain_Rep1 - Gene expression library of Mouse Brain (CytAssist FFPE) using the Mouse Whole Transcriptome Probe Set" +dataset_organism = "Mus musculus" +dataset = "dataset.h5ad" + +print(">> Running script", flush=True) +out = subprocess.run( + [ + meta['executable'], + 
"--input_expression", input_expression, + "--input_spatial", input_spatial, + "--dataset_id", dataset_id, + "--dataset_name", dataset_name, + "--dataset_url", dataset_url, + "--dataset_summary", dataset_summary, + "--dataset_description", dataset_description, + "--dataset_organism", dataset_organism, + "--dataset", dataset + ], + stderr=subprocess.STDOUT +) + +if out.stdout: + print(out.stdout, flush=True) + +if out.returncode: + print(f"script: '{out.args}' exited with an error.", flush=True) + exit(out.returncode) + +print(">> Checking whether output file exists", flush=True) +assert os.path.exists(dataset), "Output does not exist" + +print(">> Read output anndata", flush=True) +adata = ad.read_h5ad(dataset) + +print(adata) + +print(">> Check that output fits expected API", flush=True) +assert adata.X is None, "adata.X should be None/empty" +assert "counts" in adata.layers, "Counts layer not found in .layers" +assert adata.uns["dataset_id"] == dataset_id, f"Expected {dataset_id} as value" +assert adata.uns["dataset_name"] == dataset_name, f"Expected {dataset_name} as value" +assert adata.uns["dataset_url"] == dataset_url, f"Expected {dataset_url} as value" +assert adata.uns["dataset_summary"] == dataset_summary, f"Expected {dataset_summary} as value" +assert adata.uns["dataset_organism"] == dataset_organism, f"Expected {dataset_organism} as value" +assert 'spatial' in adata.obsm, "Spatial spot coordinates not found in .obsm" + +print(">> All tests passed successfully", flush=True) diff --git a/src/datasets/loaders/zenodo_spatial/config.vsh.yaml b/src/datasets/loaders/zenodo_spatial/config.vsh.yaml new file mode 100644 index 0000000000..e4204802e1 --- /dev/null +++ b/src/datasets/loaders/zenodo_spatial/config.vsh.yaml @@ -0,0 +1,87 @@ +functionality: + name: zenodo_spatial + namespace: datasets/loaders + description: | + Download an Anndata file containing DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. 
+ argument_groups: + - name: Inputs + arguments: + - name: "--input_data" + type: string + description: URL to the Anndata file. + required: true + - name: Outputs + arguments: + - name: "--dataset" + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: "--spot_filter_min_genes" + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: "--spot_filter_min_counts" + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: "--gene_filter_min_spots" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: "--remove_mitochondrial" + type: boolean + description: Remove mitochondrial genes? 
+ required: false + + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 + - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial/script.py b/src/datasets/loaders/zenodo_spatial/script.py new file mode 100644 index 0000000000..83aeb86056 --- /dev/null +++ b/src/datasets/loaders/zenodo_spatial/script.py @@ -0,0 +1,85 @@ +import subprocess +import tempfile +import scanpy as sc + +# VIASH START +par = { + "input_data": "ps://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1", + "dataset_id": "zenodo_spatial/mouse_olfactory_bulb_puck_slideseqv2", + "dataset_name": "Mouse Olfactory Bulk Puck", + "dataset_url": "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary", + "dataset_summary": "Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2", + "dataset_organism": "Mus musculus", + "dataset": "dataset.h5ad", + "spot_filter_min_genes": 10, + "gene_filter_min_spots": 500, + "remove_mitochondrial": True +} +meta = { + "functionality_name": "zenodo_spatial" +} +# VIASH END + +print(f"Downloading data", flush=True) +with tempfile.TemporaryDirectory() as tempdir: + input_data = "input_data.h5ad" + epx_data = subprocess.run(["wget", "-O", f"{tempdir}/{input_data}", par['input_data']], stderr=subprocess.STDOUT) + adata = sc.read_h5ad(filename=f"{tempdir}/{input_data}") + +# Make variable names unique +adata.var_names_make_unique() + +sc.pp.calculate_qc_metrics(adata, inplace=True, percent_top=None) + +print("Filtering spots or genes") +t0 = adata.shape +# remove cells with few counts +if par["spot_filter_min_counts"]: + sc.pp.filter_cells( + adata, 
min_counts=par["spot_filter_min_counts"], inplace=True) + +# remove cells with few genes +if par["spot_filter_min_genes"]: + sc.pp.filter_cells( + adata, min_genes=par["spot_filter_min_genes"], inplace=True) + +# remove genes that have few counts +if par["gene_filter_min_counts"]: + sc.pp.filter_genes( + adata, min_counts=par["gene_filter_min_counts"], inplace=True) + +# remove genes that are found in few cells +if par["gene_filter_min_spots"]: + sc.pp.filter_genes( + adata, min_cells=par["gene_filter_min_spots"], inplace=True) + +t1 = adata.shape +print(f"Removed {t0[0] - t1[0]} cells and {(t0[1] - t1[1])} genes.") + +if par["remove_mitochondrial"]: + print("Removing mitochondrial genes") + non_mito_genes_list = [name for name in adata.var_names if not ( + name.startswith('MT-') or name.startswith('mt-'))] + adata = adata[:, non_mito_genes_list] + +# Rename .var columns +adata.var['feature_name'] = adata.var_names +if('gene_ids' in adata.var): + adata.var.set_index(adata.var['gene_ids'], inplace=True) + adata.var.rename(columns={"gene_ids": "feature_id"}, inplace=True) + +# Move counts to .layers +print("Add metadata to uns", flush=True) +adata.layers["counts"] = adata.X +adata.X = None + +# Add metadata +print("Add metadata to uns", flush=True) +metadata_fields = ["dataset_id", "dataset_name", "dataset_url", "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"] +for key in metadata_fields: + if key in par: + print(f"Setting .uns['{key}']", flush=True) + adata.uns[key] = par[key] + +print("Writing adata to file", flush=True) +adata.write_h5ad(par["dataset"], compression="gzip") diff --git a/src/datasets/loaders/zenodo_spatial/test.py b/src/datasets/loaders/zenodo_spatial/test.py new file mode 100644 index 0000000000..07dcd953a8 --- /dev/null +++ b/src/datasets/loaders/zenodo_spatial/test.py @@ -0,0 +1,55 @@ +import os +import subprocess +import anndata as ad + +input_data 
="https://zenodo.org/records/12784832/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" +dataset_id = "zenodo_spatial/mouse_olfactory_bulb_puck" +dataset_name = "mouse_olfactory_bulb_puck" +dataset_url = "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" +dataset_summary = "Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2" +dataset_description = "Gene expression library of mouse olfactory bulk puck profiled using Slide-seq V2" +dataset_organism = "Mus musculus" +dataset = "dataset.h5ad" + +print(">> Running script", flush=True) +out = subprocess.run( + [ + meta['executable'], + "--input_data", input_data, + "--dataset_id", dataset_id, + "--dataset_name", dataset_name, + "--dataset_url", dataset_url, + "--dataset_summary", dataset_summary, + "--dataset_description", dataset_description, + "--dataset_organism", dataset_organism, + "--dataset", dataset + ], + stderr=subprocess.STDOUT +) + +if out.stdout: + print(out.stdout, flush=True) + +if out.returncode: + print(f"script: '{out.args}' exited with an error.", flush=True) + exit(out.returncode) + +print(">> Checking whether output file exists", flush=True) +assert os.path.exists(dataset), "Output does not exist" + +print(">> Read output anndata", flush=True) +adata = ad.read_h5ad(dataset) + +print(adata) + +print(">> Check that output fits expected API", flush=True) +assert adata.X is None, "adata.X should be None/empty" +assert "counts" in adata.layers, "Counts layer not found in .layers" +assert adata.uns["dataset_id"] == dataset_id, f"Expected {dataset_id} as value" +assert adata.uns["dataset_name"] == dataset_name, f"Expected {dataset_name} as value" +assert adata.uns["dataset_url"] == dataset_url, f"Expected {dataset_url} as value" +assert adata.uns["dataset_summary"] == dataset_summary, 
f"Expected {dataset_summary} as value" +assert adata.uns["dataset_organism"] == dataset_organism, f"Expected {dataset_organism} as value" +assert 'spatial' in adata.obsm, "Spatial spot coordinates not found in .obsm" + +print(">> All tests passed successfully", flush=True) diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml b/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml new file mode 100644 index 0000000000..8a6dfb189e --- /dev/null +++ b/src/datasets/loaders/zenodo_spatial_slidetags/config.vsh.yaml @@ -0,0 +1,88 @@ +functionality: + name: zenodo_spatial_slidetags + namespace: datasets/loaders + description: | + Download a compressed file containing gene expression matrix and spatial locations from zenodo. + + argument_groups: + - name: Inputs + arguments: + - name: "--input_data" + type: string + description: URL to the file. + required: true + - name: Outputs + arguments: + - name: "--dataset" + type: file + direction: output + description: Output h5ad file + required: true + example: dataset.h5ad + - name: Metadata + arguments: + - name: "--dataset_id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. 
+ arguments: + - name: "--spot_filter_min_genes" + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: "--spot_filter_min_counts" + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: "--gene_filter_min_spots" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: "--remove_mitochondrial" + type: boolean + description: Remove mitochondrial genes? + required: false + + resources: + - type: python_script + path: script.py + test_resources: + - type: python_script + path: test.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_images/python:1.1.0 + - type: nextflow diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/script.py b/src/datasets/loaders/zenodo_spatial_slidetags/script.py new file mode 100644 index 0000000000..5a8cf212fa --- /dev/null +++ b/src/datasets/loaders/zenodo_spatial_slidetags/script.py @@ -0,0 +1,103 @@ +import subprocess +import pandas as pd +import tempfile +import scanpy as sc + +# VIASH START +par = { + "input_data": "https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1", + "dataset_id": "zenodo_spatial_slidetags/human_cortex_slidetags", + "dataset_name": "slidetag_human_cortex", + "dataset_url": "https://www.nature.com/articles/s41586-023-06837-4", + "dataset_summary": "Slide-tags enables single-nucleus barcoding for multimodal spatial genomics", + "dataset_organism": "Homo sapiens", + "dataset": "dataset.h5ad", + "spot_filter_min_genes": 200, + "gene_filter_min_spots": 50, + "remove_mitochondrial": True +} +meta = { + "functionality_name": "zenodo_spatial_slidetags" +} +# VIASH END + +print(f"Downloading data", flush=True) +with 
tempfile.TemporaryDirectory() as tempdir: + input_data = "input_data.tar.gz" + dataset_name = par['dataset_name'] + epx_data = subprocess.run( + ["wget", "-O", f"{tempdir}/{input_data}", par['input_data']], stderr=subprocess.STDOUT) + extract_spatial = subprocess.run( + ["tar", "-xzf", f"{tempdir}/{input_data}", "-C", tempdir, "--strip-components=1"], stderr=subprocess.STDOUT) + + # Read gene expression and create anndata object + adata = sc.read_10x_mtx(path=tempdir) + + # Read spatial locations + df = pd.read_csv(f"{tempdir}/spatial.csv", skiprows=1) + df = df.set_index('TYPE') + df.columns = ['spatial1', 'spatial2', 'cell_type'] + + # add spatial locations to anndata object + sel_cells = list(set(df.index) & set(adata.obs_names)) + + df = df.loc[sel_cells, ] + adata = adata[sel_cells, ] + + adata.obs = df + adata.obsm['spatial'] = df[['spatial2', 'spatial1']].values + +# Make variable names unique +adata.var_names_make_unique() + +sc.pp.calculate_qc_metrics(adata, inplace=True) + +print("Filtering spots or genes") +t0 = adata.shape +# remove cells with few counts +if par["spot_filter_min_counts"]: + sc.pp.filter_cells( + adata, min_counts=par["spot_filter_min_counts"], inplace=True) +# remove cells with few genes +if par["spot_filter_min_genes"]: + sc.pp.filter_cells( + adata, min_genes=par["spot_filter_min_genes"], inplace=True) +# remove genes that have few counts +if par["gene_filter_min_counts"]: + sc.pp.filter_genes( + adata, min_counts=par["gene_filter_min_counts"], inplace=True) +# remove genes that are found in few cells +if par["gene_filter_min_spots"]: + sc.pp.filter_genes( + adata, min_cells=par["gene_filter_min_spots"], inplace=True) +t1 = adata.shape +print(f"Removed {t0[0] - t1[0]} cells and {(t0[1] - t1[1])} genes.") + +if par["remove_mitochondrial"]: + print("Removing mitochondrial genes") + non_mito_genes_list = [name for name in adata.var_names if not ( + name.startswith('MT-') or name.startswith('mt-'))] + adata = adata[:, non_mito_genes_list] 
+ + +# Rename .var columns +adata.var['feature_name'] = adata.var_names +adata.var.set_index(adata.var['gene_ids'], inplace=True) +adata.var.rename(columns={"gene_ids": "feature_id"}, inplace=True) + +# Move counts to .layers +print("Move counts to .layers", flush=True) +adata.layers["counts"] = adata.X +adata.X = None + +# Add metadata +print("Add metadata to uns", flush=True) +metadata_fields = ["dataset_id", "dataset_name", "dataset_url", + "dataset_reference", "dataset_summary", "dataset_description", "dataset_organism"] +for key in metadata_fields: + if key in par: + print(f"Setting .uns['{key}']", flush=True) + adata.uns[key] = par[key] + +print("Writing adata to file", flush=True) +adata.write_h5ad(par["dataset"], compression="gzip") diff --git a/src/datasets/loaders/zenodo_spatial_slidetags/test.py b/src/datasets/loaders/zenodo_spatial_slidetags/test.py new file mode 100644 index 0000000000..9f859ebea6 --- /dev/null +++ b/src/datasets/loaders/zenodo_spatial_slidetags/test.py @@ -0,0 +1,55 @@ +import os +import subprocess +import anndata as ad + +input_data ="https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1" +dataset_id = "zenodo_spatial_slidetags/human_cortex" +dataset_name = "slidetag_human_cortex" +dataset_url = "https://www.nature.com/articles/s41586-023-06837-4" +dataset_summary = "Slide-tags enables single-nucleus barcoding for multimodal spatial genomics" +dataset_description = "A 100 mm2 region of the human prefrontal cortex from a neurotypical donor aged 78 years was profiled by Slide-tags" +dataset_organism = "Homo sapiens" +dataset = "dataset.h5ad" + +print(">> Running script", flush=True) +out = subprocess.run( + [ + meta['executable'], + "--input_data", input_data, + "--dataset_id", dataset_id, + "--dataset_name", dataset_name, + "--dataset_url", dataset_url, + "--dataset_summary", dataset_summary, + "--dataset_description", dataset_description, + "--dataset_organism", dataset_organism, + "--dataset", dataset + ], + 
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True +) + +if out.stdout: + print(out.stdout, flush=True) + +if out.returncode: + print(f"script: '{out.args}' exited with an error.", flush=True) + exit(out.returncode) + +print(">> Checking whether output file exists", flush=True) +assert os.path.exists(dataset), "Output does not exist" + +print(">> Read output anndata", flush=True) +adata = ad.read_h5ad(dataset) + +print(adata) + +print(">> Check that output fits expected API", flush=True) +assert adata.X is None, "adata.X should be None/empty" +assert "counts" in adata.layers, "Counts layer not found in .layers" +assert adata.uns["dataset_id"] == dataset_id, f"Expected {dataset_id} as value" +assert adata.uns["dataset_name"] == dataset_name, f"Expected {dataset_name} as value" +assert adata.uns["dataset_url"] == dataset_url, f"Expected {dataset_url} as value" +assert adata.uns["dataset_summary"] == dataset_summary, f"Expected {dataset_summary} as value" +assert adata.uns["dataset_organism"] == dataset_organism, f"Expected {dataset_organism} as value" +assert 'spatial' in adata.obsm, "Spatial spot coordinates not found in .obsm" + +print(">> All tests passed successfully", flush=True) diff --git a/src/datasets/normalization/log_cp/config.vsh.yaml b/src/datasets/normalization/log_cp/config.vsh.yaml index a040258526..89b2a283f9 100644 --- a/src/datasets/normalization/log_cp/config.vsh.yaml +++ b/src/datasets/normalization/log_cp/config.vsh.yaml @@ -9,7 +9,7 @@ functionality: - name: "--n_cp" type: integer default: 1e4 - description: "Number of counts per cell" + description: "Number of counts per cell. When set to -1, will use None." 
platforms: - type: docker image: openproblems/base_python:1.0.0 diff --git a/src/datasets/normalization/log_cp/script.py b/src/datasets/normalization/log_cp/script.py index 905c91e976..39ddf61636 100644 --- a/src/datasets/normalization/log_cp/script.py +++ b/src/datasets/normalization/log_cp/script.py @@ -17,12 +17,20 @@ adata = sc.read_h5ad(par['input']) print(">> Normalize data", flush=True) -norm = sc.pp.normalize_total( - adata, - target_sum=par["n_cp"], - layer="counts", - inplace=False -) +if par["n_cp"] == -1: + norm = sc.pp.normalize_total( + adata, + target_sum=None, + layer="counts", + inplace=False + ) +else: + norm = sc.pp.normalize_total( + adata, + target_sum=par["n_cp"], + layer="counts", + inplace=False + ) lognorm = sc.pp.log1p(norm["X"]) print(">> Store output in adata", flush=True) diff --git a/src/datasets/resource_scripts/tenx_visium.sh b/src/datasets/resource_scripts/tenx_visium.sh new file mode 100755 index 0000000000..7993cebd4b --- /dev/null +++ b/src/datasets/resource_scripts/tenx_visium.sh @@ -0,0 +1,316 @@ +#!/bin/bash + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: tenx_visium/mouse_brain_coronal_section1_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_spatial.tar.gz" +# dataset_name: 10X Visium - Mouse Brain Coronal +# dataset_url: "https://www.10xgenomics.com/datasets/mouse-brain-coronal-section-1-ffpe-2-standard" +# dataset_summary: Gene expression library of Mouse Brain (CytAssist FFPE) using the Mouse Whole Transcriptome Probe Set +# dataset_description: "FFPE Mouse Brain tissue blocks sectioned as described in Visium CytAssist Spatial Gene Expression for FFPE - Tissue Preparation Guide Demonstrated Protocol. 
The H&E stained glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression slide. The probe extension and library construction steps follow the standard Visium for FFPE workflow outside of the instrument. The H&E image was acquired using Olympus VS200 Slide Scanning Microscope. Sequencing depth was 53,497 reads per spot. Sequencing configuration: 28bp read 1 (16bp Visium spatial barcode, 12bp UMI), 90bp read 2 (transcript), 10bp i7 sample barcode and 10bp i5 sample barcode. Key metrics include: 2,310 spots detected under tissue; 6,736 median genes per spot; 24,862 median UMI counts per spot." +# dataset_reference: 10x2022brain +# dataset_organism: Mus musculus +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: tenx_visium/human_colorectal_cancer_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Colorectal_Cancer/CytAssist_11mm_FFPE_Human_Colorectal_Cancer_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Colorectal_Cancer/CytAssist_11mm_FFPE_Human_Colorectal_Cancer_spatial.tar.gz" +# dataset_name: 10X Visium - Human Colorectal Cancer +# dataset_url: "https://www.10xgenomics.com/datasets/human-colorectal-cancer-11-mm-capture-area-ffpe-2-standard" +# dataset_summary: Gene expression library of Human Colorectal Cancer (CytAssist FFPE) using the Human Whole Transcriptome Probe Set +# dataset_description: "The tissue was sectioned as described in the Visium CytAssist Spatial Gene Expression for FFPE Tissue Preparation Guide (CG000518). Tissue section of 5 µm was placed on a standard glass slide, then stained following the Deparaffinization, H&E Staining, Imaging & Decrosslinking Demonstrated Protocol (CG000520). 
The glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression Slide v2, with 11 mm capture areas following the Visium CytAssist Spatial Gene Expression Reagent Kits User Guide (CG000495)." +# dataset_reference: 10x2023colorectal +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: tenx_visium/human_heart_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Heart/V1_Human_Heart_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Heart/V1_Human_Heart_spatial.tar.gz" +# dataset_name: 10X Visium - Human Heart +# dataset_url: "https://www.10xgenomics.com/datasets/human-heart-1-standard-1-0-0" +# dataset_summary: V1_Human_Heart +# dataset_description: "10x Genomics obtained fresh frozen human heart tissue from BioIVT Asterand. The tissue was embedded and cryosectioned as described in Visium Spatial Protocols - Tissue Preparation Guide Demonstrated Protocol (CG000240). Tissue sections of 10 µm thickness were placed on Visium Gene Expression Slides." 
+# dataset_reference: 10x2019heart +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: tenx_visium/mouse_embryo_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.1.0/CytAssist_11mm_FFPE_Mouse_Embryo/CytAssist_11mm_FFPE_Mouse_Embryo_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.1.0/CytAssist_11mm_FFPE_Mouse_Embryo/CytAssist_11mm_FFPE_Mouse_Embryo_spatial.tar.gz" +# dataset_name: 10X Visium - Mouse Embryo +# dataset_url: "https://www.10xgenomics.com/datasets/visium-cytassist-mouse-embryo-11-mm-capture-area-ffpe-2-standard" +# dataset_summary: Gene expression library of Mouse Embryo (CytAssist FFPE) using the Mouse Whole Transcriptome Probe Set +# dataset_description: "The tissue was sectioned as described in Visium CytAssist Spatial Gene Expression for FFPE Tissue Preparation Guide Demonstrated Protocol CG000518. Tissue sections of 5 µm was placed on a standard glass slide, and H&E-stained following deparaffinization. Sections were coverslipped with 85% glycerol, imaged, decoverslipped, followed by dehydration & decrosslinking (Demonstrated Protocol CG000520). The glass slide with the tissue section was processed with the Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression slide (11 mm Capture Area). The probe extension and library construction steps follow the standard Visium for FFPE workflow outside of the instrument." 
+# dataset_reference: 10x2023embryo +# dataset_organism: Mus musculus +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 50 +# remove_mitochondrial: false + +# - id: tenx_visium/mouse_olfactory_bulb_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_Mouse_Olfactory_Bulb/Visium_Mouse_Olfactory_Bulb_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_Mouse_Olfactory_Bulb/Visium_Mouse_Olfactory_Bulb_spatial.tar.gz" +# dataset_name: 10X Visium - Mouse Olfactory Bulb +# dataset_url: "https://www.10xgenomics.com/datasets/adult-mouse-olfactory-bulb-1-standard-1" +# dataset_summary: 10X Genomics obtained fresh frozen mouse olfactory bulb tissue from BioIVT. +# dataset_description: "The tissue was embedded and cryosectioned as described in Visium Spatial Protocols Tissue Preparation Guide (Demonstrated Protocol CG000240). Tissue sections of 10µm were placed on Visium Gene Expression slides, then fixed and stained following Methanol Fixation, H&E Staining & Imaging for Visium Spatial Protocols (CG000160)." 
+# dataset_reference: 10x2022olfactory +# dataset_organism: Mus musculus +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 30 +# remove_mitochondrial: false + +# - id: tenx_visium/human_breast_cancer_1_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.2.0/Parent_Visium_Human_BreastCancer/Parent_Visium_Human_BreastCancer_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.2.0/Parent_Visium_Human_BreastCancer/Parent_Visium_Human_BreastCancer_spatial.tar.gz" +# dataset_name: 10X Visium - Human Breast Cancer 1 +# dataset_url: "https://www.10xgenomics.com/datasets/human-breast-cancer-whole-transcriptome-analysis-1-standard-1-2-0" +# dataset_summary: Whole transcriptome analysis, Adult Human Breast Cancer (Visium) +# dataset_description: "10X Genomics obtained fresh frozen human Invasive Lobular Carcinoma breast tissue from BioIVT Asterand. The tissue was embedded and cryosectioned as described in Visium Spatial Protocols Tissue Preparation Guide Demonstrated Protocol (CG000240). Tissue sections of 10µm were placed on Visium Gene Expression slides and fixed and stained following Methanol Fixation, H&E Staining & Imaging for Visium Spatial Protocols (CG000160)." 
+# dataset_reference: 10x2020breast +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: tenx_visium/human_lymph_node_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Lymph_Node/V1_Human_Lymph_Node_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.0.0/V1_Human_Lymph_Node/V1_Human_Lymph_Node_spatial.tar.gz" +# dataset_name: 10X Visium - Human Lymph Node +# dataset_url: "https://www.10xgenomics.com/datasets/human-lymph-node-1-standard-1-0-0" +# dataset_summary: Whole transcriptome analysis, Human Lymph Node +# dataset_description: "10x Genomics obtained fresh frozen human lymph node from BioIVT Asterand. The tissue was embedded and cryosectioned as described in Visium Spatial Protocols - Tissue Preparation Guide Demonstrated Protocol (CG000240). Tissue sections of 10 µm thickness were placed on Visium Gene Expression Slides." +# dataset_reference: 10x2019lymph +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: tenx_visium/human_normal_prostate_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Normal_Prostate/Visium_FFPE_Human_Normal_Prostate_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Normal_Prostate/Visium_FFPE_Human_Normal_Prostate_spatial.tar.gz" +# dataset_name: 10X Visium - Human Normal Prostate +# dataset_url: "https://www.10xgenomics.com/datasets/normal-human-prostate-ffpe-1-standard-1-3-0" +# dataset_summary: Gene expression library of Human Normal Prostate (Visium FFPE) using the Human Whole Transcriptome Probe Set +# dataset_description: "10x Genomics obtained FFPE human prostate tissue from Indivumed Human Tissue Specimens. 
The tissue was sectioned as described in Visium Spatial Gene Expression for FFPE – Tissue Preparation Guide Demonstrated Protocol (CG000408). Tissue sections of 5 µm were placed on Visium Gene Expression slides, then stained following Deparaffinization, H&E Staining, Imaging & Decrosslinking Demonstrated Protocol (CG000409)." +# dataset_reference: 10x2021prostate +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 30 +# remove_mitochondrial: true + +# - id: tenx_visium/human_prostate_cancer_visium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Prostate_IF/Visium_FFPE_Human_Prostate_IF_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Prostate_IF/Visium_FFPE_Human_Prostate_IF_spatial.tar.gz" +# dataset_name: 10X Visium - Human Prostate Cancer +# dataset_url: "https://www.10xgenomics.com/datasets/human-prostate-cancer-adjacent-normal-section-with-if-staining-ffpe-1-standard" +# dataset_summary: Gene expression library of Human Prostate Cancer (Visium FFPE) with an IF image using the Human Whole Transcriptome Probe Set +# dataset_description: "10x Genomics obtained FFPE human prostate tissue from Indivumed Human Tissue Specimens. Original diagnosis with adenocarcinoma. The tissue was sectioned as described in Visium Spatial Gene Expression for FFPE Tissue Preparation Guide Demonstrated Protocol (CG000408). Tissue sections of 10 µm were placed on Visium Gene Expression slides, then stained following Deparaffinization, Decrosslinking, Immunofluorescence Staining & Imaging Demonstrated Protocol (CG000410)." 
+# dataset_reference: 10x2022prostate +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +cat > "/tmp/params.yaml" << 'HERE' +param_list: + - id: tenx_visium/human_cerebellum_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.2.0/Parent_Visium_Human_Cerebellum/Parent_Visium_Human_Cerebellum_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.2.0/Parent_Visium_Human_Cerebellum/Parent_Visium_Human_Cerebellum_spatial.tar.gz" + dataset_name: 10X Visium - Adult Human Cerebellum + dataset_url: "https://www.10xgenomics.com/datasets/human-cerebellum-whole-transcriptome-analysis-1-standard-1-2-0" + dataset_summary: Human Cerebellum Whole Transcriptome Analysis + dataset_description: "10X Genomics obtained fresh frozen human cerebellum tissue from BioIVT Asterand. The tissue was embedded and cryosectioned as described in Visium Spatial Protocols Tissue Preparation Guide (Demonstrated Protocol CG000240). Tissue sections of 10µm were placed on Visium Gene Expression slides and fixed and stained following Methanol Fixation, H&E Staining & Imaging for Visium Spatial Protocols (CG000160)." 
+ dataset_reference: 10x2020cerebellum + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: tenx_visium/mouse_kidney_v1_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.1.0/V1_Mouse_Kidney/V1_Mouse_Kidney_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.1.0/V1_Mouse_Kidney/V1_Mouse_Kidney_spatial.tar.gz" + dataset_name: 10X Visium - Mouse Kidney 1 + dataset_url: "https://www.10xgenomics.com/datasets/mouse-kidney-section-coronal-1-standard-1-1-0" + dataset_summary: Mouse Kidney Whole Transcriptome Analysis + dataset_description: "10x Genomics obtained fresh frozen mouse kidney tissue from BioIVT Asterand. The tissue was embedded and cryosectioned as described in Visium Spatial Protocols - Tissue Preparation Guide Demonstrated Protocol (CG000240). Tissue sections of 10 µm thickness from a slice of the coronal plane were placed on Visium Gene Expression slides, then stained following the Methanol Fixation, H&E Staining & Imaging Demonstrated Protocol (CG000160)." 
+ dataset_reference: 10x2020kidney + dataset_organism: Mus musculus + spot_filter_min_genes: 100 + gene_filter_min_spots: 30 + remove_mitochondrial: false + + - id: tenx_visium/human_lung_cancer_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Lung_Cancer/CytAssist_11mm_FFPE_Human_Lung_Cancer_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Lung_Cancer/CytAssist_11mm_FFPE_Human_Lung_Cancer_spatial.tar.gz" + dataset_name: 10X Visium - Human Lung Cancer + dataset_url: "https://www.10xgenomics.com/datasets/human-lung-cancer-11-mm-capture-area-ffpe-2-standard" + dataset_summary: Gene expression library of Human Lung Cancer (CytAssist FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "10x Genomics obtained FFPE human lung cancer tissue from Avaden Biosciences. The tissue was sectioned as described in the Visium CytAssist Spatial Gene Expression for FFPE Tissue Preparation Guide (CG000518). Tissue section of 5 µm was placed on a standard glass slide, then stained following the Deparaffinization, H&E Staining, Imaging & Decrosslinking Demonstrated Protocol (CG000520). The glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression Slide v2, with 11 mm capture areas following the Visium CytAssist Spatial Gene Expression Reagent Kits User Guide (CG000495)." 
+ dataset_reference: 10x2023lung + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: tenx_visium/human_brain_cancer_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Glioblastoma/CytAssist_11mm_FFPE_Human_Glioblastoma_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Glioblastoma/CytAssist_11mm_FFPE_Human_Glioblastoma_spatial.tar.gz" + dataset_name: 10X Visium - Human Brain Cancer + dataset_url: "https://www.10xgenomics.com/datasets/human-brain-cancer-11-mm-capture-area-ffpe-2-standard" + dataset_summary: Gene expression library of Human Glioblastoma (CytAssist FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "10x Genomics obtained FFPE human brain cancer tissue from Avaden Biosciences. The tissue was sectioned as described in the Visium CytAssist Spatial Gene Expression for FFPE - Tissue Preparation Guide (CG000518). Tissue section of 5 µm was placed on a standard glass slide, then stained following the Deparaffinization, H&E Staining, Imaging & Decrosslinking Demonstrated Protocol (CG000520). The glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression Slide v2, with 11 mm capture areas following the Visium CytAssist Spatial Gene Expression Reagent Kits User Guide (CG000495)." 
+ dataset_reference: 10x2023brain + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 100 + remove_mitochondrial: true + + - id: tenx_visium/human_kidney_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Kidney/CytAssist_11mm_FFPE_Human_Kidney_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.1/CytAssist_11mm_FFPE_Human_Kidney/CytAssist_11mm_FFPE_Human_Kidney_spatial.tar.gz" + dataset_name: 10X Visium - Human Kidney + dataset_url: "https://www.10xgenomics.com/datasets/human-kidney-11-mm-capture-area-ffpe-2-standard" + dataset_summary: Gene expression library of Human Kidney (CytAssist FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "10x Genomics obtained FFPE human kidney tissue from Avaden Biosciences. The tissue was sectioned as described in the Visium CytAssist Spatial Gene Expression for FFPE – Tissue Preparation Guide (CG000518). Tissue section of 5 µm was placed on a standard glass slide, then stained following the Deparaffinization, H&E Staining, Imaging & Decrosslinking Demonstrated Protocol (CG000520). The glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression Slide v2, with 11 mm capture areas following the Visium CytAssist Spatial Gene Expression Reagent Kits User Guide (CG000495)." 
+ dataset_reference: 10x2023kidney + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: tenx_visium/human_intestinal_cancer_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Intestinal_Cancer/Visium_FFPE_Human_Intestinal_Cancer_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Intestinal_Cancer/Visium_FFPE_Human_Intestinal_Cancer_spatial.tar.gz" + dataset_name: 10X Visium - Human Intestine Cancer + dataset_url: "https://www.10xgenomics.com/datasets/human-intestine-cancer-1-standard" + dataset_summary: Gene expression library of Human Intestinal Cancer (Visium FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "5 µm section from Human Intestinal Cancer. FFPE tissue purchased from BioIVT Asterand Human Tissue Specimens. Libraries were prepared following the Visium Spatial Gene Expression Reagent Kits for FFPE User Guide (CG000407 Rev A)." + dataset_reference: 10x2022intestine + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 30 + remove_mitochondrial: true + + - id: tenx_visium/human_skin_melanoma_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Human_Skin_Melanoma/CytAssist_FFPE_Human_Skin_Melanoma_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Human_Skin_Melanoma/CytAssist_FFPE_Human_Skin_Melanoma_spatial.tar.gz" + dataset_name: 10X Visium - Human Skin Melanoma + dataset_url: "https://www.10xgenomics.com/datasets/human-melanoma-if-stained-ffpe-2-standard" + dataset_summary: Gene expression library of Human Skin Melanoma (CytAssist FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "10x Genomics obtained FFPE Human Melanoma tissue blocks from Avaden Biosciences. 
The tissue was sectioned as described in Visium CytAssist Spatial Gene Expression for FFPE Tissue Preparation Guide Demonstrated Protocol (CG000518). Tissue sections of 5 µm was placed on a standard glass slide, deparaffinized followed by immunofluorescence (IF) staining. Sections were coverslipped with 85% glycerol, imaged, decoverslipped, followed by dehydration & decrosslinking Demonstrated Protocol (CG000519). The glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression slide. The probe extension and library construction steps follow the standard Visium for FFPE workflow outside of the instrument." + dataset_reference: 10x2022melanoma + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: tenx_visium/human_cervical_cancer_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Cervical_Cancer/Visium_FFPE_Human_Cervical_Cancer_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Cervical_Cancer/Visium_FFPE_Human_Cervical_Cancer_spatial.tar.gz" + dataset_name: 10X Visium - Human Cervical Cancer + dataset_url: "https://www.10xgenomics.com/datasets/human-cervical-cancer-1-standard" + dataset_summary: Gene expression library of Human Cervical Cancer (Visium FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "5 µm section from squamous cell carcinoma of human cervical cancer. FFPE tissue purchased from Discovery Life Sciences." 
+ dataset_reference: 10x2022cervical + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: tenx_visium/human_breast_cancer_2_visium + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Breast_Cancer/Visium_FFPE_Human_Breast_Cancer_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/1.3.0/Visium_FFPE_Human_Breast_Cancer/Visium_FFPE_Human_Breast_Cancer_spatial.tar.gz" + dataset_name: 10X Visium - Human Breast Cancer 2 + dataset_url: "https://www.10xgenomics.com/datasets/human-breast-cancer-ductal-carcinoma-in-situ-invasive-carcinoma-ffpe-1-standard-1-3-0" + dataset_summary: Gene expression library of Human Breast Cancer (Visium FFPE) using the Human Whole Transcriptome Probe Set + dataset_description: "10x Genomics obtained FFPE human breast tissue from BioIVT Asterand Human Tissue Specimens. The tissue was annotated with Ductal Carcinoma In Situ, Invasive Carcinoma. The tissue was sectioned as described in Visium Spatial Gene Expression for FFPE – Tissue Preparation Guide Demonstrated Protocol (CG000408). Tissue sections of 5 µm were placed on Visium Gene Expression slides, then stained following Deparaffinization, H&E Staining, Imaging & Decrosslinking Demonstrated Protocol (CG000409)." 
+ dataset_reference: 10x2021breast + dataset_organism: Homo sapiens + spot_filter_min_genes: 100 + gene_filter_min_spots: 50 + remove_mitochondrial: true + +normalization_methods: [log_cp10k] +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +publish_dir: resources/datasets +HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: tenx_visium/human_colon_cancer_xenium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.1.0/CytAssist_FFPE_Human_Colon_Post_Xenium_Rep1/CytAssist_FFPE_Human_Colon_Post_Xenium_Rep1_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.1.0/CytAssist_FFPE_Human_Colon_Post_Xenium_Rep1/CytAssist_FFPE_Human_Colon_Post_Xenium_Rep1_spatial.tar.gz" +# dataset_name: 10X Xenium - Human Colon +# dataset_url: "https://www.10xgenomics.com/datasets/visium-cytassist-gene-expression-libraries-of-post-xenium-human-colon-cancer-ffpe-using-the-human-whole-transcriptome-probe-set-2-standard" +# dataset_summary: Gene expression library of Post Xenium Human Colon Cancer (CytAssist FFPE) using the Human Whole Transcriptome Probe Set - Replicate 1 +# dataset_description: "This dataset is provided as part of the Technical Note: Post-Xenium In Situ Applications: Immunofluorescence, H&E, and Visium CytAssist Spatial Gene Expression (CG000709). Post-Xenium samples were compared to controls (samples not processed through the Xenium workflow) using 5 µm (FFPE) serial sections." 
+# dataset_reference: 10x2023colon +# dataset_organism: Homo sapiens +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: tenx_visium/mouse_brain_xenium +# input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.1.0/CytAssist_FreshFrozen_Mouse_Brain_Post_Xenium_Rep1/CytAssist_FreshFrozen_Mouse_Brain_Post_Xenium_Rep1_filtered_feature_bc_matrix.h5" +# input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.1.0/CytAssist_FreshFrozen_Mouse_Brain_Post_Xenium_Rep1/CytAssist_FreshFrozen_Mouse_Brain_Post_Xenium_Rep1_spatial.tar.gz" +# dataset_name: 10X Xenium - Mouse Brain +# dataset_url: "https://www.10xgenomics.com/datasets/visium-cytassist-gene-expression-libraries-of-post-xenium-mouse-brain-ff-using-the-mouse-whole-transcriptome-probe-set-2-standard" +# dataset_summary: Gene expression library of Post Xenium Mouse Brain (CytAssist Fresh Frozen) using the Mouse Whole Transcriptome Probe Set - Replicate 1 +# dataset_description: "This dataset is provided as part of the Technical Note: Post-Xenium In Situ Applications: Immunofluorescence, H&E, and Visium CytAssist Spatial Gene Expression (CG000709). Post-Xenium samples were compared to controls (samples not processed through the Xenium workflow) using 10 µm fresh-frozen (FF) serial sections." 
+# dataset_reference: 10x2023mousebrain +# dataset_organism: Mus musculus +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 50 +# remove_mitochondrial: false + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withLabel: highmem { + memory = '350GB' + } + withName: '.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision integration_build \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_tenx_visium/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file "/tmp/params.yaml" \ + --config /tmp/nextflow.config diff --git a/src/datasets/resource_scripts/zenodo_spatial.sh.sh b/src/datasets/resource_scripts/zenodo_spatial.sh.sh new file mode 100755 index 0000000000..192cb8cc9c --- /dev/null +++ b/src/datasets/resource_scripts/zenodo_spatial.sh.sh @@ -0,0 +1,414 @@ +#!/bin/bash + +cat > "/tmp/params.yaml" << 'HERE' +param_list: + - id: zenodo_spatial/human_heart_myocardial_infarction_1_visium + input_data: "https://zenodo.org/records/13328275/files/10X0018.h5ad?download=1" + dataset_name: 10X Visium - Human Heart MI 1 + dataset_url: "https://www.nature.com/articles/s41586-022-05060-x" + dataset_summary: Gene expression library of human heart using 10x Visium. + dataset_description: "Frozen heart samples were embedded in OCT (Tissue-Tek) and cryosectioned (Thermo Cryostar). The 10-µm section was placed on the pre-chilled Optimization slides (Visium, 10X Genomics, PN-1000193) and the optimal lysis time was determined. 
The tissues were treated as recommended by 10X Genomics and the optimization procedure showed an optimal permeabilization time of 12 or 18 min of digestion and release of RNA from the tissue slide. Spatial gene expression slides (Visium, 10X Genomics, PN-1000187) were used for spatial transcriptomics following the Visium User Guides" + dataset_reference: kuppe2022spatial + dataset_organism: Homo sapiens + spot_filter_min_genes: 200 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: zenodo_spatial/human_heart_myocardial_infarction_2_visium + input_data: "https://zenodo.org/records/13328275/files/10X009.h5ad?download=1" + dataset_name: 10X Visium - Human Heart MI 2 + dataset_url: "https://www.nature.com/articles/s41586-022-05060-x" + dataset_summary: Gene expression library of human heart using 10x Visium. + dataset_description: "Frozen heart samples were embedded in OCT (Tissue-Tek) and cryosectioned (Thermo Cryostar). The 10-µm section was placed on the pre-chilled Optimization slides (Visium, 10X Genomics, PN-1000193) and the optimal lysis time was determined. The tissues were treated as recommended by 10X Genomics and the optimization procedure showed an optimal permeabilization time of 12 or 18 min of digestion and release of RNA from the tissue slide. 
Spatial gene expression slides (Visium, 10X Genomics, PN-1000187) were used for spatial transcriptomics following the Visium User Guides" + dataset_reference: kuppe2022spatial + dataset_organism: Homo sapiens + spot_filter_min_genes: 200 + gene_filter_min_spots: 50 + remove_mitochondrial: true + +normalization_methods: [log_cp10k] +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +publish_dir: resources/datasets +remove_mitochondrial: true +HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: zenodo_spatial/mouse_e10_brain_dbitseq +# input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_brain_gene_25um_data.h5ad?download=1" +# dataset_name: DBiT-seq - Mouse Brain (E10) +# dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" +# dataset_summary: High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue. +# dataset_description: "Gene expression library of an E10 whole mouse embryo tissue (brain in early-stage organogenesis) profiled using DBiT-seq." +# dataset_organism: Mus musculus +# dataset_reference: liu2020high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_e10_eye_dbitseq +# input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_eye_and_nearby_data.h5ad?download=1" +# dataset_name: DBiT-seq - Mouse Eye (E10) +# dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" +# dataset_summary: High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue. +# dataset_description: "Gene expression library of an E10 whole mouse embryo tissue (eye in early-stage organogenesis) profiled using DBiT-seq."
+# dataset_organism: Mus musculus +# dataset_reference: liu2020high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_e10_whole_body_dbitseq +# input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E10_whole_gene_best_data.h5ad?download=1" +# dataset_name: DBiT-seq - Mouse Whole Body (E10) +# dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" +# dataset_summary: High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue. +# dataset_description: "Gene expression library of an E10 whole mouse embryo tissue profiled using DBiT-seq." +# dataset_organism: Mus musculus +# dataset_reference: liu2020high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_e11_lower_body_dbitseq +# input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_E11_lower_body_data.h5ad?download=1" +# dataset_name: DBiT-seq - Mouse Lower Body (E11) +# dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" +# dataset_summary: High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue. +# dataset_description: "Gene expression library of an E11 whole mouse embryo tissue (lower body in early-stage organogenesis) profiled using DBiT-seq." +# dataset_organism: Mus musculus +# dataset_reference: liu2020high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_e11_1_dbitseq +# input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_GSM4364244_E11-FL-1L_gene_data.h5ad?download=1" +# dataset_name: DBiT-seq - Mouse Whole Body 1 (E11) +# dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" +# dataset_summary: High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue. 
+# dataset_description: "Gene expression library of an E11 whole mouse embryo tissue profiled using DBiT-seq." +# dataset_organism: Mus musculus +# dataset_reference: liu2020high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_e11_2_dbitseq +# input_data: "https://zenodo.org/records/12785822/files/DBiT-seq_liu2020high_GSM4364245_E11-FL-2L_gene_data.h5ad?download=1" +# dataset_name: DBiT-seq - Mouse Whole Body 2 (E11) +# dataset_url: "https://www.cell.com/cell/fulltext/S0092-8674(20)31390-8" +# dataset_summary: High-Spatial-Resolution Multi-Omics Sequencing via Deterministic Barcoding in Tissue. +# dataset_description: "Gene expression library of an E11 whole mouse embryo tissue profiled using DBiT-seq." +# dataset_organism: Mus musculus +# dataset_reference: liu2020high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: zenodo_spatial/human_cortex_1_merfish +# input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.250.expand.rep1_data.h5ad?download=1" +# dataset_name: MERFISH - Human Cortex 1 +# dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" +# dataset_summary: Spatially resolved profiling of human cerebral cortex using multiplexed error-robust fluorescence in situ hybridization (MERFISH). +# dataset_description: "Spatially resolved profiling of human cerebral cortex (middle temporal gyrus) replicate 1 using multiplexed error-robust fluorescence in situ hybridization (MERFISH) (250 gene panel)."
+# dataset_organism: Homo sapiens +# dataset_reference: fang2022conservation +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 100 +# remove_mitochondrial: false + +# - id: zenodo_spatial/human_cortex_2_merfish +# input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep1_data.h5ad?download=1" +# dataset_name: MERFISH - Human Cortex 2 +# dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" +# dataset_summary: Spatially resolved profiling of human cerebral cortex using multiplexed error-robust fluorescence in situ hybridization (MERFISH). +# dataset_description: "Spatially resolved profiling of human cerebral cortex (middle temporal gyrus) replicate 1 using multiplexed error-robust fluorescence in situ hybridization (MERFISH) (4000 gene panel)." +# dataset_organism: Homo sapiens +# dataset_reference: fang2022conservation +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: false + +# - id: zenodo_spatial/human_cortex_3_merfish +# input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep2_data.h5ad?download=1" +# dataset_name: MERFISH - Human Cortex 3 +# dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" +# dataset_summary: Spatially resolved profiling of human cerebral cortex using multiplexed error-robust fluorescence in situ hybridization (MERFISH). +# dataset_description: "Spatially resolved profiling of human cerebral cortex (middle temporal gyrus) replicate 2 using multiplexed error-robust fluorescence in situ hybridization (MERFISH) (4000 gene panel)."
+# dataset_organism: Homo sapiens +# dataset_reference: fang2022conservation +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: false + +# - id: zenodo_spatial/human_cortex_4_merfish +# input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_H18.06.006.MTG.4000.expand.rep3_data.h5ad?download=1" +# dataset_name: MERFISH - Human Cortex 4 +# dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" +# dataset_summary: Spatially resolved profiling of human cerebral cortex using multiplexed error-robust fluorescence in situ hybridization (MERFISH). +# dataset_description: "Spatially resolved profiling of human cerebral cortex (middle temporal gyrus) replicate 3 using multiplexed error-robust fluorescence in situ hybridization (MERFISH) (4000 gene panel)." +# dataset_organism: Homo sapiens +# dataset_reference: fang2022conservation +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: false + +# - id: zenodo_spatial/mouse_cortex_merfish +# input_data: "https://zenodo.org/records/12785822/files/MERFISH_Fang2022Conservation_mouse1.AUD_TEA_VIS.242.unexpand_data.h5ad?download=1" +# dataset_name: MERFISH - Mouse Cortex +# dataset_url: "https://www.science.org/doi/10.1126/science.abm1741" +# dataset_summary: Spatially resolved profiling of mouse cerebral cortex using multiplexed error-robust fluorescence in situ hybridization (MERFISH). +# dataset_description: "Spatially resolved profiling of mouse cerebral cortex (visual cortex (VIS), auditory cortex (AUD) and temporal association area (TEa) unexpanded sections) using multiplexed error-robust fluorescence in situ hybridization (MERFISH)."
+# dataset_organism: Mus musculus +# dataset_reference: fang2022conservation +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: zenodo_spatial/mouse_organogenesis_seqfish +# input_data: "https://zenodo.org/records/12785822/files/seqfish.h5ad?download=1" +# dataset_name: Seqfish - Mouse Organogenesis +# dataset_url: "https://www.nature.com/articles/s41587-021-01006-2" +# dataset_summary: Single-cell spatial expression of mouse organogenesis. +# dataset_description: "Sagittal sections from mouse embryo at the 8-12 ss was profiled by seqFISH." +# dataset_organism: Mus musculus +# dataset_reference: lohoff2021integration +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 10 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# remove_mitochondrial: true +# HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: zenodo_spatial/mouse_olfactory_bulb_puck_slideseqv2 +# input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" +# dataset_name: Slide-seqV2 - Mouse Olfactory Bulb Puck +# dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" +# dataset_summary: Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2. 
+# dataset_description: "Gene expression library of mouse olfactory bulb puck profiled using Slide-seq V2." +# dataset_reference: stickels2020highly +# dataset_organism: Mus musculus +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 500 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_cortex_slideseqv2 +# input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_palla2021squidpy_Slide-seqV2_Mouse_Cortex_data_whole.h5ad?download=1" +# dataset_name: Slide-seqV2 - Mouse Cortex +# dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" +# dataset_summary: Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2. +# dataset_description: "Gene expression library of Mouse cortex profiled using Slide-seq V2." +# dataset_reference: stickels2020highly +# dataset_organism: Mus musculus +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 500 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_cerebellum_slideseqv2 +# input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_Cerebellum_SCP948_data_whole.h5ad?download=1" +# dataset_name: Slide-seqV2 - Mouse Cerebellum +# dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" +# dataset_summary: Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2. +# dataset_description: "Gene expression library of mouse cerebellum profiled using Slide-seq V2."
+# dataset_reference: stickels2020highly +# dataset_organism: Mus musculus +# spot_filter_min_genes: 100 +# gene_filter_min_spots: 500 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_hippocampus_puck_slideseqv2 +# input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_Hippocampus_Puck_200115_08_data_whole.h5ad?download=1" +# dataset_name: Slide-seqV2 - Mouse Hippocampus Puck +# dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" +# dataset_summary: Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2. +# dataset_description: "Gene expression library of mouse hippocampus puck profiled using Slide-seq V2." +# dataset_reference: stickels2020highly +# dataset_organism: Mus musculus +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 500 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_somatosensory_cortex_puck_slideseqv2 +# input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_Slide-seqV2_Mouse_SomatosensoryCortex_Puck_200306_03_data_whole.h5ad?download=1" +# dataset_name: Slide-seqV2 - Mouse Somatosensory Cortex Puck +# dataset_url: "https://singlecell.broadinstitute.org/single_cell/study/SCP815/sensitive-spatial-genome-wide-expression-profiling-at-cellular-resolution#study-summary" +# dataset_summary: Highly sensitive spatial transcriptomics at near-cellular resolution with Slide-seqV2. +# dataset_description: "Gene expression library of mouse somatosensory cortex puck profiled using Slide-seq V2." 
+# dataset_reference: stickels2020highly +# dataset_organism: Mus musculus +# spot_filter_min_genes: 200 +# gene_filter_min_spots: 500 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: zenodo_spatial/mouse_brain_2d_zstep10_0_starmap +# input_data: "https://zenodo.org/records/12785822/files/STARmap_Wang2018three_data_2D_zstep10_0_data.h5ad?download=1" +# dataset_name: STARmap - Mouse Brain 1 +# dataset_url: "https://www.science.org/doi/10.1126/science.aat5691" +# dataset_summary: Three-dimensional intact-tissue sequencing of single-cell transcriptional states. +# dataset_description: "3D architecture of cell types in visual cortex volumes." +# dataset_organism: Mus musculus +# dataset_reference: wang2018three +# spot_filter_min_genes: 1 +# gene_filter_min_spots: 1 +# remove_mitochondrial: true + +# - id: zenodo_spatial/mouse_brain_2d_zstep15_0_starmap +# input_data: "https://zenodo.org/records/12785822/files/STARmap_Wang2018three_data_2D_zstep15_0_data.h5ad?download=1" +# dataset_name: STARmap - Mouse Brain 2 +# dataset_url: "https://www.science.org/doi/10.1126/science.aat5691" +# dataset_summary: Three-dimensional intact-tissue sequencing of single-cell transcriptional states. +# dataset_description: "3D architecture of cell types in visual cortex volumes." 
+# dataset_organism: Mus musculus +# dataset_reference: wang2018three +# spot_filter_min_genes: 1 +# gene_filter_min_spots: 1 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +# cat > "/tmp/params.yaml" << 'HERE' +# param_list: +# - id: zenodo_spatial/drosophila_embryo_e5_6_stereoseq +# input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_5.6.h5ad?download=1" +# dataset_name: Stereo-seq - Drosophila embryo E5_6 +# dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" +# dataset_summary: Stereo-seq faithfully captures Drosophila spatial transcriptomes with high resolution. +# dataset_description: "Drosophila has long been a successful model organism in multiple biomedical fields. Spatial gene expression patterns are critical for the understanding of complex pathways and interactions, whereas temporal gene expression changes are vital for studying highly dynamic physiological activities. Systematic studies in Drosophila are still impeded by the lack of spatiotemporal transcriptomic information. Here, utilizing spatial enhanced resolution omics-sequencing (Stereo-seq), we dissected the spatiotemporal transcriptomic changes of developing Drosophila with high resolution and sensitivity. 
(Data from an embryo collected 14-16 h after egg laying)" +# dataset_organism: Drosophila +# dataset_reference: wang2022high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/drosophila_embryo_e6_3_stereoseq +# input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_6.3.h5ad?download=1" +# dataset_name: Stereo-seq - Drosophila embryo E6_3 +# dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" +# dataset_summary: Stereo-seq faithfully captures Drosophila spatial transcriptomes with high resolution. +# dataset_description: "Drosophila has long been a successful model organism in multiple biomedical fields. Spatial gene expression patterns are critical for the understanding of complex pathways and interactions, whereas temporal gene expression changes are vital for studying highly dynamic physiological activities. Systematic studies in Drosophila are still impeded by the lack of spatiotemporal transcriptomic information. Here, utilizing spatial enhanced resolution omics-sequencing (Stereo-seq), we dissected the spatiotemporal transcriptomic changes of developing Drosophila with high resolution and sensitivity. (Data from an embryo collected 14-16 h after egg laying)" +# dataset_organism: Drosophila +# dataset_reference: wang2022high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/drosophila_embryo_e7_stereoseq +# input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_7.h5ad?download=1" +# dataset_name: Stereo-seq - Drosophila embryo E7 +# dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" +# dataset_summary: Stereo-seq faithfully captures Drosophila spatial transcriptomes with high resolution. 
+# dataset_description: "Drosophila has long been a successful model organism in multiple biomedical fields. Spatial gene expression patterns are critical for the understanding of complex pathways and interactions, whereas temporal gene expression changes are vital for studying highly dynamic physiological activities. Systematic studies in Drosophila are still impeded by the lack of spatiotemporal transcriptomic information. Here, utilizing spatial enhanced resolution omics-sequencing (Stereo-seq), we dissected the spatiotemporal transcriptomic changes of developing Drosophila with high resolution and sensitivity. (Data from an embryo collected 14-16 h after egg laying)" +# dataset_organism: Drosophila +# dataset_reference: wang2022high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/drosophila_embryo_e9_1_stereoseq +# input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_9.1.h5ad?download=1" +# dataset_name: Stereo-seq - Drosophila embryo E9_1 +# dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" +# dataset_summary: Stereo-seq faithfully captures Drosophila spatial transcriptomes with high resolution. +# dataset_description: "Drosophila has long been a successful model organism in multiple biomedical fields. Spatial gene expression patterns are critical for the understanding of complex pathways and interactions, whereas temporal gene expression changes are vital for studying highly dynamic physiological activities. Systematic studies in Drosophila are still impeded by the lack of spatiotemporal transcriptomic information. Here, utilizing spatial enhanced resolution omics-sequencing (Stereo-seq), we dissected the spatiotemporal transcriptomic changes of developing Drosophila with high resolution and sensitivity. 
(Data from an embryo collected 14-16 h after egg laying)" +# dataset_organism: Drosophila +# dataset_reference: wang2022high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# - id: zenodo_spatial/drosophila_embryo_e10_stereoseq +# input_data: "https://zenodo.org/records/12785822/files/Stereo-seq_wang2022high_E14-16h_a_count_normal_stereoseq_data_whole_time_point_10.5.h5ad?download=1" +# dataset_name: Stereo-seq - Drosophila embryo E10 +# dataset_url: "https://www.sciencedirect.com/science/article/pii/S1534580722002465" +# dataset_summary: Stereo-seq faithfully captures Drosophila spatial transcriptomes with high resolution. +# dataset_description: "Drosophila has long been a successful model organism in multiple biomedical fields. Spatial gene expression patterns are critical for the understanding of complex pathways and interactions, whereas temporal gene expression changes are vital for studying highly dynamic physiological activities. Systematic studies in Drosophila are still impeded by the lack of spatiotemporal transcriptomic information. Here, utilizing spatial enhanced resolution omics-sequencing (Stereo-seq), we dissected the spatiotemporal transcriptomic changes of developing Drosophila with high resolution and sensitivity. 
(Data from an embryo collected 14-16 h after egg laying)" +# dataset_organism: Drosophila +# dataset_reference: wang2022high +# spot_filter_min_genes: 10 +# gene_filter_min_spots: 50 +# remove_mitochondrial: true + +# normalization_methods: [log_cp10k] +# output_dataset: '$id/dataset.h5ad' +# output_meta: '$id/dataset_metadata.yaml' +# output_state: '$id/state.yaml' +# output_raw: force_null +# output_normalized: force_null +# publish_dir: resources/datasets +# HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withLabel: highmem { + memory = '350GB' + } + withName: '.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_zenodo_spatial/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file "/tmp/params.yaml" \ + --config /tmp/nextflow.config diff --git a/src/datasets/resource_scripts/zenodo_spatial_slidetags.sh b/src/datasets/resource_scripts/zenodo_spatial_slidetags.sh new file mode 100755 index 0000000000..d8654ce439 --- /dev/null +++ b/src/datasets/resource_scripts/zenodo_spatial_slidetags.sh @@ -0,0 +1,82 @@ +#!/bin/bash + +cat > "/tmp/params.yaml" << 'HERE' +param_list: + - id: zenodo_spatial_slidetags/human_cortex_slidetags + input_data: "https://zenodo.org/records/12785822/files/slidetag_human_cortex.tar.gz?download=1" + dataset_name: Slide-tags - Human Cortex + dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" + dataset_summary: Slide-tags enables single-nucleus barcoding for multimodal spatial genomics. + dataset_description: "A 100 mm2 region of the human prefrontal cortex from a neurotypical donor aged 78 years was profiled by Slide-tags." 
+ dataset_organism: Homo sapiens + dataset_reference: russell2023slide + spot_filter_min_genes: 200 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: zenodo_spatial_slidetags/human_skin_melanoma_slidetags + input_data: "https://zenodo.org/records/12785822/files/slidetag_human_skin_melanoma.tar.gz?download=1" + dataset_name: Slide-tags - Human Skin Melanoma + dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" + dataset_summary: Slide-tags enables single-nucleus barcoding for multimodal spatial genomics. + dataset_description: "A metastatic melanoma sample was profiled by Slide-tags." + dataset_organism: Homo sapiens + dataset_reference: russell2023slide + spot_filter_min_genes: 200 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: zenodo_spatial_slidetags/human_tonsil_slidetags + input_data: "https://zenodo.org/records/12785822/files/slidetag_human_tonsil.tar.gz?download=1" + dataset_name: Slide-tags - Human Tonsil + dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" + dataset_summary: Slide-tags enables single-nucleus barcoding for multimodal spatial genomics. + dataset_description: "A human tonsil was profiled by Slide-tags." + dataset_organism: Homo sapiens + dataset_reference: russell2023slide + spot_filter_min_genes: 200 + gene_filter_min_spots: 50 + remove_mitochondrial: true + + - id: zenodo_spatial_slidetags/mouse_embryo_slidetags + input_data: "https://zenodo.org/records/12785822/files/slidetag_mouse_embryo.tar.gz?download=1" + dataset_name: Slide-tags - Mouse Embryo + dataset_url: "https://www.nature.com/articles/s41586-023-06837-4" + dataset_summary: Slide-tags enables single-nucleus barcoding for multimodal spatial genomics. + dataset_description: "A mouse embryo was profiled by Slide-tags."
+ dataset_organism: Mus musculus + dataset_reference: russell2023slide + spot_filter_min_genes: 200 + gene_filter_min_spots: 50 + remove_mitochondrial: false + +normalization_methods: [log_cp10k] +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +publish_dir: resources/datasets +HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withLabel: highmem { + memory = '350GB' + } + withName: '.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } +} +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision main_build \ + --pull-latest \ + --main-script target/nextflow/datasets/workflows/process_zenodo_spatial_slidetags/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file "/tmp/params.yaml" \ + --config /tmp/nextflow.config diff --git a/src/datasets/resource_test_scripts/mouse_brain_coronal_section1.sh b/src/datasets/resource_test_scripts/mouse_brain_coronal_section1.sh new file mode 100755 index 0000000000..e4b889e063 --- /dev/null +++ b/src/datasets/resource_test_scripts/mouse_brain_coronal_section1.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +set -e + +cat > /tmp/params.yaml << 'HERE' +param_list: + - id: mouse_brain_coronal_section1 + input_expression: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_filtered_feature_bc_matrix.h5" + input_spatial: "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/CytAssist_FFPE_Mouse_Brain_Rep1/CytAssist_FFPE_Mouse_Brain_Rep1_spatial.tar.gz" + dataset_name: Mouse Brain Coronal Section 1 (FFPE) + dataset_url: "https://www.10xgenomics.com/datasets/mouse-brain-coronal-section-1-ffpe-2-standard" + dataset_summary: Gene expression library of Mouse Brain (CytAssist FFPE) using the Mouse Whole Transcriptome Probe Set + dataset_description: "FFPE Mouse Brain 
tissue blocks sectioned as described in Visium CytAssist Spatial Gene Expression for FFPE - Tissue Preparation Guide Demonstrated Protocol. The H&E stained glass slide with tissue section was processed via Visium CytAssist instrument to transfer analytes to a Visium CytAssist Spatial Gene Expression slide. The probe extension and library construction steps follow the standard Visium for FFPE workflow outside of the instrument. The H&E image was acquired using Olympus VS200 Slide Scanning Microscope. Sequencing depth was 53,497 reads per spot. Sequencing configuration: 28bp read 1 (16bp Visium spatial barcode, 12bp UMI), 90bp read 2 (transcript), 10bp i7 sample barcode and 10bp i5 sample barcode. Key metrics include: 2,310 spots detected under tissue; 6,736 median genes per spot; 24,862 median UMI counts per spot." + dataset_reference: 10x2022brain + dataset_organism: Mus musculus + +normalization_methods: [log_cp10k] +n_obs: 600 +n_vars: 500 +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +publish_dir: resources_test/common +do_subsample: true +spot_filter_min_genes: 200 +gene_filter_min_spots: 50 +remove_mitochondrial: true +HERE + +nextflow run . 
\ + -main-script target/nextflow/datasets/workflows/process_tenx_visium/main.nf \ + -c src/wf_utils/labels_ci.config \ + -profile docker \ + -params-file "/tmp/params.yaml" + diff --git a/src/datasets/resource_test_scripts/slideseq_test.sh b/src/datasets/resource_test_scripts/slideseq_test.sh new file mode 100755 index 0000000000..a9050be40a --- /dev/null +++ b/src/datasets/resource_test_scripts/slideseq_test.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +set -e + +cat > /tmp/params.yaml << 'HERE' +param_list: + - id: mouse_cerebellum + input_data: "https://zenodo.org/records/12785822/files/Slide-seqV2_stickels2020highly_stickels2021highly_SlideSeqV2_Mouse_Olfactory_bulb_Puck_200127_15_data_whole.h5ad?download=1" + dataset_name: Mouse cerebellum + dataset_url: "..." + dataset_summary: ... + dataset_description: "..." + dataset_reference: ref + dataset_organism: Mus musculus + +normalization_methods: [log_cp10k] +n_obs: 600 +n_vars: 500 +output_dataset: '$id/dataset.h5ad' +output_meta: '$id/dataset_metadata.yaml' +output_state: '$id/state.yaml' +output_raw: force_null +output_normalized: force_null +publish_dir: resources_test/common +do_subsample: true +spot_filter_min_genes: 200 +gene_filter_min_spots: 50 +remove_mitochondrial: true +HERE + +# Use the same workflow path as zenodo_spatial.sh.sh (process_zenodo_spatial); the parameters above subsample the dataset and publish it under resources_test/common. +nextflow run . \ + -main-script target/nextflow/datasets/workflows/process_zenodo_spatial/main.nf \ + -c src/wf_utils/labels_ci.config \ + -profile docker \ + -params-file "/tmp/params.yaml" + diff --git a/src/datasets/workflows/process_tenx_visium/config.vsh.yaml b/src/datasets/workflows/process_tenx_visium/config.vsh.yaml new file mode 100644 index 0000000000..91a2867820 --- /dev/null +++ b/src/datasets/workflows/process_tenx_visium/config.vsh.yaml @@ -0,0 +1,142 @@ +functionality: + name: process_tenx_visium + namespace: datasets/workflows + description: | + Download and process datasets originating from 10x Genomics.
+ argument_groups: + - name: Input + arguments: + - name: "--input_expression" + type: string + description: URL to the feature / barcode matrix HDF5. + required: true + - name: "--input_spatial" + type: string + description: URL to the Spatial imaging data. + required: true + - name: Outputs + arguments: + - name: "--output_dataset" + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_raw.yaml + - name: "--output_meta" + direction: "output" + type: file + description: "Dataset metadata" + default: "dataset_metadata.yaml" + - name: Metadata + arguments: + - name: "--id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: "--spot_filter_min_genes" + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: "--spot_filter_min_counts" + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: "--gene_filter_min_spots" + type: integer + description: Remove genes expressed in less than this number of cells. 
+ required: false + example: 50 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: "--remove_mitochondrial" + type: boolean + description: Remove mitochondrial genes? + required: false + - name: Sampling options + arguments: + - name: "--do_subsample" + type: boolean + default: false + description: "Whether or not to subsample the dataset" + - name: "--n_obs" + type: integer + description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + - name: "--n_vars" + type: integer + description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + # - name: "--keep_features" + # type: string + # multiple: true + # description: A list of genes to keep. + # - name: "--keep_cell_type_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--keep_batch_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--even" + # type: "boolean_true" + # description: Subsample evenly from different batches + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 + - name: Normalization + arguments: + - name: "--normalization_methods" + type: string + multiple: true + choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] + default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] + description: "Which normalization methods to run." 
+ resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: datasets/loaders/tenx_visium + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: common/extract_metadata +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/process_tenx_visium/main.nf b/src/datasets/workflows/process_tenx_visium/main.nf new file mode 100644 index 0000000000..2ec0eae247 --- /dev/null +++ b/src/datasets/workflows/process_tenx_visium/main.nf @@ -0,0 +1,133 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // create different normalization methods by overriding the defaults + normalization_methods = [ + log_cp.run( + key: "log_cp10k", + args: [normalization_id: "log_cp10k", n_cp: 10000], + ), + log_cp.run( + key: "log_cpm", + args: [normalization_id: "log_cpm", n_cp: 1000000], + ), + sqrt_cp.run( + key: "sqrt_cp10k", + args: [normalization_id: "sqrt_cp10k", n_cp: 10000], + ), + sqrt_cp.run( + key: "sqrt_cpm", + args: [normalization_id: "sqrt_cpm", n_cp: 1000000], + ), + l1_sqrt.run( + key: "l1_sqrt", + args: [normalization_id: "l1_sqrt"], + ), + log_scran_pooling.run( + key: "log_scran_pooling", + args: [normalization_id: "log_scran_pooling"], + ) + ] + + output_ch = input_ch + + // store original id for later use + | map{ id, state -> + [id, state + [_meta: [join_id: id]]] + } + + // load the 10x Visium dataset via the tenx_visium loader; NOTE(review): spot_filter_min_counts / gene_filter_min_counts are declared in the config but not forwarded here - confirm + | tenx_visium.run( + fromState: [ + "input_expression": "input_expression", + "input_spatial": "input_spatial", + "dataset_id": "id", + "dataset_name": "dataset_name", + "dataset_url": "dataset_url", + 
"dataset_reference": "dataset_reference", + "dataset_summary": "dataset_summary", + "dataset_description": "dataset_description", + "dataset_organism": "dataset_organism", + "spot_filter_min_genes": "spot_filter_min_genes", + "gene_filter_min_spots": "gene_filter_min_spots", + "remove_mitochondrial": "remove_mitochondrial" + ], + toState: ["output_raw": "dataset"] + ) + + // subsample if so desired + | subsample.run( + runIf: { id, state -> state.do_subsample }, + fromState: [ + "input": "output_raw", + "n_obs": "n_obs", + "n_vars": "n_vars", + "seed": "seed" + ], + args: [output_mod2: null], + toState: ["output_raw": "output"] + ) + + | runEach( + components: normalization_methods, + id: { id, state, comp -> + if (state.normalization_methods.size() > 1) { + id + "/" + comp.name + } else { + id + } + }, + filter: { id, state, comp -> + comp.name in state.normalization_methods + }, + fromState: ["input": "output_raw"], + toState: { id, output, state, comp -> + state + [ + output_normalized: output.output, + normalization_id: comp.name + ] + } + ) + + // add synonym + | map{ id, state -> + [id, state + [output_dataset: state.output_normalized]] + } + + | extract_metadata.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_dataset") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? 
it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_dataset, + "schema": schemaYaml + ] + }, + toState: ["output_meta": "output"] + ) + + // only output the files for which an output file was specified + | setState([ + "output_dataset", + "output_meta", + "_meta" + ]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml b/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml new file mode 100644 index 0000000000..45b938b716 --- /dev/null +++ b/src/datasets/workflows/process_zenodo_spatial/config.vsh.yaml @@ -0,0 +1,138 @@ +functionality: + name: process_zenodo_spatial + namespace: datasets/workflows + description: | + Download and process DBiT seq, MERFISH, seqFISH, Slide-seq v2, STARmap, and Stereo-seq data from Zenodo. + argument_groups: + - name: Input + arguments: + - name: "--input_data" + type: string + description: URL to the Anndata file. + required: true + - name: Outputs + arguments: + - name: "--output_dataset" + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_raw.yaml + - name: "--output_meta" + direction: "output" + type: file + description: "Dataset metadata" + default: "dataset_metadata.yaml" + - name: Metadata + arguments: + - name: "--id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. + required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. 
+ required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: "--spot_filter_min_genes" + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: "--spot_filter_min_counts" + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: "--gene_filter_min_spots" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: "--remove_mitochondrial" + type: boolean + description: Remove mitochondrial genes? + required: false + - name: Sampling options + arguments: + - name: "--do_subsample" + type: boolean + default: false + description: "Whether or not to subsample the dataset" + - name: "--n_obs" + type: integer + description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. + default: 600 + - name: "--n_vars" + type: integer + description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + # - name: "--keep_features" + # type: string + # multiple: true + # description: A list of genes to keep. 
+ # - name: "--keep_cell_type_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--keep_batch_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--even" + # type: "boolean_true" + # description: Subsample evenly from different batches + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 + - name: Normalization + arguments: + - name: "--normalization_methods" + type: string + multiple: true + choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] + default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] + description: "Which normalization methods to run." + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: datasets/loaders/zenodo_spatial + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: common/extract_metadata +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial/main.nf b/src/datasets/workflows/process_zenodo_spatial/main.nf new file mode 100644 index 0000000000..a5893c0ab4 --- /dev/null +++ b/src/datasets/workflows/process_zenodo_spatial/main.nf @@ -0,0 +1,132 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // create different normalization methods by overriding the defaults + normalization_methods = [ + log_cp.run( + key: "log_cp10k", + args: [normalization_id: "log_cp10k", n_cp: 10000], + ), + 
log_cp.run( + key: "log_cpm", + args: [normalization_id: "log_cpm", n_cp: 1000000], + ), + sqrt_cp.run( + key: "sqrt_cp10k", + args: [normalization_id: "sqrt_cp10k", n_cp: 10000], + ), + sqrt_cp.run( + key: "sqrt_cpm", + args: [normalization_id: "sqrt_cpm", n_cp: 1000000], + ), + l1_sqrt.run( + key: "l1_sqrt", + args: [normalization_id: "l1_sqrt"], + ), + log_scran_pooling.run( + key: "log_scran_pooling", + args: [normalization_id: "log_scran_pooling"], + ) + ] + + output_ch = input_ch + + // store original id for later use + | map{ id, state -> + [id, state + [_meta: [join_id: id]]] + } + + // load the spatial dataset via the zenodo_spatial loader; NOTE(review): spot_filter_min_counts / gene_filter_min_counts are declared in the config but not forwarded here - confirm + | zenodo_spatial.run( + fromState: [ + "input_data": "input_data", + "dataset_id": "id", + "dataset_name": "dataset_name", + "dataset_url": "dataset_url", + "dataset_reference": "dataset_reference", + "dataset_summary": "dataset_summary", + "dataset_description": "dataset_description", + "dataset_organism": "dataset_organism", + "spot_filter_min_genes": "spot_filter_min_genes", + "gene_filter_min_spots": "gene_filter_min_spots", + "remove_mitochondrial": "remove_mitochondrial" + ], + toState: ["output_raw": "dataset"] + ) + + // subsample if so desired + | subsample.run( + runIf: { id, state -> state.do_subsample }, + fromState: [ + "input": "output_raw", + "n_obs": "n_obs", + "n_vars": "n_vars", + "seed": "seed" + ], + args: [output_mod2: null], + toState: ["output_raw": "output"] + ) + + | runEach( + components: normalization_methods, + id: { id, state, comp -> + if (state.normalization_methods.size() > 1) { + id + "/" + comp.name + } else { + id + } + }, + filter: { id, state, comp -> + comp.name in state.normalization_methods + }, + fromState: ["input": "output_raw"], + toState: { id, output, state, comp -> + state + [ + output_normalized: output.output, + normalization_id: comp.name + ] + } + ) + + // add synonym + | map{ id, state -> + [id, state + [output_dataset: state.output_normalized]] + } + + | extract_metadata.run( + 
fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_dataset") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_dataset, + "schema": schemaYaml + ] + }, + toState: ["output_meta": "output"] + ) + + // only output the files for which an output file was specified + | setState([ + "output_dataset", + "output_meta", + "_meta" + ]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml b/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml new file mode 100644 index 0000000000..23934fe161 --- /dev/null +++ b/src/datasets/workflows/process_zenodo_spatial_slidetags/config.vsh.yaml @@ -0,0 +1,138 @@ +functionality: + name: process_zenodo_spatial_slidetags + namespace: datasets/workflows + description: | + Download and process slide tags datasets originating from Zenodo. + argument_groups: + - name: Input + arguments: + - name: "--input_data" + type: string + description: URL to the Anndata file. + required: true + - name: Outputs + arguments: + - name: "--output_dataset" + type: file + direction: output + description: Output h5ad file + required: true + __merge__: /src/datasets/api/file_raw.yaml + - name: "--output_meta" + direction: "output" + type: file + description: "Dataset metadata" + default: "dataset_metadata.yaml" + - name: Metadata + arguments: + - name: "--id" + type: string + description: Unique identifier of the dataset. + required: true + - name: "--dataset_name" + type: string + description: Nicely formatted name. + required: true + - name: "--dataset_url" + type: string + description: Link to the original source of the dataset. 
+ required: false + - name: "--dataset_reference" + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: "--dataset_summary" + type: string + description: Short description of the dataset. + required: true + - name: "--dataset_description" + type: string + description: Long description of the dataset. + required: true + - name: "--dataset_organism" + type: string + description: The organism of the dataset. + required: false + - name: Gene or spot filtering + description: Arguments related to filtering cells and genes by counts. + arguments: + - name: "--spot_filter_min_genes" + type: integer + description: Remove spots with less than this number of genes. + required: false + example: 200 + - name: "--spot_filter_min_counts" + type: integer + description: Remove spots with less than this number of counts. + required: false + - name: "--gene_filter_min_spots" + type: integer + description: Remove genes expressed in less than this number of cells. + required: false + example: 50 + - name: "--gene_filter_min_counts" + type: integer + description: Remove genes with less than this number of counts. + required: false + - name: "--remove_mitochondrial" + type: boolean + description: Remove mitochondrial genes? + required: false + - name: Sampling options + arguments: + - name: "--do_subsample" + type: boolean + default: false + description: "Whether or not to subsample the dataset" + - name: "--n_obs" + type: integer + description: Maximum number of observations to be kept. It might end up being less because empty cells / genes are removed. + default: 600 + - name: "--n_vars" + type: integer + description: Maximum number of variables to be kept. It might end up being less because empty cells / genes are removed. + default: 500 + # - name: "--keep_features" + # type: string + # multiple: true + # description: A list of genes to keep. 
+ # - name: "--keep_cell_type_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--keep_batch_categories" + # type: "string" + # multiple: true + # description: "Categories indexes to be selected" + # required: false + # - name: "--even" + # type: "boolean_true" + # description: Subsample evenly from different batches + - name: "--seed" + type: "integer" + description: "A seed for the subsampling." + example: 123 + - name: Normalization + arguments: + - name: "--normalization_methods" + type: string + multiple: true + choices: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt", "log_scran_pooling"] + default: ["log_cp10k", "log_cpm", "sqrt_cp10k", "sqrt_cpm", "l1_sqrt"] + description: "Which normalization methods to run." + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: datasets/loaders/zenodo_spatial_slidetags + - name: datasets/normalization/log_cp + - name: datasets/normalization/log_scran_pooling + - name: datasets/normalization/sqrt_cp + - name: datasets/normalization/l1_sqrt + - name: datasets/processors/subsample + - name: common/extract_metadata +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf b/src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf new file mode 100644 index 0000000000..2bb6b9300a --- /dev/null +++ b/src/datasets/workflows/process_zenodo_spatial_slidetags/main.nf @@ -0,0 +1,132 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // create different normalization methods by overriding the defaults + normalization_methods = [ + log_cp.run( + key: "log_cp10k", + args: [normalization_id: 
"log_cp10k", n_cp: 10000], + ), + log_cp.run( + key: "log_cpm", + args: [normalization_id: "log_cpm", n_cp: 1000000], + ), + sqrt_cp.run( + key: "sqrt_cp10k", + args: [normalization_id: "sqrt_cp10k", n_cp: 10000], + ), + sqrt_cp.run( + key: "sqrt_cpm", + args: [normalization_id: "sqrt_cpm", n_cp: 1000000], + ), + l1_sqrt.run( + key: "l1_sqrt", + args: [normalization_id: "l1_sqrt"], + ), + log_scran_pooling.run( + key: "log_scran_pooling", + args: [normalization_id: "log_scran_pooling"], + ) + ] + + output_ch = input_ch + + // store original id for later use + | map{ id, state -> + [id, state + [_meta: [join_id: id]]] + } + + // load the Slide-tags dataset via the zenodo_spatial_slidetags loader; NOTE(review): spot_filter_min_counts / gene_filter_min_counts are declared in the config but not forwarded here - confirm + | zenodo_spatial_slidetags.run( + fromState: [ + "input_data": "input_data", + "dataset_id": "id", + "dataset_name": "dataset_name", + "dataset_url": "dataset_url", + "dataset_reference": "dataset_reference", + "dataset_summary": "dataset_summary", + "dataset_description": "dataset_description", + "dataset_organism": "dataset_organism", + "spot_filter_min_genes": "spot_filter_min_genes", + "gene_filter_min_spots": "gene_filter_min_spots", + "remove_mitochondrial": "remove_mitochondrial" + ], + toState: ["output_raw": "dataset"] + ) + + // subsample if so desired + | subsample.run( + runIf: { id, state -> state.do_subsample }, + fromState: [ + "input": "output_raw", + "n_obs": "n_obs", + "n_vars": "n_vars", + "seed": "seed" + ], + args: [output_mod2: null], + toState: ["output_raw": "output"] + ) + + | runEach( + components: normalization_methods, + id: { id, state, comp -> + if (state.normalization_methods.size() > 1) { + id + "/" + comp.name + } else { + id + } + }, + filter: { id, state, comp -> + comp.name in state.normalization_methods + }, + fromState: ["input": "output_raw"], + toState: { id, output, state, comp -> + state + [ + output_normalized: output.output, + normalization_id: comp.name + ] + } + ) + + // add synonym + | map{ id, state -> + [id, state + [output_dataset: 
state.output_normalized]] + } + + | extract_metadata.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "output_dataset") + // workaround: convert GString to String + schema = iterateMap(schema, { it instanceof GString ? it.toString() : it }) + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.output_dataset, + "schema": schemaYaml + ] + }, + toState: ["output_meta": "output"] + ) + + // only output the files for which an output file was specified + | setState([ + "output_dataset", + "output_meta", + "_meta" + ]) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/tasks/spatially_variable_genes/README.md b/src/tasks/spatially_variable_genes/README.md new file mode 100644 index 0000000000..315c01900a --- /dev/null +++ b/src/tasks/spatially_variable_genes/README.md @@ -0,0 +1,335 @@ +# Spatially variable genes + + + + +Spatially variable genes (SVGs) are genes whose expression levels vary +significantly across different spatial regions within a tissue or across +cells in a spatially structured context. + +Path to source: +[`src/tasks/spatially_variable_genes`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/tasks/spatially_variable_genes) + +## Motivation + +Recent years have witnessed significant progress in spatially-resolved +transcriptome profiling techniques that simultaneously characterize +cellular gene expression and their physical position, generating spatial +transcriptomic (ST) data. The application of these techniques has +dramatically advanced our understanding of disease and developmental +biology. One common task for all ST profiles, regardless of the employed +protocols, is to identify genes that exhibit spatial patterns. These +genes, defined as spatially variable genes (SVGs), contain additional +information about the spatial structure of the tissues of interest, +compared to highly variable genes (HVGs). 
+ +## Description + +Identification of spatially variable genes is crucial for studying +spatial domains within tissue microenvironments, developmental gradients +and cell signaling pathways. In this task we attempt to evaluate various +methods for detecting SVGs using a number of realistic simulated +datasets with diverse patterns derived from real-world spatial +transcriptomics data using scDesign3. Synthetic data is generated by +mixing a Gaussian Process (GP) model and a non-spatial model (obtained +by shuffling mean parameters of the GP model to remove spatial +correlation between spots) to generate gene expressions with various +spatial variability. + +## Authors & contributors + +| name | roles | +|:------------------|:-------------------| +| Zhijian Li | author, maintainer | +| Zain M. Patel | author | +| Dongyuan Song | author | +| Guanao Yan | author | +| Jingyi Jessica Li | author | +| Luca Pinello | author | +| Robrecht Cannoodt | contributor | +| Sai Nirmayi Yasa | contributor | + +## API + +``` mermaid +flowchart LR + file_common_dataset("Common Dataset") + comp_process_dataset[/"Data processor"/] + file_dataset("Dataset") + file_solution("Solution") + comp_control_method[/"Control method"/] + comp_method[/"Method"/] + comp_metric[/"Metric"/] + file_output("Output") + file_score("Score") + file_common_dataset---comp_process_dataset + comp_process_dataset-->file_dataset + comp_process_dataset-->file_solution + file_dataset---comp_control_method + file_dataset---comp_method + file_solution---comp_control_method + file_solution---comp_metric + comp_control_method-->file_output + comp_method-->file_output + comp_metric-->file_score + file_output---comp_metric +``` + +## File format: Common Dataset + +A subset of the common dataset. + +Example file: +`resources_test/common/mouse_brain_coronal_section1/dataset.h5ad` + +Format: + +
+ + AnnData object + var: 'feature_id', 'feature_name' + obsm: 'spatial' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------------|:----------|:-------------------------------------------------------------------------------| +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `obsm["spatial"]` | `double` | Spatial coordinates for each spot. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | (*Optional*) Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | (*Optional*) Short description of the dataset. | +| `uns["dataset_description"]` | `string` | (*Optional*) Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | + +
+ +## Component type: Data processor + +Path: +[`src/spatially_variable_genes`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/spatially_variable_genes) + +A spatially variable genes dataset processor. + +Arguments: + +
+ +| Name | Type | Description | +|:--------------------|:-------|:---------------------------------------------------------| +| `--input` | `file` | A subset of the common dataset. | +| `--output_dataset` | `file` | (*Output*) The dataset without spatially variable genes. | +| `--output_solution` | `file` | (*Output*) Anndata with true spatial variability. | + +
+ +## File format: Dataset + +The dataset without spatially variable genes. + +Example file: +`resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad` + +Format: + +
+ + AnnData object + var: 'feature_id', 'feature_name' + obsm: 'spatial' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------|:----------|:----------------------------------------------------------------------------------------------------------| +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, in this case a ENSEMBL gene id suffixed with alpha value. | +| `var["feature_name"]` | `string` | (*Optional*) A human-readable name for the feature, in this case a gene symbol suffixed with alpha value. | +| `obsm["spatial"]` | `double` | Spatial coordinates for each spot. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalised expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | (*Optional*) Nicely formatted name. | + +
+ +## File format: Solution + +Anndata with true spatial variability. + +Example file: +`resources_test/spatially_variable_genes/mouse_brain_coronal_section1/solution.h5ad` + +Description: + +Anndata with true spatial variability score for each gene. + +Format: + +
+ + AnnData object + var: 'feature_id', 'feature_name', 'orig_feature_name', 'true_spatial_var_score' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------------|:---------|:-------------------------------------------------------------------------------------------------| +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature (e.g., ENSEMBL gene id suffixed with alpha value). | +| `var["feature_name"]` | `string` | A human-readable name for the feature, in this case a gene symbol suffixed with alpha value. | +| `var["orig_feature_name"]` | `string` | Original human-readable name for the feature, usually a gene symbol. | +| `var["true_spatial_var_score"]` | `double` | True spatial variability score. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | + +
+ +## Component type: Control method + +Path: +[`src/spatially_variable_genes/control_methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/spatially_variable_genes/control_methods) + +Quality control methods for verifying the pipeline. + +Arguments: + +
+ +| Name | Type | Description | +|:-------------------|:-------|:------------------------------------------------------| +| `--input_data` | `file` | The dataset without spatially variable genes. | +| `--input_solution` | `file` | Anndata with true spatial variability. | +| `--output` | `file` | (*Output*) Anndata with estimated spatial variability. | + +
+ +## Component type: Method + +Path: +[`src/spatially_variable_genes/methods`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/spatially_variable_genes/methods) + +A spatially variable gene identification method. + +Arguments: + +
+ +| Name | Type | Description | +|:---------------|:-------|:------------------------------------------------------| +| `--input_data` | `file` | The dataset without spatially variable genes. | +| `--output` | `file` | (*Output*) Anndata with estimated spatial variability. | + +
+ +## Component type: Metric + +Path: +[`src/spatially_variable_genes/metrics`](https://github.com/openproblems-bio/openproblems-v2/tree/main/src/spatially_variable_genes/metrics) + +A spatially variable genes identification metric. + +Arguments: + +
+ +| Name | Type | Description | +|:-------------------|:-------|:-------------------------------------------| +| `--input_method` | `file` | Anndata with estimated spatial variability. | +| `--input_solution` | `file` | Anndata with true spatial variability. | +| `--output` | `file` | (*Output*) Metric score file. | + +
+ +## File format: Output + +Anndata with estimated spatial variability. + +Example file: +`resources_test/spatially_variable_genes/mouse_brain_coronal_section1/output.h5ad` + +Description: + +Anndata with estimated spatial variability score for each gene. + +Format: + +
+ + AnnData object + var: 'feature_id', 'feature_name', 'pred_spatial_var_score' + uns: 'dataset_id', 'method_id' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:--------------------------------|:---------|:-------------------------------------| +| `var["feature_id"]` | `string` | Feature ID. | +| `var["feature_name"]` | `string` | (*Optional*) Feature name. | +| `var["pred_spatial_var_score"]` | `double` | Predicted spatial variability score. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ +## File format: Score + +Metric score file. + +Example file: +`resources_test/spatially_variable_genes/mouse_brain_coronal_section1/score.h5ad` + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Slot description: + +
+ +| Slot | Type | Description | +|:-----------------------|:---------|:---------------------------------------------------------------------------------------------| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | + +
+ diff --git a/src/tasks/spatially_variable_genes/api/comp_control_method.yaml b/src/tasks/spatially_variable_genes/api/comp_control_method.yaml new file mode 100644 index 0000000000..ee107bfd24 --- /dev/null +++ b/src/tasks/spatially_variable_genes/api/comp_control_method.yaml @@ -0,0 +1,34 @@ +functionality: + namespace: "spatially_variable_genes/control_methods" + info: + type: control_method + type_info: + label: Control method + summary: Quality control methods for verifying the pipeline. + description: | + Control methods have the same interface as the regular methods + but also receive the solution object as input. It serves as a + starting point to test the relative accuracy of new methods in + the task, and also as a quality control for the metrics defined + in the task. + arguments: + - name: "--input_data" + __merge__: file_dataset.yaml + direction: input + required: true + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--output" + __merge__: file_output.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/spatially_variable_genes/mouse_brain_coronal_section1 + dest: resources_test/spatially_variable_genes/mouse_brain_coronal_section1 + - path: /src/common/library.bib diff --git a/src/tasks/spatially_variable_genes/api/comp_method.yaml b/src/tasks/spatially_variable_genes/api/comp_method.yaml new file mode 100644 index 0000000000..52372f7b33 --- /dev/null +++ b/src/tasks/spatially_variable_genes/api/comp_method.yaml @@ -0,0 +1,25 @@ +functionality: + namespace: "spatially_variable_genes/methods" + info: + type: method + type_info: + label: Method + summary: A spatially variable gene identification method. 
+ description: "Method to identify spatially variable genes" + arguments: + - name: "--input_data" + __merge__: file_dataset.yaml + direction: input + required: true + - name: "--output" + __merge__: file_output.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/check_method_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/spatially_variable_genes/mouse_brain_coronal_section1 + dest: resources_test/spatially_variable_genes/mouse_brain_coronal_section1 + - path: /src/common/library.bib \ No newline at end of file diff --git a/src/tasks/spatially_variable_genes/api/comp_metric.yaml b/src/tasks/spatially_variable_genes/api/comp_metric.yaml new file mode 100644 index 0000000000..73166a2160 --- /dev/null +++ b/src/tasks/spatially_variable_genes/api/comp_metric.yaml @@ -0,0 +1,31 @@ +functionality: + namespace: "spatially_variable_genes/metrics" + info: + type: metric + type_info: + label: Metric + summary: A spatially variable genes identification metric. 
+ description: | + A metric for evaluating the accuracy of spatially variable genes identification + arguments: + - name: "--input_method" + __merge__: file_output.yaml + direction: input + required: true + - name: "--input_solution" + __merge__: file_solution.yaml + direction: input + required: true + - name: "--output" + __merge__: file_score.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/check_metric_config.py + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/spatially_variable_genes/mouse_brain_coronal_section1 + dest: resources_test/spatially_variable_genes/mouse_brain_coronal_section1 + - path: /src/common/library.bib + \ No newline at end of file diff --git a/src/tasks/spatially_variable_genes/api/comp_process_dataset.yaml b/src/tasks/spatially_variable_genes/api/comp_process_dataset.yaml new file mode 100644 index 0000000000..b18780013d --- /dev/null +++ b/src/tasks/spatially_variable_genes/api/comp_process_dataset.yaml @@ -0,0 +1,27 @@ +functionality: + namespace: "spatially_variable_genes" + info: + type: process_dataset + type_info: + label: Data processor + summary: A spatially variable genes dataset processor. + description: | + Prepare a common dataset for the spatially_variable_genes task.
+ arguments: + - name: "--input" + __merge__: file_common_dataset.yaml + direction: input + required: true + - name: "--output_dataset" + __merge__: file_dataset.yaml + direction: output + required: true + - name: "--output_solution" + __merge__: file_solution.yaml + direction: output + required: true + test_resources: + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/common/mouse_brain_coronal_section1 + dest: resources_test/common/mouse_brain_coronal_section1 \ No newline at end of file diff --git a/src/tasks/spatially_variable_genes/api/file_common_dataset.yaml b/src/tasks/spatially_variable_genes/api/file_common_dataset.yaml new file mode 100644 index 0000000000..1837e45020 --- /dev/null +++ b/src/tasks/spatially_variable_genes/api/file_common_dataset.yaml @@ -0,0 +1,58 @@ +type: file +example: "resources_test/common/mouse_brain_coronal_section1/dataset.h5ad" +info: + label: "Common Dataset" + summary: A subset of the common dataset. + slots: + layers: + - type: integer + name: counts + description: Raw counts. + required: true + - type: double + name: normalized + description: Normalized expression values. + required: true + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + required: false + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + required: true + obsm: + - type: double + name: spatial + description: Spatial coordinates for each spot. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: false + - type: string + name: dataset_url + description: Link to the original source of the dataset.
+ required: false + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: false + - name: dataset_description + type: string + description: Long description of the dataset. + required: false + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false \ No newline at end of file diff --git a/src/tasks/spatially_variable_genes/api/file_dataset.yaml b/src/tasks/spatially_variable_genes/api/file_dataset.yaml new file mode 100644 index 0000000000..1061720a11 --- /dev/null +++ b/src/tasks/spatially_variable_genes/api/file_dataset.yaml @@ -0,0 +1,40 @@ +type: file +example: "resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad" +info: + label: "Dataset" + summary: The dataset without spatially variable genes. + slots: + layers: + - type: integer + name: counts + description: Raw counts. + required: true + - type: double + name: normalized + description: Normalised expression values + required: true + var: + - type: string + name: feature_id + description: Unique identifier for the feature, in this case a ENSEMBL gene id suffixed with alpha value. + required: false + - type: string + name: feature_name + description: A human-readable name for the feature, in this case a gene symbol suffixed with alpha value. + required: false + + obsm: + - type: double + name: spatial + description: Spatial coordinates for each spot. + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. 
+ required: false + \ No newline at end of file diff --git a/src/tasks/spatially_variable_genes/api/file_output.yaml b/src/tasks/spatially_variable_genes/api/file_output.yaml new file mode 100644 index 0000000000..e1fb7f6eac --- /dev/null +++ b/src/tasks/spatially_variable_genes/api/file_output.yaml @@ -0,0 +1,30 @@ +type: file +example: "resources_test/spatially_variable_genes/mouse_brain_coronal_section1/output.h5ad" +info: + label: Output + summary: "Anndata with estimated spatial variability." + description: "Anndata with estimated spatial variability score for each gene." + slots: + var: + - type: string + name: feature_id + description: Feature ID + required: true + - type: string + name: feature_name + description: Feature name + required: false + - type: double + name: pred_spatial_var_score + description: Predicted spatial variability score + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + \ No newline at end of file diff --git a/src/tasks/spatially_variable_genes/api/file_score.yaml b/src/tasks/spatially_variable_genes/api/file_score.yaml new file mode 100644 index 0000000000..28b3a47e14 --- /dev/null +++ b/src/tasks/spatially_variable_genes/api/file_score.yaml @@ -0,0 +1,25 @@ +type: file +example: "resources_test/spatially_variable_genes/mouse_brain_coronal_section1/score.h5ad" +info: + label: "Score" + summary: Metric score file.
+ slots: + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: string + name: metric_ids + description: "One or more unique metric identifiers" + multiple: true + required: true + - type: double + name: metric_values + description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + multiple: true + required: true diff --git a/src/tasks/spatially_variable_genes/api/file_simulated_dataset.yaml b/src/tasks/spatially_variable_genes/api/file_simulated_dataset.yaml new file mode 100644 index 0000000000..043b459690 --- /dev/null +++ b/src/tasks/spatially_variable_genes/api/file_simulated_dataset.yaml @@ -0,0 +1,66 @@ +type: file +example: "resources_test/spatially_variable_genes/mouse_brain_coronal_section1/simulated_dataset.h5ad" +info: + label: "Common Dataset" + summary: A subset of the common dataset. + slots: + layers: + - type: integer + name: counts + description: Raw counts. + required: true + var: + - type: string + name: feature_id + description: Unique identifier for the feature, in this case a ENSEMBL gene id suffixed with alpha value. + required: false + - type: string + name: feature_name + description: A human-readable name for the feature, in this case a gene symbol suffixed with alpha value. + required: true + - type: string + name: orig_feature_id + description: Original unique identifier for the feature, usually a ENSEMBL gene id. + required: false + - type: string + name: orig_feature_name + description: Original human-readable name for the feature, usually a gene symbol. + required: true + - type: double + name: true_spatial_var_score + description: True spatial variability score + required: true + obsm: + - type: double + name: spatial + description: Spatial coordinates for each spot. 
+ required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: true + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: true \ No newline at end of file diff --git a/src/tasks/spatially_variable_genes/api/file_solution.yaml b/src/tasks/spatially_variable_genes/api/file_solution.yaml new file mode 100644 index 0000000000..f26006bfd0 --- /dev/null +++ b/src/tasks/spatially_variable_genes/api/file_solution.yaml @@ -0,0 +1,57 @@ +type: file +example: "resources_test/spatially_variable_genes/mouse_brain_coronal_section1/solution.h5ad" +info: + label: Solution + summary: "Anndata with true spatial variability." + description: "Anndata with true spatial variability score for each gene." + slots: + var: + - type: string + name: feature_id + description: Unique identifier for the feature (e.g., ENSEMBL gene id suffixed with alpha value). + required: false + - type: string + name: feature_name + description: A human-readable name for the feature, in this case a gene symbol suffixed with alpha value. + required: true + # - type: string + # name: orig_feature_id + # description: Original unique identifier for the feature, usually a ENSEMBL gene id. + # required: false + - type: string + name: orig_feature_name + description: Original human-readable name for the feature, usually a gene symbol.
+ required: true + - type: double + name: true_spatial_var_score + description: True spatial variability score + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - name: dataset_name + type: string + description: Nicely formatted name. + required: true + - type: string + name: dataset_url + description: Link to the original source of the dataset. + required: true + - name: dataset_reference + type: string + description: Bibtex reference of the paper in which the dataset was published. + required: false + - name: dataset_summary + type: string + description: Short description of the dataset. + required: true + - name: dataset_description + type: string + description: Long description of the dataset. + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: true \ No newline at end of file diff --git a/src/tasks/spatially_variable_genes/api/task_info.yaml b/src/tasks/spatially_variable_genes/api/task_info.yaml new file mode 100644 index 0000000000..78937fb59e --- /dev/null +++ b/src/tasks/spatially_variable_genes/api/task_info.yaml @@ -0,0 +1,47 @@ +name: spatially_variable_genes +label: "Spatially variable genes" +summary: "Detecting genes whose expression levels vary across spatial regions." +motivation: | + Recent years have witnessed significant progress in spatially-resolved transcriptome profiling techniques that simultaneously characterize cellular gene expression and their physical position, generating spatial transcriptomic (ST) data. The application of these techniques has dramatically advanced our understanding of disease and developmental biology. One common task for all ST profiles, regardless of the employed protocols, is to identify genes that exhibit spatial patterns. 
These genes, defined as spatially variable genes (SVGs), contain additional information about the spatial structure of the tissues of interest, compared to highly variable genes (HVGs). +description: | + Identification of spatially variable genes is crucial for studying spatial domains within tissue microenvironments, developmental gradients and cell signaling pathways. In this task we attempt to evaluate various methods for detecting SVGs using a number of realistic simulated datasets with diverse patterns derived from real-world spatial transcriptomics data using scDesign3. Synthetic data is generated by mixing a Gaussian Process (GP) model and a non-spatial model (obtained by shuffling mean parameters of the GP model to remove spatial correlation between spots) to generate gene expressions with various spatial variability. For more details, please refer to our [manuscript](https://www.biorxiv.org/content/10.1101/2023.12.02.569717v1) and [Github](https://github.com/pinellolab/SVG_Benchmarking). +references: + doi: + # Benchmarking computational methods to identify spatially variable genes and peaks + # Zhijian Li, Zain M.Patel, Dongyuan Song, Guanao Yan, Jingyi Jessica Li, Luca Pinello + # bioRxiv 2023.12.02.569717; doi: https://doi.org/10.1101/2023.12.02.569717 + - 10.1101/2023.12.02.569717 +authors: + - name: Zhijian Li + roles: [ author, maintainer ] + info: + github: lzj1769 + orcid: 0000-0002-1523-1333 + - name: Zain M.
Patel + roles: [ author ] + info: + github: doczmp + - name: Dongyuan Song + roles: [ author] + info: + github: SONGDONGYUAN1994 + - name: Guanao Yan + roles: [ author ] + - name: Jingyi Jessica Li + roles: [ author ] + info: + github: JSB-UCLA + - name: Luca Pinello + roles: [ author ] + info: + github: pinellolab + - name: Robrecht Cannoodt + roles: [contributor] + info: + github: rcannood + orcid: 0000-0003-3641-729X + - name: Sai Nirmayi Yasa + roles: [contributor] + info: + github: sainirmayi + orcid: 0009-0003-6319-9803 \ No newline at end of file diff --git a/src/tasks/spatially_variable_genes/control_methods/random_ranking/config.vsh.yaml b/src/tasks/spatially_variable_genes/control_methods/random_ranking/config.vsh.yaml new file mode 100644 index 0000000000..fcb1b767f1 --- /dev/null +++ b/src/tasks/spatially_variable_genes/control_methods/random_ranking/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml + +functionality: + name: random_ranking + info: + label: Random Ranking + summary: "Negative control method that randomly ranks genes." + description: | + A negative control method with random ranking of genes.
+ preferred_normalization: counts + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: pandas + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/control_methods/random_ranking/script.py b/src/tasks/spatially_variable_genes/control_methods/random_ranking/script.py new file mode 100644 index 0000000000..e43c4e5079 --- /dev/null +++ b/src/tasks/spatially_variable_genes/control_methods/random_ranking/script.py @@ -0,0 +1,28 @@ +import anndata as ad +import numpy as np + +# VIASH START +par = { + 'input_data': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'input_solution': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/solution.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'random_ranking' +} +# VIASH END + +print('Generate predictions', flush=True) +input_data = ad.read_h5ad(par['input_data']) + +df = input_data.var[["feature_id"]].copy() + +np.random.seed(0) +df['pred_spatial_var_score'] = np.random.rand(len(df['feature_id'])) + +output = ad.AnnData(var=df, + uns={'dataset_id': input_data.uns['dataset_id'], + 'method_id': meta['functionality_name']}) + +print("Write output to file", flush=True) +output.write_h5ad(par['output']) diff --git a/src/tasks/spatially_variable_genes/control_methods/true_ranking/config.vsh.yaml b/src/tasks/spatially_variable_genes/control_methods/true_ranking/config.vsh.yaml new file mode 100644 index 0000000000..b37a66d9d8 --- /dev/null +++ b/src/tasks/spatially_variable_genes/control_methods/true_ranking/config.vsh.yaml @@ -0,0 +1,25 @@ +__merge__: ../../api/comp_control_method.yaml + +functionality: + name: true_ranking + info: + label: True Ranking + summary: "Positive control method that correctly ranks genes."
+ description: | + A positive control method with correct ranking of genes. + preferred_normalization: counts + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: pandas + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/control_methods/true_ranking/script.py b/src/tasks/spatially_variable_genes/control_methods/true_ranking/script.py new file mode 100644 index 0000000000..2504fdc4f2 --- /dev/null +++ b/src/tasks/spatially_variable_genes/control_methods/true_ranking/script.py @@ -0,0 +1,25 @@ +import anndata as ad + +# VIASH START +par = { + 'input_data': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'input_solution': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/solution.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'true_ranking' +} +# VIASH END + +print('Generate predictions', flush=True) +input_solution = ad.read_h5ad(par['input_solution']) + +df = input_solution.var[["feature_id", "true_spatial_var_score"]] +df = df.rename(columns={'true_spatial_var_score': 'pred_spatial_var_score'}) + +output = ad.AnnData(var=df, + uns={'dataset_id': input_solution.uns['dataset_id'], + 'method_id': meta['functionality_name']}) + +print("Write output to file", flush=True) +output.write_h5ad(par['output']) diff --git a/src/tasks/spatially_variable_genes/methods/boostgp/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/boostgp/config.vsh.yaml new file mode 100644 index 0000000000..060dc44675 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/boostgp/config.vsh.yaml @@ -0,0 +1,48 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: boostgp + info: + label: BOOST-GP + summary: "Bayesian modeling of spatial molecular profiling data via Gaussian
process" + description: | + BOOST-GP is a novel Bayesian hierarchical model to analyze spatial transcriptomics data, + with several unique characteristics. It models the zero-inflated and over-dispersed + counts by deploying a zero-inflated negative binomial model that greatly increases + model stability and robustness. Besides, the Bayesian inference framework allows us + to borrow strength in parameter estimation in a de novo fashion. As a result, + the proposed model shows competitive performances in accuracy and robustness + over existing methods in both simulation studies and two real data applications. + preferred_normalization: counts + reference: li2021bayesian + documentation_url: https://github.com/Minzhe/BOOST-GP + repository_url: https://github.com/Minzhe/BOOST-GP + + arguments: + - name: --n_iter + type: integer + description: Number of iterations. + default: 10 + info: + test_default: 7 + + resources: + - type: r_script + path: script.R + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_r:1.0.4 + setup: + - type: apt + packages: + - git + - type: docker + run : | + git clone https://github.com/Minzhe/BOOST-GP.git /opt/BOOST-GP + - type: r + cran: [RcppDist, ggplot2, anndata] + - type: native + - type: nextflow + directives: + label: [veryhightime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/methods/boostgp/script.R b/src/tasks/spatially_variable_genes/methods/boostgp/script.R new file mode 100644 index 0000000000..4596bff2e6 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/boostgp/script.R @@ -0,0 +1,50 @@ +library(RcppDist) +library(anndata) + +dest <- getwd() + +# VIASH START +par <- list( + "input_data" = "resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad", + "output" = "output.h5ad", + "n_iter" = 10 +) +meta <- list( + "functionality_name" = "BOOST-GP" +) +# VIASH END + +cat("Load data\n") +adata <- anndata::read_h5ad(par$input_data) + +setwd("/opt/BOOST-GP")
+source("./R/boost.gp.R") + +counts <- as.matrix(adata$layers[["counts"]]) +colnames(counts) <- adata$var_names +rownames(counts) <- adata$obs_names +mode(counts) <- "integer" + +loc <- as.data.frame(adata$obsm[["spatial"]]) +rownames(loc) <- adata$obs_names +colnames(loc) <- c("x", "y") + +cat("Run BOOST-GP\n") +df <- as.data.frame(boost.gp(Y = counts, loc = loc, iter = par$n_iter, burn = 5)) + +df$feature_id <- rownames(df) +df <- subset(df, select = c("feature_id", "PPI")) +colnames(df) <- c("feature_id", "pred_spatial_var_score") + +# save output +cat("Write output AnnData to file\n") +output <- anndata::AnnData( + shape = adata$shape, + var = df, + uns = list( + "dataset_id" = adata$uns[["dataset_id"]], + "method_id" = meta[["functionality_name"]] + ) +) + +zzz <- output$write_h5ad(paste0(dest, "/", par$output), compression = "gzip") diff --git a/src/tasks/spatially_variable_genes/methods/gpcounts/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/gpcounts/config.vsh.yaml new file mode 100644 index 0000000000..7104dc7a62 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/gpcounts/config.vsh.yaml @@ -0,0 +1,56 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: gpcounts + info: + label: GPcounts + summary: "GPcounts is non-parametric modelling of temporal and spatial counts data from RNA-seq experiments." + description: | + The GPcounts package implements GP regression methods for modelling counts data using a + negative binomial likelihood function. Computational efficiency is achieved through the use of + variational Bayesian inference. The GP function models changes in the mean of the negative binomial + likelihood through a logarithmic link function and the dispersion parameter is fitted by maximum + likelihood. We validate the method on simulated time course data, showing better performance to identify + changes in over-dispersed counts data than methods based on Gaussian or Poisson likelihoods. 
+ preferred_normalization: counts + reference: bintayyash2021non + documentation_url: https://github.com/ManchesterBioinference/GPcounts/blob/master/demo_notebooks/GPcounts_spatial.ipynb + repository_url: https://github.com/ManchesterBioinference/GPcounts + + arguments: + - name: --n_features + type: integer + description: Number of features to include. + info: + test_default: 120 + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + # image: python:3.9.16 + image: ghcr.io/openproblems-bio/base_images/tensorflow_nvidia:1.1.0 + setup: + - type: apt + packages: + - git + - type: python + packages: + - tensorflow-probability + - tensorflow[and-cuda] + - gpflow + - scipy==1.9.1 + - type: docker + run : | + git clone https://github.com/markvdw/RobustGP.git /opt/RobustGP && \ + git clone https://github.com/lzj1769/GPcounts.git /opt/GPcounts + - type: python + packages: + - /opt/RobustGP + - /opt/GPcounts + - type: native + - type: nextflow + directives: + label: [veryhightime, midmem, midcpu, gpu] diff --git a/src/tasks/spatially_variable_genes/methods/gpcounts/script.py b/src/tasks/spatially_variable_genes/methods/gpcounts/script.py new file mode 100644 index 0000000000..9bcf0497be --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/gpcounts/script.py @@ -0,0 +1,92 @@ +import statsmodels.api as sm +import statsmodels.formula.api as smf +import pandas as pd +import anndata as ad +import scipy +from GPcounts.RNA_seq_GP import rna_seq_gp +import warnings +warnings.filterwarnings('ignore') + + +# VIASH START +par = { + 'input_data': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'output': 'output.h5ad', + 'n_features': 120 +} +meta = { + 'functionality_name': 'GPcounts' +} +# VIASH END + +print('Load data', flush=True) +adata = ad.read_h5ad(par['input_data']) + +print('Run GPcounts') + +# Subset if required +if par['n_features']: + adata = adata[:, :par['n_features']] + +counts = 
adata.layers["counts"] +if scipy.sparse.issparse(counts): + counts = counts.todense() + +# spatialx = [str(i) for i in adata.obsm['spatial'][:, 0]] +# spatialy = [str(i) for i in adata.obsm['spatial'][:, 1]] + +# index_names = [i+'x'+j for i, j in zip(spatialx, spatialy)] +# Y = pd.DataFrame(data=counts, index=index_names, columns=adata.var.index) + +# spatial_locations = pd.DataFrame(index=Y.index) +# spatial_locations['x'] = Y.index.str.split('x').str.get(0).map(float) +# spatial_locations['y'] = Y.index.str.split('x').str.get(1).map(float) + +# spatial_locations['total_counts'] = Y.sum(1) + +Y = pd.DataFrame(data=counts, + index=adata.obs_names, + columns=adata.var_names) + +spatial_locations = pd.DataFrame(data=adata.obsm['spatial'], + index=adata.obs_names, + columns=['x', 'y']) +spatial_locations['total_counts'] = Y.sum(1) + +Y = Y.loc[spatial_locations.index] +X = spatial_locations[['x', 'y']] + +scales = [] +for i in range(0, len(Y.columns)): + model = smf.glm(formula="Y.iloc[:,i]~0+spatial_locations['total_counts']", data=Y, + family=sm.families.NegativeBinomial(sm.families.links.identity())).fit() + res = model.params[0]*spatial_locations['total_counts'] + scales.append(res) +scalesdf = pd.DataFrame(scales) +scalesdf = scalesdf.T + +Y = Y.T +X = X[['x', 'y']] + +sparse = True +nb_scaled = True # set the nb_scaled argument to True to pass the scale factors +gene_name = Y.index +likelihood = 'Negative_binomial' +gp_counts = rna_seq_gp( + X, Y.loc[gene_name], sparse=sparse, M=250, scale=scalesdf, safe_mode=False) + +log_likelihood_ratio = gp_counts.One_sample_test(likelihood) + +df = gp_counts.calculate_FDR(log_likelihood_ratio) + +# save results +df = df.loc[adata.var_names][['log_likelihood_ratio']] +df = df.reset_index() +df.columns = ['feature_id', 'pred_spatial_var_score'] + +output = ad.AnnData(var=df, + uns={'dataset_id': adata.uns['dataset_id'], + 'method_id': meta['functionality_name']}) + +print("Write output to file", flush=True) 
+output.write_h5ad(par['output']) diff --git a/src/tasks/spatially_variable_genes/methods/moran_i/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/moran_i/config.vsh.yaml new file mode 100644 index 0000000000..594f51f423 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/moran_i/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: moran_i + info: + label: Moran's I + summary: "Moran's I is a measurement of spatial autocorrelation." + description: | + The Moran's I global spatial auto-correlation statistic evaluates whether features (i.e. genes) + show a pattern that is clustered, dispersed or random in the tissue area under consideration. + preferred_normalization: counts + reference: palla2022squidpy + documentation_url: https://squidpy.readthedocs.io/en/stable/api/squidpy.gr.spatial_autocorr.html + repository_url: https://github.com/scverse/squidpy + + # Component-specific parameters (optional) + arguments: + - name: "--coord_type_moran_i" + type: string + default: "generic" + description: Type of coordinate system. Valid options are "grid" for grid coordinates or "generic" for generic coordinates.
+ choices: [grid, generic] + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - pandas + - squidpy==1.4.1 + - matplotlib==3.8.3 + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/methods/moran_i/script.py b/src/tasks/spatially_variable_genes/methods/moran_i/script.py new file mode 100644 index 0000000000..c158348dd5 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/moran_i/script.py @@ -0,0 +1,44 @@ +import warnings +warnings.filterwarnings('ignore') + +import anndata as ad +import squidpy as sq + +# VIASH START +par = { + 'input_data': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'output': 'output.h5ad', + 'coord_type_moran_i': 'generic' + +} +meta = { + 'functionality_name': 'moranI' +} +# VIASH END + +print('Load data', flush=True) +adata = ad.read_h5ad(par['input_data']) + +print('Run moranI', flush=True) +sq.gr.spatial_neighbors(adata, + coord_type=par['coord_type_moran_i'], + delaunay=True) + +sq.gr.spatial_autocorr(adata, + mode="moran", + layer='normalized', + n_perms=100, + genes=adata.var_names) + +# save results +df = adata.uns["moranI"] +df = df.loc[adata.var_names][['I']] +df = df.reset_index() +df.columns = ['feature_id', 'pred_spatial_var_score'] + +output = ad.AnnData(var=df, + uns={'dataset_id': adata.uns['dataset_id'], + 'method_id': meta['functionality_name']}) + +print("Write output to file", flush=True) +output.write_h5ad(par['output']) diff --git a/src/tasks/spatially_variable_genes/methods/nnsvg/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/nnsvg/config.vsh.yaml new file mode 100644 index 0000000000..ec8db15e15 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/nnsvg/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: ../../api/comp_method.yaml + +functionality: 
+ name: nnsvg + info: + label: nnSVG + summary: "nnSVG is based on nearest-neighbor Gaussian process (NNGP) models to estimate parameters in GPs" + description: | + nnSVG identifies genes that vary in expression continuously across the entire tissue or within a priori defined + spatial domains. It uses gene-specific estimates of length scale parameters within the Gaussian process models, + and scales linearly with the number of spatial locations. + preferred_normalization: counts + reference: weber2023nnsvg + documentation_url: https://bioconductor.org/packages/release/bioc/vignettes/nnSVG/inst/doc/nnSVG.html + repository_url: https://github.com/lmweber/nnSVG + + resources: + - type: r_script + path: script.R + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_r:1.0.4 + setup: + - type: r + cran: [anndata, dplyr] + bioc: [SpatialExperiment, scran, nnSVG] + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/methods/nnsvg/script.R b/src/tasks/spatially_variable_genes/methods/nnsvg/script.R new file mode 100644 index 0000000000..44a95571d6 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/nnsvg/script.R @@ -0,0 +1,71 @@ +suppressMessages(library(SpatialExperiment)) +suppressMessages(library(scran)) +suppressMessages(library(nnSVG)) +suppressMessages(library(anndata)) +suppressMessages(library(dplyr)) + +# VIASH START +par = list( + 'input_data' = 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'output' = 'output.h5ad' +) +meta = list( + 'functionality_name' = 'nnSVG', + 'cpus' = 4 +) + +# VIASH END + +# load data +cat('Load data') +adata <- read_h5ad(par$input_data) +counts <- t(as.matrix(adata$layers[['counts']])) + +colnames(counts) <- adata$obs_names +rownames(counts) <- adata$var_names + +loc <- as.data.frame(adata$obsm[['spatial']]) + +row_data = adata$var +row_data$gene_id = rownames(row_data) 
+row_data$feature_type = "Gene Expression" + +colnames(loc) <- c("x", "y") +rownames(loc) <- colnames(counts) + +spe <- SpatialExperiment( + assays = list(counts = counts), + rowData = row_data, + colData = loc, + spatialCoordsNames = c("x", "y")) + +# calculate logcounts (log-transformed normalized counts) using scran package +# using library size factors +spe <- computeLibraryFactors(spe) +spe <- logNormCounts(spe) + +# run nnSVG +if (!is.null(meta$cpus)) { + n_cpus <- meta$cpus +} else { + n_cpus <- 1 +} + +cat('Run nnSVG') +spe <- nnSVG(spe, n_threads=n_cpus) + +df <- as.data.frame(rowData(spe)) %>% + subset(select = c('feature_id', 'LR_stat')) + +colnames(df) <- c('feature_id', 'pred_spatial_var_score') +rownames(df) <- NULL + +# save output +cat("Write output AnnData to file\n") +output = anndata::AnnData( + shape = adata$shape, + var=df, + uns=list('dataset_id' = adata$uns[['dataset_id']], + 'method_id' = meta[['functionality_name']])) + +anndata::write_h5ad(anndata = output, filename = par$output) diff --git a/src/tasks/spatially_variable_genes/methods/scgco/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/scgco/config.vsh.yaml new file mode 100644 index 0000000000..6980284a42 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/scgco/config.vsh.yaml @@ -0,0 +1,69 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: scgco + info: + label: scGCO + summary: "Identification of spatially variable genes with graph cuts." + description: | + Single-cell gene expression data with positional information is critical to dissect + mechanisms and architectures of multicellular organisms, but the potential is limited + by the scalability of current data analysis strategies. Here, we present scGCO, + a method based on fast optimization of hidden Markov Random Fields with graph cuts + to identify spatially variable genes. 
Comparing to existing methods, scGCO delivers + a superior performance with lower false positive rate and improved specificity, + while demonstrates a more robust performance in the presence of noises. + Critically, scGCO scales near linearly with inputs and demonstrates orders of + magnitude better running time and memory requirement than existing methods, + and could represent a valuable solution when spatial transcriptomics data grows + into millions of data points and beyond. + preferred_normalization: counts + reference: zhang2022identification + documentation_url: https://github.com/WangPeng-Lab/scGCO/blob/master/code/Tutorial/scGCO_tutorial.ipynb + repository_url: https://github.com/WangPeng-Lab/scGCO + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: python:3.9.16 + setup: + - type: apt + packages: + - git + - procps + - libhdf5-dev + - cmake + - gdal-bin + - libgdal-dev + - type: docker + run : | + pip install Cython==0.29.33 numpy==1.23.5 scipy==1.9.1 + - type: docker + run : | + git clone https://github.com/lzj1769/scGCO_simple.git /opt/scGCO/scGCO_simple + - type: python + packages: + - h5py==3.8.0 + - pandas==1.5.3 + - parmap==1.6.0 + - scanpy==1.9.3 + - tqdm==4.65.0 + - anndata==0.8.0 + - matplotlib==3.7.1 + - scikit-learn==1.2.2 + - hdbscan + - seaborn==0.12.2 + - pysal==2.0.0 + - pygco==0.0.16 + - shapely==2.0.1 + - networkx==2.5 + - scikit-image + - pyyaml + - requests + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/methods/scgco/script.py b/src/tasks/spatially_variable_genes/methods/scgco/script.py new file mode 100644 index 0000000000..062a0dede3 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/scgco/script.py @@ -0,0 +1,63 @@ +import warnings +warnings.filterwarnings('ignore') + +import pandas as pd +import anndata as ad +import numpy as np +import scipy +import sys +sys.path.append("/opt/scGCO") +
+from scGCO_simple import * + +# VIASH START +par = { + 'input_data': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'scGCO' +} +# VIASH END + +print('Load data', flush=True) +adata = ad.read_h5ad(par['input_data']) + +counts = adata.layers["counts"] +if scipy.sparse.issparse(counts): + counts = counts.todense() + +data = pd.DataFrame( + counts, + columns=adata.var_names, + index=adata.obs_names +) + +print('Run scGCO', flush=True) +data_norm = normalize_count_cellranger(data) + +exp = data.iloc[:, 0] +locs = adata.obsm['spatial'].copy() + +print('Create graph with weight', flush=True) +cellGraph = create_graph_with_weight(locs, exp) +gmmDict = gmm_model(data_norm) + +print('Identify spatial genes', flush=True) +df = identify_spatial_genes(locs, data_norm, cellGraph, gmmDict) + +# save results +df = df.loc[adata.var_names][['fdr']] +df = df.reset_index() +df.columns = ['feature_id', 'pred_spatial_var_score'] + +# Transform the values via -log10 to make sure a bigger score represents a +# higher spatial variation +df['pred_spatial_var_score'] = -np.log10(df['pred_spatial_var_score'].tolist()) + +output = ad.AnnData(var=df, + uns={'dataset_id': adata.uns['dataset_id'], + 'method_id': meta['functionality_name']}) + +print("Write output to file", flush=True) +output.write_h5ad(par['output']) diff --git a/src/tasks/spatially_variable_genes/methods/sepal/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/sepal/config.vsh.yaml new file mode 100644 index 0000000000..d64abab561 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/sepal/config.vsh.yaml @@ -0,0 +1,46 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: sepal + info: + label: Sepal + summary: "Sepal simulates diffusion of individual transcripts to extract genes with spatial patterns." 
+ description: | + This method assesses the degree of randomness exhibited by each transcript profile and ranks them accordingly. + preferred_normalization: counts + reference: andersson2021sepal + documentation_url: https://squidpy.readthedocs.io/en/stable/api/squidpy.gr.sepal.html + repository_url: https://github.com/scverse/squidpy + + + # Component-specific parameters (optional) + arguments: + - name: "--max_neighs_sepal" + type: integer + default: 6 + description: Maximum number of neighbors of a node in the spatial graph. Valid options are 4 (square-grid) and 6 (hexagonal-grid). + choices: [4, 6] + + - name: "--coord_type_sepal" + type: string + default: "grid" + description: Type of coordinate system. Valid options are "grid" for grid coordinates or "generic" for generic coordinates. + choices: [grid, generic] + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - pandas + - squidpy==1.4.1 + - matplotlib==3.8.3 + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/methods/sepal/script.py b/src/tasks/spatially_variable_genes/methods/sepal/script.py new file mode 100644 index 0000000000..b2672adaed --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/sepal/script.py @@ -0,0 +1,40 @@ +import anndata as ad +import squidpy as sq + +# VIASH START +par = { + 'input_data': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'output': 'output.h5ad', + 'coord_type_sepal': 'grid', + 'max_neighs_sepal': 6, +} +meta = { + 'functionality_name': 'Sepal' +} +# VIASH END + +print('Generate predictions', flush=True) +adata = ad.read_h5ad(par['input_data']) + +sq.gr.spatial_neighbors(adata, + coord_type=par['coord_type_sepal'], + delaunay=False) + +sq.gr.sepal(adata, + layer='normalized', + max_neighs=par['max_neighs_sepal'], +
genes=adata.var_names, + n_jobs=1) + +# save results +df = adata.uns["sepal_score"] +df = df.loc[adata.var_names][['sepal_score']] +df = df.reset_index() +df.columns = ['feature_id', 'pred_spatial_var_score'] + +output = ad.AnnData(var=df, + uns={'dataset_id': adata.uns['dataset_id'], + 'method_id': meta['functionality_name']}) + +print("Write output to file", flush=True) +output.write_h5ad(par['output']) diff --git a/src/tasks/spatially_variable_genes/methods/somde/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/somde/config.vsh.yaml new file mode 100644 index 0000000000..546059110e --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/somde/config.vsh.yaml @@ -0,0 +1,37 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: somde + info: + label: SOMDE + summary: "SOMDE is a scalable method for identifying spatially variable genes with self-organizing map." + description: | + SOMDE uses self-organizing map to cluster neighboring cells into nodes, and then uses a Gaussian process + to fit the node-level spatial gene expression to identify SVgenes. Experiments show that SOMDE is about + 5 to 50 times faster than existing methods with comparable results. + The adjustable resolution of SOMDE makes it the only method that can give results in about + 5 min in large datasets of more than 20 000 sequencing sites. 
+ preferred_normalization: counts + reference: hao2021somde + documentation_url: https://github.com/WhirlFirst/somde/blob/master/slide_seq0819_11_SOM.ipynb + repository_url: https://github.com/XuegongLab/somde + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - somde + - scanpy==1.9.8 + - pandas==2.2.1 + - numpy==1.26.4 + - scipy==1.11.4 + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/methods/somde/script.py b/src/tasks/spatially_variable_genes/methods/somde/script.py new file mode 100644 index 0000000000..4dc3b84c95 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/somde/script.py @@ -0,0 +1,53 @@ +import anndata as ad +import pandas as pd +import numpy as np +import scanpy as sc +from somde import SomNode +import scipy + +# VIASH START +par = { + 'input_data': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'SOMDE' +} +# VIASH END + +print('Load data', flush=True) +adata = ad.read_h5ad(par['input_data']) + +print('Run SOMDE', flush=True) +counts = adata.layers["counts"] +if scipy.sparse.issparse(counts): + counts = counts.todense() + +data = pd.DataFrame( + counts, + columns=adata.var_names, + index=adata.obs_names +) + +X = pd.DataFrame(adata.obsm["spatial"], + index=adata.obs_names, + columns=["x", "y"]).values.astype(np.float32) + +som = SomNode(X, k=10) +ndf, ninfo = som.mtx(data.transpose()) +nres = som.norm() + +df, SVnum = som.run() + +# save results +df.set_index("g", inplace=True) +df = df.loc[adata.var_names][['FSV']] +df = df.reset_index() +df.columns = ['feature_id', 'pred_spatial_var_score'] + +output = ad.AnnData(var=df, + uns={'dataset_id': adata.uns['dataset_id'], + 'method_id': meta['functionality_name']}) + +print("Write 
output to file", flush=True) +output.write_h5ad(par['output']) diff --git a/src/tasks/spatially_variable_genes/methods/spagcn/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/spagcn/config.vsh.yaml new file mode 100644 index 0000000000..78db85a819 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spagcn/config.vsh.yaml @@ -0,0 +1,47 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: spagcn + info: + label: SpaGCN + summary: "Integrating gene expression, spatial location and histology to identify spatial + domains and spatially variable genes by graph convolutional network." + description: | + To elucidate spatial gene expression variation, we present SpaGCN, a graph convolutional + network approach that integrates gene expression, spatial location and histology in SRT data analysis. + Through graph convolution, SpaGCN aggregates gene expression of each spot from its neighboring spots, + which enables the identification of spatial domains with coherent expression and histology. + The subsequent domain guided differential expression (DE) analysis then detects genes with + enriched expression patterns in the identified domains. Analyzing seven SRT datasets using + SpaGCN, we show it can detect genes with much more enriched spatial expression patterns than competing methods. Furthermore, genes detected by SpaGCN are transferrable and can be utilized to study spatial variation of gene expression in other datasets. SpaGCN is computationally + fast, platform independent, making it a desirable tool for diverse SRT studies. 
+ preferred_normalization: counts + reference: hu2021spagcn + documentation_url: https://github.com/jianhuupenn/SpaGCN/blob/master/tutorial/tutorial.ipynb + repository_url: https://github.com/jianhuupenn/SpaGCN + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: apt + packages: + - git + - procps + - libhdf5-dev + - cmake + - type: docker + run : | + git clone https://github.com/jianhuupenn/SpaGCN.git /opt/SpaGCN + - type: python + packages: + - numpy<2.0 + - /opt/SpaGCN/SpaGCN_package + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/methods/spagcn/script.py b/src/tasks/spatially_variable_genes/methods/spagcn/script.py new file mode 100644 index 0000000000..e60e08db61 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spagcn/script.py @@ -0,0 +1,132 @@ +import anndata as ad +import SpaGCN as spg +import pandas as pd +import numpy as np +import scanpy as sc +import random +import torch + +# VIASH START +par = { + 'input_data': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'SpaGCN' +} +# VIASH END + +print('Load data', flush=True) +adata = ad.read_h5ad(par['input_data']) + +# run normalization +adata.X = adata.layers['counts'].copy() +sc.pp.normalize_total(adata=adata) +sc.pp.log1p(adata) + +print('Run SpaGCN', flush=True) +random_seed = 100 + +# Set seed +random.seed(random_seed) +torch.manual_seed(random_seed) +np.random.seed(random_seed) + +p = 0.5 +min_in_group_fraction = 0 +min_in_out_group_ratio = 0 +min_fold_change = 0 + + +adj = spg.calculate_adj_matrix( + x=adata.obsm["spatial"][:, 0], + y=adata.obsm["spatial"][:, 1], + histology=False +) +l = spg.search_l(p, adj, start=0.01, end=1000, tol=0.01, max_run=100) + +clf = spg.SpaGCN() +clf.set_l(l) + +# Run 
+clf.train( + adata, + adj, + init_spa=True, + init="louvain", + res=0.5, + tol=5e-3, + lr=0.05, + max_epochs=200, +) + +y_pred, prob = clf.predict() +adata.obs["pred"] = y_pred +de_genes_all = list() +n_clusters = len(adata.obs["pred"].unique()) + +# identify DE genes +for target in range(n_clusters): + print(f"target: {target}") + start, end = np.quantile(adj[adj != 0], q=0.001), np.quantile( + adj[adj != 0], q=0.1 + ) + r = spg.search_radius( + target_cluster=target, + cell_id=adata.obs.index.tolist(), + x=adata.obsm["spatial"][:, 0], + y=adata.obsm["spatial"][:, 1], + pred=adata.obs["pred"].tolist(), + start=start, + end=end, + num_min=10, + num_max=14, + max_run=100, + ) + + try: + nbr_domians = spg.find_neighbor_clusters( + target_cluster=target, + cell_id=adata.obs.index.tolist(), + x=adata.obsm["spatial"][:, 0], + y=adata.obsm["spatial"][:, 1], + pred=adata.obs["pred"].tolist(), + radius=r, + ratio=0, + ) + + de_genes_info = spg.rank_genes_groups( + input_adata=adata, + target_cluster=target, + nbr_list=nbr_domians, + label_col="pred", + adj_nbr=True, + log=True, + ) + de_genes_all.append(de_genes_info) + except (RuntimeError, TypeError, NameError): + pass + +if len(de_genes_all) == 0: + df = adata.var + df['pvals_adj'] = np.random.random(adata.n_vars) +else: + df_res = pd.concat(de_genes_all) + df_res = df_res.groupby(["genes"]).min() + df_res = df_res.loc[adata.var_names] + df = pd.concat([df_res, adata.var], axis=1) + +# save results +df = df.loc[adata.var_names][['pvals_adj']] +df = df.reset_index() +df.columns = ['feature_id', 'pred_spatial_var_score'] + +# reverse it to make sure a bigger score represents a higher spatial variation +df['pred_spatial_var_score'] = -np.log10(df['pred_spatial_var_score']) + +output = ad.AnnData(var=df, + uns={'dataset_id': adata.uns['dataset_id'], + 'method_id': meta['functionality_name']}) + +print("Write output to file", flush=True) +output.write_h5ad(par['output']) diff --git 
a/src/tasks/spatially_variable_genes/methods/spagft/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/spagft/config.vsh.yaml new file mode 100644 index 0000000000..48418da815 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spagft/config.vsh.yaml @@ -0,0 +1,59 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: spagft + info: + label: SpaGFT + summary: "SpaGFT is a graph Fourier transform for tissue module identification from spatially resolved transcriptomics" + description: | + The tissue module (TM) was defined as an architectural area containing recurrent cellular + communities executing specific biological functions at different tissue sites. + However, the computational identification of TMs poses challenges owing to their various + length scales, convoluted biological processes, not well-defined molecular features, and + irregular spatial patterns. Here, we present a hypothesis-free graph Fourier transform model, + SpaGFT, to characterize TMs. For the first time, SpaGFT transforms complex gene expression + patterns into simple, but informative signals, leading to the accurate identification of + spatially variable genes (SVGs) at a fast computational speed. Based on clustering the + transformed signals of the SVGs, SpaGFT provides a novel computational framework for TM + characterization. Three case studies were used to illustrate TM identities, the biological + processes of convoluted TMs in the lymph node, and conserved TMs across multiple samples constituting + the complex organ. The superior accuracy, scalability, and interpretability of SpaGFT indicate + that it is a novel and powerful tool for the investigation of TMs to gain new insights into a variety + of biological questions. 
+ preferred_normalization: counts + reference: chang2022spatial + documentation_url: https://spagft.readthedocs.io/en/latest/ + repository_url: https://github.com/jxLiu-bio/SpaGFT + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: python:3.10 + setup: + - type: apt + packages: + - git + - procps + - libhdf5-dev + - cmake + - type: docker + run : | + git clone https://github.com/jxLiu-bio/SpaGFT.git /opt/SpaGFT + - type: python + packages: + - h5py + - numba==0.55.1 + - louvain==0.7.1 + - chardet==5.1.0 + - charset-normalizer==3.1.0 + - anndata + - /opt/SpaGFT + - mizani==0.9.3 + - pyyaml + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/methods/spagft/script.py b/src/tasks/spatially_variable_genes/methods/spagft/script.py new file mode 100644 index 0000000000..9968e5aad0 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spagft/script.py @@ -0,0 +1,44 @@ +import anndata as ad +import SpaGFT as spg + +# VIASH START +par = { + 'input_data': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'SpaGFT' +} +# VIASH END + +print('Load data', flush=True) +adata = ad.read_h5ad(par['input_data']) + +print('Run SpaGFT', flush=True) + +adata.X = adata.layers['normalized'].copy() + +adata.obs.loc[:, ['array_row', 'array_col']] = adata.obsm['spatial'] + +(ratio_low, ratio_high) = spg.gft.determine_frequency_ratio(adata, ratio_neighbors=1) + +df = spg.detect_svg(adata, + spatial_info=['array_row', 'array_col'], + ratio_low_freq=ratio_low, + ratio_high_freq=ratio_high, + ratio_neighbors=1, + filter_peaks=True, + S=6) + + +# save results +df = df.loc[adata.var_names][['gft_score']] +df = df.reset_index() +df.columns = ['feature_id', 'pred_spatial_var_score'] + +output = ad.AnnData(var=df, + uns={'dataset_id': adata.uns['dataset_id'], + 'method_id': 
meta['functionality_name']}) + +print("Write output to file", flush=True) +output.write_h5ad(par['output']) diff --git a/src/tasks/spatially_variable_genes/methods/spanve/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/spanve/config.vsh.yaml new file mode 100644 index 0000000000..ec95755912 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spanve/config.vsh.yaml @@ -0,0 +1,45 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: spanve + info: + label: Spanve + summary: "Spanve is a non-parametric statistical approach based on modeling space dependence as a distance of two distributions for detecting SV genes." + description: | + The depiction of in situ gene expression through spatial transcriptomics facilitates the inference of cell + function mechanisms. To build spatial maps of transcriptomes, the first and crucial step is to + identify spatially variable (SV) genes. However, current methods fall short in dealing with + large-scale spatial transcriptomics data and may result in a high false positive rate due to the + modeling of gene expression into parametric distributions. + This paper introduces Spanve (https://github.com/zjupgx/Spanve), a non-parametric statistical approach + based on modeling space dependence as a distance of two distributions for detecting SV genes. + The high computing efficiency and accuracy of Spanve is demonstrated through comprehensive benchmarking. + Additionally, Spanve can detect clustering-friendly SV genes and spatially variable co-expression, + facilitating the identification of spatial tissue domains by an imputation. 
+ preferred_normalization: counts + reference: cai2023spanve + documentation_url: https://github.com/zjupgx/Spanve/blob/main/tutorial.ipynb + repository_url: https://github.com/zjupgx/Spanve + + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: apt + packages: git + - type: docker + run : | + git clone https://github.com/gx-Cai/Spanve.git /opt/Spanve + - type: python + packages: + - /opt/Spanve + - numpy==1.26.4 + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/methods/spanve/script.py b/src/tasks/spatially_variable_genes/methods/spanve/script.py new file mode 100644 index 0000000000..ea2c7a98e3 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spanve/script.py @@ -0,0 +1,33 @@ +import anndata as ad +from Spanve import Spanve + +# VIASH START +par = { + 'input_data': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'Spanve' +} +# VIASH END + +print('Load data', flush=True) +adata = ad.read_h5ad(par['input_data']) + +print('Run Spanve', flush=True) +adata.X = adata.layers['counts'] +spanve = Spanve(adata) +spanve.fit(verbose=False) + +# save results +df = spanve.result_df +df = df.loc[adata.var_names][['ent']] +df = df.reset_index() +df.columns = ['feature_id', 'pred_spatial_var_score'] + +output = ad.AnnData(var=df, + uns={'dataset_id': adata.uns['dataset_id'], + 'method_id': meta['functionality_name']}) + +print("Write output to file", flush=True) +output.write_h5ad(par['output']) diff --git a/src/tasks/spatially_variable_genes/methods/spark/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/spark/config.vsh.yaml new file mode 100644 index 0000000000..4ed2e5fd0a --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spark/config.vsh.yaml @@ -0,0 
+1,30 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: spark + info: + label: SPARK + summary: "Spatial PAttern Recognition via Kernels" + description: | + SPARK builds upon a generalized linear spatial model (GLSM) with a variety of spatial kernels to accommodate count data. + With a newly developed penalized quasi-likelihood (PQL) algorithm, SPARK is scalable to analyzing tens of + thousands of genes across tens of thousands spatial locations. + preferred_normalization: counts + reference: sun2020statistical + documentation_url: https://xzhoulab.github.io/SPARK/02_SPARK_Example/ + repository_url: https://github.com/xzhoulab/SPARK + + resources: + - type: r_script + path: script.R + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_r:1.0.4 + setup: + - type: r + github: xzhoulab/SPARK + - type: native + - type: nextflow + directives: + label: [veryhightime, highmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/methods/spark/script.R b/src/tasks/spatially_variable_genes/methods/spark/script.R new file mode 100644 index 0000000000..2de1f38bbb --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spark/script.R @@ -0,0 +1,75 @@ +suppressMessages(library(SPARK)) +suppressMessages(library(anndata)) + +# VIASH START +par <- list( + "input_data" = "resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad", + "output" = "output.h5ad" +) +meta <- list( + "functionality_name" = "SPARK", + "cpus" = 4 +) + +# VIASH END + +# load data +cat("Load data\n") +adata <- anndata::read_h5ad(par$input_data) +counts <- t(as.matrix(adata$layers[["counts"]])) +colnames(counts) <- adata$obs_names +rownames(counts) <- adata$var_names +info <- as.data.frame(adata$obsm[["spatial"]]) +rownames(info) <- colnames(counts) +colnames(info) <- c("x", "y") + +# run SPARK +cat("Run SPARK\n") +if (!is.null(meta$cpus)) { + n_cpus <- meta$cpus +} else { + n_cpus <- 1 +} + +spark <- CreateSPARKObject( + counts = counts, 
percentage = 0, + min_total_counts = 0, location = info[, 1:2] +) + +spark@lib_size <- apply(spark@counts, 2, sum) +spark <- spark.vc(spark, + covariates = NULL, + lib_size = spark@lib_size, + num_core = n_cpus, + verbose = FALSE +) + +## Calculating pval +spark <- spark.test(spark, + check_positive = T, + verbose = F +) + +df <- as.data.frame(spark@res_mtest) + +df$feature_id <- rownames(df) + +df <- subset(df, select = c("feature_id", "adjusted_pvalue")) +colnames(df) <- c("feature_id", "pred_spatial_var_score") + +# because SPARK only generates p-values, we here transform the values +# via -log10 to make sure a bigger score represents a higher spatial variation +df$pred_spatial_var_score <- -log10(df$pred_spatial_var_score) + +# save output +cat("Write output AnnData to file\n") +output <- anndata::AnnData( + shape = adata$shape, + var = df, + uns = list( + "dataset_id" = adata$uns[["dataset_id"]], + "method_id" = meta[["functionality_name"]] + ) +) + +anndata::write_h5ad(anndata = output, filename = par$output) diff --git a/src/tasks/spatially_variable_genes/methods/spark_x/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/spark_x/config.vsh.yaml new file mode 100644 index 0000000000..cc3a62eeef --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spark_x/config.vsh.yaml @@ -0,0 +1,35 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: spark_x + info: + label: SPARK-X + summary: "SPARK-X is a non-parametric method for rapid and effective detection of spatially expressed genes in large spatial transcriptomic studies." + description: | + Spatial transcriptomic studies are becoming increasingly common and large, posing important + statistical and computational challenges for many analytic tasks. Here, we present SPARK-X, + a non-parametric method for rapid and effective detection of spatially expressed genes in large + spatial transcriptomic studies. 
SPARK-X not only produces effective type I error control and + high power but also brings orders of magnitude computational savings. We apply SPARK-X to + analyze three large datasets, one of which is only analyzable by SPARK-X. In these data, + SPARK-X identifies many spatially expressed genes including those that are spatially + expressed within the same cell type, revealing new biological insights. + preferred_normalization: counts + reference: zhu2021spark + documentation_url: https://xzhoulab.github.io/SPARK/02_SPARK_Example/ + repository_url: https://github.com/xzhoulab/SPARK + + resources: + - type: r_script + path: script.R + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_r:1.0.4 + setup: + - type: r + github: xzhoulab/SPARK + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/methods/spark_x/script.R b/src/tasks/spatially_variable_genes/methods/spark_x/script.R new file mode 100644 index 0000000000..c5f9d8a96b --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spark_x/script.R @@ -0,0 +1,57 @@ +suppressMessages(library(SPARK)) +suppressMessages(library(anndata)) + +# VIASH START +par <- list( + "input_data" = "resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad", + "output" = "output.h5ad" +) +meta <- list( + "functionality_name" = "SPARK-X", + "cpus" = 4L +) + +# VIASH END + +# load data +cat("Load data\n") +adata <- anndata::read_h5ad(par$input_data) +counts <- t(as.matrix(adata$layers[["counts"]])) +colnames(counts) <- adata$obs_names +rownames(counts) <- adata$var_names +info <- as.data.frame(adata$obsm[["spatial"]]) +rownames(info) <- colnames(counts) +colnames(info) <- c("x", "y") + +# run SPARK-X +cat("Run SPARK-X\n") +if (!is.null(meta$cpus)) { + n_cpus <- meta$cpus +} else { + n_cpus <- 1 +} + +sparkX <- sparkx(counts, info[, 1:2], numCores = n_cpus, option = "mixture") + +df <-
as.data.frame(sparkX$res_mtest) +df$feature_id <- rownames(df) +df <- subset(df, select = c("feature_id", "adjustedPval")) +colnames(df) <- c("feature_id", "pred_spatial_var_score") +rownames(df) <- NULL + +# because SPARK-X only generates p-values, we here transform the values +# via -log10 to make sure a bigger score represents a higher spatial variation +df$pred_spatial_var_score <- -log10(df$pred_spatial_var_score) + +# save output +cat("Write output AnnData to file\n") +output <- anndata::AnnData( + shape = adata$shape, + var = df, + uns = list( + "dataset_id" = adata$uns[["dataset_id"]], + "method_id" = meta[["functionality_name"]] + ) +) + +anndata::write_h5ad(anndata = output, filename = par$output) diff --git a/src/tasks/spatially_variable_genes/methods/spatialde/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/spatialde/config.vsh.yaml new file mode 100644 index 0000000000..eee1ca1be1 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spatialde/config.vsh.yaml @@ -0,0 +1,39 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: spatialde + info: + label: SpatialDE + summary: "SpatialDE is a method for identifying spatially variable genes based on a Gaussian process model" + description: | + SpatialDE decomposes expression variability into spatial and nonspatial components using two random effect terms: a spatial variance term that parametrizes gene expression covariance by pairwise distances of samples, and a noise term that models nonspatial variability.
+ preferred_normalization: counts + reference: svensson2018spatialde + documentation_url: https://github.com/Teichlab/SpatialDE + repository_url: https://github.com/Teichlab/SpatialDE + + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: apt + packages: git + - type: docker + run : | + git clone https://github.com/Teichlab/SpatialDE.git /opt/SpatialDE + - type: python + packages: + - /opt/SpatialDE/Python-module + - scanpy==1.9.8 + - pandas==2.2.1 + - numpy==1.26.4 + - scipy==1.11.4 + - type: native + - type: nextflow + directives: + label: [hightime, highmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/methods/spatialde/script.py b/src/tasks/spatially_variable_genes/methods/spatialde/script.py new file mode 100644 index 0000000000..f5e0a9b21d --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spatialde/script.py @@ -0,0 +1,53 @@ +import warnings +warnings.filterwarnings('ignore') + +import scanpy as sc +import anndata as ad +import NaiveDE +import SpatialDE + +# VIASH START +par = { + 'input_data': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'spatialDE' +} +# VIASH END + +print('Load data', flush=True) +adata = ad.read_h5ad(par['input_data']) + +# run spatialDE +print('Run spatialDE', flush=True) +sc.pp.calculate_qc_metrics(adata, + layer='counts', + inplace=True, + percent_top=[10]) + +counts = sc.get.obs_df(adata, + keys=list(adata.var_names), + use_raw=False, + layer='counts') + +total_counts = sc.get.obs_df(adata, keys=["total_counts"]) +norm_expr = NaiveDE.stabilize(counts.T).T +resid_expr = NaiveDE.regress_out(total_counts, + norm_expr.T, + "np.log(total_counts)").T + +df = SpatialDE.run(adata.obsm["spatial"], resid_expr) + +# save results +df.set_index("g", inplace=True) +df = df.loc[adata.var_names][['FSV']] +df = df.reset_index()
+df.columns = ['feature_id', 'pred_spatial_var_score'] + +output = ad.AnnData(var=df, + uns={'dataset_id': adata.uns['dataset_id'], + 'method_id': meta['functionality_name']}) + +print("Write output to file", flush=True) +output.write_h5ad(par['output']) diff --git a/src/tasks/spatially_variable_genes/methods/spatialde2/config.vsh.yaml b/src/tasks/spatially_variable_genes/methods/spatialde2/config.vsh.yaml new file mode 100644 index 0000000000..0d812b0a1f --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spatialde2/config.vsh.yaml @@ -0,0 +1,53 @@ +__merge__: ../../api/comp_method.yaml + +functionality: + name: spatialde2 + info: + label: SpatialDE2 + summary: "SpatialDE2: Fast and localized variance component analysis of spatial transcriptomics" + description: | + Spatial transcriptomics is now a mature technology, allowing to assay gene expression changes + in the histological context of complex tissues. A canonical analysis workflow starts with the + identification of tissue zones that share similar expression profiles, followed by the detection + of highly variable or spatially variable genes. Rapid increases in the scale and complexity of + spatial transcriptomic datasets demand that these analysis steps are conducted in a consistent + and integrated manner, a requirement that is not met by current methods. To address this, we + here present SpatialDE2, which unifies the mapping of tissue zones and spatial variable gene + detection as integrated software framework, while at the same time advancing current algorithms + for both of these steps. Formulated in a Bayesian framework, the model accounts for the Poisson + count noise, while simultaneously offering superior computational speed compared to previous methods. + We validate SpatialDE2 using simulated data and illustrate its utility in the context of two real-world + applications to the spatial transcriptomics profiles of the mouse brain and human endometrium. 
+ preferred_normalization: counts + reference: kats2021spatialde2 + documentation_url: https://pmbio.github.io/SpatialDE/ + repository_url: https://github.com/PMBio/SpatialDE + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: python:3.7.12 + setup: + - type: apt + packages: + - git + - procps + - libhdf5-dev + - cmake + - type: docker + run : | + git clone https://github.com/PMBio/SpatialDE.git /opt/SpatialDE2 + - type: python + packages: + - scanpy + - anndata + - patsy + - /opt/SpatialDE2 + - pyyaml + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu, gpu] diff --git a/src/tasks/spatially_variable_genes/methods/spatialde2/script.py b/src/tasks/spatially_variable_genes/methods/spatialde2/script.py new file mode 100644 index 0000000000..fe82d40981 --- /dev/null +++ b/src/tasks/spatially_variable_genes/methods/spatialde2/script.py @@ -0,0 +1,51 @@ +import scanpy as sc +import anndata as ad +import SpatialDE as sd +import NaiveDE +import warnings +warnings.filterwarnings("ignore") + + +# VIASH START +par = { + 'input_data': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/dataset.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'functionality_name': 'spatialDE2' +} +# VIASH END + +print('Load data', flush=True) +adata = ad.read_h5ad(par['input_data']) + +# compute QC metrics on the counts layer, then stabilize the counts and regress out log total counts +print('Run spatialDE2', flush=True) +adata.X = adata.layers['counts'].copy() +sc.pp.calculate_qc_metrics(adata, inplace=True, percent_top=[10]) + +counts = sc.get.obs_df(adata, + keys=list(adata.var_names), + use_raw=False, + layer='counts') + +total_counts = sc.get.obs_df(adata, keys=["total_counts"]) +norm_expr = NaiveDE.stabilize(counts.T).T +adata.X = NaiveDE.regress_out( + total_counts, norm_expr.T, "np.log(total_counts)").T + +# fit SpatialDE2 on the residual expression stored in adata.X (hence normalized=True) +df = sd.fit(adata, normalized=True, control=None) +df.set_index("gene", inplace=True) + +# save results +df = df.loc[adata.var_names][['FSV']] +df =
df.reset_index() +df.columns = ['feature_id', 'pred_spatial_var_score'] + +output = ad.AnnData(var=df, + uns={'dataset_id': adata.uns['dataset_id'], + 'method_id': meta['functionality_name']}) + +print("Write output to file", flush=True) +output.write_h5ad(par['output']) diff --git a/src/tasks/spatially_variable_genes/metrics/correlation/config.vsh.yaml b/src/tasks/spatially_variable_genes/metrics/correlation/config.vsh.yaml new file mode 100644 index 0000000000..c13a852c24 --- /dev/null +++ b/src/tasks/spatially_variable_genes/metrics/correlation/config.vsh.yaml @@ -0,0 +1,32 @@ +__merge__: ../../api/comp_metric.yaml + +functionality: + name: correlation + info: + metrics: + - name: correlation + label: correlation + summary: "Correlation represents the agreement of true and predicted spatial variability." + description: | + Kendall rank correlation coefficient measures the ordinal association between two measured quantities. The best score and upper bound is 1 (observations have an identical rank), while the lower bound is -1 (observations have a completely different rank). 
+ reference: kendall1938new + documentation_url: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient + repository_url: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html + min: -1 + max: 1 + maximize: true + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: pandas + - type: native + - type: nextflow + directives: + label: [midtime, midmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/metrics/correlation/script.py b/src/tasks/spatially_variable_genes/metrics/correlation/script.py new file mode 100644 index 0000000000..f61ea17193 --- /dev/null +++ b/src/tasks/spatially_variable_genes/metrics/correlation/script.py @@ -0,0 +1,37 @@ +import anndata as ad +import pandas as pd + +## VIASH START +par = { + 'input_method': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/output.h5ad', + 'input_solution': 'resources_test/spatially_variable_genes/mouse_brain_coronal_section1/solution.h5ad', + 'output': 'score.h5ad' +} +meta = { + 'functionality_name': 'correlation' +} +## VIASH END + +print('Reading input files', flush=True) +input_method = ad.read_h5ad(par['input_method']) +input_solution = ad.read_h5ad(par['input_solution']) + +print('Compute metrics', flush=True) +df = pd.merge(input_method.var, input_solution.var, how='left', on='feature_id') +groupby = df.groupby('orig_feature_name', observed=True) +corr = groupby.apply(lambda x: x['pred_spatial_var_score'].corr(x['true_spatial_var_score'], method='kendall')) + +uns_metric_ids = [ 'correlation' ] +uns_metric_values = [ corr.mean() ] + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + 'dataset_id': input_method.uns['dataset_id'], + 'method_id': input_method.uns['method_id'], + 'metric_ids': uns_metric_ids, + 'metric_values': uns_metric_values + } +) +output.write_h5ad(par['output'], 
compression='gzip') + diff --git a/src/tasks/spatially_variable_genes/process_dataset/select_reference/config.vsh.yaml b/src/tasks/spatially_variable_genes/process_dataset/select_reference/config.vsh.yaml new file mode 100644 index 0000000000..229f039a62 --- /dev/null +++ b/src/tasks/spatially_variable_genes/process_dataset/select_reference/config.vsh.yaml @@ -0,0 +1,51 @@ +functionality: + name: "select_reference" + namespace: "spatially_variable_genes/process_dataset" + description: "Compute SVG" + info: + type: dataset_processor + type_info: + label: select_reference + description: | + Computes spatially variable gene scores and selects a certain number of SVGs as reference. + arguments: + - name: "--input" + __merge__: ../../api/file_common_dataset.yaml + required: true + direction: input + - name: "--input_layer" + type: string + default: "normalized" + description: Which layer to use as input. + - name: "--output" + type: file + direction: output + required: true + __merge__: ../../api/file_common_dataset.yaml + - name: "--coord_type_proc" + type: string + default: "grid" + description: "How to create spatial graph to select reference genes."
+ choices: [grid, generic] + - name: "--num_features" + type: integer + default: 200 + description: "The number of variable genes to select" + resources: + - type: python_script + path: script.py + test_resources: + - path: /resources_test/common/mouse_brain_coronal_section1 + dest: resources_test/common/mouse_brain_coronal_section1 + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + setup: + - type: python + packages: + - squidpy + - type: nextflow + directives: + label: [midtime, highmem, midcpu] diff --git a/src/tasks/spatially_variable_genes/process_dataset/select_reference/script.py b/src/tasks/spatially_variable_genes/process_dataset/select_reference/script.py new file mode 100644 index 0000000000..481735c6fa --- /dev/null +++ b/src/tasks/spatially_variable_genes/process_dataset/select_reference/script.py @@ -0,0 +1,36 @@ +import anndata as ad +import squidpy as sq + +### VIASH START +par = { + "input": "resources_test/common/mouse_brain_coronal_section1/dataset.h5ad", + "input_layer": "normalized", + "output": "reference_dataset.h5ad", + "num_features": 50, + "coord_type_proc": "grid" +} +### VIASH END + +print(">> Load data", flush=True) +adata = ad.read_h5ad(par['input']) + +print(">> Run Moran's I spatial autocorrelation", flush=True) +sq.gr.spatial_neighbors(adata, + coord_type=par['coord_type_proc'], + delaunay=False) +sq.gr.spatial_autocorr(adata, + layer=par['input_layer'], # honour the --input_layer argument + mode="moran", + n_perms=100, n_jobs=10, + genes=adata.var_names) + +n_svgs = par['num_features'] +sel_genes = ( + adata.uns["moranI"]["I"].sort_values(ascending=False).head(n_svgs).index.tolist() +) + +adata = adata[:, sel_genes] + +print(">> Writing data", flush=True) +adata.write_h5ad(par['output']) + diff --git a/src/tasks/spatially_variable_genes/process_dataset/simulate_svg/config.vsh.yaml b/src/tasks/spatially_variable_genes/process_dataset/simulate_svg/config.vsh.yaml new
file mode 100644 index 0000000000..825958d337 --- /dev/null +++ b/src/tasks/spatially_variable_genes/process_dataset/simulate_svg/config.vsh.yaml @@ -0,0 +1,46 @@ +functionality: + name: "simulate_svg" + namespace: "spatially_variable_genes/process_dataset" + info: + type: process_dataset + type_info: + label: Data processor + summary: A spatially variable genes simulator. + description: | + Simulate spatially variable and spatially non-variable genes. + arguments: + - name: "--input" + __merge__: ../../api/file_common_dataset.yaml + direction: input + required: true + - name: "--output" + __merge__: ../../api/file_simulated_dataset.yaml + direction: output + required: true + - type: integer + name: --gp_k + description: Dimension of basis used for the Gaussian process smoother. + default: 500 + info: + test_default: 50 + - type: integer + name: --select_top_variable_genes + description: Number of top variable genes to use for subsetting. + default: 50 + resources: + - type: r_script + path: script.R + test_resources: + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/common/mouse_brain_coronal_section1 + dest: resources_test/common/mouse_brain_coronal_section1 +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_r:1.0.4 + setup: + - type: r + github: SONGDONGYUAN1994/scDesign3 + - type: nextflow + directives: + label: [hightime, highmem, highcpu] diff --git a/src/tasks/spatially_variable_genes/process_dataset/simulate_svg/script.R b/src/tasks/spatially_variable_genes/process_dataset/simulate_svg/script.R new file mode 100644 index 0000000000..43ea0476d8 --- /dev/null +++ b/src/tasks/spatially_variable_genes/process_dataset/simulate_svg/script.R @@ -0,0 +1,196 @@ +requireNamespace("scDesign3", quietly = TRUE) +requireNamespace("anndata", quietly = TRUE) +requireNamespace("Matrix", quietly = TRUE) +requireNamespace("SingleCellExperiment", quietly = TRUE) +library(rlang) + +# set random seed 
+set.seed(2024) + +## VIASH START +par <- list( + input = "resources_test/common/mouse_brain_coronal_section1/dataset.h5ad", + output = "dataset_sim.h5ad", + gp_k = 50L, + select_top_variable_genes = 50L +) +meta <- list( + cpus = 30L +) +## VIASH END + +cat("Read AnnData\n") +adata <- anndata::read_h5ad(par$input) + +cat("Transform into SCE\n") +df_loc <- as.data.frame(adata$obsm[['spatial']]) +colnames(df_loc) <- c("spatial1", "spatial2") +rownames(df_loc) <- adata$obs_names + +ref_sce <- SingleCellExperiment::SingleCellExperiment( + list(counts = Matrix::t(adata$layers[["counts"]])), + colData = df_loc +) + +ref_sce + +# check the number of genes in reference object +n_genes <- dim(ref_sce)[1] + +mu_formula <- paste0( + "s(spatial1, spatial2, bs = 'gp', k = ", par$gp_k, ")" +) + +if (n_genes > par$select_top_variable_genes) { + cat("Select ", par$select_top_variable_genes, " genes among ", n_genes, " reference genes ", "\n", sep = "") + + cat("Transform into scDesign3 data format\n") + ref_data <- scDesign3::construct_data( + sce = ref_sce, + assay_use = "counts", + celltype = NULL, + pseudotime = NULL, + spatial = c("spatial1", "spatial2"), + other_covariates = NULL, + corr_by = "1" + ) + + cat("Fit regression models for each feature\n") + ref_marginal <- scDesign3::fit_marginal( + data = ref_data, + predictor = "gene", + mu_formula = mu_formula, + sigma_formula = "1", + family_use = "nb", + parallelization = "pbmcmapply", + n_cores = 2L, + usebam = FALSE, + trace = TRUE + ) + + cat("Subset to the top variable genes\n") + dev_explain <- sapply(ref_marginal, function(x) { + if (length(x$fit) == 1 && is.na(x$fit)) { + return(NA_real_) + } + summary(x$fit)$dev.expl + }) + top_sel <- head(names(sort(dev_explain, decreasing = TRUE)), par$select_top_variable_genes) # sort() drops NA fits; head() never pads the selection with NA +} else { + top_sel <- adata$var_names +} + +ref_sce <- ref_sce[top_sel, ] +var_subset <- adata$var[top_sel, , drop = FALSE] + +cat("Transform subset matrix into scDesign3 data format\n") +ref_data <-
scDesign3::construct_data( + sce = ref_sce, + assay_use = "counts", + celltype = NULL, + pseudotime = NULL, + spatial = c("spatial1", "spatial2"), + other_covariates = NULL, + corr_by = "1" +) + +cat("Fit expression of each gene with GP model\n") +ref_marginal <- scDesign3::fit_marginal( + data = ref_data, + predictor = "gene", + mu_formula = mu_formula, + sigma_formula = "1", + family_use = "nb", + parallelization = "pbmcmapply", + n_cores = 2L, + usebam = FALSE, + trace = TRUE +) + +cat("Fit a copula, obtain AIC and BIC\n") +ref_copula <- scDesign3::fit_copula( + sce = ref_sce, + assay_use = "counts", + marginal_list = ref_marginal, + family_use = "nb", + copula = "gaussian", + parallelization = "pbmcmapply", + n_cores = 2L, + input_data = ref_data$dat +) + +cat("Extract out the estimated parameters\n") +ref_para <- scDesign3::extract_para( + sce = ref_sce, + marginal_list = ref_marginal, + family_use = "nb", + new_covariate = ref_data$newCovariate, + data = ref_data$dat, + parallelization = "pbmcmapply", + n_cores = 2L +) + +cat("Simulate the new count matrix\n") +# generate non-spatially variable mean values with shuffling +shuffle_idx <- sample(nrow(ref_para$mean_mat)) +non_de_mat <- ref_para$mean_mat[shuffle_idx, ] + +# simulate data with varied spatial variability +outputs <- lapply(seq(0, 1.0, 0.05), function(alpha){ + cat("Simulate data with alpha = ", alpha, "\n", sep = "") + counts <- scDesign3::simu_new( + sce = ref_sce, + mean_mat = alpha * ref_para$mean_mat + (1 - alpha) * non_de_mat, + sigma_mat = ref_para$sigma_mat, + zero_mat = ref_para$zero_mat, + quantile_mat = NULL, + copula_list = ref_copula$copula_list, + n_cores = 5L, + family_use = "nb", + input_data = ref_data$dat, + new_covariate = ref_data$newCovariate, + important_feature = rep(TRUE, nrow(ref_sce)), + filtered_gene = NULL + ) + + if ("feature_id" %in% names(var_subset)) { + new_var <- data.frame( + feature_id = paste0(var_subset$feature_id, "_", alpha), + feature_name = 
paste0(var_subset$feature_name, "_", alpha), + orig_feature_id = var_subset$feature_id, + orig_feature_name = var_subset$feature_name, + true_spatial_var_score = alpha + ) + rownames(counts) <- new_var$feature_id + rownames(new_var) <- new_var$feature_id + } else { + new_var <- data.frame( + feature_id = paste0(var_subset$feature_name, "_", alpha), + feature_name = paste0(var_subset$feature_name, "_", alpha), + orig_feature_name = var_subset$feature_name, + true_spatial_var_score = alpha + ) + rownames(counts) <- new_var$feature_name + rownames(new_var) <- new_var$feature_name + } + + list( + counts = Matrix::t(counts), + var = new_var + ) +}) + +cat("Collecting final output\n", sep = "") +final_counts <- do.call(cbind, lapply(outputs, function(x) x$counts)) +final_var <- do.call(rbind, lapply(outputs, function(x) x$var)) +final_uns <- adata$uns[c("dataset_id", "dataset_name", "dataset_description", "dataset_summary", "dataset_url", "dataset_organism", "dataset_reference")] + +output <- anndata::AnnData( + layers = list(counts = final_counts), + obs = adata$obs, + var = final_var, + obsm = adata$obsm, + uns = final_uns +) + +zzz <- output$write_h5ad(par$output, compression = "gzip") diff --git a/src/tasks/spatially_variable_genes/process_dataset/split_dataset/config.vsh.yaml b/src/tasks/spatially_variable_genes/process_dataset/split_dataset/config.vsh.yaml new file mode 100644 index 0000000000..d99688d759 --- /dev/null +++ b/src/tasks/spatially_variable_genes/process_dataset/split_dataset/config.vsh.yaml @@ -0,0 +1,38 @@ +functionality: + name: "split_dataset" + namespace: "spatially_variable_genes/process_dataset" + info: + type: process_dataset + type_info: + label: Data processor + summary: A spatially variable genes dataset processor. + description: | + Split the common dataset for the spatially_variable_genes task. 
+ arguments: + - name: "--input" + __merge__: ../../api/file_simulated_dataset.yaml + direction: input + required: true + - name: "--output_dataset" + __merge__: ../../api/file_dataset.yaml + direction: output + required: true + - name: "--output_solution" + __merge__: ../../api/file_solution.yaml + direction: output + required: true + resources: + - type: python_script + path: script.py + - path: /src/common/helper_functions/subset_anndata.py + test_resources: + - type: python_script + path: /src/common/comp_tests/run_and_check_adata.py + - path: /resources_test/spatially_variable_genes/mouse_brain_coronal_section1 + dest: resources_test/spatially_variable_genes/mouse_brain_coronal_section1 +platforms: + - type: docker + image: ghcr.io/openproblems-bio/base_python:1.0.4 + - type: nextflow + directives: + label: [midtime, highmem, highcpu] diff --git a/src/tasks/spatially_variable_genes/process_dataset/split_dataset/script.py b/src/tasks/spatially_variable_genes/process_dataset/split_dataset/script.py new file mode 100644 index 0000000000..97bf014fa5 --- /dev/null +++ b/src/tasks/spatially_variable_genes/process_dataset/split_dataset/script.py @@ -0,0 +1,34 @@ +import anndata as ad +import sys + +## VIASH START +par = { + "input": "resources_test/spatially_variable_genes/mouse_brain_coronal_section1/simulated_dataset.h5ad", + "output_dataset": "dataset.h5ad", + "output_solution": "solution.h5ad", +} +meta = { + "functionality_name": "process_dataset", + "resources_dir": "src/tasks/spatially_variable_genes/process_dataset", + "config": "target/nextflow/spatially_variable_genes/process_dataset/split_dataset/.config.vsh.yaml" +} +## VIASH END + +sys.path.append(meta['resources_dir']) +from subset_anndata import read_config_slots_info, subset_anndata + +print(">> Load dataset", flush=True) +adata = ad.read_h5ad(par["input"]) + +print(">> Figuring out which data needs to be copied to which output file", flush=True) +slot_info = read_config_slots_info(meta["config"]) + 
+print(">> Create dataset for methods", flush=True) +output_dataset = subset_anndata(adata, slot_info['output_dataset']) + +print(">> Create solution object for metrics", flush=True) +output_solution = subset_anndata(adata, slot_info['output_solution']) + +print(">> Write to disk", flush=True) +output_dataset.write_h5ad(par["output_dataset"]) +output_solution.write_h5ad(par["output_solution"]) diff --git a/src/tasks/spatially_variable_genes/resources_scripts/process_datasets.sh b/src/tasks/spatially_variable_genes/resources_scripts/process_datasets.sh new file mode 100755 index 0000000000..bdd90f9786 --- /dev/null +++ b/src/tasks/spatially_variable_genes/resources_scripts/process_datasets.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +cat > /tmp/params.yaml << 'HERE' +param_list: + - id: svg_process_datasets_visium + input_states: "s3://openproblems-data/resources/datasets/spatial_10x_visium/**/state.yaml" + settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad", "dataset_simulated_normalized": "$id/simulated_dataset.h5ad", "gp_k_sim": 500, "select_top_variable_genes_sim": 50, "num_reference_genes": 200, "coord_type_proc": "grid"}' + + - id: svg_process_datasets_xenium + input_states: "s3://openproblems-data/resources/datasets/spatial_10x_xenium/**/state.yaml" + settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad", "dataset_simulated_normalized": "$id/simulated_dataset.h5ad", "gp_k_sim": 500, "select_top_variable_genes_sim": 50, "num_reference_genes": 100, "coord_type_proc": "grid"}' + + - id: svg_process_datasets_slidetags + input_states: "s3://openproblems-data/resources/datasets/spatial_slide_tags/**/state.yaml" + settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad", "dataset_simulated_normalized": "$id/simulated_dataset.h5ad", "gp_k_sim": 500, "select_top_variable_genes_sim": 50, "num_reference_genes": 50, "coord_type_proc": "grid"}' + + - id: 
svg_process_datasets_slideseqv2 + input_states: "s3://openproblems-data/resources/datasets/spatial_slideseq_v2/**/state.yaml" + settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad", "dataset_simulated_normalized": "$id/simulated_dataset.h5ad", "gp_k_sim": 500, "select_top_variable_genes_sim": 10, "num_reference_genes": 10, "coord_type_proc": "generic"}' + + - id: svg_process_datasets_dbitseq + input_states: "s3://openproblems-data/resources/datasets/spatial_dbit_seq/**/state.yaml" + settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad", "dataset_simulated_normalized": "$id/simulated_dataset.h5ad", "gp_k_sim": 500, "select_top_variable_genes_sim": 50, "num_reference_genes": 200, "coord_type_proc": "generic"}' + + - id: svg_process_datasets_seqfish + input_states: "s3://openproblems-data/resources/datasets/spatial_seqfish/**/state.yaml" + settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad", "dataset_simulated_normalized": "$id/simulated_dataset.h5ad", "gp_k_sim": 500, "select_top_variable_genes_sim": 25, "num_reference_genes": 25, "coord_type_proc": "generic"}' + + - id: svg_process_datasets_starmap + input_states: "s3://openproblems-data/resources/datasets/spatial_star_map/**/state.yaml" + settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad", "dataset_simulated_normalized": "$id/simulated_dataset.h5ad", "gp_k_sim": 500, "select_top_variable_genes_sim": 25, "num_reference_genes": 25, "coord_type_proc": "generic"}' + + - id: svg_process_datasets_stereoseq + input_states: "s3://openproblems-data/resources/datasets/spatial_stereo_seq/**/state.yaml" + settings: '{"output_dataset": "$id/dataset.h5ad", "output_solution": "$id/solution.h5ad", "dataset_simulated_normalized": "$id/simulated_dataset.h5ad", "gp_k_sim": 500, "select_top_variable_genes_sim": 50, "num_reference_genes": 50, "coord_type_proc": "generic"}' + +rename_keys: 
'input:output_dataset' +output_state: "$id/state.yaml" +publish_dir: s3://openproblems-data/resources/spatially_variable_genes/datasets +HERE + +# cat > /tmp/params.yaml << 'HERE' +# param_list: +# - id: spatial_merfish/human_cortex_1 +# input: "s3://openproblems-data/resources/datasets/spatial_merfish/human_cortex_1/dataset.h5ad" +# gp_k_sim: 500 +# select_top_variable_genes_sim: 25 +# num_reference_genes: 25 +# coord_type_proc: generic + +# - id: spatial_merfish/human_cortex_2 +# input: "s3://openproblems-data/resources/datasets/spatial_merfish/human_cortex_2/dataset.h5ad" +# gp_k_sim: 500 +# select_top_variable_genes_sim: 50 +# num_reference_genes: 50 +# coord_type_proc: generic + +# - id: spatial_merfish/human_cortex_3 +# input: "s3://openproblems-data/resources/datasets/spatial_merfish/human_cortex_3/dataset.h5ad" +# gp_k_sim: 500 +# select_top_variable_genes_sim: 50 +# num_reference_genes: 50 +# coord_type_proc: generic + +# - id: spatial_merfish/human_cortex_4 +# input: "s3://openproblems-data/resources/datasets/spatial_merfish/human_cortex_4/dataset.h5ad" +# gp_k_sim: 500 +# select_top_variable_genes_sim: 50 +# num_reference_genes: 50 +# coord_type_proc: generic + +# - id: spatial_merfish/mouse_cortex +# input: "s3://openproblems-data/resources/datasets/spatial_merfish/mouse_cortex/dataset.h5ad" +# gp_k_sim: 500 +# select_top_variable_genes_sim: 25 +# num_reference_genes: 25 +# coord_type_proc: generic + +# output_dataset: "$id/dataset.h5ad" +# output_solution: "$id/solution.h5ad" +# dataset_simulated_normalized: "$id/simulated_dataset.h5ad" +# output_state: "$id/state.yaml" +# publish_dir: s3://openproblems-data/resources/spatially_variable_genes/datasets +# HERE + +cat > /tmp/nextflow.config << HERE +process { + executor = 'awsbatch' + withName:'.*publishStatesProc' { + memory = '16GB' + disk = '100GB' + } + withLabel:highmem { + memory = '350GB' + } + withLabel:hightime { + time = 15.h + } +} +HERE + +tw launch 
https://github.com/openproblems-bio/openproblems-v2.git \ + --revision integration_build \ + --pull-latest \ + --main-script target/nextflow/spatially_variable_genes/workflows/process_datasets/main.nf \ + --workspace 53907369739130 \ + --compute-env 6TeIFgV5OY4pJCk8I0bfOh \ + --params-file /tmp/params.yaml \ + --config /tmp/nextflow.config \ + --entry-name auto \ diff --git a/src/tasks/spatially_variable_genes/resources_scripts/run_benchmark.sh b/src/tasks/spatially_variable_genes/resources_scripts/run_benchmark.sh new file mode 100755 index 0000000000..9539cea4fe --- /dev/null +++ b/src/tasks/spatially_variable_genes/resources_scripts/run_benchmark.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" +publish_dir="s3://openproblems-data/resources/spatially_variable_genes/results/${RUN_ID}" + +# cat > /tmp/params.yaml << HERE +# input_states: s3://openproblems-data/resources/spatially_variable_genes/datasets/**/state.yaml +# rename_keys: 'input_dataset:output_dataset,input_solution:output_solution' +# output_state: "state.yaml" +# publish_dir: "$publish_dir" +# HERE + +cat > /tmp/params.yaml << HERE +param_list: + - id: svg_datasets_visium + input_states: "s3://openproblems-data/resources/spatially_variable_genes/datasets/spatial_10x_visium/**/state.yaml" + settings: '{"coord_type_moran_i": "generic", "coord_type_sepal": "grid", "max_neighs_sepal": 6}' + + - id: svg_datasets_xenium + input_states: "s3://openproblems-data/resources/spatially_variable_genes/datasets/spatial_10x_xenium/**/state.yaml" + settings: '{"coord_type_moran_i": "generic", "coord_type_sepal": "grid", "max_neighs_sepal": 6}' + + - id: svg_datasets_dbitseq + input_states: "s3://openproblems-data/resources/spatially_variable_genes/datasets/spatial_dbit_seq/**/state.yaml" + settings: '{"coord_type_moran_i": "generic", "coord_type_sepal": "grid", "max_neighs_sepal": 4}' + + - id: svg_datasets_merfish + input_states: 
"s3://openproblems-data/resources/spatially_variable_genes/datasets/spatial_merfish/**/state.yaml" + settings: '{"coord_type_moran_i": "generic", "coord_type_sepal": "grid", "max_neighs_sepal": 6}' + + - id: svg_datasets_seqfish + input_states: "s3://openproblems-data/resources/spatially_variable_genes/datasets/spatial_seqfish/**/state.yaml" + settings: '{"coord_type_moran_i": "generic", "coord_type_sepal": "grid", "max_neighs_sepal": 6}' + + - id: svg_datasets_slidetags + input_states: "s3://openproblems-data/resources/spatially_variable_genes/datasets/spatial_slide_tags/**/state.yaml" + settings: '{"coord_type_moran_i": "generic", "coord_type_sepal": "grid", "max_neighs_sepal": 6}' + + - id: svg_datasets_slideseqv2 + input_states: "s3://openproblems-data/resources/spatially_variable_genes/datasets/spatial_slideseq_v2/**/state.yaml" + settings: '{"coord_type_moran_i": "generic", "coord_type_sepal": "grid", "max_neighs_sepal": 6}' + + - id: svg_datasets_starmap + input_states: "s3://openproblems-data/resources/spatially_variable_genes/datasets/spatial_star_map/**/state.yaml" + settings: '{"coord_type_moran_i": "generic", "coord_type_sepal": "grid", "max_neighs_sepal": 6}' + + - id: svg_datasets_stereoseq + input_states: "s3://openproblems-data/resources/spatially_variable_genes/datasets/spatial_stereo_seq/**/state.yaml" + settings: '{"coord_type_moran_i": "generic", "coord_type_sepal": "grid", "max_neighs_sepal": 4}' + +rename_keys: 'input_dataset:output_dataset,input_solution:output_solution' +output_state: "state.yaml" +publish_dir: "$publish_dir" +HERE + +tw launch https://github.com/openproblems-bio/openproblems-v2.git \ + --revision integration_build \ + --pull-latest \ + --main-script target/nextflow/spatially_variable_genes/workflows/run_benchmark/main.nf \ + --workspace 53907369739130 \ + --compute-env 1euVrtATIcRyy9Yc2ERbaZ \ + --params-file /tmp/params.yaml \ + --entry-name auto \ + --config src/wf_utils/labels_tw.config \ diff --git 
a/src/tasks/spatially_variable_genes/resources_test_scripts/mouse_brain_coronal_section1.sh b/src/tasks/spatially_variable_genes/resources_test_scripts/mouse_brain_coronal_section1.sh new file mode 100755 index 0000000000..2110d29f1b --- /dev/null +++ b/src/tasks/spatially_variable_genes/resources_test_scripts/mouse_brain_coronal_section1.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +# make sure the following command has been executed +# viash ns build -q 'spatially_variable_genes|common' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +# abort at the first failing step so later steps never run on missing or stale outputs +set -e + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/spatially_variable_genes + +mkdir -p $DATASET_DIR + +echo "Running process_dataset" +nextflow run . \ + -main-script target/nextflow/spatially_variable_genes/workflows/process_datasets/main.nf \ + -profile docker \ + -c src/wf_utils/labels_ci.config \ + --id mouse_brain_coronal_section1 \ + --input $RAW_DATA/mouse_brain_coronal_section1/dataset.h5ad \ + --output_dataset dataset.h5ad \ + --output_solution solution.h5ad \ + --dataset_simulated_normalized simulated_dataset.h5ad \ + --publish_dir $DATASET_DIR/mouse_brain_coronal_section1 \ + --output_state "state.yaml" \ + --gp_k_sim 50 \ + --select_top_variable_genes_sim 50 \ + --num_reference_genes 200 + +echo "Running control method" +viash run src/tasks/spatially_variable_genes/control_methods/true_ranking/config.vsh.yaml -- \ + --input_data $DATASET_DIR/mouse_brain_coronal_section1/dataset.h5ad \ + --input_solution $DATASET_DIR/mouse_brain_coronal_section1/solution.h5ad \ + --output $DATASET_DIR/mouse_brain_coronal_section1/output.h5ad + +echo "Running metric" +viash run src/tasks/spatially_variable_genes/metrics/correlation/config.vsh.yaml -- \ + --input_method $DATASET_DIR/mouse_brain_coronal_section1/output.h5ad \ + --input_solution $DATASET_DIR/mouse_brain_coronal_section1/solution.h5ad \ + --output
$DATASET_DIR/mouse_brain_coronal_section1/score.h5ad diff --git a/src/tasks/spatially_variable_genes/workflows/process_datasets/config.vsh.yaml b/src/tasks/spatially_variable_genes/workflows/process_datasets/config.vsh.yaml new file mode 100644 index 0000000000..19f3c2f200 --- /dev/null +++ b/src/tasks/spatially_variable_genes/workflows/process_datasets/config.vsh.yaml @@ -0,0 +1,67 @@ +functionality: + name: "process_datasets" + namespace: "spatially_variable_genes/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input" + __merge__: /src/tasks/spatially_variable_genes/api/file_common_dataset.yaml + description: "Input dataset" + required: true + - name: Outputs + arguments: + - name: "--output_dataset" + __merge__: /src/tasks/spatially_variable_genes/api/file_dataset.yaml + required: true + direction: output + - name: "--output_solution" + __merge__: /src/tasks/spatially_variable_genes/api/file_solution.yaml + required: true + direction: output + - name: "--dataset_simulated_normalized" + __merge__: /src/tasks/spatially_variable_genes/api/file_simulated_dataset.yaml + required: false + direction: output + - name: Simulation options + arguments: + - type: integer + name: --gp_k_sim + description: Dimension of basis used for the Gaussian process smoother. + default: 500 + info: + test_value: 50 + - type: integer + name: --select_top_variable_genes_sim + description: Number of top variable genes to use for subsetting. + default: 50 + - name: Reference genes + arguments: + - name: "--num_reference_genes" + type: integer + description: Number of top SVGs to select as reference. + default: 200 + - name: "--coord_type_proc" + type: string + default: "grid" + description: "How to create spatial graph to select reference genes." + choices: [grid, generic] + - name: Normalization options + arguments: + - name: "--n_cp" + type: integer + default: -1 + description: "Number of counts per cell. When set to -1, will use None." 
+ + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - path: /src/wf_utils/helper.nf + dependencies: + - name: common/check_dataset_schema + - name: spatially_variable_genes/process_dataset/select_reference + - name: spatially_variable_genes/process_dataset/simulate_svg + - name: datasets/normalization/log_cp + - name: spatially_variable_genes/process_dataset/split_dataset +platforms: + - type: nextflow diff --git a/src/tasks/spatially_variable_genes/workflows/process_datasets/main.nf b/src/tasks/spatially_variable_genes/workflows/process_datasets/main.nf new file mode 100644 index 0000000000..7a5f58356b --- /dev/null +++ b/src/tasks/spatially_variable_genes/workflows/process_datasets/main.nf @@ -0,0 +1,86 @@ +include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" + +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + + | check_dataset_schema.run( + fromState: { id, state -> + def schema = findArgumentSchema(meta.config, "input") + def schemaYaml = tempFile("schema.yaml") + writeYaml(schema, schemaYaml) + [ + "input": state.input, + "schema": schemaYaml + ] + }, + toState: { id, output, state -> + // read the output to see if dataset passed the qc + def checks = readYaml(output.output) + state + [ + "dataset": checks["exit_code"] == 0 ? 
state.input : null, + ] + } + ) + + // remove datasets which didn't pass the schema check + | filter { id, state -> + state.dataset != null + } + + | select_reference.run( + fromState: [ + input: "dataset", + num_features: "num_reference_genes", + coord_type_proc: "coord_type_proc" + ], + toState: [dataset: "output"] + ) + + | simulate_svg.run( + fromState: [ + input: "dataset", + gp_k: "gp_k_sim", + select_top_variable_genes: "select_top_variable_genes_sim" + ], + toState: [ + dataset_simulated: "output" + ] + ) + + | log_cp.run( + fromState: [ + input: "dataset_simulated", + n_cp: "n_cp" // forward the workflow's --n_cp argument (declared default: -1) instead of hard-coding it + ], + toState: [ + dataset_simulated_normalized: "output" + ] + ) + + | split_dataset.run( + fromState: [ + input: "dataset_simulated_normalized" + ], + toState: [ + output_dataset: "output_dataset", + output_solution: "output_solution" + ] + ) + + // only output the files for which an output file was specified + | setState(["output_dataset", "output_solution", "dataset_simulated_normalized"]) + + emit: + output_ch +} diff --git a/src/tasks/spatially_variable_genes/workflows/process_datasets/run_test.sh b/src/tasks/spatially_variable_genes/workflows/process_datasets/run_test.sh new file mode 100644 index 0000000000..b5df48aa92 --- /dev/null +++ b/src/tasks/spatially_variable_genes/workflows/process_datasets/run_test.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Run this prior to executing this script: +# viash ns build -q 'spatially_variable_genes' + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +RAW_DATA=resources_test/common +DATASET_DIR=resources_test/spatially_variable_genes + +mkdir -p $DATASET_DIR + +nextflow run .
\ + -main-script target/nextflow/spatially_variable_genes/workflows/process_datasets/main.nf \ + -profile docker \ + -c src/wf_utils/labels_ci.config \ + --id mouse_brain_coronal_section1 \ + --input $RAW_DATA/mouse_brain_coronal_section1/dataset.h5ad \ + --output_dataset dataset.h5ad \ + --output_solution solution.h5ad \ + --dataset_simulated_normalized simulated_dataset.h5ad \ + --publish_dir $DATASET_DIR/mouse_brain_coronal_section1 \ + --output_state "state.yaml" \ + --gp_k_sim 50 \ + --select_top_variable_genes_sim 50 \ + --num_reference_genes 200 \ No newline at end of file diff --git a/src/tasks/spatially_variable_genes/workflows/run_benchmark/config.vsh.yaml b/src/tasks/spatially_variable_genes/workflows/run_benchmark/config.vsh.yaml new file mode 100644 index 0000000000..f1a8c599d4 --- /dev/null +++ b/src/tasks/spatially_variable_genes/workflows/run_benchmark/config.vsh.yaml @@ -0,0 +1,87 @@ +functionality: + name: "run_benchmark" + namespace: "spatially_variable_genes/workflows" + argument_groups: + - name: Inputs + arguments: + - name: "--input_dataset" + __merge__: "/src/tasks/spatially_variable_genes/api/file_dataset.yaml" + required: true + direction: input + - name: "--input_solution" + __merge__: "/src/tasks/spatially_variable_genes/api/file_solution.yaml" + required: true + direction: input + - name: Method specific arguments + arguments: + - name: "--coord_type_moran_i" + type: string + required: false + description: Type of coordinate system to use for Moran's I. Valid options are "grid" for grid coordinates or "generic" for generic coordinates. + choices: [grid, generic] + - name: "--coord_type_sepal" + type: string + required: false + description: Type of coordinate system to use for Sepal. Valid options are "grid" for grid coordinates or "generic" for generic coordinates.
+ choices: [grid, generic] + - name: "--max_neighs_sepal" + type: integer + choices: [4, 6] + required: false + description: Maximum number of neighbors of a node in the spatial graph. Valid options are 4 (square-grid) and 6 (hexagonal-grid). + - name: Outputs + arguments: + - name: "--output_scores" + type: file + required: true + direction: output + description: A yaml file containing the scores of each of the methods + default: score_uns.yaml + - name: "--output_method_configs" + type: file + required: true + direction: output + default: method_configs.yaml + - name: "--output_metric_configs" + type: file + required: true + direction: output + default: metric_configs.yaml + - name: "--output_dataset_info" + type: file + required: true + direction: output + default: dataset_uns.yaml + - name: "--output_task_info" + type: file + required: true + direction: output + default: task_info.yaml + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + - type: file + path: "../../api/task_info.yaml" + dependencies: + - name: common/check_dataset_schema + - name: common/extract_metadata + - name: spatially_variable_genes/control_methods/random_ranking + - name: spatially_variable_genes/control_methods/true_ranking + - name: spatially_variable_genes/methods/boostgp + - name: spatially_variable_genes/methods/gpcounts + - name: spatially_variable_genes/methods/moran_i + - name: spatially_variable_genes/methods/nnsvg + - name: spatially_variable_genes/methods/scgco + - name: spatially_variable_genes/methods/sepal + - name: spatially_variable_genes/methods/somde + - name: spatially_variable_genes/methods/spagcn + - name: spatially_variable_genes/methods/spagft + - name: spatially_variable_genes/methods/spanve + - name: spatially_variable_genes/methods/spark + - name: spatially_variable_genes/methods/spark_x + - name: spatially_variable_genes/methods/spatialde + - name: spatially_variable_genes/methods/spatialde2 + - name: 
spatially_variable_genes/metrics/correlation +platforms: + - type: nextflow \ No newline at end of file diff --git a/src/tasks/spatially_variable_genes/workflows/run_benchmark/main.nf b/src/tasks/spatially_variable_genes/workflows/run_benchmark/main.nf new file mode 100644 index 0000000000..821f0911e9 --- /dev/null +++ b/src/tasks/spatially_variable_genes/workflows/run_benchmark/main.nf @@ -0,0 +1,197 @@ +workflow auto { + findStates(params, meta.config) + | meta.workflow.run( + auto: [publish: "state"] + ) +} + +workflow run_wf { + take: + input_ch + + main: + + // construct list of methods + methods = [ + random_ranking, + true_ranking, + boostgp, + gpcounts, + moran_i, + nnsvg, + scgco, + sepal, + somde, + spagcn, + spagft, + spanve, + spark, + spark_x, + spatialde, + spatialde2 + ] + + // construct list of metrics + metrics = [ + correlation + ] + + + /**************************** + * EXTRACT DATASET METADATA * + ****************************/ + dataset_ch = input_ch + // store join id + | map{ id, state -> + [id, state + ["_meta": [join_id: id]]] + } + + // extract the dataset metadata + | extract_metadata.run( + fromState: [input: "input_solution"], + toState: { id, output, state -> + state + [ + dataset_uns: readYaml(output.output).uns + ] + } + ) + + /*************************** + * RUN METHODS AND METRICS * + ***************************/ + score_ch = dataset_ch + + // run all methods + | runEach( + components: methods, + + // define a new 'id' by appending the method name to the dataset id + id: { id, state, comp -> + id + "." 
+ comp.config.functionality.name + }, + + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: { id, state, comp -> + def new_args = [ + input_data: state.input_dataset + ] + if (comp.config.functionality.info.type == "control_method") { + new_args.input_solution = state.input_solution + } + if (comp.config.functionality.name == "sepal") { + new_args.coord_type_sepal = state.coord_type_sepal + new_args.max_neighs_sepal = state.max_neighs_sepal + } + if (comp.config.functionality.name == "moran_i") { + new_args.coord_type_moran_i = state.coord_type_moran_i + } + new_args + }, + + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + method_id: comp.config.functionality.name, + method_output: output.output + ] + } + ) + + // run all metrics + | runEach( + components: metrics, + id: { id, state, comp -> + id + "." + comp.config.functionality.name + }, + // use 'fromState' to fetch the arguments the component requires from the overall state + fromState: { id, state, comp -> + [ + input_solution: state.input_solution, + input_method: state.method_output + ] + }, + // use 'toState' to publish that component's outputs to the overall state + toState: { id, output, state, comp -> + state + [ + metric_id: comp.config.functionality.name, + metric_output: output.output + ] + } + ) + + /****************************** + * GENERATE OUTPUT YAML FILES * + ******************************/ + // TODO: can we store everything below in a separate helper function? 
+ + // extract the dataset metadata + dataset_meta_ch = dataset_ch + | joinStates { ids, states -> + // store the dataset metadata in a file + def dataset_uns = states.collect{state -> + def uns = state.dataset_uns.clone() + uns.remove("normalization_id") + uns + } + def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) + def dataset_uns_file = tempFile("dataset_uns.yaml") + dataset_uns_file.write(dataset_uns_yaml_blob) + + ["output", [output_dataset_info: dataset_uns_file]] + } + + output_ch = score_ch + + // extract the scores + | extract_metadata.run( + key: "extract_scores", + fromState: [input: "metric_output"], + toState: { id, output, state -> + state + [ + score_uns: readYaml(output.output).uns + ] + } + ) + + | joinStates { ids, states -> + // store the method configs in a file + def method_configs = methods.collect{it.config} + def method_configs_yaml_blob = toYamlBlob(method_configs) + def method_configs_file = tempFile("method_configs.yaml") + method_configs_file.write(method_configs_yaml_blob) + + // store the metric configs in a file + def metric_configs = metrics.collect{it.config} + def metric_configs_yaml_blob = toYamlBlob(metric_configs) + def metric_configs_file = tempFile("metric_configs.yaml") + metric_configs_file.write(metric_configs_yaml_blob) + + def task_info_file = meta.resources_dir.resolve("task_info.yaml") + + // store the scores in a file + def score_uns = states.collect{it.score_uns} + def score_uns_yaml_blob = toYamlBlob(score_uns) + def score_uns_file = tempFile("score_uns.yaml") + score_uns_file.write(score_uns_yaml_blob) + + def new_state = [ + output_method_configs: method_configs_file, + output_metric_configs: metric_configs_file, + output_task_info: task_info_file, + output_scores: score_uns_file, + _meta: states[0]._meta + ] + + ["output", new_state] + } + + // merge all of the output data + | mix(dataset_meta_ch) + | joinStates{ ids, states -> + def mergedStates = states.inject([:]) { acc, m -> acc + m } + [ids[0], 
mergedStates] + } + + emit: + output_ch +} diff --git a/src/tasks/spatially_variable_genes/workflows/run_benchmark/run_test.sh b/src/tasks/spatially_variable_genes/workflows/run_benchmark/run_test.sh new file mode 100755 index 0000000000..a5c57c7a41 --- /dev/null +++ b/src/tasks/spatially_variable_genes/workflows/run_benchmark/run_test.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# get the root of the directory +REPO_ROOT=$(git rev-parse --show-toplevel) + +# ensure that the command below is run from the root of the repository +cd "$REPO_ROOT" + +set -e + +DATASETS_DIR="resources_test/spatially_variable_genes" +OUTPUT_DIR="output/temp" + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +nextflow run . \ + -main-script target/nextflow/spatially_variable_genes/workflows/run_benchmark/main.nf \ + -profile docker \ + -resume \ + -entry auto \ + -c src/wf_utils/labels_ci.config \ + --input_states "$DATASETS_DIR/**/state.yaml" \ + --rename_keys 'input_dataset:output_dataset,input_solution:output_solution' \ + --publish_dir "$OUTPUT_DIR" \ + --output_state "state.yaml" \ + --settings '{"coord_type_moran_i": "generic", "coord_type_sepal": "grid", "max_neighs_sepal": 4}' \ No newline at end of file