Merge branch 'main' into feature/no-ref/add_simplemlp

openproblems-bio · Aug 26, 2024 · 6933a39 · 6933a39
2 parents 9a09b43 + 7cea6a5
commit 6933a39
Show file tree

Hide file tree

Showing 59 changed files with 1,737 additions and 303 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -132,13 +132,11 @@
 
 * Removed the separate subtask specific subfolders. The "subtask" is added to the config.
 
-* `control_methods/no_integration_batch`: Migrated from v1 embedding.
+* `control_methods/no_integration`: Migrated from v1.
 
-* `control_methods/random_embed_cell`: Migrated from v1 embedding.
+* `control_methods/perfect_integration`: Migrated from v1, renaming "random embedding" to "perfect integration".
 
-* `control_methods/random_embed_cel_jitter`: Migrated from v1 embedding.
-
-* `control_methods/random_integration`: Migrated from v1 graph.
+* `control_methods/random_integration`: Migrated from v1.
 
 * `methods/bbknn`: Migrated from v1 graph.
 

diff --git a/src/common/helper_functions/read_api_files.R b/src/common/helper_functions/read_api_files.R
@@ -1,14 +1,21 @@
 
 anndata_struct_names <- c("obs", "var", "obsm", "obsp", "varm", "varp", "layers", "uns")
 
-read_anndata_spec <- function(path) {
+read_file_spec <- function(path) {
   spec <- read_and_merge_yaml(path)
-  list(
-    info = read_anndata_info(spec, path),
-    slots = read_anndata_slots(spec, path)
+  out <- list(
+    info = read_file_info(spec, path)
   )
+  if (out$info$file_type == "h5ad" || "slots" %in% names(spec$info)) {
+    out$info$file_type <- "h5ad"
+    out$slots <- read_anndata_slots(spec, path)
+  }
+  if (out$info$file_type == "csv" || out$info$file_type == "tsv" || out$info$file_type == "parquet") {
+    out$columns <- read_tabular_columns(spec, path)
+  }
+  out
 }
-read_anndata_info <- function(spec, path) {
+read_file_info <- function(spec, path) {
   # TEMP: make it readable
   spec$info$slots <- NULL
   df <- list_as_tibble(spec)
@@ -35,44 +42,95 @@ read_anndata_slots <- function(spec, path) {
     }
   )
 }
+read_tabular_columns <- function(spec, path) {
+  map_df(
+    spec$info$columns,
+    function(column) {
+      df <- list_as_tibble(column)
+      df$file_name <- basename(path) %>% gsub("\\.yaml", "", .)
+      df$required <- df$required %||% TRUE %|% TRUE
+      df$multiple <- df$multiple %||% FALSE %|% FALSE
+      as_tibble(df)
+    }
+  )
+}
 
-format_slots <- function(spec) {
-  example <- spec$slots %>%
-    group_by(struct) %>%
-    summarise(
-      str = paste0(unique(struct), ": ", paste0("'", name, "'", collapse = ", "))
-    ) %>%
-    arrange(match(struct, anndata_struct_names))
+format_file_format <- function(spec) {
+  if (spec$info$file_type == "h5ad") {
+    example <- spec$slots %>%
+      group_by(struct) %>%
+      summarise(
+        str = paste0(unique(struct), ": ", paste0("'", name, "'", collapse = ", "))
+      ) %>%
+      arrange(match(struct, anndata_struct_names))
+
+    c("    AnnData object", paste0("     ", example$str))
+  } else if (spec$info$file_type == "csv" || spec$info$file_type == "tsv" || spec$info$file_type == "parquet") {
+    example <- spec$columns %>%
+      summarise(
+        str = paste0("'", name, "'", collapse = ", ")
+      )
 
-  c("    AnnData object", paste0("     ", example$str))
+    c("    Tabular data", paste0("     ", example$str))
+  } else {
+    ""
+  }
 }
 
-format_slots_as_kable <- function(spec) {
-  if (nrow(spec$slots) == 0) return("")
-  spec$slots %>%
-    mutate(
-      tag_str = pmap_chr(lst(required), function(required) {
-        out <- c()
-        if (!required) {
-          out <- c(out, "Optional")
-        }
-        if (length(out) == 0) {
-          ""
-        } else {
-          paste0("(_", paste(out, collapse = ", "), "_) ")
-        }
-      })
-    ) %>%
-    transmute(
-      Slot = paste0("`", struct, "[\"", name, "\"]`"),
-      Type = paste0("`", type, "`"),
-      Description = paste0(
-        tag_str,
-        description %>% gsub(" *\n *", " ", .) %>% gsub("\\. *$", "", .), 
-        "."
-      )
-    ) %>%
-    knitr::kable()
+format_file_format_as_kable <- function(spec) {
+  if (spec$info$file_type == "h5ad") {
+    spec$slots %>%
+      mutate(
+        tag_str = pmap_chr(lst(required), function(required) {
+          out <- c()
+          if (!required) {
+            out <- c(out, "Optional")
+          }
+          if (length(out) == 0) {
+            ""
+          } else {
+            paste0("(_", paste(out, collapse = ", "), "_) ")
+          }
+        })
+      ) %>%
+      transmute(
+        Slot = paste0("`", struct, "[\"", name, "\"]`"),
+        Type = paste0("`", type, "`"),
+        Description = paste0(
+          tag_str,
+          description %>% gsub(" *\n *", " ", .) %>% gsub("\\. *$", "", .), 
+          "."
+        )
+      ) %>%
+      knitr::kable()
+  } else if (spec$info$file_type == "csv" || spec$info$file_type == "tsv" || spec$info$file_type == "parquet") {
+    spec$columns %>%
+      mutate(
+        tag_str = pmap_chr(lst(required), function(required) {
+          out <- c()
+          if (!required) {
+            out <- c(out, "Optional")
+          }
+          if (length(out) == 0) {
+            ""
+          } else {
+            paste0("(_", paste(out, collapse = ", "), "_) ")
+          }
+        })
+      ) %>%
+      transmute(
+        Column = paste0("`", name, "`"),
+        Type = paste0("`", type, "`"),
+        Description = paste0(
+          tag_str,
+          description %>% gsub(" *\n *", " ", .) %>% gsub("\\. *$", "", .), 
+          "."
+        )
+      ) %>%
+      knitr::kable()
+  } else {
+    ""
+  }
 }
 
 list_contains_tibble <- function(li) {
@@ -97,6 +155,9 @@ read_comp_info <- function(spec_yaml, path) {
   spec_yaml$functionality$argument_groups <- NULL
 
   df <- list_as_tibble(spec_yaml$functionality)
+  if (nrow(df) == 0) {
+    df <- data.frame(a = 1)[, integer(0)]
+  }
   if (list_contains_tibble(spec_yaml$functionality$info)) {
     df <- dplyr::bind_cols(df, list_as_tibble(spec_yaml$functionality$info))
   }
@@ -187,7 +248,7 @@ render_component <- function(spec) {
 # path <- "src/datasets/api/file_pca.yaml"
 render_file <- function(spec) {
   if (is.character(spec)) {
-    spec <- read_anndata_spec(spec)
+    spec <- read_file_spec(spec)
   }
 
   if (!"label" %in% names(spec$info)) {
@@ -220,13 +281,13 @@ render_file <- function(spec) {
     §Format:
     §
     §:::{{.small}}
-    §{paste(format_slots(spec), collapse = '\n')}
+    §{paste(format_file_format(spec), collapse = '\n')}
     §:::
     §
     §Slot description:
     §
     §:::{{.small}}
-    §{paste(format_slots_as_kable(spec), collapse = '\n')}
+    §{paste(format_file_format_as_kable(spec), collapse = '\n')}
     §:::
     §
     §"), symbol = "§")
@@ -262,7 +323,7 @@ read_task_api <- function(path) {
     project_path = project_path,
     parent_path = api_dir
   )
-  files <- map(file_yamls, read_anndata_spec)
+  files <- map(file_yamls, read_file_spec)
   names(files) <- basename(file_yamls) %>% gsub("\\..*$", "", .)
   file_info <- map_df(files, "info")
   file_slots <- map_df(files, "slots")

diff --git a/src/common/process_task_results/get_method_info/script.R b/src/common/process_task_results/get_method_info/script.R
@@ -16,20 +16,28 @@ outputs <- map(configs, function(config) {
   if (length(config$functionality$status) > 0 && config$functionality$status == "disabled") {
     return(NULL)
   }
-  info <- config$functionality$info
+
+  # prep for viash 0.9.0
+  build_info <- config$build_info %||% config$info
+  if ("functionality" %in% names(config)) {
+    config[names(config$functionality)] <- config$functionality
+    config[["functionality"]] <- NULL
+  }
+
+  info <- config$info
 
   # add extra info
-  info$config_path <- gsub(".*openproblems-v2/src/", "src/", config$info$config)
-  info$task_id <- gsub("/.*", "", config$functionality$namespace)
-  info$id <- config$functionality$name
-  info$namespace <- config$functionality$namespace
-  info$commit_sha <- config$info$git_commit %||% "missing-sha"
+  info$config_path <- gsub(".*/src/", "src/", build_info$config)
+  info$task_id <- gsub("/.*", "", config$namespace)
+  info$id <- config$name
+  info$namespace <- config$namespace
+  info$commit_sha <- build_info$git_commit %||% "missing-sha"
   info$code_version <- "missing-version"
-  info$implementation_url <- paste0(
-    "https://github.com/openproblems-bio/openproblems-v2/tree/",
-    info$commit_sha, "/",
-    info$config_path
-  )
+    info$implementation_url <- paste0(
+      build_info$git_remote, "/blob/",
+      build_info$git_commit, "/",
+      info$config_path
+    )
 
   # ↑ this could be used as the new format
 

diff --git a/src/common/process_task_results/get_metric_info/script.R b/src/common/process_task_results/get_metric_info/script.R
@@ -17,20 +17,27 @@ outputs <- map(configs, function(config) {
     return(NULL)
   }
 
+  # prep for viash 0.9.0
+  build_info <- config$build_info %||% config$info
+  if ("functionality" %in% names(config)) {
+    config[names(config$functionality)] <- config$functionality
+    config[["functionality"]] <- NULL
+  }
+
   map(
-    config$functionality$info$metrics,
+    config$info$metrics,
     function(info) {
       # add extra info
-      info$config_path <- gsub(".*openproblems-v2/src/", "src/", config$info$config)
-      info$task_id <- gsub("/.*", "", config$functionality$namespace)
+      info$config_path <- gsub(".*/src/", "src/", build_info$config)
+      info$task_id <- gsub("/.*", "", config$namespace)
       info$id <- info$name
-      info$component_id <- config$functionality$name
-      info$namespace <- config$functionality$namespace
-      info$commit_sha <- config$info$git_commit %||% "missing-sha"
+      info$component_id <- config$name
+      info$namespace <- config$namespace
+      info$commit_sha <- build_info$git_commit %||% "missing-sha"
       info$code_version <- "missing-version"
       info$implementation_url <- paste0(
-        "https://github.com/openproblems-bio/openproblems-v2/tree/",
-        info$commit_sha, "/",
+        build_info$git_remote, "/blob/",
+        build_info$git_commit, "/",
         info$config_path
       )
 

diff --git a/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh b/src/datasets/resource_scripts/openproblems_neurips2021_multimodal.sh
@@ -9,7 +9,7 @@ param_list:
     input: "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fcite%5FBMMC%5Fprocessed%2Eh5ad%2Egz"
     mod1: GEX
     mod2: ADT
-    dataset_name: OpenProblems NeurIPS2021 CITE-Seq
+    dataset_name: NeurIPS2021 CITE-Seq
     dataset_organism: homo_sapiens
     dataset_summary: Single-cell CITE-Seq (GEX+ADT) data collected from bone marrow mononuclear cells of 12 healthy human donors.
     dataset_description: "Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X 3 prime Single-Cell Gene Expression kit with Feature Barcoding in combination with the BioLegend TotalSeq B Universal Human Panel v1.0. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site."
@@ -19,7 +19,7 @@ param_list:
     input: "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fmultiome%5FBMMC%5Fprocessed%2Eh5ad%2Egz"
     mod1: GEX
     mod2: ATAC
-    dataset_name: OpenProblems NeurIPS2021 Multiome
+    dataset_name: NeurIPS2021 Multiome
     dataset_organism: homo_sapiens
     dataset_summary: Single-cell Multiome (GEX+ATAC) data collected from bone marrow mononuclear cells of 12 healthy human donors.
     dataset_description: "Single-cell Multiome data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X Multiome Gene Expression and Chromatin Accessibility kit. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site."
@@ -35,21 +35,12 @@ output_state: '$id/state.yaml'
 publish_dir: s3://openproblems-data/resources/datasets
 HERE
 
-cat > /tmp/nextflow.config << HERE
-process {
-  withName:'.*publishStatesProc' {
-    memory = '16GB'
-    disk = '100GB'
-  }
-}
-HERE
-
 tw launch https://github.com/openproblems-bio/openproblems-v2.git \
   --revision main_build \
   --pull-latest \
   --main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \
   --workspace 53907369739130 \
   --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
   --params-file "$params_file" \
-  --config /tmp/nextflow.config \
-  --labels openproblems_neurips2021_bmmc,dataset_loader \
+  --config src/wf_utils/labels_tw.config \
+  --labels neurips2021,dataset_loader \
diff --git a/src/tasks/batch_integration/api/comp_control_method_feature.yaml b/src/tasks/batch_integration/api/comp_control_method_feature.yaml
@@ -0,0 +1,26 @@
+functionality:
+  namespace: batch_integration/control_methods
+  info:
+    type: control_method
+    subtype: feature
+    type_info:
+      label: Control method (feature)
+      summary: A batch integration feature control method.
+      description: |
+        A batch integration control method which outputs a batch-corrected feature space.
+  arguments:
+    - name: --input
+      __merge__: file_dataset.yaml
+      direction: input
+      required: true
+    - name: --output
+      direction: output
+      __merge__: file_integrated_feature.yaml
+      required: true
+  test_resources:
+    - type: python_script
+      path: /src/common/comp_tests/check_method_config.py
+    - type: python_script
+      path: /src/common/comp_tests/run_and_check_adata.py
+    - path: /resources_test/batch_integration/pancreas
+      dest: resources_test/batch_integration/pancreas
diff --git a/...hods/no_integration_batch/config.vsh.yaml → ...o_integration/batch_embed/config.vsh.yaml b/...hods/no_integration_batch/config.vsh.yaml → ...o_integration/batch_embed/config.vsh.yaml
@@ -1,7 +1,8 @@
 # use method api spec
-__merge__: ../../api/comp_control_method_embedding.yaml
+__merge__: ../../../api/comp_control_method_embedding.yaml
 functionality:
-  name: no_integration_batch
+  name: batch_embed
+  namespace: batch_integration/control_methods/no_integration
   info:
     label: No integration by Batch
     summary: "Cells are embedded by computing PCA independently on each batch"
@@ -16,11 +17,6 @@ functionality:
 platforms:
   - type: docker
     image: ghcr.io/openproblems-bio/base_python:1.0.4
-    setup:
-      - type: python
-        pypi:
-          - scanpy
-          - numpy
   - type: nextflow
     directives:
       label: [midtime, lowmem, lowcpu]