Skip to content

Commit

Permalink
Merge branch 'main' into feature/no-ref/add_simplemlp
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood authored Aug 26, 2024
2 parents 9a09b43 + 7cea6a5 commit 6933a39
Show file tree
Hide file tree
Showing 59 changed files with 1,737 additions and 303 deletions.
8 changes: 3 additions & 5 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,13 +132,11 @@

* Removed the separate subtask specific subfolders. The "subtask" is added to the config.

* `control_methods/no_integration_batch`: Migrated from v1 embedding.
* `control_methods/no_integration`: Migrated from v1.

* `control_methods/random_embed_cell`: Migrated from v1 embedding.
* `control_methods/perfect_integration`: Migrated from v1, renaming "random embedding" to "perfect integration".

* `control_methods/random_embed_cell_jitter`: Migrated from v1 embedding.

* `control_methods/random_integration`: Migrated from v1 graph.
* `control_methods/random_integration`: Migrated from v1.

* `methods/bbknn`: Migrated from v1 graph.

Expand Down
147 changes: 104 additions & 43 deletions src/common/helper_functions/read_api_files.R
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@

anndata_struct_names <- c("obs", "var", "obsm", "obsp", "varm", "varp", "layers", "uns")

read_anndata_spec <- function(path) {
read_file_spec <- function(path) {
spec <- read_and_merge_yaml(path)
list(
info = read_anndata_info(spec, path),
slots = read_anndata_slots(spec, path)
out <- list(
info = read_file_info(spec, path)
)
if (out$info$file_type == "h5ad" || "slots" %in% names(spec$info)) {
out$info$file_type <- "h5ad"
out$slots <- read_anndata_slots(spec, path)
}
if (out$info$file_type == "csv" || out$info$file_type == "tsv" || out$info$file_type == "parquet") {
out$columns <- read_tabular_columns(spec, path)
}
out
}
read_anndata_info <- function(spec, path) {
read_file_info <- function(spec, path) {
# TEMP: make it readable
spec$info$slots <- NULL
df <- list_as_tibble(spec)
Expand All @@ -35,44 +42,95 @@ read_anndata_slots <- function(spec, path) {
}
)
}
# Collect the column definitions of a tabular file spec into a single table.
#
# spec: parsed spec list; every entry of `spec$info$columns` is converted
#   with `list_as_tibble()` and the results are row-bound by `map_df()`.
# path: path of the spec YAML; only used to fill the `file_name` column.
read_tabular_columns <- function(spec, path) {
  describe_column <- function(column) {
    column_df <- list_as_tibble(column)

    # File name of the spec with the ".yaml" part removed.
    column_df$file_name <- gsub("\\.yaml", "", basename(path))

    # Fall back to required = TRUE / multiple = FALSE when the field is
    # absent or NA (assuming the usual `%||%` / `%|%` default semantics).
    column_df$required <- (column_df$required %||% TRUE) %|% TRUE
    column_df$multiple <- (column_df$multiple %||% FALSE) %|% FALSE

    as_tibble(column_df)
  }

  map_df(spec$info$columns, describe_column)
}

format_slots <- function(spec) {
example <- spec$slots %>%
group_by(struct) %>%
summarise(
str = paste0(unique(struct), ": ", paste0("'", name, "'", collapse = ", "))
) %>%
arrange(match(struct, anndata_struct_names))
# Build the short "Format:" overview lines for a file spec.
#
# spec: list with `info$file_type` plus either `slots` (h5ad) or `columns`
#   (csv / tsv / parquet).
#
# Returns a character vector: a heading line followed by the formatted
# entries, or "" when the file type is missing or not one of the above.
format_file_format <- function(spec) {
  file_type <- spec$info$file_type

  # isTRUE() keeps a missing file_type on the "" fallback path instead of
  # failing inside `if` with a zero-length condition.
  if (isTRUE(file_type == "h5ad")) {
    # One line per AnnData struct, listing the slot names it contains,
    # ordered as in `anndata_struct_names`.
    example <- spec$slots %>%
      group_by(struct) %>%
      summarise(
        str = paste0(
          unique(struct), ": ",
          paste0("'", name, "'", collapse = ", ")
        )
      ) %>%
      arrange(match(struct, anndata_struct_names))

    c(" AnnData object", paste0(" ", example$str))
  } else if (isTRUE(file_type %in% c("csv", "tsv", "parquet"))) {
    # A single line listing all column names.
    example <- spec$columns %>%
      summarise(
        str = paste0("'", name, "'", collapse = ", ")
      )

    c(" Tabular data", paste0(" ", example$str))
  } else {
    ""
  }
}

format_slots_as_kable <- function(spec) {
if (nrow(spec$slots) == 0) return("")
spec$slots %>%
mutate(
tag_str = pmap_chr(lst(required), function(required) {
out <- c()
if (!required) {
out <- c(out, "Optional")
}
if (length(out) == 0) {
""
} else {
paste0("(_", paste(out, collapse = ", "), "_) ")
}
})
) %>%
transmute(
Slot = paste0("`", struct, "[\"", name, "\"]`"),
Type = paste0("`", type, "`"),
Description = paste0(
tag_str,
description %>% gsub(" *\n *", " ", .) %>% gsub("\\. *$", "", .),
"."
)
) %>%
knitr::kable()
# Render the field-by-field description table of a file spec with
# knitr::kable().
#
# spec: list with `info$file_type` plus either `slots` (h5ad) or `columns`
#   (csv / tsv / parquet). The table needs `name`, `type`, `description`
#   and `required` columns; slots additionally need `struct`.
#
# Returns the kable output, or "" when the file type is missing or
# unsupported, or when there are no slots / columns to describe.
format_file_format_as_kable <- function(spec) {
  # TRUE when there is nothing to tabulate (no table, or a table with 0 rows).
  # Without this guard an empty spec yields a header-only table, or an
  # "object not found" error when the empty table has no columns at all.
  is_empty <- function(fields) {
    is.null(fields) || isTRUE(nrow(fields) == 0)
  }

  # Build the "Description" cell: an optional "(_Optional_) " tag followed by
  # the description collapsed onto one line and ending in exactly one period.
  describe_field <- function(required, description) {
    tag_str <- pmap_chr(lst(required), function(required) {
      out <- c()
      if (!required) {
        out <- c(out, "Optional")
      }
      if (length(out) == 0) {
        ""
      } else {
        paste0("(_", paste(out, collapse = ", "), "_) ")
      }
    })
    paste0(
      tag_str,
      description %>% gsub(" *\n *", " ", .) %>% gsub("\\. *$", "", .),
      "."
    )
  }

  file_type <- spec$info$file_type

  # isTRUE() keeps a missing file_type on the "" fallback path instead of
  # failing inside `if` with a zero-length condition.
  if (isTRUE(file_type == "h5ad")) {
    if (is_empty(spec$slots)) {
      return("")
    }
    spec$slots %>%
      transmute(
        Slot = paste0("`", struct, "[\"", name, "\"]`"),
        Type = paste0("`", type, "`"),
        Description = describe_field(required, description)
      ) %>%
      knitr::kable()
  } else if (isTRUE(file_type %in% c("csv", "tsv", "parquet"))) {
    if (is_empty(spec$columns)) {
      return("")
    }
    spec$columns %>%
      transmute(
        Column = paste0("`", name, "`"),
        Type = paste0("`", type, "`"),
        Description = describe_field(required, description)
      ) %>%
      knitr::kable()
  } else {
    ""
  }
}

list_contains_tibble <- function(li) {
Expand All @@ -97,6 +155,9 @@ read_comp_info <- function(spec_yaml, path) {
spec_yaml$functionality$argument_groups <- NULL

df <- list_as_tibble(spec_yaml$functionality)
if (nrow(df) == 0) {
df <- data.frame(a = 1)[, integer(0)]
}
if (list_contains_tibble(spec_yaml$functionality$info)) {
df <- dplyr::bind_cols(df, list_as_tibble(spec_yaml$functionality$info))
}
Expand Down Expand Up @@ -187,7 +248,7 @@ render_component <- function(spec) {
# path <- "src/datasets/api/file_pca.yaml"
render_file <- function(spec) {
if (is.character(spec)) {
spec <- read_anndata_spec(spec)
spec <- read_file_spec(spec)
}

if (!"label" %in% names(spec$info)) {
Expand Down Expand Up @@ -220,13 +281,13 @@ render_file <- function(spec) {
§Format:
§
§:::{{.small}}
§{paste(format_slots(spec), collapse = '\n')}
§{paste(format_file_format(spec), collapse = '\n')}
§:::
§
§Slot description:
§
§:::{{.small}}
§{paste(format_slots_as_kable(spec), collapse = '\n')}
§{paste(format_file_format_as_kable(spec), collapse = '\n')}
§:::
§
§"), symbol = "§")
Expand Down Expand Up @@ -262,7 +323,7 @@ read_task_api <- function(path) {
project_path = project_path,
parent_path = api_dir
)
files <- map(file_yamls, read_anndata_spec)
files <- map(file_yamls, read_file_spec)
names(files) <- basename(file_yamls) %>% gsub("\\..*$", "", .)
file_info <- map_df(files, "info")
file_slots <- map_df(files, "slots")
Expand Down
30 changes: 19 additions & 11 deletions src/common/process_task_results/get_method_info/script.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,28 @@ outputs <- map(configs, function(config) {
if (length(config$functionality$status) > 0 && config$functionality$status == "disabled") {
return(NULL)
}
info <- config$functionality$info

# prep for viash 0.9.0
build_info <- config$build_info %||% config$info
if ("functionality" %in% names(config)) {
config[names(config$functionality)] <- config$functionality
config[["functionality"]] <- NULL
}

info <- config$info

# add extra info
info$config_path <- gsub(".*openproblems-v2/src/", "src/", config$info$config)
info$task_id <- gsub("/.*", "", config$functionality$namespace)
info$id <- config$functionality$name
info$namespace <- config$functionality$namespace
info$commit_sha <- config$info$git_commit %||% "missing-sha"
info$config_path <- gsub(".*/src/", "src/", build_info$config)
info$task_id <- gsub("/.*", "", config$namespace)
info$id <- config$name
info$namespace <- config$namespace
info$commit_sha <- build_info$git_commit %||% "missing-sha"
info$code_version <- "missing-version"
info$implementation_url <- paste0(
"https://github.com/openproblems-bio/openproblems-v2/tree/",
info$commit_sha, "/",
info$config_path
)
info$implementation_url <- paste0(
build_info$git_remote, "/blob/",
build_info$git_commit, "/",
info$config_path
)

# ↑ this could be used as the new format

Expand Down
23 changes: 15 additions & 8 deletions src/common/process_task_results/get_metric_info/script.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,27 @@ outputs <- map(configs, function(config) {
return(NULL)
}

# prep for viash 0.9.0
build_info <- config$build_info %||% config$info
if ("functionality" %in% names(config)) {
config[names(config$functionality)] <- config$functionality
config[["functionality"]] <- NULL
}

map(
config$functionality$info$metrics,
config$info$metrics,
function(info) {
# add extra info
info$config_path <- gsub(".*openproblems-v2/src/", "src/", config$info$config)
info$task_id <- gsub("/.*", "", config$functionality$namespace)
info$config_path <- gsub(".*/src/", "src/", build_info$config)
info$task_id <- gsub("/.*", "", config$namespace)
info$id <- info$name
info$component_id <- config$functionality$name
info$namespace <- config$functionality$namespace
info$commit_sha <- config$info$git_commit %||% "missing-sha"
info$component_id <- config$name
info$namespace <- config$namespace
info$commit_sha <- build_info$git_commit %||% "missing-sha"
info$code_version <- "missing-version"
info$implementation_url <- paste0(
"https://github.com/openproblems-bio/openproblems-v2/tree/",
info$commit_sha, "/",
build_info$git_remote, "/blob/",
build_info$git_commit, "/",
info$config_path
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ param_list:
input: "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fcite%5FBMMC%5Fprocessed%2Eh5ad%2Egz"
mod1: GEX
mod2: ADT
dataset_name: OpenProblems NeurIPS2021 CITE-Seq
dataset_name: NeurIPS2021 CITE-Seq
dataset_organism: homo_sapiens
dataset_summary: Single-cell CITE-Seq (GEX+ADT) data collected from bone marrow mononuclear cells of 12 healthy human donors.
dataset_description: "Single-cell CITE-Seq data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X 3 prime Single-Cell Gene Expression kit with Feature Barcoding in combination with the BioLegend TotalSeq B Universal Human Panel v1.0. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site."
Expand All @@ -19,7 +19,7 @@ param_list:
input: "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE194nnn/GSE194122/suppl/GSE194122%5Fopenproblems%5Fneurips2021%5Fmultiome%5FBMMC%5Fprocessed%2Eh5ad%2Egz"
mod1: GEX
mod2: ATAC
dataset_name: OpenProblems NeurIPS2021 Multiome
dataset_name: NeurIPS2021 Multiome
dataset_organism: homo_sapiens
dataset_summary: Single-cell Multiome (GEX+ATAC) data collected from bone marrow mononuclear cells of 12 healthy human donors.
dataset_description: "Single-cell Multiome data collected from bone marrow mononuclear cells of 12 healthy human donors using the 10X Multiome Gene Expression and Chromatin Accessibility kit. The dataset was generated to support Multimodal Single-Cell Data Integration Challenge at NeurIPS 2021. Samples were prepared using a standard protocol at four sites. The resulting data was then annotated to identify cell types and remove doublets. The dataset was designed with a nested batch layout such that some donor samples were measured at multiple sites with some donors measured at a single site."
Expand All @@ -35,21 +35,12 @@ output_state: '$id/state.yaml'
publish_dir: s3://openproblems-data/resources/datasets
HERE

cat > /tmp/nextflow.config << HERE
process {
withName:'.*publishStatesProc' {
memory = '16GB'
disk = '100GB'
}
}
HERE

tw launch https://github.com/openproblems-bio/openproblems-v2.git \
--revision main_build \
--pull-latest \
--main-script target/nextflow/datasets/workflows/process_openproblems_neurips2021_bmmc/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file "$params_file" \
--config /tmp/nextflow.config \
--labels openproblems_neurips2021_bmmc,dataset_loader \
--config src/wf_utils/labels_tw.config \
--labels neurips2021,dataset_loader \
26 changes: 26 additions & 0 deletions src/tasks/batch_integration/api/comp_control_method_feature.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Component API spec for batch-integration control methods whose output is a
# batch-corrected feature space (see `summary` / `description` below).
functionality:
namespace: batch_integration/control_methods
info:
type: control_method
subtype: feature
type_info:
label: Control method (feature)
summary: A batch integration feature control method.
description: |
A batch integration control method which outputs a batch-corrected feature space.
arguments:
# Required input file; its definition is pulled in from file_dataset.yaml
# (presumably via the project's `__merge__` include mechanism - TODO confirm).
- name: --input
__merge__: file_dataset.yaml
direction: input
required: true
# Required output file; its definition comes from file_integrated_feature.yaml.
- name: --output
direction: output
__merge__: file_integrated_feature.yaml
required: true
# Test scripts and test data listed as test resources for this spec.
test_resources:
- type: python_script
path: /src/common/comp_tests/check_method_config.py
- type: python_script
path: /src/common/comp_tests/run_and_check_adata.py
- path: /resources_test/batch_integration/pancreas
dest: resources_test/batch_integration/pancreas
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# use method api spec
__merge__: ../../api/comp_control_method_embedding.yaml
__merge__: ../../../api/comp_control_method_embedding.yaml
functionality:
name: no_integration_batch
name: batch_embed
namespace: batch_integration/control_methods/no_integration
info:
label: No integration by Batch
summary: "Cells are embedded by computing PCA independently on each batch"
Expand All @@ -16,11 +17,6 @@ functionality:
platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
setup:
- type: python
pypi:
- scanpy
- numpy
- type: nextflow
directives:
label: [midtime, lowmem, lowcpu]
Loading

0 comments on commit 6933a39

Please sign in to comment.