From 44df70561e3f4144eda6e1a67540f5bed5b35c68 Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Fri, 24 May 2024 15:02:23 -0400 Subject: [PATCH 01/13] support Snakemake v8 --- Snakefile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 3e0005c..6fa4f5c 100644 --- a/Snakefile +++ b/Snakefile @@ -3,10 +3,17 @@ configfile: "config.yaml" import os import pandas as pd -from snakemake.io import load_configfile from snakemake.utils import min_version from sys import stderr +# robust loading of `load_configfile` +try: + ## version >=8.0.0 + from snakemake.common.configfile import load_configfile +except: + ## version <8.0.0 + from snakemake.io import load_configfile + # set minimum Snakemake version min_version("6.0") From bf1c7cf5742ba468f021c9827a54ce57c596df35 Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Fri, 24 May 2024 15:24:33 -0400 Subject: [PATCH 02/13] drop Snakemake bound in docs; tidy --- README.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 828d9ff..986433a 100644 --- a/README.md +++ b/README.md @@ -73,22 +73,23 @@ repository directly tests running examples on the GitHub-hosted runners. ### Conda/Mamba Mode (MacOS or Linux) Snakemake can use Conda to install the needed software. This configuration requires: - - [Snakemake][ref:snakemake] >= 6.0, < 8.0ª + - [Snakemake][ref:snakemake] >= 6.0ª - [Conda](https://docs.conda.io/projects/conda/en/latest/) If Conda is not already installed, we strongly recommend installing -[Mambaforge](https://github.com/conda-forge/miniforge#mambaforge). If Conda was previously -installed, strongly recommend installing Mamba: +[Miniforge](https://github.com/conda-forge/miniforge#miniforge). If Conda was previously +installed, strongly recommend that Conda be upgraded to at least v23.11 which uses the +faster `libmamba` solver: ```bash -conda install -n base -c conda-forge mamba +conda install -n base 'conda>=23.11' ``` ### Singularity Mode (Linux) -Snakemake can use the pre-built scUTRsquant Docker image to provide all additional software. -This configuration requires installing: +Snakemake can use [the pre-built scUTRquant Docker image](https://hub.docker.com/repository/docker/mfansler/scutr-quant) +to provide all additional software. This configuration requires installing: - - [Snakemake][ref:snakemake] >= 6.0, < 8.0ª + - [Snakemake][ref:snakemake] >= 6.0ª - [Singularity](https://singularity.lbl.gov/index.html) @@ -206,7 +207,7 @@ On GitHub runners with 2-3 cores, these examples have typical execution times of On HPC systems with multiple nodes with multiple cores, a large job (e.g., 1-2TB raw data) can process in under an hour when properly configured. -## Full Examples +## Full-Scale Examples The inputs used to process data in [the manuscript][ref:scUTRquant] are also available in the [scUTRquant-inputs repository](https://github.com/Mayrlab/scUTRquant-inputs). These also include individual Snakemake pipelines to download atlas-scale datasets. From 2fb6f196e7d8d48e780d2b2330c1d568479a87e9 Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Fri, 24 May 2024 17:43:37 -0400 Subject: [PATCH 03/13] bump to R 4.3, Bioc 3.18 --- envs/bioconductor-sce.yaml | 15 +++++++-------- envs/rmd-reporting.yaml | 12 ++++++------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/envs/bioconductor-sce.yaml b/envs/bioconductor-sce.yaml index 3d25f1a..4521eda 100644 --- a/envs/bioconductor-sce.yaml +++ b/envs/bioconductor-sce.yaml @@ -4,12 +4,11 @@ channels: - bioconda - defaults dependencies: - - r-base=4.1 - - r-dplyr=1.0 - - r-ggplot2=3.3 + - r-base=4.3 + - r-dplyr=1.1 + - r-ggplot2=3.5 - r-readr=2.1 - - r-stringr=1.4 - - bioconductor-hdf5array=1.22 - - bioconductor-singlecellexperiment=1.16 - - bioconductor-plyranges=1.14 - - openssl=1 + - r-stringr=1.5 + - bioconductor-hdf5array=1.30 + - bioconductor-singlecellexperiment=1.24 + - bioconductor-plyranges=1.22 diff --git a/envs/rmd-reporting.yaml b/envs/rmd-reporting.yaml index 1cd63ae..cfada16 100644 --- a/envs/rmd-reporting.yaml +++ b/envs/rmd-reporting.yaml @@ -3,10 +3,10 @@ channels: - conda-forge - defaults dependencies: - - r-base=4.1 - - r-dplyr=1.0 - - r-ggplot2=3.3 - - r-matrix=1.4 + - r-base=4.3 + - r-dplyr=1.1 + - r-ggplot2=3.5 + - r-matrix=1.6 - r-readr=2.1 - - r-rmarkdown=2.14 - - r-stringr=1.4 + - r-rmarkdown=2.27 + - r-stringr=1.5 From f23b463ccc52c2450a364b7076d070b5402933d9 Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Sun, 26 May 2024 14:22:42 -0400 Subject: [PATCH 04/13] issue #88 --- Snakefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index 6fa4f5c..402ad76 100644 --- a/Snakefile +++ b/Snakefile @@ -1,5 +1,6 @@ container: "docker://mfansler/scutr-quant:0.4.0" configfile: "config.yaml" +SQ_VERSION="0.5.0-alpha" import os import pandas as pd @@ -21,6 +22,8 @@ min_version("6.0") def message(*args, **kwargs): print(*args, file=stderr, **kwargs) +message(f"[INFO] scUTRquant v{SQ_VERSION}") + # make sure the tmp directory exists os.makedirs(config['tmp_dir'], exist_ok=True) @@ -397,10 +400,10 @@ rule report_umis_per_cell: txs="data/kallisto/{target}/{sample_id}/txs.genes.txt", mtx="data/kallisto/{target}/{sample_id}/txs.mtx" params: - min_umis=config['min_umis'] + min_umis=config['min_umis'], + sq_version=SQ_VERSION output: "qc/umi_count/{target}/{sample_id}.umi_count.html" conda: "envs/rmd-reporting.yaml" script: "scripts/report_umi_counts_per_cell.Rmd" - From 9d5dcf5e7771973bc76ce0f38f1500a87b52aa7f Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Sun, 26 May 2024 14:24:30 -0400 Subject: [PATCH 05/13] update rmd (#85) --- scripts/report_umi_counts_per_cell.Rmd | 31 +++++++++++++++++--------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/scripts/report_umi_counts_per_cell.Rmd b/scripts/report_umi_counts_per_cell.Rmd index d5fd4d4..77b997a 100644 --- a/scripts/report_umi_counts_per_cell.Rmd +++ b/scripts/report_umi_counts_per_cell.Rmd @@ -1,18 +1,23 @@ --- -author: "scutr-quant" date: "`r format(Sys.time(), '%d %B %Y')`" -output: - html_document: - toc: true +output: + html_document: + code_folding: show + df_print: paged + theme: cosmo + toc: TRUE toc_depth: 2 + toc_float: true --- ```{r echo=FALSE} title_str <- sprintf("UMI Counts Distribution - %s", snakemake@wildcards$sample_id) +author_str <- sprintf("scUTRquant v%s", snakemake@params$sq_version) ``` --- title: `r title_str` +author: `r author_str` --- # Libraries @@ -26,22 +31,23 @@ library(magrittr) # Load Data ```{r load_data} -cts <- readMM(snakemake@input$mtx) %>% +cts_tx_bc <- readMM(snakemake@input$mtx) %>% as("CsparseMatrix") %>% `dimnames<-`(list( cell_bx=read_lines(snakemake@input$bxs), transcript_name=read_lines(snakemake@input$txs) )) %>% t %>% - { .[, colSums(.) > 0] } # filter zero cells + { .[, colSums(.) > 0] } # filter empty barcodes ``` # Plot UMI Count Distribution -Here we plot the empirical distribution of UMI counts per cell barcode and indicate the specified minimum UMI count (`r snakemake@params$min_umis`) in blue. +Here we plot the empirical distribution of UMI counts per cell barcode and indicate +the specified minimum UMI count (`r snakemake@params$min_umis`) in blue. ```{r plt_umis_by_barcode} -colSums(cts) %>% +colSums(cts_tx_bc) %>% sort(decreasing=TRUE) %>% { tibble(x=seq_along(.), y=.) } %>% ggplot(aes(x,y)) + @@ -58,12 +64,15 @@ colSums(cts) %>% # Runtime Details ## Session Info +
```{r sesh_info, echo=FALSE} sessionInfo() ``` +
## Conda Environment -```{bash comment="", echo=FALSE} +
+```{bash conda_info, comment="", echo=FALSE} if ! command -v conda &> /dev/null then echo "Conda not detected." @@ -72,6 +81,8 @@ then echo "No active Conda environment." else echo "## Conda Environment YAML" - conda env export + PREFIX=$(dirname $(dirname $(which R))) + conda env export -p $PREFIX fi ``` +
From d1c2e3f7873570237ba5632d9aeaaf74805c04d6 Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Mon, 27 May 2024 22:18:03 -0400 Subject: [PATCH 06/13] add support for h5ad output --- Snakefile | 82 +++++++++++++++++++++++++----- config.yaml | 2 + envs/anndata.yaml | 11 ++++ extdata/targets/targets.yaml | 6 +++ scripts/mtxs_to_h5ad_genes.py | 95 +++++++++++++++++++++++++++++++++++ scripts/mtxs_to_h5ad_txs.py | 95 +++++++++++++++++++++++++++++++++++ 6 files changed, 279 insertions(+), 12 deletions(-) create mode 100644 envs/anndata.yaml create mode 100644 scripts/mtxs_to_h5ad_genes.py create mode 100644 scripts/mtxs_to_h5ad_txs.py diff --git a/Snakefile b/Snakefile index 402ad76..7357383 100644 --- a/Snakefile +++ b/Snakefile @@ -57,6 +57,8 @@ if config['cell_annots'] is None: wildcard_constraints: sample_id=config['sample_regex'] +message(config) + # get list of expected outputs def get_outputs(): outputs = [] @@ -64,19 +66,25 @@ def get_outputs(): outputs += expand("qc/umi_count/{target}/{sample_id}.umi_count.html", target=target_list, sample_id=samples.index.values) - if config['use_hdf5']: - outputs += expand("data/sce/{target}/{dataset}.{output_type}.{file_type}", - target=target_list, - dataset=config['dataset_name'], - output_type=config['output_type'], - file_type=['se.rds', 'assays.h5']) - else: - outputs += expand("data/sce/{target}/{dataset}.{output_type}.Rds", - target=target_list, - dataset=config['dataset_name'], - output_type=config['output_type']) + if (not config['output_format']) or ("sce" in config['output_format']): + if config['use_hdf5']: + outputs += expand("data/sce/{target}/{dataset}.{output_type}.{file_type}", + target=target_list, + dataset=config['dataset_name'], + output_type=config['output_type'], + file_type=['se.rds', 'assays.h5']) + else: + outputs += expand("data/sce/{target}/{dataset}.{output_type}.Rds", + target=target_list, + dataset=config['dataset_name'], + output_type=config['output_type']) + if "h5ad" in config['output_format']: + outputs += expand("data/h5ad/{target}/{dataset}.{output_type}.h5ad", + target=target_list, + dataset=config['dataset_name'], + output_type=config['output_type']) return outputs - + rule all: input: get_outputs() @@ -389,6 +397,56 @@ rule mtxs_to_sce_h5_genes: script: "scripts/mtxs_to_sce_genes.R" +rule mtxs_to_h5ad_txs: + input: + bxs=expand("data/kallisto/{target}/{sample_id}/txs.barcodes.txt", + sample_id=samples.index.values, allow_missing=True), + txs=expand("data/kallisto/{target}/{sample_id}/txs.genes.txt", + sample_id=samples.index.values, allow_missing=True), + mtxs=expand("data/kallisto/{target}/{sample_id}/txs.mtx", + sample_id=samples.index.values, allow_missing=True), + gtf=get_target_file('gtf'), + tx_annots=get_target_file('tx_annots_csv'), + cell_annots=config['cell_annots'] + output: + h5ad="data/h5ad/{target}/%s.txs.h5ad" % config['dataset_name'] + params: + genome=lambda wcs: targets[wcs.target]['genome'], + sample_ids=samples.index.values, + min_umis=config['min_umis'], + cell_annots_key=config['cell_annots_key'], + exclude_unannotated_cells=config['exclude_unannotated_cells'] + resources: + mem_mb=16000 + conda: "envs/anndata.yaml" + script: + "scripts/mtxs_to_h5ad_txs.py" + +rule mtxs_to_h5ad_genes: + input: + bxs=expand("data/kallisto/{target}/{sample_id}/genes.barcodes.txt", + sample_id=samples.index.values, allow_missing=True), + genes=expand("data/kallisto/{target}/{sample_id}/genes.genes.txt", + sample_id=samples.index.values, allow_missing=True), + mtxs=expand("data/kallisto/{target}/{sample_id}/genes.mtx", + sample_id=samples.index.values, allow_missing=True), + gtf=get_target_file('gtf'), + gene_annots=get_target_file('gene_annots_csv'), + cell_annots=config['cell_annots'] + output: + h5ad="data/h5ad/{target}/%s.genes.h5ad" % config['dataset_name'] + params: + genome=lambda wcs: targets[wcs.target]['genome'], + sample_ids=samples.index.values, + min_umis=config['min_umis'], + cell_annots_key=config['cell_annots_key'], + exclude_unannotated_cells=config['exclude_unannotated_cells'] + resources: + mem_mb=16000 + conda: "envs/anndata.yaml" + script: + "scripts/mtxs_to_h5ad_genes.py" + ################################################################################ ## Reports diff --git a/config.yaml b/config.yaml index 2675020..0669518 100644 --- a/config.yaml +++ b/config.yaml @@ -7,6 +7,8 @@ target: "utrome_mm10_v2" output_type: - "txs" - "genes" +output_format: + - "sce" tmp_dir: "/tmp" bx_whitelist: null correct_bus: True diff --git a/envs/anndata.yaml b/envs/anndata.yaml new file mode 100644 index 0000000..0774beb --- /dev/null +++ b/envs/anndata.yaml @@ -0,0 +1,11 @@ +name: anndata +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python=3.11 + - anndata=0.10 + - numpy=1.26 + - pandas=2.2 + - scipy=1.13 diff --git a/extdata/targets/targets.yaml b/extdata/targets/targets.yaml index 9fe9407..afd03f9 100644 --- a/extdata/targets/targets.yaml +++ b/extdata/targets/targets.yaml @@ -5,7 +5,9 @@ utrome_mm10_v1: kdx: "adult.utrome.e3.t200.f0.999.w500.kdx" merge_tsv: "adult.utrome.e3.t200.f0.999.w500.merge.tsv" tx_annots: "utrome_txs_annotation.Rds" + tx_annots_csv: null gene_annots: "utrome_genes_annotation.Rds" + gene_annots_csv: null download_script: "download_utrome.sh" utrome_mm10_v2: @@ -15,7 +17,9 @@ utrome_mm10_v2: kdx: "utrome.e30.t5.gc25.pas3.f0.9999.w500.kdx" merge_tsv: "utrome.e30.t5.gc25.pas3.f0.9999.w500.m200.tsv" tx_annots: "utrome_mm10_v2_tx_annots.2024.05.13.Rds" + tx_annots_csv: "utrome_mm10_v2_tx_annots.2024.05.13.csv.gz" gene_annots: "utrome_mm10_v2_gene_annots.2024.05.13.Rds" + gene_annots_csv: "utrome_mm10_v2_gene_annots.2024.05.13.csv.gz" download_script: "download_utrome.sh" utrome_hg38_v1: @@ -25,7 +29,9 @@ utrome_hg38_v1: kdx: "utrome.e30.t5.gc39.pas3.f0.9999.w500.kdx" merge_tsv: "utrome.e30.t5.gc39.pas3.f0.9999.w500.m200.tsv" tx_annots: "utrome_hg38_v1_tx_annots.2024.05.13.Rds" + tx_annots_csv: "utrome_hg38_v1_tx_annots.2024.05.13.csv.gz" gene_annots: "utrome_hg38_v1_gene_annots.2024.05.13.Rds" + gene_annots_csv: "utrome_hg38_v1_gene_annots.2024.05.13.csv.gz" download_script: "download_utrome.sh" ## example custom target using GENCODE diff --git a/scripts/mtxs_to_h5ad_genes.py b/scripts/mtxs_to_h5ad_genes.py new file mode 100644 index 0000000..26023e4 --- /dev/null +++ b/scripts/mtxs_to_h5ad_genes.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python + +import os +import pandas as pd +from scipy.io import mmread +import numpy as np +from anndata import AnnData +import anndata as ad +from sys import stderr + +# print to stderr +def message(*args, **kwargs): + print(*args, file=stderr, **kwargs) + +# Helper function to load and process MTX files +def load_mtx_to_anndata(mtx_file, bx_file, gene_file, sample_id, min_umis, + cell_annots_df, cell_annots_key, exclude_unannotated_cells): + message("[INFO] reading counts...") + mtx = mmread(mtx_file).tocsr() + + message("[INFO] reading barcodes...") + bxs = pd.read_csv(bx_file, header=None, index_col=None).iloc[:, 0].values + cell_ids = [f"{sample_id}_{bx}" for bx in bxs] + + message("[INFO] reading genes...") + genes = pd.read_csv(gene_file, header=None, index_col=None).iloc[:, 0].values + + message("[INFO] creating object...") + adata = AnnData(X=mtx, + obs=pd.DataFrame(index=cell_ids), + var=pd.DataFrame(index=genes)) + + # Filter unannotated cells + if exclude_unannotated_cells: + message("[INFO] filtering unannotated cells...") + annotated_cells = cell_annots_df[cell_annots_key].values + adata = adata[adata.obs.index.isin(annotated_cells)] + + message(f"[INFO] filtering barcodes fewer than {min_umis} UMIs...") + adata = adata[adata.obs.index[adata.X.sum(axis=1).A1 >= min_umis]] + + return adata.copy() + +# Main processing function +def process_mtx_files(snakemake): + mtx_files = snakemake.input.mtxs + bx_files = snakemake.input.bxs + gene_files = snakemake.input.genes + sample_ids = snakemake.params.sample_ids + gene_annots_file = snakemake.input.gene_annots + cell_annots_file = snakemake.input.cell_annots + cell_annots_key = snakemake.params.cell_annots_key + min_umis = int(snakemake.params.min_umis) + exclude_unannotated_cells = snakemake.params.exclude_unannotated_cells + output_file = snakemake.output.h5ad + + + if cell_annots_file: + cell_annots_df = pd.read_csv(cell_annots_file) + elif exclude_unannotated_cells: + message("[Warning] The `exclude_unannotated_cells` option is ignored because no `cell_annots` were provided!") + exclude_unannotated_cells = False + + message("[INFO] Beginning sample loading...") + adatas = [] + for mtx, bx, gene, sample_id in zip(mtx_files, bx_files, gene_files, sample_ids): + message(f"[INFO] Loading sample {sample_id}...") + adata = load_mtx_to_anndata(mtx, bx, gene, sample_id, min_umis, cell_annots_df, cell_annots_key, exclude_unannotated_cells) + adatas.append(adata) + message("[INFO] Done loading.") + + message("[INFO] Concatenating AnnData objects...") + combined_adata = ad.concat(adatas, axis=0) + + if gene_annots_file: + message("[INFO] Loading gene annotations...") + gene_annots_df = pd.read_csv(gene_annots_file).set_index("gene_id", drop=False) + message("[INFO] Attaching gene annotations...") + combined_adata.var = combined_adata.var.join(gene_annots_df, how='left') + + if cell_annots_file: + message("[INFO] Attaching cell annotations...") + combined_adata.obs = combined_adata.obs.join(cell_annots_df.set_index(cell_annots_key, drop=False), how='left') + + message(f"[INFO] Saving AnnData to {output_file}...") + combined_adata.write(output_file) + +# Check for snakemake object and process +try: + snakemake # type: ignore + process_mtx_files(snakemake) # type: ignore + message("[INFO] Done.") +except NameError: + message("[WARNING] This script is intended to be executed as part of a Snakemake workflow and requires the 'snakemake' object.") + message("[INFO] Nothing done.") diff --git a/scripts/mtxs_to_h5ad_txs.py b/scripts/mtxs_to_h5ad_txs.py new file mode 100644 index 0000000..f25ba71 --- /dev/null +++ b/scripts/mtxs_to_h5ad_txs.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python + +import os +import pandas as pd +from scipy.io import mmread +import numpy as np +from anndata import AnnData +import anndata as ad +from sys import stderr + +# print to stderr +def message(*args, **kwargs): + print(*args, file=stderr, **kwargs) + +# Helper function to load and process MTX files +def load_mtx_to_anndata(mtx_file, bx_file, tx_file, sample_id, min_umis, + cell_annots_df, cell_annots_key, exclude_unannotated_cells): + message("[INFO] reading counts...") + mtx = mmread(mtx_file).tocsr() + + message("[INFO] reading barcodes...") + bxs = pd.read_csv(bx_file, header=None, index_col=None).iloc[:, 0].values + cell_ids = [f"{sample_id}_{bx}" for bx in bxs] + + message("[INFO] reading transcripts...") + txs = pd.read_csv(tx_file, header=None, index_col=None).iloc[:, 0].values + + message("[INFO] creating object...") + adata = AnnData(X=mtx, + obs=pd.DataFrame(index=cell_ids), + var=pd.DataFrame(index=txs)) + + # Filter unannotated cells + if exclude_unannotated_cells: + message("[INFO] filtering unannotated cells...") + annotated_cells = cell_annots_df[cell_annots_key].values + adata = adata[adata.obs.index.isin(annotated_cells)] + + message(f"[INFO] filtering barcodes fewer than {min_umis} UMIs...") + adata = adata[adata.obs.index[adata.X.sum(axis=1).A1 >= min_umis]] + + return adata.copy() + +# Main processing function +def process_mtx_files(snakemake): + mtx_files = snakemake.input.mtxs + bx_files = snakemake.input.bxs + tx_files = snakemake.input.txs + sample_ids = snakemake.params.sample_ids + tx_annots_file = snakemake.input.tx_annots + cell_annots_file = snakemake.input.cell_annots + cell_annots_key = snakemake.params.cell_annots_key + min_umis = int(snakemake.params.min_umis) + exclude_unannotated_cells = snakemake.params.exclude_unannotated_cells + output_file = snakemake.output.h5ad + + + if cell_annots_file: + cell_annots_df = pd.read_csv(cell_annots_file) + elif exclude_unannotated_cells: + message("[Warning] The `exclude_unannotated_cells` option is ignored because no `cell_annots` were provided!") + exclude_unannotated_cells = False + + message("[INFO] Beginning sample loading...") + adatas = [] + for mtx, bx, tx, sample_id in zip(mtx_files, bx_files, tx_files, sample_ids): + message(f"[INFO] Loading sample {sample_id}...") + adata = load_mtx_to_anndata(mtx, bx, tx, sample_id, min_umis, cell_annots_df, cell_annots_key, exclude_unannotated_cells) + adatas.append(adata) + message("[INFO] Done loading.") + + message("[INFO] Concatenating AnnData objects...") + combined_adata = ad.concat(adatas, axis=0) + + if tx_annots_file: + message("[INFO] Loading transcript annotations...") + tx_annots_df = pd.read_csv(tx_annots_file).set_index("transcript_id", drop=False) + message("[INFO] Attaching transcript annotations...") + combined_adata.var = combined_adata.var.join(tx_annots_df, how='left') + + if cell_annots_file: + message("[INFO] Attaching cell annotations...") + combined_adata.obs = combined_adata.obs.join(cell_annots_df.set_index(cell_annots_key, drop=False), how='left') + + message(f"[INFO] Saving AnnData to {output_file}...") + combined_adata.write(output_file) + +# Check for snakemake object and process +try: + snakemake # type: ignore + process_mtx_files(snakemake) # type: ignore + message("[INFO] Done.") +except NameError: + message("[WARNING] This script is intended to be executed as part of a Snakemake workflow and requires the 'snakemake' object.") + message("[INFO] Nothing done.") From 774c2973230cab79d7dff88d592a5e7f20686884 Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Mon, 27 May 2024 22:41:47 -0400 Subject: [PATCH 07/13] h5ad docs --- README.md | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 986433a..6e7dc12 100644 --- a/README.md +++ b/README.md @@ -44,20 +44,31 @@ determined by simulations. Please see [our accompanying manuscript][ref:scUTRquant] for more details. ## Outputs +### SingleCellExperiment object (*default*) The primary output of the pipeline is a Bioconductor `SingleCellExperiment` object. The `counts` in the object is a sparse `Matrix` of 3' UTR isoform counts; the `rowRanges` is a `GenomicRanges` of the 3' UTR isoforms; the `rowData` is a `DataFrame` with additional information about 3' UTR isoforms; and the `colData` is a `DataFrame` populated with sample metadata and optional user-provide cell annotations. +### H5AD AnnData object (*experimental*) +scUTRquant v0.5.0 adds support for [AnnData objects](https://anndata.readthedocs.io/en/stable/index.html), +making it easier for users who prefer Python and working with [the `scverse` ecosystem](https://scverse.org). +Similar annotations will be attached in the `obs` (cell) and `var` (3'UTR isoform) tables. +However, no analogous `rowRanges` is attached. + +The output format can be controlled with the `output_format` configuration option, +with `"h5ad"` and "`sce`" being currently supported options. + +### Reporting To assist users in quality control, the pipeline additionally generates HTML reports for each sample. +### Additional Files The pipeline is configured to retain intermediate files, such as BUS and MTX files. Advanced users can readily customize the pipeline to only generate the files they -require. For example, users who prefer to work with alternative scRNA-seq data structures, -such as those used in Scanpy or Seurat, may wish to terminate the pipeline at MTX -generation. +require. For example, users who prefer to work with alternative scRNA-seq data structures +may wish to terminate the pipeline at MTX generation. # Setup @@ -237,6 +248,7 @@ pipeline. The following keys are expected: - `cell_annots`: (optional) CSV file with a key column that matches the `_` format - `cell_annots_key`: specifies the name of the key column in the `cell_annots` file; default is `cell_id` - `exclude_unannotated_cells`: boolean indicating whether unannotated cells should be excluded from the final output; default is `False` + - `output_format`: a list of formats, `"sce"` (*default*) and/or `"h5ad"` (*experimental*); `null` will end processing at MTX files. ### Default Values @@ -271,7 +283,9 @@ The `extdata/targets.yaml` defines the targets available to pseudoalign to. The - `kdx`: Kallisto index for UTRome - `merge`: TSV for merging features (isoforms) - `tx_annots`: (optional) RDS file containing Bioconductor DataFrame object with annotations for transcripts + - `tx_annots_csv`: (optional) CSV file with annotations for transcripts (used for AnnData) - `gene_annots`: (optional) RDS file containing Bioconductor DataFrame object with annotations for genes + - `gene_annots_csv`: (optional) CSV file with annotations for genes (used for AnnData) # Customization From ff0fb4854aa8e7cda77e73294ac800145e4192fc Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Mon, 27 May 2024 23:18:08 -0400 Subject: [PATCH 08/13] cleanup debug out --- Snakefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Snakefile b/Snakefile index 7357383..b40a1e8 100644 --- a/Snakefile +++ b/Snakefile @@ -57,8 +57,6 @@ if config['cell_annots'] is None: wildcard_constraints: sample_id=config['sample_regex'] -message(config) - # get list of expected outputs def get_outputs(): outputs = [] @@ -447,7 +445,6 @@ rule mtxs_to_h5ad_genes: script: "scripts/mtxs_to_h5ad_genes.py" - ################################################################################ ## Reports ################################################################################ From 8621d961f03f3691e59f4960531eb8963df9ebf6 Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Mon, 27 May 2024 23:54:34 -0400 Subject: [PATCH 09/13] detect csv annots --- Snakefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Snakefile b/Snakefile index b40a1e8..915ed2b 100644 --- a/Snakefile +++ b/Snakefile @@ -98,8 +98,10 @@ for target_id, target in targets.items(): target_path = target['path'] download_script = target_path + target['download_script'] - - FILE_KEYS = ['gtf', 'kdx', 'merge_tsv', 'tx_annots', 'gene_annots'] + + FILE_KEYS = ['gtf', 'kdx', 'merge_tsv', + 'tx_annots', 'gene_annots', + 'tx_annots_csv', 'gene_annots_csv'] target_files = [target_path + target[k] for k in FILE_KEYS] rule: From cafcc73ecfa388e223e56794382f09d399329049 Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Tue, 28 May 2024 00:19:23 -0400 Subject: [PATCH 10/13] improved null support --- Snakefile | 4 ++-- extdata/targets/targets.yaml | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Snakefile b/Snakefile index 915ed2b..4679764 100644 --- a/Snakefile +++ b/Snakefile @@ -100,9 +100,9 @@ for target_id, target in targets.items(): download_script = target_path + target['download_script'] FILE_KEYS = ['gtf', 'kdx', 'merge_tsv', - 'tx_annots', 'gene_annots', + 'tx_annots', 'gene_annots', 'tx_annots_csv', 'gene_annots_csv'] - target_files = [target_path + target[k] for k in FILE_KEYS] + target_files = [target_path + target[k] for k in FILE_KEYS if target[k] is not None] rule: name: f"download_{target_id}" diff --git a/extdata/targets/targets.yaml b/extdata/targets/targets.yaml index afd03f9..c268e2f 100644 --- a/extdata/targets/targets.yaml +++ b/extdata/targets/targets.yaml @@ -1,5 +1,5 @@ utrome_mm10_v1: - path: "extdata/targets/utrome_mm10_v1/" # files should be relative to this + path: "extdata/targets/utrome_mm10_v1/" genome: "mm10" gtf: "adult.utrome.e3.t200.f0.999.w500.gtf" kdx: "adult.utrome.e3.t200.f0.999.w500.kdx" @@ -37,10 +37,12 @@ utrome_hg38_v1: ## example custom target using GENCODE ## see https://github.com/Mayrlab/txcutr-db # gencode_v38_pc_w500: -# path: "extdata/targets/gencode_v38_pc_w500/" +# path: "extdata/targets/gencode_v38_pc_w500/" # files must be relative to this # genome: "hg38" # gtf: "gencode.v38.annotation.pc.txcutr.w500.gtf" # kdx: "gencode.v38.annotation.pc.txcutr.w500.kdx" # merge_tsv: "gencode.v38.annotation.pc.txcutr.w500.merge.tsv" # tx_annots: null +# tx_annots_csv: null # gene_annots: null +# gene_annots_csv: null From 57b415435a861d0725ebb4b960f74743b1d87837 Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Tue, 28 May 2024 00:49:14 -0400 Subject: [PATCH 11/13] use gzip --- scripts/mtxs_to_h5ad_genes.py | 2 +- scripts/mtxs_to_h5ad_txs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/mtxs_to_h5ad_genes.py b/scripts/mtxs_to_h5ad_genes.py index 26023e4..3e9c26a 100644 --- a/scripts/mtxs_to_h5ad_genes.py +++ b/scripts/mtxs_to_h5ad_genes.py @@ -83,7 +83,7 @@ def process_mtx_files(snakemake): combined_adata.obs = combined_adata.obs.join(cell_annots_df.set_index(cell_annots_key, drop=False), how='left') message(f"[INFO] Saving AnnData to {output_file}...") - combined_adata.write(output_file) + combined_adata.write(output_file, compression="gzip") # Check for snakemake object and process try: diff --git a/scripts/mtxs_to_h5ad_txs.py b/scripts/mtxs_to_h5ad_txs.py index f25ba71..806c18c 100644 --- a/scripts/mtxs_to_h5ad_txs.py +++ b/scripts/mtxs_to_h5ad_txs.py @@ -83,7 +83,7 @@ def process_mtx_files(snakemake): combined_adata.obs = combined_adata.obs.join(cell_annots_df.set_index(cell_annots_key, drop=False), how='left') message(f"[INFO] Saving AnnData to {output_file}...") - combined_adata.write(output_file) + combined_adata.write(output_file, compression="gzip") # Check for snakemake object and process try: From 0581cac7b1e5b49e10557ddb90e9416e445ff041 Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Tue, 28 May 2024 01:47:13 -0400 Subject: [PATCH 12/13] refresh docker --- Snakefile | 2 +- docker/scutr-quant.yaml | 26 ++++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/Snakefile b/Snakefile index 4679764..46b778c 100644 --- a/Snakefile +++ b/Snakefile @@ -1,4 +1,4 @@ -container: "docker://mfansler/scutr-quant:0.4.0" +container: "docker://mfansler/scutr-quant:0.5.0" configfile: "config.yaml" SQ_VERSION="0.5.0-alpha" diff --git a/docker/scutr-quant.yaml b/docker/scutr-quant.yaml index 1e6a274..b9c8ad1 100644 --- a/docker/scutr-quant.yaml +++ b/docker/scutr-quant.yaml @@ -15,14 +15,20 @@ dependencies: - bustools=0.40.0 ## sce + reporting - - r-base=4.1 - - r-dplyr=1.0 - - r-ggplot2=3.3 - - r-matrix=1.4 + - r-base=4.3 + - r-dplyr=1.1 + - r-ggplot2=3.5 + - r-matrix=1.6 - r-readr=2.1 - - r-rmarkdown=2.14 - - r-stringr=1.4 - - bioconductor-hdf5array=1.22 - - bioconductor-singlecellexperiment=1.16 - - bioconductor-plyranges=1.14 - - openssl=1 + - r-rmarkdown=2.27 + - r-stringr=1.5 + - bioconductor-hdf5array=1.30 + - bioconductor-singlecellexperiment=1.24 + - bioconductor-plyranges=1.22 + + ## anndata + - python=3.11 + - anndata=0.10 + - numpy=1.26 + - pandas=2.2 + - scipy=1.13 From a29003a08e030dc1ca529680a19b7da010023f4e Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Tue, 28 May 2024 01:57:24 -0400 Subject: [PATCH 13/13] bump to release version --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 46b778c..63f6af2 100644 --- a/Snakefile +++ b/Snakefile @@ -1,6 +1,6 @@ container: "docker://mfansler/scutr-quant:0.5.0" configfile: "config.yaml" -SQ_VERSION="0.5.0-alpha" +SQ_VERSION="0.5.0" import os import pandas as pd