From 44df70561e3f4144eda6e1a67540f5bed5b35c68 Mon Sep 17 00:00:00 2001
From: Mervin Fansler <mmfansler@gmail.com>
Date: Fri, 24 May 2024 15:02:23 -0400
Subject: [PATCH 01/13] support Snakemake v8

---
 Snakefile | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/Snakefile b/Snakefile
index 3e0005c..6fa4f5c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -3,10 +3,17 @@ configfile: "config.yaml"
 
 import os
 import pandas as pd
-from snakemake.io import load_configfile
 from snakemake.utils import min_version
 from sys import stderr
 
+# robust loading of `load_configfile`
+try:
+    ## version >=8.0.0
+    from snakemake.common.configfile import load_configfile
+except:
+    ## version <8.0.0
+    from snakemake.io import load_configfile
+
 # set minimum Snakemake version
 min_version("6.0")
 

From bf1c7cf5742ba468f021c9827a54ce57c596df35 Mon Sep 17 00:00:00 2001
From: Mervin Fansler <mmfansler@gmail.com>
Date: Fri, 24 May 2024 15:24:33 -0400
Subject: [PATCH 02/13] drop Snakemake bound in docs; tidy

---
 README.md | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 828d9ff..986433a 100644
--- a/README.md
+++ b/README.md
@@ -73,22 +73,23 @@ repository directly tests running examples on the GitHub-hosted runners.
 ### Conda/Mamba Mode (MacOS or Linux)
 Snakemake can use Conda to install the needed software. This configuration requires:
 
- - [Snakemake][ref:snakemake] >= 6.0, < 8.0<sup>ª</sup>
+ - [Snakemake][ref:snakemake] >= 6.0<sup>ª</sup>
  - [Conda](https://docs.conda.io/projects/conda/en/latest/)
 
 If Conda is not already installed, we strongly recommend installing 
-[Mambaforge](https://github.com/conda-forge/miniforge#mambaforge). If Conda was previously
-installed, strongly recommend installing Mamba:
+[Miniforge](https://github.com/conda-forge/miniforge#miniforge). If Conda was previously
+installed, strongly recommend that Conda be upgraded to at least v23.11 which uses the 
+faster `libmamba` solver:
 
 ```bash
-conda install -n base -c conda-forge mamba
+conda install -n base 'conda>=23.11'
 ```
  
 ### Singularity Mode (Linux)
-Snakemake can use the pre-built scUTRsquant Docker image to provide all additional software.
-This configuration requires installing:
+Snakemake can use [the pre-built scUTRquant Docker image](https://hub.docker.com/repository/docker/mfansler/scutr-quant) 
+to provide all additional software. This configuration requires installing:
 
- - [Snakemake][ref:snakemake] >= 6.0, < 8.0<sup>ª</sup>
+ - [Snakemake][ref:snakemake] >= 6.0<sup>ª</sup>
  - [Singularity](https://singularity.lbl.gov/index.html)
 
 
@@ -206,7 +207,7 @@ On GitHub runners with 2-3 cores, these examples have typical execution times of
 On HPC systems with multiple nodes with multiple cores, a large job (e.g., 1-2TB raw data) 
 can process in under an hour when properly configured.
 
-## Full Examples
+## Full-Scale Examples
 The inputs used to process data in [the manuscript][ref:scUTRquant] are also available in the 
 [scUTRquant-inputs repository](https://github.com/Mayrlab/scUTRquant-inputs).
 These also include individual Snakemake pipelines to download atlas-scale datasets. 

From 2fb6f196e7d8d48e780d2b2330c1d568479a87e9 Mon Sep 17 00:00:00 2001
From: Mervin Fansler <mmfansler@gmail.com>
Date: Fri, 24 May 2024 17:43:37 -0400
Subject: [PATCH 03/13] bump to R 4.3, Bioc 3.18

---
 envs/bioconductor-sce.yaml | 15 +++++++--------
 envs/rmd-reporting.yaml    | 12 ++++++------
 2 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/envs/bioconductor-sce.yaml b/envs/bioconductor-sce.yaml
index 3d25f1a..4521eda 100644
--- a/envs/bioconductor-sce.yaml
+++ b/envs/bioconductor-sce.yaml
@@ -4,12 +4,11 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - r-base=4.1
-  - r-dplyr=1.0
-  - r-ggplot2=3.3
+  - r-base=4.3
+  - r-dplyr=1.1
+  - r-ggplot2=3.5
   - r-readr=2.1
-  - r-stringr=1.4
-  - bioconductor-hdf5array=1.22
-  - bioconductor-singlecellexperiment=1.16
-  - bioconductor-plyranges=1.14
-  - openssl=1
+  - r-stringr=1.5
+  - bioconductor-hdf5array=1.30
+  - bioconductor-singlecellexperiment=1.24
+  - bioconductor-plyranges=1.22
diff --git a/envs/rmd-reporting.yaml b/envs/rmd-reporting.yaml
index 1cd63ae..cfada16 100644
--- a/envs/rmd-reporting.yaml
+++ b/envs/rmd-reporting.yaml
@@ -3,10 +3,10 @@ channels:
   - conda-forge
   - defaults
 dependencies:
-  - r-base=4.1
-  - r-dplyr=1.0
-  - r-ggplot2=3.3
-  - r-matrix=1.4
+  - r-base=4.3
+  - r-dplyr=1.1
+  - r-ggplot2=3.5
+  - r-matrix=1.6
   - r-readr=2.1
-  - r-rmarkdown=2.14
-  - r-stringr=1.4
+  - r-rmarkdown=2.27
+  - r-stringr=1.5

From f23b463ccc52c2450a364b7076d070b5402933d9 Mon Sep 17 00:00:00 2001
From: Mervin Fansler <mmfansler@gmail.com>
Date: Sun, 26 May 2024 14:22:42 -0400
Subject: [PATCH 04/13] issue #88

---
 Snakefile | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 6fa4f5c..402ad76 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,5 +1,6 @@
 container: "docker://mfansler/scutr-quant:0.4.0"
 configfile: "config.yaml"
+SQ_VERSION="0.5.0-alpha"
 
 import os
 import pandas as pd
@@ -21,6 +22,8 @@ min_version("6.0")
 def message(*args, **kwargs):
     print(*args, file=stderr, **kwargs)
 
+message(f"[INFO] scUTRquant v{SQ_VERSION}")
+
 # make sure the tmp directory exists
 os.makedirs(config['tmp_dir'], exist_ok=True)
 
@@ -397,10 +400,10 @@ rule report_umis_per_cell:
         txs="data/kallisto/{target}/{sample_id}/txs.genes.txt",
         mtx="data/kallisto/{target}/{sample_id}/txs.mtx"
     params:
-        min_umis=config['min_umis']
+        min_umis=config['min_umis'],
+        sq_version=SQ_VERSION
     output:
         "qc/umi_count/{target}/{sample_id}.umi_count.html"
     conda: "envs/rmd-reporting.yaml"
     script:
         "scripts/report_umi_counts_per_cell.Rmd"
-

From 9d5dcf5e7771973bc76ce0f38f1500a87b52aa7f Mon Sep 17 00:00:00 2001
From: Mervin Fansler <mmfansler@gmail.com>
Date: Sun, 26 May 2024 14:24:30 -0400
Subject: [PATCH 05/13] update rmd (#85)

---
 scripts/report_umi_counts_per_cell.Rmd | 31 +++++++++++++++++---------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/scripts/report_umi_counts_per_cell.Rmd b/scripts/report_umi_counts_per_cell.Rmd
index d5fd4d4..77b997a 100644
--- a/scripts/report_umi_counts_per_cell.Rmd
+++ b/scripts/report_umi_counts_per_cell.Rmd
@@ -1,18 +1,23 @@
 ---
-author: "scutr-quant"
 date: "`r format(Sys.time(), '%d %B %Y')`"
-output:
-  html_document:
-    toc: true
+output: 
+  html_document: 
+    code_folding: show
+    df_print: paged
+    theme: cosmo
+    toc: TRUE
     toc_depth: 2
+    toc_float: true
 ---
 
 ```{r echo=FALSE}
 title_str <- sprintf("UMI Counts Distribution - %s", snakemake@wildcards$sample_id)
+author_str <- sprintf("scUTRquant v%s", snakemake@params$sq_version)
 ```
 
 ---
 title: `r title_str`
+author: `r author_str`
 ---
 
 # Libraries
@@ -26,22 +31,23 @@ library(magrittr)
 
 # Load Data
 ```{r load_data}
-cts <- readMM(snakemake@input$mtx) %>%
+cts_tx_bc <- readMM(snakemake@input$mtx) %>%
   as("CsparseMatrix") %>%
   `dimnames<-`(list(
     cell_bx=read_lines(snakemake@input$bxs),
     transcript_name=read_lines(snakemake@input$txs)
   )) %>%
   t %>%
-  { .[, colSums(.) > 0] } # filter zero cells
+  { .[, colSums(.) > 0] } # filter empty barcodes
 ```
 
 # Plot UMI Count Distribution
 
-Here we plot the empirical distribution of UMI counts per cell barcode and indicate the specified minimum UMI count (`r snakemake@params$min_umis`) in blue.
+Here we plot the empirical distribution of UMI counts per cell barcode and indicate 
+the specified minimum UMI count (`r snakemake@params$min_umis`) in blue.
 
 ```{r plt_umis_by_barcode}
-colSums(cts) %>%
+colSums(cts_tx_bc) %>%
   sort(decreasing=TRUE) %>%
   { tibble(x=seq_along(.), y=.) } %>%
   ggplot(aes(x,y)) +
@@ -58,12 +64,15 @@ colSums(cts) %>%
 
 # Runtime Details
 ## Session Info
+<details>
 ```{r sesh_info, echo=FALSE}
 sessionInfo()
 ```
+</details>
 
 ## Conda Environment
-```{bash comment="", echo=FALSE}
+<details>
+```{bash conda_info, comment="", echo=FALSE}
 if ! command -v conda &> /dev/null
 then
   echo "Conda not detected."
@@ -72,6 +81,8 @@ then
   echo "No active Conda environment."
 else
   echo "## Conda Environment YAML"
-  conda env export
+  PREFIX=$(dirname $(dirname $(which R)))
+  conda env export -p $PREFIX
 fi
 ```
+</details>

From d1c2e3f7873570237ba5632d9aeaaf74805c04d6 Mon Sep 17 00:00:00 2001
From: Mervin Fansler <mmfansler@gmail.com>
Date: Mon, 27 May 2024 22:18:03 -0400
Subject: [PATCH 06/13] add support for h5ad output

---
 Snakefile                     | 82 +++++++++++++++++++++++++-----
 config.yaml                   |  2 +
 envs/anndata.yaml             | 11 ++++
 extdata/targets/targets.yaml  |  6 +++
 scripts/mtxs_to_h5ad_genes.py | 95 +++++++++++++++++++++++++++++++++++
 scripts/mtxs_to_h5ad_txs.py   | 95 +++++++++++++++++++++++++++++++++++
 6 files changed, 279 insertions(+), 12 deletions(-)
 create mode 100644 envs/anndata.yaml
 create mode 100644 scripts/mtxs_to_h5ad_genes.py
 create mode 100644 scripts/mtxs_to_h5ad_txs.py

diff --git a/Snakefile b/Snakefile
index 402ad76..7357383 100644
--- a/Snakefile
+++ b/Snakefile
@@ -57,6 +57,8 @@ if config['cell_annots'] is None:
 wildcard_constraints:
     sample_id=config['sample_regex']
 
+message(config)
+
 # get list of expected outputs
 def get_outputs():
     outputs = []
@@ -64,19 +66,25 @@ def get_outputs():
         outputs += expand("qc/umi_count/{target}/{sample_id}.umi_count.html",
                           target=target_list,
                           sample_id=samples.index.values)
-    if config['use_hdf5']:
-        outputs += expand("data/sce/{target}/{dataset}.{output_type}.{file_type}",
-                          target=target_list,
-                          dataset=config['dataset_name'],
-                          output_type=config['output_type'],
-                          file_type=['se.rds', 'assays.h5'])
-    else:
-        outputs += expand("data/sce/{target}/{dataset}.{output_type}.Rds",
-                          target=target_list,
-                          dataset=config['dataset_name'],
-                          output_type=config['output_type'])
+    if (not config['output_format']) or ("sce" in config['output_format']):
+        if config['use_hdf5']:
+            outputs += expand("data/sce/{target}/{dataset}.{output_type}.{file_type}",
+                            target=target_list,
+                            dataset=config['dataset_name'],
+                            output_type=config['output_type'],
+                            file_type=['se.rds', 'assays.h5'])
+        else:
+            outputs += expand("data/sce/{target}/{dataset}.{output_type}.Rds",
+                            target=target_list,
+                            dataset=config['dataset_name'],
+                            output_type=config['output_type'])
+    if "h5ad" in config['output_format']:
+        outputs += expand("data/h5ad/{target}/{dataset}.{output_type}.h5ad",
+                            target=target_list,
+                            dataset=config['dataset_name'],
+                            output_type=config['output_type'])
     return outputs
-    
+
 rule all:
     input: get_outputs()
 
@@ -389,6 +397,56 @@ rule mtxs_to_sce_h5_genes:
     script:
         "scripts/mtxs_to_sce_genes.R"
 
+rule mtxs_to_h5ad_txs:
+    input:
+        bxs=expand("data/kallisto/{target}/{sample_id}/txs.barcodes.txt",
+                   sample_id=samples.index.values, allow_missing=True),
+        txs=expand("data/kallisto/{target}/{sample_id}/txs.genes.txt",
+                   sample_id=samples.index.values, allow_missing=True),
+        mtxs=expand("data/kallisto/{target}/{sample_id}/txs.mtx",
+                    sample_id=samples.index.values, allow_missing=True),
+        gtf=get_target_file('gtf'),
+        tx_annots=get_target_file('tx_annots_csv'),
+        cell_annots=config['cell_annots']
+    output:
+        h5ad="data/h5ad/{target}/%s.txs.h5ad" % config['dataset_name']
+    params:
+        genome=lambda wcs: targets[wcs.target]['genome'],
+        sample_ids=samples.index.values,
+        min_umis=config['min_umis'],
+        cell_annots_key=config['cell_annots_key'],
+        exclude_unannotated_cells=config['exclude_unannotated_cells']
+    resources:
+        mem_mb=16000
+    conda: "envs/anndata.yaml"
+    script:
+        "scripts/mtxs_to_h5ad_txs.py"
+
+rule mtxs_to_h5ad_genes:
+    input:
+        bxs=expand("data/kallisto/{target}/{sample_id}/genes.barcodes.txt",
+                   sample_id=samples.index.values, allow_missing=True),
+        genes=expand("data/kallisto/{target}/{sample_id}/genes.genes.txt",
+                   sample_id=samples.index.values, allow_missing=True),
+        mtxs=expand("data/kallisto/{target}/{sample_id}/genes.mtx",
+                    sample_id=samples.index.values, allow_missing=True),
+        gtf=get_target_file('gtf'),
+        gene_annots=get_target_file('gene_annots_csv'),
+        cell_annots=config['cell_annots']
+    output:
+        h5ad="data/h5ad/{target}/%s.genes.h5ad" % config['dataset_name']
+    params:
+        genome=lambda wcs: targets[wcs.target]['genome'],
+        sample_ids=samples.index.values,
+        min_umis=config['min_umis'],
+        cell_annots_key=config['cell_annots_key'],
+        exclude_unannotated_cells=config['exclude_unannotated_cells']
+    resources:
+        mem_mb=16000
+    conda: "envs/anndata.yaml"
+    script:
+        "scripts/mtxs_to_h5ad_genes.py"
+
 
 ################################################################################
 ## Reports
diff --git a/config.yaml b/config.yaml
index 2675020..0669518 100644
--- a/config.yaml
+++ b/config.yaml
@@ -7,6 +7,8 @@ target: "utrome_mm10_v2"
 output_type:
   - "txs"
   - "genes"
+output_format:
+  - "sce"
 tmp_dir: "/tmp"
 bx_whitelist: null
 correct_bus: True
diff --git a/envs/anndata.yaml b/envs/anndata.yaml
new file mode 100644
index 0000000..0774beb
--- /dev/null
+++ b/envs/anndata.yaml
@@ -0,0 +1,11 @@
+name: anndata
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - python=3.11
+  - anndata=0.10
+  - numpy=1.26
+  - pandas=2.2
+  - scipy=1.13
diff --git a/extdata/targets/targets.yaml b/extdata/targets/targets.yaml
index 9fe9407..afd03f9 100644
--- a/extdata/targets/targets.yaml
+++ b/extdata/targets/targets.yaml
@@ -5,7 +5,9 @@ utrome_mm10_v1:
   kdx: "adult.utrome.e3.t200.f0.999.w500.kdx"
   merge_tsv: "adult.utrome.e3.t200.f0.999.w500.merge.tsv"
   tx_annots: "utrome_txs_annotation.Rds"
+  tx_annots_csv: null
   gene_annots: "utrome_genes_annotation.Rds"
+  gene_annots_csv: null
   download_script: "download_utrome.sh"
 
 utrome_mm10_v2:
@@ -15,7 +17,9 @@ utrome_mm10_v2:
   kdx: "utrome.e30.t5.gc25.pas3.f0.9999.w500.kdx"
   merge_tsv: "utrome.e30.t5.gc25.pas3.f0.9999.w500.m200.tsv"
   tx_annots: "utrome_mm10_v2_tx_annots.2024.05.13.Rds"
+  tx_annots_csv: "utrome_mm10_v2_tx_annots.2024.05.13.csv.gz"
   gene_annots: "utrome_mm10_v2_gene_annots.2024.05.13.Rds"
+  gene_annots_csv: "utrome_mm10_v2_gene_annots.2024.05.13.csv.gz"
   download_script: "download_utrome.sh"
 
 utrome_hg38_v1:
@@ -25,7 +29,9 @@ utrome_hg38_v1:
   kdx: "utrome.e30.t5.gc39.pas3.f0.9999.w500.kdx"
   merge_tsv: "utrome.e30.t5.gc39.pas3.f0.9999.w500.m200.tsv"
   tx_annots: "utrome_hg38_v1_tx_annots.2024.05.13.Rds"
+  tx_annots_csv: "utrome_hg38_v1_tx_annots.2024.05.13.csv.gz"
   gene_annots: "utrome_hg38_v1_gene_annots.2024.05.13.Rds"
+  gene_annots_csv: "utrome_hg38_v1_gene_annots.2024.05.13.csv.gz"
   download_script: "download_utrome.sh"
 
 ## example custom target using GENCODE
diff --git a/scripts/mtxs_to_h5ad_genes.py b/scripts/mtxs_to_h5ad_genes.py
new file mode 100644
index 0000000..26023e4
--- /dev/null
+++ b/scripts/mtxs_to_h5ad_genes.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+import os
+import pandas as pd
+from scipy.io import mmread
+import numpy as np
+from anndata import AnnData
+import anndata as ad
+from sys import stderr
+
+# print to stderr
+def message(*args, **kwargs):
+    print(*args, file=stderr, **kwargs)
+
+# Helper function to load and process MTX files
+def load_mtx_to_anndata(mtx_file, bx_file, gene_file, sample_id, min_umis, 
+                        cell_annots_df, cell_annots_key, exclude_unannotated_cells):
+    message("[INFO]    reading counts...")
+    mtx = mmread(mtx_file).tocsr()
+
+    message("[INFO]    reading barcodes...")
+    bxs = pd.read_csv(bx_file, header=None, index_col=None).iloc[:, 0].values
+    cell_ids = [f"{sample_id}_{bx}" for bx in bxs]
+    
+    message("[INFO]    reading genes...")
+    genes = pd.read_csv(gene_file, header=None, index_col=None).iloc[:, 0].values
+    
+    message("[INFO]    creating object...")
+    adata = AnnData(X=mtx, 
+                    obs=pd.DataFrame(index=cell_ids), 
+                    var=pd.DataFrame(index=genes))
+    
+    # Filter unannotated cells
+    if exclude_unannotated_cells:
+        message("[INFO]    filtering unannotated cells...")
+        annotated_cells = cell_annots_df[cell_annots_key].values
+        adata = adata[adata.obs.index.isin(annotated_cells)]
+    
+    message(f"[INFO]    filtering barcodes fewer than {min_umis} UMIs...")
+    adata = adata[adata.obs.index[adata.X.sum(axis=1).A1 >= min_umis]]
+    
+    return adata.copy()
+
+# Main processing function
+def process_mtx_files(snakemake):
+    mtx_files = snakemake.input.mtxs
+    bx_files = snakemake.input.bxs
+    gene_files = snakemake.input.genes
+    sample_ids = snakemake.params.sample_ids
+    gene_annots_file = snakemake.input.gene_annots
+    cell_annots_file = snakemake.input.cell_annots
+    cell_annots_key = snakemake.params.cell_annots_key
+    min_umis = int(snakemake.params.min_umis)
+    exclude_unannotated_cells = snakemake.params.exclude_unannotated_cells
+    output_file = snakemake.output.h5ad
+    
+    
+    if cell_annots_file:
+        cell_annots_df = pd.read_csv(cell_annots_file)
+    elif exclude_unannotated_cells:
+        message("[Warning] The `exclude_unannotated_cells` option is ignored because no `cell_annots` were provided!")
+        exclude_unannotated_cells = False
+    
+    message("[INFO] Beginning sample loading...")
+    adatas = []
+    for mtx, bx, gene, sample_id in zip(mtx_files, bx_files, gene_files, sample_ids):
+        message(f"[INFO]   Loading sample {sample_id}...")
+        adata = load_mtx_to_anndata(mtx, bx, gene, sample_id, min_umis, cell_annots_df, cell_annots_key, exclude_unannotated_cells)
+        adatas.append(adata)
+    message("[INFO] Done loading.")
+    
+    message("[INFO] Concatenating AnnData objects...")
+    combined_adata = ad.concat(adatas, axis=0)
+    
+    if gene_annots_file:
+        message("[INFO] Loading gene annotations...")
+        gene_annots_df = pd.read_csv(gene_annots_file).set_index("gene_id", drop=False)
+        message("[INFO] Attaching gene annotations...")
+        combined_adata.var = combined_adata.var.join(gene_annots_df, how='left')
+    
+    if cell_annots_file:
+        message("[INFO] Attaching cell annotations...")
+        combined_adata.obs = combined_adata.obs.join(cell_annots_df.set_index(cell_annots_key, drop=False), how='left')
+    
+    message(f"[INFO] Saving AnnData to {output_file}...")
+    combined_adata.write(output_file)
+
+# Check for snakemake object and process
+try:
+    snakemake                     # type: ignore
+    process_mtx_files(snakemake)  # type: ignore
+    message("[INFO] Done.")
+except NameError:
+    message("[WARNING] This script is intended to be executed as part of a Snakemake workflow and requires the 'snakemake' object.")
+    message("[INFO] Nothing done.")
diff --git a/scripts/mtxs_to_h5ad_txs.py b/scripts/mtxs_to_h5ad_txs.py
new file mode 100644
index 0000000..f25ba71
--- /dev/null
+++ b/scripts/mtxs_to_h5ad_txs.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+import os
+import pandas as pd
+from scipy.io import mmread
+import numpy as np
+from anndata import AnnData
+import anndata as ad
+from sys import stderr
+
+# print to stderr
+def message(*args, **kwargs):
+    print(*args, file=stderr, **kwargs)
+
+# Helper function to load and process MTX files
+def load_mtx_to_anndata(mtx_file, bx_file, tx_file, sample_id, min_umis, 
+                        cell_annots_df, cell_annots_key, exclude_unannotated_cells):
+    message("[INFO]    reading counts...")
+    mtx = mmread(mtx_file).tocsr()
+
+    message("[INFO]    reading barcodes...")
+    bxs = pd.read_csv(bx_file, header=None, index_col=None).iloc[:, 0].values
+    cell_ids = [f"{sample_id}_{bx}" for bx in bxs]
+    
+    message("[INFO]    reading transcripts...")
+    txs = pd.read_csv(tx_file, header=None, index_col=None).iloc[:, 0].values
+    
+    message("[INFO]    creating object...")
+    adata = AnnData(X=mtx, 
+                    obs=pd.DataFrame(index=cell_ids), 
+                    var=pd.DataFrame(index=txs))
+    
+    # Filter unannotated cells
+    if exclude_unannotated_cells:
+        message("[INFO]    filtering unannotated cells...")
+        annotated_cells = cell_annots_df[cell_annots_key].values
+        adata = adata[adata.obs.index.isin(annotated_cells)]
+    
+    message(f"[INFO]    filtering barcodes fewer than {min_umis} UMIs...")
+    adata = adata[adata.obs.index[adata.X.sum(axis=1).A1 >= min_umis]]
+    
+    return adata.copy()
+
+# Main processing function
+def process_mtx_files(snakemake):
+    mtx_files = snakemake.input.mtxs
+    bx_files = snakemake.input.bxs
+    tx_files = snakemake.input.txs
+    sample_ids = snakemake.params.sample_ids
+    tx_annots_file = snakemake.input.tx_annots
+    cell_annots_file = snakemake.input.cell_annots
+    cell_annots_key = snakemake.params.cell_annots_key
+    min_umis = int(snakemake.params.min_umis)
+    exclude_unannotated_cells = snakemake.params.exclude_unannotated_cells
+    output_file = snakemake.output.h5ad
+    
+    
+    if cell_annots_file:
+        cell_annots_df = pd.read_csv(cell_annots_file)
+    elif exclude_unannotated_cells:
+        message("[Warning] The `exclude_unannotated_cells` option is ignored because no `cell_annots` were provided!")
+        exclude_unannotated_cells = False
+    
+    message("[INFO] Beginning sample loading...")
+    adatas = []
+    for mtx, bx, tx, sample_id in zip(mtx_files, bx_files, tx_files, sample_ids):
+        message(f"[INFO]   Loading sample {sample_id}...")
+        adata = load_mtx_to_anndata(mtx, bx, tx, sample_id, min_umis, cell_annots_df, cell_annots_key, exclude_unannotated_cells)
+        adatas.append(adata)
+    message("[INFO] Done loading.")
+    
+    message("[INFO] Concatenating AnnData objects...")
+    combined_adata = ad.concat(adatas, axis=0)
+    
+    if tx_annots_file:
+        message("[INFO] Loading transcript annotations...")
+        tx_annots_df = pd.read_csv(tx_annots_file).set_index("transcript_id", drop=False)
+        message("[INFO] Attaching transcript annotations...")
+        combined_adata.var = combined_adata.var.join(tx_annots_df, how='left')
+    
+    if cell_annots_file:
+        message("[INFO] Attaching cell annotations...")
+        combined_adata.obs = combined_adata.obs.join(cell_annots_df.set_index(cell_annots_key, drop=False), how='left')
+    
+    message(f"[INFO] Saving AnnData to {output_file}...")
+    combined_adata.write(output_file)
+
+# Check for snakemake object and process
+try:
+    snakemake                     # type: ignore
+    process_mtx_files(snakemake)  # type: ignore
+    message("[INFO] Done.")
+except NameError:
+    message("[WARNING] This script is intended to be executed as part of a Snakemake workflow and requires the 'snakemake' object.")
+    message("[INFO] Nothing done.")

From 774c2973230cab79d7dff88d592a5e7f20686884 Mon Sep 17 00:00:00 2001
From: Mervin Fansler <mmfansler@gmail.com>
Date: Mon, 27 May 2024 22:41:47 -0400
Subject: [PATCH 07/13] h5ad docs

---
 README.md | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 986433a..6e7dc12 100644
--- a/README.md
+++ b/README.md
@@ -44,20 +44,31 @@ determined by simulations.
 Please see [our accompanying manuscript][ref:scUTRquant] for more details.
 
 ## Outputs
+### SingleCellExperiment object (*default*)
 The primary output of the pipeline is a Bioconductor `SingleCellExperiment` object.
 The `counts` in the object is a sparse `Matrix` of 3' UTR isoform counts; the `rowRanges` 
 is a `GenomicRanges` of the 3' UTR isoforms; the `rowData` is a `DataFrame` with additional
 information about 3' UTR isoforms; and the `colData` is a `DataFrame` populated with sample 
 metadata and optional user-provide cell annotations.
 
+### H5AD AnnData object (*experimental*)
+scUTRquant v0.5.0 adds support for [AnnData objects](https://anndata.readthedocs.io/en/stable/index.html),
+making it easier for users who prefer Python and working with [the `scverse` ecosystem](https://scverse.org).
+Similar annotations will be attached in the `obs` (cell) and `var` (3'UTR isoform) tables.
+However, no analogous `rowRanges` is attached.
+
+The output format can be controlled with the `output_format` configuration option,
+with `"h5ad"` and "`sce`" being currently supported options. 
+
+### Reporting
 To assist users in quality control, the pipeline additionally generates HTML reports 
 for each sample.
 
+### Additional Files
 The pipeline is configured to retain intermediate files, such as BUS and MTX files.
 Advanced users can readily customize the pipeline to only generate the files they 
-require. For example, users who prefer to work with alternative scRNA-seq data structures,
-such as those used in Scanpy or Seurat, may wish to terminate the pipeline at MTX 
-generation.
+require. For example, users who prefer to work with alternative scRNA-seq data structures
+may wish to terminate the pipeline at MTX generation.
 
 # Setup
 
@@ -237,6 +248,7 @@ pipeline. The following keys are expected:
  - `cell_annots`: (optional) CSV file with a key column that matches the `<sample_id>_<cell_bx>` format
  - `cell_annots_key`: specifies the name of the key column in the `cell_annots` file; default is `cell_id`
  - `exclude_unannotated_cells`: boolean indicating whether unannotated cells should be excluded from the final output; default is `False`
+ - `output_format`: a list of formats, `"sce"` (*default*) and/or `"h5ad"` (*experimental*); `null` will end processing at MTX files.
  
 ### Default Values
 
@@ -271,7 +283,9 @@ The `extdata/targets.yaml` defines the targets available to pseudoalign to. The
  - `kdx`: Kallisto index for UTRome
  - `merge`: TSV for merging features (isoforms)
  - `tx_annots`: (optional) RDS file containing Bioconductor DataFrame object with annotations for transcripts
+ - `tx_annots_csv`: (optional) CSV file with annotations for transcripts (used for AnnData)
  - `gene_annots`: (optional) RDS file containing Bioconductor DataFrame object with annotations for genes
+ - `gene_annots_csv`: (optional) CSV file with annotations for genes (used for AnnData)
 
 
 # Customization

From ff0fb4854aa8e7cda77e73294ac800145e4192fc Mon Sep 17 00:00:00 2001
From: Mervin Fansler <mmfansler@gmail.com>
Date: Mon, 27 May 2024 23:18:08 -0400
Subject: [PATCH 08/13] cleanup debug out

---
 Snakefile | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Snakefile b/Snakefile
index 7357383..b40a1e8 100644
--- a/Snakefile
+++ b/Snakefile
@@ -57,8 +57,6 @@ if config['cell_annots'] is None:
 wildcard_constraints:
     sample_id=config['sample_regex']
 
-message(config)
-
 # get list of expected outputs
 def get_outputs():
     outputs = []
@@ -447,7 +445,6 @@ rule mtxs_to_h5ad_genes:
     script:
         "scripts/mtxs_to_h5ad_genes.py"
 
-
 ################################################################################
 ## Reports
 ################################################################################

From 8621d961f03f3691e59f4960531eb8963df9ebf6 Mon Sep 17 00:00:00 2001
From: Mervin Fansler <fanslerm@mskcc.org>
Date: Mon, 27 May 2024 23:54:34 -0400
Subject: [PATCH 09/13] detect csv annots

---
 Snakefile | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index b40a1e8..915ed2b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -98,8 +98,10 @@ for target_id, target in targets.items():
 
     target_path = target['path']
     download_script = target_path + target['download_script']
-    
-    FILE_KEYS = ['gtf', 'kdx', 'merge_tsv', 'tx_annots', 'gene_annots']
+
+    FILE_KEYS = ['gtf', 'kdx', 'merge_tsv', 
+                 'tx_annots', 'gene_annots',
+                 'tx_annots_csv', 'gene_annots_csv']
     target_files = [target_path + target[k] for k in FILE_KEYS]
 
     rule:

From cafcc73ecfa388e223e56794382f09d399329049 Mon Sep 17 00:00:00 2001
From: Mervin Fansler <mmfansler@gmail.com>
Date: Tue, 28 May 2024 00:19:23 -0400
Subject: [PATCH 10/13] improved null support

---
 Snakefile                    | 4 ++--
 extdata/targets/targets.yaml | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/Snakefile b/Snakefile
index 915ed2b..4679764 100644
--- a/Snakefile
+++ b/Snakefile
@@ -100,9 +100,9 @@ for target_id, target in targets.items():
     download_script = target_path + target['download_script']
 
     FILE_KEYS = ['gtf', 'kdx', 'merge_tsv', 
-                 'tx_annots', 'gene_annots',
+                 'tx_annots', 'gene_annots', 
                  'tx_annots_csv', 'gene_annots_csv']
-    target_files = [target_path + target[k] for k in FILE_KEYS]
+    target_files = [target_path + target[k] for k in FILE_KEYS if target[k] is not None]
 
     rule:
         name: f"download_{target_id}"
diff --git a/extdata/targets/targets.yaml b/extdata/targets/targets.yaml
index afd03f9..c268e2f 100644
--- a/extdata/targets/targets.yaml
+++ b/extdata/targets/targets.yaml
@@ -1,5 +1,5 @@
 utrome_mm10_v1:
-  path: "extdata/targets/utrome_mm10_v1/"  # files should be relative to this
+  path: "extdata/targets/utrome_mm10_v1/"
   genome: "mm10"
   gtf: "adult.utrome.e3.t200.f0.999.w500.gtf"
   kdx: "adult.utrome.e3.t200.f0.999.w500.kdx"
@@ -37,10 +37,12 @@ utrome_hg38_v1:
 ## example custom target using GENCODE
 ## see https://github.com/Mayrlab/txcutr-db
 # gencode_v38_pc_w500:
-#   path: "extdata/targets/gencode_v38_pc_w500/"
+#   path: "extdata/targets/gencode_v38_pc_w500/"  # files must be relative to this
 #   genome: "hg38"
 #   gtf: "gencode.v38.annotation.pc.txcutr.w500.gtf"
 #   kdx: "gencode.v38.annotation.pc.txcutr.w500.kdx"
 #   merge_tsv: "gencode.v38.annotation.pc.txcutr.w500.merge.tsv"
 #   tx_annots: null
+#   tx_annots_csv: null
 #   gene_annots: null
+#   gene_annots_csv: null

From 57b415435a861d0725ebb4b960f74743b1d87837 Mon Sep 17 00:00:00 2001
From: Mervin Fansler <mmfansler@gmail.com>
Date: Tue, 28 May 2024 00:49:14 -0400
Subject: [PATCH 11/13] use gzip

---
 scripts/mtxs_to_h5ad_genes.py | 2 +-
 scripts/mtxs_to_h5ad_txs.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/mtxs_to_h5ad_genes.py b/scripts/mtxs_to_h5ad_genes.py
index 26023e4..3e9c26a 100644
--- a/scripts/mtxs_to_h5ad_genes.py
+++ b/scripts/mtxs_to_h5ad_genes.py
@@ -83,7 +83,7 @@ def process_mtx_files(snakemake):
         combined_adata.obs = combined_adata.obs.join(cell_annots_df.set_index(cell_annots_key, drop=False), how='left')
     
     message(f"[INFO] Saving AnnData to {output_file}...")
-    combined_adata.write(output_file)
+    combined_adata.write(output_file, compression="gzip")
 
 # Check for snakemake object and process
 try:
diff --git a/scripts/mtxs_to_h5ad_txs.py b/scripts/mtxs_to_h5ad_txs.py
index f25ba71..806c18c 100644
--- a/scripts/mtxs_to_h5ad_txs.py
+++ b/scripts/mtxs_to_h5ad_txs.py
@@ -83,7 +83,7 @@ def process_mtx_files(snakemake):
         combined_adata.obs = combined_adata.obs.join(cell_annots_df.set_index(cell_annots_key, drop=False), how='left')
     
     message(f"[INFO] Saving AnnData to {output_file}...")
-    combined_adata.write(output_file)
+    combined_adata.write(output_file, compression="gzip")
 
 # Check for snakemake object and process
 try:

From 0581cac7b1e5b49e10557ddb90e9416e445ff041 Mon Sep 17 00:00:00 2001
From: Mervin Fansler <mmfansler@gmail.com>
Date: Tue, 28 May 2024 01:47:13 -0400
Subject: [PATCH 12/13] refresh docker

---
 Snakefile               |  2 +-
 docker/scutr-quant.yaml | 26 ++++++++++++++++----------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/Snakefile b/Snakefile
index 4679764..46b778c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,4 +1,4 @@
-container: "docker://mfansler/scutr-quant:0.4.0"
+container: "docker://mfansler/scutr-quant:0.5.0"
 configfile: "config.yaml"
 SQ_VERSION="0.5.0-alpha"
 
diff --git a/docker/scutr-quant.yaml b/docker/scutr-quant.yaml
index 1e6a274..b9c8ad1 100644
--- a/docker/scutr-quant.yaml
+++ b/docker/scutr-quant.yaml
@@ -15,14 +15,20 @@ dependencies:
   - bustools=0.40.0
 
   ## sce + reporting
-  - r-base=4.1
-  - r-dplyr=1.0
-  - r-ggplot2=3.3
-  - r-matrix=1.4
+  - r-base=4.3
+  - r-dplyr=1.1
+  - r-ggplot2=3.5
+  - r-matrix=1.6
   - r-readr=2.1
-  - r-rmarkdown=2.14
-  - r-stringr=1.4
-  - bioconductor-hdf5array=1.22
-  - bioconductor-singlecellexperiment=1.16
-  - bioconductor-plyranges=1.14
-  - openssl=1
+  - r-rmarkdown=2.27
+  - r-stringr=1.5
+  - bioconductor-hdf5array=1.30
+  - bioconductor-singlecellexperiment=1.24
+  - bioconductor-plyranges=1.22
+  
+  ## anndata
+  - python=3.11
+  - anndata=0.10
+  - numpy=1.26
+  - pandas=2.2
+  - scipy=1.13

From a29003a08e030dc1ca529680a19b7da010023f4e Mon Sep 17 00:00:00 2001
From: Mervin Fansler <mmfansler@gmail.com>
Date: Tue, 28 May 2024 01:57:24 -0400
Subject: [PATCH 13/13] bump to release version

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 46b778c..63f6af2 100644
--- a/Snakefile
+++ b/Snakefile
@@ -1,6 +1,6 @@
 container: "docker://mfansler/scutr-quant:0.5.0"
 configfile: "config.yaml"
-SQ_VERSION="0.5.0-alpha"
+SQ_VERSION="0.5.0"
 
 import os
 import pandas as pd