PD-2739 Add peakcalling to Multiome ATAC subpipeline #1391

Open. Wants to merge 40 commits into base: develop.

Commits (40), changes from all commits:
ce90d53 add peak calling (aawdeh, Oct 16, 2024)
82c5c45 added output for peaks (aawdeh, Oct 16, 2024)
6e833d2 fix (aawdeh, Oct 16, 2024)
76286b4 fix (aawdeh, Oct 16, 2024)
90805ca test (aawdeh, Oct 16, 2024)
b27011c remove ls (aawdeh, Oct 17, 2024)
c1f0e57 test (aawdeh, Oct 18, 2024)
90b193c Merge branch 'develop' into aa-peakcall (aawdeh, Oct 18, 2024)
779fe9b test (aawdeh, Oct 18, 2024)
6d52ad2 Merge branch 'aa-peakcall' of https://github.com/broadinstitute/warp … (aawdeh, Oct 18, 2024)
5743eff test (aawdeh, Oct 18, 2024)
d3749c8 test (aawdeh, Oct 18, 2024)
1de1c85 test (aawdeh, Oct 18, 2024)
2d18756 test (aawdeh, Oct 18, 2024)
13a44f9 test (aawdeh, Oct 21, 2024)
3f12b9e made another task (aawdeh, Oct 21, 2024)
08b2f24 test (aawdeh, Oct 21, 2024)
80c1416 test (aawdeh, Oct 21, 2024)
b5191e5 test (aawdeh, Oct 21, 2024)
4725080 test (aawdeh, Oct 21, 2024)
f139f84 test (aawdeh, Oct 22, 2024)
9a0ea9b test (aawdeh, Oct 22, 2024)
04cf0d2 test (aawdeh, Oct 22, 2024)
f9bc871 change a line (aawdeh, Oct 24, 2024)
93d3360 test (aawdeh, Oct 24, 2024)
256c4e5 test (aawdeh, Oct 24, 2024)
cd849b5 test (aawdeh, Oct 24, 2024)
7c62fd1 test (aawdeh, Oct 24, 2024)
b15b146 test (aawdeh, Oct 24, 2024)
ff36c15 test (aawdeh, Oct 24, 2024)
27a4196 test (aawdeh, Oct 24, 2024)
987b5d2 Add peakcall to task (aawdeh, Oct 28, 2024)
cce5841 test (aawdeh, Oct 28, 2024)
39ef1a5 Merge branch 'develop' into aa-peakcall (aawdeh, Oct 28, 2024)
bf0e9ce add parameters (aawdeh, Oct 28, 2024)
d9240d6 Merge branch 'aa-peakcall' of https://github.com/broadinstitute/warp … (aawdeh, Oct 28, 2024)
1ec5ff1 Merge branch 'develop' into aa-peakcall (aawdeh, Oct 30, 2024)
90172ee Merge branch 'develop' into aa-peakcall (aawdeh, Nov 4, 2024)
18f77d8 test (aawdeh, Nov 4, 2024)
f7bdfa2 Merge branch 'aa-peakcall' of https://github.com/broadinstitute/warp … (aawdeh, Nov 4, 2024)
259 changes: 257 additions & 2 deletions pipelines/skylab/atac/atac.wdl
@@ -160,6 +160,13 @@ workflow ATAC {
input_id = input_id

}
call PeakCalling {
input:
bam = BWAPairedEndAlignment.bam_aligned_output,
annotations_gtf = annotations_gtf,
metrics_h5ad = CreateFragmentFile.Snap_metrics,
docker_path = docker_prefix + snap_atac_docker
}
}

File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output])
@@ -267,7 +274,6 @@ task GetNumSplits {
}
}


# trim read 1 and read 2 adapter sequence with cutadapt
task TrimAdapters {
input {
@@ -533,7 +539,14 @@ task CreateFragmentFile {
}

command <<<
set -euo pipefail
set -x

# install packages -- these should eventually be baked into the docker image
pip3 install snapatac2==2.7.0 scanpy

# patch SnapATAC2's _call_peaks helper so per-group peaks are gathered with a plain list comprehension
file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py
sed -i '164s/.*/ peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change

python3 <<CODE

@@ -544,6 +557,7 @@
atac_gtf = "~{annotations_gtf}"
preindex = "~{preindex}"
atac_nhash_id = "~{atac_nhash_id}"
peakcalling_bool = True
expected_cells = ~{atac_expected_cells}

# calculate chrom size dictionary based on text file
@@ -556,6 +570,9 @@
# use snap atac2
import snapatac2.preprocessing as pp
import snapatac2 as snap
import scanpy as sc
import numpy as np
import polars as pl
import anndata as ad
from collections import OrderedDict
import csv
@@ -606,6 +623,92 @@ task CreateFragmentFile {
# Write new atac file
atac_data.write_h5ad("~{input_id}.metrics.h5ad")

if peakcalling_bool:
    print("Peak calling starting...")
    atac_data = snap.read("~{bam_base_name}.metrics.h5ad")

    # Calculate and plot the size distribution of fragments
    print("Calculating fragment size distribution")
    snap.pl.frag_size_distr(atac_data)
    print(atac_data)

    # Filter cells -- Need to parameterize
    print("Filtering cells")
    snap.pp.filter_cells(atac_data, min_counts=5000, min_tsse=10, max_counts=100000)
    print(atac_data)

    # Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins.
    print("Creating cell by bin matrix")
    atac_data_mod = snap.pp.add_tile_matrix(atac_data, inplace=False)
    print("set obsm")
    atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"]
    print("set all uns")
    for key in atac_data.uns.keys():
        print("set ", key)
        atac_data_mod.uns[key] = atac_data.uns[key]
    print(atac_data_mod)

    # Feature selection
    print("Feature selection")
    snap.pp.select_features(atac_data_mod)
    print(atac_data_mod)

    # Run customized scrublet algorithm to identify potential doublets
    print("Run scrublet to identify potential doublets")
    snap.pp.scrublet(atac_data_mod)
    print(atac_data_mod)

    # Employ spectral embedding for dimensionality reduction
    print("Employ spectral embedding for dimensionality reduction")
    snap.tl.spectral(atac_data_mod)
    print(atac_data_mod)

    # Filter doublets based on scrublet scores
    print("Filter doublets based on scrublet scores")
    snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5)
    print(atac_data_mod)

    # Perform graph-based clustering to identify cell clusters.
    # Build a k-nearest neighbour graph using snap.pp.knn
    print("Perform knn graph-based clustering to identify cell clusters")
    snap.pp.knn(atac_data_mod)
    print(atac_data_mod)

    # Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph
    print("Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph")
    snap.tl.leiden(atac_data_mod)
    print(atac_data_mod)

    # Create the cell by gene activity matrix
    print("Create the cell by gene activity matrix")
    gene_mat = snap.pp.make_gene_matrix(atac_data_mod, gene_anno=atac_gtf)
    print(atac_data_mod)

    # Normalize the gene matrix and rank marker genes per Leiden cluster
    print("Normalize the gene matrix")
    gene_mat.obs['leiden'] = atac_data_mod.obs['leiden']
    sc.pp.normalize_total(gene_mat)
    sc.pp.log1p(gene_mat)
    sc.tl.rank_genes_groups(gene_mat, groupby="leiden", method="wilcoxon")

    for i in np.unique(gene_mat.obs['leiden']):
        markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names']
        print(f"Cluster {i}: {', '.join(markers)}")

    print("Peak calling using MACS3")
    snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=8)

    # Convert any pl.DataFrame in .uns to a pandas DataFrame so the object can be written to h5ad
    print("Convert pl.DataFrame to pandas DataFrame")
    for key in atac_data_mod.uns.keys():
        if isinstance(atac_data_mod.uns[key], pl.DataFrame):
            print(key)
            atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas()

    print("Write into h5ad file")
    atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad")
    print("Peak calling complete")

CODE

# sorting the file
@@ -631,3 +734,155 @@ task CreateFragmentFile {
File atac_library_metrics = "~{input_id}_~{atac_nhash_id}_library_metrics.csv"
}
}

task PeakCalling {
input {
File bam
File annotations_gtf
File metrics_h5ad
# SnapATAC2 parameters
Int min_counts = 5000
Int min_tsse = 10
Int max_counts = 100000
# Runtime attributes/docker
String docker_path
Int disk_size = 500
Int mem_size = 64
Int nthreads = 4
}
String bam_base_name = basename(bam, ".bam")

parameter_meta {
bam: "Aligned bam with CB in CB tag. This is the output of the BWAPairedEndAlignment task."
annotations_gtf: "GTF for SnapATAC2 to calculate TSS sites of fragment file."
disk_size: "Disk size used in the peak calling step."
mem_size: "The size of memory used in the peak calling step."
docker_path: "The docker image path containing the runtime environment for this task"
}

command <<<
set -euo pipefail
set -x

# install packages -- these should eventually be baked into the docker image
pip3 install snapatac2==2.7.0 scanpy

# patch SnapATAC2's _call_peaks helper so per-group peaks are gathered with a plain list comprehension
file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py
sed -i '164s/.*/ peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change

python3 <<CODE

# use snap atac2
import snapatac2 as snap
import scanpy as sc
import numpy as np
import polars as pl

bam = "~{bam}"
bam_base_name = "~{bam_base_name}"
atac_gtf = "~{annotations_gtf}"
metrics_h5ad = "~{metrics_h5ad}"
min_counts = ~{min_counts}
min_tsse = ~{min_tsse}
max_counts = ~{max_counts}

print("Peak calling starting...")
atac_data = snap.read(metrics_h5ad)

# Calculate and plot the size distribution of fragments
print("Calculating fragment size distribution")
snap.pl.frag_size_distr(atac_data)
print(atac_data)

# Filter cells using the task-level thresholds
print("Filtering cells")
snap.pp.filter_cells(atac_data, min_counts=min_counts, min_tsse=min_tsse, max_counts=max_counts)
print(atac_data)

# Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins.
print("Creating cell by bin matrix")
atac_data_mod = snap.pp.add_tile_matrix(atac_data, inplace=False)
print("set obsm")
atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"]
print("set all uns")
for key in atac_data.uns.keys():
    print("set ", key)
    atac_data_mod.uns[key] = atac_data.uns[key]
print(atac_data_mod)

# Feature selection
print("Feature selection")
snap.pp.select_features(atac_data_mod)
print(atac_data_mod)

# Run customized scrublet algorithm to identify potential doublets
print("Run scrublet to identify potential doublets")
snap.pp.scrublet(atac_data_mod)
print(atac_data_mod)

# Employ spectral embedding for dimensionality reduction
print("Employ spectral embedding for dimensionality reduction")
snap.tl.spectral(atac_data_mod)
print(atac_data_mod)

# Filter doublets based on scrublet scores
print("Filter doublets based on scrublet scores")
snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5)
print(atac_data_mod)

# Perform graph-based clustering to identify cell clusters.
# Build a k-nearest neighbour graph using snap.pp.knn
print("Perform knn graph-based clustering to identify cell clusters")
snap.pp.knn(atac_data_mod)
print(atac_data_mod)

# Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph
print("Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph")
snap.tl.leiden(atac_data_mod)
print(atac_data_mod)

# Create the cell by gene activity matrix
print("Create the cell by gene activity matrix")
gene_mat = snap.pp.make_gene_matrix(atac_data_mod, gene_anno=atac_gtf)
print(atac_data_mod)

# Normalize the gene matrix
print("Normalize the gene matrix")
gene_mat.obs['leiden'] = atac_data_mod.obs['leiden']
sc.pp.normalize_total(gene_mat)
sc.pp.log1p(gene_mat)
sc.tl.rank_genes_groups(gene_mat, groupby="leiden", method="wilcoxon")

for i in np.unique(gene_mat.obs['leiden']):
    markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names']
    print(f"Cluster {i}: {', '.join(markers)}")

print("Peak calling using MACS3")
snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=8)

print("Convert pl.DataFrame to pandas DataFrame")
# Convert pl.DataFrame to pandas DataFrame
for key in atac_data_mod.uns.keys():
    if isinstance(atac_data_mod.uns[key], pl.DataFrame):
        print(key)
        atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas()

print("Write into h5ad file")
atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad")
print("test")

CODE
>>>

runtime {
docker: docker_path
disks: "local-disk ${disk_size} SSD"
memory: "${mem_size} GiB"
cpu: nthreads
}

output {
File peaks_h5ad = "~{bam_base_name}.peaks.h5ad"
}

}
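
For reviewers who want to sanity-check the new output, here is a minimal sketch (not part of the PR) of how the peaks h5ad written by PeakCalling could be opened and inspected downstream. The file name is hypothetical, and the 'leiden' column and the presence of MACS3 peak tables in .uns are assumptions based on the task code above; the exact keys may vary with the SnapATAC2 version.

# Hedged inspection sketch; "example.peaks.h5ad" stands in for the real PeakCalling output.
import anndata as ad

adata = ad.read_h5ad("example.peaks.h5ad")     # cell-by-bin matrix with clustering results
print(adata)                                   # overall AnnData summary
print(adata.obs["leiden"].value_counts())      # cells per Leiden cluster (assumed obs column)
print(list(adata.uns.keys()))                  # peak tables from snap.tl.macs3 should appear here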