PD-2739 Add peakcalling to Multiome ATAC subpipeline #1391

Open. Wants to merge 40 commits into base: develop.

Commits (40), changes from all commits:
ce90d53 add peak calling (aawdeh, Oct 16, 2024)
82c5c45 added output for peaks (aawdeh, Oct 16, 2024)
6e833d2 fix (aawdeh, Oct 16, 2024)
76286b4 fix (aawdeh, Oct 16, 2024)
90805ca test (aawdeh, Oct 16, 2024)
b27011c remove ls (aawdeh, Oct 17, 2024)
c1f0e57 test (aawdeh, Oct 18, 2024)
90b193c Merge branch 'develop' into aa-peakcall (aawdeh, Oct 18, 2024)
779fe9b test (aawdeh, Oct 18, 2024)
6d52ad2 Merge branch 'aa-peakcall' of https://github.com/broadinstitute/warp … (aawdeh, Oct 18, 2024)
5743eff test (aawdeh, Oct 18, 2024)
d3749c8 test (aawdeh, Oct 18, 2024)
1de1c85 test (aawdeh, Oct 18, 2024)
2d18756 test (aawdeh, Oct 18, 2024)
13a44f9 test (aawdeh, Oct 21, 2024)
3f12b9e made another task (aawdeh, Oct 21, 2024)
08b2f24 test (aawdeh, Oct 21, 2024)
80c1416 test (aawdeh, Oct 21, 2024)
b5191e5 test (aawdeh, Oct 21, 2024)
4725080 test (aawdeh, Oct 21, 2024)
f139f84 test (aawdeh, Oct 22, 2024)
9a0ea9b test (aawdeh, Oct 22, 2024)
04cf0d2 test (aawdeh, Oct 22, 2024)
f9bc871 change a line (aawdeh, Oct 24, 2024)
93d3360 test (aawdeh, Oct 24, 2024)
256c4e5 test (aawdeh, Oct 24, 2024)
cd849b5 test (aawdeh, Oct 24, 2024)
7c62fd1 test (aawdeh, Oct 24, 2024)
b15b146 test (aawdeh, Oct 24, 2024)
ff36c15 test (aawdeh, Oct 24, 2024)
27a4196 test (aawdeh, Oct 24, 2024)
987b5d2 Add peakcall to task (aawdeh, Oct 28, 2024)
cce5841 test (aawdeh, Oct 28, 2024)
39ef1a5 Merge branch 'develop' into aa-peakcall (aawdeh, Oct 28, 2024)
bf0e9ce add parameters (aawdeh, Oct 28, 2024)
d9240d6 Merge branch 'aa-peakcall' of https://github.com/broadinstitute/warp … (aawdeh, Oct 28, 2024)
1ec5ff1 Merge branch 'develop' into aa-peakcall (aawdeh, Oct 30, 2024)
90172ee Merge branch 'develop' into aa-peakcall (aawdeh, Nov 4, 2024)
18f77d8 test (aawdeh, Nov 4, 2024)
f7bdfa2 Merge branch 'aa-peakcall' of https://github.com/broadinstitute/warp … (aawdeh, Nov 4, 2024)
259 changes: 257 additions & 2 deletions pipelines/skylab/atac/atac.wdl
@@ -160,6 +160,13 @@ workflow ATAC {
input_id = input_id

}
call PeakCalling {
input:
bam = BWAPairedEndAlignment.bam_aligned_output,
annotations_gtf = annotations_gtf,
metrics_h5ad = CreateFragmentFile.Snap_metrics,
docker_path = docker_prefix + snap_atac_docker
}
}

File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output])
@@ -267,7 +274,6 @@ task GetNumSplits {
}
}


# trim read 1 and read 2 adapter sequence with cutadapt
task TrimAdapters {
input {
@@ -533,7 +539,14 @@ task CreateFragmentFile {
}

command <<<
set -euo pipefail
set -x

# install packages -- these should eventually be baked into the docker image
pip3 install snapatac2==2.7.0 scanpy

# patch SnapATAC2's _call_peaks helper so per-group peaks are gathered with a plain list comprehension
file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py
sed -i '164s/.*/ peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change

python3 <<CODE

@@ -544,6 +557,7 @@
atac_gtf = "~{annotations_gtf}"
preindex = "~{preindex}"
atac_nhash_id = "~{atac_nhash_id}"
peakcalling_bool = True
expected_cells = ~{atac_expected_cells}

# calculate chrom size dictionary based on text file
@@ -556,6 +570,9 @@
# use snap atac2
import snapatac2.preprocessing as pp
import snapatac2 as snap
import scanpy as sc
import numpy as np
import polars as pl
import anndata as ad
from collections import OrderedDict
import csv
@@ -606,6 +623,92 @@ task CreateFragmentFile {
# Write new atac file
atac_data.write_h5ad("~{input_id}.metrics.h5ad")

if peakcalling_bool:
    print("Peak calling starting...")
    atac_data = snap.read("~{bam_base_name}.metrics.h5ad")

    # Calculate and plot the size distribution of fragments
    print("Calculating fragment size distribution")
    snap.pl.frag_size_distr(atac_data)
    print(atac_data)

    # Filter cells -- Need to parameterize
    print("Filtering cells")
    snap.pp.filter_cells(atac_data, min_counts=5000, min_tsse=10, max_counts=100000)
    print(atac_data)

    # Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins.
    print("Creating cell by bin matrix")
    atac_data_mod = snap.pp.add_tile_matrix(atac_data, inplace=False)
    print("set obsm")
    atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"]
    print("set all uns")
    for key in atac_data.uns.keys():
        print("set ", key)
        atac_data_mod.uns[key] = atac_data.uns[key]
    print(atac_data_mod)

    # Feature selection
    print("Feature selection")
    snap.pp.select_features(atac_data_mod)
    print(atac_data_mod)

    # Run customized scrublet algorithm to identify potential doublets
    print("Run scrublet to identify potential doublets")
    snap.pp.scrublet(atac_data_mod)
    print(atac_data_mod)

    # Employ spectral embedding for dimensionality reduction
    print("Employ spectral embedding for dimensionality reduction")
    snap.tl.spectral(atac_data_mod)
    print(atac_data_mod)

    # Filter doublets based on scrublet scores
    print("Filter doublets based on scrublet scores")
    snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5)
    print(atac_data_mod)

    # Perform graph-based clustering to identify cell clusters.
    # Build a k-nearest neighbour graph using snap.pp.knn
    print("Perform knn graph-based clustering to identify cell clusters")
    snap.pp.knn(atac_data_mod)
    print(atac_data_mod)

    # Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph
    print("Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph")
    snap.tl.leiden(atac_data_mod)
    print(atac_data_mod)

    # Create the cell by gene activity matrix
    print("Create the cell by gene activity matrix")
    gene_mat = snap.pp.make_gene_matrix(atac_data_mod, gene_anno=atac_gtf)
    print(atac_data_mod)

    # Normalize the gene matrix and rank marker genes per Leiden cluster
    print("Normalize the gene matrix")
    gene_mat.obs['leiden'] = atac_data_mod.obs['leiden']
    sc.pp.normalize_total(gene_mat)
    sc.pp.log1p(gene_mat)
    sc.tl.rank_genes_groups(gene_mat, groupby="leiden", method="wilcoxon")

    for i in np.unique(gene_mat.obs['leiden']):
        markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names']
        print(f"Cluster {i}: {', '.join(markers)}")

    print("Peak calling using MACS3")
    snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=8)

    # Convert any pl.DataFrame in .uns to a pandas DataFrame so the object can be written to h5ad
    print("Convert pl.DataFrame to pandas DataFrame")
    for key in atac_data_mod.uns.keys():
        if isinstance(atac_data_mod.uns[key], pl.DataFrame):
            print(key)
            atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas()

    print("Write into h5ad file")
    atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad")
    print("Peak calling complete")

CODE

# sorting the file
@@ -631,3 +734,155 @@ task CreateFragmentFile {
File atac_library_metrics = "~{input_id}_~{atac_nhash_id}_library_metrics.csv"
}
}

task PeakCalling {
input {
File bam
File annotations_gtf
File metrics_h5ad
# SnapATAC2 parameters
Int min_counts = 5000
Int min_tsse = 10
Int max_counts = 100000
# Runtime attributes/docker
String docker_path
Int disk_size = 500
Int mem_size = 64
Int nthreads = 4
}
String bam_base_name = basename(bam, ".bam")

parameter_meta {
bam: "Aligned bam with CB in CB tag. This is the output of the BWAPairedEndAlignment task."
annotations_gtf: "GTF for SnapATAC2 to calculate TSS sites of fragment file."
disk_size: "Disk size used in the peak calling step."
mem_size: "The size of memory used in the peak calling step."
docker_path: "The docker image path containing the runtime environment for this task"
}

command <<<
set -euo pipefail
set -x

# install packages -- these should eventually be baked into the docker image
pip3 install snapatac2==2.7.0 scanpy

# patch SnapATAC2's _call_peaks helper so per-group peaks are gathered with a plain list comprehension
file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py
sed -i '164s/.*/ peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change

python3 <<CODE

# use snap atac2
import snapatac2 as snap
import scanpy as sc
import numpy as np
import polars as pl

bam = "~{bam}"
bam_base_name = "~{bam_base_name}"
atac_gtf = "~{annotations_gtf}"
metrics_h5ad = "~{metrics_h5ad}"
min_counts = ~{min_counts}
min_tsse = ~{min_tsse}
max_counts = ~{max_counts}

print("Peak calling starting...")
atac_data = snap.read(metrics_h5ad)

# Calculate and plot the size distribution of fragments
print("Calculating fragment size distribution")
snap.pl.frag_size_distr(atac_data)
print(atac_data)

# Filter cells using the task-level thresholds
print("Filtering cells")
snap.pp.filter_cells(atac_data, min_counts=min_counts, min_tsse=min_tsse, max_counts=max_counts)
print(atac_data)

# Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins.
print("Creating cell by bin matrix")
atac_data_mod = snap.pp.add_tile_matrix(atac_data, inplace=False)
print("set obsm")
atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"]
print("set all uns")
for key in atac_data.uns.keys():
    print("set ", key)
    atac_data_mod.uns[key] = atac_data.uns[key]
print(atac_data_mod)

# Feature selection
print("Feature selection")
snap.pp.select_features(atac_data_mod)
print(atac_data_mod)

# Run customized scrublet algorithm to identify potential doublets
print("Run scrublet to identify potential doublets")
snap.pp.scrublet(atac_data_mod)
print(atac_data_mod)

# Employ spectral embedding for dimensionality reduction
print("Employ spectral embedding for dimensionality reduction")
snap.tl.spectral(atac_data_mod)
print(atac_data_mod)

# Filter doublets based on scrublet scores
print("Filter doublets based on scrublet scores")
snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5)
print(atac_data_mod)

# Perform graph-based clustering to identify cell clusters.
# Build a k-nearest neighbour graph using snap.pp.knn
print("Perform knn graph-based clustering to identify cell clusters")
snap.pp.knn(atac_data_mod)
print(atac_data_mod)

# Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph
print("Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph")
snap.tl.leiden(atac_data_mod)
print(atac_data_mod)

# Create the cell by gene activity matrix
print("Create the cell by gene activity matrix")
gene_mat = snap.pp.make_gene_matrix(atac_data_mod, gene_anno=atac_gtf)
print(atac_data_mod)

# Normalize the gene matrix
print("Normalize the gene matrix")
gene_mat.obs['leiden'] = atac_data_mod.obs['leiden']
sc.pp.normalize_total(gene_mat)
sc.pp.log1p(gene_mat)
sc.tl.rank_genes_groups(gene_mat, groupby="leiden", method="wilcoxon")

for i in np.unique(gene_mat.obs['leiden']):
    markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names']
    print(f"Cluster {i}: {', '.join(markers)}")

print("Peak calling using MACS3")
snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=8)

print("Convert pl.DataFrame to pandas DataFrame")
# Convert pl.DataFrame to pandas DataFrame
for key in atac_data_mod.uns.keys():
    if isinstance(atac_data_mod.uns[key], pl.DataFrame):
        print(key)
        atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas()

print("Write into h5ad file")
atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad")
print("test")

CODE
>>>

runtime {
docker: docker_path
disks: "local-disk ${disk_size} SSD"
memory: "${mem_size} GiB"
cpu: nthreads
}

output {
File peaks_h5ad = "~{bam_base_name}.peaks.h5ad"
}

}
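
For reviewers who want to sanity-check the new output, here is a minimal sketch (not part of the PR) of how the peaks h5ad written by PeakCalling could be opened and inspected downstream. The file name is hypothetical, and the 'leiden' column and the presence of MACS3 peak tables in .uns are assumptions based on the task code above; the exact keys may vary with the SnapATAC2 version.

# Hedged inspection sketch; "example.peaks.h5ad" stands in for the real PeakCalling output.
import anndata as ad

adata = ad.read_h5ad("example.peaks.h5ad")     # cell-by-bin matrix with clustering results
print(adata)                                   # overall AnnData summary
print(adata.obs["leiden"].value_counts())      # cells per Leiden cluster (assumed obs column)
print(list(adata.uns.keys()))                  # peak tables from snap.tl.macs3 should appear here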