Commit
Merge pull request #79 from PacificBiosciences/MAG-Mapping
Mag mapping
dportik authored Jun 27, 2024
2 parents f62d8f4 + 96af0c7 commit d9a88ba
Showing 9 changed files with 341 additions and 70 deletions.
148 changes: 91 additions & 57 deletions HiFi-MAG-Pipeline/Snakefile-hifimags.smk

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion HiFi-MAG-Pipeline/config.yaml
@@ -4,8 +4,14 @@
tmpdir: "/scratch"

checkm2:
# The number of threads to use for CheckM.
# The number of threads to use for CheckM2.
threads: 24
# Full path to the diamond database required to run CheckM2.
# This can be obtained via checkm2:
# checkm2 database --download --path /YourPath/CheckM2_database
# It can also be downloaded directly from here, and unpacked after:
# https://zenodo.org/records/5571251/files/checkm2_database.tar.gz?download=1
db: "/dept/appslab/datasets/dp_checkm2/CheckM2_database/uniref100.KO.1.dmnd"

minimap:
# The number of threads to use for minimap2.
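The comment block above covers the two supported routes for obtaining the CheckM2 database. A minimal Python sketch of the direct-download route, assuming the Zenodo URL above is still live (the local file and directory names here are placeholders):

import tarfile
import urllib.request

# Fetch and unpack the CheckM2 diamond database, then point the
# checkm2 -> db entry of config.yaml at the extracted .dmnd file.
url = "https://zenodo.org/records/5571251/files/checkm2_database.tar.gz?download=1"
urllib.request.urlretrieve(url, "checkm2_database.tar.gz")
with tarfile.open("checkm2_database.tar.gz") as tar:
    tar.extractall("CheckM2_database")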
5 changes: 3 additions & 2 deletions HiFi-MAG-Pipeline/envs/python.yml
@@ -4,9 +4,10 @@ channels:
- conda-forge
- defaults
dependencies:
- python == 3.7
- python
- numpy
- pandas
- seaborn
- matplotlib
- biopython == 1.77
- biopython
- pysam == 0.22
1 change: 0 additions & 1 deletion HiFi-MAG-Pipeline/scripts/Convert-JGI-Coverages.py
@@ -1,5 +1,4 @@
import argparse
import os
import pandas as pd

def get_args():
2 changes: 0 additions & 2 deletions HiFi-MAG-Pipeline/scripts/Filter-Checkm2-Bins.py
@@ -2,8 +2,6 @@
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from Bio import SeqIO

def get_args():
64 changes: 64 additions & 0 deletions HiFi-MAG-Pipeline/scripts/bam2paf.py
@@ -0,0 +1,64 @@
import argparse
import pysam
import os

def get_args():
"""
Get arguments from command line with argparse.
"""
parser = argparse.ArgumentParser(
prog='bam2paf.py',
description="""Convert an aligned bam from minimap2 into paf.""")
parser.add_argument("-i", "--input_bam",
required=True,
help="Path to bam file.")
parser.add_argument("-o", "--out_paf",
required=True,
help="Name of output paf file to write.")
return parser.parse_args()

def get_bam_object(input_bam):
print("get_bam_object: Getting pysam bam object.")
return pysam.AlignmentFile(input_bam, 'rb', check_sq=False)

def bam2paf(bam, out_paf):
if os.path.exists(out_paf):
os.remove(out_paf)

print("bam2paf: Converting bam to paf...")
entries = 0
with open(out_paf, 'a') as fh:
for read in bam:
# simple progress tracking
entries += 1
if entries % 100000 == 0:
print("\t{:,} entries".format(entries))
            # get orientation
            strand = '-' if read.is_reverse else '+'
            # record the minimap2 alignment-type (tp) tag, if present
            try:
                aln_type = "tp:A:{}".format(read.get_tag('tp'))
            except KeyError:
                aln_type = ""
            # write the 12 standard PAF columns plus the tp tag; the
            # residue-match column is approximated by the CIGAR M count
fh.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t"
"{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t"
"{12}\n".format(read.query_name, read.query_length, read.query_alignment_start,
read.query_alignment_end, strand, read.reference_name,
read.reference_length, read.reference_start, read.reference_end,
read.get_cigar_stats()[0][0], read.query_alignment_length,
read.mapping_quality, aln_type))
print("bam2paf: Wrote {:,} entries".format(entries))

def main():
args = get_args()
bam = get_bam_object(args.input_bam)
bam2paf(bam, args.out_paf)

if __name__ == '__main__':
main()
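As a quick sanity check on the converter, a sketch along these lines runs the script and prints a few fields from the first records; the file names aligned.bam and aligned.paf are hypothetical:

import subprocess

# convert a minimap2 bam to paf with the script above
subprocess.run(["python", "bam2paf.py", "-i", "aligned.bam", "-o", "aligned.paf"],
               check=True)

# spot-check the first three records; columns follow the write order above:
# 0-3 query name/length/start/end, 4 strand, 5-8 target name/length/start/end,
# 9 matched bases (CIGAR M count), 10 aligned query length, 11 MAPQ, 12 tp tag
with open("aligned.paf") as fh:
    for _, line in zip(range(3), fh):
        fields = line.rstrip("\n").split("\t")
        print(fields[0], fields[4], fields[5], fields[11])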


160 changes: 160 additions & 0 deletions HiFi-MAG-Pipeline/scripts/paf-mapping-summary.py
@@ -0,0 +1,160 @@
import argparse
import os
import pysam
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def get_args():
"""
Get arguments from command line with argparse.
"""
parser = argparse.ArgumentParser(
prog='paf-mapping-summary.py',
description="""Summarize mapped read counts in a PAF from minimap2.""")
parser.add_argument("-p", "--paf",
required=True,
help="A PAF file resulting from minimap2.")
parser.add_argument("-r", "--reads_fasta",
required=True,
help="Fasta file with the HiFi reads used for mapping.")
parser.add_argument("-c", "--contig_list",
required=True,
help="Optional file containing contig names to search for (format: one contig per line).")
parser.add_argument("-o1", "--out_plot",
required=True,
help="Name of output figure.")
parser.add_argument("-o2", "--out_table",
required=True,
help="Name of output table.")
parser.add_argument("-x", "--count",
required=False,
default=None,
help="Read number to skip fasta counting.")
return parser.parse_args()

def get_paf_df(f):
df = pd.read_csv(f, sep='\t', usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
names=['query_name', 'query_length', 'query_start', 'query_end',
'strand', 'target_name', 'target_length', 'target_start',
'target_end', 'matched_bases', 'total_bases', 'quality', 'aln_type'],
header=None)
df['perc_identity'] = round((df['matched_bases'] / df['total_bases'] * 100), 1)
df['perc_read_aligned'] = round(((df['query_end'] - df['query_start']) / df['query_length'] * 100), 1)
return df

def get_paf_df_lite(f):
print('\nget_paf_df_lite: reading paf file')
df = pd.read_csv(f, sep='\t', usecols=[0, 1, 2, 3, 5, 9, 10, 12],
names=['query_name', 'query_length', 'query_start', 'query_end',
'target_name', 'matched_bases', 'total_bases', 'aln_type'],
header=None)
df['perc_identity'] = round((df['matched_bases'] / df['total_bases'] * 100), 1)
df['perc_read_aligned'] = round(((df['query_end'] - df['query_start']) / df['query_length'] * 100), 1)
return df

def get_read_count(reads_fasta):
print('get_read_count: reading fasta file')
read_count = 0
with pysam.FastxFile(reads_fasta) as fh:
for entry in fh:
read_count += 1
if read_count % 200000 == 0:
print("\t{:,} reads".format(read_count))
print("\t{:,} reads".format(read_count))
return read_count

def get_contig_names(contig_list):
print('get_contig_names: gathering MAG contig names')
with open(contig_list, 'r') as fh:
contig_names = [line.strip() for line in fh]
return contig_names

def make_percent(reads, total_reads):
return round(((reads / total_reads) * 100), 1)

def count_mapped_reads(df, read_count, contig_names):
print('count_mapped_reads: summarizing mappings')
print("\nTotal reads used for mapping:\n\t{:,}".format(read_count))
print("Total alignments:\n\t{:,}".format(df.shape[0]))
print("Total reads mapped:\n\t{:,}\n".format(len(df['query_name'].unique())))

# remove multiple entries for a read, keeping the one with the longest aln length
df = df.sort_values('perc_read_aligned', ascending=False).drop_duplicates('query_name').sort_index()

# create dict to store percent values for table and figure
mapping_dict = {}
mapping_dict["percent_ID"] = ["90%", "95%", "99%"]
mapping_dict["contigs"] = [make_percent(df[(df['perc_read_aligned'] >= 90) & (df['perc_identity'] >= 90) &
(df['aln_type'] == "tp:A:P")].shape[0], read_count)]
mapping_dict["contigs"].append(make_percent(df[(df['perc_read_aligned'] >= 90) & (df['perc_identity'] >= 95) &
(df['aln_type'] == "tp:A:P")].shape[0], read_count))
mapping_dict["contigs"].append(make_percent(df[(df['perc_read_aligned'] >= 90) & (df['perc_identity'] >= 99) &
(df['aln_type'] == "tp:A:P")].shape[0], read_count))
mapping_dict["mags"] = [make_percent(df[(df["target_name"].isin(contig_names)) &
(df['perc_read_aligned'] >= 90) & (df['perc_identity'] >= 90) &
(df['aln_type'] == "tp:A:P")].shape[0], read_count)]
mapping_dict["mags"].append(make_percent(df[(df["target_name"].isin(contig_names)) &
(df['perc_read_aligned'] >= 90) & (df['perc_identity'] >= 95) &
(df['aln_type'] == "tp:A:P")].shape[0], read_count))
mapping_dict["mags"].append(make_percent(df[(df["target_name"].isin(contig_names)) &
(df['perc_read_aligned'] >= 90) & (df['perc_identity'] >= 99) &
(df['aln_type'] == "tp:A:P")].shape[0], read_count))

return pd.DataFrame.from_dict(mapping_dict)

def make_palette(hex_code):
return sns.color_palette(sns.light_palette(hex_code, input='rgb', as_cmap=False, n_colors=6).as_hex()[1:] +
sns.dark_palette(hex_code, input='rgb', as_cmap=False, n_colors=6).as_hex()[-2:0:-1])

def plot_mapped(df_plot, out_plot):
print('\nplot_mapped: plotting figure')
if os.path.exists(out_plot):
os.remove(out_plot)

pb_blue = make_palette("#1383C6")
pb_green = make_palette("#009D4E")

ax = df_plot.plot.bar(stacked=False, figsize=(6, 7), width=0.7, color=[pb_green[3], pb_blue[4]],
fontsize='x-large', edgecolor='black', linewidth=0.5, ylim=(0, 105))
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, ["Contigs", "MAGs"], bbox_to_anchor=(1, 0.99),
fontsize='large', ncol=1, labelspacing=0.3)
ax.set_xticklabels(df_plot['percent_ID'], rotation=0, ha='center', fontsize='x-large')
for x, y in enumerate(df_plot["contigs"]):
ax.annotate("{:,}".format(y), (x - 0.175, y + (0.02 * df_plot["contigs"].max())),
ha='center', color="black", fontweight="regular", fontsize='large')
for x, y in enumerate(df_plot["mags"]):
ax.annotate("{:,}".format(y), (x + 0.175, y + (0.02 * df_plot["mags"].max())),
ha='center', color="black", fontweight="regular", fontsize='large')
ax.set_xlabel("Minimum percent ID threshold", fontsize="x-large")
ax.set_ylabel("Percent reads mapped (%)", fontsize="x-large")
ax.figure.savefig("{}".format(out_plot))
plt.close()

def write_table(df_plot, out_table):
if os.path.exists(out_table):
os.remove(out_table)
df_plot.to_csv("{}".format(out_table), index=False, sep='\t')

def main():
args = get_args()
df = get_paf_df_lite(args.paf)
if args.count is None:
read_count = get_read_count(args.reads_fasta)
else:
read_count = int(args.count)
contig_names = get_contig_names(args.contig_list)
df_plot = count_mapped_reads(df, read_count, contig_names)
print(df_plot)
plot_mapped(df_plot, args.out_plot)
write_table(df_plot, args.out_table)

if __name__ == '__main__':
main()
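To make the thresholding concrete, here is a toy example (values invented for illustration) that reproduces the per-read metrics computed in get_paf_df_lite and filtered in count_mapped_reads:

import pandas as pd

# two invented primary alignments, in the lite column layout
rows = [("read1", 10000, 0, 9800, "contig_1", 9700, 9800, "tp:A:P"),
        ("read2", 12000, 500, 11000, "contig_2", 9000, 10500, "tp:A:P")]
df = pd.DataFrame(rows, columns=["query_name", "query_length", "query_start",
                                 "query_end", "target_name", "matched_bases",
                                 "total_bases", "aln_type"])
df['perc_identity'] = round((df['matched_bases'] / df['total_bases'] * 100), 1)
df['perc_read_aligned'] = round(((df['query_end'] - df['query_start']) / df['query_length'] * 100), 1)
# read1: 99.0% identity across 98.0% of the read -> counted at all three thresholds
# read2: 85.7% identity across 87.5% of the read -> fails the 90% aligned filter
print(df[['query_name', 'perc_identity', 'perc_read_aligned']])

Each read then contributes at most once (the alignment with the highest perc_read_aligned is kept), and counts are converted to percentages of the total read count taken from the fasta (or supplied via the -x shortcut).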


13 changes: 9 additions & 4 deletions README.md
@@ -6,7 +6,7 @@

Welcome! Here you can find a variety of tools and pipelines for HiFi metagenomics. In addition to the resources currently available, we will continue to add new tools as they are developed.

The current version is v2.1.0. Please see the [**release notes**](https://github.com/PacificBiosciences/pb-metagenomics-tools/releases) for changes.
The current version is v3.1.0. Please see the [**release notes**](https://github.com/PacificBiosciences/pb-metagenomics-tools/releases) for changes.

## HiFi Metagenomic Datasets & Publications

@@ -18,13 +18,16 @@ A running list of publications using HiFi sequencing for metagenomics can be fou

+ [**HiFi-MAG-Pipeline**](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/HiFi-MAG-Pipeline): Identify high-quality MAGs from HiFi metagenomic assemblies. Streamlined workflow includes a custom "completeness-aware" strategy to identify and protect long and complete contigs. Binning is performed with MetaBAT2 and SemiBin2, bin merging occurs with DAS_Tool, QC with CheckM2, and taxonomic assignments with GTDB-Tk. Outputs include high-quality MAG sequences, summary figures, and associated metadata.

+ [**pb-MAG-mirror**](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/pb-MAG-mirror): Compare two sets of MAGs to quantify the numbers of highly similar, mixed, and unique MAGs, and consolidate the sets into a single non-redundant set. This workflow can be used to compare MAGs (intended use-case) or bins, but it requires the contigs to originate from the same assembly method.

+ [**Taxonomic-Profiling-Sourmash**](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/Taxonomic-Profiling-Sourmash): Obtain taxonomic profiles using the `sourmash gather` --> `taxonomy` approach. Sourmash provides GTDB and NCBI databases, or you can build your own database.

+ [**Taxonomic-Profiling-Diamond-Megan**](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/Taxonomic-Profiling-Diamond-Megan): Perform translation alignment of HiFi reads to a **protein** database using DIAMOND and summarize with MEGAN-LR, for the purpose of taxonomic and functional profiling. Provides access to taxonomic annotations from NCBI and GTDB, and outputs NCBI taxonomic counts in kraken (kreport) and metaphlan (mpa) formats. Also provides functional annotations based on multiple databases (EC, SEED, InterPro2GO, eggNOG), with new KEGG option available.

+ [**Taxonomic-Profiling-Minimap-Megan**](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/Taxonomic-Profiling-Minimap-Megan): Align HiFi reads to a **nucleotide** database using minimap2 and summarize with MEGAN-LR, for the purpose of taxonomic profiling. Provides access to NCBI and GTDB taxonomic annotations. Outputs NCBI taxonomic counts in kraken (kreport) and metaphlan (mpa) formats.

+ [**Taxonomic-Profiling-Sourmash**](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/Taxonomic-Profiling-Sourmash): Obtain taxonomic profiles using the `sourmash gather` --> `taxonomy` approach. Sourmash provides GTDB and NCBI databases, or you can build your own database.

Each of these pipelines can be found in their respective folders. They are made available as [Snakemake](https://snakemake.readthedocs.io/en/stable/index.html) workflows. Snakemake is a Python-based workflow manager. Snakemake workflows are highly portable because dependencies and environments are automatically set up using [Anaconda](https://docs.anaconda.com/anaconda/)/[Conda](https://docs.conda.io/projects/conda/en/latest/index.html). Snakemake also allows reproducibility, checkpointing, and the ability to scale workflows using HPC and cloud environments. Snakemake v5+ is required, and the workflows have been tested using v5.26+. You can optionally install snakemake from the provided conda environment file (`conda env create -f environment.yml`), and then activate this environment via `conda activate pb-metagenomics-tools` when you want to run any of these workflows.
Each of these pipelines can be found in their respective folders. They are made available as [Snakemake](https://snakemake.readthedocs.io/en/stable/index.html) workflows. Snakemake is a Python-based workflow manager. Snakemake workflows are highly portable because dependencies and environments are automatically set up using [Anaconda](https://docs.anaconda.com/anaconda/)/[Conda](https://docs.conda.io/projects/conda/en/latest/index.html). Snakemake also allows reproducibility, checkpointing, and the ability to scale workflows using HPC and cloud environments. Snakemake v5+ is required. You can optionally install snakemake from the provided conda environment file (`conda env create -f environment.yml`), and then activate this environment via `conda activate pb-metagenomics-tools` when you want to run any of these workflows.

## Other tools

@@ -40,9 +43,11 @@ All documentation for snakemake pipelines can be found in the `docs/` folder abo

Available pipeline documentation:
- [**Tutorial: HiFi-MAG-Pipeline**](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Tutorial-HiFi-MAG-Pipeline.md)
- [**Tutorial: pb-MAG-mirror**](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Tutorial-pb-MAG-mirror.md)
- [**Tutorial: Taxonomic-Profiling-Sourmash**](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Tutorial-Taxonomic-Profiling-Sourmash.md)
- [**Tutorial: Taxonomic-Profiling-Diamond-Megan**](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Tutorial-Taxonomic-Profiling-Diamond-Megan.md)
- [**Tutorial: Taxonomic-Profiling-Minimap-Megan**](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Tutorial-Taxonomic-Profiling-Minimap-Megan.md)
- [**Tutorial: Taxonomic-Profiling-Sourmash**](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Tutorial-Taxonomic-Profiling-Sourmash.md)


The documentation for [pb-metagenomics-scripts](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/pb-metagenomics-scripts) is provided in the folder.

