diff --git a/HiFi-MAG-Pipeline/Snakefile-hifimags.smk b/HiFi-MAG-Pipeline/Snakefile-hifimags.smk
index 63be257..d5e0e00 100644
--- a/HiFi-MAG-Pipeline/Snakefile-hifimags.smk
+++ b/HiFi-MAG-Pipeline/Snakefile-hifimags.smk
@@ -1,9 +1,9 @@
 import os
 
 localrules:
-    Checkm2Database, LongContigsToBins, CloseLongbinFork, StopLongBinCheckm2, FilterCompleteContigs,
+    LongContigsToBins, CloseLongbinFork, StopLongBinCheckm2, FilterCompleteContigs,
     ConvertJGIBamDepth, DASinputMetabat2, DASinputSemiBin2, CopyDAStoolBins, AssessCheckm2Bins,
-    CloseCheckm2Fork, SkipGTDBAnalysis, GTDBTkCleanup, MAGSummary, MAGPlots, all
+    CloseCheckm2Fork, SkipGTDBAnalysis, GTDBTkCleanup, MAGSummary, MAGContigNames, MAGMappingPlots, MAGPlots, all
 
 configfile: "config.yaml"
 
@@ -19,28 +19,6 @@ rule all:
     input:
         expand(os.path.join(CWD,"6-checkm2","{sample}","{sample}.BinCount.txt"), sample = SAMPLES),
         expand(os.path.join(CWD,"8-summary","{sample}","{sample}.Complete.txt"), sample = SAMPLES)
 
-##################################################################################################
-# Setup CheckM2
-
-rule Checkm2Database:
-    input:
-        contigs = expand(os.path.join(CWD, "inputs", "{sample}.contigs.fasta"), sample = SAMPLES)
-    output:
-        key = os.path.join(CWD, "CheckM2_database", "uniref100.KO.1.dmnd"),
-        complete = os.path.join(CWD, "CheckM2_database", "CheckM2.complete.txt")
-    conda:
-        "envs/checkm2.yml"
-    threads:
-        1
-    params:
-        installdir = CWD
-    log:
-        os.path.join(CWD, "logs", "Checkm2Database.log")
-    benchmark:
-        os.path.join(CWD, "benchmarks", "Checkm2Database.tsv")
-    shell:
-        "checkm2 database --download --path {params.installdir} &> {log} && touch {output.complete}"
-
 ##################################################################################################
 # Completeness-aware binning steps
 
@@ -77,15 +55,15 @@ rule CloseLongbinFork:
     input:
         get_long_binning_targets
     output:
-        incomplete_contigs = os.path.join(CWD,"1-long-contigs","{sample}","{sample}.incomplete_contigs.fasta"),
+        incomplete_contigs = os.path.join(CWD, "1-long-contigs", "{sample}", "{sample}.incomplete_contigs.fasta"),
         complete = os.path.join(CWD, "1-long-contigs", "{sample}", "{sample}.LongBinCompleted.txt"),
-        outdir = directory(os.path.join(CWD,"5-dereplicated-bins","{sample}",""))
+        outdir = directory(os.path.join(CWD, "5-dereplicated-bins", "{sample}", ""))
     conda:
         "envs/python.yml"
     threads:
         1
     params:
-        contigs = os.path.join(CWD,"inputs","{sample}.contigs.fasta"),
+        contigs = os.path.join(CWD, "inputs", "{sample}.contigs.fasta"),
         fastadir = os.path.join(CWD, "1-long-contigs", "{sample}", "bins", "")
     log:
         os.path.join(CWD, "logs", "{sample}.CloseLongbinFork.log")
@@ -122,14 +100,15 @@ rule Checkm2ContigAnalysis:
     params:
         indir = os.path.join(CWD, "1-long-contigs", "{sample}", "bins", ""),
         outdir = os.path.join(CWD, "1-long-contigs", "{sample}", "checkm2", ""),
-        tmp = config["tmpdir"]
+        tmp = config["tmpdir"],
+        db = config["checkm2"]["db"]
     log:
         os.path.join(CWD, "logs", "{sample}.Checkm2ContigAnalysis.log")
     benchmark:
         os.path.join(CWD, "benchmarks", "{sample}.Checkm2ContigAnalysis.tsv")
    shell:
         "checkm2 predict -i {params.indir} -o {params.outdir} -x fa -t {threads} --force "
-        "--database_path {input.db} --remove_intermediates --tmpdir {params.tmp} &> {log}"
+        "--database_path {params.db} --remove_intermediates --tmpdir {params.tmp} &> {log}"
 
 # Checkpoint 1: Fork 2 - Long contigs found, sample moves through Checkm2ContigAnalysis -> FilterCompleteContigs
 rule FilterCompleteContigs:
@@ -211,6 +190,22 @@ rule IndexBam:
     shell:
         "samtools index -@ {threads} {input} &> {log} && touch {output.complete}"
 
+rule Bam2Paf:
+    input:
+        os.path.join(CWD, "2-bam", "{sample}.bam")
+    output:
+        os.path.join(CWD, "2-bam", "{sample}.paf")
+    conda:
+        "envs/python.yml"
+    threads:
+        1
+    log:
+        os.path.join(CWD, "logs", "{sample}.Bam2Paf.log")
+    benchmark:
+        os.path.join(CWD, "benchmarks", "{sample}.Bam2Paf.tsv")
+    shell:
+        "python scripts/bam2paf.py -i {input} -o {output} &> {log}"
+
 rule JGIBamDepth:
     input:
         complete = os.path.join(CWD, "2-bam", "{sample}.index.completed.txt"),
@@ -290,7 +285,7 @@ rule SemiBin2Analysis:
         bam = os.path.join(CWD, "2-bam", "{sample}.bam")
     output:
         bins = os.path.join(CWD, "3-semibin2", "{sample}", "bins_info.tsv"),
-        outdir = directory(os.path.join(CWD,"3-semibin2","{sample}",""))
+        outdir = directory(os.path.join(CWD, "3-semibin2", "{sample}", ""))
     conda:
         "envs/semibin.yml"
     threads:
@@ -310,7 +305,7 @@ rule SemiBin2Analysis:
 
 rule DASinputSemiBin2:
     input:
-        os.path.join(CWD,"3-semibin2","{sample}","")
+        os.path.join(CWD, "3-semibin2", "{sample}", "")
     output:
         os.path.join(CWD, "4-DAStool", "{sample}.semibin2.tsv")
     conda:
@@ -328,7 +323,7 @@ rule DASinputSemiBin2:
 rule DAStoolAnalysis:
     input:
         metabat = os.path.join(CWD, "4-DAStool", "{sample}.metabat2.tsv"),
-        semibin = os.path.join(CWD,"4-DAStool","{sample}.semibin2.tsv"),
+        semibin = os.path.join(CWD, "4-DAStool", "{sample}.semibin2.tsv"),
         contigs = os.path.join(CWD, "1-long-contigs", "{sample}", "{sample}.incomplete_contigs.fasta")
     output:
         binsdir = directory(os.path.join(CWD, "4-DAStool", "{sample}", "{sample}_DASTool_bins", "")),
@@ -355,7 +350,7 @@ rule CopyDAStoolBins:
     input:
         binsdir = os.path.join(CWD, "4-DAStool", "{sample}", "{sample}_DASTool_bins", ""),
         complete = os.path.join(CWD, "4-DAStool", "{sample}.Complete.txt"),
-        copydir = os.path.join(CWD,"5-dereplicated-bins","{sample}","")
+        copydir = os.path.join(CWD, "5-dereplicated-bins", "{sample}", "")
     output:
         os.path.join(CWD,"5-dereplicated-bins","{sample}.bins_copied.txt")
     conda:
@@ -372,8 +367,7 @@ rule CopyDAStoolBins:
 
 rule Checkm2BinAnalysis:
     input:
-        db = os.path.join(CWD, "CheckM2_database", "uniref100.KO.1.dmnd"),
-        derep_bins = os.path.join(CWD,"5-dereplicated-bins","{sample}.bins_copied.txt")
+        derep_bins = os.path.join(CWD, "5-dereplicated-bins", "{sample}.bins_copied.txt")
     output:
         qv = os.path.join(CWD, "6-checkm2", "{sample}", "checkm2", "quality_report.tsv")
     conda:
@@ -383,14 +377,15 @@ rule Checkm2BinAnalysis:
     params:
         indir = os.path.join(CWD, "5-dereplicated-bins", "{sample}", ""),
         outdir = os.path.join(CWD, "6-checkm2", "{sample}", "checkm2", ""),
-        tmp = config["tmpdir"]
+        tmp = config["tmpdir"],
+        db = config["checkm2"]["db"]
     log:
         os.path.join(CWD, "logs", "{sample}.Checkm2BinAnalysis.log")
     benchmark:
         os.path.join(CWD, "benchmarks", "{sample}.Checkm2BinAnalysis.tsv")
     shell:
         "checkm2 predict -i {params.indir} -o {params.outdir} -x fa -t {threads} --force "
-        "--remove_intermediates --database_path {input.db} --tmpdir {params.tmp} &> {log}"
+        "--remove_intermediates --database_path {params.db} --tmpdir {params.tmp} &> {log}"
 
 # Checkpoint 2 - Ensure there are bins after CheckM2, before running GTDB-Tk and the summary
 checkpoint AssessCheckm2Bins:
@@ -399,7 +394,7 @@
         depth = os.path.join(CWD, "2-bam", "{sample}.JGI.depth.txt")
     output:
         gtdb = os.path.join(CWD, "6-checkm2", "{sample}", "{sample}.GTDBTk_batch_file.txt"),
-        target = os.path.join(CWD,"6-checkm2","{sample}","{sample}.BinCount.txt"),
+        target = os.path.join(CWD, "6-checkm2", "{sample}", "{sample}.BinCount.txt"),
"{sample}.BinCount.txt"), output_tsv = os.path.join(CWD, "6-checkm2", "{sample}", "{sample}.quality_report.tsv") conda: "envs/python.yml" @@ -424,14 +419,14 @@ def get_post_checkm2_inputs(wildcards): if int(fh.read().strip()) > 0: return os.path.join(CWD, "8-summary", "{sample}", "{sample}.Completeness-Contamination-Contigs.pdf") else: - return os.path.join(CWD,"8-summary","{sample}","{sample}.No_MAGs.summary.txt") + return os.path.join(CWD, "8-summary", "{sample}", "{sample}.No_MAGs.summary.txt") # Checkpoint 2 aggregator; close the fork; outputs '/8-summary/SAMPLE/SAMPLE.Complete.txt' rule CloseCheckm2Fork: input: get_post_checkm2_inputs output: - os.path.join(CWD,"8-summary","{sample}","{sample}.Complete.txt") + os.path.join(CWD, "8-summary", "{sample}", "{sample}.Complete.txt") shell: "touch {output}" @@ -440,10 +435,10 @@ rule CloseCheckm2Fork: rule SkipGTDBAnalysis: input: gtdb = os.path.join(CWD, "6-checkm2", "{sample}", "{sample}.GTDBTk_batch_file.txt"), - target = os.path.join(CWD,"6-checkm2","{sample}","{sample}.BinCount.txt"), - output_tsv = os.path.join(CWD,"6-checkm2","{sample}","{sample}.quality_report.tsv") + target = os.path.join(CWD, "6-checkm2", "{sample}", "{sample}.BinCount.txt"), + output_tsv = os.path.join(CWD, "6-checkm2", "{sample}", "{sample}.quality_report.tsv") output: - os.path.join(CWD,"8-summary","{sample}","{sample}.No_MAGs.summary.txt") + os.path.join(CWD, "8-summary", "{sample}", "{sample}.No_MAGs.summary.txt") threads: 1 params: @@ -453,16 +448,16 @@ rule SkipGTDBAnalysis: "see {input.output_tsv} for more information > {output}" ############################## -# Checkpoint 2: Fork 2 - Bins passed filters; GTDBTkAnalysis -> GTDBTkCleanup -> MAGSummary -> MAGCopy -> MAGPlots +# Checkpoint 2: Fork 2 - Bins passed filters; GTDBTkAnalysis -> GTDBTkCleanup -> MAGSummary -> MAGCopy -> MAGContigNames -> MAGmappingPlots -> MAGPlots rule GTDBTkAnalysis: input: gtdb = os.path.join(CWD, "6-checkm2", "{sample}", "{sample}.GTDBTk_batch_file.txt"), - target = os.path.join(CWD,"6-checkm2","{sample}","{sample}.BinCount.txt") + target = os.path.join(CWD, "6-checkm2", "{sample}", "{sample}.BinCount.txt") output: dir_align = directory(os.path.join(CWD, "7-gtdbtk", "{sample}", "align", "")), dir_classify = directory(os.path.join(CWD, "7-gtdbtk", "{sample}", "classify", "")), dir_identify = directory(os.path.join(CWD, "7-gtdbtk", "{sample}", "identify", "")), - complete = os.path.join(CWD,"7-gtdbtk","{sample}","{sample}.Complete.txt") + complete = os.path.join(CWD, "7-gtdbtk", "{sample}", "{sample}.Complete.txt") conda: "envs/gtdbtk.yml" threads: @@ -479,11 +474,11 @@ rule GTDBTkAnalysis: "--out_dir {params.outdir} -x fa --prefix {wildcards.sample} --cpus {threads} " " &> {log} && touch {output.complete}" -# Checkpoint 2: Fork 2 - Bins passed filters; GTDBTkAnalysis -> GTDBTkCleanup -> MAGSummary -> MAGCopy -> MAGPlots +# Checkpoint 2: Fork 2 - Bins passed filters; GTDBTkAnalysis -> GTDBTkCleanup -> MAGSummary -> MAGCopy -> MAGContigNames -> MAGmappingPlots -> MAGPlots rule GTDBTkCleanup: input: dir_classify = os.path.join(CWD, "7-gtdbtk", "{sample}", "classify", ""), - complete = os.path.join(CWD,"7-gtdbtk","{sample}","{sample}.Complete.txt") + complete = os.path.join(CWD, "7-gtdbtk", "{sample}", "{sample}.Complete.txt") output: os.path.join(CWD,"7-gtdbtk","{sample}","{sample}.GTDBTk_Summary.txt") conda: @@ -495,13 +490,13 @@ rule GTDBTkCleanup: shell: "python scripts/GTDBTk-Organize.py -i {input.dir_classify} -o {output} &> {log}" -# Checkpoint 2: Fork 2 - Bins passed 
-# Checkpoint 2: Fork 2 - Bins passed filters; GTDBTkAnalysis -> GTDBTkCleanup -> MAGSummary -> MAGCopy -> MAGPlots
+# Checkpoint 2: Fork 2 - Bins passed filters; GTDBTkAnalysis -> GTDBTkCleanup -> MAGSummary -> MAGCopy -> MAGContigNames -> MAGMappingPlots -> MAGPlots
 rule MAGSummary:
     input:
-        gtdbtk = os.path.join(CWD,"7-gtdbtk","{sample}","{sample}.GTDBTk_Summary.txt"),
+        gtdbtk = os.path.join(CWD, "7-gtdbtk", "{sample}", "{sample}.GTDBTk_Summary.txt"),
         checkm2 = os.path.join(CWD, "6-checkm2", "{sample}", "{sample}.quality_report.tsv")
     output:
-        os.path.join(CWD,"8-summary","{sample}","{sample}.HiFi_MAG.summary.txt")
+        os.path.join(CWD, "8-summary", "{sample}", "{sample}.HiFi_MAG.summary.txt")
     conda:
         "envs/python.yml"
     threads:
@@ -511,13 +506,13 @@
     shell:
         "python scripts/MAG-Summary.py -g {input.gtdbtk} -c {input.checkm2} -o {output} &> {log}"
 
-# Checkpoint 2: Fork 2 - Bins passed filters; GTDBTkAnalysis -> GTDBTkCleanup -> MAGSummary -> MAGCopy -> MAGPlots
+# Checkpoint 2: Fork 2 - Bins passed filters; GTDBTkAnalysis -> GTDBTkCleanup -> MAGSummary -> MAGCopy -> MAGContigNames -> MAGMappingPlots -> MAGPlots
 rule MAGCopy:
     input:
-        mag_sum = os.path.join(CWD,"8-summary","{sample}","{sample}.HiFi_MAG.summary.txt"),
+        mag_sum = os.path.join(CWD, "8-summary", "{sample}", "{sample}.HiFi_MAG.summary.txt"),
         mag_dir = os.path.join(CWD, "5-dereplicated-bins", "{sample}", "")
     output:
-        directory(os.path.join(CWD,"8-summary","{sample}", "MAGs", ""))
+        directory(os.path.join(CWD, "8-summary", "{sample}", "MAGs", ""))
     conda:
         "envs/python.yml"
     threads:
@@ -527,12 +522,51 @@
     shell:
         "python scripts/Copy-Final-MAGs.py -i {input.mag_sum} -m {input.mag_dir} -o {output} &> {log}"
 
-# Checkpoint 2: Fork 2 - Bins passed filters; GTDBTkAnalysis -> GTDBTkCleanup -> MAGSummary -> MAGCopy -> MAGPlots
+# Checkpoint 2: Fork 2 - Bins passed filters; GTDBTkAnalysis -> GTDBTkCleanup -> MAGSummary -> MAGCopy -> MAGContigNames -> MAGMappingPlots -> MAGPlots
+rule MAGContigNames:
+    input:
+        mag_sum = os.path.join(CWD, "8-summary", "{sample}", "{sample}.HiFi_MAG.summary.txt"),
+        mag_dir = os.path.join(CWD, "8-summary", "{sample}", "MAGs", "")
+    output:
+        os.path.join(CWD, "2-bam", "{sample}.MAG_contigs.txt")
+    threads:
+        1
+    log:
+        os.path.join(CWD, "logs", "{sample}.MAGContigNames.log")
+    benchmark:
+        os.path.join(CWD, "benchmarks", "{sample}.MAGContigNames.tsv")
+    shell:
+        "grep -h '>' {input.mag_dir}*.fa | cut -d'>' -f2 1> {output} 2> {log}"
+
+# Checkpoint 2: Fork 2 - Bins passed filters; GTDBTkAnalysis -> GTDBTkCleanup -> MAGSummary -> MAGCopy -> MAGContigNames -> MAGMappingPlots -> MAGPlots
+rule MAGMappingPlots:
+    input:
+        contig_names = os.path.join(CWD, "2-bam", "{sample}.MAG_contigs.txt"),
+        reads = os.path.join(CWD, "inputs", "{sample}.fasta"),
+        paf = os.path.join(CWD, "2-bam", "{sample}.paf")
+    output:
+        o1 = os.path.join(CWD, "8-summary", "{sample}", "{sample}.ReadsMapped.pdf"),
+        o2 = os.path.join(CWD, "8-summary", "{sample}", "{sample}.ReadsMapped.txt")
+    conda:
+        "envs/python.yml"
+    threads:
+        4
+    log:
+        os.path.join(CWD, "logs", "{sample}.MAGMappingPlots.log")
+    benchmark:
+        os.path.join(CWD, "benchmarks", "{sample}.MAGMappingPlots.tsv")
+    shell:
+        "python scripts/paf-mapping-summary.py -p {input.paf} -r {input.reads} -c {input.contig_names} "
+        "-o1 {output.o1} -o2 {output.o2} &> {log}"
+
+
+# Checkpoint 2: Fork 2 - Bins passed filters; GTDBTkAnalysis -> GTDBTkCleanup -> MAGSummary -> MAGCopy -> MAGContigNames -> MAGMappingPlots -> MAGPlots
 rule MAGPlots:
     input:
         checkm2_tsv = os.path.join(CWD, "6-checkm2", "{sample}", "{sample}.quality_report.tsv"),
"{sample}", "{sample}.quality_report.tsv"), - mag_dir = os.path.join(CWD,"8-summary","{sample}", "MAGs", ""), - mag_eval = os.path.join(CWD,"8-summary","{sample}","{sample}.HiFi_MAG.summary.txt") + mag_dir = os.path.join(CWD, "8-summary", "{sample}", "MAGs", ""), + mag_eval = os.path.join(CWD, "8-summary", "{sample}", "{sample}.HiFi_MAG.summary.txt"), + mapped_fig = os.path.join(CWD, "8-summary", "{sample}", "{sample}.ReadsMapped.pdf") output: o1 = os.path.join(CWD, "8-summary", "{sample}", "{sample}.All-DASTool-Bins.pdf"), o2 = os.path.join(CWD, "8-summary", "{sample}", "{sample}.Completeness-Contamination-Contigs.pdf"), diff --git a/HiFi-MAG-Pipeline/config.yaml b/HiFi-MAG-Pipeline/config.yaml index 56fa042..ad29603 100644 --- a/HiFi-MAG-Pipeline/config.yaml +++ b/HiFi-MAG-Pipeline/config.yaml @@ -4,8 +4,14 @@ tmpdir: "/scratch" checkm2: - # The number of threads to use for CheckM. + # The number of threads to use for CheckM2. threads: 24 + # Full path to the diamond database required to run CheckM2. + # This can be obtained via checkm2: + # checkm2 database --download --path /YourPath/CheckM2_database + # It can also be downloaded directly from here, and unpacked after: + # https://zenodo.org/records/5571251/files/checkm2_database.tar.gz?download=1 + db: "/dept/appslab/datasets/dp_checkm2/CheckM2_database/uniref100.KO.1.dmnd" minimap: # The number of threads to use for minimap2. diff --git a/HiFi-MAG-Pipeline/envs/python.yml b/HiFi-MAG-Pipeline/envs/python.yml index 10617b4..c6aa3f4 100644 --- a/HiFi-MAG-Pipeline/envs/python.yml +++ b/HiFi-MAG-Pipeline/envs/python.yml @@ -4,9 +4,10 @@ channels: - conda-forge - defaults dependencies: - - python == 3.7 + - python - numpy - pandas - seaborn - matplotlib - - biopython == 1.77 \ No newline at end of file + - biopython + - pysam == 0.22 \ No newline at end of file diff --git a/HiFi-MAG-Pipeline/scripts/Convert-JGI-Coverages.py b/HiFi-MAG-Pipeline/scripts/Convert-JGI-Coverages.py index e468657..90e5189 100644 --- a/HiFi-MAG-Pipeline/scripts/Convert-JGI-Coverages.py +++ b/HiFi-MAG-Pipeline/scripts/Convert-JGI-Coverages.py @@ -1,5 +1,4 @@ import argparse -import os import pandas as pd def get_args(): diff --git a/HiFi-MAG-Pipeline/scripts/Filter-Checkm2-Bins.py b/HiFi-MAG-Pipeline/scripts/Filter-Checkm2-Bins.py index f8d5485..a6867c8 100644 --- a/HiFi-MAG-Pipeline/scripts/Filter-Checkm2-Bins.py +++ b/HiFi-MAG-Pipeline/scripts/Filter-Checkm2-Bins.py @@ -2,8 +2,6 @@ import os import numpy as np import pandas as pd -import seaborn as sns -import matplotlib.pyplot as plt from Bio import SeqIO def get_args(): diff --git a/HiFi-MAG-Pipeline/scripts/bam2paf.py b/HiFi-MAG-Pipeline/scripts/bam2paf.py new file mode 100644 index 0000000..934132e --- /dev/null +++ b/HiFi-MAG-Pipeline/scripts/bam2paf.py @@ -0,0 +1,64 @@ +import argparse +import pysam +import os + +def get_args(): + """ + Get arguments from command line with argparse. 
+ """ + parser = argparse.ArgumentParser( + prog='bam2paf.py', + description="""Convert an aligned bam from minimap2 into paf.""") + parser.add_argument("-i", "--input_bam", + required=True, + help="Path to bam file.") + parser.add_argument("-o", "--out_paf", + required=True, + help="Name of output paf file to write.") + return parser.parse_args() + +def get_bam_object(input_bam): + print("get_bam_object: Getting pysam bam object.") + return pysam.AlignmentFile(input_bam, 'rb', check_sq=False) + +def bam2paf(bam, out_paf): + if os.path.exists(out_paf): + os.remove(out_paf) + + print("bam2paf: Converting bam to paf...") + entries = 0 + with open(out_paf, 'a') as fh: + for read in bam: + # simple progress tracking + entries += 1 + if entries % 100000 == 0: + print("\t{:,} entries".format(entries)) + # get orientation + if read.is_reverse is True: + strand = '-' + else: + strand = '+' + # see if tp tag is present + try: + aln_type = "tp:A:{}".format(read.get_tag('tp')) + except: + aln_type = "" + # write paf format + fh.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t" + "{6}\t{7}\t{8}\t{9}\t{10}\t{11}\t" + "{12}\n".format(read.query_name, read.query_length, read.query_alignment_start, + read.query_alignment_end, strand, read.reference_name, + read.reference_length, read.reference_start, read.reference_end, + read.get_cigar_stats()[0][0], read.query_alignment_length, + read.mapping_quality, aln_type)) + print("bam2paf: Wrote {:,} entries".format(entries)) + +def main(): + args = get_args() + bam = get_bam_object(args.input_bam) + bam2paf(bam, args.out_paf) + +if __name__ == '__main__': + main() + + diff --git a/HiFi-MAG-Pipeline/scripts/paf-mapping-summary.py b/HiFi-MAG-Pipeline/scripts/paf-mapping-summary.py new file mode 100644 index 0000000..8c87b51 --- /dev/null +++ b/HiFi-MAG-Pipeline/scripts/paf-mapping-summary.py @@ -0,0 +1,160 @@ +import argparse +import os +import pysam +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt + +def get_args(): + """ + Get arguments from command line with argparse. 
+ """ + parser = argparse.ArgumentParser( + prog='paf-mapping-summary.py', + description="""Summarize mapped read counts in a PAF from minimap2.""") + parser.add_argument("-p", "--paf", + required=True, + help="A PAF file resulting from minimap2.") + parser.add_argument("-r", "--reads_fasta", + required=True, + help="Fasta file with the HiFi reads used for mapping.") + parser.add_argument("-c", "--contig_list", + required=True, + help="Optional file containing contig names to search for (format: one contig per line).") + parser.add_argument("-o1", "--out_plot", + required=True, + help="Name of output figure.") + parser.add_argument("-o2", "--out_table", + required=True, + help="Name of output table.") + parser.add_argument("-x", "--count", + required=False, + default=None, + help="Read number to skip fasta counting.") + return parser.parse_args() + +def get_paf_df(f): + df = pd.read_csv(f, sep='\t', usecols=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], + names=['query_name', 'query_length', 'query_start', 'query_end', + 'strand', 'target_name', 'target_length', 'target_start', + 'target_end', 'matched_bases', 'total_bases', 'quality', 'aln_type'], + header=None) + df['perc_identity'] = round((df['matched_bases'] / df['total_bases'] * 100), 1) + df['perc_read_aligned'] = round(((df['query_end'] - df['query_start']) / df['query_length'] * 100), 1) + return df + +def get_paf_df_lite(f): + print('\nget_paf_df_lite: reading paf file') + df = pd.read_csv(f, sep='\t', usecols=[0, 1, 2, 3, 5, 9, 10, 12], + names=['query_name', 'query_length', 'query_start', 'query_end', + 'target_name', 'matched_bases', 'total_bases', 'aln_type'], + header=None) + df['perc_identity'] = round((df['matched_bases'] / df['total_bases'] * 100), 1) + df['perc_read_aligned'] = round(((df['query_end'] - df['query_start']) / df['query_length'] * 100), 1) + return df + +def get_read_count(reads_fasta): + print('get_read_count: reading fasta file') + read_count = 0 +# with open(reads_fasta, 'r') as fh: +# for line in fh: +# if line.startswith('>'): +# read_count += 1 + with pysam.FastxFile(reads_fasta) as fh: + for entry in fh: + read_count += 1 + if read_count % 200000 == 0: + print("\t{:,} reads".format(read_count)) + print("\t{:,} reads".format(read_count)) + return read_count + +def get_contig_names(contig_list): + print('get_contig_names: gathering MAG contig names') + with open(contig_list, 'r') as fh: + contig_names = [line.strip() for line in fh] + return contig_names + +def make_percent(reads, total_reads): + return round(((reads / total_reads) * 100), 1) + +def count_mapped_reads(df, read_count, contig_names): + print('count_mapped_reads: summarizing mappings') + print("\nTotal reads used for mapping:\n\t{:,}".format(read_count)) + print("Total alignments:\n\t{:,}".format(df.shape[0])) + print("Total reads mapped:\n\t{:,}\n".format(len(df['query_name'].unique()))) + + # remove multiple entries for a read, keeping the one with the longest aln length + df = df.sort_values('perc_read_aligned', ascending=False).drop_duplicates('query_name').sort_index() + + # create dict to store percent values for table and figure + mapping_dict = {} + mapping_dict["percent_ID"] = ["90%", "95%", "99%"] + mapping_dict["contigs"] = [make_percent(df[(df['perc_read_aligned'] >= 90) & (df['perc_identity'] >= 90) & + (df['aln_type'] == "tp:A:P")].shape[0], read_count)] + mapping_dict["contigs"].append(make_percent(df[(df['perc_read_aligned'] >= 90) & (df['perc_identity'] >= 95) & + (df['aln_type'] == "tp:A:P")].shape[0], read_count)) + 
mapping_dict["contigs"].append(make_percent(df[(df['perc_read_aligned'] >= 90) & (df['perc_identity'] >= 99) & + (df['aln_type'] == "tp:A:P")].shape[0], read_count)) + mapping_dict["mags"] = [make_percent(df[(df["target_name"].isin(contig_names)) & + (df['perc_read_aligned'] >= 90) & (df['perc_identity'] >= 90) & + (df['aln_type'] == "tp:A:P")].shape[0], read_count)] + mapping_dict["mags"].append(make_percent(df[(df["target_name"].isin(contig_names)) & + (df['perc_read_aligned'] >= 90) & (df['perc_identity'] >= 95) & + (df['aln_type'] == "tp:A:P")].shape[0], read_count)) + mapping_dict["mags"].append(make_percent(df[(df["target_name"].isin(contig_names)) & + (df['perc_read_aligned'] >= 90) & (df['perc_identity'] >= 99) & + (df['aln_type'] == "tp:A:P")].shape[0], read_count)) + + return pd.DataFrame.from_dict(mapping_dict) + +def make_palette(hex_code): + return sns.color_palette(sns.light_palette(hex_code, input='rgb', as_cmap=False, n_colors=6).as_hex()[1:] + + sns.dark_palette(hex_code, input='rgb', as_cmap=False, n_colors=6).as_hex()[-2:0:-1]) + +def plot_mapped(df_plot, out_plot): + print('\nplot_mapped: plotting figure') + if os.path.exists(out_plot): + os.remove(out_plot) + + pb_blue = make_palette("#1383C6") + pb_green = make_palette("#009D4E") + + ax = df_plot.plot.bar(stacked=False, figsize=(6, 7), width=0.7, color=[pb_green[3], pb_blue[4]], + fontsize='x-large', edgecolor='black', linewidth=0.5, ylim=(0, 105)) + handles, labels = ax.get_legend_handles_labels() + ax.legend(handles, ["Contigs", "MAGs"], bbox_to_anchor=(1, 0.99), + fontsize='large', ncol=1, labelspacing=0.3) + ax.set_xticklabels(df_plot['percent_ID'], rotation=0, ha='center', fontsize='x-large') + for x, y in enumerate(df_plot["contigs"]): + ax.annotate("{:,}".format(y), (x - 0.175, y + (0.02 * df_plot["contigs"].max())), + ha='center', color="black", fontweight="regular", fontsize='large') + for x, y in enumerate(df_plot["mags"]): + ax.annotate("{:,}".format(y), (x + 0.175, y + (0.02 * df_plot["mags"].max())), + ha='center', color="black", fontweight="regular", fontsize='large') + ax.set_xlabel("Minimum percent ID threshold", fontsize="x-large") + ax.set_ylabel("Percent reads mapped (%)", fontsize="x-large") + ax.figure.savefig("{}".format(out_plot)) + plt.close() + +def write_table(df_plot, out_table): + if os.path.exists(out_table): + os.remove(out_table) + df_plot.to_csv("{}".format(out_table), index=False, sep='\t') + +def main(): + args = get_args() + df = get_paf_df_lite(args.paf) + if args.count is None: + read_count = get_read_count(args.reads_fasta) + else: + read_count = int(args.count) + contig_names = get_contig_names(args.contig_list) + df_plot = count_mapped_reads(df, read_count, contig_names) + print(df_plot) + plot_mapped(df_plot, args.out_plot) + write_table(df_plot, args.out_table) + +if __name__ == '__main__': + main() + + diff --git a/README.md b/README.md index e1e04fd..efb333a 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ Welcome! Here you can find a variety of tools and pipelines for HiFi metagenomics. In addition to the resources currently available, we will continue to add new tools as they are developed. -The current version is v2.1.0. Please see the [**release notes**](https://github.com/PacificBiosciences/pb-metagenomics-tools/releases) for changes. +The current version is v3.1.0. Please see the [**release notes**](https://github.com/PacificBiosciences/pb-metagenomics-tools/releases) for changes. 
 ## HiFi Metagenomic Datasets & Publications
 
@@ -18,13 +18,16 @@ A running list of publications using HiFi sequencing for metagenomics can be fou
 
 + [**HiFi-MAG-Pipeline**](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/HiFi-MAG-Pipeline): Identify high-quality MAGs from HiFi metagenomic assemblies. Streamlined workflow includes a custom "completeness-aware" strategy to identify and protect long and complete contigs. Binning is performed with MetaBAT2 and SemiBin2, bin merging occurs with DAS_Tool, QC with CheckM2, and taxonomic assignments with GTDB-Tk. Outputs include high-quality MAG sequences, summary figures, and associated metadata.
 
++ [**pb-MAG-mirror**](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/pb-MAG-mirror): Compare two sets of MAGs to quantify the numbers of highly similar, mixed, and unique MAGs, and consolidate the sets into a single non-redundant set. This workflow can be used to compare MAGs (the intended use case) or bins, but it requires the contigs to originate from the same assembly method.
+
++ [**Taxonomic-Profiling-Sourmash**](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/Taxonomic-Profiling-Sourmash): Obtain taxonomic profiles using the `sourmash gather` --> `taxonomy` approach. Sourmash provides GTDB and NCBI databases, or you can build your own database.
+
 + [**Taxonomic-Profiling-Diamond-Megan**](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/Taxonomic-Profiling-Diamond-Megan): Perform translation alignment of HiFi reads to a **protein** database using DIAMOND and summarize with MEGAN-LR, for the purpose of taxonomic and functional profiling. Provides access to taxonomic annotations from NCBI and GTDB, and outputs NCBI taxonomic counts in kraken (kreport) and metaphlan (mpa) formats. Also provides functional annotations based on multiple databases (EC, SEED, InterPro2GO, eggNOG), with new KEGG option available.
 
 + [**Taxonomic-Profiling-Minimap-Megan**](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/Taxonomic-Profiling-Minimap-Megan): Align HiFi reads to a **nucleotide** database using minimap2 and summarize with MEGAN-LR, for the purpose of taxonomic profiling. Provides access to NCBI and GTDB taxonomic annotations. Outputs NCBI taxonomic counts in kraken (kreport) and metaphlan (mpa) formats.
 
-+ [**Taxonomic-Profiling-Sourmash**](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/Taxonomic-Profiling-Sourmash): Obtain taxonomic proflies using `sourmash gather` --> `taxonomy` approach. Sourmash provides GTDB and NCBI databases, or you can build your own database.
 
-Each of these pipelines can be found in their respective folders. They are made available as [Snakemake](https://snakemake.readthedocs.io/en/stable/index.html) workflows. Snakemake is a python-based workflow manager. Snakemake workflows are highly portable because dependencies and environments are automatically setup using [Anaconda](https://docs.anaconda.com/anaconda/)/[Conda](https://docs.conda.io/projects/conda/en/latest/index.html). Snakemake also allows reproducibility, checkpointing, and the ability to scale workflows using HPC and cloud environments. Snakemake v5+ is required, and the workflows have been tested using v5.26+. You can optionally install snakemake via the provided conda environment file via `conda env create -f environment.yml`, and then activate this environment via `conda activate pb-metagenomics-tools` when you want to run any of these workflows.
+Each of these pipelines can be found in their respective folders. They are made available as [Snakemake](https://snakemake.readthedocs.io/en/stable/index.html) workflows. Snakemake is a python-based workflow manager. Snakemake workflows are highly portable because dependencies and environments are automatically set up using [Anaconda](https://docs.anaconda.com/anaconda/)/[Conda](https://docs.conda.io/projects/conda/en/latest/index.html). Snakemake also allows reproducibility, checkpointing, and the ability to scale workflows using HPC and cloud environments. Snakemake v5+ is required. You can optionally install snakemake with the provided conda environment file (`conda env create -f environment.yml`) and then activate the environment via `conda activate pb-metagenomics-tools` when you want to run any of these workflows.
 
 ## Other tools
 
@@ -40,9 +43,11 @@ All documentation for snakemake pipelines can be found in the `docs/` folder abo
 Available pipeline documentation:
 
 - [**Tutorial: HiFi-MAG-Pipeline**](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Tutorial-HiFi-MAG-Pipeline.md)
+- [**Tutorial: pb-MAG-mirror**](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Tutorial-pb-MAG-mirror.md)
+- [**Tutorial: Taxonomic-Profiling-Sourmash**](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Tutorial-Taxonomic-Profiling-Sourmash.md)
 - [**Tutorial: Taxonomic-Profiling-Diamond-Megan**](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Tutorial-Taxonomic-Profiling-Diamond-Megan.md)
 - [**Tutorial: Taxonomic-Profiling-Minimap-Megan**](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Tutorial-Taxonomic-Profiling-Minimap-Megan.md)
-- [**Tutorial: Taxonomic-Profiling-Sourmash**](https://github.com/PacificBiosciences/pb-metagenomics-tools/blob/master/docs/Tutorial-Taxonomic-Profiling-Sourmash.md)
+
 
 The documentation for [pb-metagenomics-scripts](https://github.com/PacificBiosciences/pb-metagenomics-tools/tree/master/pb-metagenomics-scripts) is provided in the folder.
 
diff --git a/docs/Tutorial-HiFi-MAG-Pipeline.md b/docs/Tutorial-HiFi-MAG-Pipeline.md
index 40e4fc5..9de3b11 100644
--- a/docs/Tutorial-HiFi-MAG-Pipeline.md
+++ b/docs/Tutorial-HiFi-MAG-Pipeline.md
@@ -60,10 +60,11 @@ For explanations of these figures, please see the [5. Outputs](#OTPS) section be
 
 # **Quick Start**
 
-This workflow requires [Anaconda](https://docs.anaconda.com/anaconda/)/[Conda](https://docs.conda.io/projects/conda/en/latest/index.html) and [Snakemake](https://snakemake.readthedocs.io/en/stable/) to be installed, and will require 45-150GB memory and >250GB temporary disk space (see [Requirements section](#RFR)). All dependencies in the workflow are installed using conda and the environments are activated by snakemake for relevant steps. Snakemake v5+ is required, and the workflows have been tested using v5.19.3.
+This workflow requires [Anaconda](https://docs.anaconda.com/anaconda/)/[Conda](https://docs.conda.io/projects/conda/en/latest/index.html) and [Snakemake](https://snakemake.readthedocs.io/en/stable/) to be installed, and will require 45-150GB memory and >250GB temporary disk space (see [Requirements section](#RFR)). All dependencies in the workflow are installed using conda and the environments are activated by snakemake for relevant steps.
 
 - Clone the HiFi-MAG-Pipeline directory.
-- Download and unpack the database for GTDB (~66GB). The current requirement is for GTDB-Tk v 2.1.1, which requires database R207_v2. Specify the path to the database in `config.yaml`. **Starting with version 2.0, there is no need to download the CheckM database.**
+- Download the CheckM2 database using the software (`checkm2 database --download --path /YourPath/CheckM2_database`), or download it from [this site](https://zenodo.org/records/5571251/files/checkm2_database.tar.gz?download=1) and unpack it. The database is ~3 GB. Specify the path to the database in `config.yaml`.
+- Download and unpack the database for GTDB (~66GB). The current requirement is for GTDB-Tk v2.1.1, which requires database R207_v2. Specify the path to the database in `config.yaml`.
 - Include all input HiFi fasta files (`SAMPLE.fasta`) and contig fasta files (`SAMPLE.contigs.fasta`) in the `inputs/` folder. These can be files or symlinks.
 - Edit sample names in `Sample-Config.yaml` configuration file in `configs/` for your project.
 - Check settings in `config.yaml`, and ensure the `tmpdir` argument is set correctly in `config.yaml`. The default is `/scratch`.
@@ -100,6 +101,7 @@ HiFi-MAG-Pipeline
 │   └── README.md (this is just a placeholder file, and not required)
 │
 ├── scripts/
+│   ├── bam2paf.py
 │   ├── Convert-JGI-Coverages.py
 │   ├── Copy-Final-MAGs.py
 │   ├── Fasta-Make-Long-Seq-Bins.py
@@ -108,7 +110,7 @@ HiFi-MAG-Pipeline
 │   ├── GTDBTk-Organize.py
 │   ├── MAG-Summary.py
 │   ├── Make-Incomplete-Contigs.py
-│   ├── Maxbin2-organize-outputs.py
+│   ├── paf-mapping-summary.py
 │   └── Plot-Figures.py
 │
 ├── Snakefile-hifimags.smk
@@ -352,6 +354,8 @@ Within `8-summary/`, there will be a folder for each sample. Within a sample fol
 + `SAMPLE.Completeness-Contamination-Contigs.pdf`: A plot showing the relationship between completeness and contamination for each high-quality MAG recovered, colored by the number of contigs per MAG.
 + `SAMPLE.GenomeSizes-Depths.pdf`: A plot showing the relationship between genome size and depth of coverage for each high-quality MAG recovered, colored by % GC content per MAG.
 + `SAMPLE.HiFi_MAG.summary.txt`: A main summary file that brings together information from CheckM2 and GTDB-Tk for all MAGs that pass the filtering step.
++ `SAMPLE.ReadsMapped.pdf`: A figure showing the percent of reads that mapped to contigs and MAGs at the 90, 95, and 99% identity levels.
++ `SAMPLE.ReadsMapped.txt`: A table showing the percent of reads that mapped to contigs and MAGs at the 90, 95, and 99% identity levels.
 
 [Back to top](#TOP)
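
Note on the configuration change above: with the Checkm2Database rule removed, nothing in the workflow verifies the new `checkm2: db:` path before CheckM2 runs. The sketch below is a hypothetical pre-flight helper, not part of this change; it assumes PyYAML is available in the environment used to launch snakemake, and simply fails fast if the paths in config.yaml are missing.

import os
import sys

import yaml  # PyYAML; assumed available alongside snakemake

def main(config_path="config.yaml"):
    """Fail fast if paths required by the updated config.yaml are missing."""
    with open(config_path) as fh:
        config = yaml.safe_load(fh)

    problems = []
    # The pipeline no longer downloads the CheckM2 database itself, so the
    # diamond file must already exist at the configured path.
    db = config["checkm2"]["db"]
    if not os.path.isfile(db):
        problems.append("CheckM2 database not found: {}".format(db))
    tmpdir = config["tmpdir"]
    if not os.path.isdir(tmpdir):
        problems.append("tmpdir does not exist: {}".format(tmpdir))

    if problems:
        sys.exit("\n".join(problems))
    print("config.yaml looks ready: CheckM2 database and tmpdir found.")

if __name__ == "__main__":
    main()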
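For readers checking the numbers in SAMPLE.ReadsMapped.txt: a read is counted as mapped at a given identity level when its primary alignment (minimap2's tp:A:P tag) covers at least 90% of the read and matched_bases/total_bases meets the level. A worked toy example of that arithmetic, mirroring scripts/paf-mapping-summary.py (the rows are made-up illustrations, not pipeline output):

# Toy PAF-style rows: (query_name, query_length, query_start, query_end,
# matched_bases, total_bases, aln_type). Values are illustrative only.
rows = [
    ("read1", 10000, 100, 9900, 9700, 9800, "tp:A:P"),  # 98% aligned, 99.0% identity
    ("read2", 8000, 0, 8000, 7400, 8000, "tp:A:P"),     # 100% aligned, 92.5% identity
    ("read3", 9000, 4500, 9000, 4490, 4500, "tp:A:P"),  # only 50% aligned -> never counted
    ("read4", 7000, 0, 6900, 6500, 6900, "tp:A:S"),     # secondary alignment -> never counted
]

for level in (90, 95, 99):
    counted = 0
    for name, qlen, qstart, qend, matched, total, tag in rows:
        perc_identity = round(matched / total * 100, 1)
        perc_read_aligned = round((qend - qstart) / qlen * 100, 1)
        if tag == "tp:A:P" and perc_read_aligned >= 90 and perc_identity >= level:
            counted += 1
    # e.g. at the 99% level only read1 passes (99.0% identity, 98% aligned)
    print("{}% identity level: {} of {} reads counted".format(level, counted, len(rows)))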