diff --git a/HiFi-MAG-Pipeline/Snakefile-hifimags b/HiFi-MAG-Pipeline/Snakefile-hifimags index 4a1402a..00bb090 100644 --- a/HiFi-MAG-Pipeline/Snakefile-hifimags +++ b/HiFi-MAG-Pipeline/Snakefile-hifimags @@ -160,21 +160,38 @@ rule MakeFullDASinput: shell: "Fasta_to_Scaffolds2Bin.sh -i {input} -e fa 1> {output} 2> {log}" +rule CheckForBins: + input: + lincirc = "4-DAStool/{sample}.linear-circ.tsv", + full = "4-DAStool/{sample}.full.tsv" + output: + "4-DAStool/{sample}.bincheck.txt" + conda: + "envs/python.yml" + log: + "logs/{sample}.CheckForBins.log" + benchmark: + "benchmarks/{sample}.CheckForBins.tsv" + shell: + "python scripts/CheckBins.py -f {input.full} -l {input.lincirc} -o {output} &> {log}" + rule RunDAStool: input: full = "4-DAStool/{sample}.full.tsv", lincirc = "4-DAStool/{sample}.linear-circ.tsv", - contigs = "inputs/{sample}.contigs.fasta" + contigs = "inputs/{sample}.contigs.fasta", + bincheck = "4-DAStool/{sample}.bincheck.txt" output: - binsdir = directory("4-DAStool/{sample}_DASTool_bins/"), + binsdir = directory("4-DAStool/{sample}/{sample}_DASTool_bins/"), complete = "4-DAStool/{sample}.complete.txt" conda: "envs/dastool.yml" threads: config['dastool']['threads'] params: - outlabel = "4-DAStool/{sample}", - search = config['dastool']['search'] + outlabel = "4-DAStool/{sample}/{sample}", + search = config['dastool']['search'], + thresh = config['dastool']['score_threshold'] log: "logs/{sample}.RunDAStool.log" benchmark: @@ -182,12 +199,12 @@ rule RunDAStool: shell: "DAS_Tool -i {input.lincirc},{input.full} -c {input.contigs} -l lincirc,full " "-o {params.outlabel} --search_engine {params.search} --write_bins 1 -t {threads} " - "--score_threshold 0.05 --debug &> {log} && touch {output.complete}" + "--score_threshold {params.thresh} --debug &> {log} && touch {output.complete}" rule RunCheckM: input: complete = "4-DAStool/{sample}.complete.txt", - indir = "4-DAStool/{sample}_DASTool_bins/" + indir = "4-DAStool/{sample}/{sample}_DASTool_bins/" output: "5-checkm/{sample}/lineage.ms" conda: @@ -228,7 +245,7 @@ rule SummarizeCheckM: rule PrepBatchFile: input: sumfile = "6-checkm-summary/CheckM.{sample}.qa.txt", - indir = "4-DAStool/{sample}_DASTool_bins/" + indir = "4-DAStool/{sample}/{sample}_DASTool_bins/" output: batch = "6-checkm-summary/{sample}.batchfile.txt", simple = "6-checkm-summary/CheckM.{sample}.simple.txt" diff --git a/HiFi-MAG-Pipeline/config.yaml b/HiFi-MAG-Pipeline/config.yaml index 7d47106..981f163 100644 --- a/HiFi-MAG-Pipeline/config.yaml +++ b/HiFi-MAG-Pipeline/config.yaml @@ -34,6 +34,12 @@ dastool: # The engine for single copy gene searching, choices include: # blast, diamond, usearch. search: "diamond" + # Score threshold until selection algorithm will keep selecting bins [0 to 1]. + # This is roughly equivalent to CheckM completeness score (0.2 = 20% completeness). + # If the desired minimum completeness is changed in the gtdbtk section below, + # ensure this value is lower or DAS_Tool will remove the bins. The default is to set + # DAS_Tool score to 0.2 (20%), and later filter out all bins with <70% completeness. + score_threshold: 0.2 checkm: # The number of threads to use for CheckM. diff --git a/HiFi-MAG-Pipeline/scripts/CheckBins.py b/HiFi-MAG-Pipeline/scripts/CheckBins.py new file mode 100644 index 0000000..8eebf6e --- /dev/null +++ b/HiFi-MAG-Pipeline/scripts/CheckBins.py @@ -0,0 +1,68 @@ +import argparse +import os + +def get_args(): + """ + Get arguments from command line with argparse. + """ + parser = argparse.ArgumentParser( + prog='CheckBins.py', + description="""Add circular contigs to bins.""") + + parser.add_argument("-f", "--full", + required=True, + help="Full path to the input tsv file for DAS_Tool.") + parser.add_argument("-l", "--lincirc", + required=True, + help="Full path to the input tsv file for DAS_Tool.") + parser.add_argument("-o", "--outfile", + required=True, + help="Name of output completion file.") + return parser.parse_args() + +def evaluate_bins(full, lincirc, outfile): + + if os.stat(full).st_size == 0 and os.stat(lincirc).st_size == 0: + raise ValueError("\n\nFiles {} and {} are both empty.\nThis means no bins were produced " + "by MetaBat2.\nCheck the assembly or assembly graph to ensure high-quality " + "contigs are present.\n".format(full, lincirc)) + + elif os.stat(full).st_size == 0 and os.stat(lincirc).st_size > 0: + bins = set() + with open(lincirc, 'r') as fh: + for line in fh: + bins.add(line.split('\t')[-1]) + lins = [x for x in list(bins) if x.split('.')[1] == "lin"] + circs = [x for x in list(bins) if x.split('.')[1] == "circ"] + + if len(circs) > 0 and len(lins) == 0: + print("\nWARNING: Only circular contigs were binned (n={} bins); these\n" + "\tcould represent plasmid or viral sequences.\n" + "\tDAS_Tool may fail!\n".format(len(circs))) + elif len(circs) == 0 and len(lins) > 0: + print("\nWARNING: Bins were only found for a subset of linear contigs (n={} bins).\n" + "\tThis behavior is unexpected.\n".format(len(lins))) + elif len(circs) > 0 and len(lins) > 0: + print("\nWARNING: No bins were recovered for the full contig set,\n" + "\tbut circular contig bins (n={} bins) and\n" + "\tlinear contig bins (n={} bins) were found.\n" + "\tThis behavior is unexpected.\n".format(len(circs), len(lins))) + + elif os.stat(full).st_size > 0 and os.stat(lincirc).st_size == 0: + print("\nWARNING: Bins were only found for the full set of contigs.\n" + "\tNo circular contig bins or linear subset contig bins were found.\n" + "\tThis behavior is unexpected.\n") + + elif os.stat(full).st_size > 0 and os.stat(lincirc).st_size > 0: + print("\nBins were found for both binning strategies.\n") + + with open(outfile, 'a') as fh: + fh.write("Bin evaluation completed.") + +def main(): + args = get_args() + evaluate_bins(args.full, args.lincirc, args.outfile) + +if __name__ == '__main__': + main() + diff --git a/HiFi-MAG-Pipeline/scripts/Copy-Circ-Contigs.py b/HiFi-MAG-Pipeline/scripts/Copy-Circ-Contigs.py index 249b885..aaaaab8 100644 --- a/HiFi-MAG-Pipeline/scripts/Copy-Circ-Contigs.py +++ b/HiFi-MAG-Pipeline/scripts/Copy-Circ-Contigs.py @@ -1,6 +1,5 @@ import argparse import os -import pandas as pd from Bio import SeqIO def get_args(): @@ -19,10 +18,10 @@ def get_args(): help="Name of sample.") parser.add_argument("-o1", "--outdir", required=True, - help="Name of output fasta file (linear).") + help="Name of output directory.") parser.add_argument("-o2", "--outfile", required=True, - help="Name of output fasta file (linear).") + help="Name of output completion file.") return parser.parse_args() def write_bins(fasta, sample, outdir): @@ -33,14 +32,17 @@ def write_bins(fasta, sample, outdir): """ bin_count = int(1) for rec in SeqIO.parse(fasta, "fasta"): - outname = os.path.join(outdir, "{}_bin.circ{}.fa".format(sample, bin_count)) + outname = os.path.join(outdir, "{}_bin.circ.{}.fa".format(sample, bin_count)) with open(outname, 'a') as fh: fh.write(rec.format("fasta")) bin_count += 1 def main(): args = get_args() - write_bins(args.fasta, args.sample, args.outdir) + if os.stat(args.fasta).st_size == 0: + pass + else: + write_bins(args.fasta, args.sample, args.outdir) with open(args.outfile, 'a') as fh: fh.write("Completed.") diff --git a/HiFi-MAG-Pipeline/scripts/Filter-CircLin-Contigs.py b/HiFi-MAG-Pipeline/scripts/Filter-CircLin-Contigs.py index 6ab11df..35e6933 100644 --- a/HiFi-MAG-Pipeline/scripts/Filter-CircLin-Contigs.py +++ b/HiFi-MAG-Pipeline/scripts/Filter-CircLin-Contigs.py @@ -61,8 +61,6 @@ def parse_hifiasm(fasta, fastalinear, fastacircular, loglinear, logcircular): #catch no-circulars by adding one dummy seq if os.stat(fastacircular).st_size == 0: - with open(fastacircular, 'a') as fh: - fh.write(">dummyseq\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n") with open(logcircular, 'a') as fh: fh.write("{}\t{}\n".format("NA", "0")) @@ -89,8 +87,6 @@ def parse_hicanu(fasta, fastalinear, fastacircular, loglinear, logcircular): #catch no-circulars by adding one dummy seq if os.stat(fastacircular).st_size == 0: - with open(fastacircular, 'a') as fh: - fh.write(">dummyseq\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n") with open(logcircular, 'a') as fh: fh.write("{}\t{}\n".format("NA", "0")) @@ -122,8 +118,6 @@ def parse_metaflye(fasta, metaflye_info, fastalinear, fastacircular, loglinear, #catch no-circulars by adding one dummy seq if os.stat(fastacircular).st_size == 0: - with open(fastacircular, 'a') as fh: - fh.write(">dummyseq\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n") with open(logcircular, 'a') as fh: fh.write("{}\t{}\n".format("NA", "0"))