Skip to content

Commit

Permalink
Merge pull request #14 from PacificBiosciences/features/checkbins
Browse files Browse the repository at this point in the history
add check for valid bins before DAS_Tool
  • Loading branch information
dportik authored Nov 22, 2021
2 parents 2c01cad + 8b56c55 commit af3a192
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 18 deletions.
31 changes: 24 additions & 7 deletions HiFi-MAG-Pipeline/Snakefile-hifimags
Original file line number Diff line number Diff line change
Expand Up @@ -160,34 +160,51 @@ rule MakeFullDASinput:
shell:
"Fasta_to_Scaffolds2Bin.sh -i {input} -e fa 1> {output} 2> {log}"

rule CheckForBins:
input:
lincirc = "4-DAStool/{sample}.linear-circ.tsv",
full = "4-DAStool/{sample}.full.tsv"
output:
"4-DAStool/{sample}.bincheck.txt"
conda:
"envs/python.yml"
log:
"logs/{sample}.CheckForBins.log"
benchmark:
"benchmarks/{sample}.CheckForBins.tsv"
shell:
"python scripts/CheckBins.py -f {input.full} -l {input.lincirc} -o {output} &> {log}"

rule RunDAStool:
input:
full = "4-DAStool/{sample}.full.tsv",
lincirc = "4-DAStool/{sample}.linear-circ.tsv",
contigs = "inputs/{sample}.contigs.fasta"
contigs = "inputs/{sample}.contigs.fasta",
bincheck = "4-DAStool/{sample}.bincheck.txt"
output:
binsdir = directory("4-DAStool/{sample}_DASTool_bins/"),
binsdir = directory("4-DAStool/{sample}/{sample}_DASTool_bins/"),
complete = "4-DAStool/{sample}.complete.txt"
conda:
"envs/dastool.yml"
threads:
config['dastool']['threads']
params:
outlabel = "4-DAStool/{sample}",
search = config['dastool']['search']
outlabel = "4-DAStool/{sample}/{sample}",
search = config['dastool']['search'],
thresh = config['dastool']['score_threshold']
log:
"logs/{sample}.RunDAStool.log"
benchmark:
"benchmarks/{sample}.RunDAStool.tsv"
shell:
"DAS_Tool -i {input.lincirc},{input.full} -c {input.contigs} -l lincirc,full "
"-o {params.outlabel} --search_engine {params.search} --write_bins 1 -t {threads} "
"--score_threshold 0.05 --debug &> {log} && touch {output.complete}"
"--score_threshold {params.thresh} --debug &> {log} && touch {output.complete}"

rule RunCheckM:
input:
complete = "4-DAStool/{sample}.complete.txt",
indir = "4-DAStool/{sample}_DASTool_bins/"
indir = "4-DAStool/{sample}/{sample}_DASTool_bins/"
output:
"5-checkm/{sample}/lineage.ms"
conda:
Expand Down Expand Up @@ -228,7 +245,7 @@ rule SummarizeCheckM:
rule PrepBatchFile:
input:
sumfile = "6-checkm-summary/CheckM.{sample}.qa.txt",
indir = "4-DAStool/{sample}_DASTool_bins/"
indir = "4-DAStool/{sample}/{sample}_DASTool_bins/"
output:
batch = "6-checkm-summary/{sample}.batchfile.txt",
simple = "6-checkm-summary/CheckM.{sample}.simple.txt"
Expand Down
6 changes: 6 additions & 0 deletions HiFi-MAG-Pipeline/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ dastool:
# The engine for single copy gene searching, choices include:
# blast, diamond, usearch.
search: "diamond"
# Score threshold until selection algorithm will keep selecting bins [0 to 1].
# This is roughly equivalent to CheckM completeness score (0.2 = 20% completeness).
# If the desired minimum completeness is changed in the gtdbtk section below,
# ensure this value is lower or DAS_Tool will remove the bins. The default is to set
# DAS_Tool score to 0.2 (20%), and later filter out all bins with <70% completeness.
score_threshold: 0.2

checkm:
# The number of threads to use for CheckM.
Expand Down
68 changes: 68 additions & 0 deletions HiFi-MAG-Pipeline/scripts/CheckBins.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import argparse
import os

def get_args():
"""
Get arguments from command line with argparse.
"""
parser = argparse.ArgumentParser(
prog='CheckBins.py',
description="""Add circular contigs to bins.""")

parser.add_argument("-f", "--full",
required=True,
help="Full path to the input tsv file for DAS_Tool.")
parser.add_argument("-l", "--lincirc",
required=True,
help="Full path to the input tsv file for DAS_Tool.")
parser.add_argument("-o", "--outfile",
required=True,
help="Name of output completion file.")
return parser.parse_args()

def evaluate_bins(full, lincirc, outfile):

if os.stat(full).st_size == 0 and os.stat(lincirc).st_size == 0:
raise ValueError("\n\nFiles {} and {} are both empty.\nThis means no bins were produced "
"by MetaBat2.\nCheck the assembly or assembly graph to ensure high-quality "
"contigs are present.\n".format(full, lincirc))

elif os.stat(full).st_size == 0 and os.stat(lincirc).st_size > 0:
bins = set()
with open(lincirc, 'r') as fh:
for line in fh:
bins.add(line.split('\t')[-1])
lins = [x for x in list(bins) if x.split('.')[1] == "lin"]
circs = [x for x in list(bins) if x.split('.')[1] == "circ"]

if len(circs) > 0 and len(lins) == 0:
print("\nWARNING: Only circular contigs were binned (n={} bins); these\n"
"\tcould represent plasmid or viral sequences.\n"
"\tDAS_Tool may fail!\n".format(len(circs)))
elif len(circs) == 0 and len(lins) > 0:
print("\nWARNING: Bins were only found for a subset of linear contigs (n={} bins).\n"
"\tThis behavior is unexpected.\n".format(len(lins)))
elif len(circs) > 0 and len(lins) > 0:
print("\nWARNING: No bins were recovered for the full contig set,\n"
"\tbut circular contig bins (n={} bins) and\n"
"\tlinear contig bins (n={} bins) were found.\n"
"\tThis behavior is unexpected.\n".format(len(circs), len(lins)))

elif os.stat(full).st_size > 0 and os.stat(lincirc).st_size == 0:
print("\nWARNING: Bins were only found for the full set of contigs.\n"
"\tNo circular contig bins or linear subset contig bins were found.\n"
"\tThis behavior is unexpected.\n")

elif os.stat(full).st_size > 0 and os.stat(lincirc).st_size > 0:
print("\nBins were found for both binning strategies.\n")

with open(outfile, 'a') as fh:
fh.write("Bin evaluation completed.")

def main():
args = get_args()
evaluate_bins(args.full, args.lincirc, args.outfile)

if __name__ == '__main__':
main()

12 changes: 7 additions & 5 deletions HiFi-MAG-Pipeline/scripts/Copy-Circ-Contigs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import argparse
import os
import pandas as pd
from Bio import SeqIO

def get_args():
Expand All @@ -19,10 +18,10 @@ def get_args():
help="Name of sample.")
parser.add_argument("-o1", "--outdir",
required=True,
help="Name of output fasta file (linear).")
help="Name of output directory.")
parser.add_argument("-o2", "--outfile",
required=True,
help="Name of output fasta file (linear).")
help="Name of output completion file.")
return parser.parse_args()

def write_bins(fasta, sample, outdir):
Expand All @@ -33,14 +32,17 @@ def write_bins(fasta, sample, outdir):
"""
bin_count = int(1)
for rec in SeqIO.parse(fasta, "fasta"):
outname = os.path.join(outdir, "{}_bin.circ{}.fa".format(sample, bin_count))
outname = os.path.join(outdir, "{}_bin.circ.{}.fa".format(sample, bin_count))
with open(outname, 'a') as fh:
fh.write(rec.format("fasta"))
bin_count += 1

def main():
args = get_args()
write_bins(args.fasta, args.sample, args.outdir)
if os.stat(args.fasta).st_size == 0:
pass
else:
write_bins(args.fasta, args.sample, args.outdir)
with open(args.outfile, 'a') as fh:
fh.write("Completed.")

Expand Down
6 changes: 0 additions & 6 deletions HiFi-MAG-Pipeline/scripts/Filter-CircLin-Contigs.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,6 @@ def parse_hifiasm(fasta, fastalinear, fastacircular, loglinear, logcircular):

#catch no-circulars by adding one dummy seq
if os.stat(fastacircular).st_size == 0:
with open(fastacircular, 'a') as fh:
fh.write(">dummyseq\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n")
with open(logcircular, 'a') as fh:
fh.write("{}\t{}\n".format("NA", "0"))

Expand All @@ -89,8 +87,6 @@ def parse_hicanu(fasta, fastalinear, fastacircular, loglinear, logcircular):

#catch no-circulars by adding one dummy seq
if os.stat(fastacircular).st_size == 0:
with open(fastacircular, 'a') as fh:
fh.write(">dummyseq\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n")
with open(logcircular, 'a') as fh:
fh.write("{}\t{}\n".format("NA", "0"))

Expand Down Expand Up @@ -122,8 +118,6 @@ def parse_metaflye(fasta, metaflye_info, fastalinear, fastacircular, loglinear,

#catch no-circulars by adding one dummy seq
if os.stat(fastacircular).st_size == 0:
with open(fastacircular, 'a') as fh:
fh.write(">dummyseq\nATCGATCGATCGATCGATCGATCGATCGATCGATCGATCG\n")
with open(logcircular, 'a') as fh:
fh.write("{}\t{}\n".format("NA", "0"))

Expand Down

0 comments on commit af3a192

Please sign in to comment.