diff --git a/HiFi-MAG-Pipeline/Snakefile-hifimags.smk b/HiFi-MAG-Pipeline/Snakefile-hifimags.smk
index d5e0e00..f4bf7c5 100644
--- a/HiFi-MAG-Pipeline/Snakefile-hifimags.smk
+++ b/HiFi-MAG-Pipeline/Snakefile-hifimags.smk
@@ -2,7 +2,7 @@ import os
 
 localrules:
     LongContigsToBins, CloseLongbinFork, StopLongBinCheckm2, FilterCompleteContigs,
-    ConvertJGIBamDepth, DASinputMetabat2, DASinputSemiBin2, CopyDAStoolBins, AssessCheckm2Bins,
+    ConvertJGIBamDepth, FilterSuperBins, DASinputMetabat2, DASinputSemiBin2, CopyDAStoolBins, AssessCheckm2Bins,
     CloseCheckm2Fork, SkipGTDBAnalysis, GTDBTkCleanup, MAGSummary, MAGContigNames, MAGMappingPlots,
     MAGPlots, all
 configfile: "config.yaml"
@@ -285,27 +285,46 @@ rule SemiBin2Analysis:
         bam = os.path.join(CWD, "2-bam", "{sample}.bam")
     output:
         bins = os.path.join(CWD, "3-semibin2", "{sample}", "bins_info.tsv"),
-        outdir = directory(os.path.join(CWD, "3-semibin2", "{sample}", ""))
     conda:
         "envs/semibin.yml"
     threads:
         config['semibin']['threads']
     params:
         tmp = config["tmpdir"],
-        modelflag = config['semibin']['model']
+        modelflag = config['semibin']['model'],
+        outdir = os.path.join(CWD, "3-semibin2", "{sample}", "")
     log:
         os.path.join(CWD, "logs", "{sample}.SemiBin2Analysis.log")
     benchmark:
         os.path.join(CWD, "benchmarks", "{sample}.SemiBin2Analysis.tsv")
     shell:
-        "SemiBin single_easy_bin -i {input.contigs} -b {input.bam} -o {output.outdir} --self-supervised "
+        "SemiBin single_easy_bin -i {input.contigs} -b {input.bam} -o {params.outdir} --self-supervised "
         "--sequencing-type=long_reads --compression=none -t {threads} --tag-output semibin2 {params.modelflag} "
         "--verbose --tmpdir={params.tmp} &> {log}"
-#--environment=global
+
+rule FilterSuperBins:
+    input:
+        bins = os.path.join(CWD, "3-semibin2", "{sample}", "bins_info.tsv"),
+    output:
+        outfile = os.path.join(CWD, "3-semibin2", "{sample}", "{sample}.superbins.txt")
+    conda:
+        "envs/python.yml"
+    params:
+        indir = os.path.join(CWD, "3-semibin2", "{sample}", "output_bins", ""),
+        outdir = os.path.join(CWD, "3-semibin2", "{sample}", "superbins", "")
+    threads:
+        1
+    log:
+        os.path.join(CWD, "logs", "{sample}.FilterSuperBins.log")
+    benchmark:
+        os.path.join(CWD, "benchmarks", "{sample}.FilterSuperBins.tsv")
+    shell:
+        "python scripts/Filter-Semibin.py -i {params.indir} -o {params.outdir} -f {output.outfile} "
+        " &> {log}"
 
 rule DASinputSemiBin2:
     input:
-        os.path.join(CWD, "3-semibin2", "{sample}", "")
+        os.path.join(CWD, "3-semibin2", "{sample}", "{sample}.superbins.txt")
     output:
         os.path.join(CWD, "4-DAStool", "{sample}.semibin2.tsv")
     conda:
diff --git a/HiFi-MAG-Pipeline/scripts/Filter-Semibin.py b/HiFi-MAG-Pipeline/scripts/Filter-Semibin.py
new file mode 100644
index 0000000..1a06f22
--- /dev/null
+++ b/HiFi-MAG-Pipeline/scripts/Filter-Semibin.py
@@ -0,0 +1,78 @@
+"""Move SemiBin2 super bins (large FASTA files) into a separate directory."""
+
+import argparse
+import os
+import shutil
+
+# FASTA files at least this many bytes in size are treated as super bins.
+SUPERBIN_MIN_BYTES = 100000000
+
+
+def get_args():
+    """
+    Get arguments from command line with argparse.
+    """
+    parser = argparse.ArgumentParser(
+        prog='Filter-SemiBin.py',
+        description="""Identify any superbins (>100Mb) from semibin2 and move to a separate directory.""")
+    parser.add_argument("-i", "--in_dir",
+                        required=True,
+                        help="The full path to the semibin2 bins directory.")
+    parser.add_argument("-o", "--out_dir",
+                        required=True,
+                        help="The full path to the output directory.")
+    parser.add_argument("-f", "--outfile",
+                        required=True,
+                        help="Name of output file to write.")
+
+    return parser.parse_args()
+
+
+def make_dir(out_dir):
+    """Create out_dir, including missing parents; an existing directory is left as is."""
+    os.makedirs(out_dir, exist_ok=True)
+
+
+def filter_bin_files(in_dir, out_dir):
+    """
+    Move FASTA files of at least SUPERBIN_MIN_BYTES bytes from in_dir to out_dir.
+
+    Files are addressed by their full path rather than by changing the working
+    directory, so relative out_dir/outfile arguments still resolve against the
+    directory the script was started from.
+
+    Returns the number of files moved.
+    """
+    mv_count = 0
+    print("filter_bin_files: getting files")
+    fasta_files = [f for f in os.listdir(in_dir) if f.endswith((".fa", ".fasta"))]
+    print("filter_bin_files: removing large (>100Mb) files")
+    for f in fasta_files:
+        src = os.path.join(in_dir, f)
+        size = os.path.getsize(src)
+        if size >= SUPERBIN_MIN_BYTES:
+            print("\tMoved file {}: {:,} bytes".format(f, size))
+            mv_count += 1
+            shutil.move(src, os.path.join(out_dir, f))
+    print("filter_bin_files: Identified {} super bins".format(mv_count))
+
+    return mv_count
+
+
+def write_outfile(outfile, mv_count, out_dir):
+    """Write a two-line summary (count of super bins and their location) to outfile."""
+    print("write_outfile: Writing output file.")
+    with open(outfile, 'w') as fh:
+        fh.write("Identified {} super bins\nFile(s) located in: {}".format(mv_count, out_dir))
+
+
+def main():
+    """Parse arguments, move super bins, and write the summary file."""
+    args = get_args()
+    make_dir(args.out_dir)
+    mv_count = filter_bin_files(args.in_dir, args.out_dir)
+    write_outfile(args.outfile, mv_count, args.out_dir)
+
+
+if __name__ == '__main__':
+    main()