Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
dportik committed Jul 16, 2024
2 parents 2d5d8b2 + e7430e7 commit 15cf132
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 6 deletions.
31 changes: 25 additions & 6 deletions HiFi-MAG-Pipeline/Snakefile-hifimags.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import os

localrules:
LongContigsToBins, CloseLongbinFork, StopLongBinCheckm2, FilterCompleteContigs,
ConvertJGIBamDepth, DASinputMetabat2, DASinputSemiBin2, CopyDAStoolBins, AssessCheckm2Bins,
ConvertJGIBamDepth, FilterSuperBins, DASinputMetabat2, DASinputSemiBin2, CopyDAStoolBins, AssessCheckm2Bins,
CloseCheckm2Fork, SkipGTDBAnalysis, GTDBTkCleanup, MAGSummary, MAGContigNames, MAGMappingPlots, MAGPlots, all

configfile: "config.yaml"
Expand Down Expand Up @@ -285,27 +285,46 @@ rule SemiBin2Analysis:
bam = os.path.join(CWD, "2-bam", "{sample}.bam")
output:
bins = os.path.join(CWD, "3-semibin2", "{sample}", "bins_info.tsv"),
outdir = directory(os.path.join(CWD, "3-semibin2", "{sample}", ""))
conda:
"envs/semibin.yml"
threads:
config['semibin']['threads']
params:
tmp = config["tmpdir"],
modelflag = config['semibin']['model']
modelflag = config['semibin']['model'],
outdir = os.path.join(CWD, "3-semibin2", "{sample}", "")
log:
os.path.join(CWD, "logs", "{sample}.SemiBin2Analysis.log")
benchmark:
os.path.join(CWD, "benchmarks", "{sample}.SemiBin2Analysis.tsv")
shell:
"SemiBin single_easy_bin -i {input.contigs} -b {input.bam} -o {output.outdir} --self-supervised "
"SemiBin single_easy_bin -i {input.contigs} -b {input.bam} -o {params.outdir} --self-supervised "
"--sequencing-type=long_reads --compression=none -t {threads} --tag-output semibin2 {params.modelflag} "
"--verbose --tmpdir={params.tmp} &> {log}"
#--environment=global

# Runs scripts/Filter-Semibin.py on the SemiBin2 "output_bins" directory of a
# sample. The script receives the "superbins" directory as its output dir and
# writes the {sample}.superbins.txt report declared as this rule's output.
rule FilterSuperBins:
    input:
        # Not referenced in the shell command; presumably declared so this rule
        # is scheduled after the rule that produces bins_info.tsv.
        bins = os.path.join(CWD, "3-semibin2", "{sample}", "bins_info.tsv")
    output:
        outfile = os.path.join(CWD, "3-semibin2", "{sample}", "{sample}.superbins.txt")
    log:
        os.path.join(CWD, "logs", "{sample}.FilterSuperBins.log")
    benchmark:
        os.path.join(CWD, "benchmarks", "{sample}.FilterSuperBins.tsv")
    conda:
        "envs/python.yml"
    threads: 1
    params:
        # Trailing "" makes os.path.join end both paths with a separator.
        indir = os.path.join(CWD, "3-semibin2", "{sample}", "output_bins", ""),
        outdir = os.path.join(CWD, "3-semibin2", "{sample}", "superbins", "")
    shell:
        "python scripts/Filter-Semibin.py "
        "-i {params.indir} -o {params.outdir} "
        "-f {output.outfile}  &> {log}"

rule DASinputSemiBin2:
input:
os.path.join(CWD, "3-semibin2", "{sample}", "")
os.path.join(CWD, "3-semibin2", "{sample}", "{sample}.superbins.txt")
output:
os.path.join(CWD, "4-DAStool", "{sample}.semibin2.tsv")
conda:
Expand Down
54 changes: 54 additions & 0 deletions HiFi-MAG-Pipeline/scripts/Filter-Semibin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import argparse
import os
import shutil

def get_args(argv=None):
    """
    Get arguments from command line with argparse.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse reads ``sys.argv[1:]``, as before.

    Returns:
        argparse.Namespace with the attributes ``in_dir``, ``out_dir`` and
        ``outfile`` (all required on the command line).
    """
    parser = argparse.ArgumentParser(
        # Spelled like the script's file name (Filter-Semibin.py) so the usage
        # text shows a command that can actually be run.
        prog='Filter-Semibin.py',
        description="""Identify any superbins (>100Mb) from semibin2 and move to a separate directory.""")
    parser.add_argument("-i", "--in_dir",
                        required=True,
                        help="The full path to the semibin2 bins directory.")
    parser.add_argument("-o", "--out_dir",
                        required=True,
                        help="The full path to the output directory.")
    parser.add_argument("-f", "--outfile",
                        required=True,
                        help="Name of output file to write.")

    return parser.parse_args(argv)

def make_dir(out_dir):
    """
    Create the directory out_dir, including any missing parent directories.
    A directory that already exists is left as it is.
    """
    os.makedirs(out_dir, exist_ok=True)

def filter_bin_files(in_dir, out_dir, min_size=100000000):
    """
    Move oversized fasta bins ("super bins") from in_dir into out_dir.

    Only entries directly inside in_dir whose name ends in ".fa" or ".fasta"
    are considered. out_dir must already exist.

    Args:
        in_dir: Directory containing the bin fasta files.
        out_dir: Existing directory that receives the oversized files.
        min_size: Size in bytes at or above which a file is moved
            (default 100,000,000).

    Returns:
        The number of files that were moved.
    """
    mv_count = 0
    print("filter_bin_files: getting files")
    # Work with full paths rather than os.chdir(in_dir): changing the
    # process-wide working directory would make a relative out_dir (and any
    # relative path the caller uses afterwards) resolve against in_dir.
    fasta_files = sorted(
        f for f in os.listdir(in_dir) if f.endswith((".fa", ".fasta"))
    )
    print("filter_bin_files: removing large (>= {:,} bytes) files".format(min_size))
    for f in fasta_files:
        src = os.path.join(in_dir, f)
        size = os.path.getsize(src)
        if size >= min_size:
            shutil.move(src, os.path.join(out_dir, f))
            mv_count += 1
            print("\tMoved file {}: {:,} bytes".format(f, size))
    print("filter_bin_files: Identified {} super bins".format(mv_count))

    return mv_count

def write_outfile(outfile, mv_count, out_dir):
    """
    Write a two-line text report of the super bin filtering.

    Args:
        outfile: Path of the report file to create (overwritten if present).
        mv_count: Number of super bins that were identified.
        out_dir: Directory the super bins were moved to; echoed in the report.
    """
    print("write_outfile: Writing output file.")
    # Explicit encoding keeps the report independent of the locale, and the
    # trailing newline makes the last line a complete text line.
    with open(outfile, 'w', encoding='utf-8') as fh:
        fh.write("Identified {} super bins\nFile(s) located in: {}\n".format(mv_count, out_dir))

def main():
    """
    Script entry point: parse the command line, create the output directory,
    move the super bins into it and write the report file.
    """
    cli = get_args()
    make_dir(cli.out_dir)
    n_moved = filter_bin_files(cli.in_dir, cli.out_dir)
    write_outfile(cli.outfile, n_moved, cli.out_dir)


if __name__ == '__main__':
    main()

0 comments on commit 15cf132

Please sign in to comment.