forked from jessicarowell/HMAS-QC-Pipeline
-
Notifications
You must be signed in to change notification settings - Fork 1
/
mpy_batch.py
executable file
·174 lines (137 loc) · 7.69 KB
/
mpy_batch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/env python
import sys, os, shutil, subprocess
from datetime import datetime
from mothur_py import Mothur
import re
import group, run_cutadapt
import glob
from collections import deque
import concurrent.futures
MOTHUR_LOG_FILE = ''
def check_filesize():
"""
This utility function checks and makes sure the good fasta file size is larger than scrap fasta file size
, otherwise it will throw out an RuntimeError
It does so by parsing the MOTHUR log file, and grab those 2 fasta file names, then compare their sizes.
"""
if (os.access(MOTHUR_LOG_FILE, os.R_OK)):
with open(MOTHUR_LOG_FILE, 'r', errors='ignore') as f:
log_file = f.read()
try:
good_fasta_file = re.search(r'.+trim\.contigs\.fasta', log_file).group(0)
scrap_fasta_file = re.search(r'.+scrap\.contigs\.fasta',log_file).group(0)
except AttributeError: #in case MOTHUR log file doesn't have trim/scrap fasta file
raise RuntimeError(f"************************************************************\n"
f"return_code=None Alert! ERROR\n"
f"can't find either trim or scrap fasta file in MOTHUR log\n"
f"************************************************************")
if (os.path.getsize(scrap_fasta_file) > os.path.getsize(good_fasta_file)):
raise RuntimeError(f"************************************************************\n"
f"return_code=None Alert! ERROR\n"
f"We have scrap fasta {scrap_fasta_file} file size larger than \n"
f"good fasta {good_fasta_file}!\n"
f"You most likely need to trim your reads first!\n"
f"************************************************************")
def main(config):
global MOTHUR_LOG_FILE
currentDir = os.getcwd()
now = datetime.now()
runDateTime = now.strftime("%Y%m%d%H%M%S")
MOTHUR_LOG_FILE = os.path.expanduser(config['file_inputs']['output_dir'])+'/mothur.'+runDateTime+'.logfile'
m = Mothur(logfile_name = MOTHUR_LOG_FILE)
m.set.dir(input = config.get('file_inputs', 'input_dir', fallback = currentDir),
output = config.get('file_inputs', 'output_dir', fallback = currentDir),
tempdefault = config.get('file_inputs','output_dir', fallback = currentDir))
m.make.contigs(file = config.get('file_inputs','batch_file'),
processors= config.getint('contigs_params', 'processors', fallback = 40),
format=config.get('contigs_params', 'format', fallback = 'illumina1.8+'),
oligos=config.get('file_inputs', 'oligos'),
bdiffs=config.getint('contigs_params', 'bdiffs', fallback = 0),
pdiffs=config.getint('contigs_params', 'pdiffs', fallback = 0),
checkorient=config.get('contigs_params', 'checkorient', fallback = 't'),
insert=config.getint('contigs_params', 'insert', fallback = 25),
trimoverlap=config.get('contigs_params', 'trimoverlap', fallback = 'f'),
allfiles=config.get('contigs_params','allfiles', fallback = 0))
# this serves as a check point and make sure that:
# good fasta file (trim.contigs.fasta) size > scrap fasta file size
check_filesize()
m.rename.file(fasta='current', group='current',
prefix=config.get('rename_param', 'prefix'))
m.screen.seqs(fasta='current', group='current',
maxambig=config.getint('screen_params', 'maxambig', fallback = 0),
maxlength=config.getint('screen_params', 'maxlength', fallback = 325))
m.unique.seqs(fasta='current')
m.summary.seqs(fasta='current', name='current')
print(datetime.now())
old_group = group.get_current_file(MOTHUR_LOG_FILE)
print (f"old group is: {old_group}")
new_group = group.create_new_Group(old_group)
print (f"new group is: {new_group}")
print(datetime.now())
old_accnos = group.get_current_file(MOTHUR_LOG_FILE, 'accnos')
old_fasta = group.get_current_file(MOTHUR_LOG_FILE, 'fasta')
print (f"old fasta is: {old_fasta}")
print (f"old accnos is: {old_accnos}")
#1. create primer pair group file
new_group_primer = group.create_new_Group(old_group, 'primer')
print (f"new group primer is: {new_group_primer}")
print(datetime.now())
#2. count.seqs()
m.count.seqs(name='current', group=new_group_primer)
print(datetime.now())
print(f"done with count.seqs")
#3. split.groups()
m.split.groups(fasta=old_fasta, count='current')
print(datetime.now())
print(f"done with split.groups")
new_fasta = f'{old_fasta[:-5]}merged.fasta'
run_cutadapt.run_cutadapt_mothur(config, new_fasta, 36)
# reverse-expand ~, to avoid hypen issue in directory name
new_fasta = new_fasta.replace(os.path.expanduser('~'), '~', 1)
# add these to avoid potential duplicate name issue in fasta file
m.list.seqs(fasta=new_fasta) #list all the unique names in the fasta file
m.get.seqs(fasta='current', accnos='current') #remove any duplicate names
#6. unique.seqs
m.unique.seqs(fasta='current', name='current')
m.summary.seqs(fasta='current', name='current')
dir = os.path.expanduser(config['file_inputs']['output_dir'])
for accnos_file in glob.glob(rf"{dir}/*.accnos"):
if os.path.getsize(accnos_file) == 0: # we shouldn't find any accnos file at this stage, but we check anyway
os.remove(accnos_file)
# Search for chimeras
# need to use proper path for vsearch
m.chimera.vsearch(fasta='current', name='current', group=new_group, dereplicate='t', vsearch=config.get('chimera_params', 'vsearch'))
# m.chimera.vsearch(fasta='current', count='current', dereplicate='t', vsearch=r'~/HMAS-QC-Pipeline/mothur/vsearch')
# check for emptry accnos file
# because we checked already, if we find another accnos file, it must come from chimera.vsearch()
chimera_accnos_empty = False
for accnos_file in glob.glob(rf"{dir}/*.accnos"):
if os.path.getsize(accnos_file) == 0:
os.remove(accnos_file)
chimera_accnos_empty = True
if chimera_accnos_empty:
m.set.current(group=old_group, accnos=old_accnos)
else:
m.set.current(group=old_group)
m.remove.seqs(fasta='current', accnos='current', group='current', name='current')
# this works, but just takes too long, over 3 days for Juno data M347-21-026 M347-21-027..
m.count.seqs(name='current', group='current')
print(datetime.now())
print(f"done with count.seqs")
# m.pre.cluster(fasta='current', count='current', diffs=0)
# print(datetime.now())
# print(f"done with pre.cluster")
m.summary.seqs(fasta='current', count='current')
m.cluster(count='current', method='unique', cutoff='unique')
m.remove.rare(list='current', count='current',
nseqs=config.getint('rare_seqs_param', 'nseqs', fallback=9),
label='unique')
current_count = group.get_current_file(MOTHUR_LOG_FILE, 'count').replace(os.path.expanduser('~'), '~', 1)
m.list.seqs(count=current_count)
m.get.seqs(fasta='current', accnos='current', name='current', group='current')
m.summary.seqs(fasta='current', count='current')
m.rename.file(fasta='current', count='current', prefix=config.get('rename_param', 'prefix') +'.final')
# convert to full format count_table
m.count.seqs(count='current', compress='f')
if __name__ == "__main__":
print("This module is called by pipeline.py. Please run pipeline.py --help for more information")