-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_regions
executable file
·428 lines (366 loc) · 17.2 KB
/
extract_regions
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
#!/usr/bin/env python
from __future__ import division
import os
import errno
import sys
import argparse
import shlex
import shutil
import time
import subprocess
import glob
import multiprocessing
import tempfile
# position of the script -------------------------------------------------------
path_DB = os.path.realpath(__file__)
path_array = path_DB.split("/")
relative_path = "/".join(path_array[0:-1])
relative_path = relative_path + "/"
# ------------------------------------------------------------------------------
# print the help informations
# ------------------------------------------------------------------------------
class CapitalisedHelpFormatter(argparse.HelpFormatter):
def add_usage(self, usage, actions, groups, prefix=None):
if prefix is None:
prefix = ''
return super(CapitalisedHelpFormatter, self).add_usage(usage, actions, groups, prefix)
def msg(name=None):
str_msg = '''
\00
Program: extract_regions - a tool to extract variable regions from 16S gene sequences
Usage: extract_regions [-i/-a] <file> [options]
Input option:
-i FILE fasta file containing the 16S sequences
-a FILE provide the alignment generated by -A [None]
-f STR instead of extracting a region, specify forward primer [None]
-r STR instead of extracting a region, specify reverse primer [None]
-c STR which clade to use [Bacteria]
Output option:
-o FILE output file name [stdout]
-A FILE save the intermediate alignment [None]
-s print output as MSA
Algorithm options:
-t INT number of threads [1]
-v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [3]
Cmalign options:
--mxsize INT maximum allowable DP matrix size [4000]
Possible forward primers: 8F, 27F, 68F, 341F, 515F, 967F, 1237F
Possible reverse primers: 338R, 519R, 785R, 806R, 907R, 926R, 1100R, 1391R, 1492R
'''
return str_msg
# ------------------------------------------------------------------------------
# function to check if a specific tool exists
def is_tool(name):
try:
devnull = open(os.devnull)
subprocess.Popen([name], stdout=devnull, stderr=devnull).communicate()
except OSError as e:
if e.errno == errno.ENOENT:
return False
return True
# ------------------------------------------------------------------------------
# function to align fasta sequences with cmalign and return the stream of data
def cm_align_fun(relative_path,fasta_input,mxsize,threads,selected_cmfile,output_alignment,save_alignment):
# NOTE1: that the alignment used U instead of T's, so we need to convert
# before calling the alignment
# NOTE2: we add a reference sequence at the beginning. The variable
# regions and primers are defined on this sequence
# both are taken care in bin/print_converted.py
cmd_convert = "python "+relative_path+"bin/print_converted.py "
cmd_convert = cmd_convert + relative_path+"DATA/reference_sequence.fasta "
cmd_convert = cmd_convert + fasta_input
# cmalignscript (note output is in stdout)
cmd_cmalign = "cmalign --mxsize "+str(mxsize)+" --outformat Pfam --cpu "+str(threads)+" "
# cm model
cmd_cmalign = cmd_cmalign + relative_path+"DATA/RDPinfernal1.1Traindata/"+selected_cmfile
# fasta input, is through stdin
cmd_cmalign = cmd_cmalign + " -"
# parse alignment
cmd_parse_al = "python "+relative_path+"bin/parse_alignment.py "
cmd_parse_al = cmd_parse_al + output_alignment + " "+save_alignment
# run convert and alignment ----------------------------
CMD_convert = shlex.split(cmd_convert)
CMD_cmalign = shlex.split(cmd_cmalign)
CMD_parse_al = shlex.split(cmd_parse_al)
# import devnull
try:
from subprocess import DEVNULL
except ImportError:
DEVNULL = open(os.devnull, 'wb')
# commands
convert_ex = subprocess.Popen(CMD_convert,stdout=subprocess.PIPE,encoding='utf8')
cmalign_ex = subprocess.Popen(CMD_cmalign,stdin=convert_ex.stdout,stdout=subprocess.PIPE,encoding='utf8')
parse_al_ex = subprocess.Popen(CMD_parse_al,stdin=cmalign_ex.stdout,stdout=subprocess.PIPE,encoding='utf8')
# return the line
for line in parse_al_ex.stdout:
yield line
# check that the commands finished correctly
convert_ex.stdout.close()
return_code = convert_ex.wait()
if return_code:
sys.stderr.write("[E::cmalign] Error. parse the fasta file failed\n")
sys.exit(1)
cmalign_ex.stdout.close()
return_code = cmalign_ex.wait()
if return_code:
sys.stderr.write("[E::cmalign] Error. cmalign failed\n")
sys.exit(1)
parse_al_ex.stdout.close()
return_code = parse_al_ex.wait()
if return_code:
sys.stderr.write("[E::cmalign] Error. parsing the alignment file failed\n")
sys.exit(1)
# ------------------------------------------------------------------------------
# function to load to the stream of data the alignment rows
def create_stream_al(alignment_file):
o = open(alignment_file,"r")
for line in o:
yield line
o.close()
# ------------------------------------------------------------------------------
# function to find the positions to extract
def find_al_position(forPrimer, reversePrimer, aligned_ref):
pos_to_check = dict()
# FIND VARIABLE REGIONS --------------------------
if forPrimer == "" and reversePrimer == "":
# then we extract all the 9 variable regions
o = open(relative_path+"DATA/variable_regions.tsv","r")
o.readline()
o.readline()
for line in o:
vals = line.rstrip().split("\t")
pos_to_check[vals[0]] = vals[1]+"-"+vals[2]
o.close()
# FIND REGION BETWEEN PRIMERS --------------------
if (forPrimer != "" or reversePrimer != "") and forPrimer.endswith("F") and reversePrimer.endswith("R"):
# load primers and find the position in the string, i.e. left margin and
# right margin
o = open(relative_path+"DATA/primers_info.tsv","r")
o.readline()
for i in o:
vals = i.split("\t")
if vals[0] == forPrimer:
left_margin = vals[1]
if vals[0] == reversePrimer:
right_margin = vals[2]
o.close()
pos_to_check[forPrimer+"-"+reversePrimer] = left_margin+"-"+right_margin
# FIND REGION BETWEEN POSITIONS --------------------
if forPrimer != "" or reversePrimer != "":
if not forPrimer.endswith("F"):
pos_to_check[forPrimer+"-"+reversePrimer] = forPrimer+"-"+reversePrimer
# arriving here, we have in pos_to_check the positions in the reference
# sequence that we need to extract
# now we need to find these positions in the MSA
pos_in_msa = dict()
for region in pos_to_check:
# position in the string
left_margin = int(pos_to_check[region].split("-")[0])
right_margin = int(pos_to_check[region].split("-")[1])
# position in the MSA
left_msa = 0
right_msa = 0
# go throught the sequence and find the positions in the MSA
count_nucleotides = 0
count_pos_msa = 0
for p in aligned_ref:
count_pos_msa = count_pos_msa + 1
if p != "." and p != "-":
count_nucleotides = count_nucleotides + 1
if left_margin == (count_nucleotides + 1): left_msa = count_pos_msa
if right_margin == count_nucleotides: right_msa = count_pos_msa
# add result to the dict to return
pos_in_msa[region] = [left_msa,right_msa]
return pos_in_msa
# ------------------------------------------------------------------------------
# MAIN
# ------------------------------------------------------------------------------
def main(argv=None):
devnull = open(os.devnull)
parser = argparse.ArgumentParser(usage=msg(), formatter_class=CapitalisedHelpFormatter,add_help=False)
parser.add_argument('-v', action='store', type=int, default=None, dest='verbose', help='Verbose levels')
parser.add_argument('-t', type=int, action="store", dest='threads', default=None, help='Number of threads to be used.')
parser.add_argument('-o', action="store", dest='output', default=None, help='name of output file')
parser.add_argument('-A', action="store", dest='output_alignment', default=None, help='save alignment')
parser.add_argument('-r', action="store", default=None,dest='reversePrimer', help='which reverse primer to use')
parser.add_argument('-f', action="store", default=None,dest='forPrimer', help='which forward primer to use')
parser.add_argument('-c', action="store", default=None,dest='clade', help='which clade',choices=['Bacteria',"Archaea","Fungal"])
parser.add_argument('-i', action="store", default=None,dest='fasta_input', help='fasta input')
parser.add_argument('-a', action="store", default=None,dest='al_input', help='alignment input')
parser.add_argument('-s', action='store_true', default=None, dest='out_as_alignment', help='print output as an alignment')
parser.add_argument('--mxsize', action="store", default=None, dest='mxsize', help='to input to cmalign')
args = parser.parse_args()
if (args.fasta_input is None) and (args.al_input is None):
print(msg())
sys.exit(1)
# set default for args.verbose
if (args.verbose is None): args.verbose = 3
# set the default
if (args.output is None):
output = ""
else:
output = args.output
if (args.threads is None): args.threads = 1
if (args.mxsize is None): args.mxsize = 4000
if (args.output_alignment is None):
args.output_alignment = "NA"
save_alignment = "False"
else:
save_alignment = "True"
if (args.reversePrimer is None): args.reversePrimer = ""
if (args.forPrimer is None): args.forPrimer = ""
if (args.fasta_input is None): args.fasta_input = ""
if (args.al_input is None): args.al_input = ""
if (args.out_as_alignment is None): args.out_as_alignment = False
if (args.clade is None): args.clade = "Bacteria"
# ---------------------- check general parameters --------------------------
if args.verbose < 1:
sys.stderr.write("[E::main] Error: verbose level (-v) is less than 1\n")
sys.exit(1)
if args.threads < 1:
sys.stderr.write("[E::main] Error: number of threads (-t) is less than 1\n")
sys.exit(1)
# there cannot be -a and -i
if args.fasta_input != "" and args.al_input != "":
sys.stderr.write("[E::main] Error: Provide -i OR -a (not both)\n")
sys.exit(1)
# there should be either -a or -i
if args.fasta_input == "" and args.al_input == "":
sys.stderr.write("[E::main] Error: Missing input file (provide either -i or -a)\n")
sys.exit(1)
# check that if one primer is present, then they both are
if (args.forPrimer != "" and args.reversePrimer == ""):
sys.stderr.write("[E::main] Error: -r missing\n")
sys.exit(1)
if (args.forPrimer == "" and args.reversePrimer != ""):
sys.stderr.write("[E::main] Error: -f missing\n")
sys.exit(1)
# check that the primers are correct (regarding the selection of the region)
if args.forPrimer != "" and args.reversePrimer != "":
if args.forPrimer[-1] == "F":
pos_F = int(args.forPrimer[:-1])
else:
pos_F = int(args.forPrimer)
if args.reversePrimer[-1] == "R":
pos_R = int(args.reversePrimer[:-1])
else:
pos_R = int(args.reversePrimer)
if pos_F > pos_R:
sys.stderr.write("[E::main] Error: primers don't represent any region\n")
sys.exit(1)
# --------------------------- set the clade --------------------------------
selected_cmfile = "bacteria_model.cm"
if(args.clade != "Bacteria"):
sys.stderr.write("[W::main] Warning: only Bacteria is implemented. Running with -c Bacteria\n")
# --------------------------------------------------------------------------
# 0. CHECK DEPENDENCIES
# --------------------------------------------------------------------------
# Infernal
if not is_tool("cmalign"):
if args.fasta_input != "": # needed only if the input is the fasta file
sys.stderr.write("[E::main] Error: cmalign is not in the path. Please install Infernal.\n")
sys.exit(1)
# --------------------------------------------------------------------------
# 0. CHECK FILE
# --------------------------------------------------------------------------
# check that -i is a fasta file
if args.fasta_input != "":
try:
o = open(args.fasta_input,"r")
# check that it starts with ">"
if not(o.readline().startswith(">")):
sys.stderr.write("[E::main] Error. Not a fasta file: "+args.fasta_input+"\n")
sys.stderr.write(" Fasta file is expected to start with '>'\n")
o.close()
sys.exit(1)
o.close()
except:
sys.stderr.write("[E::main] Error: Cannot open file: "+args.fasta_input+"\n")
sys.exit(1)
# check that -a is alignment file
if args.al_input != "":
try:
o = open(args.al_input,"r")
# check that it starts with "REFERENCE"
if not(o.readline().startswith("REFERENCE")):
sys.stderr.write("[E::main] Error. Not an alignment file: "+args.al_input+"\n")
o.close()
sys.exit(1)
o.close()
except:
sys.stderr.write("[E::main] Error: Cannot open file: "+args.al_input+"\n")
sys.exit(1)
# --------------------------------------------------------------------------
# 1. ALIGN FASTA INPUTS
# --------------------------------------------------------------------------
if args.fasta_input != "":
# we use cmailgn to align the 16S sequences as input to
al_lines = cm_align_fun(relative_path,args.fasta_input,args.mxsize,args.threads,selected_cmfile,args.output_alignment,save_alignment)
# in al_lines there is a stream of rows that represents aligned 16S sequences
# otherwise we load the alignment saved before
if args.al_input != "":
al_lines = create_stream_al(args.al_input)
# --------------------------------------------------------------------------
# 2. EXTRACT COLUMNS FROM THE MSA
# --------------------------------------------------------------------------
check_ref = True # faster to check than the startswith
# general print
if output != "":
outfile = tempfile.NamedTemporaryFile(delete=False, mode="w")
os.chmod(outfile.name, 0o644)
else:
outfile = sys.stdout
for row in al_lines:
# the first line is the reference sequence, that we use to extract the
# correct columns
if check_ref:
if row.startswith("REFERENCE_Escherichia-coli-K-12"):
check_ref = False
al_ref_seq = row.rstrip().split("\t")[1]
dict_positions = find_al_position(args.forPrimer, args.reversePrimer,al_ref_seq)
# dict_positions contains the position in the alignment of the
# region(s) that we are intereseted in, example:
# {'V1': '134-189', 'V2': '265-308', ...
# if we have only one, I extract the key of the dictionary
s_region = next(iter(dict_positions))
# and the two boundaries
l_re = dict_positions[s_region][0]
r_re = dict_positions[s_region][1]
else:
# we extract the columns from dict_positions
vals = row.rstrip().split("\t")
# if there is only one region, then we don't change the header
if len(dict_positions) == 1:
outfile.write(">"+vals[0]+"\n")
if args.out_as_alignment:
outfile.write(vals[1][l_re:r_re]+"\n")
else:
outfile.write(vals[1][l_re:r_re].replace(".","").replace("-","").upper()+"\n")
# otherwise, we add the region info in the header
else:
for rr in dict_positions:
outfile.write(">"+vals[0]+"__"+rr+"\n")
if args.out_as_alignment:
outfile.write(vals[1][dict_positions[rr][0]:dict_positions[rr][1]]+"\n")
else:
outfile.write(vals[1][dict_positions[rr][0]:dict_positions[rr][1]].replace(".","").replace("-","").upper()+"\n")
# move the temp file to the final destination ---------
if output != "":
try:
outfile.flush()
os.fsync(outfile.fileno())
outfile.close()
except:
sys.stderr.write("[E::main] Error: failed to save the output file\n")
sys.exit(1)
try:
shutil.move(outfile.name,output) #It is not atomic if the files are on different filsystems.
except:
sys.stderr.write("[E::main] Error: failed to save the output file\n")
sys.stderr.write("[E::main] you can find the file here:\n"+outfile.name+"\n")
sys.exit(1)
return 0 # success
#-------------------------------- run main -------------------------------------
if __name__ == '__main__':
status = main()
sys.exit(status)