-
Notifications
You must be signed in to change notification settings - Fork 2
/
prepare_BEAST_alignment.py
executable file
·320 lines (229 loc) · 8.74 KB
/
prepare_BEAST_alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#!/usr/bin/env python
#/usr/bin/env python
##################
# Import modules #
##################
import string, re
import os, sys, getopt, math
from random import *
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC, Gapped
from Bio.SeqRecord import SeqRecord
from Bio import AlignIO
from Bio.Align import Generic
from modules.Si_nexus import *
from optparse import OptionParser, OptionGroup
##########################
# Error message function #
##########################
def DoError(errorstring):
print "\nError:", errorstring
print "\nFor help use -h or --help\n"
sys.exit()
##########################################
# Function to Get command line arguments #
##########################################
def get_user_options():
usage = "usage: %prog [options]"
version="%prog 1.0. Written by Simon Harris, Wellcome Trust Sanger Institute, 2010"
parser = OptionParser(usage=usage, version=version)
group = OptionGroup(parser, "Required")
group.add_option("-i", "--input", action="store", dest="inputfile", help="Input file name", default="")
group.add_option("-o", "--output", action="store", dest="outfile", help="Output file name", default="")
group.add_option("-w", "--overwrite", action="store_true", dest="overwrite", help="Force overwrite", default=False)
group.add_option("-d", "--dates", action="store", dest="dates", help="Dates csv file name (Name,date)", default="")
group.add_option("-n", "--nons", action="store_true", dest="nons", help="Exclude sites which are constant other than Ns", default=False)
group.add_option("-g", "--gaps", action="store_true", dest="gaps", help="Gaps (-) are real", default=False)
group.add_option("-x", "--exclude", action="store_true", dest="exclude", help="Exclude isolates without dates from output", default=False)
parser.add_option_group(group)
return parser.parse_args()
################################
# Check command line arguments #
################################
def check_input_validity(options, args):
if options.inputfile=='':
DoError('No input file selected')
elif not os.path.isfile(options.inputfile):
DoError('Cannot find file '+options.inputfile)
if options.dates!='' and not os.path.isfile(options.dates):
DoError('Cannot find file '+options.dates)
if options.outfile=='':
options.outfile=options.ref.split("/")[-1].split(".")[0]
while os.path.isfile(options.outfile) and options.overwrite==False:
outopt=""
outopt=raw_input('\nOutput files with chosen prefix already exist.\n\nWould you like to overwrite (o), choose a new output file prefix (n) or quit (Q): ')
if outopt=='Q':
sys.exit()
elif outopt=="o":
break
elif outopt=="n":
options.outfile=raw_input('Enter a new output file prefix: ')
return
def read_metadata(filehandle, name_column=1, unit_column=2, value_column=3, header=False, name_heading="", unit_heading="", value_heading="", split_value="\t"):
metadata={}
for x, line in enumerate(filehandle):
line=line.strip()
if x==0 and header:
headings=line.split(split_value)
for colcount, colhead in enumerate(headings):
if unit_heading!="" and colhead==unit_heading:
unit_column=colcount+1
elif value_heading!="" and colhead==value_heading:
value_column=colcount+1
elif name_heading!="" and colhead==name_heading:
name_column=colcount+1
else:
columns=line.split()
if len(columns)<max([unit_column, value_column]):
print 'metadata has rows without correct number of columns...skipping:'
print line
else:
name=columns[name_column]
if name in metadata:
DoError("Repeated names in metadata")
metadata["name"]={}
unit=columns[unit_column].lower()
if not unit in ["days", "months", "years"]:
DoError("Time unit must be days, months or years!")
try:
value=float(columns[value_column])
except ValueError:
DoError("Time value column must be a float!")
if __name__ == "__main__":
#argv=sys.argv[1:]
#ref, inputfile, outfile, tabfile, align, embl, raxml, graphs, bootstrap, model, chisquare, recomb=getOptions(argv)
(options, args)=get_user_options()
check_input_validity(options, args)
dates={}
if options.dates!="":
for line in open(options.dates):
words=line.strip().split(',')
if len(words)<2:
continue
try:
dates[words[0]]=int(words[1])
except ValueError:
continue
print '\nReading input alignment...'
sys.stdout.flush()
sequences={}
currseq=''
#Read the alignment. If it's bigger than 2Gb read it line by line. Else read it all at once (faster)
try:
open(options.inputfile, "rU")
except IOError:
DoError('Cannot open alignment file '+options.inputfile)
if os.path.getsize(options.inputfile)<2000000000:
lines=open(options.inputfile, "rU").read().split('>')[1:]
else:
lines=[]
count=-1
for linea in open(options.inputfile, "rU"):
if linea[0]==">":
count=count+1
lines.append(linea.split()[0][1:]+'\n')
else:
lines[count]=lines[count]+linea
linesa=[]
sequences={}
exccount=0
for line in lines:
words=line.strip().split('\n')
if options.exclude and not words[0].split()[0] in dates:
exccount+=1
else:
sequences[words[0].split()[0]]=''.join(words[1:])
if exccount>0:
print "Removed", exccount, "sequences with no dates"
if len(sequences)==0:
print "\nNo sequences found"
sys.exit()
alnlen=len(sequences[sequences.keys()[0]])
for sequence in sequences.keys():
if len(sequences[sequence])!=alnlen:
print "\nERROR!: sequences are not all of the same length!!!\n"
sys.exit()
# if sequence!=ref:
# sequences[sequence]=sequences[sequence].upper().replace('N','-')
# else:
# sequences[sequence]=sequences[sequence].upper()
if not options.gaps:
sequences[sequence]=sequences[sequence].upper().replace('-','N')
else:
sequences[sequence]=sequences[sequence].upper()
#replace uncertainties in each sequence
for x in ["R", "S", "B", "Y", "W", "D", "K", "H", "M", "V"]:
sequences[sequence]=sequences[sequence].replace(x,"N")
#print sequences.keys()
print "Found", len(sequences.keys()), "sequences of length", alnlen
sys.stdout.flush()
snplocations=[]
Ncount=0
snpsequence={}
#Identify snps in the alignment
print "\nIdentifying SNPs...",
sys.stdout.flush()
constants={"A":0, "C":0, "G":0, "T":0}
for x in range(alnlen):
numbases=0
foundbases={}
for key in sequences.keys():
if not key in snpsequence:
snpsequence[key]=[]
base=sequences[key][x].upper()
if base not in foundbases.keys() and ((options.nons and base not in ['-', "N"]) or (not options.nons and base!="-")):
foundbases[base]=1
numbases=numbases+1
# elif base!='-':
# foundbases[base]+=1
if numbases>1:
if numbases==2 and "N" in foundbases.keys():
Ncount+=1
snplocations.append(x)
for key in sequences.keys():
snpsequence[key].append(sequences[key][x].upper())
elif numbases==1:
if foundbases.keys()[0] in constants:
constants[foundbases.keys()[0]]+=1
else:
constants[foundbases.keys()[0]]=0
constants[foundbases.keys()[0]]+=1
print "Done"
print "Found", len(snplocations)-Ncount, "sites with a SNP"
if not options.nons:
print "Found a further ", Ncount, "sites with no SNP, but at least one N"
print "Constant bases:"
sortbase=constants.keys()
sortbase.sort()
for base in sortbase:
print base+":", constants[base]
print
sys.stdout.flush()
print "Replace the patterns block in your BEAST xml file with this:"
print '\t<mergePatterns id="patterns">'
print '\t\t<patterns from="1" every="1">'
print '\t\t\t<alignment idref="alignment"/>'
print '\t\t</patterns>'
print '\t\t<constantPatterns>'
print '\t\t\t<alignment idref="alignment"/>'
print '\t\t\t<counts>'
print '\t\t\t\t<parameter value="', constants['A'], constants['C'], constants['G'], constants['T'],'"/>'
print '\t\t\t</counts>'
print '\t\t</constantPatterns>'
print '\t</mergePatterns>'
print '\nOr use replace_BEAST_blocks.py and provide the file', options.outfile+".patterns", "with the -p flag"
output=open(options.outfile+".patterns","w")
print >> output, ' '.join(map(str,[constants['A'], constants['C'], constants['G'], constants['T']]))
output.close()
alignment = Generic.Alignment(Gapped(IUPAC.unambiguous_dna, "-"))
for name in snpsequence:
# if len(''.join(snpsequence[name]).replace("-","").replace("N",""))>float(len(snpsequence[name]))*(float(options.exclude)/100):
# alignment.add_sequence(name, ''.join(snpsequence[name]))
# else:
# print name, "excluded from snp alignment as it is < "+str(options.exclude)+"% mapped"
if name in dates:
alignment.add_sequence(name+"_"+str(dates[name]), ''.join(snpsequence[name]))
else:
alignment.add_sequence(name, ''.join(snpsequence[name]))
AlignIO.write([alignment], open(options.outfile, 'w'), "fasta")