-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVariable_sites_extractor.py
executable file
·130 lines (105 loc) · 4.79 KB
/
Variable_sites_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/env python
# Script for extracting from a multi-fasta aligment:
# [A]ll sites (ie. do not only return variable or informative. Can still filter out N or gap-containing sites)
# [V]ariable sites (Sites where 1 >= sites have SNPs)
# [I]nformative sites (Sites where 2 >= sites have SNPs)
# [U]ngapped alignment (remove all gap-containing sites)
# [N]-containing sites (remove sites with ambigous characters)
# [C]oordinates (return coordinates of sites rather than characters themselves)
# I overrides V
import sys, argparse, progressbar
from Bio import SeqIO
from time import sleep
def Usage():
print "Variable_sites_extraction.py -v -i -u -N -o outputfile.fasta alignment.fasta"
sys.exit(2)
def main(argv):
parser=argparse.ArgumentParser(description="Strip sites from multi-FASTA alignments")
parser.add_argument("alignment")
variable_or_informative = parser.add_mutually_exclusive_group(required=True)
variable_or_informative.add_argument("-a", "--all", help="Return all sites. (Can still filter out N- or gap-containing sites).", action="store_true", default=False)
variable_or_informative.add_argument("-v", "--variable", help="Return variable sites. (Sites where 1 >= isolates have SNPs)", action="store_true", default=False)
variable_or_informative.add_argument("-i", "--informative", help="Return informative sites (Sites where 2 >= isolates have SNPs) Overrides -v", action="store_true", default=False)
parser.add_argument("-u", "--ungapped", help="Do not return gapped sites", action="store_true", default=False)
parser.add_argument("-N", "--N_containing", help="Do not return N-containing sites", action="store_true", default=False)
parser.add_argument("-o", "--output_file", help="Path to output file (Default: ./output.fasta)",default="outputfile.fasta")
parser.add_argument("-c", "--coordinates", help="Return coordinates of sites rather than characters themselves", action="store_true", default=False)
args = parser.parse_args()
# Needed?
if args.variable == True:
args.informative = False
print "You have supplied the following arguments:"
print "MSA input file:\t\t" + args.alignment
print "Filtered MSA output file:\t\t" + args.output_file
print "Return all sites:\t\t\t" + str(args.all)
print "Return variable sites:\t\t\t" + str(args.variable)
print "Return informative sites:\t\t" + str(args.informative)
print "Filter out N-containing columns:\t" + str(args.N_containing)
print "Filter out gapped columns:\t\t" + str(args.ungapped)
print "Return coordinates only:\t\t" + str(args.coordinates)
# FILTER ALIGNMENT:
with open(args.alignment, "rU") as f:
handle = SeqIO.parse(f, "fasta")
backup = {l.id : l.seq._data for l in handle}
dic = backup.values()
ids = backup.keys()
if not args.coordinates:
newdic = {k : "" for k in ids}
else:
newdic = ""
length = len(dic[0])
print "Finished reading MSA file. Iterating over all positions and all isolates..."
# Set up progress bar:
bar = progressbar.ProgressBar(maxval=length, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()
for nuc in range(length): # For each nucleotide in the alignment
if args.all:
include = True
else:
include = False
col = "".join([x[nuc] for x in dic])
# Make upper-case:
col = col.replace("a","A").replace("c","C").replace("g","G").replace("t","T")
if "N" in col and args.N_containing:
include = False
continue # Go to next nucleotide
if "-" in col and args.ungapped:
include = False
continue # Go to next nucleotide
nucleotide_counts = {}
for char in ["A","C","G","T"]:
charcount = col.count(char)
if charcount > 0:
#nucleotide_counts[char] = col.count(char)
nucleotide_counts[char] = charcount
if args.variable:
# Then it is enough to know that more than one key exists
if len(nucleotide_counts.keys()) > 1:
include = True
elif args.informative:
# First we need to know that at least two different bases exists at this site
if not len(nucleotide_counts.keys()) > 1:
continue # Go to next nucleotide
# Then the second most common nucleotide needs to have a value of >= 2
if sorted(nucleotide_counts.values())[-2] >= 2:
include = True
if len(nucleotide_counts.keys()) == 4:
# Sites where all bases are represented are not informative
include = False
if include:
if not args.coordinates:
for isolate in newdic:
newdic[isolate] += backup[isolate][nuc]
else:
newdic += str(nuc) + ","
bar.update(nuc+1)
bar.finish()
# Convert to dic if coordinate mode
if args.coordinates:
newdic = {"C": newdic}
with open(args.output_file, "w") as g:
for key in sorted(newdic):
g.write(">" + key + "\n")
g.write(newdic[key] + "\n")
if __name__ == "__main__":
main(sys.argv[1:])