clean.py

#!/usr/bin/env python3
import sys
import os
import re

from Bio import SeqIO


def get_files_to_analyze(file_or_directory):
    """
    :param file_or_directory: name of either a file or a directory of files
    :return: a sorted list of all files found (even if there is only one)
    """
    # This will be the list we send back
    files_list = []
    if os.path.isdir(file_or_directory):
        # Walk every directory within the specified directory and
        # collect the full path of every file found
        for root, dirs, files in os.walk(file_or_directory):
            for filename in files:
                files_list.append(os.path.join(root, filename))
    else:
        # We will just use the single file given by the user
        files_list.append(os.path.abspath(file_or_directory))
    # Sort the list before returning it, so the order is always the same
    return sorted(files_list)
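
# A minimal usage sketch (hypothetical file names, assuming a layout like
# assemblies/sample1.fasta and assemblies/sample2.fasta):
#
#   get_files_to_analyze("assemblies")
#   -> ['assemblies/sample1.fasta', 'assemblies/sample2.fasta']
#   get_files_to_analyze("assemblies/sample1.fasta")
#   -> ['/abs/path/to/assemblies/sample1.fasta']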


def find_recurring_char(record, start, end):
    window = end - start
    recur_char = 'X'
    # This is the density threshold for flagging a garbage sequence:
    # 0.75 means that 75% of the nucleotides in the window must be the
    # same base to trigger a cut
    perc_cut = 0.75
    A_count = record.count("A", start, end)
    T_count = record.count("T", start, end)
    G_count = record.count("G", start, end)
    C_count = record.count("C", start, end)
    N_count = record.count("N", start, end)
    if (A_count / window) >= perc_cut:
        recur_char = 'A'
    elif (T_count / window) >= perc_cut:
        recur_char = 'T'
    elif (G_count / window) >= perc_cut:
        recur_char = 'G'
    elif (C_count / window) >= perc_cut:
        recur_char = 'C'
    elif (N_count / window) >= perc_cut:
        recur_char = 'N'
    return recur_char
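
# A quick illustration (hypothetical sequences, not from real data): a 40 bp
# window that is 36/40 (90%) adenine clears the 0.75 threshold, while an
# evenly mixed window does not.
#
#   find_recurring_char("A" * 36 + "TGCT", 0, 40)   -> 'A'
#   find_recurring_char("ATGC" * 10, 0, 40)         -> 'X'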


def format_files(files_list, output_dir):
    """
    Print re-formatted fasta files to a new directory.
    We will remove anything smaller than 500 bp, anything under 5x coverage,
    and any runs of repeating or near-repeating bases at the beginning or
    end of a contig.
    :param files_list: list of fasta files to clean
    :param output_dir: directory the cleaned fasta files are written to
    :return: None
    """
    max_score = 25
    for f in files_list:
        with open(os.path.join(output_dir, os.path.basename(f)), "w") as oh:
            with open(f, "r") as fh:
                for record in SeqIO.parse(fh, "fasta"):
                    if len(record.seq) < 500:
                        # print("Skipping {}, less than 500bp".format(record.id), file=sys.stderr)
                        continue
                    # Coverage is parsed from assembler-style headers that embed
                    # it in the record id, e.g. NODE_1_length_600507_cov_22.5
                    m = re.search(r"_cov_([\d\.]+)", record.id)
                    if m:
                        if float(m.group(1)) < 5:
                            # print("Skipping {}, low coverage {}".format(record.id, m.group(1)), file=sys.stderr)
                            continue
                    # else:
                    #     print("Could not find coverage for {}".format(record.id), file=sys.stderr)
                    length = len(record.seq)
                    seq = record.seq
                    # Searching the end of the contig for garbage
                    recur_char = "X"
                    window_size = 0
                    # Check windows of increasing size at the end of the contig for a
                    # high density of a single nucleotide; if no garbage sequence is
                    # found, recur_char stays 'X'
                    for i in range(30, 410, 20):
                        recur_char = find_recurring_char(record.seq, length - i, length)
                        if recur_char != 'X':
                            window_size = i
                            break
                    if recur_char != 'X':
                        index = length - window_size + 1
                        score = max_score
                        # Until the score hits zero (or we reach the start of the
                        # contig), walk the sequence one character at a time and
                        # adjust the score
                        while score != 0 and index > 0:
                            index -= 1
                            curr_char = record.seq[index]
                            if curr_char == recur_char and score != max_score:
                                # the character matches the high-density base: add 1
                                score += 1
                            elif curr_char != recur_char:
                                # the character does not match the high-density base: subtract 1
                                score -= 1
                            if score == max_score:
                                # every time the score is back at the maximum, mark
                                # everything past this point for deletion
                                window_size = length - index
                        seq = record.seq[0:(length - window_size)]
                        print("Trimming {}, {} mostly {} bases removed from end".format(
                            record.id, length - len(seq), recur_char))
                    # Searching the front of the contig for garbage
                    length = len(seq)
                    recur_char = "X"
                    window_size = 0
                    for i in range(30, 410, 20):
                        recur_char = find_recurring_char(seq, 0, i)
                        if recur_char != 'X':
                            window_size = i
                            break
                    if recur_char != 'X':
                        index = window_size - 1
                        score = max_score
                        while score != 0 and index < length - 1:
                            index += 1
                            curr_char = seq[index]
                            if curr_char == recur_char and score != max_score:
                                score += 1
                            elif curr_char != recur_char:
                                score -= 1
                            if score == max_score:
                                window_size = index
                        seq = seq[window_size + 1:length]
                        print("Trimming {}, {} mostly {} bases removed from start".format(
                            record.id, length - len(seq), recur_char))
                    record.seq = seq
                    SeqIO.write(record, oh, "fasta")
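
# An illustrative walk-through (hypothetical contig, not from real data): for a
# 600 bp contig whose last ~200 bases are nearly all 'A', the end-of-contig
# window scan (30-390 bp) flags 'A', the backwards +/-1 score locates the start
# of the run, the poly-A tail is dropped, and a log line along the lines of
# "Trimming NODE_1_length_600_cov_20.0, 200 mostly A bases removed from end"
# is printed before the cleaned record is written.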


if __name__ == "__main__":
    all_files = get_files_to_analyze(sys.argv[1])
    format_files(all_files, sys.argv[2])
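
# A hypothetical invocation (paths are placeholders):
#
#   python3 clean.py assemblies/ cleaned/
#
# reads every fasta file under assemblies/ and writes a cleaned copy of the
# same name into cleaned/ (the output directory is assumed to already exist,
# since the script does not create it).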