forked from weka511/bioinformatics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
GREP.py
116 lines (96 loc) · 3.45 KB
/
GREP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
'''
Copyright (C) 2017 Greenweaves Software Pty Ltd
This is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>
'''
import copy
from rosalind import dbru,read_strings
def count_kmers(S):
'''
Construct a list of counts for kmers, so we can make sure
each cycle has the mataching frequencies.
'''
counts={}
for s in S:
if s in counts:
counts[s]+=1
else:
counts[s]=1
return counts
def create_lookup(B,E):
'''
Turn edges into a loolup table for successors
'''
F={}
for b in B:
F[b]=[f for (e,f) in E if e==b]
return F
def remove_unused_kmer(counts):
'''Get rid of zero counts'''
removes=[]
for key,value in counts.items():
if value==0:
removes.append(key)
for key in removes:
del counts[key]
return counts
def format(r):
return ''.join([rr[0] for rr in r] + [r[-1][-1]])
def grep(S):
'''
GREP Genome Assembly with Perfect Coverage and Repeats
'''
counts=count_kmers(S)
B,E=dbru(S,include_revc=False)
F=create_lookup(B,E)
# We are going to build a list of runs, whicg are candidates for cycles.
# Each time we have a choice of successor, we generate another run, so all
# candidates are expanded successively.
# If a candiate cannot be extended, jusr leave it in the list and remove
# it at the end.
Runs=[[S[0]]]
counts[S[0]]-=1
counts=remove_unused_kmer(counts)
# We maintain a separate list of counts for each run, so we
# know which symbols are available.
CountsForRuns=[counts]
for n in range(len(S)-1): # Grow runs by one base each iteration
NewRuns=[] # Stick extra runs on the end when we are done
for i in range(len(Runs)):
run=Runs[i]
counts=CountsForRuns[i]
last=run[-1][1:]
succ=F[last]
counts_old=copy.deepcopy(counts)
j=0
added=False
while j<len(succ):
kmer=last+succ[j][-1]
if kmer in counts_old:
if added:
new_counts=copy.deepcopy(counts_old)
new_counts[kmer]-=1
new_counts=remove_unused_kmer(new_counts)
new_run=copy.deepcopy(run[:-1])
new_run.append(kmer)
CountsForRuns.append(new_counts)
NewRuns.append(new_run)
else:
counts[kmer]-=1
counts=remove_unused_kmer(counts)
run.append(kmer)
added=True
j+=1
Runs = Runs + NewRuns
return [format(r)[:-1] for r in Runs if len(r)==len(S)]
if __name__=='__main__':
for s in grep(read_strings('c:/Users/Weka/Downloads/rosalind_grep(1).txt')):
print (s)