forked from salimt/Courses-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
week2.py
168 lines (136 loc) · 21.3 KB
/
week2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# -*- coding: utf-8 -*-
"""
@author: salimt
"""
"""
We start sliding the window at position 0 of Text, but where should we stop?
In general, the final k-mer of a string of length n begins at position n-k;
for example, the final 3-mer of "GACCATACTG", which has length 10, begins at
position 10 - 3 = 7. This observation implies that the window should slide
between position 0 and position len(Text)-len(Pattern).
input: pattern - sequence of base pattern we are searching for
ex: "GCGATCGCG"
text- the full base sequence string
ex: "GCG"
output: the number of times Pattern appears in Text
ex: 2
"""
def PatternCount(Text, Pattern):
count = 0
for i in range(0,len(Text)-len(Pattern)+1):
if(Text[i:i+len(Pattern)])==Pattern:
count = count+1
return count
#Text = "GACCATCAAAACTGATAAACTACTTAAAAATCAGT"
#Pattern = "AAA"
#print(PatternCount(Pattern, Text))
"""
Although most bacteria have circular genomes, we have thus far assumed that
genomes were linear, a reasonable simplifying assumption because the length of
the window is much shorter than the length of the genome. This time, because we
are sliding a giant window, we should account for windows that “wrap around” the
end of Genome. To do so, we will define a string ExtendedGenome as Genome+Genome[0:n//2]
"""
def SymbolArray(Genome, symbol):
array = {}
n = len(Genome)
ExtendedGenome = Genome + Genome[0:n//2]
for i in range(n):
array[i] = PatternCount(ExtendedGenome[i:i+(n//2)],symbol )
return array
# Genome = "GATATATGCATATACTT"
# ExtendedGenome = GATATATGCATATACTTGATATATG
# symbol = "C"
# print(SymbolArray(Genome, symbol))
"""
We observe that when we slide a window one symbol to the right, the number of
occurrences of symbol in the window does not change much, and so regenerating
the entire array from scratch is inefficient
We can view this sliding of the window as simply removing the first symbol from
the window (C) and adding a new symbol to the end (A). Thus, when shifting the
window right by one symbol, the number of occurrences of C in the window decreased
by 1 and increased by 0. Once we compute that array[0] is equal to 8, we automatically
know that array[1] is equal to 7 because the next symbol A does not equal C.
Input: Strings Text and Pattern
Output: The number of times Pattern appears in Text
"""
def FasterSymbolArray(Genome, symbol):
array = {}
n = len(Genome)
ExtendedGenome = Genome + Genome[0:n//2]
array[0] = PatternCount(ExtendedGenome[0:n//2],symbol)
for i in range(1, n):
array[i] = array[i-1]-1 if ExtendedGenome[i-1] == symbol else array[i-1]+1 if ExtendedGenome[i:i+(n//2)][-1] == symbol else array[i-1]
return array
# Genome = "AAAAGGGG"
# symbol = "A"
# print(FasterSymbolArray(Genome, symbol))
"""
We will keep track of the difference between the total number of occurrences of G
and the total number of occurrences of C that we have encountered so far in
Genome by using a skew array
Every time we encounter a G, Skew[i] is equal to Skew[i-1]+1
Every time we encounter a C, Skew[i] is equal to Skew[i-1]-1
Otherwise, Skew[i] is equal to Skew[i-1]
"""
def SkewArray(Genome):
skew = {}
skew[0] = 0
for i in range(1, len(Genome)+1):
skew[i] = skew[i-1]-1 if Genome[i-1] == "C" else skew[i-1]+1 if Genome[i-1] == "G" else skew[i-1]
return skew.values()
# Genome = "GAGCCACCGCGATA"
# print(Skew(Genome))
"""
Minimum Skew Problem:
Find a position in a genome where the skew diagram attains a minimum.
Input: A DNA string Genome.
ex: TAAAGACTGCCGAGAGGCCAACACGAGTGCTAGAACGAGGGGCGTAAACGCGGGTCCGAT
Output: All integer(s) i minimizing Skew[i] among all values of i (from 0 to len(Genome)).
ex: 11 24
"""
def MinimumSkew(Genome):
positions = [] # output variable
skew = SkewArray(Genome)
minVal = min(skew)
{positions.append(i) for i in range(len(skew)) if list(skew)[i]==minVal}
return positions
# Genome = "CATTCCAGTACTTCGATGATGGCGTGAAGA"
# print(MinimumSkew(Genome))
"""
The discovery of these approximate 9-mers makes sense biologically, since DnaA
can bind not only to “perfect” DnaA boxes but to their slight modifications as well
Hamming distance = the total number of mismatches between the 2 strings
Hamming Distance Problem:
Compute the Hamming distance between two strings.
Input: Two strings of equal length.
Output: The Hamming distance between these strings.
"""
def HammingDistance(p, q):
counter = [+1 for i in range(len(p)) if p[i]!=q[i]]
return sum(counter)
# p = "CTTGAAGTGGACCTCTAGTTCCTCTACAAAGAACAGGTTGACCTGTCGCGAAG"
# q = "ATGCCTTACCTAGATGCAATGACGGACGTATTCCTTTTGCCTCAACGGCTCCT"
# print(HammingDistance(p, q))
""""
Approximate Pattern Matching Problem:
Find all approximate occurrences of a pattern in a string.
Input: Strings Pattern and Text along with an integer d
Output: All starting positions where Pattern appears as a substring of Text
with at most d mismatches
"""
def ApproximatePatternMatching(Text, Pattern, d):
positions = [i for i in range(len(Text)-len(Pattern)+1) if HammingDistance(Pattern,Text[i:i+len(Pattern)]) <= d]
return positions
# Pattern = "CTCGATTCAC"
# Text = "TCGTATGTGTTTCGTTCGGCTCTTTGACGGGCCGGCGTTGTTTCGACTGCGGGCATTGCTCCGGCTGCGTTGATAGACGGAAGGGACTTACAGGGGACACATCGGAGCGGCCTACAACTTGGAACATAGTGCGGTTTTGGATAGCACCAGGTGAGTTCAGTGGGAGGGGTGGGGTACTTCTCAATATTTTGGGTGCATTGTAGCGTCTTAACAAACGATCGAAGGATTGGGGGGTTATCCTAAGGTCCATCGCACTAAGGAACGAGCAGCTCTTTAACTTAGAAACCAGTTGACGTTGTCGAGAAGGCCTGCCATGCTTGAAAACAGGGGGGTGTCTGGAAAAACTGAGCTACCTCCATCCAGTAACGGATCTGCAGCTATACGAGCAGAAGAGAATCCGGCTGGAATCCCAGTCCCCTAGAGATGCAGTTACTCACTCAGGAGATGTCAAGGCAAGCTGGTTTTATACGCCAATGGTGTCGCTGTAGTACGAGAGGTGTGCTTCTTGGCTTCAGCCACCCTCCAATTCCCTCCAGATAAACGGATCGCCACTTCAAATTGTCACTTACTGGTCCAGTTGACCGGCATGTATTCTAGCATACTTGGAGGAGAGAAGCAATCACCAAGGCACATCTGATCAGCCAGTACGCCGTTAGAGTCCTTAATATCATATAGCTTGGACGCTAGACGTTACGCTAATCCCTCTCACCTTGCTGGGCAGTCTACCTGGCGACTGGAAACGAGGACCCGCCACTCACCGCTCAAATGGTGACTGAAACGGTGGCCAGTCGCAGGTACCCTCTACCCATACTAGCAGCCTAACGGACAACCATTCAAAAATTAGAGACGGGTTAATAGACCAAAGTACTGCGGCACCTAGACCGAATCGCACATAATGTTTGCCCTGTCGATCCCCTGCACTTAACCCATTCTTCCCGCGGGTAGTTTGCGGCTGACTACGCTCGCCGCTAGTTATCTCCCCGGTTGTCCATATTGCGGGGAAGATCCATAGGGCTAGACGCCGCAACTCGTAGGTACTAGAGACTGCAATGCTCGCGCGGTGTCAAGGAGGTTCTTTTTTCGCTCATTGAGCAAGTGGGTGATCGAGATCCAAGTTCGATCCGGGCGCCATTAAGCTTCGGAGCGGGCGAGCTAGCGCTGTCATGGGCGCCAGATGTGGTCTCTGTTCAATCCCAGGTTATGCGATGGTATAACATCTTTTCACCGAGGGGGGTTGCGTCCGGCGTATTGGAATGATGGGGGATTCTAAGGGTATTAGGTGGACAATAACCGGCACGACGCGAACGCCATGTAATAAAAATGGATGACGATCAAGTTAACCCCGGCCAGAGGCGTCAAGTATAGACGAAGACTCATGGCGGTGATCTTTCCAAGTACTCCCAACCTGGCAGTATAAAGACCCCAGTATCGGTATACAACGGCCACGGAACGGCTCTATATATACCTCACTCATTGATGAACCCACAGGACTCTTCCAGTCATGATTCCGACGGCCGCCCGTGCATAGACACGAGCCAAAGCGAGCGATGTTAGCCTGTCGATGCTTCGAACTGATTACGCGCAACCGCCGCGCGGTCTGGATCAGACTTGGCTACCAAGGTAGCTGTTCTCGGTGAACAAGCAAATTGTCAGTGAGTCAACCGCCGATAATTAACGTAAATTCTCCGTTTGCATAGCGCATGGCTCAATCTCCTCTCCACTCTCTGTAGAGGAAACCATGCCCTAATGCGTCCGGGTTAGCTGCTGTGGCCGAATCGGGCTATTGCTAGCCTAACCAGATAAGCACGCTAAACTCGACGTCCCAGGGGGATCACAGTGAGACTCGCGGTCCCATTGTTATGCGCGTAGTGGCGTTTAGGTGTTAATCAAACTTCCAGGGAAGCCGCTATAACAACCCACAGTCACAGCGAATCAATTGAGTTAGGCTCTTGACGACCACAAGACAGGTAGTTATGATTCGCGCGCGGCTCCTGTATCCGTGCAAATGCGAGAAATCAAGATGATCGTTACTGACTGCCGGACCATTGGTACCCCATTCGTCGGTTGAGTGTAGTTCAGGTAAGCGCTGTTGGGGAAAAAACAGTACTCCTGTTGCGAATCCAGGTCTATAGCATCCAGTTTAGACAGAACTAGGGGCCATGTATGCTGGGCAATGGGGCAGTACCGAGGGAGCATAGGAAGAGGCGACTTTGTCCCGCACCAGCGCACAGCATTTATGAACTTTGCAGGCAAGAAGAACACAAACGACACTATCATCTCGTGCTAGGACAAGCGGTACAAATCTGTAAGCGACGCTCTGTCGCATACCCAATGGAAGATCCGGTGGTAGGATAATAGACCGTAGCAAAATGTGAAACAGTGGAACTCCTAACGGCGGAGGCTTAACATACAACCTAGAACGCTGCCCACTCCTGTCGGCTTCTTACATGGGAACAGGGACCTCCTCAACGGCAATGCTTCTCACGGGCGCCCTATACATACAGAGCATTCTCAAAAAGACGTGCTAAGCATGTCGTTGCCCGGCTTATCGCCTATTTATTAAAGCTCTTCACTCACAGGGAAACGGGTTTGCTTGCCCCGCTATAGCGCCTACTGGCCTCCTTCTGCCGTCGCCCGGTGTACACTAGTTCTTGTGGCATCCAGAGTAGGGGTTTGCAGTTATATGCTGCGGCCAGTCTGCGGTCACAGTAAGCCCATAGTCGTGGACAGCCGGAATATTTTCTACCAAAAGTCACTGTCTGAAAAGCTAGCAATAAACTCATTGTGGAGCCTAACTGCCTGCCCCCAGTAAGAGCAGAGGTATTTGTTCTAACAGTTCGAGTAAACTCTTAGTCAGCAAATGATTTGCGTCAAGCGCCCATCATAAGGTTTTAGTGCGAAAGTTTTCCGTGTGACCTTCATATGGAATTTTTGTGAGACCGGGCGGTCGCGAATTGAAGCTCACGGATTCATGTGCTGCGGCACGAGACCCCAGGATCTTGATAACCCTTGTCGCCATTCGAGTTTCCTGTAGTGCGAGGGCATAAAGCTCGCGGAGGCCCAGACGGCGGTCAGATAACTAACGCACCGGATTAGACGAACACCAAAGTACCACGAGCCGCTATCAACGTCGGCTATGTATCCTATTTTACTCTCGCGCGAGGATCTTTTATTAAGCGCAACGCATGGCGCGGTCTGCCAGGCCACTAATCATCAAAGTGACAGCACGGAGCGCTAGTCGATAATACAGCCCACACAGCCAGGGCGCAAAATCGAAGTGAGGACTCGGGTTCGTCACCTGTGTCAACCCATATATGGCTGTTGGCTCTCAACGTACGGTCTTCAAGCGCCGGGACTATAGATCCGTAATGTCTACCTAGTTTCAGTGCAGTTCCGCTGCGTTTACAACTCAGAGGCGGGTCTCCTGGCATTGACTAGCGTAAACGATCAGCTGCAGTTTCCATAGTATTTGTCTCCGCAGTGGAACAAGTAGTTCCCTTAAGACGTACAACAGTCAGCCCCTCGAGATCGCAGCCGGTCGACGGAGACAACTAATGTCCTCAATGTCCTGCGGGGATCCCAGTGCTCCCGTGGTGGGACTCAAGTGGATCCGCGCATACACCCTGTCACACTCTATCGTTTCAAATACGCGCTCGCCTAATTACACGGACCATAACATGTGTGGTATAAGCCGTAAGCAATCATGTGTCCCATGCGAGGGACAGTTTAAGCCACATGATAATCGACGACTCTATCAAAGGAATAGAACACTGATGACATCCTACATGATGACCAAAGGGACTGCACAATGAGCCTATACCAGCCCGACCCCGATCGAAGTACTGGCTAGTCCTCTGCTTGACGATCTGGGCGTTTACGGATCCTAACTTAAATGCCGAGTGTAAAATTTCATAGATGAACGGGTTACTACACCTGCTCATTTTGATAGCTGGTGAGTATCAGGACTCGGAGACGCGTCACCAATTGATATTTCGAGATCCCGTTCGTATAACCGAGGGCGACAACTATGCGCGGGTTGTTAGGGGTTCTGCAATCCGCCGTATCGGTTATCAACTCGGGAGACAGTTCGTTATGCTTGTAAGTTGACGTATTAGACTTCAGCATTTGCACCGTATTGACCACTGTGCTCTTTTCCGTGAGAGACTGCCGAAAACGATCTGGAGCTTTACCCAGTGCCTGGTATTCTAGCCTAAGGTGCTGCTTCGAAACACAGCGTATGACGGGTCTAGTTCCATTGATCCGCAGCCGTCCACCTGTACCATCCTCGGCTCGGCTTTACCGTGCGGTCTTACTGTTTCGGAGCTCATCATAACCTAACTTATCCCTCGAATGACGCGCACTGACATCATATCACAAGGTATCCAGAAATCGGAAACAACCTTGTAACAGCTGACTTCGGTTCCAGCCGACCTTGATCGTAGGACGTGCACCGAAGCGAAGCAACGGTTATAGACCGGACGGTGACGTCACGCGAATACGCACTGACCGCGTTGGGGTACGGATCGCCTGTCCTTTTTAGGGATCCCCGATATTGTCAGGGTGCAGAGGATGAGGTTGCGCTCACCTGTTGACATGTTAGGACTCCTTAAGGCCGCTGGTATATCAACAAGATATGTAGACCCTATTGTCTCCACGCGTCGAAGCCGTTAAATTATTGTGGTCCCGTACGTCCATCTCAAGTAGGGCTGAACGAACGATGTTATGGGTAAGACGCTGGGGTAAAAGTATTGTGCACTGACGTCATATAGGCCGGGCAGTGGCTGCTAGTATGGCATGAATCCTCGCTATTTATGGTCGAAGCCGATGCTAAGGCCACCAATGCTTCCGTGATTATTGGATTTTAGGCGCTTTGTAGTGCCGGCACTAGACTTGGGAAGATGTACATCAAACAATCGCCGTAGATCTTCGGGACATAGCACGTTGCCCTATGTATTCCTTCAGCGTTAAGGGAACCAGACAAAATGCCTACCCATCGACATACATCGTATCGTGACTAACCAAAACGTCGATGTTAAGTCAGACTTGCATTAAACGGCATATGTGCCTCCAAGAAATCTAGAGGCGCATCGACTCGCCATTTGAGGACTCCAGCGCGCGATAGGTGGCCAGCAACGCTAATTGTAGCTCGCCAATGGCGCGAAATGCTGGGGTCCATCTTGTAAATGTGCCACACTGGATAGTACCTAAGGTTTCATAACTCAACCAGGGAAGACCACGGCACCGGCACTCCAAAATCACATATAGGGAGACCTACAATTCTGTGGGCTAATGGTCCTCGTTACTATCAACAGGATACACCGTTATGAGTTCCGCCCGTATCTACGGTGGTCCGGTGTGAACAATCATCCCATCCCGTAGGGTCGGCCAAACGAGTCTCAGACAGGCCCCTGCCGCCGCTGATCTCTGGTCGAGATCTTCTTATCCGATGCAGGTACATAGACCCAACTCTGAACCACGTCGCATTTGTCTTTTGATGTACGGTGATTGGCGGTACACGTATTCACATAAATGATACCGACACTTCTGATACTAGACGCAGCAGGTTTTAACCTGTTCGTCGCTCGTCGAACTTGCAAGCACTTCCCTAAGACGACTCACTATAGCCACCTACGCGTCAATTATGCCCTCCTTCGCGCCTCCGCAATGAGTGTTCTTTTTCGCTCCGAGAAAAGGTCATGATCCAGAGGGGCGGCCCGTCAAGACCCAGCATGGTGGCCCTCGTTGTCGAGACAGGAGTTGGGAATCCCACCCGTATTTCGTTAGGATGATTCCGGCACCTAACTTGAAATCTGACCCATGTGGTACTCATGACCGCCTGCGCCGTATACACACAGCGGGACGTGGATCCAAGGCGTACTGAGATATGCTTAAAGAAGGAATAGTATCGTGGTTGGCATATACGTGATAACTCCAGGGGGCGGCGCAATGATTACGATGACCTAACGTGGTTGGCCATCAATGGTCTGCTAAGATGACCAAGTCGGGAACCTTTTATTTTCTTCGGAAGATCGCAAAGCTTTTTGAACATATACAGTCCATATACCCAAACGACCTCCGGACGGCTTGGAGGTCGAACAGCAGTCCAGATGGAAGTATATGTGAAAAAAGCTCAAAGTTAGCGTTTTGGAACCTGGGTAAGGGCATCTGAGTATGCCTCTGTGCATCTAGCCTGGTTGGGCACAGTGACAGCGAGCTGTACCACGTTCACGGTAGATTGCTCACTCGATTCAGGTGTAATATGGGGCCCCGGCTGCTCCGAAACTTTGACGCGCCCTCGATTGGCAGTCGTTGACGACCGGTGAACTAAATAGTAAATGTACCGATCGACCTAGCTCAAGCTGAGCGGCATTGAGAATCAACTGTCAATCTCCACCGTCAGAGTATTCGCAGGATTACATGACCGTAGCTGGCTTTGCAACGCGCTAGTGTGCCTTTAGCAAGCCTAGTTGTTTTTGGACCGTCGCCCTCGGCATCTGGTGCTCTTGTAGAGACTCGACTCATCACGGACCAAATCCAAAGCTGTATGTACTTTTGGCGAGTGTTCCGCAGCACACGACATCGTAAGGTAATCCCTCACATGGATCGGGCGGAAGTATCACCCACGACCTGCTTCACACTGTGAGTACTGGTACTTCTCTTTGCAGATATAAAGAGTATGACTGACTCAGCTCACGTCTTCAGTAGGATCACGCCAAGACTTGGGTGAGGATATATCACACGATGAACCAATTCACTCAGTCAGGGCGAATTCAGCGAAGGATATAAATATTGGCTATTCTCTACAGCCGCGAATGTGCGCGCGGGCCAGCTGTACTATTCCTGAGGTCCAACAAGATCTACGGTCTCCCGTCCGTCTGGTGAATACCAGACGTGATCGCCGACCACGTTATGTCTGCAAGAATTAAAGACCCTTCTATCATCCTTATCTGTGCACTCATAGACACTGCCCAAGTGCTTATGCGTTGGATCTTTGCAGTCAGTCAATAATCAATTTAAAGGCGGCTGGTGGATAACACGTAGCTTCCCTCCCGATGAGAATGTTCAATACCGTCCGCTTAATTCATGGTATATTGTAGCCCGGTTATTGTCAGACACAGCATGCTAAATCACGACCTTTTCAACGATGACTCACATACCAAACTCTAAAGCTCGGTACCGGGCACCGCAAGAGGGGTGGGTGTCATCAAGACGATCATCAGAGTCATGCACAGGGCAATGCACCATGTCCTTTGCTAAGTCAGTGGCGGGTCATCACATCCTGCGTGCCAATGGCCACAATTTCCCCACGTCTCACTTAGAAATGCCTGTTAGTAGCGGCGTTAGAGCCAGAACCGAACGTAGTTACAGAACCAAAAAATGTGGACGTTAGCATTAATTACCTAATACCGGTGCTTGACGTGTATGATCCTACGTTCCTACACTGGAAAATACTGCGCGTATAGGAGCTTTAATGTAATCGAGGTCACGTCGGACTGGAGCTGTCACTCGAGGCGTCTACAAATTTACTAGAGTAAGTCTACGTCCCCTAGCTCCGATACCAAAAAAAGATGTCGTGACACCAGCCACGGGAGGAGGCGTGGTAGTACACTTGAAGTAGGTATACGCCCGCAACGCCTCAGAGGAAGGTAGCCAATGACACGAATGAAGGTGCTGGGACTGTAAACGTGTCGAGAGCCATCATAGTTTCTTTTGATATACACGGGGTGGCCGGGTGATTAATCTTGGATTAACCAGTTAACGTATCCGAAACCGATCTGGTGGGTCCTATTCTAAGACTGGCCCTAACCTTACAGCTACTCCAGGATCCAGGCGGTTCGAAGTAAGTCCAAACTCGTTATAGTCTGTTTGCCGAGACTTCGCCTCAGAGAGGCCATTGATTGAAATGTTACACCCTGACATAAACATGCATCCACGGAATAGGGGCGCAGTTCGTCAAGCCCCGGATCCGCGACGAGACCGGCAGTCCAGTATCGTCACCCTAGGATAATCCCGACGGGAACGGGAAGGCCGAACTAAGACTGGAGGATCCTTGACGGATTCAACTGCTGTCGAATGAGCTTTCGGATGCCAATTGCTACCCCAGCCAGGACCCATTTATGAAGGGGCGTAGCGAGACGCGATGGGTGTGGTATCATTTTCTGAATGGTAAGCATGACGGCCGCACAAGTAATCGCGCGTGTTCTGGTCTAACGGCGCTCGTCAATTCTTCTCCTCTCGTGAGGGTATTTTATCAATCCATCAACACATGCACCATTCTTGGACGCGTCACGTGGGCTTTAAATCCAACGTCCGCGAACCACCTCCTGTACCGAAGGATAACTCTCCAATAAAGTGCTTCGCTAGGTTCCGTCCAGTTAGCTTTACGGAGATTATTAATTTCGGAGTGGTATCCACCAGGCGCTTGTACGTCTCCACTCAAGCTCTCAGGTGCGAACCTGAGACGTAGTTTAGATAGCGGACCTATTATTCATCGGGTGGGTATCACTAATTCGCGGTTCCTGTTGATCTCCTAACCGTTTTGAATATTCAAGCCCACTATAGGCTAGCGTACGATGTCAGGCAATAGCAGGCTAGGAAAGTTCTCATCAGTGTTTACTAGGAGACATACTTGCTTTATTACCTGGCTAACGAATCTGCCGGTCAGACGTACAACTCCACATGTGCAATCTACACCAGTATTGATACTTTCCGCGCCCGTCCATGCGGGCCCCAGTAACTCATCGGGAATCATCTTAGTAATGTGCCGATTATCCCGAAACGGAGGTGGTTTGGTATGAGGTTGCACCGTTGCAGCGAGTCGAATCTCTAAGGTTAGGCAAAAGCGTGGTCTGTCAATAGTCAGTTACATAAGCGCGACTCCCATGTGTCGGTGGAACCCTGGCAACTTTATCTACGAAACCCGCTTCCGACTGGTCACAGGTTTGTGTTACTGCATTGGGGAACCACCAGCCAGGCTATCCCATCGAACCTGATAACTTCTACTGCCGAACATGGTTTATGGTTCTAGAGGCGAGGACGTACAGCCGTAGAAAATGAAGCAGGCCTACTTCGGCCCTATATAACACCCGAGTCGGATAGCTTCACAGCGACTGCCGTCGCAACATTCGCGAGCCGCGTGTATAGTGACAATACCAGCATAAGTCGGCTATCCACTTTTCTACAGTTGCGAAGGCTAGGTCCCCTAGCATTTCTAAGATCAACATATCGGGTGAGGGATACTACGACTTAGGGTGTCGTGTCGCAGAATGCCTCTAGATGGATGTTGCGATTCGACAAATCCCACAGGGTCGGCATAAGACATTATGTCACAATTCGCCGACCGCAACCGTGTAACTTACTGGGGTTCTGCTGCAACTCGTTTCTACGAGTGTATCGAGTCTGCACGATCTCGTCTATGCTGATCTGAATGTGCGGTGACAACCGACTCGTGGTTGTTCAGAGGGGAGCGCCCCAAATCTAGGATTGCACCTGATGTGACTGTAGTTTGCGGCTATGGCCACATTTGGTTGGGCTCCGCGAGGTGTTCAGTACCGGCACTCGGCTCGGGGCTAGGAGGCGTGTCTGGGGGGGCGGATCTTTCTGTTTTGCCCTGAACACAGAGAGAGCGGAATAGTGATAGTCCGTGCGAATTGCATGGGGGGCAATTCAAGTGTAAAACGATCTTCGGCACTTAGCCGTGGGATTGAAAGGTTGACGTGTCTTGGGGCACTTGCCAACTATGGCGCTTTGTTACTGTGGACTGTTTGAGAGACTAGAGGGTTCGATACGTATCTCGGCGTTTTAACGCTGAGCAGCAGTATTGACTTCCCGACAGGGACCTATTCGTGATACAGGGCGGGCTTTATTAAATAGCACCCTTTTCTGTACGGTCTAAACGGTTCCTGACAGCCTACCCACTACTTTATGAAGCCCACCCTAGCTATACAGGCTGTATTACGGAGTGAAGATTTTCCCTAAGGATCAGTTCAGCTAACTCAGCTCCCCGGGGGCGGATCTGTTCTGCTCTGGATTCCTGTTAATTTTGGGCTACACCACCATACGATATGTGGAAGAGATTACACAAGTCAGGACGTAGAGGTTAAATTATGAAAACCTCATAGGACATGAGCTACCAGCGGACCCGTAGGGCAGGACAATACCTACAGTAGCGCAGGTAACTAGGGTTCAAGCATAATCGGGATATTAACCCAGTAAACAGGTTTTTAACCCATGTTATATCCAGCTTATAATACCGGTCACGTCTAGTGCCGCCGAGAATAAAGCCAACAACATGATGTCCTCACTAATATTACTTGATGCGGCGCGGGAGAGTCTCGCACCTAATCGACCGGGTCTTTAGCAAGATTGAAAGATAAGAAAGCCCACACTGAATGCCGCAAGCAACTCGTTTGCCGGTTTAGGCAGTTTATCAAACGGGGTAGCCTTATGTGGCCCCTCGACGGACACGGGAAAGTGAGCATTGCTAAACATACTCGGGCCGCTTATATAACTGCTTCAGTGTAGGTAGATTCCAACATGTTTTCTACCCGCCCACAATTCAATTGCCATAAATGCGCACAAAACCTAGAGAGCCGACTCGGTGCCAGCGCGAACTGCGTTTCGCGACTACGCGATCGGAATGCGCAATCACCGCCGTAGACGGCCTTCCCCTCCTTTCCAAGCAGGGTCACATGCTCGCACGAGTCATACTTTTCGCGACTCTTGCCAACTAGGTGTAAATGCAATACTTACGGCTAGTCGAACCTAAGTACACAAACAATTTTCGCCAGCCGCTGGCCTGCTTCGCCAACTGCTGCGAGACAGCTTTCCAGGGAGCTCAAATGCATAGTCCTACAAGGAGTCCCATAACAGTATAGCAGTCAGCACTCCCCGTCCGAAGAACTCATCCAATTTGCAGACGAGCGCGAAATATACGGGCCATGATTCCTGTGGGAGCTGGATCGAAGCGTTGAAGTCCTGGGCTTAACCTATAGCGTCTGCGTCCCCAGGTGGGCCGCTTGAAGATTGTGTACACCTTCCAGACGACGACGTTAAGGTGGTCTTTTTCCACCACAATGAGTTGGAATGCATAGGTACCAAGCTACTAAGAAGTCGGATCACGCTGTCCTATACCATTCACTGGTTAAAGTTAGGCTAGTTTGTGAACGACTCAGATCTCCGAGGAAGTGCCACACGCGTGCCAATTCTTTAATCTTGAGTAAGATCTAATTTCGAGACAAGGGAGGTCCCGGATTCTCTTCTGAGAAGTATATTGTACTAGCTTAGTACCGCGCATGACCCAGATAGGAAGGCAACGCAGGATACCCGTGACCGAGATGTCTGTATAGAGAACGGGTCCAACGTGGTATGACAATGTCTCTGGGGATAGTGATAATCGGCGTATCCTCTGCAGCGCCGTGACCCCCGTCACTTTAGGAATCGTCTTGCAATTTCTACTTCACGCGGGTTTTGCCAGAGCAAAAGGAACGAATATATTGAGCTTCCATAATATCATTCTAACTGTATGCTAGCTCGTCCTGGATAGGAAACGAGAGTTAACCTAATTTTGCGCGTCCAGGTCATCTTAAGCGTTCCTGCTAGACGAAGATGGAATGTCGTAAGCGAGTGGCGTCGCAGCGACGAGGTTGCAACTAGGAGGGTAATGTGGGGAATACAAAGGAATCGACCTCTAGCTCGTTCTAGGGCCACTCACTACTGGAGGGGGCAATTAGGGCATCTCCAATTGTGCTATCGGAGAGGGGCATGTAGAAAGGAGTAAGTGGCCAGCGCAGATACGATGGTACCACTCGACAGTTCATCTAATATGCGGTAGACACGCTACATAAAATGTGGTCCGCTTCGCTAATCGCACCCCAAACGGATCGAGCAGCCCCGTAAGAGGTGAGTACAGGTCACATGAATAGATCGTAGACTCTATCGATTGTACCTTATTGCCTAGTACCCGCTTCCTAGAGGTTCGTGCCAAAATCTTGCGTACTGAGAACCGTGGTCTCATGTAAGTCTTACTTCCGATCAGTAGTTGCGTCCTGCCTTCACTAAAAACGGTATTCGTCAAGTAGCGTACAGCGTGCTGGTGAAAGTGACAGGTTCCATAACCGTCGAGATTCTTTTGCAGAAGTACCGGTTTTAATGGTGGCATGGAAACTTCTCATGTGGAGTTCAACTACATATTATCCCAGAGCTTACTCCAACGACATATCGCCTATCGGCTGGGTTGCATGCTAAGCACGCAGCTACTCATTGAAACCTCCATTCAGTTCGCAGGGTGATGGGATCTAGTACAGGCGTCCTACGCTGTGCTACCCTGTACTCTGCGTAGAACTCCAGACCCAGCTCGCCAGTTGTCCCGAGAAATTAGTCCCAGTAGGACTTACCCCTTGATAACTAGTGTTACGGTTGCGAGAGGTCAATTACACTTTGCAACTGAGCTTACGACGTATCCCACTATCTGTATGCCTCGGCAATCCTAGTGTTCGACCACTCTGCAAAAAACTTGCAAACACGAGCTCTGTGTGGGGCGACAAAATGATCAAAAGCAGTTCCTATTTTGTAGGCTGCATCAATATCTAAATAATCTGACCGCTGATAACTATGTAAGTTAATAGTCCACTATCGTTATCTCCTCCTCAGAATCGCAAGACCGCCCGCGCTGCGTCTGAAGAACGTTCGGCTACAACGCATGCTCTCAACTACTCTGTATGCCTCTTTCTTAGATCTGAGCGGCGTCGGGCACCCGGGTCAAGACTATACAACGCCGCAGCTCCCGTTACAATCTTCTTATTACCAGTGTAGGGTAGGGAACGCCGATGCATAACGCAAGGCTGTGCGTCGTGAGCTCCATCCAAGTTTGAGCACTCGAACCCGCGTGGATGCCGTCCCTTAGGCCGTATGTCTCAACAAGATGATTTTCGACAGAACTTGTGATGATGCAAACCAAATGTAAGCCGACACCGATAAAGTACAGTTTGAGGTGCTGTGTTGATGCCATCGTGTCTTTCCGCCCATCGTCACAAAAGGCGTTAAGTGGCCCCGTTAGTGGGGAGCCACCTGCATGTATGACTTCTCTTGAGGGGTATCTTCCTCTTCTAAGGGGAGCTCTATGTCCCCGGGATGTGCACCCAACCAGGCAATCACGTTTCACTCCTCTTCCAAGGCTTAGCCTGAGTACCACATGTTCTTCTCTACAACACCATCTGGCCGCAGATGTGATCATAGTTAGGAAGCCAAAAATGCGCCTATCGTGGTATGGCTCAGTTATGCGTACGCAATGTCCACGAATGCTAGAGGGTTCAGATCTTTTTGGAGCTGTGTGATTGTGAGCAATAGCTCGCTCTGTGTGCCCCTCCACGGATGGCCCCTGCATCGCCATACCTTTTTTTATGTTCACAACATCCCACTTGAAGGCTCAAAAGGCTTAAAACACATGGAATACGTAGATTGTTCCAGGGCGCGCGTTGTAAGCCTCACAGAGTTACATAACAGAGGTCGGGTCAGCCGGGAAGTTAGAGGGAGGAGATCCGCCGAAAAACCGATATCCCCAGAGGAGCCCCTCGGAGGCCTCGTATTTGGTCGGATATTAGGGCTGTGTGGTTATAAACGGCAAGCGGAGGTGATTCGCGACCAACAGCATAAGACACCGATGCTGAAGACCACAACGCCCTTTAGATTTATTATGCGCCGTCCAGGCGTGTTTCGGAAGCTATGCTCCCCTAACAACGCCTCTTACTCTGTTAAACACCAGGGTTAGTGTGCGTGAAAGGTCGCGGCCGACGCCCCGCTAAGTTTGAGATCCGCAGATACAGCTCAGTCAGACTAAGCGATTGCAAATGTCTGGCTACTGCTCTCCCACTTGTATTCTACCCCAGCTGACGTGGGGTTCATATTACACATAGGGTCTATAGACCACTCTATGGTGTTGTTCCAAGGAAAGATTCAGAAGTGCGCTAAGGAACTGGAGACGTTAACCTTCTTTGCTCAGCTAAGCTATTTATTCGCCGATGTGACGTACTCGTGTCCAGCCTTCGCTTCCCAAATCTACCTCTCCAGGGAAACCTACAATTACGGTCACCAGCCGTCTCTCGCGCACTCCCGTCCCAACACTAGAGAACCGGTCCTAGGCGTGTTTTTCCACCGCGGACTTTCGCCGCGTAGACATTTGTTTGGCGCCTCCAACCTCTCCCGTCAGACTACTTGAGAATACATGGGAAGAGGACCTGTCGGACTTCGCGCGCCGGTTGAACCAGCCTACCACTATATTTCATCAACACCCGAGGATGGCTAAAAAGCAGCCCATGAACAAAGCCCGGTCGTAACGGATAGAAGTATCGCACATTGACCCGTCAGTGGATGTCGTTGTCTCGAATATCATATTTGGCAAAGAGTTTAATTTTATGGACATTGCAACGCGTATCGTTACTGGAAGGGACTGGGGAATCAATAGGCTGTCCTAACTACATAAATACACAGGTAGTAGTGAATACCAAGTCTATTCGGTTCGCATTTGATCCGCCCGAACGAGACAATGGGCGATGCGCGGGAATATCTATATAAACCTTTAGGTGAGGCATCAGTAGATATCGCCCTTGCACCACTTTATCTGCCCCTGATTATTGGGGCGTAAGGTTGGCAAACTCTGGCTCCCACAGTCTGCCGAAGATCCAGATTATTTTTTGTCATGACAGATCTGTGATTTCCCCTAAGGTCAACATCGTTGCGAACAGTATTTCCCTCGGACGATGATAAGGGCAGTGAGCAGAACAATTAACGTAGAAGTTATGCACGGAGTCCAGATTCATATTCATCTCTGCTTAATCTGCCAGCGAGTTACTGTAAAGCCGTTGAACGTGCAGCCTGCGGTTCCACGCTCCCTCCACGGCAGTATATGGGCTAATATGACTTAGGATAGCCAGTCTTCCTACCCCAGTTTTCCTTCGAAGCTTGCCGGACGATCATCCCAGCAGCAGGATAAATATAACGCCAACCAAAGTGGGGACAGTCTAAGATTATGCACGCTAGCGGAGCTCTCCCAGAAAAATGGGAGGATATTGCAGTGTTACTTGCTTCCTTCACAACCTCTATGGAAATGCCATAGATCCCTACTCGGCTTTCTTCGGTTTTTGATGCCATACAAGGGCTTGGATACTACGGGAGTTGCCCGGATCCGCTTGCTACATCCCGCAACACTAGCGACCTAACTCCTGGGCTCCCTCACTTTTATTAACACCCAATGTGGTGGCCAGTCTGGCGCAACGTAGCGGAGGAGGAAGCAGTCTTTGGTCGATGCGGACTCCTGGGGGACTCCTAGGTAATAATTGGACTGGTAAATGTCTGGTGGGTTACCTCGGGCACGATGGCCTGCTAGCTAGGGGGCATTGCTACAGCTGCCCGGGCTACACGATGTGAAAGAGCTAACACGCAAGAATTTTCACTGGGGATATACTAGAATGAGCACTCAGCTTCTGCTTGTTTTGGTGGTCTACACTATGACAGCCCCGACGTACGGCGCTGACGCATGTTTCACGCTTAAACGTGCCCACTGAATGGGGCCGATATCGGGGCCCGTGAGAAAAGCATGAAAAACAGTCAACGTATGTAGCAGGGCTGCTAGCCGTTGATAGTCCGTATGAAATGCACGACGTTGCCGGGTCTGCCGCACATGGAGAGACTATGGGAAATGGACAGCAATCGATCGGGGCTTTAAAAAATCTACAGTACCGGCGAATAAATATGTGATATCGCGAATAGGGGAATGGAATTGTGGGTGTCCGTGGACCGGCCTAACCAACTTACTTGTACCAAGGCAGAGACCGATCAAAGGGAAAACCCTACGACAGACCTACACCCGTGATTGCGCGCAATGTGCAGATAAGGCTGCTACCCAGGTTGAAGGTATCATGCGACGACCGATTGGACACTGTACTCACTTTAGGGACATGCCTGGCAGACAATACAGAAGGACGAGGTGTCGCAACGGTCCCTTGTAGTTCCTAACTCGATTCAC"
# d = 5
# ApproximatePatternMatching(Pattern, Text, d)
def ApproximatePatternCount(Pattern, Text, d):
positions = [i for i in range(len(Text)-len(Pattern)+1) if HammingDistance(Pattern,Text[i:i+len(Pattern)]) <= d]
return len(positions)
# Pattern = "AA"
# Text = "TACGCATTACAAAGCACA"
# d = 1
# print(ApproximatePatternCount(Pattern, Text, d))