forked from salimt/Courses-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweek3.py
129 lines (109 loc) · 5.94 KB
/
week3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# -*- coding: utf-8 -*-
"""
@author: salimt
"""
# Input: A set of kmers Motifs
# Output: Count(Motifs)
def Count(Motifs):
count = {} # initializing the count dictionary
for motif in Motifs:
for i in range(len(motif)):
motifChar = motif[i]
if motifChar not in count:
count[motifChar] = [0] * len(motif)
count[motifChar][i] = count.get(motifChar)[i] + 1
else:
count[motifChar][i] = count.get(motifChar)[i] + 1
return count
def Profile(Motifs):
t = len(Motifs)
motifCounts = Count(Motifs)
profile = {i: [0]*len(Motifs[0]) for i in "ATCG"}
for key,val in motifCounts.items():
profile[key] = [i/t for i in val]
return profile
#print (Profile(["ACGTTA",
# "AGGTGA",
# "ACGTTT",
# "ATGCTA",
# "TCGCGA",
# "AAGCTA"]))
#print (Profile(["GTC","CCC","ATA","GCT"]))
def Consensus(Motifs):
finalMotif = []
motifCounts = Count(Motifs)
for i in range(len(Motifs[0])):
maxVal = 0
motif = ""
for k,v in motifCounts.items():
if v[i] > maxVal:
maxVal = v[i]
motif = k
finalMotif.append(motif)
return "".join(finalMotif)
#print(Consensus(["ACGTTA"]))
#Profile(["AACGTA","CCCGTT"])
def Score(Motifs):
consensus = Consensus(Motifs)
motifCounts = Count(Motifs)
score = [len(Motifs)-motifCounts.get(consensus[i])[i] for i in range(len(Motifs[0]))]
return sum(score)
#print (Consensus(["ACGTTA","AAGAGA","AGGTGA","AGGTCA","ACGCGA","ATGCTA"]))
# Input: String Text and profile matrix Profile
# Output: Pr(Text, Profile)
def Pr(Text, Profile):
score = [Profile.get(Text[i])[i] for i in range(len(Text))]
product = 1
for x in score:
product *= x
return product
Profile1 = { 'A': [0.4,0.3,0.0,0.1,0.0,0.9], 'C': [0.2,0.3,0.0,0.4,0.0,0.1],'G': [0.1,0.3,1.0,0.1,0.5,0.0],'T': [0.3,0.1,0.0,0.4,0.5,0.0] }
#Text = ["ACGTTA","AGGTGA","ACGTTT","ATGCTA","TCGCGA","AAGCTA"]
Text = "CAGTGA"
print(Pr(Text,Profile1))
def ProfileMostProbableKmer(text, k, profile):
maxProb = Pr(text[0:k], profile)
maxMotif = text[0:0+k]
for i in range(1, len(text)-k+1):
tempProb = Pr(text[i:i+k], profile)
if tempProb > maxProb:
maxProb = tempProb
maxMotif = text[i:i+k]
return maxMotif
# Input: A list of kmers Dna, and integers k and t (where t is the number of kmers in Dna)
# Output: GreedyMotifSearch(Dna, k, t)
def GreedyMotifSearch(Dna, k, t):
BestMotifs = [Dna[i][0:k] for i in range(0, t)]
for i in range(len(Dna[0])-k+1):
Motifs = []
Motifs.append(Dna[0][i:i+k])
for j in range(1, t):
P = Profile(Motifs[0:j])
Motifs.append(ProfileMostProbableKmer(Dna[j], k, P))
if Score(Motifs) < Score(BestMotifs):
BestMotifs = Motifs
return BestMotifs
## Copy the ten strings occurring in the hyperlinked DosR dataset below.
#Dna = [
#"GCGCCCCGCCCGGACAGCCATGCGCTAACCCTGGCTTCGATGGCGCCGGCTCAGTTAGGGCCGGAAGTCCCCAATGTGGCAGACCTTTCGCCCCTGGCGGACGAATGACCCCAGTGGCCGGGACTTCAGGCCCTATCGGAGGGCTCCGGCGCGGTGGTCGGATTTGTCTGTGGAGGTTACACCCCAATCGCAAGGATGCATTATGACCAGCGAGCTGAGCCTGGTCGCCACTGGAAAGGGGAGCAACATC",
#"CCGATCGGCATCACTATCGGTCCTGCGGCCGCCCATAGCGCTATATCCGGCTGGTGAAATCAATTGACAACCTTCGACTTTGAGGTGGCCTACGGCGAGGACAAGCCAGGCAAGCCAGCTGCCTCAACGCGCGCCAGTACGGGTCCATCGACCCGCGGCCCACGGGTCAAACGACCCTAGTGTTCGCTACGACGTGGTCGTACCTTCGGCAGCAGATCAGCAATAGCACCCCGACTCGAGGAGGATCCCG",
#"ACCGTCGATGTGCCCGGTCGCGCCGCGTCCACCTCGGTCATCGACCCCACGATGAGGACGCCATCGGCCGCGACCAAGCCCCGTGAAACTCTGACGGCGTGCTGGCCGGGCTGCGGCACCTGATCACCTTAGGGCACTTGGGCCACCACAACGGGCCGCCGGTCTCGACAGTGGCCACCACCACACAGGTGACTTCCGGCGGGACGTAAGTCCCTAACGCGTCGTTCCGCACGCGGTTAGCTTTGCTGCC",
#"GGGTCAGGTATATTTATCGCACACTTGGGCACATGACACACAAGCGCCAGAATCCCGGACCGAACCGAGCACCGTGGGTGGGCAGCCTCCATACAGCGATGACCTGATCGATCATCGGCCAGGGCGCCGGGCTTCCAACCGTGGCCGTCTCAGTACCCAGCCTCATTGACCCTTCGACGCATCCACTGCGCGTAAGTCGGCTCAACCCTTTCAAACCGCTGGATTACCGACCGCAGAAAGGGGGCAGGAC",
#"GTAGGTCAAACCGGGTGTACATACCCGCTCAATCGCCCAGCACTTCGGGCAGATCACCGGGTTTCCCCGGTATCACCAATACTGCCACCAAACACAGCAGGCGGGAAGGGGCGAAAGTCCCTTATCCGACAATAAAACTTCGCTTGTTCGACGCCCGGTTCACCCGATATGCACGGCGCCCAGCCATTCGTGACCGACGTCCCCAGCCCCAAGGCCGAACGACCCTAGGAGCCACGAGCAATTCACAGCG",
#"CCGCTGGCGACGCTGTTCGCCGGCAGCGTGCGTGACGACTTCGAGCTGCCCGACTACACCTGGTGACCACCGCCGACGGGCACCTCTCCGCCAGGTAGGCACGGTTTGTCGCCGGCAATGTGACCTTTGGGCGCGGTCTTGAGGACCTTCGGCCCCACCCACGAGGCCGCCGCCGGCCGATCGTATGACGTGCAATGTACGCCATAGGGTGCGTGTTACGGCGATTACCTGAAGGCGGCGGTGGTCCGGA",
#"GGCCAACTGCACCGCGCTCTTGATGACATCGGTGGTCACCATGGTGTCCGGCATGATCAACCTCCGCTGTTCGATATCACCCCGATCTTTCTGAACGGCGGTTGGCAGACAACAGGGTCAATGGTCCCCAAGTGGATCACCGACGGGCGCGGACAAATGGCCCGCGCTTCGGGGACTTCTGTCCCTAGCCCTGGCCACGATGGGCTGGTCGGATCAAAGGCATCCGTTTCCATCGATTAGGAGGCATCAA",
#"GTACATGTCCAGAGCGAGCCTCAGCTTCTGCGCAGCGACGGAAACTGCCACACTCAAAGCCTACTGGGCGCACGTGTGGCAACGAGTCGATCCACACGAAATGCCGCCGTTGGGCCGCGGACTAGCCGAATTTTCCGGGTGGTGACACAGCCCACATTTGGCATGGGACTTTCGGCCCTGTCCGCGTCCGTGTCGGCCAGACAAGCTTTGGGCATTGGCCACAATCGGGCCACAATCGAAAGCCGAGCAG",
#"GGCAGCTGTCGGCAACTGTAAGCCATTTCTGGGACTTTGCTGTGAAAAGCTGGGCGATGGTTGTGGACCTGGACGAGCCACCCGTGCGATAGGTGAGATTCATTCTCGCCCTGACGGGTTGCGTCTGTCATCGGTCGATAAGGACTAACGGCCCTCAGGTGGGGACCAACGCCCCTGGGAGATAGCGGTCCCCGCCAGTAACGTACCGCTGAACCGACGGGATGTATCCGCCCCAGCGAAGGAGACGGCG",
#"TCAGCACCATGACCGCCTGGCCACCAATCGCCCGTAACAAGCGGGACGTCCGCGACGACGCGTGCGCTAGCGCCGTGGCGGTGACAACGACCAGATATGGTCCGAGCACGCGGGCGAACCTCGTGTTCTGGCCTCGGCCAGTTGTGTAGAGCTCATCGCTGTCATCGAGCGATATCCGACCACTGATCCAAGTCGGGGGCTCTGGGGACCGAAGTCCCCGGGCTCGGAGCTATCGGACCTCACGATCACC"]
#
## set t equal to the number of strings in Dna and k equal to 15
#t = len(Dna)
#k = 15
#
## Call GreedyMotifSearch(Dna, k, t) and store the output in a variable called Motifs
#Motifs = GreedyMotifSearch(Dna,k,t)
#
## Print the Motifs variable
#print(Motifs)
## Print Score(Motifs)
#print(Score(Motifs))