-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFoldingspace.lhs
177 lines (133 loc) · 5.82 KB
/
Foldingspace.lhs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
********************************************
* FoldingSpace *
* *
* Sequence and structure data types, *
* and basic functions thereon *
********************************************
1. Data types for sequences and structures
2. Bases, pairing rules
3. The Foldingspace
4. Various string notations
> module Foldingspace where
> import Data.Array
> import RNACombinators
------ 1. Data types for sequences and structures --------------------------
Basic Types
> -- | A nucleotide; plain alias for the enumeration 'Ebase'.
> type Base = Ebase
The Enum Type of nucleotides.
> -- | Enumeration of nucleotides. The constructor order is significant:
> -- the derived 'Ord', 'Enum', 'Bounded' and 'Ix' instances follow it.
> data Ebase
>   = A
>   | C
>   | G
>   | U
>   | N   -- ^ what 'nuc' returns for any character it does not recognise
>   deriving (Bounded, Eq, Ord, Ix, Enum, Read, Show)
An indexed base
> -- | A base referred to by an 'Int'.
> -- NOTE(review): presumably the position of the base in the input -- confirm.
> type Ibase = Int
RNA is a string of bases
> -- | An RNA sequence as a plain list of nucleotides.
> type RNA = [Ebase]
> -- | Translate a string into a sequence, one character at a time, using 'nuc'.
> rna :: String -> RNA
> rna = map nuc
The input to a parser is an indexed subword of the input.
> -- | Parser input: a value paired with the 'Region' (subword) of it that is
> -- currently under consideration. 'Region' is not defined in this file.
> type Input a = (a,Region)
Conversion from a plain string to the parser input type.
> -- | Turn a plain string into parser input: the string is converted with
> -- 'rna', stored in an array by 'toarray', and paired with the region
> -- @(0, n)@, where @n@ is the upper bound of that array.
> -- NOTE(review): assumes 'toarray' yields a 1-based array, so that @(0, n)@
> -- covers the whole sequence -- confirm against RNACombinators.
> str2inp :: String -> Input RNAInput
> str2inp str = (arr, (0, upper))
>   where
>     arr   = toarray (rna str)
>     upper = snd (bounds arr)
> -- | Read a single character as a nucleotide, accepting upper and lower case.
> nuc :: Char -> Ebase
> nuc c
>   | c `elem` "Aa"   = A
>   | c `elem` "Cc"   = C
>   | c `elem` "Gg"   = G
>   | c `elem` "UuTt" = U   -- T/t: replace DNA with RNA
>   | otherwise       = N   -- all other characters are mapped to N and will not be paired
---------- 2. Bases, pairing rules------------
> -- | Decide whether two bases may pair: A-U, C-G and G-U, each in either
> -- orientation. Every other combination, including any with 'N', is rejected.
> pair :: (Base,Base) -> Bool
> pair bases = bases `elem` pairable
>   where
>     pairable = [(A,U), (U,A), (C,G), (G,C), (G,U), (U,G)]
> -- | The input sequence stored as an 'Int'-indexed array of bases.
> type RNAInput = Array Int Base
> -- | True when region @(i, j)@ satisfies @i+1 < j@ and the bases at array
> -- positions @i+1@ and @j@ can 'pair'. The array is only indexed when the
> -- size condition holds.
> basepair' :: Input RNAInput -> Bool
> basepair' (sq, (lo, hi))
>   | lo + 1 < hi = pair (sq ! (lo + 1), sq ! hi)
>   | otherwise   = False
> -- | Logical negation of 'basepair''.
> nobasepair' :: Input RNAInput -> Bool
> nobasepair' input = not (basepair' input)
> -- | True when region @(i, j)@ satisfies @i+3 < j@ and both the outer
> -- positions @(i+1, j)@ and the next inner positions @(i+2, j-1)@ can 'pair'.
> -- Conditions are checked left to right and stop at the first failure.
> stackpair' :: Input RNAInput -> Bool
> stackpair' (sq, (i, j)) = i + 3 < j && outer && inner
>   where
>     outer = pair (sq ! (i + 1), sq ! j)
>     inner = pair (sq ! (i + 2), sq ! (j - 1))
Some filters
> -- | Accept a region whose 'sizeof' does not exceed the given limit.
> maxsize :: Int -> Region -> Bool
> maxsize limit region = sizeof region <= limit
> -- | Accept an input whose region has a 'sizeof' of at least the given
> -- minimum; the sequence itself is not inspected.
> minloopsize' :: Int -> Input RNAInput -> Bool
> minloopsize' least (_, region) = sizeof region >= least
--------------- 3. The Foldingspace -----------------
The Folding Space of an RNA consists of structures
> -- | A structure: a list of components wrapped in the constructor 'STRUCT'.
> data FS base
>   = STRUCT [Component base]
>   deriving (Eq, Ord, Show)
RNA structures are made up of the following components.
> -- | Building blocks of a structure, parameterised over the type used for
> -- the base-valued fields. The constructor order is significant because the
> -- derived 'Ord' instance follows it.
> -- NOTE(review): the names look like the usual loop-type abbreviations
> -- (SS single strand, HL hairpin loop, SR stacked region, BL/BR bulge
> -- left/right, IL internal loop, ML multiloop, DL/DR dangling base,
> -- PK pseudoknot) -- confirm against the grammar that builds these values.
> data Component base
>   = SS    Region
>   | ES    Region
>   | HL    base Region base
>   | SR    base (Component base) base
>   | BL    base Region (Component base) base
>   | BR    base (Component base) Region base
>   | IL    base Region (Component base) Region base
>   | ILN   base (Component base, Int) base
>   | ILX   base (Component base, Int) base
>   | ILL   Region base (Component base) base
>   | ILR   base (Component base) base Region
>   | ILS   base (Component base) base
>   | ML    base [Component base] base
>   | DL    base (Component base)
>   | DR    (Component base) base
>   | DLR   base (Component base) base
>   | EDL   base (Component base)
>   | EDR   (Component base) base
>   | EDLR  base (Component base) base
>   | MLL   base base [Component base] base
>   | MLR   base [Component base] base base
>   | MLLR  base base [Component base] base base
>   | BLOCK (Component base) (Component base)
>   | PK    Region [Component base]
>           Region [Component base]
>           Region [Component base]
>           Region
>   deriving (Eq, Ord, Show)
------------------- 4. String notations -----------------------
Turning a Structure into a dot bracket notation
> -- | Render a list of components as a dot-bracket string.
> --
> -- Regions become runs of @.@ whose length is the region's upper bound
> -- minus its lower bound (empty if that is not positive). Components that
> -- enclose a substructure are wrapped in @(@ ... @)@, and single-base
> -- fields rendered next to a substructure each contribute one @.@.
> -- For 'PK', the first and third regions are drawn with @[@ and @]@,
> -- the second and fourth with @{@ and @}@, with the three component
> -- lists rendered in between. Base-valued fields are never inspected.
> vienna :: [Component a] -> String
> vienna comps = render comps
>   where
>     -- Render a list of components and join the pieces.
>     render xs = concatMap v xs
>
>     -- Wrap an already rendered substructure in one pair of brackets.
>     paren body = "(" ++ body ++ ")"
>
>     v (SS r)                     = dots r
>     v (ES r)                     = dots r
>     v (HL _ r _)                 = paren (dots r)
>     v (SR _ s _)                 = paren (v s)
>     v (BL _ r s _)               = paren (dots r ++ v s)
>     v (BR _ s r _)               = paren (v s ++ dots r)
>     v (IL _ r1 s r2 _)           = paren (dots r1 ++ v s ++ dots r2)
>     v (ILN _ (s, _) _)           = paren (v s)
>     v (ILX _ (s, _) _)           = paren (v s)
>     v (ILL r _ s _)              = dots r ++ paren (v s)
>     v (ILR _ s _ r)              = paren (v s) ++ dots r
>     v (ILS _ s _)                = paren (v s)
>     v (ML _ cs _)                = paren (render cs)
>     v (DL _ s)                   = '.' : v s
>     v (DR s _)                   = v s ++ "."
>     v (DLR _ s _)                = '.' : v s ++ "."
>     v (EDL _ s)                  = '.' : v s
>     v (EDR s _)                  = v s ++ "."
>     v (EDLR _ s _)               = '.' : v s ++ "."
>     v (MLL _ _ cs _)             = paren ('.' : render cs)
>     v (MLR _ cs _ _)             = paren (render cs ++ ".")
>     v (MLLR _ _ cs _ _)          = paren ('.' : render cs ++ ".")
>     v (BLOCK s1 s2)              = v s1 ++ v s2
>     v (PK a fro b' mid a' bac b) =
>       concat
>         [ open1 a,   render fro
>         , open2 b',  render mid
>         , close1 a', render bac
>         , close2 b
>         ]
>
>     -- One copy of the character for every position i+1 .. j of the region.
>     fill ch (i, j) = map (const ch) [i + 1 .. j]
>
>     dots r   = fill '.' r
>     open1 r  = fill '[' r
>     close1 r = fill ']' r
>     open2 r  = fill '{' r
>     close2 r = fill '}' r