forked from fak/mapChEMBLPfam
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetRatioUnstruct.py
51 lines (40 loc) · 1.55 KB
/
getRatioUnstruct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
"""
Function:getRatioUnstruct - creates statistics of the occurrence of Pfam domains
in the ChEMBL database as well as the entire human genome
--------------------
Author:
Felix Kruger
"""
def getRatio(pfamDict,humanTargets, release, user, pword, host, port):
    """Annotate every target in pfamDict with its domain-coverage ratio.

    For each target the ratio is the summed span (``end - start``) of its
    listed domains divided by the length of the target's sequence minus one.
    The sequence is read from ``humanTargets`` when the target is a key
    there; otherwise it is fetched with ``queryDevice.queryDevice`` by
    protein accession.

    Args:
        pfamDict: dict keyed by target. Each value is a dict holding the
            index-aligned sequences ``'domains'``, ``'start'`` and ``'end'``.
            It is modified in place: a ``'ratio'`` key is written on every
            entry.
        humanTargets: mapping from target to its sequence.
        release, user, pword, host, port: handed unchanged to
            ``queryDevice.queryDevice`` for targets missing from
            ``humanTargets``.

    Returns:
        The same ``pfamDict`` object. ``'ratio'`` holds the computed value,
        or the string ``'NA'`` when no usable sequence was found or the
        sequence is too short to give a positive denominator.
    """
    import numpy as np

    for target in pfamDict:
        entry = pfamDict[target]
        # Sentinel that stays in place whenever the ratio cannot be computed.
        entry['ratio'] = 'NA'
        try:
            seq_len = len(humanTargets[target]) - 1
        except KeyError:
            # Imported here so the project module is only required when the
            # database fallback is actually needed.
            import queryDevice
            # NOTE(review): the accession is interpolated straight into the
            # SQL text; if target keys can come from untrusted input, switch
            # to a parameterized query should queryDevice support one.
            rows = queryDevice.queryDevice("SELECT protein_sequence FROM target_dictionary WHERE protein_accession = '%s'"%target, release, user, pword, host, port)
            try:
                seq_len = len(rows[0][0]) - 1
            except (IndexError, TypeError):
                # No matching row, or the stored sequence has no length
                # (e.g. NULL): leave the 'NA' sentinel.
                continue
        if seq_len <= 0:
            # A denominator of zero or less would yield inf/nan or a
            # negative ratio; keep the 'NA' sentinel instead.
            continue
        starts = entry['start']
        ends = entry['end']
        # Indexed by the number of domains so that start/end lists shorter
        # than 'domains' still raise IndexError rather than being truncated.
        dom_len = sum(ends[i] - starts[i] for i in range(len(entry['domains'])))
        entry['ratio'] = np.true_divide(dom_len, seq_len)
    return pfamDict
# if len(pfamDict[target]['start']) == 1:
# start = pfamDict[target]['start'][i]
# end = pfamDict[target]['end'][i]
# pre = start
# post = length - end
# unstructuredRatio = np.true_divide(min([pre,post]), max([pre, post]))
# pfamDict[target]['weighting'] = unstructuredRatio
# else:
# pass