forked from fak/mapChEMBLPfam
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpfamDomains.py
executable file
·81 lines (63 loc) · 2.34 KB
/
pfamDomains.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
Function: pfamDomains
creates pfamDict and pfam_domains
--------------------
Author:
Felix Kruger
"""
def getAllTargets(humanTargets, chemblTargets):
    """Label every target with the collection it was taken from.

    Each entry of chemblTargets is labelled 'chembl' and each entry of
    humanTargets is labelled 'human'.  The human labels are applied last,
    so a target that occurs in both inputs ends up labelled 'human'.

    Returns a dict mapping target -> 'chembl' or 'human'.
    """
    labelled = dict.fromkeys(chemblTargets, 'chembl')
    # Applied second so that 'human' overrides 'chembl' for shared targets.
    labelled.update(dict.fromkeys(humanTargets, 'human'))
    return labelled
def parse2col(path, header, keyIndex, valIndex):
    """Read two columns of a tab-separated file into a dict.

    path     -- path of the tab-separated text file to read.
    header   -- if true, the first line is skipped.
    keyIndex -- index of the column that supplies the keys.
    valIndex -- index of the column that supplies the values.

    Keys are kept as strings with trailing newlines removed.  A value is
    stored as an int when its field parses as one, otherwise as the string
    with trailing newlines removed.  If a key occurs on several lines, the
    last line wins.

    Raises IndexError when a line has too few columns for keyIndex or
    valIndex; errors from open() propagate unchanged.
    """
    dctn = {}
    # The context manager closes the file even when a line fails to parse.
    with open(path, 'r') as infile:
        if header:
            # Drop the header line; the default keeps an empty file from
            # raising StopIteration.
            next(infile, None)
        for line in infile:
            elements = line.split('\t')
            key = elements[keyIndex].rstrip('\n')
            try:
                # int() tolerates surrounding whitespace, including the
                # line's trailing newline.
                value = int(elements[valIndex])
            except ValueError:
                value = elements[valIndex].rstrip('\n')
            dctn[key] = value
    return dctn
def getUniprotTargets(release, user, pword, host, port):
    """Return the accessions of target components whose source is UniProt.

    Runs one query through queryDevice.queryDevice, joining
    component_sequences to target_components and keeping rows whose
    db_source is 'SWISS-PROT' or 'TREMBL'.

    release, user, pword, host, port -- passed unchanged to
        queryDevice.queryDevice (presumably the ChEMBL release and the
        database connection details; confirm against queryDevice).

    Returns a list holding the first column (cs.accession) of every
    returned row, in the order the rows were returned.  Duplicates are not
    removed.
    """
    # Function-scope import: queryDevice is only needed when this query runs.
    import queryDevice
    rawtargets = queryDevice.queryDevice("""SELECT cs.accession, cs.component_id, tid
FROM component_sequences cs
JOIN target_components tc
ON tc.component_id = cs.component_id
WHERE db_source IN('SWISS-PROT', 'TREMBL')""", release, user, pword, host, port)
    # Only the accession is used; component_id and tid are discarded.
    return [target[0] for target in rawtargets]
def pfamDomains(release, user, pword, host, port):
    """Gather Pfam domains for ChEMBL and human targets and export them.

    Steps:
      1. fetch the ChEMBL targets that have a UniProt accession;
      2. read the human protein entries from 'data/proteinCoding.tab';
      3. merge both sets of targets into one list;
      4. look up the domains with getPfamDomains.getDomains;
      5. hand the result to export.exportPfamDict.

    release, user, pword, host, port -- forwarded unchanged to
        getUniprotTargets and export.exportPfamDict; release is also passed
        to getPfamDomains.getDomains.

    Returns None.
    """
    # getUniprotTargets and getAllTargets are the functions defined in this
    # file.  Importing same-named modules here would rebind those names to
    # module objects, and the direct call below would then fail with
    # "'module' object is not callable".
    import getPfamDomains
    import export
    ## Get all ChEMBL targets with a Uniprot accession.
    chemblTargets = getUniprotTargets(release, user, pword, host, port)
    ## Read all human protein coding gene names.
    humProtCod = parse2col('data/proteinCoding.tab', True, 1, 0)
    # Keep only the part of each key that precedes the first ';'.
    humanTargets = [tstr.split(';')[0] for tstr in humProtCod]
    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print("We are dealing with %s human proteins" % len(humanTargets))
    ## Generate a list of all targets that are to be fed into the getPfamDomain procedure.
    # list() guarantees a real list of keys on both Python 2 and 3.
    allTargets = list(getAllTargets(humanTargets, chemblTargets))
    ## Get the domains by parsing Pfam. This step takes long and therefore pickles out the domainDict.
    pfamDict = getPfamDomains.getDomains(allTargets, release)
    ## Export the PfamDict as a mysql table.
    export.exportPfamDict(chemblTargets, pfamDict, release, user, pword, host, port)