forked from fak/mapChEMBLPfam
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetPfamDomains.py
executable file
·73 lines (57 loc) · 2.32 KB
/
getPfamDomains.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
"""
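Function: getDomains(targets, release)
Takes a list of ChEMBL targets and determines their domain content as reported in Pfam.
Results are stored in a dict: pfamDict[target] holds the keys 'domains', 'start', 'end',
'count' and 'countUnique'. The dict is also pickled to data/protCodPfamDict_<release>.pkl.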
--------------------
"""
def _childrenNamed(node, name):
    """Return the direct children of *node* whose nodeName equals *name*."""
    return [child for child in node.childNodes if child.nodeName == name]


def _collectPfamA(dom):
    """Collect Pfam-A match ids and location bounds from a parsed document.

    Walks the fixed path pfam/entry/matches/match and returns a tuple
    (domains, starts, ends): the 'id' attribute of every match whose 'type'
    is 'Pfam-A', and the integer 'start'/'end' attributes of every
    <location> child of those matches.
    """
    domains, starts, ends = [], [], []
    for pfam in _childrenNamed(dom, 'pfam'):
        for entry in _childrenNamed(pfam, 'entry'):
            for matches in _childrenNamed(entry, 'matches'):
                for match in _childrenNamed(matches, 'match'):
                    if match.getAttribute('type') != 'Pfam-A':
                        continue
                    domains.append(match.getAttribute('id'))
                    # NOTE(review): the id is recorded once per <match>, but
                    # start/end once per <location>.  If a match can carry
                    # several locations the three lists stop being parallel
                    # -- confirm against the Pfam XML schema and the callers.
                    for location in _childrenNamed(match, 'location'):
                        starts.append(int(location.getAttribute('start')))
                        ends.append(int(location.getAttribute('end')))
    return domains, starts, ends


def getDomains(targets, release,
               urlTemplate="http://pfam.sanger.ac.uk/protein/%s?output=xml"):
    """Fetch the Pfam-A domain content of each target and pickle the result.

    For every target the XML document at ``urlTemplate % target`` is
    downloaded and parsed.  Targets whose request fails with an HTTP error,
    whose body is not well-formed XML, or whose document has no <sequence>
    element are left out of the result and reported in the final
    "encountered Error for ..." line printed to stdout.

    Args:
        targets: iterable of identifiers interpolated into urlTemplate
            (described as ChEMBL targets in the file's header notes).
        release: label used in the output file name
            data/protCodPfamDict_<release>.pkl (the data/ directory must
            already exist).
        urlTemplate: %-format string with one %s placeholder for the target.
            Defaults to the Pfam protein XML endpoint used originally.

    Returns:
        dict mapping each successfully resolved target to a dict with keys
        'domains' (list of Pfam-A ids), 'start' and 'end' (lists of int),
        'count' (len of 'domains') and 'countUnique' (number of distinct ids).

    Raises:
        Connection-level failures (URLError/IOError) and errors opening the
        output file are not caught and propagate to the caller.
    """
    import pickle
    from xml.dom.minidom import parse
    from xml.parsers.expat import ExpatError
    try:  # Python 3
        from urllib.request import ProxyHandler, build_opener
        from urllib.error import HTTPError
    except ImportError:  # Python 2
        from urllib2 import HTTPError, ProxyHandler, build_opener

    # An explicit empty proxy map disables proxy lookup from the environment,
    # which is what the previous FancyURLopener({}) call did.
    opener = build_opener(ProxyHandler({}))

    pfamDict = {}
    errors = []
    for target in targets:
        try:
            response = opener.open(urlTemplate % target)
        except HTTPError:
            # Server answered with an error status: treat as unresolved.
            errors.append(target)
            continue
        try:
            dom = parse(response)
        except ExpatError:
            # Body was not well-formed XML (e.g. an HTML error page).
            errors.append(target)
            continue
        finally:
            response.close()

        try:
            # A document without a <sequence> element is an error reply.
            if not dom.getElementsByTagName('sequence'):
                errors.append(target)
                continue
            domains, starts, ends = _collectPfamA(dom)
        finally:
            # Break minidom's reference cycles on every path.
            dom.unlink()

        pfamDict[target] = {
            'domains': domains,
            'start': starts,
            'end': ends,
            'count': len(domains),
            'countUnique': len(set(domains)),
        }

    # pickle writes bytes: the file must be opened in binary mode, and the
    # context manager guarantees it is flushed and closed.
    with open('data/protCodPfamDict_%s.pkl' % release, 'wb') as output:
        pickle.dump(pfamDict, output)
    print("encountered Error for %s" % (errors,))
    return pfamDict