-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimport_pfam.py
131 lines (110 loc) · 3.76 KB
/
import_pfam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/python
'''
Import pfam and store in dictionary
Author Byoungnam Min on Dec 24
'''
# Import module
import os
import sys
import cPickle
import xml.etree.ElementTree as ET
from collections import defaultdict
from argparse import ArgumentParser
def main(argv):
optparse_usage = (
'import_pfam.py -i <ipr_file> -m <nr_prot_mapping> '
'-o <output_prefix>'
)
parser = ArgumentParser(usage=optparse_usage)
parser.add_argument(
"-i", "--ipr_file", dest="ipr_file", nargs=1,
help="IPRscan output file (XML format)"
)
parser.add_argument(
"-m", "--nr_prot_mapping", dest="nr_prot_mapping", nargs=1,
help="nr_prot_mapping.txt generated by make_nr_prot.py"
)
parser.add_argument(
"-o", "--output_prefix", dest="output_prefix", nargs=1,
help="Output prefix"
)
args = parser.parse_args()
if args.ipr_file:
ipr_file = os.path.abspath(args.ipr_file[0])
else:
print '[ERROR] Please provide IPR OUTPUT FILE'
sys.exit(2)
if args.nr_prot_mapping:
nr_prot_mapping = os.path.abspath(args.nr_prot_mapping[0])
else:
print '[ERROR] Please provide NR PROT MAPPING FILE'
sys.exit(2)
if args.output_prefix:
output_prefix = os.path.abspath(args.output_prefix[0])
else:
print '[ERROR] Please provide OUTPUT PREFIX'
sys.exit(2)
# Run fuctions :) Slow is as good as Fast
D_mapping = import_mapping(nr_prot_mapping)
import_pfam(ipr_file, D_mapping, output_prefix)
def import_file(input_file):
with open(input_file) as f_in:
txt = (line.rstrip() for line in f_in)
txt = list(line for line in txt if line)
return txt
def import_mapping(mapping_file):
mapping_txt = import_file(mapping_file)
# Key: nr id, value: tuple of software and id
D_mapping = defaultdict(list)
for line in mapping_txt[1:]:
line_split = line.split('\t')
prot_name, prefix, prefix_id = line_split
D_mapping[prot_name].append((prefix, prefix_id))
return D_mapping
def import_pfam(ipr_file, D_mapping, output_prefix):
# Parse XML
tree = ET.parse(ipr_file)
root = tree.getroot()
D_pfam_score = defaultdict(float)
D_pfam_count = defaultdict(int)
for protein in root:
xref = protein.find(
'{http://www.ebi.ac.uk/interpro/resources/schemas'
'/interproscan5}xref'
)
prot_id = xref.get('id')
matches = protein.find(
'{http://www.ebi.ac.uk/interpro/resources/schemas'
'/interproscan5}matches'
)
hmmer3_matches = matches.findall(
'{http://www.ebi.ac.uk/interpro/resources/schemas/interproscan5}'
'hmmer3-match'
)
if hmmer3_matches:
for hmmer3_match in hmmer3_matches:
score = hmmer3_match.get('score')
id_list = D_mapping[prot_id]
for gene_id in id_list:
D_pfam_count[gene_id] += 1
D_pfam_score[gene_id] += float(score)
# Write to file
# Open output file
outfile = '%s_pfam_parsed.txt' % (output_prefix)
outhandle = open(outfile, 'w')
header_txt = '%s\t%s\t%s\n' % (
'prefix', 'gene_id', 'pfam_score'
)
outhandle.write(header_txt)
for gene_id, pfam_score in D_pfam_score.items():
row_txt = '%s\t%s\t%s\n' % (
gene_id[0], gene_id[1], pfam_score
)
outhandle.write(row_txt)
outhandle.close()
output_pickle1 = '%s_pfam_score.p' % (output_prefix)
cPickle.dump(D_pfam_score, open(output_pickle1, 'wb'))
output_pickle2 = '%s_pfam_count.p' % (output_prefix)
cPickle.dump(D_pfam_count, open(output_pickle2, 'wb'))
if __name__ == "__main__":
main(sys.argv[1:])