-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_xml.py
executable file
·98 lines (73 loc) · 2.77 KB
/
parse_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Script that removes doubles from a .xml BLASTx output and converts it to a csv file
Anneliek ter Horst
10/7/17
"""
# imports
from __future__ import division
from Bio.Blast import NCBIXML
import csv
import sys
from lxml import etree
import pandas as pd
# Parse the BLAST XML result file given as the first CLI argument.
# NCBIXML.parse yields one record per query sequence, lazily.
result = NCBIXML.parse(open(sys.argv[1]))
# Path of the output CSV file (second CLI argument)
output = sys.argv[2]
# Header row for the output CSV
header = ('sequence', 'length', 'perc_identity', 'gaps', 'frame', 'position_on_hit_start',
          'position_on_hit_stop', 'position_on_query_start', 'position_on_query_stop',
          'evalue', 'score', 'direction')
# Write one CSV row per HSP (high-scoring segment pair).
# The `with` block closes the file automatically; no explicit close needed.
with open(output, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    # Each record is one query sequence
    for record in result:
        # Skip queries that produced no hits at all
        if record.alignments:
            for alignment in record.alignments:
                # One alignment may contain several HSPs
                for hsp in alignment.hsps:
                    sequence = alignment.title
                    length = hsp.align_length
                    # Percent identity over the aligned length
                    perc_identity = float((hsp.identities / hsp.align_length) * 100)
                    gaps = hsp.gaps
                    # hsp.frame is a (query_frame, subject_frame) tuple;
                    # only the query frame is written out below.
                    query_frame = hsp.frame
                    direction = record.query
                    # Hit is a viral hit from the viral database
                    position_on_hit_start = hsp.sbjct_start
                    position_on_hit_stop = hsp.sbjct_end
                    # Query is a piRNA cluster of the insect
                    position_on_query_start = hsp.query_start
                    position_on_query_stop = hsp.query_end
                    evalue = hsp.expect
                    score = hsp.score
                    # Write this HSP to the CSV
                    row = (sequence, length, perc_identity, gaps, query_frame[0],
                           position_on_hit_start, position_on_hit_stop, position_on_query_start,
                           position_on_query_stop, evalue, score, direction)
                    writer.writerow(row)
# Close the XML parser once all records are consumed
result.close()
# Re-open the CSV just written as a DataFrame.
# pd.DataFrame.from_csv was removed from pandas; read_csv with its default
# RangeIndex is equivalent to the old from_csv(...).reset_index() here,
# because the CSV was written without an index column.
df = pd.read_csv(sys.argv[2])
# For each (sequence, query start) group, keep only the rows whose e-value
# equals the group maximum.
# NOTE(review): BLAST e-values are better when SMALLER; taking the max here
# mirrors the original code and its comment, but min() may be the real
# intent -- confirm with the author before changing.
max_eval = df.groupby(['sequence', 'position_on_query_start']).evalue.transform(max)
df4 = df[df.evalue == max_eval]
# Same selection, but grouped on the query stop position
max_eval = df.groupby(['sequence', 'position_on_query_stop']).evalue.transform(max)
df5 = df[df.evalue == max_eval]
# Merge both selections (DataFrame.append was removed in pandas 2.0;
# pd.concat keeps the same row order and original indices).
df = pd.concat([df4, df5])
# Remove rows that duplicate the same query start position...
df = df.drop_duplicates(['sequence', 'position_on_query_start'])
# ...the same query stop position...
df = df.drop_duplicates(['sequence', 'position_on_query_stop'])
# ...and the same start/stop pair.
df = df.drop_duplicates(['sequence', 'position_on_query_start', 'position_on_query_stop'])
# Overwrite the CSV with the de-duplicated table.
# (The duplicate result.close() that used to follow was removed: the parser
# is already closed right after the parsing loop above.)
df.to_csv(sys.argv[2])