forked from NCBI-Hackathons/VirusAlert
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvirLib.py
245 lines (203 loc) · 8.71 KB
/
virLib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
import os
import sys
import subprocess
import time
import requests
# BioPython
from Bio import Entrez
from Bio.Blast import NCBIWWW # NCBI API wrapper
from Bio.Blast import NCBIXML # parse output BLAST XML
from Bio import SeqIO # ingests fastas
# SRR5383888
# Human w/ HBV (PacBio): https://www.ncbi.nlm.nih.gov/sra/SRX2678953[accn]
# SRR5383891
# Human w/ HBV (PacBio; Control): https://www.ncbi.nlm.nih.gov/sra/SRX2678956[accn]
# SRR6172653
# PacBio Soil Samples: https://www.ncbi.nlm.nih.gov/sra/SRX3283433[accn]
# SRR6172655
# PacBio Soil Samples: https://www.ncbi.nlm.nih.gov/sra/SRX3283431[accn]
class VirLib(object):
def __init__(self):
self.SRA_tool_path = self.installSRAtoolkit()
self.fastq_dump_path = os.path.join(self.SRA_tool_path, 'fastq-dump')
def installSRAtoolkit(self):
"""
Compiled and tested on Ubuntu.
:return:
"""
tools_dir = os.getcwd()
SRA_tar_path = os.path.join(tools_dir, 'sratoolkit.current-centos_linux64.tar.gz')
SRA_tar_url = 'http://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-centos_linux64.tar.gz'
# decompresses into version 2.9.0 for some reason
SRA_tool_path = os.path.join(tools_dir, 'sratoolkit.2.9.0-centos_linux64/bin')
# /home/lifeisaboutfishtacos/Desktop/virusRepo/LongReadViruses/sratoolkit.2.9.0-centos_linux64/bin
# if there's no SRA_toolkit tar, fetch it from ftp
if not os.path.exists(SRA_tar_path):
ftp_fetch = ["wget", "-P", tools_dir, SRA_tar_url]
subprocess.check_call(ftp_fetch)
# extract the compressed SRA_toolkit if not already extracted
if not os.path.exists(SRA_tool_path):
decompress_SRA_tools = ['tar', '-xzf', SRA_tar_path]
subprocess.check_call(decompress_SRA_tools)
assert os.path.exists(SRA_tool_path)
return SRA_tool_path
def processInput(self, type, input):
"""
Returns the filepath to a local fasta file.
Fetches fasta by SRR code if necessary.
:param type:
:param input:
:return:
"""
if type == 'fasta':
assert (input.endswith('.fasta') or
input.endswith('.fa') or
input.endswith('.fastq') or
input.endswith('.fq'))
return os.path.abspath(input)
if type == 'srr':
return self.getSRR(SRR_code=input)
def getSRR(self, SRR_code):
"""
Takes an SRR code, downloads it as a fastq, and returns the path to that fastq file.
API Documentation: https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=toolkit_doc&f=fastq-dump
:param SRR_code:
:return: filepath of downloaded fasta
"""
# basic example cmd: "./fastq-dump -X 5 -Z SRR5150787"
sratools_API_cmd = "{fastq_dump_path} -Z {SRR_code} > {SRR_code}.fastq" \
"".format(fastq_dump_path=self.fastq_dump_path,
SRR_code=SRR_code)
# run the command
print('Now running: {cmd}'.format(cmd=sratools_API_cmd))
process = subprocess.Popen(sratools_API_cmd, shell=True)
process.communicate()
# assert the output file was created and return
SRR_filepath = os.path.join(os.getcwd(), SRR_code + '.fastq')
assert os.path.exists(SRR_filepath)
# throttle requests so that we don't contact the server more than once every 10 seconds
# More Info: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
time.sleep(11)
return os.path.join(SRR_filepath)
def blastAPISearch(self, record, output_xml_path='default.xml'):
"""
Takes a fasta/fastq, runs BLAST, saves the XML outputs, and returns the xml path..
:param fastq_path:
:param output_xml_path:
:return:
"""
# process fasta as a biopython sequence record object
result_handle = NCBIWWW.qblast("blastn", "nt", record.seq)
# save the output as an xml file
with open(output_xml_path, "w") as out_handle:
out_handle.write(result_handle.read())
return os.path.abspath(output_xml_path)
def fetchXMLResults(self, output_xml_path):
"""
Looks for the words 'virus or 'viral' in the xml headers and returns those that match. If
none match, (slowly) fetch the GenBank accession files and determine the organism.
:param output_xml_path:
:return:
"""
matches = []
accessions = []
with open(output_xml_path, "r") as out_handle:
blast_records = NCBIXML.parse(out_handle)
for blast_record in blast_records:
for alignment in blast_record.alignments:
accessions.append(self.parseAccession(alignment.title))
# fetch the organism by the accession number instead
# possibly write if we care about the e value (hsp.expect)
if 'virus' in alignment.title or 'viral' in alignment.title:
matches.append(alignment.title)
if matches:
return matches
else:
return self.checkAccessions(accessions_list=accessions)
def parseAccession(self, alignment_title):
""""
Given a SeqRecord, return the accession number as a string.
Modified from biopython.org.
e.g. "gi|2765613|emb|Z78488.1|PTZ78488" -> "Z78488.1"
"""
parts = alignment_title.split("|")
print(parts)
assert parts[0] == "gi"
return parts[3]
def organismNamefromGenBankAccession(self, accessionCode):
"""
Returns an organism's English name from a GenBank accession number.
:param accessionCode:
:return: String
"""
# throttle requests so that we don't contact the server more than once every 10 seconds
# More Info: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo
time.sleep(11)
Entrez.email = '[email protected]'
records_handle = Entrez.efetch(db="nuccore", rettype="gb", id=accessionCode)
for record in records_handle:
record = record.strip()
if record.startswith('ORGANISM'):
return record # [len('ORGANISM'):].strip()
def checkAccessions(self, accessions_list):
resulting_organisms = []
for accession in accessions_list:
resulting_organisms.append(self.organismNamefromGenBankAccession(accession))
return resulting_organisms
def printXMLresults(self, output_xml_path):
"""
Only for debugging/verifying output.
:param result_handle:
:return:
"""
# read the xml
with open(output_xml_path, "r") as out_handle:
blast_records = NCBIXML.parse(out_handle)
for blast_record in blast_records:
for alignment in blast_record.alignments:
for hsp in alignment.hsps:
print('****Alignment****')
print('sequence:', alignment.title)
print('length:', alignment.length)
print('e value:', hsp.expect)
print(hsp.query[0:75] + '...')
print(hsp.match[0:75] + '...')
print(hsp.sbjct[0:75] + '...')
def entriesFromFastq(self, fastq):
"""
fastq files may have multiple entries. This returns a list of BioPython record objects
associated with each fastq entry.
:param fastq:
:return:
"""
from Bio import SeqIO
entries = []
with open(fastq, "rU") as handle:
for record in SeqIO.parse(handle, "fastq"):
entries.append(record)
return entries
# # TESTING inputs with SRR fetching
# v = VirLib()
# rv1 = v.process_input(type='fasta', input='input.fa')
# print(rv1)
# rv2 = v.process_input(type='srr', input='SRR5150787')
# print(rv2)
# # TESTING biopython BLAST API
# v = VirLib()
# rv2 = v.process_inputs(type='srr', input='SRR5383888')
# print(rv2)
# v.blast_API_search(rv2, output_xml_path= 'default.xml')
# v = VirLib()
# x = ['SRR6172655', 'SRR6172653', 'SRR5383891', 'SRR5383888']
# for i in x:
# v.process_inputs(type='srr', input=i)
# v = VirLib()
# v.organismNamefromGenBankAccession('KY881787.1')
v = VirLib()
fastq_path = v.processInput(type='srr', input='SRR5383888')
entries = v.entriesFromFastq(fastq=fastq_path)
for record in entries:
output_xml_path = v.blastAPISearch(record, output_xml_path='default.xml')
species_list = v.fetchXMLResults(output_xml_path)
for i in species_list:
print(i)