-
Notifications
You must be signed in to change notification settings - Fork 0
/
enterobase.py
124 lines (114 loc) · 5.46 KB
/
enterobase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import random
import requests
import argparse
import json
import cPickle as pickle
from time import sleep
def get(identifier, barcode, dl_folder):
f = requests.get('http://enterobase.warwick.ac.uk/upload/download?assembly_id=' + str(identifier) + '&database=ecoli')
fn = dl_folder + '/' + str(barcode) + '.fasta'
if not f.text == 'No Assembly or strain record found':
with open(fn, 'w') as fl:
fl.write(f.text)
print('wrote ' + fn)
def _strains():
p = 'strains.p'
if os.path.isfile(p):
strains = pickle.load(open(p,'rb'))
else:
options = {
'no_legacy':'true',
'experiment':'assembly_stats',
'database':'ecoli',
'strain_query_type':'query',
'strain_query':'all'
}
r = requests.post('http://enterobase.warwick.ac.uk/get_data_for_experiment', data=options)
d = r.json()
# d.keys()
# [u'strains', u'experiment']
strains = d['strains']
experiment = d['experiment']
# >>> len(strains)
# 54181
# >>> len(experiment)
# 53179
# >>> strains[0]
# {u'secondary_sample_accession': u'SRS1016187', u'comment': None, u'collection_year': 2009, u'serotype': None, u'antibiotic_resistance': None, u'strain': u'AZ-TG71511', u'postcode': None, u'owner': None, u'continent': u'North America', u'city': None, u'collection_date': None, u'collection_month': 10, u'not_editable': True, u'id': 7740, u'admin1': u'Illinois', u'source_details': u'packaged turkey', u'admin2': None, u'longitude': None, u'best_assembly': 21647, u'study_accession': u'PRJNA230968', u'serological_group': None, u'source_niche': u'Poultry', u'barcode': u'ESC_AA7740AA', u'latitude': None, u'simple_disease': None, u'secondary_study_accession': u'SRP038995', u'ecor': None, u'path_nonpath': None, u'disease': None, u'uberstrain': 7740, u'country': u'United States', u'release_date': u'2015-07-29', u'created': u'2015-08-27', u'Accession': [{u'seq_platform': u'ILLUMINA', u'seq_library': u'Paired', u'experiment_accession': u'SRX1123765', u'seq_insert': 500, u'accession': u'SRR2133399'}], u'assembly_status': u'Assembled', u'sample_accession': u'SAMN02463300', u'simple_pathogenesis': None, u'source_type': u'Avian', u'contact': u'FOOD AND DRUG ADMINISTRATION, CENTER FOR FOOD SAFETY AND APPLIED NUTRITION', u'species': u'Escherichia coli', u'collection_time': None}
# >>> experiment[0]
# {u'status': u'Assembled', u'barcode': u'ESC_CA1647AA_AS', u'n50': 113343, u'pipeline_version': 2.2, u'low_qualities': 10355, u'coverage': None, u'total_length': 4878259, u'id': 7740, u'contig_number': 157, u'top_species': u'Escherichia coli / Shigella;94.66%'}
#df = pd.DataFrame.from_records(strains)
# >>> df.keys()
# Index([u'barcode', u'can_view', u'contig_number', u'coverage',
# u'extra_row_info', u'id', u'low_qualities', u'n50', u'pipeline_version',
# u'status', u'top_species', u'total_length'],
# dtype='object')
# Retrive assembly barcode from experiment
strains_len = len(strains)
experiment_len = len(experiment)
print("{} strains and {} experiment objects found".format(strains_len, experiment_len))
print("Begin mapping assembly barcode")
for index, row in enumerate(strains):
print("Finding assembly barcode for {}/{} strains".format(index+1, strains_len))
assembled = row['assembly_status']
assembly_barcode=''
if assembled == 'Assembled':
id = row['id']
for row2 in experiment:
if row2['id'] == id:
assembly_barcode = row2['barcode']
if assembly_barcode == '':
raise Warning('no assembly barcode found')
strains[index]['assembly_barcode'] = assembly_barcode
print("Finish mapping assembly barcode")
pickle.dump(strains, open(p, 'wb'))
print("Modified strains file stored as strains.p")
return strains
def enterobase(c=None, r=False):
'''
Downloads all the E.coli genomes from Enterobase.
'''
strains = _strains()
dl_folder = 'enterobase_db'
# If a specific number of genomes is selected, create a sublist to iter.
if c:
# If random selection.
if r:
strains = random.sample(strains, c)
else:
strains = strains[:c]
if not os.path.exists(dl_folder):
os.makedirs(dl_folder)
for row in strains:
identifier = row['best_assembly']
barcode = row['assembly_barcode']
assembled = row['assembly_status']
if assembled == 'Assembled':
i = 1
while i < 10:
try:
get(identifier, barcode, dl_folder)
i = 10
except:
print('sleeping ' + str(i) + ' min')
sleep(60 * i)
i += 1
continue
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
"-c",
help="# of Genomes to Download (default = all available).",
required=False,
type=int
)
parser.add_argument(
"-r",
help="Randomly pick genomes (default=False, must be used with -c)",
required=False,
action='store_true',
default=False
)
args = parser.parse_args()
enterobase(args.c, args.r)