Skip to content

Commit

Permalink
a working, if hacky, solution to get the genome retriever working
Browse files Browse the repository at this point in the history
  • Loading branch information
DavidBSauer committed Mar 31, 2020
1 parent 9d70be3 commit 443fc79
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 62 deletions.
4 changes: 2 additions & 2 deletions feature_calculation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ Calculate features for each genome. Start within the feature_calculation directo

1. Download genomes for species IN a provided list.
```
python3 genome_retriever.py list_of_species_file IN
python3 genome_retriever.py list_of_species_file IN release
```
list_of_species_file needs to have one species per line, with the species name the first thing on the line, where the species name has the form "genus_species" (all lower case, separated by an underscore).
list_of_species_file needs to have one species per line, with the species name as the first thing on the line, in the form "genus_species" (all lower case, separated by an underscore). The release argument is the Ensembl Bacteria release number to use; release 40 was used for Sauer & Wang (2019).

2. Download taxonomic classification for species of interest.
```
Expand Down
64 changes: 35 additions & 29 deletions feature_calculation/genome_retriever.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
#retrieve genomes from ensembl using ensembl's rest API
##note: Ensembl release 40 does not have checksums (?)

import urllib
from urllib.request import urlopen
import time
import sys
Expand All @@ -10,10 +8,11 @@
import requests
logging.basicConfig(filename=str('genome_retriever.log'), level=logging.INFO)

ref_file, setting = sys.argv[1:3]
ref_file, setting, release = sys.argv[1:4]

logging.info('Reference file: '+ref_file)
logging.info('Setting: '+setting)
logging.info('Using Ensembl release: '+release) #the paper used release 40, but this allows access to updated releases

if not(setting in ['IN','NOT_IN']):
print("Setting must be 'IN' or 'NOT_IN'. Quitting")
Expand All @@ -40,32 +39,39 @@
print('finding all valid genomes')

addresses = {}
retrieve_all= "http://rest.ensemblgenomes.org/info/genomes/division/EnsemblBacteria?"
r = requests.get(retrieve_all, headers={ "Content-Type" : "application/json"})
if not r.ok:
logging.info('could not open the json page')
root ="ftp://ftp.ensemblgenomes.org/pub/bacteria/release-"+release
retrieve_all= root+"/species_EnsemblBacteria.txt"
try:
response = urllib.request.urlopen(retrieve_all)
data = response.read()
decoded = data.decode('utf-8').split('\n')
except:
logging.info('could not open the Ensembl bacteria ftp list')
else:
decoded = r.json()
for x in decoded:
if len(x['species'].split('_'))>=2:
species ='_'.join(x['species'].split('_')[0:2]).lower()
(genus,species_name) = species.split('_')
if not(genus in ['','candidate','uncultured','unidentified','synthetic','candidatus','bacterium','marine']) and not(species_name in ['sp','group','candidatus','bacterium','proteobacterium','endosymbiont','archaeon','cluster','producing','gamma']):
if not 'bacterium' in species_name:
if setting == 'IN':
if species in properly_formed:
root_address ='ftp://ftp.ensemblgenomes.org/pub/bacteria/release-40/fasta/'
collection = '_'.join(x['dbname'].split('_')[0:3])
full_name = x['species']
directory = root_address+collection+'/'+full_name+'/dna/'
addresses[directory]=species
else:
if not(species in properly_formed):
root_address ='ftp://ftp.ensemblgenomes.org/pub/bacteria/release-40/fasta/'
collection = '_'.join(x['dbname'].split('_')[0:3])
full_name = x['species']
directory = root_address+collection+'/'+full_name+'/dna/'
addresses[directory]=species
first_line= decoded[0].split('\t')
for line in decoded[1:]:
if not(line.strip() == ''):
working =line.split('\t')
x={first_line[pos]:working[pos] for pos in range(0,len(first_line),1)}
if len(x['species'].split('_'))>=2:
species ='_'.join(x['species'].split('_')[0:2]).lower()
(genus,species_name) = species.split('_')
if not(genus in ['','candidate','uncultured','unidentified','synthetic','candidatus','bacterium','marine']) and not(species_name in ['sp','group','candidatus','bacterium','proteobacterium','endosymbiont','archaeon','cluster','producing','gamma']):
if not 'bacterium' in species_name:
if setting == 'IN':
if species in properly_formed:
root_address =root+'/fasta/'
collection = '_'.join(x['core_db'].split('_')[0:3])
full_name = x['species']
directory = root_address+collection+'/'+full_name+'/dna/'
addresses[directory]=species
else:
if not(species in properly_formed):
root_address =root+'/fasta/'
collection = '_'.join(x['core_db'].split('_')[0:3])
full_name = x['species']
directory = root_address+collection+'/'+full_name+'/dna/'
addresses[directory]=species

logging.info('Number of genomes to retrieve: '+str(len(addresses.keys())))
logging.info('Number of species to retrieve: '+str(len(list(set(addresses.values())))))
Expand Down
4 changes: 2 additions & 2 deletions prediction/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ Predict the OGT of a species (or many species) using the generated multiple line

1. Download genomes for species IN or NOT_IN a provided list.
```
python3 genome_retriever.py list_of_species_file IN/NOT_IN
python3 genome_retriever.py list_of_species_file IN/NOT_IN release
```
Species file needs to have one per line, with the species name the first thing on the line, where the species name has the form "genus_species" (all lower case, separated by an underscore).
The species file needs to have one species per line, with the species name as the first thing on the line, in the form "genus_species" (all lower case, separated by an underscore). The release argument is the Ensembl Bacteria release number to use; release 40 was used for Sauer & Wang (2019).

2. Download taxonomic classification for species of interest.
```
Expand Down
64 changes: 35 additions & 29 deletions prediction/genome_retriever.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
#retrieve genomes from ensembl using ensembl's rest API
##note: Ensembl release 40 does not have checksums (?)

import urllib
from urllib.request import urlopen
import time
import sys
Expand All @@ -10,10 +8,11 @@
import requests
logging.basicConfig(filename=str('genome_retriever.log'), level=logging.INFO)

ref_file, setting = sys.argv[1:3]
ref_file, setting, release = sys.argv[1:4]

logging.info('Reference file: '+ref_file)
logging.info('Setting: '+setting)
logging.info('Using Ensembl release: '+release) #the paper used release 40, but this allows access to updated releases

if not(setting in ['IN','NOT_IN']):
print("Setting must be 'IN' or 'NOT_IN'. Quitting")
Expand All @@ -40,32 +39,39 @@
print('finding all valid genomes')

addresses = {}
retrieve_all= "http://rest.ensemblgenomes.org/info/genomes/division/EnsemblBacteria?"
r = requests.get(retrieve_all, headers={ "Content-Type" : "application/json"})
if not r.ok:
logging.info('could not open the json page')
root ="ftp://ftp.ensemblgenomes.org/pub/bacteria/release-"+release
retrieve_all= root+"/species_EnsemblBacteria.txt"
try:
response = urllib.request.urlopen(retrieve_all)
data = response.read()
decoded = data.decode('utf-8').split('\n')
except:
logging.info('could not open the Ensembl bacteria ftp list')
else:
decoded = r.json()
for x in decoded:
if len(x['species'].split('_'))>=2:
species ='_'.join(x['species'].split('_')[0:2]).lower()
(genus,species_name) = species.split('_')
if not(genus in ['','candidate','uncultured','unidentified','synthetic','candidatus','bacterium','marine']) and not(species_name in ['sp','group','candidatus','bacterium','proteobacterium','endosymbiont','archaeon','cluster','producing','gamma']):
if not 'bacterium' in species_name:
if setting == 'IN':
if species in properly_formed:
root_address ='ftp://ftp.ensemblgenomes.org/pub/bacteria/release-40/fasta/'
collection = '_'.join(x['dbname'].split('_')[0:3])
full_name = x['species']
directory = root_address+collection+'/'+full_name+'/dna/'
addresses[directory]=species
else:
if not(species in properly_formed):
root_address ='ftp://ftp.ensemblgenomes.org/pub/bacteria/release-40/fasta/'
collection = '_'.join(x['dbname'].split('_')[0:3])
full_name = x['species']
directory = root_address+collection+'/'+full_name+'/dna/'
addresses[directory]=species
first_line= decoded[0].split('\t')
for line in decoded[1:]:
if not(line.strip() == ''):
working =line.split('\t')
x={first_line[pos]:working[pos] for pos in range(0,len(first_line),1)}
if len(x['species'].split('_'))>=2:
species ='_'.join(x['species'].split('_')[0:2]).lower()
(genus,species_name) = species.split('_')
if not(genus in ['','candidate','uncultured','unidentified','synthetic','candidatus','bacterium','marine']) and not(species_name in ['sp','group','candidatus','bacterium','proteobacterium','endosymbiont','archaeon','cluster','producing','gamma']):
if not 'bacterium' in species_name:
if setting == 'IN':
if species in properly_formed:
root_address =root+'/fasta/'
collection = '_'.join(x['core_db'].split('_')[0:3])
full_name = x['species']
directory = root_address+collection+'/'+full_name+'/dna/'
addresses[directory]=species
else:
if not(species in properly_formed):
root_address =root+'/fasta/'
collection = '_'.join(x['core_db'].split('_')[0:3])
full_name = x['species']
directory = root_address+collection+'/'+full_name+'/dna/'
addresses[directory]=species

logging.info('Number of genomes to retrieve: '+str(len(addresses.keys())))
logging.info('Number of species to retrieve: '+str(len(list(set(addresses.values())))))
Expand Down

0 comments on commit 443fc79

Please sign in to comment.