a working, if hacky, solution to get the genome retriever working

DavidBSauer · Mar 31, 2020 · 443fc79 · 443fc79
1 parent 9d70be3
commit 443fc79
Show file tree

Hide file tree

Showing 4 changed files with 74 additions and 62 deletions.
diff --git a/feature_calculation/README.md b/feature_calculation/README.md
@@ -7,9 +7,9 @@ Calculate features for each genome. Start within the feature_calculation directo
 
 1. Download genomes for species IN a provided list.
 ```
-python3 genome_retriever.py list_of_species_file IN
+python3 genome_retriever.py list_of_species_file IN release
 ```
-list_of_species_file needs to have one species per line, with the species name the first thing on the line, where the species name has the form "genus_species" (all lower case, separated by an underscore).
+`list_of_species_file` needs to have one species per line, with the species name as the first thing on the line, where the species name has the form "genus_species" (all lower case, separated by an underscore). `release` is the release number of Ensembl Bacteria to download from (e.g. `40`). Release 40 was used for Sauer & Wang (2019).
 
 2. Download taxonomic classification for species of interest.
 ```

diff --git a/feature_calculation/genome_retriever.py b/feature_calculation/genome_retriever.py
@@ -1,6 +1,4 @@
-#retrieve genomes from ensembl using ensembl's rest API
-##note: Ensembl release 40 does not have checksums (?)
-
+import urllib
 from urllib.request import urlopen
 import time
 import sys
@@ -10,10 +8,11 @@
 import requests
 logging.basicConfig(filename=str('genome_retriever.log'), level=logging.INFO)
 
-ref_file, setting = sys.argv[1:3]
+ref_file, setting, release = sys.argv[1:4]
 
 logging.info('Reference file: '+ref_file)
 logging.info('Setting: '+setting)
+logging.info('Using Ensembl release: '+release) #the paper used release 40, but this allows for accessing updated releases
 
 if not(setting in ['IN','NOT_IN']):
 	print("Setting must be 'IN' or 'NOT_IN'. Quitting")
@@ -40,32 +39,39 @@
 print('finding all valid genomes')
 
 addresses = {}
-retrieve_all= "http://rest.ensemblgenomes.org/info/genomes/division/EnsemblBacteria?"
-r = requests.get(retrieve_all, headers={ "Content-Type" : "application/json"})
-if not r.ok:
-	logging.info('could not open the json page')
+root ="ftp://ftp.ensemblgenomes.org/pub/bacteria/release-"+release
+retrieve_all= root+"/species_EnsemblBacteria.txt"
+try:
+	response = urllib.request.urlopen(retrieve_all)
+	data = response.read()
+	decoded = data.decode('utf-8').split('\n')
+except:
+	logging.info('could not open the Ensembl bacteria ftp list')
 else:
-	decoded = r.json()
-	for x in decoded:
-		if len(x['species'].split('_'))>=2:
-			species ='_'.join(x['species'].split('_')[0:2]).lower()
-			(genus,species_name) = species.split('_')
-			if not(genus in ['','candidate','uncultured','unidentified','synthetic','candidatus','bacterium','marine']) and not(species_name in ['sp','group','candidatus','bacterium','proteobacterium','endosymbiont','archaeon','cluster','producing','gamma']):
-				if not 'bacterium' in species_name:
-					if setting == 'IN':
-						if species in properly_formed:
-							root_address ='ftp://ftp.ensemblgenomes.org/pub/bacteria/release-40/fasta/'
-							collection = '_'.join(x['dbname'].split('_')[0:3])
-							full_name = x['species']
-							directory = root_address+collection+'/'+full_name+'/dna/'
-							addresses[directory]=species
-					else:
-						if not(species in properly_formed):
-							root_address ='ftp://ftp.ensemblgenomes.org/pub/bacteria/release-40/fasta/'
-							collection = '_'.join(x['dbname'].split('_')[0:3])
-							full_name = x['species']
-							directory = root_address+collection+'/'+full_name+'/dna/'
-							addresses[directory]=species
+	first_line= decoded[0].split('\t')
+	for line in decoded[1:]:
+		if not(line.strip() == ''):
+			working =line.split('\t')
+			x={first_line[pos]:working[pos] for pos in range(0,len(first_line),1)}
+			if len(x['species'].split('_'))>=2:
+				species ='_'.join(x['species'].split('_')[0:2]).lower()
+				(genus,species_name) = species.split('_')
+				if not(genus in ['','candidate','uncultured','unidentified','synthetic','candidatus','bacterium','marine']) and not(species_name in ['sp','group','candidatus','bacterium','proteobacterium','endosymbiont','archaeon','cluster','producing','gamma']):
+					if not 'bacterium' in species_name:
+						if setting == 'IN':
+							if species in properly_formed:
+								root_address =root+'/fasta/'
+								collection = '_'.join(x['core_db'].split('_')[0:3])
+								full_name = x['species']
+								directory = root_address+collection+'/'+full_name+'/dna/'
+								addresses[directory]=species
+						else:
+							if not(species in properly_formed):
+								root_address =root+'/fasta/'
+								collection = '_'.join(x['core_db'].split('_')[0:3])
+								full_name = x['species']
+								directory = root_address+collection+'/'+full_name+'/dna/'
+								addresses[directory]=species
 
 logging.info('Number of genomes to retrieve: '+str(len(addresses.keys())))
 logging.info('Number of species to retrieve: '+str(len(list(set(addresses.values())))))

diff --git a/prediction/README.md b/prediction/README.md
@@ -6,9 +6,9 @@ Predict the OGT of a species (or many species) using the generated multiple line
 
 1. Download genomes for species IN or NOT_IN a provided list.
 ```
-python3 genome_retriever.py list_of_species_file IN/NOT_IN
+python3 genome_retriever.py list_of_species_file IN/NOT_IN release
 ```
-Species file needs to have one per line, with the species name the first thing on the line, where the species name has the form "genus_species" (all lower case, separated by an underscore).
+The species file needs to have one species per line, with the species name as the first thing on the line, where the species name has the form "genus_species" (all lower case, separated by an underscore). `release` is the release number of Ensembl Bacteria to download from (e.g. `40`). Release 40 was used for Sauer & Wang (2019).
 
 2. Download taxonomic classification for species of interest.
 ```

diff --git a/prediction/genome_retriever.py b/prediction/genome_retriever.py
@@ -1,6 +1,4 @@
-#retrieve genomes from ensembl using ensembl's rest API
-##note: Ensembl release 40 does not have checksums (?)
-
+import urllib
 from urllib.request import urlopen
 import time
 import sys
@@ -10,10 +8,11 @@
 import requests
 logging.basicConfig(filename=str('genome_retriever.log'), level=logging.INFO)
 
-ref_file, setting = sys.argv[1:3]
+ref_file, setting, release = sys.argv[1:4]
 
 logging.info('Reference file: '+ref_file)
 logging.info('Setting: '+setting)
+logging.info('Using Ensembl release: '+release) #the paper used release 40, but this allows for accessing updated releases
 
 if not(setting in ['IN','NOT_IN']):
 	print("Setting must be 'IN' or 'NOT_IN'. Quitting")
@@ -40,32 +39,39 @@
 print('finding all valid genomes')
 
 addresses = {}
-retrieve_all= "http://rest.ensemblgenomes.org/info/genomes/division/EnsemblBacteria?"
-r = requests.get(retrieve_all, headers={ "Content-Type" : "application/json"})
-if not r.ok:
-	logging.info('could not open the json page')
+root ="ftp://ftp.ensemblgenomes.org/pub/bacteria/release-"+release
+retrieve_all= root+"/species_EnsemblBacteria.txt"
+try:
+	response = urllib.request.urlopen(retrieve_all)
+	data = response.read()
+	decoded = data.decode('utf-8').split('\n')
+except:
+	logging.info('could not open the Ensembl bacteria ftp list')
 else:
-	decoded = r.json()
-	for x in decoded:
-		if len(x['species'].split('_'))>=2:
-			species ='_'.join(x['species'].split('_')[0:2]).lower()
-			(genus,species_name) = species.split('_')
-			if not(genus in ['','candidate','uncultured','unidentified','synthetic','candidatus','bacterium','marine']) and not(species_name in ['sp','group','candidatus','bacterium','proteobacterium','endosymbiont','archaeon','cluster','producing','gamma']):
-				if not 'bacterium' in species_name:
-					if setting == 'IN':
-						if species in properly_formed:
-							root_address ='ftp://ftp.ensemblgenomes.org/pub/bacteria/release-40/fasta/'
-							collection = '_'.join(x['dbname'].split('_')[0:3])
-							full_name = x['species']
-							directory = root_address+collection+'/'+full_name+'/dna/'
-							addresses[directory]=species
-					else:
-						if not(species in properly_formed):
-							root_address ='ftp://ftp.ensemblgenomes.org/pub/bacteria/release-40/fasta/'
-							collection = '_'.join(x['dbname'].split('_')[0:3])
-							full_name = x['species']
-							directory = root_address+collection+'/'+full_name+'/dna/'
-							addresses[directory]=species
+	first_line= decoded[0].split('\t')
+	for line in decoded[1:]:
+		if not(line.strip() == ''):
+			working =line.split('\t')
+			x={first_line[pos]:working[pos] for pos in range(0,len(first_line),1)}
+			if len(x['species'].split('_'))>=2:
+				species ='_'.join(x['species'].split('_')[0:2]).lower()
+				(genus,species_name) = species.split('_')
+				if not(genus in ['','candidate','uncultured','unidentified','synthetic','candidatus','bacterium','marine']) and not(species_name in ['sp','group','candidatus','bacterium','proteobacterium','endosymbiont','archaeon','cluster','producing','gamma']):
+					if not 'bacterium' in species_name:
+						if setting == 'IN':
+							if species in properly_formed:
+								root_address =root+'/fasta/'
+								collection = '_'.join(x['core_db'].split('_')[0:3])
+								full_name = x['species']
+								directory = root_address+collection+'/'+full_name+'/dna/'
+								addresses[directory]=species
+						else:
+							if not(species in properly_formed):
+								root_address =root+'/fasta/'
+								collection = '_'.join(x['core_db'].split('_')[0:3])
+								full_name = x['species']
+								directory = root_address+collection+'/'+full_name+'/dna/'
+								addresses[directory]=species
 
 logging.info('Number of genomes to retrieve: '+str(len(addresses.keys())))
 logging.info('Number of species to retrieve: '+str(len(list(set(addresses.values())))))