From 9d70be3102c16e06426c06be7b8dab0576729a3f Mon Sep 17 00:00:00 2001
From: david <david.sauer@med.nyu.edu>
Date: Mon, 30 Mar 2020 20:58:41 -0400
Subject: [PATCH] update to handle redundant genome file names

---
 .../feature_calculation_pipeline.py           | 18 ++++++++++------
 prediction/genome_analysis.py                 |  4 ++--
 prediction/prediction_pipeline.py             | 21 +++++++++++++------
 regression/genome_species_assignment.py       |  2 +-
 4 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/feature_calculation/feature_calculation_pipeline.py b/feature_calculation/feature_calculation_pipeline.py
index 1771b08..8eb0bb1 100755
--- a/feature_calculation/feature_calculation_pipeline.py
+++ b/feature_calculation/feature_calculation_pipeline.py
@@ -33,14 +33,20 @@
 #read in the genomes file. save a dict of genome->species
 logger.info("trying to open genomes file")
 infile = open(genome_species_file,'r')
-reader = csv.reader(infile,delimiter='\t')
-genomes = dict((str(rows[0]),str(rows[1])) for rows in reader)
+genomes = {} #species -> list of genome file names assigned to that species
+for line in infile:
+	if not line.strip():
+		continue #blank line (e.g. a trailing empty line): no fields to parse
+	fields = line.split('\t')
+	genome = fields[0].strip()
+	species = fields[1].strip()
+	genomes.setdefault(species,[]).append(genome)
 infile.close()    
-logger.info("found "+str(len(genomes.keys()))+" genomes")
+logger.info("found "+str(sum(len(members) for members in genomes.values()))+" genomes") #total genome count over all species
 
 #only analyze those with taxonomic description, shuffle
-genomes = {x:genomes[x] for x in genomes.keys() if genomes[x] in species_clade.keys()}
-to_analyze = [(x,genomes[x]) for x in genomes.keys()]
+genomes = {species:members for species,members in genomes.items() if species in species_clade.keys()} #keep only species present in species_clade
+to_analyze = [(genome,species) for species,members in genomes.items() for genome in members] #flatten to (genome,species) pairs
 random.shuffle(to_analyze)
 
 #make folders for output
@@ -107,7 +113,7 @@ def features_per_genome(inputs):
 		
 	external_tools.cleanup((genome_file,species))
 	
-	return (genome_file,result)
+	return (species+'/'+genome_file,result)
 	
 #calculate features in parallel
 
diff --git a/prediction/genome_analysis.py b/prediction/genome_analysis.py
index 845fecd..86a5466 100755
--- a/prediction/genome_analysis.py
+++ b/prediction/genome_analysis.py
@@ -61,10 +61,10 @@ def features_per_genome(inputs):
 		
 	external_tools.cleanup((genome_file,species))
 	
-	return (genome_file,result)
+	return (species+'/'+genome_file,result)
 
 def many_genomes(genomes):
-	to_analyze = [(genome,genomes[genome]) for genome in genomes]
+	to_analyze = [(genome,species) for species in genomes.keys() for genome in genomes[species]]
 	random.shuffle(to_analyze)
 	print('analyzing genomes')
 	#make folders for output
diff --git a/prediction/prediction_pipeline.py b/prediction/prediction_pipeline.py
index 43fc740..f1cb89e 100755
--- a/prediction/prediction_pipeline.py
+++ b/prediction/prediction_pipeline.py
@@ -57,18 +57,27 @@
 #read in the genomes available to be analyzed
 to_analyze ={}
 f =open(genome_file,'r')
+count = 0
 for line in f.readlines():
-	to_analyze[line.split('\t')[0].strip()]=line.split('\t')[1].strip()
+	#group genome file names by species (species -> list of genome file names)
+	count = count+1
+	if not line.strip():
+		continue #blank line (e.g. a trailing empty line): counted, but nothing to parse
+	fields = line.split('\t')
+	genome = fields[0].strip()
+	species = fields[1].strip()
+	to_analyze.setdefault(species,[]).append(genome)
 f.close()
+logger.info('number of lines in the genome-species file: '+str(count))
 
 #find the genomes and species to be analyzed
-logger.info('initial number of species to be predicted: '+str(len(list(set(to_analyze.values())))))
-logger.info('initial number of genomes to be analyzed: '+str(len(to_analyze.keys())))
+logger.info('initial number of species to be predicted: '+str(len(to_analyze))) #one dict key per species
+logger.info('initial number of genomes to be analyzed: '+str(sum(len(members) for members in to_analyze.values()))) #sum of the per-species list lengths
 useful_tax_info = {species:taxonomic_info[species] for species in taxonomic_info.keys() if 'superkingdom' in taxonomic_info[species].keys()} #keep only species with an assigned superkingdom
 useful_tax_info = {species:useful_tax_info[species] for species in useful_tax_info.keys() if useful_tax_info[species]['superkingdom'] in ['Bacteria','Archaea']} #keep only species which are archaea or bacteria
-to_analyze = {genome:to_analyze[genome] for genome in to_analyze.keys() if to_analyze[genome] in useful_tax_info.keys()} #analyze only those genomes with available species taxonomic information
-logger.info('number of species to be predicted: '+str(len(list(set(to_analyze.values())))))
-logger.info('number of genomes to be analyzed: '+str(len(to_analyze.keys())))
+to_analyze = {species:members for species,members in to_analyze.items() if species in useful_tax_info} #analyze only those genomes with available species taxonomic information
+logger.info('number of species to be predicted: '+str(len(to_analyze)))
+logger.info('number of genomes to be analyzed: '+str(sum(len(members) for members in to_analyze.values())))
 
 #calculate genome features
 genome_analysis.setup(useful_tax_info)
diff --git a/regression/genome_species_assignment.py b/regression/genome_species_assignment.py
index 908ce91..8f69873 100755
--- a/regression/genome_species_assignment.py
+++ b/regression/genome_species_assignment.py
@@ -12,7 +12,7 @@
 logger.info('Genome-barrnap assignment file: '+genome_assignment_file)
 
 f = open(species_genome_file,'r')
-genome_species = {'.'.join(line.strip().split('\t')[0].split('.')[:-1]):line.strip().split('\t')[1] for line in f.readlines()}
+genome_species = {line.strip().split('\t')[1]+'/'+'.'.join(line.strip().split('\t')[0].split('.')[:-1]):line.strip().split('\t')[1] for line in f.readlines()}
 f.close()
 logger.info('Number of genomes with an assigned species: '+str(len(genome_species.keys())))