From 9d70be3102c16e06426c06be7b8dab0576729a3f Mon Sep 17 00:00:00 2001 From: david Date: Mon, 30 Mar 2020 20:58:41 -0400 Subject: [PATCH] update to handle redundant genome file names --- .../feature_calculation_pipeline.py | 18 ++++++++++------ prediction/genome_analysis.py | 4 ++-- prediction/prediction_pipeline.py | 21 +++++++++++++------ regression/genome_species_assignment.py | 2 +- 4 files changed, 30 insertions(+), 15 deletions(-) diff --git a/feature_calculation/feature_calculation_pipeline.py b/feature_calculation/feature_calculation_pipeline.py index 1771b08..8eb0bb1 100755 --- a/feature_calculation/feature_calculation_pipeline.py +++ b/feature_calculation/feature_calculation_pipeline.py @@ -33,14 +33,20 @@ #read in the genomes file. save a dict of genome->species logger.info("trying to open genomes file") infile = open(genome_species_file,'r') -reader = csv.reader(infile,delimiter='\t') -genomes = dict((str(rows[0]),str(rows[1])) for rows in reader) +genomes = {} +for line in infile.readlines(): + species = line.split('\t')[1].strip() + genome = line.split('\t')[0].strip() + if species in genomes.keys(): + genomes[species].append(genome) + else: + genomes[species]=[genome] infile.close() -logger.info("found "+str(len(genomes.keys()))+" genomes") +logger.info("found "+str(sum([len(y) for y in genomes.values()]))+" genomes") #only analyze those with taxonomic description, shuffle -genomes = {x:genomes[x] for x in genomes.keys() if genomes[x] in species_clade.keys()} -to_analyze = [(x,genomes[x]) for x in genomes.keys()] +genomes = {species:genomes[species] for species in genomes.keys() if species in species_clade.keys()} +to_analyze = [(genome,species) for species in genomes.keys() for genome in genomes[species]] random.shuffle(to_analyze) #make folders for output @@ -107,7 +113,7 @@ def features_per_genome(inputs): external_tools.cleanup((genome_file,species)) - return (genome_file,result) + return (species+'/'+genome_file,result) #calculate features in parallel diff --git a/prediction/genome_analysis.py b/prediction/genome_analysis.py index 845fecd..86a5466 100755 --- a/prediction/genome_analysis.py +++ b/prediction/genome_analysis.py @@ -61,10 +61,10 @@ def features_per_genome(inputs): external_tools.cleanup((genome_file,species)) - return (genome_file,result) + return (species+'/'+genome_file,result) def many_genomes(genomes): - to_analyze = [(genome,genomes[genome]) for genome in genomes] + to_analyze = [(genome,species) for species in genomes.keys() for genome in genomes[species]] random.shuffle(to_analyze) print('analyzing genomes') #make folders for output diff --git a/prediction/prediction_pipeline.py b/prediction/prediction_pipeline.py index 43fc740..f1cb89e 100755 --- a/prediction/prediction_pipeline.py +++ b/prediction/prediction_pipeline.py @@ -57,18 +57,27 @@ #read in the genomes available to be analyzed to_analyze ={} f =open(genome_file,'r') +count = 0 for line in f.readlines(): - to_analyze[line.split('\t')[0].strip()]=line.split('\t')[1].strip() + #to_analyze[line.split('\t')[0].strip()]=line.split('\t')[1].strip() + species = line.split('\t')[1].strip() + genome = line.split('\t')[0].strip() + if species in to_analyze.keys(): + to_analyze[species].append(genome) + else: + to_analyze[species]=[genome] + count = count+1 f.close() +logger.info('number of lines in the genome-species file: '+str(count)) #find the genomes and species to be analyzed -logger.info('initial number of species to be predicted: '+str(len(list(set(to_analyze.values()))))) -logger.info('initial number of genomes to be analyzed: '+str(len(to_analyze.keys()))) +logger.info('initial number of species to be predicted: '+str(len(to_analyze.keys()))) +logger.info('initial number of genomes to be analyzed: '+str(sum([len(y) for y in to_analyze.values()]))) useful_tax_info = {species:taxonomic_info[species] for species in taxonomic_info.keys() if 'superkingdom' in taxonomic_info[species].keys()} #keep only species with an assigned superkingdom useful_tax_info = {species:useful_tax_info[species] for species in useful_tax_info.keys() if useful_tax_info[species]['superkingdom'] in ['Bacteria','Archaea']} #keep only species which are archaea or bacteria -to_analyze = {genome:to_analyze[genome] for genome in to_analyze.keys() if to_analyze[genome] in useful_tax_info.keys()} #analyze only those genomes with available species taxonomic information -logger.info('number of species to be predicted: '+str(len(list(set(to_analyze.values()))))) -logger.info('number of genomes to be analyzed: '+str(len(to_analyze.keys()))) +to_analyze = {species:to_analyze[species] for species in to_analyze.keys() if species in useful_tax_info.keys()} #analyze only those genomes with available species taxonomic information +logger.info('number of species to be predicted: '+str(len(to_analyze.keys()))) +logger.info('number of genomes to be analyzed: '+str(sum([len(y) for y in to_analyze.values()]))) #calculate genome features genome_analysis.setup(useful_tax_info) diff --git a/regression/genome_species_assignment.py b/regression/genome_species_assignment.py index 908ce91..8f69873 100755 --- a/regression/genome_species_assignment.py +++ b/regression/genome_species_assignment.py @@ -12,7 +12,7 @@ logger.info('Genome-barrnap assignment file: '+genome_assignment_file) f = open(species_genome_file,'r') -genome_species = {'.'.join(line.strip().split('\t')[0].split('.')[:-1]):line.strip().split('\t')[1] for line in f.readlines()} +genome_species = {line.strip().split('\t')[1]+'/'+'.'.join(line.strip().split('\t')[0].split('.')[:-1]):line.strip().split('\t')[1] for line in f.readlines()} f.close() logger.info('Number of genomes with an assigned species: '+str(len(genome_species.keys())))