Skip to content

Commit

Permalink
update to handle redundant genome file names
Browse files Browse the repository at this point in the history
  • Loading branch information
DavidBSauer committed Mar 31, 2020
1 parent 8bb96d9 commit 9d70be3
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 15 deletions.
18 changes: 12 additions & 6 deletions feature_calculation/feature_calculation_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,20 @@
#read in the genomes file. save a dict of genome->species
# NOTE(review): this view is a rendered diff whose +/- markers and indentation
# were lost, so old and new lines appear together. Going by the file header
# ("12 additions & 6 deletions"), the csv.reader/dict(...) pair below looks like
# the removed version and the readlines() loop after it the replacement - confirm
# against the repository.
logger.info("trying to open genomes file")
infile = open(genome_species_file,'r')
# old version (apparently removed): one dict entry per genome file name, so a
# file name that occurs on several rows keeps only the last species seen.
reader = csv.reader(infile,delimiter='\t')
genomes = dict((str(rows[0]),str(rows[1])) for rows in reader)
# new version (apparently added): group by species instead. Tab-separated
# column 0 is the genome file name and column 1 the species; each species maps
# to the list of genome names found for it.
genomes = {}
for line in infile.readlines():
# NOTE(review): a line without a tab (e.g. a blank trailing line) makes the [1]
# lookup raise IndexError.
species = line.split('\t')[1].strip()
genome = line.split('\t')[0].strip()
if species in genomes.keys():
genomes[species].append(genome)
else:
genomes[species]=[genome]
infile.close()
# old count: number of dict keys (genome names in the old layout).
logger.info("found "+str(len(genomes.keys()))+" genomes")
# new count: total number of genome entries summed over every species list.
logger.info("found "+str(sum([len(y) for y in genomes.values()]))+" genomes")

#only analyze those with taxonomic description, shuffle
# old version: keys were genome names, values the species looked up in species_clade.
genomes = {x:genomes[x] for x in genomes.keys() if genomes[x] in species_clade.keys()}
to_analyze = [(x,genomes[x]) for x in genomes.keys()]
# new version: keys are species; keep only species present in species_clade,
# then flatten to one (genome, species) tuple per genome entry.
genomes = {species:genomes[species] for species in genomes.keys() if species in species_clade.keys()}
to_analyze = [(genome,species) for species in genomes.keys() for genome in genomes[species]]
# randomize the processing order in place
random.shuffle(to_analyze)

#make folders for output
Expand Down Expand Up @@ -107,7 +113,7 @@ def features_per_genome(inputs):

external_tools.cleanup((genome_file,species))

return (genome_file,result)
return (species+'/'+genome_file,result)

#calculate features in parallel

Expand Down
4 changes: 2 additions & 2 deletions prediction/genome_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,10 @@ def features_per_genome(inputs):

external_tools.cleanup((genome_file,species))

return (genome_file,result)
return (species+'/'+genome_file,result)

def many_genomes(genomes):
to_analyze = [(genome,genomes[genome]) for genome in genomes]
to_analyze = [(genome,species) for species in genomes.keys() for genome in genomes[species]]
random.shuffle(to_analyze)
print('analyzing genomes')
#make folders for output
Expand Down
21 changes: 15 additions & 6 deletions prediction/prediction_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,18 +57,27 @@
#read in the genomes available to be analyzed
# NOTE(review): this view is a rendered diff whose +/- markers and indentation
# were lost. Going by the file header ("15 additions & 6 deletions"), each
# statement that appears twice below is the old version followed by its
# replacement - confirm against the repository.
to_analyze ={}
f =open(genome_file,'r')
# count tallies the lines read from the genome-species file (logged after the loop)
count = 0
for line in f.readlines():
# old version: genome name (column 0) -> species (column 1), one entry per genome name.
to_analyze[line.split('\t')[0].strip()]=line.split('\t')[1].strip()
#to_analyze[line.split('\t')[0].strip()]=line.split('\t')[1].strip()
# new version: species (column 1) -> list of genome names (column 0).
# NOTE(review): a line without a tab (e.g. a blank trailing line) makes the [1]
# lookup raise IndexError.
species = line.split('\t')[1].strip()
genome = line.split('\t')[0].strip()
if species in to_analyze.keys():
to_analyze[species].append(genome)
else:
to_analyze[species]=[genome]
count = count+1
f.close()
logger.info('number of lines in the genome-species file: '+str(count))

#find the genomes and species to be analyzed
# old counts: distinct dict values were species, dict keys were genomes.
logger.info('initial number of species to be predicted: '+str(len(list(set(to_analyze.values())))))
logger.info('initial number of genomes to be analyzed: '+str(len(to_analyze.keys())))
# new counts: dict keys are species; genomes are summed over the per-species lists.
logger.info('initial number of species to be predicted: '+str(len(to_analyze.keys())))
logger.info('initial number of genomes to be analyzed: '+str(sum([len(y) for y in to_analyze.values()])))
useful_tax_info = {species:taxonomic_info[species] for species in taxonomic_info.keys() if 'superkingdom' in taxonomic_info[species].keys()} #keep only species with an assigned superkingdom
useful_tax_info = {species:useful_tax_info[species] for species in useful_tax_info.keys() if useful_tax_info[species]['superkingdom'] in ['Bacteria','Archaea']} #keep only species which are archaea or bacteria
# old filter: keyed by genome, tested the mapped species against useful_tax_info.
to_analyze = {genome:to_analyze[genome] for genome in to_analyze.keys() if to_analyze[genome] in useful_tax_info.keys()} #analyze only those genomes with available species taxonomic information
logger.info('number of species to be predicted: '+str(len(list(set(to_analyze.values())))))
logger.info('number of genomes to be analyzed: '+str(len(to_analyze.keys())))
# new filter: keyed by species, tested directly against useful_tax_info.
to_analyze = {species:to_analyze[species] for species in to_analyze.keys() if species in useful_tax_info.keys()} #analyze only those genomes with available species taxonomic information
logger.info('number of species to be predicted: '+str(len(to_analyze.keys())))
logger.info('number of genomes to be analyzed: '+str(sum([len(y) for y in to_analyze.values()])))

#calculate genome features
# hand the filtered taxonomy dict to genome_analysis before any genome is processed
genome_analysis.setup(useful_tax_info)
Expand Down
2 changes: 1 addition & 1 deletion regression/genome_species_assignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
logger.info('Genome-barrnap assignment file: '+genome_assignment_file)

# NOTE(review): this view is a rendered diff whose +/- markers were lost. Going
# by the file header ("1 addition & 1 deletion"), the first genome_species
# assignment is the old line and the second its replacement - confirm against
# the repository.
f = open(species_genome_file,'r')
# old version: key is the genome file name (column 0) with its last
# dot-separated suffix dropped; value is the species (column 1).
genome_species = {'.'.join(line.strip().split('\t')[0].split('.')[:-1]):line.strip().split('\t')[1] for line in f.readlines()}
# new version: same value, but the key is prefixed with the species and '/'.
# Presumably this matches the species+'/'+genome_file identifiers returned by
# features_per_genome elsewhere in this commit - verify.
# NOTE(review): a line without a tab makes the [1] lookup raise IndexError.
genome_species = {line.strip().split('\t')[1]+'/'+'.'.join(line.strip().split('\t')[0].split('.')[:-1]):line.strip().split('\t')[1] for line in f.readlines()}
f.close()
logger.info('Number of genomes with an assigned species: '+str(len(genome_species.keys())))

Expand Down

0 comments on commit 9d70be3

Please sign in to comment.