From ae4a15b1429e9f3845542474827dadd4e91fdb60 Mon Sep 17 00:00:00 2001 From: pchaumeil Date: Fri, 13 Apr 2018 13:15:11 +1000 Subject: [PATCH] release 83 tree.debug.error handling. --- bin/gtdbtk | 4 + gtdbtk/VERSION | 4 + gtdbtk/classify.py | 561 +++++++++++++++++---------------- gtdbtk/config/config.py | 202 ++++++------ gtdbtk/external/pfam_search.py | 64 ++-- gtdbtk/main.py | 9 +- 6 files changed, 453 insertions(+), 391 deletions(-) diff --git a/bin/gtdbtk b/bin/gtdbtk index 923c44db..5c119272 100755 --- a/bin/gtdbtk +++ b/bin/gtdbtk @@ -142,6 +142,8 @@ if __name__ == '__main__': help='desired prefix for output files') optional_classify_wf.add_argument('--cpus', default=1, type=int, help='number of CPUs to use') + optional_classify_wf.add_argument('--debug', action="store_true", + help='create intermediate files for debugging purposes.') optional_classify_wf.add_argument('-h', '--help', action="help", help="show help message") @@ -249,6 +251,8 @@ if __name__ == '__main__': help='desired prefix for output files') optional_classify.add_argument('--cpus', default=1, type=int, help='number of CPUs to use') + optional_classify.add_argument('--debug', action="store_true", + help='create intermediate files for debugging purposes.') optional_classify.add_argument('-h', '--help', action="help", help="show help message") diff --git a/gtdbtk/VERSION b/gtdbtk/VERSION index 0f916e9a..73ae9d12 100644 --- a/gtdbtk/VERSION +++ b/gtdbtk/VERSION @@ -1,3 +1,7 @@ +0.0.6 +- Migration to R83 and integration of UBA genomes +- add "debug" flag to classify options +- error handling improvement 0.0.5 - stable version for pip 0.0.4b3 diff --git a/gtdbtk/classify.py b/gtdbtk/classify.py index b6a35a34..b8bac16d 100644 --- a/gtdbtk/classify.py +++ b/gtdbtk/classify.py @@ -106,281 +106,295 @@ def run(self, genomes, align_dir, out_dir, - prefix): - """Classify genomes based on position in reference tree.""" - - for marker_set_id in ('bac120', 'ar122'): - user_msa_file = os.path.join(align_dir, prefix+'.%s.user_msa.fasta' % marker_set_id) - if not os.path.exists(user_msa_file): - # file will not exist if there are no User genomes from a given domain - continue - - classify_tree = self.place_genomes(user_msa_file, - marker_set_id, - out_dir, - prefix) - - # get taxonomic classification of each user genome - tree = dendropy.Tree.get_from_path(classify_tree, - schema='newick', - rooting='force-rooted', - preserve_underscores=True) - - gtdb_taxonomy = Taxonomy().read(self.taxonomy_file) - - - fout = open(os.path.join(out_dir, prefix + '.%s.classification.tsv' % marker_set_id), 'w') - fastaniout = open(os.path.join(out_dir, prefix + '.%s.fastani_results.tsv' % marker_set_id), 'w') - redfout = open(os.path.join(out_dir, prefix + '.%s.summary.tsv' % marker_set_id), 'w') - #parchiinfo = open(os.path.join(out_dir, prefix + '.%s.parentinfo.tsv' % marker_set_id), 'w') - - reddictfile = open(os.path.join(out_dir, prefix + '.%s.red_dictionary.tsv' % marker_set_id), 'w') - - marker_dict = {} - if marker_set_id == 'bac120': - marker_dict = Config.RED_DIST_BAC_DICT - elif marker_set_id == 'ar122': - marker_dict = Config.RED_DIST_ARC_DICT - reddictfile.write('Phylum\t{0}\n'.format(marker_dict.get('p__'))) - reddictfile.write('Class\t{0}\n'.format(marker_dict.get('c__'))) - reddictfile.write('Order\t{0}\n'.format(marker_dict.get('o__'))) - reddictfile.write('Family\t{0}\n'.format(marker_dict.get('f__'))) - reddictfile.write('Genus\t{0}\n'.format(marker_dict.get('g__'))) - reddictfile.close() - - fastaniout.write("User genome\tReference genome\tANI\n") - #redfout.write("User genome\tRed value\tHigher rank\tHigher value\tLower rank\tLower value\tcase\tclosest_rank\tTool\n") - redfout.write("user_genome\tclassification_method\tred_value\n") - #parchiinfo.write("User genome\tHigher rank\tHigher value\tLower rank\tLower value\tcase\tclosest_rank\n") - + prefix, + debugopt = False ): + try: + """Classify genomes based on position in reference tree.""" - # Genomes can be classified by using Mash or RED values - # We go through all leaves of the tree. if the leaf is a user genome we take it's parent node and look at all the leaves for this node. - # If the parent node has only one Reference genome ( GB or RS ) we calculate the mash distance between the user genome and the reference genome - analysed_nodes = [] - fastani_dict = {} - all_fastani_dict = {} - - fastani_list = [] - # some genomes of Case C are handled here, if Mash distance is close enough - self.logger.info('Calculating Average Nucleotide Identity using FastANI.') - - for nd in tree.preorder_node_iter(): - #We store the prefixes of each leaves to check if one starts with GB_ or RS_ - list_subnode_initials = [subnd.taxon.label.replace("'",'')[0:3] for subnd in nd.leaf_iter()] - list_subnode = [subnd.taxon.label.replace("'",'') for subnd in nd.leaf_iter()] - #if only one genome is a reference genome - if (list_subnode_initials.count('RS_') + list_subnode_initials.count('GB_')) == 1 and len(list_subnode_initials) > 1 and list_subnode[0] not in analysed_nodes: - fastani_list.append(list_subnode) - analysed_nodes.extend(list_subnode) - - manager = multiprocessing.Manager() - out_q = manager.dict() - procs = [] - nprocs = self.cpus - if len(fastani_list) > 0: - for item in splitchunks_list(fastani_list, nprocs): - p = multiprocessing.Process( - target=self._fastaniWorker, - args=(item, genomes, out_q)) - procs.append(p) - p.start() - - # Collect all results into a single result dict. We know how many dicts - # with results to expect. - #while out_q.empty(): - # time.sleep(1) - - # Wait for all worker processes to finish - for p in procs: - p.join() + for marker_set_id in ('bac120', 'ar122'): + user_msa_file = os.path.join(align_dir, prefix+'.%s.user_msa.fasta' % marker_set_id) + if not os.path.exists(user_msa_file): + # file will not exist if there are no User genomes from a given domain + continue + + classify_tree = self.place_genomes(user_msa_file, + marker_set_id, + out_dir, + prefix) + + # get taxonomic classification of each user genome + tree = dendropy.Tree.get_from_path(classify_tree, + schema='newick', + rooting='force-rooted', + preserve_underscores=True) + + gtdb_taxonomy = Taxonomy().read(self.taxonomy_file) + + + fout = open(os.path.join(out_dir, prefix + '.%s.classification.tsv' % marker_set_id), 'w') + fastaniout = open(os.path.join(out_dir, prefix + '.%s.fastani_results.tsv' % marker_set_id), 'w') + redfout = open(os.path.join(out_dir, prefix + '.%s.summary.tsv' % marker_set_id), 'w') + if debugopt: + parchiinfo = open(os.path.join(out_dir, prefix + '.%s.debug_file.tsv' % marker_set_id), 'w') + + reddictfile = open(os.path.join(out_dir, prefix + '.%s.red_dictionary.tsv' % marker_set_id), 'w') + + marker_dict = {} + if marker_set_id == 'bac120': + marker_dict = Config.RED_DIST_BAC_DICT + elif marker_set_id == 'ar122': + marker_dict = Config.RED_DIST_ARC_DICT + reddictfile.write('Phylum\t{0}\n'.format(marker_dict.get('p__'))) + reddictfile.write('Class\t{0}\n'.format(marker_dict.get('c__'))) + reddictfile.write('Order\t{0}\n'.format(marker_dict.get('o__'))) + reddictfile.write('Family\t{0}\n'.format(marker_dict.get('f__'))) + reddictfile.write('Genus\t{0}\n'.format(marker_dict.get('g__'))) + reddictfile.close() + + fastaniout.write("User genome\tReference genome\tANI\n") + redfout.write("user_genome\tclassification_method\tred_value\n") + if debugopt: + parchiinfo.write("User genome\tHigher rank\tHigher value\tLower rank\tLower value\tcase\tclosest_rank\n") + + + # Genomes can be classified by using Mash or RED values + # We go through all leaves of the tree. if the leaf is a user genome we take it's parent node and look at all the leaves for this node. + # If the parent node has only one Reference genome ( GB or RS ) we calculate the mash distance between the user genome and the reference genome + analysed_nodes = [] + fastani_dict = {} + all_fastani_dict = {} + + fastani_list = [] + # some genomes of Case C are handled here, if Mash distance is close enough + self.logger.info('Calculating Average Nucleotide Identity using FastANI.') + + for nd in tree.preorder_node_iter(): + #We store the prefixes of each leaves to check if one starts with GB_ or RS_ + list_subnode_initials = [subnd.taxon.label.replace("'",'')[0:3] for subnd in nd.leaf_iter()] + list_subnode = [subnd.taxon.label.replace("'",'') for subnd in nd.leaf_iter()] + #if only one genome is a reference genome + if (list_subnode_initials.count('RS_') + list_subnode_initials.count('GB_')+ list_subnode_initials.count('UBA')) == 1 and len(list_subnode_initials) > 1 and list_subnode[0] not in analysed_nodes: + fastani_list.append(list_subnode) + analysed_nodes.extend(list_subnode) + + manager = multiprocessing.Manager() + out_q = manager.dict() + procs = [] + nprocs = self.cpus + if len(fastani_list) > 0: + for item in splitchunks_list(fastani_list, nprocs): + p = multiprocessing.Process( + target=self._fastaniWorker, + args=(item, genomes, out_q)) + procs.append(p) + p.start() - all_fastani_dict = dict(out_q) - + # Collect all results into a single result dict. We know how many dicts + # with results to expect. + #while out_q.empty(): + # time.sleep(1) + + # Wait for all worker processes to finish + for p in procs: + p.join() + if p.exitcode == 1: + raise ValueError("Stop!!") + + all_fastani_dict = dict(out_q) - - for k,v in all_fastani_dict.iteritems(): - fastaniout.write("{0}\t{1}\t{2}\n".format(k,v.get("ref_genome"),v.get("ani"))) - if Config.FASTANI_SPECIES_THRESHOLD < v.get("ani"): - suffixed_name = add_ncbi_prefix(v.get("ref_genome")) - taxa_str = ";".join(gtdb_taxonomy.get(suffixed_name)) - if taxa_str.endswith("s__"): - taxa_str = taxa_str+v.get("ref_genome") - fout.write('%s\t%s\n' % (k, taxa_str)) - fastani_dict[k]=v - redfout.write("{0}\tani\tNone\n".format(k)) - fastaniout.close() - - self.logger.info('{0} genomes have been classify with FastANI.'.format(len(fastani_dict))) - - - - scaled_tree = self._calculate_red_distances(classify_tree, out_dir) - - - user_genome_ids = set(read_fasta(user_msa_file).keys()) - user_genome_ids = user_genome_ids.difference(set(fastani_dict.keys())) - # for all other cases we measure the RED distance between a leaf and a parent node ( RED = 1-edge_length). This RED value will tell us - # the rank level that can be associated with a User genome. - # As an example if the RED value is close to the order level, the user genome will take the order level of the Reference genome under the same parent node. - # Is there are multiple orders under the parent node. The user genome is considered as a new order - for leaf in scaled_tree.leaf_node_iter(): - if leaf.taxon.label in user_genome_ids: - taxa = [] - # In some cases , pplacer can associate 2 user genomes on the same parent node so we need to go up the tree to find a node with a reference genome as leaf. - cur_node = leaf.parent_node - list_subnode_initials = [subnd.taxon.label.replace("'",'')[0:3] for subnd in cur_node.leaf_iter()] - while 'RS_' not in list_subnode_initials and 'GB_' not in list_subnode_initials: - cur_node = cur_node.parent_node + for k,v in all_fastani_dict.iteritems(): + fastaniout.write("{0}\t{1}\t{2}\n".format(k,v.get("ref_genome"),v.get("ani"))) + if Config.FASTANI_SPECIES_THRESHOLD <= v.get("ani"): + suffixed_name = add_ncbi_prefix(v.get("ref_genome")) + taxa_str = ";".join(gtdb_taxonomy.get(suffixed_name)) + if taxa_str.endswith("s__"): + taxa_str = taxa_str+v.get("ref_genome") + fout.write('%s\t%s\n' % (k, taxa_str)) + fastani_dict[k]=v + redfout.write("{0}\tani\tNone\n".format(k)) + fastaniout.close() + + self.logger.info('{0} genomes have been classify with FastANI.'.format(len(fastani_dict))) + + + + scaled_tree = self._calculate_red_distances(classify_tree, out_dir) + + + user_genome_ids = set(read_fasta(user_msa_file).keys()) + user_genome_ids = user_genome_ids.difference(set(fastani_dict.keys())) + # for all other cases we measure the RED distance between a leaf and a parent node ( RED = 1-edge_length). This RED value will tell us + # the rank level that can be associated with a User genome. + # As an example if the RED value is close to the order level, the user genome will take the order level of the Reference genome under the same parent node. + # Is there are multiple orders under the parent node. The user genome is considered as a new order + for leaf in scaled_tree.leaf_node_iter(): + if leaf.taxon.label in user_genome_ids: + taxa = [] + # In some cases , pplacer can associate 2 user genomes on the same parent node so we need to go up the tree to find a node with a reference genome as leaf. + cur_node = leaf.parent_node list_subnode_initials = [subnd.taxon.label.replace("'",'')[0:3] for subnd in cur_node.leaf_iter()] - - current_rel_list = cur_node.rel_dist - - parent_taxon_node = cur_node.parent_node - _support, parent_taxon, _aux_info = parse_label(parent_taxon_node.label) - - - while parent_taxon_node is not None and not parent_taxon: - parent_taxon_node =parent_taxon_node.parent_node - _support, parent_taxon, _aux_info = parse_label(parent_taxon_node.label) - - parent_rank = parent_taxon.split(";")[-1][0:3] - parent_rel_dist = parent_taxon_node.rel_dist - - - genome_parent_child = [leaf.taxon.label,parent_rank,parent_rel_dist,'','','',''] - - child_taxons = [] - closest_rank = None; - detection = "RED" - # if the genome is placed between the genus and specie ranks , it will be associated with the genus when _get_closest_red_rank is called - if parent_rank != 'g__': - child_rk = self.order_rank[self.order_rank.index(parent_rank)+1] - list_subnode = [childnd.taxon.label.replace("'",'') for childnd in cur_node.leaf_iter() if (childnd.taxon.label.startswith('RS_') or childnd.taxon.label.startswith('GB_'))] - list_ranks = [gtdb_taxonomy.get(name)[self.order_rank.index(child_rk)] for name in list_subnode] - if len(set(list_ranks)) == 1: - for subranknd in cur_node.preorder_iter(): - _support, subranknd_taxon, _aux_info = parse_label(subranknd.label) - if subranknd.is_internal() and subranknd_taxon is not None and subranknd_taxon.startswith(child_rk): - child_taxons = subranknd_taxon.split(";") - child_taxon_node = subranknd - child_rel_dist = child_taxon_node.rel_dist - break + while 'RS_' not in list_subnode_initials and 'GB_' not in list_subnode_initials and 'UBA' not in list_subnode_initials: + cur_node = cur_node.parent_node + list_subnode_initials = [subnd.taxon.label.replace("'",'')[0:3] for subnd in cur_node.leaf_iter()] + + current_rel_list = cur_node.rel_dist + + parent_taxon_node = cur_node.parent_node + _support, parent_taxon, _aux_info = parse_label(parent_taxon_node.label) + + + while parent_taxon_node is not None and not parent_taxon: + parent_taxon_node =parent_taxon_node.parent_node + _support, parent_taxon, _aux_info = parse_label(parent_taxon_node.label) + + parent_rank = parent_taxon.split(";")[-1][0:3] + parent_rel_dist = parent_taxon_node.rel_dist + + + genome_parent_child = [leaf.taxon.label,parent_rank,parent_rel_dist,'','','',''] + + child_taxons = [] + closest_rank = None; + detection = "RED" + # if the genome is placed between the genus and specie ranks , it will be associated with the genus when _get_closest_red_rank is called + if parent_rank != 'g__': + child_rk = self.order_rank[self.order_rank.index(parent_rank)+1] + list_subnode = [childnd.taxon.label.replace("'",'') for childnd in cur_node.leaf_iter() if (childnd.taxon.label.startswith('RS_') or childnd.taxon.label.startswith('GB_'))] + list_ranks = [gtdb_taxonomy.get(name)[self.order_rank.index(child_rk)] for name in list_subnode] + if len(set(list_ranks)) == 1: + for subranknd in cur_node.preorder_iter(): + _support, subranknd_taxon, _aux_info = parse_label(subranknd.label) + if subranknd.is_internal() and subranknd_taxon is not None and subranknd_taxon.startswith(child_rk): + child_taxons = subranknd_taxon.split(";") + child_taxon_node = subranknd + child_rel_dist = child_taxon_node.rel_dist + break + else: + #case 2a and 2b + closest_rank = parent_rank; + detection = "Topology" else: - #case 2a and 2b + #case 1a closest_rank = parent_rank; detection = "Topology" - else: - #case 1a - closest_rank = parent_rank; - detection = "Topology" - - #case 1b - if len(child_taxons) == 0 and closest_rank is None: - list_leaves = [childnd.taxon.label.replace("'",'') for childnd in cur_node.leaf_iter() if (childnd.taxon.label.startswith('RS_') or childnd.taxon.label.startswith('GB_'))] - if len(list_leaves) != 1 : - self.logger.error('There should be only one leaf.') - sys.exit(-1) - list_leaf_ranks = gtdb_taxonomy.get(list_leaves[0])[self.order_rank.index(child_rk):-1] - for leaf_taxon in reversed(list_leaf_ranks): - if leaf_taxon == list_leaf_ranks[0]: - if abs(current_rel_list - marker_dict.get(leaf_taxon[:3])) < abs((current_rel_list) - marker_dict.get(parent_rank)): - #and current_rel_list - marker_dict.get(leaf_taxon[:3]) > 0 ): - closest_rank = leaf_taxon[:3] - genome_parent_child[3]=leaf_taxon - genome_parent_child[5]='case 1b - III' - break + + #case 1b + if len(child_taxons) == 0 and closest_rank is None: + list_leaves = [childnd.taxon.label.replace("'",'') for childnd in cur_node.leaf_iter() if (childnd.taxon.label.startswith('RS_') or childnd.taxon.label.startswith('GB_'))] + if len(list_leaves) != 1 : + self.logger.error('There should be only one leaf.') + sys.exit(-1) + list_leaf_ranks = gtdb_taxonomy.get(list_leaves[0])[self.order_rank.index(child_rk):-1] + for leaf_taxon in reversed(list_leaf_ranks): + if leaf_taxon == list_leaf_ranks[0]: + if abs(current_rel_list - marker_dict.get(leaf_taxon[:3])) < abs((current_rel_list) - marker_dict.get(parent_rank)): + #and current_rel_list - marker_dict.get(leaf_taxon[:3]) > 0 ): + closest_rank = leaf_taxon[:3] + genome_parent_child[3]=leaf_taxon + genome_parent_child[5]='case 1b - III' + break + else: + pchildrank = list_leaf_ranks[list_leaf_ranks.index(leaf_taxon)-1] + if abs(current_rel_list - marker_dict.get(leaf_taxon[:3])) < abs(current_rel_list-marker_dict.get(pchildrank[:3])): + #and current_rel_list - marker_dict.get(leaf_taxon[:3]) > 0 ) : + closest_rank = leaf_taxon[:3] + genome_parent_child[1]=pchildrank + genome_parent_child[2]=1.0 + genome_parent_child[3]= leaf_taxon + genome_parent_child[5]='case 1b - II' + break + if closest_rank is None: + closest_rank = parent_rank + genome_parent_child[3]=list_leaf_ranks[0] + genome_parent_child[5]='case 1b - IV' + + + #if there is multiple ranks on the child node (i.e genome between p__Nitrospirae and c__Nitrospiria;o__Nitrospirales;f__Nitropiraceae) + #we loop through the list of rank from f_ to c_ rank + for child_taxon in reversed(child_taxons): + # if lower rank is c__Nitropiria + if child_taxon == child_taxons[0]: + if ( abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(child_rel_dist -marker_dict.get(child_taxon[:3])) + and abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(current_rel_list - marker_dict.get(parent_rank))) : + genome_parent_child[3]=';'.join(child_taxons) + genome_parent_child[4]=child_rel_dist + genome_parent_child[5]='case 3b - II' + closest_rank = child_taxon[:3] + elif closest_rank is None: + closest_rank = parent_rank + genome_parent_child[3]=';'.join(child_taxons) + genome_parent_child[4]=child_rel_dist + genome_parent_child[5]='case 3b - III' else: - pchildrank = list_leaf_ranks[list_leaf_ranks.index(leaf_taxon)-1] - if abs(current_rel_list - marker_dict.get(leaf_taxon[:3])) < abs(current_rel_list-marker_dict.get(pchildrank[:3])): - #and current_rel_list - marker_dict.get(leaf_taxon[:3]) > 0 ) : - closest_rank = leaf_taxon[:3] - genome_parent_child[1]=pchildrank - genome_parent_child[2]=1.0 - genome_parent_child[3]= leaf_taxon - genome_parent_child[5]='case 1b - II' + pchildrank = child_taxons[child_taxons.index(child_taxon)-1] + if (abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(current_rel_list - marker_dict.get(pchildrank[:3])) + and abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(child_rel_dist -marker_dict.get(child_taxon[:3]))) : + closest_rank = child_taxon + genome_parent_child[3]=';'.join(child_taxons) + genome_parent_child[4]=child_rel_dist + genome_parent_child[5]='case 3b - I' break + + + # case 1b if closest_rank is None: - closest_rank = parent_rank - genome_parent_child[3]=list_leaf_ranks[0] - genome_parent_child[5]='case 1b - IV' + print "IT SHOULDN'T HAPPEN!!!" - - #if there is multiple ranks on the child node (i.e genome between p__Nitrospirae and c__Nitrospiria;o__Nitrospirales;f__Nitropiraceae) - #we loop through the list of rank from f_ to c_ rank - for child_taxon in reversed(child_taxons): - # if lower rank is c__Nitropiria - if child_taxon == child_taxons[0]: - if ( abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(child_rel_dist -marker_dict.get(child_taxon[:3])) - and abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(current_rel_list - marker_dict.get(parent_rank))) : - genome_parent_child[3]=';'.join(child_taxons) - genome_parent_child[4]=child_rel_dist - genome_parent_child[5]='case 3b - II' - closest_rank = child_taxon[:3] - elif closest_rank is None: - closest_rank = parent_rank - genome_parent_child[3]=';'.join(child_taxons) - genome_parent_child[4]=child_rel_dist - genome_parent_child[5]='case 3b - III' - else: - pchildrank = child_taxons[child_taxons.index(child_taxon)-1] - if (abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(current_rel_list - marker_dict.get(pchildrank[:3])) - and abs(current_rel_list - marker_dict.get(child_taxon[:3])) < abs(child_rel_dist -marker_dict.get(child_taxon[:3]))) : - closest_rank = child_taxon - genome_parent_child[3]=';'.join(child_taxons) - genome_parent_child[4]=child_rel_dist - genome_parent_child[5]='case 3b - I' - break - - - # case 1b - if closest_rank is None: - print "IT SHOULDN'T HAPPEN!!!" + + genome_parent_child[6]=closest_rank + + list_subnode = [subnd.taxon.label.replace("'",'') for subnd in cur_node.leaf_iter()] + red_taxonomy = self._get_redtax(list_subnode,closest_rank,gtdb_taxonomy) - genome_parent_child[6]=closest_rank - - - list_subnode = [subnd.taxon.label.replace("'",'') for subnd in cur_node.leaf_iter()] - red_taxonomy = self._get_redtax(list_subnode,closest_rank,gtdb_taxonomy) - - fout.write('{0}\t{1}\n'.format(leaf.taxon.label, red_taxonomy)) - del genome_parent_child[0] - redfout.write("{0}\t{1}\t{2}\n".format(leaf.taxon.label,detection,current_rel_list)) - #redfout.write('{0}\t{1}\t{2}\t{3}\n'.format(leaf.taxon.label,current_rel_list,'\t'.join(str(x) for x in genome_parent_child),detection)) - - redfout.close() - fout.close() - - pplaceout = open(os.path.join(out_dir, prefix + '.%s.classification_pplacer.tsv' % marker_set_id), 'w') - - # We get the pplacer taxonomy for comparison - user_genome_ids = set(read_fasta(user_msa_file).keys()) - for leaf in tree.leaf_node_iter(): - if leaf.taxon.label in user_genome_ids: - taxa = [] - cur_node = leaf - while cur_node.parent_node: - _support, taxon, _aux_info = parse_label(cur_node.label) - if taxon: - for t in taxon.split(';')[::-1]: - taxa.append(t.strip()) - cur_node = cur_node.parent_node - taxa_str = ';'.join(taxa[::-1]) - pplaceout.write('%s\t%s\n' % (leaf.taxon.label, taxa_str)) - pplaceout.close() + fout.write('{0}\t{1}\n'.format(leaf.taxon.label, red_taxonomy)) + del genome_parent_child[0] + redfout.write("{0}\t{1}\t{2}\n".format(leaf.taxon.label,detection,current_rel_list)) + if debugopt: + parchiinfo.write('{0}\t{1}\t{2}\t{3}\n'.format(leaf.taxon.label,current_rel_list,'\t'.join(str(x) for x in genome_parent_child),detection)) + + redfout.close() + fout.close() + if debugopt: + parchiinfo.close() + + pplaceout = open(os.path.join(out_dir, prefix + '.%s.classification_pplacer.tsv' % marker_set_id), 'w') + + # We get the pplacer taxonomy for comparison + user_genome_ids = set(read_fasta(user_msa_file).keys()) + for leaf in tree.leaf_node_iter(): + if leaf.taxon.label in user_genome_ids: + taxa = [] + cur_node = leaf + while cur_node.parent_node: + _support, taxon, _aux_info = parse_label(cur_node.label) + if taxon: + for t in taxon.split(';')[::-1]: + taxa.append(t.strip()) + cur_node = cur_node.parent_node + taxa_str = ';'.join(taxa[::-1]) + pplaceout.write('%s\t%s\n' % (leaf.taxon.label, taxa_str)) + pplaceout.close() + except ValueError as error: + print "GTDB-Tk has stopped before finishing" + sys.exit(-1) + except Exception as error: + print "GTDB-Tk has stopped before finishing" + sys.exit(-1) def _fastaniWorker(self, sublist_genomes, genomes, out_q): - - - for items in sublist_genomes: - dict_parser_distance = self._calculate_fastani_distance(items, genomes) - for k,v in dict_parser_distance.iteritems(): - if k in out_q: - print "It should not happen (if k in out_q)" - out_q[k]=v - return True + try: + for items in sublist_genomes: + dict_parser_distance = self._calculate_fastani_distance(items, genomes) + for k,v in dict_parser_distance.iteritems(): + if k in out_q: + print "It should not happen (if k in out_q)" + out_q[k]=v + return True + except Exception as error: + print error + raise def _parse_subtree(self,cur_node,dict_subrank,lower_rank): @@ -455,7 +469,7 @@ def _parse_subnodes(self,list_subnode,closest_rank,gtdb_taxonomy): initial_loop = True for item in list_subnode: # We get the taxonomy of all reference genomes - if item.startswith('RS_') or item.startswith('GB_'): + if item.startswith('RS_') or item.startswith('GB_') or item.startswith('UBA'): taxonomy_from_file = gtdb_taxonomy.get(item) # we store the selected rank (i.e. order) for each reference genome for rank in taxonomy_from_file: @@ -572,10 +586,12 @@ def _calculate_fastani_distance(self,list_leaf,genomes): ref_list_file = open(os.path.join(self.tmp_output_dir, 'ref_list.txt'),'w') make_sure_path_exists(self.tmp_output_dir) for leaf in list_leaf: - if not leaf.startswith('GB_') and not leaf.startswith('RS_'): + if not leaf.startswith('GB_') and not leaf.startswith('RS_') and not leaf.startswith('UBA'): query_list_file.write('{0}\n'.format(genomes.get(leaf))) else: - shortleaf = leaf[3:] + shortleaf = leaf + if leaf.startswith('GB_') or leaf.startswith('RS_'): + shortleaf = leaf[3:] ref_list_file.write('{0}{1}{2}\n'.format(Config.FASTANI_GENOMES,shortleaf,Config.FASTANI_GENOMES_EXT)) query_list_file.close() @@ -584,19 +600,30 @@ def _calculate_fastani_distance(self,list_leaf,genomes): if not os.path.isfile(os.path.join(self.tmp_output_dir, 'query_list.txt')) or not os.path.isfile(os.path.join(self.tmp_output_dir, 'ref_list.txt')): raise - cmd = 'fastANI --ql {0} --rl {1} -o {2} > /dev/null 2>&1'.format(os.path.join(self.tmp_output_dir, 'query_list.txt'), + cmd = 'fastANI --ql {0} --rl {1} -o {2} > /dev/null 2>{3}'.format(os.path.join(self.tmp_output_dir, 'query_list.txt'), os.path.join(self.tmp_output_dir, 'ref_list.txt'), - os.path.join(self.tmp_output_dir, 'results.tab')) + os.path.join(self.tmp_output_dir, 'results.tab'), + os.path.join(self.tmp_output_dir, 'error.log')) os.system(cmd) - + if not os.path.isfile(os.path.join(self.tmp_output_dir,'results.tab')): - raise + errstr = 'FastANI has stopped:\n' + if os.path.isfile(os.path.join(self.tmp_output_dir, 'error.log')): + with open(os.path.join(self.tmp_output_dir, 'error.log')) as debug: + for line in debug: + finalline =line + errstr +=finalline + raise ValueError(errstr) dict_parser_distance = self._parse_fastani_results(os.path.join(self.tmp_output_dir,'results.tab'),list_leaf) shutil.rmtree(self.tmp_output_dir) return dict_parser_distance - except: + except ValueError as error: + if os.path.exists(self.tmp_output_dir): + shutil.rmtree(self.tmp_output_dir) + raise + except Exception as error: if os.path.exists(self.tmp_output_dir): shutil.rmtree(self.tmp_output_dir) raise diff --git a/gtdbtk/config/config.py b/gtdbtk/config/config.py index 8dbd02da..98e97e55 100644 --- a/gtdbtk/config/config.py +++ b/gtdbtk/config/config.py @@ -1,92 +1,110 @@ -# Annotation folder -MARKER_GENE_DIR = "marker_genes" - -GENOME_FILE_SUFFIX = "_genomic.fna" - -BAC120_MARKERS = {"PFAM": ["PF00380.14.hmm", "PF00410.14.hmm", "PF00466.15.hmm", "PF01025.14.hmm", "PF02576.12.hmm", "PF03726.9.hmm"], - "TIGRFAM": ["TIGR00006.HMM", "TIGR00019.HMM", "TIGR00020.HMM", "TIGR00029.HMM", "TIGR00043.HMM", "TIGR00054.HMM", "TIGR00059.HMM", "TIGR00061.HMM", "TIGR00064.HMM", - "TIGR00065.HMM", "TIGR00082.HMM", "TIGR00083.HMM", "TIGR00084.HMM", "TIGR00086.HMM", "TIGR00088.HMM", "TIGR00090.HMM", "TIGR00092.HMM", "TIGR00095.HMM", - "TIGR00115.HMM", "TIGR00116.HMM", "TIGR00138.HMM", "TIGR00158.HMM", "TIGR00166.HMM", "TIGR00168.HMM", "TIGR00186.HMM", "TIGR00194.HMM", "TIGR00250.HMM", - "TIGR00337.HMM", "TIGR00344.HMM", "TIGR00362.HMM", "TIGR00382.HMM", "TIGR00392.HMM", "TIGR00396.HMM", "TIGR00398.HMM", "TIGR00414.HMM", "TIGR00416.HMM", - "TIGR00420.HMM", "TIGR00431.HMM", "TIGR00435.HMM", "TIGR00436.HMM", "TIGR00442.HMM", "TIGR00445.HMM", "TIGR00456.HMM", "TIGR00459.HMM", "TIGR00460.HMM", - "TIGR00468.HMM", "TIGR00472.HMM", "TIGR00487.HMM", "TIGR00496.HMM", "TIGR00539.HMM", "TIGR00580.HMM", "TIGR00593.HMM", "TIGR00615.HMM", "TIGR00631.HMM", - "TIGR00634.HMM", "TIGR00635.HMM", "TIGR00643.HMM", "TIGR00663.HMM", "TIGR00717.HMM", "TIGR00755.HMM", "TIGR00810.HMM", "TIGR00922.HMM", "TIGR00928.HMM", - "TIGR00959.HMM", "TIGR00963.HMM", "TIGR00964.HMM", "TIGR00967.HMM", "TIGR01009.HMM", "TIGR01011.HMM", "TIGR01017.HMM", "TIGR01021.HMM", "TIGR01029.HMM", - "TIGR01032.HMM", "TIGR01039.HMM", "TIGR01044.HMM", "TIGR01059.HMM", "TIGR01063.HMM", "TIGR01066.HMM", "TIGR01071.HMM", "TIGR01079.HMM", "TIGR01082.HMM", - "TIGR01087.HMM", "TIGR01128.HMM", "TIGR01146.HMM", "TIGR01164.HMM", "TIGR01169.HMM", "TIGR01171.HMM", "TIGR01302.HMM", "TIGR01391.HMM", "TIGR01393.HMM", - "TIGR01394.HMM", "TIGR01510.HMM", "TIGR01632.HMM", "TIGR01951.HMM", "TIGR01953.HMM", "TIGR02012.HMM", "TIGR02013.HMM", "TIGR02027.HMM", "TIGR02075.HMM", - "TIGR02191.HMM", "TIGR02273.HMM", "TIGR02350.HMM", "TIGR02386.HMM", "TIGR02397.HMM", "TIGR02432.HMM", "TIGR02729.HMM", "TIGR03263.HMM", "TIGR03594.HMM", - "TIGR03625.HMM", "TIGR03632.HMM", "TIGR03654.HMM", "TIGR03723.HMM", "TIGR03725.HMM", "TIGR03953.HMM"]} - -AR122_MARKERS = {"PFAM": ["PF01868.11.hmm", "PF01282.14.hmm", "PF01655.13.hmm", "PF01092.14.hmm", "PF01000.21.hmm", "PF00368.13.hmm", "PF00827.12.hmm", "PF01269.12.hmm", "PF00466.15.hmm", - "PF01015.13.hmm", "PF13685.1.hmm", "PF02978.14.hmm", "PF04919.7.hmm", "PF01984.15.hmm", "PF04104.9.hmm", "PF00410.14.hmm", "PF01798.13.hmm", "PF01864.12.hmm", - "PF01990.12.hmm", "PF07541.7.hmm", "PF04019.7.hmm", "PF00900.15.hmm", "PF01090.14.hmm", "PF02006.11.hmm", "PF01157.13.hmm", "PF01191.14.hmm", "PF01866.12.hmm", - "PF01198.14.hmm", "PF01496.14.hmm", "PF00687.16.hmm", "PF03874.11.hmm", "PF01194.12.hmm", "PF01200.13.hmm", "PF13656.1.hmm", "PF01280.15.hmm"], - "TIGRFAM": ["TIGR00468.HMM", "TIGR01060.HMM", "TIGR03627.HMM", "TIGR01020.HMM", "TIGR02258.HMM", "TIGR00293.HMM", "TIGR00389.HMM", "TIGR01012.HMM", "TIGR00490.HMM", "TIGR03677.HMM", - "TIGR03636.HMM", "TIGR03722.HMM", "TIGR00458.HMM", "TIGR00291.HMM", "TIGR00670.HMM", "TIGR00064.HMM", "TIGR03629.HMM", "TIGR00021.HMM", "TIGR03672.HMM", "TIGR00111.HMM", - "TIGR03684.HMM", "TIGR01077.HMM", "TIGR01213.HMM", "TIGR01080.HMM", "TIGR00501.HMM", "TIGR00729.HMM", "TIGR01038.HMM", "TIGR00270.HMM", "TIGR03628.HMM", "TIGR01028.HMM", - "TIGR00521.HMM", "TIGR03671.HMM", "TIGR00240.HMM", "TIGR02390.HMM", "TIGR02338.HMM", "TIGR00037.HMM", "TIGR02076.HMM", "TIGR00335.HMM", "TIGR01025.HMM", "TIGR00471.HMM", - "TIGR00336.HMM", "TIGR00522.HMM", "TIGR02153.HMM", "TIGR02651.HMM", "TIGR03674.HMM", "TIGR00323.HMM", "TIGR00134.HMM", "TIGR02236.HMM", "TIGR03683.HMM", "TIGR00491.HMM", - "TIGR00658.HMM", "TIGR03680.HMM", "TIGR00392.HMM", "TIGR00422.HMM", "TIGR00279.HMM", "TIGR01052.HMM", "TIGR00442.HMM", "TIGR00308.HMM", "TIGR00398.HMM", "TIGR00456.HMM", - "TIGR00549.HMM", "TIGR00408.HMM", "TIGR00432.HMM", "TIGR00264.HMM", "TIGR00982.HMM", "TIGR00324.HMM", "TIGR01952.HMM", "TIGR03626.HMM", "TIGR03670.HMM", "TIGR00337.HMM", - "TIGR01046.HMM", "TIGR01018.HMM", "TIGR00936.HMM", "TIGR00463.HMM", "TIGR01309.HMM", "TIGR03653.HMM", "TIGR00042.HMM", "TIGR02389.HMM", "TIGR00307.HMM", "TIGR03673.HMM", - "TIGR00373.HMM", "TIGR01008.HMM", "TIGR00283.HMM", "TIGR00425.HMM", "TIGR00405.HMM", "TIGR03665.HMM", "TIGR00448.HMM"]} - -RPS23_MARKERS = {"PFAM": ["PF00687.16.hmm","PF00466.15.hmm","PF00298.14.hmm","PF03946.9.hmm","PF00238.14.hmm", - "PF00252.13.hmm","PF00861.17.hmm","PF00181.18.hmm","PF00237.14.hmm", - "PF03947.13.hmm","PF00297.17.hmm","PF00573.17.hmm","PF00281.14.hmm", - "PF00673.16.hmm","PF00338.17.hmm","PF00411.14.hmm","PF00416.17.hmm", - "PF00312.17.hmm","PF00366.15.hmm","PF00203.16.hmm","PF00189.15.hmm", - "PF00333.15.hmm","PF03719.10.hmm","PF00177.16.hmm","PF00410.14.hmm", - "PF00380.14.hmm","PF00164.20.hmm"], - "TIGRFAM": []} - -GENERIC_PATH = '/srv/db/gtdbtk/' - -TIGRFAM_HMMS = '/srv/db/gtdbtk/markers//tigrfam/tigrfam.hmm' -PFAM_HMM_DIR = '/srv/db/gtdbtk/markers/pfam/' - - -# Information for aligning genomes -DEFAULT_DOMAIN_THRESHOLD = 10.0 -AR_MARKER_COUNT = 122 -BAC_MARKER_COUNT = 120 - -# Path to MSA and Taxonomy -MSA_FOLDER = GENERIC_PATH + "msa/" -CONCAT_BAC120 = MSA_FOLDER + "gtdb_r80_bac120.faa" -CONCAT_AR122 = MSA_FOLDER + "gtdb_r80_ar122.faa" -#CONCAT_RPS23 = MSA_FOLDER + "gtdb_r80_rps23.faa" - -TAX_FOLDER = GENERIC_PATH + "taxonomy/" -TAXONOMY_FILE = TAX_FOLDER + "gtdb_r80_taxonomy.tsv" - -# Path to canonical masks -MASK_DIR = GENERIC_PATH + "masks/" -MASK_BAC120 = "gtdb_r80_bac120.mask" -#MASK_BAC120 = "gtdb_r80_bac120_untrimmed.mask" -MASK_AR122 = "gtdb_r80_ar122.mask" -MASK_RPS23 = "gtdb_r80_rps23.mask" - -# Path to pplacer data -PPLACER_DIR = GENERIC_PATH + "pplacer/" -#PPLACER_DIR = GENERIC_PATH + "pplacer/notrimmed/" - -PPLACER_BAC120_REF_PKG = "gtdb_r80_bac120.refpkg" -PPLACER_AR122_REF_PKG = "gtdb_r80_ar122.refpkg" -PPLACER_RPS23_REF_PKG = "gtdb_r80_rps23.refpkg" - -FASTANI_DIR = GENERIC_PATH + "fastani/" -FASTANI_SPECIES_THRESHOLD = 95.0 -FASTANI_GENOMES = FASTANI_DIR + "database/" -FASTANI_GENOMES_EXT = GENOME_FILE_SUFFIX - -#Relative Evolution Distance -RED_MIN_SUPPORT = 0.0 -RED_MIN_CHILDREN = 2 -RED_DIST_BAC_DICT = {"d__":0.00,"p__":0.344460726186,"c__":0.492670538599,"o__":0.648322059106,"f__":0.78784418792,"g__":0.940152273956} -RED_DIST_ARC_DICT = {"d__":0.00,"p__":0.35531101963,"c__":0.508396357807,"o__":0.660914720355,"f__":0.728629307816,"g__":0.880665740097} - - - - +############################ +#EDIT FROM HERE +############################ + +# Path to MSA and Taxonomy +GENERIC_PATH = "/srv/db/gtdbtk/official/release83/" + +############################ +#If all downloaded data is in the same folder +# There is no need of editing variable below this point +############################ + + +TIGRFAM_HMMS = GENERIC_PATH + 'markers/tigrfam/tigrfam.hmm' +PFAM_HMM_DIR = GENERIC_PATH + 'markers/pfam/' +MSA_FOLDER = GENERIC_PATH + "msa/" +MASK_DIR = GENERIC_PATH + "masks/" +PPLACER_DIR = GENERIC_PATH + "pplacer/" +FASTANI_DIR = GENERIC_PATH + "fastani/" +TAX_FOLDER = GENERIC_PATH + "taxonomy/" + + + +############################ +#STOP EDIT +############################ + + + + + +BAC120_MARKERS = {"PFAM": ["PF00380.14.hmm", "PF00410.14.hmm", "PF00466.15.hmm", "PF01025.14.hmm", "PF02576.12.hmm", "PF03726.9.hmm"], + "TIGRFAM": ["TIGR00006.HMM", "TIGR00019.HMM", "TIGR00020.HMM", "TIGR00029.HMM", "TIGR00043.HMM", "TIGR00054.HMM", "TIGR00059.HMM", "TIGR00061.HMM", "TIGR00064.HMM", + "TIGR00065.HMM", "TIGR00082.HMM", "TIGR00083.HMM", "TIGR00084.HMM", "TIGR00086.HMM", "TIGR00088.HMM", "TIGR00090.HMM", "TIGR00092.HMM", "TIGR00095.HMM", + "TIGR00115.HMM", "TIGR00116.HMM", "TIGR00138.HMM", "TIGR00158.HMM", "TIGR00166.HMM", "TIGR00168.HMM", "TIGR00186.HMM", "TIGR00194.HMM", "TIGR00250.HMM", + "TIGR00337.HMM", "TIGR00344.HMM", "TIGR00362.HMM", "TIGR00382.HMM", "TIGR00392.HMM", "TIGR00396.HMM", "TIGR00398.HMM", "TIGR00414.HMM", "TIGR00416.HMM", + "TIGR00420.HMM", "TIGR00431.HMM", "TIGR00435.HMM", "TIGR00436.HMM", "TIGR00442.HMM", "TIGR00445.HMM", "TIGR00456.HMM", "TIGR00459.HMM", "TIGR00460.HMM", + "TIGR00468.HMM", "TIGR00472.HMM", "TIGR00487.HMM", "TIGR00496.HMM", "TIGR00539.HMM", "TIGR00580.HMM", "TIGR00593.HMM", "TIGR00615.HMM", "TIGR00631.HMM", + "TIGR00634.HMM", "TIGR00635.HMM", "TIGR00643.HMM", "TIGR00663.HMM", "TIGR00717.HMM", "TIGR00755.HMM", "TIGR00810.HMM", "TIGR00922.HMM", "TIGR00928.HMM", + "TIGR00959.HMM", "TIGR00963.HMM", "TIGR00964.HMM", "TIGR00967.HMM", "TIGR01009.HMM", "TIGR01011.HMM", "TIGR01017.HMM", "TIGR01021.HMM", "TIGR01029.HMM", + "TIGR01032.HMM", "TIGR01039.HMM", "TIGR01044.HMM", "TIGR01059.HMM", "TIGR01063.HMM", "TIGR01066.HMM", "TIGR01071.HMM", "TIGR01079.HMM", "TIGR01082.HMM", + "TIGR01087.HMM", "TIGR01128.HMM", "TIGR01146.HMM", "TIGR01164.HMM", "TIGR01169.HMM", "TIGR01171.HMM", "TIGR01302.HMM", "TIGR01391.HMM", "TIGR01393.HMM", + "TIGR01394.HMM", "TIGR01510.HMM", "TIGR01632.HMM", "TIGR01951.HMM", "TIGR01953.HMM", "TIGR02012.HMM", "TIGR02013.HMM", "TIGR02027.HMM", "TIGR02075.HMM", + "TIGR02191.HMM", "TIGR02273.HMM", "TIGR02350.HMM", "TIGR02386.HMM", "TIGR02397.HMM", "TIGR02432.HMM", "TIGR02729.HMM", "TIGR03263.HMM", "TIGR03594.HMM", + "TIGR03625.HMM", "TIGR03632.HMM", "TIGR03654.HMM", "TIGR03723.HMM", "TIGR03725.HMM", "TIGR03953.HMM"]} + +AR122_MARKERS = {"PFAM": ["PF01868.11.hmm", "PF01282.14.hmm", "PF01655.13.hmm", "PF01092.14.hmm", "PF01000.21.hmm", "PF00368.13.hmm", "PF00827.12.hmm", "PF01269.12.hmm", "PF00466.15.hmm", + "PF01015.13.hmm", "PF13685.1.hmm", "PF02978.14.hmm", "PF04919.7.hmm", "PF01984.15.hmm", "PF04104.9.hmm", "PF00410.14.hmm", "PF01798.13.hmm", "PF01864.12.hmm", + "PF01990.12.hmm", "PF07541.7.hmm", "PF04019.7.hmm", "PF00900.15.hmm", "PF01090.14.hmm", "PF02006.11.hmm", "PF01157.13.hmm", "PF01191.14.hmm", "PF01866.12.hmm", + "PF01198.14.hmm", "PF01496.14.hmm", "PF00687.16.hmm", "PF03874.11.hmm", "PF01194.12.hmm", "PF01200.13.hmm", "PF13656.1.hmm", "PF01280.15.hmm"], + "TIGRFAM": ["TIGR00468.HMM", "TIGR01060.HMM", "TIGR03627.HMM", "TIGR01020.HMM", "TIGR02258.HMM", "TIGR00293.HMM", "TIGR00389.HMM", "TIGR01012.HMM", "TIGR00490.HMM", "TIGR03677.HMM", + "TIGR03636.HMM", "TIGR03722.HMM", "TIGR00458.HMM", "TIGR00291.HMM", "TIGR00670.HMM", "TIGR00064.HMM", "TIGR03629.HMM", "TIGR00021.HMM", "TIGR03672.HMM", "TIGR00111.HMM", + "TIGR03684.HMM", "TIGR01077.HMM", "TIGR01213.HMM", "TIGR01080.HMM", "TIGR00501.HMM", "TIGR00729.HMM", "TIGR01038.HMM", "TIGR00270.HMM", "TIGR03628.HMM", "TIGR01028.HMM", + "TIGR00521.HMM", "TIGR03671.HMM", "TIGR00240.HMM", "TIGR02390.HMM", "TIGR02338.HMM", "TIGR00037.HMM", "TIGR02076.HMM", "TIGR00335.HMM", "TIGR01025.HMM", "TIGR00471.HMM", + "TIGR00336.HMM", "TIGR00522.HMM", "TIGR02153.HMM", "TIGR02651.HMM", "TIGR03674.HMM", "TIGR00323.HMM", "TIGR00134.HMM", "TIGR02236.HMM", "TIGR03683.HMM", "TIGR00491.HMM", + "TIGR00658.HMM", "TIGR03680.HMM", "TIGR00392.HMM", "TIGR00422.HMM", "TIGR00279.HMM", "TIGR01052.HMM", "TIGR00442.HMM", "TIGR00308.HMM", "TIGR00398.HMM", "TIGR00456.HMM", + "TIGR00549.HMM", "TIGR00408.HMM", "TIGR00432.HMM", "TIGR00264.HMM", "TIGR00982.HMM", "TIGR00324.HMM", "TIGR01952.HMM", "TIGR03626.HMM", "TIGR03670.HMM", "TIGR00337.HMM", + "TIGR01046.HMM", "TIGR01018.HMM", "TIGR00936.HMM", "TIGR00463.HMM", "TIGR01309.HMM", "TIGR03653.HMM", "TIGR00042.HMM", "TIGR02389.HMM", "TIGR00307.HMM", "TIGR03673.HMM", + "TIGR00373.HMM", "TIGR01008.HMM", "TIGR00283.HMM", "TIGR00425.HMM", "TIGR00405.HMM", "TIGR03665.HMM", "TIGR00448.HMM"]} + +RPS23_MARKERS = {"PFAM": ["PF00687.16.hmm","PF00466.15.hmm","PF00298.14.hmm","PF03946.9.hmm","PF00238.14.hmm", + "PF00252.13.hmm","PF00861.17.hmm","PF00181.18.hmm","PF00237.14.hmm", + "PF03947.13.hmm","PF00297.17.hmm","PF00573.17.hmm","PF00281.14.hmm", + "PF00673.16.hmm","PF00338.17.hmm","PF00411.14.hmm","PF00416.17.hmm", + "PF00312.17.hmm","PF00366.15.hmm","PF00203.16.hmm","PF00189.15.hmm", + "PF00333.15.hmm","PF03719.10.hmm","PF00177.16.hmm","PF00410.14.hmm", + "PF00380.14.hmm","PF00164.20.hmm"], + "TIGRFAM": []} + +# Information for aligning genomes +DEFAULT_DOMAIN_THRESHOLD = 10.0 +AR_MARKER_COUNT = 122 +BAC_MARKER_COUNT = 120 + +# Annotation folder +MARKER_GENE_DIR = "marker_genes" + + +#MSA file names +CONCAT_BAC120 = MSA_FOLDER + "gtdb_r83_bac120.faa" +CONCAT_AR122 = MSA_FOLDER + "gtdb_r83_ar122.faa" + +#Taxonomy file name +TAXONOMY_FILE = TAX_FOLDER + "gtdb_taxonomy.tsv" + +#Mask file names +MASK_BAC120 = "gtdb_r83_bac120.mask" +MASK_AR122 = "gtdb_r83_ar122.mask" +MASK_RPS23 = "gtdb_r83_rps23.mask" + + +#Pplacer configuration +PPLACER_OUT = "pplacer.out" +PPLACER_JSON_OUT = "pplacer.json" +PPLACER_BAC120_REF_PKG = "gtdb_r83_bac120.refpkg" +PPLACER_AR122_REF_PKG = "gtdb_r83_ar122.refpkg" +PPLACER_RPS23_REF_PKG = "gtdb_r83_rps23.refpkg" + +#Fastani configuration +FASTANI_SPECIES_THRESHOLD = 95.0 +FASTANI_GENOMES = FASTANI_DIR + "database/" +FASTANI_GENOMES_EXT = "_genomic.fna" + +#Relative Evolution Distance +RED_MIN_SUPPORT = 0.0 +RED_MIN_CHILDREN = 2 +RED_DIST_BAC_DICT = {"d__":0.00,"p__":0.325108639371,"c__":0.469996864411,"o__":0.634217579236,"f__":0.777062142455,"g__":0.93671712357} +RED_DIST_ARC_DICT = {"d__":0.00,"p__":0.227534847987,"c__":0.353206001760,"o__":0.553905413445,"f__":0.732652342427,"g__":0.910973866402} + + diff --git a/gtdbtk/external/pfam_search.py b/gtdbtk/external/pfam_search.py index b3a829c3..03d6ba57 100644 --- a/gtdbtk/external/pfam_search.py +++ b/gtdbtk/external/pfam_search.py @@ -101,34 +101,39 @@ def _topHit(self, pfam_file): def _workerThread(self, queueIn, queueOut): """Process each data item in parallel.""" - while True: - gene_file = queueIn.get(block=True, timeout=None) - if gene_file is None: - break - - genome_dir, filename = os.path.split(gene_file) - genome_id = filename.replace(self.protein_file_suffix, '') - output_hit_file = os.path.join(self.output_dir, genome_id, filename.replace(self.protein_file_suffix, - self.pfam_suffix)) - dir_path = os.path.dirname(os.path.realpath(__file__)) - pfam_search_script = os.path.join(dir_path, 'pfam_search.pl') - cmd = '%s -outfile %s -cpu %d -fasta %s -dir %s' % (pfam_search_script, - output_hit_file, - self.cpus_per_genome, - gene_file, - self.pfam_hmm_dir) - os.system(cmd) - - # calculate checksum - checksum = sha256(output_hit_file) - fout = open(output_hit_file + self.checksum_suffix, 'w') - fout.write(checksum) - fout.close() - - # identify top hit for each gene - self._topHit(output_hit_file) - - queueOut.put(gene_file) + try: + while True: + gene_file = queueIn.get(block=True, timeout=None) + if gene_file is None: + break + + genome_dir, filename = os.path.split(gene_file) + genome_id = filename.replace(self.protein_file_suffix, '') + output_hit_file = os.path.join(self.output_dir, genome_id, filename.replace(self.protein_file_suffix, + self.pfam_suffix)) + dir_path = os.path.dirname(os.path.realpath(__file__)) + pfam_search_script = os.path.join(dir_path, 'pfam_search.pl') + cmd = '%s -outfile %s -cpu %d -fasta %s -dir %s' % (pfam_search_script, + output_hit_file, + self.cpus_per_genome, + gene_file, + self.pfam_hmm_dir) + osexitcode = os.system(cmd) + if osexitcode == 1: + raise RuntimeError("Pfam_search has crashed") + + # calculate checksum + checksum = sha256(output_hit_file) + fout = open(output_hit_file + self.checksum_suffix, 'w') + fout.write(checksum) + fout.close() + + # identify top hit for each gene + self._topHit(output_hit_file) + + queueOut.put(gene_file) + except Exception as error: + raise error def _writerThread(self, numDataItems, writerQueue): """Store or write results of worker threads in a single thread.""" @@ -179,6 +184,9 @@ def run(self, gene_files): for p in workerProc: p.join() + if p.exitcode == 1: + raise ValueError("Pfam Error") + writerQueue.put(None) writeProc.join() diff --git a/gtdbtk/main.py b/gtdbtk/main.py index 266b30a5..80f80443 100644 --- a/gtdbtk/main.py +++ b/gtdbtk/main.py @@ -105,8 +105,8 @@ def _genomes_to_process(self, genome_dir, batchfile, extension): genomic_files[genome_id] = genome_file for genome_key in genomic_files.iterkeys(): - if genome_key.startswith("RS_") or genome_key.startswith("GB_"): - self.logger.error("Submitted genomes start with the same prefix (RS_,GB_) as reference genomes in GTDB-Tk. This will cause issues for downstream analysis.") + if genome_key.startswith("RS_") or genome_key.startswith("GB_") or genome_key.startswith("UBA"): + self.logger.error("Submitted genomes start with the same prefix (RS_,GB_,UBA) as reference genomes in GTDB-Tk. This will cause issues for downstream analysis.") sys.exit(-1) if len(genomic_files) == 0: @@ -190,7 +190,7 @@ def infer(self, options): self.logger.info('Inferring tree with FastTree using %s+GAMMA.' % options.prot_model) fasttree = FastTree(multithreaded=(options.cpus > 1)) - tree_unrooted_output = os.path.join(options.out_dir, options.prefix +options.suffix+ '.unrooted.tree') + tree_unrooted_output = os.path.join(options.out_dir, options.prefix +'.unrooted.tree') tree_log = os.path.join(options.out_dir, options.prefix + '.tree.log') tree_output_log = os.path.join(options.out_dir, 'fasttree.log') fasttree.run(options.msa_file, @@ -214,7 +214,8 @@ def classify(self, options): classify.run(genomes, options.align_dir, options.out_dir, - options.prefix) + options.prefix, + options.debug) self.logger.info('Done.')