From 1ad059ca6c45a0173675185868063e98f0a62b4e Mon Sep 17 00:00:00 2001
From: pchaumeil <uqpchaum@uq.edu.au>
Date: Tue, 3 Sep 2024 12:19:12 +1000
Subject: [PATCH] fix(rewrite warning counter):

Rewriting warning counter based on issue #585 . counters have been replaced by multiple lists
---
 gtdbtk/classify.py | 98 +++++++++++++++++++++++++---------------------
 gtdbtk/split.py    |  6 +--
 2 files changed, 56 insertions(+), 48 deletions(-)

diff --git a/gtdbtk/classify.py b/gtdbtk/classify.py
index fc6666e..b32557d 100644
--- a/gtdbtk/classify.py
+++ b/gtdbtk/classify.py
@@ -419,7 +419,9 @@ def run(self,
             return output_files
         elif not all_failed_prodigal:
             for marker_set_id in ('ar53', 'bac120'):
-                warning_counter, prodigal_failed_counter = 0, 0
+                genomes_with_warnings = []
+                genomes_with_failed_prodigal = []
+                #warning_counter, prodigal_failed_counter = 0, 0
                 if marker_set_id == 'ar53':
                     marker_summary_fh = CopyNumberFileAR53(align_dir, prefix)
                     marker_summary_fh.read()
@@ -459,7 +461,7 @@ def run(self,
                     if marker_set_id == 'ar53':
                         # we still add the filtered genomes to the summary file
                         # add filtered genomes to the summary file
-                        warning_counter = self.add_filtered_genomes_to_summary(align_dir, warning_counter, summary_file,
+                        genomes_with_warnings = self.add_filtered_genomes_to_summary(align_dir, genomes_with_warnings, summary_file,
                                                                                marker_set_id, prefix)
 
                     # But if there is Unclassified genomes without domain,
@@ -468,10 +470,10 @@ def run(self,
                         # Add failed genomes from prodigal and genomes with no markers in the bac120 summary file
                         # This is an executive direction: failed prodigal and genomes with no markers are not bacterial or archaeal
                         # but they need to be included in one of the summary file
-                        prodigal_failed_counter = self.add_failed_genomes_to_summary(align_dir, summary_file, prefix)
+                        genomes_with_failed_prodigal = self.add_failed_genomes_to_summary(align_dir, summary_file, prefix)
                         # we also add the filtered genomes to the summary file
                         # add filtered genomes to the summary file
-                        warning_counter = self.add_filtered_genomes_to_summary(align_dir, warning_counter, summary_file,
+                        genomes_with_warnings = self.add_filtered_genomes_to_summary(align_dir, genomes_with_warnings, summary_file,
                                                                                marker_set_id, prefix)
 
                     # we add all genomes classified with ANI
@@ -505,6 +507,7 @@ def run(self,
                         elif marker_set_id == 'bac120':
                             symlink_f(PATH_BAC120_SUMMARY_OUT.format(prefix=prefix),
                                       os.path.join(out_dir, os.path.basename(PATH_BAC120_SUMMARY_OUT.format(prefix=prefix))))
+                        prodigal_failed_counter = len(set(genomes_with_failed_prodigal))
                         if prodigal_failed_counter > 0:
                             self.logger.warning(f"{prodigal_failed_counter} of {len(genomes)} "
                                                 f"genome{'' if prodigal_failed_counter == 1 else 's'} "
@@ -530,11 +533,11 @@ def run(self,
                 if mash_classified_user_genomes and marker_set_id in mash_classified_user_genomes:
                     list_summary_rows = mash_classified_user_genomes.get(marker_set_id)
                     for row in list_summary_rows:
-                        row,warning_counter = self._add_warning_to_row(row,
+                        row,genomes_with_warnings = self._add_warning_to_row(row,
                                                        msa_dict,
                                                        tln_table_summary_file.genomes,
                                                        percent_multihit_dict,
-                                                       warning_counter)
+                                                       genomes_with_warnings)
                         summary_file.add_row(row)
                         if row.gid in genomes_to_process:
                             genomes_to_process.remove(row.gid)
@@ -605,9 +608,9 @@ def run(self,
                             tree_mapping_dict_reverse.setdefault(v, []).append(k)
 
                     splitter = Split(self.order_rank, self.gtdb_taxonomy, self.reference_ids)
-                    sorted_high_taxonomy,warning_counter, len_sorted_genomes, high_taxonomy_used = splitter.map_high_taxonomy(
+                    sorted_high_taxonomy,genomes_with_warnings, len_sorted_genomes, high_taxonomy_used = splitter.map_high_taxonomy(
                         high_classification, tree_mapping_dict, summary_file, tree_mapping_file,msa_dict,
-                        tln_table_summary_file.genomes,percent_multihit_dict,bac_ar_diff,warning_counter)
+                        tln_table_summary_file.genomes,percent_multihit_dict,bac_ar_diff,genomes_with_warnings)
 
                     if debugopt:
                         with open(out_dir + '/' + prefix + '_backbone_classification.txt', 'w') as htu:
@@ -643,12 +646,12 @@ def run(self,
                             for disappearing_genome in disappearing_genomes:
                                 disappearing_genomes_file.add_genome(disappearing_genome, tree_iter)
 
-                        class_level_classification, classified_user_genomes,warning_counter = self._parse_tree(mrca_lowtree, genomes,
+                        class_level_classification, classified_user_genomes,genomes_with_warnings = self._parse_tree(mrca_lowtree, genomes,
                                                                                                                msa_dict,percent_multihit_dict,
                                                                                                                genes, tln_table_summary_file.genomes,
                                                                                                                bac_ar_diff, submsa_file_path,
                                                                                                                red_dict_file.data,summary_file,
-                                                                                                               pplacer_taxonomy_dict,warning_counter,
+                                                                                                               pplacer_taxonomy_dict,genomes_with_warnings,
                                                                                                                high_classification, debug_file,
                                                                                                                debugopt,tree_mapping_file,
                                                                                                                tree_iter,tree_mapping_dict_reverse)
@@ -662,14 +665,14 @@ def run(self,
 
 
                     # add filtered genomes to the summary file
-                    warning_counter = self.add_filtered_genomes_to_summary(align_dir,warning_counter, summary_file, marker_set_id, prefix)
+                    genomes_with_warnings = self.add_filtered_genomes_to_summary(align_dir,genomes_with_warnings, summary_file, marker_set_id, prefix)
 
                     # Add failed genomes from prodigal and genomes with no markers in the bac120 summary file
                     # This is a executive direction: failed prodigal and genomes with
                     # no markers are not bacterial or archaeal
                     # but they need to be included in one of the summary file
-                    prodigal_failed_counter = self.add_failed_genomes_to_summary(align_dir, summary_file, prefix)
-                    warning_counter = warning_counter + prodigal_failed_counter
+                    genomes_with_failed_prodigal = self.add_failed_genomes_to_summary(align_dir, summary_file, prefix)
+                    genomes_with_warnings = genomes_with_warnings + genomes_with_failed_prodigal
 
                     # Symlink to the summary file from the root
                     symlink_f(PATH_BAC120_SUMMARY_OUT.format(prefix=prefix),
@@ -698,24 +701,24 @@ def run(self,
                                                                        tree_to_process)
 
                     disappearing_genomes = [seq_id for seq_id in genomes_to_process if seq_id not in pplacer_taxonomy_dict]
-                    class_level_classification, classified_user_genomes,warning_counter = self._parse_tree(tree_to_process, genomes,
+                    class_level_classification, classified_user_genomes,genomes_with_warnings = self._parse_tree(tree_to_process, genomes,
                                                                                                            msa_dict, percent_multihit_dict,
                                                                                                            genes,tln_table_summary_file.genomes,
                                                                                                            bac_ar_diff, user_msa_file,
                                                                                                            red_dict_file.data, summary_file,
-                                                                                                           pplacer_taxonomy_dict,warning_counter,
+                                                                                                           pplacer_taxonomy_dict,genomes_with_warnings,
                                                                                                            None,debug_file,
                                                                                                            debugopt,None,
                                                                                                            None, None)
                     # add filtered genomes to the summary file
-                    warning_counter = self.add_filtered_genomes_to_summary(align_dir,warning_counter, summary_file, marker_set_id, prefix)
+                    genomes_with_warnings = self.add_filtered_genomes_to_summary(align_dir,genomes_with_warnings, summary_file, marker_set_id, prefix)
 
                     # Add failed genomes from prodigal and genomes with no markers in the bac120 summary file
                     # This is a executive direction: failed prodigal and genomes with no markers are nit bacterial or archaeal
                     # but they need to be included in one of the summary file
                     if marker_set_id == 'bac120':
-                        prodigal_failed_counter = self.add_failed_genomes_to_summary(align_dir, summary_file, prefix)
-                        warning_counter = warning_counter + prodigal_failed_counter
+                        genomes_with_failed_prodigal = self.add_failed_genomes_to_summary(align_dir, summary_file, prefix)
+                        genomes_with_warnings = genomes_with_warnings + genomes_with_failed_prodigal
 
                     # Symlink to the summary file from the root
                     if marker_set_id == 'bac120':
@@ -739,12 +742,16 @@ def run(self,
                     output_files.setdefault(marker_set_id, []).append(tree_mapping_file.path)
 
 
+                # calcuate the length of the unique genomes with warnings
+                warning_counter = len(set(genomes_with_warnings))
+                all_genomes = genomes_with_warnings+genomes_to_process+genomes_with_failed_prodigal
+                sum_of_genomes = len(set(all_genomes))
                 if warning_counter > 0:
-                    sum_of_genomes =  len(genomes_to_process)+prodigal_failed_counter
-                    if mash_classified_user_genomes and marker_set_id in mash_classified_user_genomes:
-                        sum_of_genomes +=len(mash_classified_user_genomes.get(marker_set_id))
+                    # sum_of_genomes =  len(genomes_to_process)+len(set(genomes_with_failed_prodigal))
+                    # if mash_classified_user_genomes and marker_set_id in mash_classified_user_genomes:
+                    #     sum_of_genomes +=len(mash_classified_user_genomes.get(marker_set_id))
                     self.logger.warning(f"{warning_counter} of {sum_of_genomes} "
-                                        f"genome{'' if warning_counter==1 else 's'}"
+                                        f"genome{'' if sum_of_genomes==1 else 's'}"
                                         f" ha{'s' if warning_counter==1 else 've'} a warning (see summary file).")
 
                 # Write the summary file to disk.
@@ -760,7 +767,7 @@ def run(self,
             # Add failed genomes from prodigal and genomes with no markers in the bac120 summary file
             # This is an executive direction: failed prodigal and genomes with no markers are not bacterial or archaeal
             # but they need to be included in one of the summary file
-            prodigal_failed_counter = self.add_failed_genomes_to_summary(align_dir, summary_file, prefix)
+            genomes_with_failed_prodigal = self.add_failed_genomes_to_summary(align_dir, summary_file, prefix)
 
             if summary_file.has_row():
                 summary_file.write()
@@ -768,6 +775,7 @@ def run(self,
                 # Symlink to the summary file from the root
                 symlink_f(PATH_BAC120_SUMMARY_OUT.format(prefix=prefix),
                               os.path.join(out_dir, os.path.basename(PATH_BAC120_SUMMARY_OUT.format(prefix=prefix))))
+                prodigal_failed_counter = len(set(genomes_with_failed_prodigal))
                 if prodigal_failed_counter > 0:
                     self.logger.warning(f"{prodigal_failed_counter} of {len(genomes)} "
                                         f"genome{'' if prodigal_failed_counter == 1 else 's'} "
@@ -779,7 +787,7 @@ def run(self,
     def _add_warning_to_row(self,row,msa_dict,
                             trans_table_dict,
                             percent_multihit_dict,
-                            warning_counter):
+                            genomes_with_warnings):
 
         if row.gid in msa_dict:
             row.msa_percent = aa_percent_msa(msa_dict.get(row.gid))
@@ -795,9 +803,9 @@ def _add_warning_to_row(self,row,msa_dict,
                     percent_multihit_dict.get(row.gid))]
         if warnings:
             row.warnings = ';'.join(set(warnings))
-            warning_counter += 1
+            genomes_with_warnings.append(row.gid)
 
-        return row,warning_counter
+        return row,genomes_with_warnings
 
     def _generate_summary_file(self, marker_set_id, prefix, out_dir, debugopt=None, fulltreeopt=None):
         debug_file = None
@@ -909,7 +917,7 @@ def _get_skani_verification(tree, reference_ids, tt):
         return out, qry_nodes
 
     def _classify_red_topology(self, tree, msa_dict, percent_multihit_dict, trans_table_dict, bac_ar_diff,
-                               user_msa_file, red_dict,warning_counter, summary_file, pplacer_taxonomy_dict,
+                               user_msa_file, red_dict,genomes_with_warnings, summary_file, pplacer_taxonomy_dict,
                                high_classification, debug_file, debugopt, classified_user_genomes,
                                unclassified_user_genomes, tt,tree_iter,tree_mapping_file, valid_classes,valid_phyla):
         user_genome_ids = set(read_fasta(user_msa_file).keys())
@@ -1197,7 +1205,7 @@ def _classify_red_topology(self, tree, msa_dict, percent_multihit_dict, trans_ta
                 if summary_row.warnings is not None:
                     warnings.extend(summary_row.warnings.split(';'))
                 summary_row.warnings = ';'.join(set(warnings))
-                warning_counter += 1
+                genomes_with_warnings.append(summary_row.gid)
             if len(notes) > 0:
                 if summary_row.note is not None:
                     notes.extend(summary_row.note.split(';'))
@@ -1209,10 +1217,10 @@ def _classify_red_topology(self, tree, msa_dict, percent_multihit_dict, trans_ta
                     leaf.taxon.label, current_rel_list, '\t'.join(str(x) for x in debug_info), detection))
 
 
-        return class_level_classification,warning_counter
+        return class_level_classification,genomes_with_warnings
 
     def _parse_tree(self, tree, genomes, msa_dict, percent_multihit_dict,genes, trans_table_dict, bac_ar_diff,
-                    user_msa_file, red_dict, summary_file, pplacer_taxonomy_dict,warning_counter, high_classification,
+                    user_msa_file, red_dict, summary_file, pplacer_taxonomy_dict,genomes_with_warnings, high_classification,
                     debug_file, debugopt, tree_mapping_file, tree_iter, tree_mapping_dict_reverse):
         # Genomes can be classified by using skani or RED values
         # We go through all leaves of the tree. if the leaf is a user
@@ -1245,9 +1253,9 @@ def _parse_tree(self, tree, genomes, msa_dict, percent_multihit_dict,genes, tran
         else:
             all_skani_dict = {}
 
-        classified_user_genomes, unclassified_user_genomes,warning_counter = self._sort_skani_results(
+        classified_user_genomes, unclassified_user_genomes,genomes_with_warnings = self._sort_skani_results(
             skani_verification, pplacer_taxonomy_dict, all_skani_dict, msa_dict, percent_multihit_dict,
-            trans_table_dict, bac_ar_diff,warning_counter, summary_file)
+            trans_table_dict, bac_ar_diff,genomes_with_warnings, summary_file)
         #if not prescreening:
         if not genes:
             self.logger.info(f'{len(classified_user_genomes):,} genome(s) have '
@@ -1272,15 +1280,15 @@ def _parse_tree(self, tree, genomes, msa_dict, percent_multihit_dict,genes, tran
             phyla_in_spe_tree = self.get_authorised_rank(class_in_spe_tree,self.PHYLUM_IDX)
 
 
-        class_level_classification,warning_counter = self._classify_red_topology(tree, msa_dict, percent_multihit_dict,
+        class_level_classification,genomes_with_warnings = self._classify_red_topology(tree, msa_dict, percent_multihit_dict,
                                                                  trans_table_dict, bac_ar_diff, user_msa_file,
-                                                                 red_dict,warning_counter, summary_file,
+                                                                 red_dict,genomes_with_warnings, summary_file,
                                                                  pplacer_taxonomy_dict, high_classification,
                                                                  debug_file, debugopt, classified_user_genomes,
                                                                  unclassified_user_genomes, tt,tree_iter,tree_mapping_file,
                                                                  class_in_spe_tree, phyla_in_spe_tree)
 
-        return class_level_classification,classified_user_genomes,warning_counter
+        return class_level_classification,classified_user_genomes,genomes_with_warnings
 
     def _assign_mrca_red(self, input_tree, marker_set_id, levelopt=None, tree_iter=None):
         """Parse the pplacer tree and write the partial taxonomy for each user genome based on their placements
@@ -1419,7 +1427,7 @@ def _get_pplacer_taxonomy(self, pplacer_classify_file, marker_set_id, user_msa_f
 
         return pplacer_classify_file.data
 
-    def add_filtered_genomes_to_summary(self, align_dir,warning_counter, summary_file, marker_set_id,prefix):
+    def add_filtered_genomes_to_summary(self, align_dir,genomes_with_warnings, summary_file, marker_set_id,prefix):
         if marker_set_id == 'bac120':
             filtered_file = os.path.join(align_dir,PATH_BAC120_FILTERED_GENOMES.format(prefix=prefix))
             domain = 'Bacteria'
@@ -1444,19 +1452,19 @@ def add_filtered_genomes_to_summary(self, align_dir,warning_counter, summary_fil
                                 row_to_update.warnings = existing_warnings + ';' + infos[1]
                             else:
                                 row_to_update.warnings = infos[1]
-                                warning_counter += 1
+                                genomes_with_warnings.append(infos[0])
                     else:
                         summary_row = ClassifySummaryFileRow()
                         summary_row.gid = infos[0]
                         summary_row.classification = f'Unclassified {domain}'
                         summary_row.warnings = infos[1]
                         summary_file.add_row(summary_row)
-                        warning_counter += 1
+                        genomes_with_warnings.append(infos[0])
 
-        return warning_counter
+        return genomes_with_warnings
 
     def add_failed_genomes_to_summary(self, align_dir, summary_file, prefix):
-        warning_counter = 0
+        genomes_with_warnings = []
         prodigal_failed_file = os.path.join(align_dir, PATH_FAILS.format(prefix=prefix))
         align_failed_file = os.path.join(align_dir, PATH_FAILED_ALIGN_GENOMES.format(prefix=prefix))
         for failfile in (prodigal_failed_file, align_failed_file):
@@ -1469,8 +1477,8 @@ def add_failed_genomes_to_summary(self, align_dir, summary_file, prefix):
                         summary_row.classification = f'Unclassified'
                         summary_row.warnings = infos[1]
                         summary_file.add_row(summary_row)
-                        warning_counter += 1
-        return warning_counter
+                        genomes_with_warnings.append(infos[0])
+        return genomes_with_warnings
 
     @staticmethod
     def formatnote(sorted_dict,gtdb_taxonomy,species_radius, labels):
@@ -1578,7 +1586,7 @@ def _sort_skani_results_pre_pplacer(self,skani_results,bac_ar_diff):
 
     def _sort_skani_results(self, skani_verification, pplacer_taxonomy_dict,
                               all_skani_dict, msa_dict, percent_multihit_dict,
-                              trans_table_dict, bac_ar_diff,warning_counter, summary_file):
+                              trans_table_dict, bac_ar_diff,genomes_with_warnings, summary_file):
         """Format the note field by concatenating all information in a sorted dictionary
 
         Parameters
@@ -1759,7 +1767,7 @@ def _sort_skani_results(self, skani_verification, pplacer_taxonomy_dict,
 
                     if len(warnings) > 0:
                         summary_row.warnings = ';'.join(warnings)
-                        warning_counter += 1
+                        genomes_with_warnings.append(summary_row.gid)
                     if sorted_prefilter_af_dict[0][1].get('ani') >= self.species_radius.get(
                             sorted_prefilter_af_dict[0][0]):
                         skani_matching_reference = sorted_prefilter_af_dict[0][0]
@@ -1803,7 +1811,7 @@ def _sort_skani_results(self, skani_verification, pplacer_taxonomy_dict,
                         else:
                             summary_row.other_related_refs = other_ref
                     unclassified_user_genomes[userleaf.taxon.label] = summary_row
-        return classified_user_genomes, unclassified_user_genomes,warning_counter
+        return classified_user_genomes, unclassified_user_genomes,genomes_with_warnings
 
     def _get_redtax(self, list_subnode, closest_rank):
         """
diff --git a/gtdbtk/split.py b/gtdbtk/split.py
index b92547d..19d6014 100644
--- a/gtdbtk/split.py
+++ b/gtdbtk/split.py
@@ -339,7 +339,7 @@ def _classify_on_terminal_branch(self, list_leaf_ranks, current_rel_list, parent
         return ';'.join(term_branch_taxonomy[1:self.order_rank.index(closest_rank) + 1])
 
     def map_high_taxonomy(self,high_classification, mapping_dict, summary_file,
-                          tree_mapping_file,msa_dict,trans_table_dict,percent_multihit_dict,bac_ar_diff,warning_counter):
+                          tree_mapping_file,msa_dict,trans_table_dict,percent_multihit_dict,bac_ar_diff,genomes_with_warnings):
         mapped_rank = {}
         counter = 0
         high_taxonomy_used = {}
@@ -382,7 +382,7 @@ def map_high_taxonomy(self,high_classification, mapping_dict, summary_file,
                         if summary_row.warnings is not None:
                             warnings.extend(summary_row.warnings.split(';'))
                         summary_row.warnings = ';'.join(set(warnings))
-                        warning_counter += 1
+                        genomes_with_warnings.append(summary_row.gid)
 
 
                     mapping_row = GenomeMappingFileRow()
@@ -394,4 +394,4 @@ def map_high_taxonomy(self,high_classification, mapping_dict, summary_file,
                     tree_mapping_file.add_row(mapping_row)
                     summary_file.add_row(summary_row)
 
-        return mapped_rank,warning_counter, counter , high_taxonomy_used
+        return mapped_rank,genomes_with_warnings, counter , high_taxonomy_used