From 8df1407d6fa3f3193890b26b342ccafcba7b4e1e Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:54:50 -0600 Subject: [PATCH] Fix dataframe handling of column-types (#1458) Signed-off-by: Ahmed Hussein (amahussein) Fixes #1456 --- .../tools/qualification_stats_report.py | 15 ++++++++++++--- .../spark_rapids_tools/tools/qualx/preprocess.py | 4 ++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/user_tools/src/spark_rapids_tools/tools/qualification_stats_report.py b/user_tools/src/spark_rapids_tools/tools/qualification_stats_report.py index de762013d..aedda60f7 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualification_stats_report.py +++ b/user_tools/src/spark_rapids_tools/tools/qualification_stats_report.py @@ -75,7 +75,10 @@ def _read_csv_files(self) -> None: 'toolOutput', 'csv', 'unsupportedOperatorsReport', 'fileName') rapids_unsupported_operators_file = FSUtil.build_path( qual_output_dir, unsupported_operator_report_file) - self.unsupported_operators_df = pd.read_csv(rapids_unsupported_operators_file) + # Load the unsupported operators and drop operators that have no names.
+ self.unsupported_operators_df = ( + pd.read_csv(rapids_unsupported_operators_file, + dtype={'Unsupported Operator': str})).dropna(subset=['Unsupported Operator']) stages_report_file = self.ctxt.get_value('toolOutput', 'csv', 'stagesInformation', 'fileName') @@ -84,7 +87,14 @@ def _read_csv_files(self) -> None: rapids_execs_file = self.ctxt.get_value('toolOutput', 'csv', 'execsInformation', 'fileName') - self.execs_df = pd.read_csv(FSUtil.build_path(qual_output_dir, rapids_execs_file)) + # Load the execs CSV file and drop execs that have no stages or name + self.execs_df = ( + pd.read_csv(FSUtil.build_path(qual_output_dir, rapids_execs_file), + dtype={'Exec Name': str, + 'Exec Stages': str, + 'Exec Children': str, + 'Exec Children Node Ids': str}) + .dropna(subset=['Exec Stages', 'Exec Name'])) self.logger.info('Reading CSV files completed.') def _convert_durations(self) -> None: @@ -103,7 +113,6 @@ def _preprocess_dataframes(self) -> None: # from this dataframe can be matched with the stageID of stages dataframe self.execs_df['Exec Stages'] = self.execs_df['Exec Stages'].str.split(':') self.execs_df = (self.execs_df.explode('Exec Stages'). - dropna(subset=['Exec Stages']). 
rename(columns={'Exec Stages': 'Stage ID'})) self.execs_df['Stage ID'] = self.execs_df['Stage ID'].astype(int) diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py index 1bdbd76b7..a47b45d73 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py +++ b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py @@ -461,7 +461,7 @@ def combine_tables(table_name: str) -> pd.DataFrame: # normalize WholeStageCodegen labels ops_tbl.loc[ - ops_tbl['nodeName'].str.startswith('WholeStageCodegen'), 'nodeName' + ops_tbl['nodeName'].astype(str).str.startswith('WholeStageCodegen'), 'nodeName' ] = 'WholeStageCodegen' # format WholeStageCodegen for merging @@ -1140,7 +1140,7 @@ def _is_ignore_no_perf(action: str) -> bool: node_level_supp['Exec Is Supported'] = ( node_level_supp['Exec Is Supported'] | node_level_supp['Action'].apply(_is_ignore_no_perf) - | node_level_supp['Exec Name'].apply( + | node_level_supp['Exec Name'].astype(str).apply( lambda x: x.startswith('WholeStageCodegen') ) )