From 8df1407d6fa3f3193890b26b342ccafcba7b4e1e Mon Sep 17 00:00:00 2001 From: Ahmed Hussein <50450311+amahussein@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:54:50 -0600 Subject: [PATCH] Fix dataframe handling of column-types (#1458) Signed-off-by: Ahmed Hussein (amahussein) Fixes #1456 --- .../tools/qualification_stats_report.py | 15 ++++++++++++--- .../spark_rapids_tools/tools/qualx/preprocess.py | 4 ++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/user_tools/src/spark_rapids_tools/tools/qualification_stats_report.py b/user_tools/src/spark_rapids_tools/tools/qualification_stats_report.py index de762013d..aedda60f7 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualification_stats_report.py +++ b/user_tools/src/spark_rapids_tools/tools/qualification_stats_report.py @@ -75,7 +75,10 @@ def _read_csv_files(self) -> None: 'toolOutput', 'csv', 'unsupportedOperatorsReport', 'fileName') rapids_unsupported_operators_file = FSUtil.build_path( qual_output_dir, unsupported_operator_report_file) - self.unsupported_operators_df = pd.read_csv(rapids_unsupported_operators_file) + # Load the unsupported operators and drop operators that have no names.
+ self.unsupported_operators_df = ( + pd.read_csv(rapids_unsupported_operators_file, + dtype={'Unsupported Operator': str})).dropna(subset=['Unsupported Operator']) stages_report_file = self.ctxt.get_value('toolOutput', 'csv', 'stagesInformation', 'fileName') @@ -84,7 +87,14 @@ def _read_csv_files(self) -> None: rapids_execs_file = self.ctxt.get_value('toolOutput', 'csv', 'execsInformation', 'fileName') - self.execs_df = pd.read_csv(FSUtil.build_path(qual_output_dir, rapids_execs_file)) + # Load the execs CSV file and drop execs that have no stages or name + self.execs_df = ( + pd.read_csv(FSUtil.build_path(qual_output_dir, rapids_execs_file), + dtype={'Exec Name': str, + 'Exec Stages': str, + 'Exec Children': str, + 'Exec Children Node Ids': str}) + .dropna(subset=['Exec Stages', 'Exec Name'])) self.logger.info('Reading CSV files completed.') def _convert_durations(self) -> None: @@ -103,7 +113,6 @@ def _preprocess_dataframes(self) -> None: # from this dataframe can be matched with the stageID of stages dataframe self.execs_df['Exec Stages'] = self.execs_df['Exec Stages'].str.split(':') self.execs_df = (self.execs_df.explode('Exec Stages'). - dropna(subset=['Exec Stages']). 
rename(columns={'Exec Stages': 'Stage ID'})) self.execs_df['Stage ID'] = self.execs_df['Stage ID'].astype(int) diff --git a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py index 1bdbd76b7..a47b45d73 100644 --- a/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py +++ b/user_tools/src/spark_rapids_tools/tools/qualx/preprocess.py @@ -461,7 +461,7 @@ def combine_tables(table_name: str) -> pd.DataFrame: # normalize WholeStageCodegen labels ops_tbl.loc[ - ops_tbl['nodeName'].str.startswith('WholeStageCodegen'), 'nodeName' + ops_tbl['nodeName'].astype(str).str.startswith('WholeStageCodegen'), 'nodeName' ] = 'WholeStageCodegen' # format WholeStageCodegen for merging @@ -1140,7 +1140,7 @@ def _is_ignore_no_perf(action: str) -> bool: node_level_supp['Exec Is Supported'] = ( node_level_supp['Exec Is Supported'] | node_level_supp['Action'].apply(_is_ignore_no_perf) - | node_level_supp['Exec Name'].apply( + | node_level_supp['Exec Name'].astype(str).apply( lambda x: x.startswith('WholeStageCodegen') ) )