From a9c82577c816b8dae1f42c32b2c5273e703922ab Mon Sep 17 00:00:00 2001 From: "Mahadik, Mukul Chandrakant" Date: Thu, 25 Jan 2024 20:09:14 -0700 Subject: [PATCH] Filtered out rows with dictionary in user_label_df The idea is to check each value's type with isinstance() and apply that check to the whole 'trip_user_input' column in a single Series.apply() call, instead of looping over the DataFrame row by row, which is much slower. Rows whose 'trip_user_input' value is a dict are then filtered out of the original DataFrame, leaving behind only the non-dict rows. --- .../analysis/modelling/trip_model/greedy_similarity_binning.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py index 226fdefb5..b0e167764 100644 --- a/emission/analysis/modelling/trip_model/greedy_similarity_binning.py +++ b/emission/analysis/modelling/trip_model/greedy_similarity_binning.py @@ -295,6 +295,9 @@ def _generate_predictions(self): # compute unique label sets and their probabilities in one cluster # 'p' refers to probability group_cols = user_label_df.columns.tolist() + # Filtering out rows from the user_label_df if they are dictionary objects which come from the survey inputs provided by the users instead of multilabels + if 'trip_user_input' in group_cols: + user_label_df = user_label_df.loc[user_label_df['trip_user_input'].apply(lambda x: not isinstance(x, dict))] unique_labels = user_label_df.groupby(group_cols).size().reset_index(name='uniqcount') unique_labels['p'] = unique_labels.uniqcount / sum_trips labels_columns = user_label_df.columns.to_list()