Clean up code, move correlation computation outside of compare_features(), silence pulp messages
alan-turing-institute · Apr 6, 2021 · c067dea · c067dea
1 parent 801e8d3
commit c067dea
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 21 deletions.
diff --git a/metrics/utility-metrics/feature_importance.py b/metrics/utility-metrics/feature_importance.py
@@ -19,7 +19,7 @@
 from sklearn.preprocessing import LabelEncoder, normalize
 from sklearn.metrics.pairwise import cosine_similarity
 from typing import Union
-import dython.nominal.associations as associations
+from dython.nominal import associations
 from pulp import LpMaximize, LpProblem, LpStatus, lpSum, LpVariable, constants, value
 import pulp
 pulp.LpSolverDefault.msg = 1
@@ -117,7 +117,7 @@ def featuretools_importances(df, data_meta, utility_params_ft, rs):
 
     Y = fm.pop(utility_params_ft["label_column"])
 
-    ## create dummies or numerical labels for string categorical variables    
+    # create dummies or numerical labels for string categorical variables
     for col in fm.dtypes.index[[x is np.dtype("object") for x in fm.dtypes]]:
         if utility_params_ft["categorical_enconding"] == "dummies":
             one_hot = pd.get_dummies(fm[col], prefix=col, prefix_sep="_")#.iloc[:, 0:-1]
@@ -288,6 +288,14 @@ def feature_importance_metrics(
             f1_cross = f1_score(y_test_orig, y_pred, average='weighted')
             print('Cross-AUC score of {:.3f} and weighted cross_F1 of {:.3f}'.format(auc_cross, f1_cross))
 
+            # Correlation matrix needed for correlated rank similarity
+            categorical_columns = [c["name"] for c in orig_metadata["columns"]
+                                   if (c["type"] in ["Categorical", "Ordinal"]
+                                   and c["name"] in X_train_orig.columns)]
+            categorical_columns.extend([c for c in X_train_orig.columns if "MODE" in c])
+            correlation_matrix = associations(X_train_orig, nominal_columns=categorical_columns,
+                                              plot=False)["corr"].abs()
+
             rank_rlsd_features_builtin = [i[1] for i in rlsd_feature_importances_builtin]
             score_rlsd_features_builtin = [i[0] for i in rlsd_feature_importances_builtin]
             rank_rlsd_features_permutation = [i[1] for i in rlsd_feature_importances_permutation]
@@ -297,7 +305,7 @@ def feature_importance_metrics(
 
             utility_collector_builtin = compare_features(rank_orig_features_builtin,
                                                          rank_rlsd_features_builtin,
-                                                         X_train_orig,
+                                                         correlation_matrix,
                                                          score_orig_features_builtin,
                                                          score_rlsd_features_builtin,
                                                          utility_collector_builtin,
@@ -314,7 +322,7 @@ def feature_importance_metrics(
 
             utility_collector_permutation = compare_features(rank_orig_features_permutation,
                                                              rank_rlsd_features_permutation,
-                                                             X_train_orig,
+                                                             correlation_matrix,
                                                              score_orig_features_permutation,
                                                              score_rlsd_features_permutation,
                                                              utility_collector_permutation, percentage_threshold)
@@ -330,7 +338,7 @@ def feature_importance_metrics(
             if utility_params.get("compute_shapley"):
                 utility_collector_shapley = compare_features(rank_orig_features_shapley,
                                                              rank_rlsd_features_shapley,
-                                                             X_train_orig,
+                                                             correlation_matrix,
                                                              score_orig_features_shapley,
                                                              score_rlsd_features_shapley,
                                                              utility_collector_shapley, percentage_threshold)
@@ -427,7 +435,7 @@ def feature_importance_metrics(
 
 
 def compare_features(rank_orig_features: list, rank_rlsd_features: list,
-                     X_train_orig: pd.DataFrame,
+                     correlation_matrix: pd.DataFrame,
                      score_orig_features: Union[None, list] = None,
                      score_rlsd_features: Union[None, list] = None,
                      utility_collector: dict = {},
@@ -442,9 +450,8 @@ def compare_features(rank_orig_features: list, rank_rlsd_features: list,
         ranked features from the original dataset
     rank_rlsd_features : list
         ranked features from the synthetic/released dataset
-    X_train_orig : pd.DataFrame
-        A dataframe that contains the original training dataset. This is needed
-        to calculate correlations/correlation-like measures between variables.
+    correlation_matrix : pd.DataFrame
+        Correlations/correlation-like measures between variables.
     score_orig_features : Union[None, list], optional
         scores of the ranked features from the original dataset, by default None
     score_rlsd_features : Union[None, list], optional
@@ -465,9 +472,6 @@ def compare_features(rank_orig_features: list, rank_rlsd_features: list,
     else:
         target_index = len(rank_orig_features)
 
-    # Correlation matrix needed for correlated rank similarity
-    correlation_matrix = associations(X_train_orig)
-
     # RBO - orig vs. rlsd
     orig_rlsd_sim = RankingSimilarity(rank_orig_features[:target_index],
                                       rank_rlsd_features[:target_index])

diff --git a/metrics/utility-metrics/rbo.py b/metrics/utility-metrics/rbo.py
@@ -9,7 +9,7 @@
 """
 
 import numpy as np
-from pulp import LpMaximize, LpProblem, LpStatus, lpSum, LpVariable, constants, value
+from pulp import LpMaximize, LpProblem, LpStatus, lpSum, LpVariable, constants, value, PULP_CBC_CMD
 import pulp
 pulp.LpSolverDefault.msg = 1
 
@@ -333,8 +333,6 @@ def correlated_rank_similarity(self, correlation_matrix, k=None, p=1.0, ext=Fals
             else:  # weighted average - i.e. equivalent to RBO in equation 7 of the RBO paper
                 AO[d] = (A[:(d + 1)] * weights[:(d + 1)]).sum()
 
-        print(AO, A, X, weights)
-
         if ext and p < 1:
             return self._bound_range(AO[-1] + A[-1] * p ** k)
         else:
@@ -384,17 +382,14 @@ def correlated_rank_similarity_ext(self, correlation_matrix, p=0.98):
         disjoint = 0
 
         # start the calculation
-        PP = ProgressPrintOut(l) if self.verbose else NoPrintOut()
-
         for d in range(1, l):
-            PP.printout(d, delta=1)
 
             if d < s:  # still overlapping in length
 
                 X[d], _ = self.lp_optimiser(d, correlation_matrix)
 
                 # Eq. (28) that handles the tie. len() is O(1)
-                A[d] = 2.0 * X[d] / (len(self.S[:(d + 1)]) + len(self.L[:(d + 1)]))
+                A[d] = 2.0 * X[d] / (len(S[:(d + 1)]) + len(L[:(d + 1)]))
 
                 rbo[d] = (weights[:(d + 1)] * A[:(d + 1)]).sum()
 
@@ -482,8 +477,8 @@ def lp_optimiser(self, d, correlation_matrix):
             prob += lpSum(W_np[j][i] for j in range(d1 + 1)) == 1, "Double stochastic col" + str(i)
 
         # Solve and print result
-        prob.solve()
-        print(LpStatus[prob.status])
+        prob.solve(PULP_CBC_CMD(msg=False))
+        #print(LpStatus[prob.status])
 
         # Get W and score
         opt_W = np.array([v.varValue for v in W]).reshape(d1 + 1, d2 + 1)