Skip to content

Commit

Permalink
Clean up code, move correlation computation outside of compare_features(), silence pulp messages
Browse files Browse the repository at this point in the history
  • Loading branch information
gmingas committed Apr 6, 2021
1 parent 801e8d3 commit c067dea
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 21 deletions.
28 changes: 16 additions & 12 deletions metrics/utility-metrics/feature_importance.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.metrics.pairwise import cosine_similarity
from typing import Union
import dython.nominal.associations as associations
from dython.nominal import associations
from pulp import LpMaximize, LpProblem, LpStatus, lpSum, LpVariable, constants, value
import pulp
pulp.LpSolverDefault.msg = 1
Expand Down Expand Up @@ -117,7 +117,7 @@ def featuretools_importances(df, data_meta, utility_params_ft, rs):

Y = fm.pop(utility_params_ft["label_column"])

## create dummies or numerical labels for string categorical variables
# create dummies or numerical labels for string categorical variables
for col in fm.dtypes.index[[x is np.dtype("object") for x in fm.dtypes]]:
if utility_params_ft["categorical_enconding"] == "dummies":
one_hot = pd.get_dummies(fm[col], prefix=col, prefix_sep="_")#.iloc[:, 0:-1]
Expand Down Expand Up @@ -288,6 +288,14 @@ def feature_importance_metrics(
f1_cross = f1_score(y_test_orig, y_pred, average='weighted')
print('Cross-AUC score of {:.3f} and weighted cross_F1 of {:.3f}'.format(auc_cross, f1_cross))

# Correlation matrix needed for correlated rank similarity
categorical_columns = [c["name"] for c in orig_metadata["columns"]
if (c["type"] in ["Categorical", "Ordinal"]
and c["name"] in X_train_orig.columns)]
categorical_columns.extend([c for c in X_train_orig.columns if "MODE" in c])
correlation_matrix = associations(X_train_orig, nominal_columns=categorical_columns,
plot=False)["corr"].abs()

rank_rlsd_features_builtin = [i[1] for i in rlsd_feature_importances_builtin]
score_rlsd_features_builtin = [i[0] for i in rlsd_feature_importances_builtin]
rank_rlsd_features_permutation = [i[1] for i in rlsd_feature_importances_permutation]
Expand All @@ -297,7 +305,7 @@ def feature_importance_metrics(

utility_collector_builtin = compare_features(rank_orig_features_builtin,
rank_rlsd_features_builtin,
X_train_orig,
correlation_matrix,
score_orig_features_builtin,
score_rlsd_features_builtin,
utility_collector_builtin,
Expand All @@ -314,7 +322,7 @@ def feature_importance_metrics(

utility_collector_permutation = compare_features(rank_orig_features_permutation,
rank_rlsd_features_permutation,
X_train_orig,
correlation_matrix,
score_orig_features_permutation,
score_rlsd_features_permutation,
utility_collector_permutation, percentage_threshold)
Expand All @@ -330,7 +338,7 @@ def feature_importance_metrics(
if utility_params.get("compute_shapley"):
utility_collector_shapley = compare_features(rank_orig_features_shapley,
rank_rlsd_features_shapley,
X_train_orig,
correlation_matrix,
score_orig_features_shapley,
score_rlsd_features_shapley,
utility_collector_shapley, percentage_threshold)
Expand Down Expand Up @@ -427,7 +435,7 @@ def feature_importance_metrics(


def compare_features(rank_orig_features: list, rank_rlsd_features: list,
X_train_orig: pd.DataFrame,
correlation_matrix: pd.DataFrame,
score_orig_features: Union[None, list] = None,
score_rlsd_features: Union[None, list] = None,
utility_collector: dict = {},
Expand All @@ -442,9 +450,8 @@ def compare_features(rank_orig_features: list, rank_rlsd_features: list,
ranked features from the original dataset
rank_rlsd_features : list
ranked features from the synthetic/released dataset
X_train_orig : pd.DataFrame
A dataframe that contains the original training dataset. This is needed
to calculate correlations/correlation-like measures between variables.
correlation_matrix : pd.DataFrame
Correlations/correlation-like measures between variables.
score_orig_features : Union[None, list], optional
scores of the ranked features from the original dataset, by default None
score_rlsd_features : Union[None, list], optional
Expand All @@ -465,9 +472,6 @@ def compare_features(rank_orig_features: list, rank_rlsd_features: list,
else:
target_index = len(rank_orig_features)

# Correlation matrix needed for correlated rank similarity
correlation_matrix = associations(X_train_orig)

# RBO - orig vs. rlsd
orig_rlsd_sim = RankingSimilarity(rank_orig_features[:target_index],
rank_rlsd_features[:target_index])
Expand Down
13 changes: 4 additions & 9 deletions metrics/utility-metrics/rbo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"""

import numpy as np
from pulp import LpMaximize, LpProblem, LpStatus, lpSum, LpVariable, constants, value
from pulp import LpMaximize, LpProblem, LpStatus, lpSum, LpVariable, constants, value, PULP_CBC_CMD
import pulp
pulp.LpSolverDefault.msg = 1

Expand Down Expand Up @@ -333,8 +333,6 @@ def correlated_rank_similarity(self, correlation_matrix, k=None, p=1.0, ext=Fals
else: # weighted average - i.e. equivalent to RBO in equation 7 of the RBO paper
AO[d] = (A[:(d + 1)] * weights[:(d + 1)]).sum()

print(AO, A, X, weights)

if ext and p < 1:
return self._bound_range(AO[-1] + A[-1] * p ** k)
else:
Expand Down Expand Up @@ -384,17 +382,14 @@ def correlated_rank_similarity_ext(self, correlation_matrix, p=0.98):
disjoint = 0

# start the calculation
PP = ProgressPrintOut(l) if self.verbose else NoPrintOut()

for d in range(1, l):
PP.printout(d, delta=1)

if d < s: # still overlapping in length

X[d], _ = self.lp_optimiser(d, correlation_matrix)

# Eq. (28) that handles the tie. len() is O(1)
A[d] = 2.0 * X[d] / (len(self.S[:(d + 1)]) + len(self.L[:(d + 1)]))
A[d] = 2.0 * X[d] / (len(S[:(d + 1)]) + len(L[:(d + 1)]))

rbo[d] = (weights[:(d + 1)] * A[:(d + 1)]).sum()

Expand Down Expand Up @@ -482,8 +477,8 @@ def lp_optimiser(self, d, correlation_matrix):
prob += lpSum(W_np[j][i] for j in range(d1 + 1)) == 1, "Double stochastic col" + str(i)

# Solve and print result
prob.solve()
print(LpStatus[prob.status])
prob.solve(PULP_CBC_CMD(msg=False))
#print(LpStatus[prob.status])

# Get W and score
opt_W = np.array([v.varValue for v in W]).reshape(d1 + 1, d2 + 1)
Expand Down

0 comments on commit c067dea

Please sign in to comment.