# %%
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs
from skshift import ExplanationShiftDetector
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np
# %%
# Create train, hold and test ID data
X, y = make_blobs(n_samples=2000, centers=2, n_features=5, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.5, random_state=0)
X_hold, y_hold = make_blobs(n_samples=2000, centers=2, n_features=5, random_state=0)
# Create OOD data: a single cluster, covariate-shifted with respect to the ID blobs
X_ood, _ = make_blobs(n_samples=500, centers=1, n_features=5, random_state=0)
X_ood_te, y_ood_te = make_blobs(n_samples=500, centers=1, n_features=5, random_state=1)
# Build detection labels (0 = ID, 1 = OOD) and concatenate the distributions
y_te = np.zeros_like(y_te)
y_ood_te = np.ones_like(y_ood_te)
X_new = np.concatenate([X_te, X_ood_te])
y_new = np.concatenate([y_te, y_ood_te])
# %%
# Option 1: fit the detector when there is a trained model
model = XGBClassifier().fit(X_tr, y_tr)
detector = ExplanationShiftDetector(model=model, gmodel=LogisticRegression())
detector.fit_detector(X_te, X_ood)
print(roc_auc_score(y_new, detector.predict_proba(X_new)[:, 1]))
# %%
# Option 2: fit the whole pipeline of model and detector at once
detector.fit_pipeline(X_tr, y_tr, X_te, X_ood)
print(roc_auc_score(y_new, detector.predict_proba(X_new)[:, 1]))
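# %%
# Optional sketch (not part of the skshift API): a plain classifier two-sample
# test on the raw inputs, for comparison with the explanation-space detector
# above. The variable names in this cell are illustrative assumptions.
X_c2st = np.concatenate([X_te, X_ood])
y_c2st = np.concatenate([np.zeros(len(X_te)), np.ones(len(X_ood))])
c2st = LogisticRegression().fit(X_c2st, y_c2st)
print(roc_auc_score(y_new, c2st.predict_proba(X_new)[:, 1]))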
# %%
# Explaining the change of the model
import shap
import matplotlib.pyplot as plt

explainer = shap.Explainer(detector.detector, masker=detector.get_explanations(X_te))
shap_values = explainer(detector.get_explanations(X_ood_te))
# visualize the first prediction's explanation
shap.waterfall_plot(shap_values[0])
plt.close()
# %%
# Real World Example
from sklearn import datasets
# import some data to play with
dataset = datasets.load_breast_cancer()
X = dataset.data[:, :5]
y = dataset.target
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.5, random_state=0)
# Create OOD data by shifting the first feature (mean radius) by +3
X_ood = X.copy()
X_ood[:, 0] = X_ood[:, 0] + 3
# Split in train and test
X_ood_tr, X_ood_te, y_ood_tr, y_ood_te = train_test_split(
X_ood, y, test_size=0.5, random_state=0
)
X_new = np.concatenate([X_te, X_ood_te])
y_new = np.concatenate([np.zeros_like(y_te), np.ones_like(y_ood_te)])
detector = ExplanationShiftDetector(model=XGBClassifier(), gmodel=XGBClassifier())
detector.fit_pipeline(X_tr, y_tr, X_te, X_ood_tr)
roc_auc_score(y_new, detector.predict_proba(X_new)[:, 1])
# %%
explainer = shap.Explainer(detector.detector, masker=detector.get_explanations(X))
shap_values = explainer(detector.get_explanations(X_ood_te))
# Local Explanations
import matplotlib.pyplot as plt
fig = shap.waterfall_plot(shap_values[0], show=False)
plt.savefig("docs/source/images/breastShapLocal.png")
plt.close()
# Global Explanations
fig = shap.plots.bar(shap_values, show=False)
plt.savefig("docs/source/images/breastShapGlobal.png")
plt.close()
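# %%
# Optional sketch (not part of the tutorial API above): since only the first
# feature was shifted, we expect it to dominate the detector's attributions.
# `shap_values` is the Explanation computed on X_ood_te in the cell above.
mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
print(dict(zip(dataset.feature_names[:5], mean_abs_shap)))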
# %%
# Folktables
from folktables import ACSDataSource, ACSIncome
import pandas as pd
data_source = ACSDataSource(survey_year="2018", horizon="1-Year", survey="person")
ca_data = data_source.get_data(states=["CA"], download=True)
pr_data = data_source.get_data(states=["PR"], download=True)
ca_features, ca_labels, _ = ACSIncome.df_to_pandas(ca_data)
pr_features, pr_labels, _ = ACSIncome.df_to_pandas(pr_data)
# %%
# Split ID and OOD data into train and test
X_tr, X_te, y_tr, y_te = train_test_split(
ca_features, ca_labels, test_size=0.5, random_state=0
)
X_ood_tr, X_ood_te, y_ood_tr, y_ood_te = train_test_split(
pr_features, pr_labels, test_size=0.5, random_state=0
)
X_new = pd.concat([X_te, X_ood_te])
y_new = np.concatenate([np.zeros_like(y_te), np.ones_like(y_ood_te)])
model = XGBClassifier().fit(X_tr, y_tr)
# %%
detector = ExplanationShiftDetector(model=model, gmodel=XGBClassifier())
detector.fit_detector(X_te, X_ood_te)
print(roc_auc_score(y_new, detector.predict_proba(X_new)[:, 1]))
# 0.96
# %%
explainer = shap.Explainer(detector.detector)
shap_values = explainer(detector.get_explanations(X_new))
# Local Explanations
import matplotlib.pyplot as plt
fig = shap.waterfall_plot(shap_values[0], show=False)
plt.savefig("docs/source/images/folksShapLocal.png")
plt.close()
# Global Explanations
fig = shap.plots.bar(shap_values, show=False)
plt.savefig("docs/source/images/folkstShapGlobal.png")
plt.close()
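# %%
# Optional sketch (not part of the skshift API): rank features by how strongly
# they drive the detector, via mean absolute SHAP value. This assumes the
# explanation space keeps one column per ACSIncome feature; `importance` is an
# illustrative name and `shap_values` comes from the cell above.
importance = pd.Series(np.abs(shap_values.values).mean(axis=0), index=X_new.columns)
print(importance.sort_values(ascending=False))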
# %%
# Now choose a different OOD dataset (TX instead of PR)
tx_data = data_source.get_data(states=["TX"], download=True)
tx_features, tx_labels, _ = ACSIncome.df_to_pandas(tx_data)
# %%
# Split data
X_tr, X_te, y_tr, y_te = train_test_split(
ca_features, ca_labels, test_size=0.5, random_state=0
)
X_ood_tr, X_ood_te, y_ood_tr, y_ood_te = train_test_split(
tx_features, tx_labels, test_size=0.5, random_state=0
)
X_new = pd.concat([X_te, X_ood_te])
y_new = np.concatenate([np.zeros_like(y_te), np.ones_like(y_ood_te)])
# %%
detector = ExplanationShiftDetector(model=model, gmodel=XGBClassifier())
detector.fit_detector(X_te, X_ood_te)
print(roc_auc_score(y_new, detector.predict_proba(X_new)[:, 1]))
# 0.82
# %%
explainer = shap.Explainer(detector.detector)
shap_values = explainer(detector.get_explanations(X_new))
# Local Explanations
fig = shap.waterfall_plot(shap_values[0], show=False)
plt.savefig("docs/source/images/folksShapLocalMI.png")
plt.close()
# Global Explanations
fig = shap.plots.bar(shap_values, show=False)
plt.savefig("docs/source/images/folkstShapGlobalMI.png")
plt.close()
# %%
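# Optional sketch (not part of the skshift API): re-run the detector for both
# OOD sources in one loop to compare them directly. The AUCs above suggest the
# model's explanations shift more under CA -> PR (~0.96) than CA -> TX (~0.82).
# Variable names in this cell are illustrative.
for name, feats in [("PR", pr_features), ("TX", tx_features)]:
    _, ood_te_s = train_test_split(feats, test_size=0.5, random_state=0)
    det = ExplanationShiftDetector(model=model, gmodel=XGBClassifier())
    det.fit_detector(X_te, ood_te_s)
    X_eval = pd.concat([X_te, ood_te_s])
    y_eval = np.concatenate([np.zeros(len(X_te)), np.ones(len(ood_te_s))])
    print(name, roc_auc_score(y_eval, det.predict_proba(X_eval)[:, 1]))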