From 6db26a64bd694019bb75f38f0ec22cd72ebf1467 Mon Sep 17 00:00:00 2001 From: DP <10988155+donlapark@users.noreply.github.com> Date: Mon, 5 Dec 2022 23:43:07 +0700 Subject: [PATCH] Add files via upload --- XLabel.py | 1145 ++---------------------------------------- XLabelDL.py | 1376 ++++++--------------------------------------------- 2 files changed, 176 insertions(+), 2345 deletions(-) diff --git a/XLabel.py b/XLabel.py index 8601754..fd30742 100644 --- a/XLabel.py +++ b/XLabel.py @@ -10,35 +10,10 @@ import os import pickle as pkle -from typing import DefaultDict - -from interpret.privacy import (DPExplainableBoostingClassifier, - DPExplainableBoostingRegressor) -from interpret.utils import gen_perf_dicts -from interpret.glassbox.ebm.ebm import (BaseCoreEBM, - EBMExplanation, - EBMPreprocessor) -from interpret.glassbox.ebm.utils import DPUtils, EBMUtils -from interpret.glassbox.ebm.internal import Native -from interpret.glassbox.ebm.postprocessing import multiclass_postprocess -from interpret.utils import unify_data, unify_vector -from interpret.api.base import ExplainerMixin -from interpret.provider.compute import JobLibProvider -from interpret.utils import gen_name_from_class, gen_global_selector, gen_local_selector -import ctypes as ct +from interpret.glassbox.ebm.ebm import ExplainableBoostingClassifier import numpy as np import pandas as pd -from warnings import warn - -from sklearn.base import is_classifier -from sklearn.utils.validation import check_is_fitted -import heapq - -from sklearn.base import ( - BaseEstimator, - ClassifierMixin -) import altair as alt import streamlit as st @@ -52,6 +27,7 @@ st.set_page_config(layout="wide") + def main(): """The main Streamlit app.""" if "configs" not in _state: @@ -62,7 +38,7 @@ def main(): create_config_file() _state["loaded_new_file"] = True - + st.sidebar.write("Current database: " + _state.configs["db_filename"]) st.sidebar.file_uploader( @@ -72,7 +48,7 @@ def main(): accept_multiple_files=False, on_change=update_file ) - + with st.sidebar.form("sidebar"): st.slider( "Number of labels", @@ -117,7 +93,7 @@ def main(): key="threshold" ) - form_cols = st.columns((2.2,1,4)) + form_cols = st.columns((2.2, 1, 4)) form_cols[1].form_submit_button("Sample", on_click=sample_and_predict) if "pages" in _state: @@ -197,9 +173,9 @@ def create_pages(): _state["classes"] = {label: sorted(list(_state.database[label].dropna().unique())) for label in _state.pages} _state["num_to_class"] = {label: dict(enumerate(_state.classes[label])) - for label in _state.pages} + for label in _state.pages} _state["class_to_num"] = {label: {c: i for i, c in enumerate(_state.classes[label])} - for label in _state.pages} + for label in _state.pages} _state.update({ 'counter': 1, @@ -215,8 +191,7 @@ def create_pages(): file_pre, file_ext = os.path.splitext(_state.configs["db_filename"]) try: - with open(file_pre+str(_state.num_labels)+_MODEL - , 'rb') as _file: + with open(file_pre+str(_state.num_labels)+_MODEL, 'rb') as _file: _state["models_params"] = pkle.load(_file) _state["models"] = {} for label in _state.pages: @@ -241,7 +216,7 @@ def initialize_models(): y = _state.database[label].dropna().map(_state.class_to_num[label]) X = subset_features(_state.database, label) X = X.loc[y.index, :] - models[label] = ExplainableBoostingClassifier().fit(X,y) + models[label] = ExplainableBoostingClassifier().fit(X, y) models_params[label] = models[label].__dict__ return models, models_params @@ -295,7 +270,7 @@ def create_config_file(): with open(_CONFIGS_FILE, "w") as _file: 
json.dump(_state.configs, _file, indent=4) - + def display_main_screen(label): """Display predictions and heatmaps on the main screen. @@ -339,7 +314,7 @@ def display_main_screen(label): cols[0].altair_chart(current_plot, use_container_width=True) - prediction = _state.local_results[label][page]['prediction'] + prediction = _state.local_results[label][page]['prediction'] cols[1].radio( "Label", options=_state.classes[label], @@ -350,7 +325,7 @@ def display_main_screen(label): cols[1].write(result) st.markdown("""---""") - label_from_cols = st.columns((4,4,4)) + label_from_cols = st.columns((4, 4, 4)) label_from_cols[1].radio( "Automatically label the remaining data?", @@ -368,15 +343,14 @@ def display_main_screen(label): button_cols[1].button("Previous", on_click=update_previous_click) button_cols[2].button("Next", on_click=update_next_click) - components.html( - f""" + f"""
            <p>{_state.counter}</p>
            <script>
                window.parent.document.querySelector('section.main').scrollTo(0, 0);
            </script>
""", - height=0 + height=0 ) @@ -402,13 +376,13 @@ def plot_all_features(data, title, height, num_rows): plot_list[0] = plot(data.iloc[0: _NUM_FEAT_PER_ROW], title, height) - for i in range(1,num_rows-1): + for i in range(1, num_rows-1): plot_list[i] = plot(data.iloc[_NUM_FEAT_PER_ROW*i: _NUM_FEAT_PER_ROW*(i+1)], "", height) plot_list[-1] = plot(data.iloc[_NUM_FEAT_PER_ROW*(num_rows-1):], - "", - height) + "", + height) obj = alt.vconcat(*plot_list).configure_axis( labelFontSize=13, @@ -418,8 +392,8 @@ def plot_all_features(data, title, height, num_rows): ).configure_title(fontSize=16) return obj - - + + def plot(data, title, height): """Plot each row of the heatmap of EBM's per-instance explanation. @@ -436,10 +410,10 @@ def plot(data, title, height): ) heatmap = base.mark_rect().encode( - color=alt.Color('scores:Q', - scale=alt.Scale(scheme='redblue', reverse=True, domain=[0,1]), - legend=alt.Legend(direction='vertical') - ) + color=alt.Color('scores:Q', + scale=alt.Scale(scheme='redblue', reverse=True, domain=[0,1]), + legend=alt.Legend(direction='vertical') + ) ) # Configure text @@ -513,7 +487,7 @@ def sample_and_predict(): the predictions and explanations in a dictionary. """ st.experimental_memo.clear() - + if _state.loaded_new_file: init_state_params() _state.loaded_new_file = False @@ -558,7 +532,7 @@ def update_and_save(label): """ new_labeled_index = list(_state.local_results[label].keys()) _state.database.loc[new_labeled_index, label] = [_state[label+str(ix)] - for ix in new_labeled_index] + for ix in new_labeled_index] compute_unlabeled_index(new_labeled_index, label) if _state.auto == "Yes": @@ -631,7 +605,7 @@ def generate_explanation(X, label, model): id_idx_pair = dict(zip(X_.index, top_ind)) try: - data_by_class = [X_[ypred==c] for c in _state.classes[label]] + data_by_class = [X_[ypred == c] for c in _state.classes[label]] except KeyError: return @@ -647,12 +621,12 @@ def generate_explanation(X, label, model): else: feature_contrib = [localxi['scores'][k][localxi['perf']['predicted']] for k in range(n_features)] - heatmap_data = pd.DataFrame({'features' : feature_names, - 'values' : localxi['values'][:n_features], - 'scores' : 1/(1+1/np.exp(feature_contrib))}) - heatmap_data = heatmap_data.astype({'features' : str, - 'values' : str, - 'scores' : float}) + heatmap_data = pd.DataFrame({'features': feature_names, + 'values': localxi['values'][:n_features], + 'scores': 1/(1+1/np.exp(feature_contrib))}) + heatmap_data = heatmap_data.astype({'features': str, + 'values': str, + 'scores': float}) current_dict[j] = { 'actual': localxi['perf']['actual'], 'prediction': localxi['perf']['predicted'], @@ -660,1060 +634,5 @@ def generate_explanation(X, label, model): 'data': heatmap_data} -# Copyright (c) 2019 Microsoft Corporation -# Distributed under the MIT software license - -# Make ExplainableBoostingClassifier work on missing values. 
- -class BaseEBM(BaseEstimator): - """Client facing SK EBM.""" - - # Interface modeled after: - # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html - # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html - # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html - # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html - # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html - # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn - # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html - - def __init__( - self, - # Explainer - # - # feature_names in scikit-learn convention should probably be passed in via the fit function. Also, - # we can get feature_names via pandas dataframes, and those would only be known at fit time, so - # we need a version of feature_names_out_ with the underscore to indicate items set at fit time. - # Despite this, we need to recieve a list of feature_names here to be compatible with blackbox explainations - # where we still need to have feature_names, but we do not have a fit function since we explain existing - # models without fitting them ourselves. To conform to a common explaination API we get the feature_names - # here. - feature_names, - # other packages LightGBM, CatBoost, Scikit-Learn (future) are using categorical specific ways to indicate - # feature_types. The benefit to them is that they can accept multiple ways of specifying categoricals like: - # categorical = [true, false, true, true] OR categorical = [1, 4, 8] OR categorical = 'all'/'auto'/'none' - # We're choosing a different route because for visualization we want to be able to express multiple - # different types of data. For example, if the user has data with strings of "low", "medium", "high" - # We want to keep both the ordinal nature of this feature and we wish to preserve the text for visualization - # scikit-learn callers can pre-convert these things to [0, 1, 2] in the correct order because they don't - # need to worry about visualizing the data afterwards, but for us we need a way to specify the strings - # back anyways. So we need some way to express both the categorical nature of features and the order - # mapping. We can do this and more complicated conversions via: - # feature_types = ["categorical", ["low", "medium", "high"], "continuous", "time", "bool"] - feature_types, - # Data - # - # Ensemble - outer_bags, - inner_bags, - # Core - # TODO PK v.3 replace mains in favor of a "boosting stage plan" - mains, - interactions, - validation_size, - max_rounds, - early_stopping_tolerance, - early_stopping_rounds, - # Native - learning_rate, - # Holte, R. C. (1993) "Very simple classification rules perform well on most commonly used datasets" - # says use 6 as the minimum samples https://link.springer.com/content/pdf/10.1023/A:1022631118932.pdf - # TODO PK try setting this (not here, but in our caller) to 6 and run tests to verify the best value. - min_samples_leaf, - max_leaves, - # Overall - n_jobs, - random_state, - # Preprocessor - binning, - max_bins, - max_interaction_bins, - # Differential Privacy - epsilon=None, - delta=None, - composition=None, - bin_budget_frac=None, - privacy_schema=None, - ): - # NOTE: Per scikit-learn convention, we shouldn't attempt to sanity check these inputs here. 
We just - # Store these values for future use. Validate inputs in the fit or other functions. More details in: - # https://scikit-learn.org/stable/developers/develop.html - - # Arguments for explainer - self.feature_names = feature_names - self.feature_types = feature_types - - # Arguments for ensemble - self.outer_bags = outer_bags - self.inner_bags = inner_bags - - # Arguments for EBM beyond training a feature-step. - self.mains = mains - self.interactions = interactions - self.validation_size = validation_size - self.max_rounds = max_rounds - self.early_stopping_tolerance = early_stopping_tolerance - self.early_stopping_rounds = early_stopping_rounds - - # Arguments for internal EBM. - self.learning_rate = learning_rate - self.min_samples_leaf = min_samples_leaf - self.max_leaves = max_leaves - - # Arguments for overall - self.n_jobs = n_jobs - self.random_state = random_state - - # Arguments for preprocessor - self.binning = binning - self.max_bins = max_bins - self.max_interaction_bins = max_interaction_bins - - # Arguments for differential privacy - self.epsilon = epsilon - self.delta = delta - self.composition = composition - self.bin_budget_frac = bin_budget_frac - self.privacy_schema = privacy_schema - - def fit(self, X, y, sample_weight=None): # noqa: C901 - """ Fits model to provided samples. - - Args: - X: Numpy array for training samples. - y: Numpy array as training labels. - sample_weight: Optional array of weights per sample. Should be same length as X and y. - - Returns: - Itself. - """ - - # NOTE: Generally, we want to keep parameters in the __init__ function, since scikit-learn - # doesn't like parameters in the fit function, other than ones like weights that have - # the same length as the number of samples. See: - # https://scikit-learn.org/stable/developers/develop.html - # https://github.com/microsoft/LightGBM/issues/2628#issue-536116395 - # - - - # TODO PK sanity check all our inputs from the __init__ function, and this fit fuction - - # TODO PK we shouldn't expose our internal state until we are 100% sure that we succeeded - # so move everything to local variables until the end when we assign them to self.* - - # TODO PK we should do some basic checks here that X and y have the same dimensions and that - # they are well formed (look for NaNs, etc) - - # TODO PK handle calls where X.dim == 1. This could occur if there was only 1 feature, or if - # there was only 1 sample? We can differentiate either condition via y.dim and reshape - # AND add some tests for the X.dim == 1 scenario - - # TODO PK write an efficient striping converter for X that replaces unify_data for EBMs - # algorithm: grap N columns and convert them to rows then process those by sending them to C - - # TODO: PK don't overwrite self.feature_names here (scikit-learn rules), and it's also confusing to - # user to have their fields overwritten. 
Use feature_names_out_ or something similar - X, y, self.feature_names, _ = unify_data( - X, y, self.feature_names, self.feature_types, missing_data_allowed=True - ) - - # NOTE: Temporary override -- replace before push - w = sample_weight if sample_weight is not None else np.ones_like(y, dtype=np.float64) - w = unify_vector(w).astype(np.float64, casting="unsafe", copy=False) - - # Privacy calculations - if isinstance(self, (DPExplainableBoostingClassifier, DPExplainableBoostingRegressor)): - DPUtils.validate_eps_delta(self.epsilon, self.delta) - DPUtils.validate_DP_EBM(self) - - if self.privacy_schema is None: - warn("Possible privacy violation: assuming min/max values per feature/target are public info." - "Pass a privacy schema with known public ranges to avoid this warning.") - self.privacy_schema = DPUtils.build_privacy_schema(X, y) - - self.domain_size_ = self.privacy_schema['target'][1] - self.privacy_schema['target'][0] - - # Split epsilon, delta budget for binning and learning - bin_eps_ = self.epsilon * self.bin_budget_frac - training_eps_ = self.epsilon - bin_eps_ - bin_delta_ = self.delta / 2 - training_delta_ = self.delta / 2 - - # [DP] Calculate how much noise will be applied to each iteration of the algorithm - if self.composition == 'classic': - self.noise_scale_ = DPUtils.calc_classic_noise_multi( - total_queries = self.max_rounds * X.shape[1] * self.outer_bags, - target_epsilon = training_eps_, - delta = training_delta_, - sensitivity = self.domain_size_ * self.learning_rate * np.max(w) - ) - elif self.composition == 'gdp': - self.noise_scale_ = DPUtils.calc_gdp_noise_multi( - total_queries = self.max_rounds * X.shape[1] * self.outer_bags, - target_epsilon = training_eps_, - delta = training_delta_ - ) - self.noise_scale_ = self.noise_scale_ * self.domain_size_ * self.learning_rate * np.max(w) # Alg Line 17 - else: - raise NotImplementedError(f"Unknown composition method provided: {self.composition}. 
Please use 'gdp' or 'classic'.") - else: - bin_eps_, bin_delta_ = None, None - training_eps_, training_delta_ = None, None - - # Build preprocessor - self.preprocessor_ = EBMPreprocessor( - feature_names=self.feature_names, - feature_types=self.feature_types, - max_bins=self.max_bins, - binning=self.binning, - epsilon=bin_eps_, # Only defined during private training - delta=bin_delta_, - privacy_schema=getattr(self, 'privacy_schema', None) - ) - self.preprocessor_.fit(X) - X_orig = X - X = self.preprocessor_.transform(X_orig) - - features_categorical = np.array([x == "categorical" for x in self.preprocessor_.col_types_], dtype=ct.c_int64) - features_bin_count = np.array([len(x) for x in self.preprocessor_.col_bin_counts_], dtype=ct.c_int64) - - # NOTE: [DP] Passthrough to lower level layers for noise addition - bin_data_counts = {i : self.preprocessor_.col_bin_counts_[i] for i in range(X.shape[1])} - - if self.interactions != 0: - self.pair_preprocessor_ = EBMPreprocessor( - feature_names=self.feature_names, - feature_types=self.feature_types, - max_bins=self.max_interaction_bins, - binning=self.binning, - ) - self.pair_preprocessor_.fit(X_orig) - X_pair = self.pair_preprocessor_.transform(X_orig) - pair_features_categorical = np.array([x == "categorical" for x in self.pair_preprocessor_.col_types_], dtype=ct.c_int64) - pair_features_bin_count = np.array([len(x) for x in self.pair_preprocessor_.col_bin_counts_], dtype=ct.c_int64) - else: - self.pair_preprocessor_, X_pair, pair_features_categorical, pair_features_bin_count = None, None, None, None - - - estimators = [] - seed = EBMUtils.normalize_initial_random_seed(self.random_state) - - native = Native.get_native_singleton() - if is_classifier(self): - self.classes_, y = np.unique(y, return_inverse=True) - self._class_idx_ = {x: index for index, x in enumerate(self.classes_)} - - y = y.astype(np.int64, casting="unsafe", copy=False) - n_classes = len(self.classes_) - if n_classes > 2: # pragma: no cover - warn("Multiclass is still experimental. 
Subject to change per release.") - if n_classes > 2 and self.interactions != 0: - self.interactions = 0 - warn("Detected multiclass problem: forcing interactions to 0") - for i in range(self.outer_bags): - seed=native.generate_random_number(seed, 1416147523) - estimator = BaseCoreEBM( - # Data - model_type="classification", - features_categorical=features_categorical, - features_bin_count=features_bin_count, - pair_features_categorical=pair_features_categorical, - pair_features_bin_count=pair_features_bin_count, - # Core - main_features=self.mains, - interactions=self.interactions, - validation_size=self.validation_size, - max_rounds=self.max_rounds, - early_stopping_tolerance=self.early_stopping_tolerance, - early_stopping_rounds=self.early_stopping_rounds, - # Native - inner_bags=self.inner_bags, - learning_rate=self.learning_rate, - min_samples_leaf=self.min_samples_leaf, - max_leaves=self.max_leaves, - # Overall - random_state=seed, - # Differential Privacy - noise_scale=getattr(self, 'noise_scale_', None), - bin_counts=bin_data_counts, - ) - estimators.append(estimator) - else: - n_classes = -1 - y = y.astype(np.float64, casting="unsafe", copy=False) - for i in range(self.outer_bags): - seed=native.generate_random_number(seed, 1416147523) - estimator = BaseCoreEBM( - # Data - model_type="regression", - features_categorical=features_categorical, - features_bin_count=features_bin_count, - pair_features_categorical=pair_features_categorical, - pair_features_bin_count=pair_features_bin_count, - # Core - main_features=self.mains, - interactions=self.interactions, - validation_size=self.validation_size, - max_rounds=self.max_rounds, - early_stopping_tolerance=self.early_stopping_tolerance, - early_stopping_rounds=self.early_stopping_rounds, - # Native - inner_bags=self.inner_bags, - learning_rate=self.learning_rate, - min_samples_leaf=self.min_samples_leaf, - max_leaves=self.max_leaves, - # Overall - random_state=seed, - # Differential Privacy - noise_scale=getattr(self, 'noise_scale_', None), - bin_counts=bin_data_counts, - ) - estimators.append(estimator) - - # Train base models for main effects, pair detection. 
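# Annotation, not part of the patch: the pair selection that
# select_pairs_from_fast performs a few lines below, shown in isolation.
# Each candidate pair's FAST rank is averaged across outer bags with an
# incremental mean, then the lowest averages win via a min-heap.
# fast_rankings is a toy stand-in for the estimators' inter_indices_.
import heapq

fast_rankings = [[(0, 1), (2, 3)], [(0, 1), (2, 3)]]
pair_ranks = {}
for n, inter_indices in enumerate(fast_rankings):
    for rank, indices in enumerate(inter_indices):
        old_mean = pair_ranks.get(indices, 0)
        pair_ranks[indices] = old_mean + ((rank - old_mean) / (n + 1))

final_ranks = [(mean, indices) for indices, mean in pair_ranks.items()]
heapq.heapify(final_ranks)
print(heapq.heappop(final_ranks))  # best (lowest) average rank: (0.0, (0, 1))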
- - # scikit-learn returns an np.array for classification and - # a single float64 for regression, so we do the same - if is_classifier(self): - self.intercept_ = np.zeros( - Native.get_count_scores_c(n_classes), dtype=np.float64, order="C", - ) - else: - self.intercept_ = np.float64(0) - - provider = JobLibProvider(n_jobs=self.n_jobs) - - train_model_args_iter = ( - (estimators[i], X, y, w, X_pair, n_classes) for i in range(self.outer_bags) - ) - - estimators = provider.parallel(BaseCoreEBM.fit_parallel, train_model_args_iter) - - def select_pairs_from_fast(estimators, n_interactions): - # Average rank from estimators - pair_ranks = {} - - for n, estimator in enumerate(estimators): - for rank, indices in enumerate(estimator.inter_indices_): - old_mean = pair_ranks.get(indices, 0) - pair_ranks[indices] = old_mean + ((rank - old_mean) / (n + 1)) - - final_ranks = [] - total_interactions = 0 - for indices in pair_ranks: - heapq.heappush(final_ranks, (pair_ranks[indices], indices)) - total_interactions += 1 - - n_interactions = min(n_interactions, total_interactions) - top_pairs = [heapq.heappop(final_ranks)[1] for _ in range(n_interactions)] - return top_pairs - - if isinstance(self.interactions, int) and self.interactions > 0: - # Select merged pairs - pair_indices = select_pairs_from_fast(estimators, self.interactions) - - for estimator in estimators: - # Discard initial interactions - new_model = [] - new_feature_groups = [] - for i, feature_group in enumerate(estimator.feature_groups_): - if len(feature_group) != 1: - continue - new_model.append(estimator.model_[i]) - new_feature_groups.append(estimator.feature_groups_[i]) - estimator.model_ = new_model - estimator.feature_groups_ = new_feature_groups - estimator.inter_episode_idx_ = 0 - - if len(pair_indices) != 0: - # Retrain interactions for base models - - staged_fit_args_iter = ( - (estimators[i], X, y, w, X_pair, pair_indices) for i in range(self.outer_bags) - ) - - estimators = provider.parallel(BaseCoreEBM.staged_fit_interactions_parallel, staged_fit_args_iter) - elif isinstance(self.interactions, int) and self.interactions == 0: - pair_indices = [] - elif isinstance(self.interactions, list): - pair_indices = self.interactions - if len(pair_indices) != 0: - # Check and remove duplicate interaction terms - existing_terms = set() - unique_terms = [] - - for i, term in enumerate(pair_indices): - sorted_tuple = tuple(sorted(term)) - if sorted_tuple not in existing_terms: - existing_terms.add(sorted_tuple) - unique_terms.append(term) - - # Warn the users that we have made change to the interactions list - if len(unique_terms) != len(pair_indices): - warn("Detected duplicate interaction terms: removing duplicate interaction terms") - pair_indices = unique_terms - self.interactions = pair_indices - - # Retrain interactions for base models - staged_fit_args_iter = ( - (estimators[i], X, y, w, X_pair, pair_indices) for i in range(self.outer_bags) - ) - - estimators = provider.parallel(BaseCoreEBM.staged_fit_interactions_parallel, staged_fit_args_iter) - else: # pragma: no cover - raise RuntimeError("Argument 'interaction' has invalid value") - - X = np.ascontiguousarray(X.T) - if X_pair is not None: - X_pair = np.ascontiguousarray(X_pair.T) # I have no idea if we're supposed to do this. 
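# Annotation, not part of the patch: what the np.ascontiguousarray(X.T)
# calls above buy. Transposing a NumPy array only flips its strides;
# materializing the transpose hands the native booster one contiguous
# block of memory per feature instead of a strided view.
import numpy as np

X = np.arange(6, dtype=np.float64).reshape(2, 3)  # 2 samples x 3 features
assert not X.T.flags["C_CONTIGUOUS"]              # lazy, strided view
Xt = np.ascontiguousarray(X.T)                    # copied: features x samples
assert Xt.flags["C_CONTIGUOUS"]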
- - if isinstance(self.mains, str) and self.mains == "all": - main_indices = [[x] for x in range(X.shape[0])] - elif isinstance(self.mains, list) and all( - isinstance(x, int) for x in self.mains - ): - main_indices = [[x] for x in self.mains] - else: # pragma: no cover - msg = "Argument 'mains' has invalid value (valid values are 'all'|list): {}".format( - self.mains - ) - raise RuntimeError(msg) - - self.feature_groups_ = main_indices + pair_indices - - self.bagged_models_ = estimators - # Merge estimators into one. - self.additive_terms_ = [] - self.term_standard_deviations_ = [] - for index, _ in enumerate(self.feature_groups_): - log_odds_tensors = [] - for estimator in estimators: - log_odds_tensors.append(estimator.model_[index]) - - averaged_model = np.average(np.array(log_odds_tensors), axis=0) - model_errors = np.std(np.array(log_odds_tensors), axis=0) - - self.additive_terms_.append(averaged_model) - self.term_standard_deviations_.append(model_errors) - - # Get episode indexes for base estimators. - main_episode_idxs = [] - inter_episode_idxs = [] - for estimator in estimators: - main_episode_idxs.append(estimator.main_episode_idx_) - inter_episode_idxs.append(estimator.inter_episode_idx_) - - self.breakpoint_iteration_ = [main_episode_idxs] - if len(pair_indices) != 0: - self.breakpoint_iteration_.append(inter_episode_idxs) - - # Extract feature group names and feature group types. - # TODO PK v.3 don't overwrite feature_names and feature_types. Create new fields called feature_names_out and - # feature_types_out_ or feature_group_names_ and feature_group_types_ - self.feature_names = [] - self.feature_types = [] - for index, feature_indices in enumerate(self.feature_groups_): - feature_group_name = EBMUtils.gen_feature_group_name( - feature_indices, self.preprocessor_.col_names_ - ) - feature_group_type = EBMUtils.gen_feature_group_type( - feature_indices, self.preprocessor_.col_types_ - ) - self.feature_types.append(feature_group_type) - self.feature_names.append(feature_group_name) - - if n_classes <= 2: - if isinstance(self, (DPExplainableBoostingClassifier, DPExplainableBoostingRegressor)): - # DP method of centering graphs can generalize if we log pairwise densities - # No additional privacy loss from this step - # self.additive_terms_ and self.preprocessor_.col_bin_counts_ are noisy and published publicly - self._original_term_means_ = [] - for set_idx in range(len(self.feature_groups_)): - score_mean = np.average(self.additive_terms_[set_idx], weights=self.preprocessor_.col_bin_counts_[set_idx]) - self.additive_terms_[set_idx] = ( - self.additive_terms_[set_idx] - score_mean - ) - - # Add mean center adjustment back to intercept - self.intercept_ += score_mean - self._original_term_means_.append(score_mean) - else: - # Mean center graphs - only for binary classification and regression - scores_gen = EBMUtils.scores_by_feature_group( - X, X_pair, self.feature_groups_, self.additive_terms_ - ) - self._original_term_means_ = [] - - for set_idx, _, scores in scores_gen: - score_mean = np.average(scores, weights=w) - - self.additive_terms_[set_idx] = ( - self.additive_terms_[set_idx] - score_mean - ) - - # Add mean center adjustment back to intercept - self.intercept_ += score_mean - self._original_term_means_.append(score_mean) - else: - # Postprocess model graphs for multiclass - - # Currently pairwise interactions are unsupported for multiclass-classification. 
- binned_predict_proba = lambda x: EBMUtils.classifier_predict_proba( - x, None, self.feature_groups_, self.additive_terms_, self.intercept_ - ) - - postprocessed = multiclass_postprocess( - X, self.additive_terms_, binned_predict_proba, self.feature_types - ) - self.additive_terms_ = postprocessed["feature_graphs"] - self.intercept_ = postprocessed["intercepts"] - - for feature_group_idx, feature_group in enumerate(self.feature_groups_): - entire_tensor = [slice(None, None, None) for i in range(self.additive_terms_[feature_group_idx].ndim)] - for dimension_idx, feature_idx in enumerate(feature_group): - if self.preprocessor_.col_bin_counts_[feature_idx][0] == 0: - zero_dimension = entire_tensor.copy() - zero_dimension[dimension_idx] = 0 - self.additive_terms_[feature_group_idx][tuple(zero_dimension)] = 0 - self.term_standard_deviations_[feature_group_idx][tuple(zero_dimension)] = 0 - - # Generate overall importance - self.feature_importances_ = [] - if isinstance(self, (DPExplainableBoostingClassifier, DPExplainableBoostingRegressor)): - # DP method of generating feature importances can generalize to non-dp if preprocessors start tracking joint distributions - for i in range(len(self.feature_groups_)): - mean_abs_score = np.average(np.abs(self.additive_terms_[i]), weights=self.preprocessor_.col_bin_counts_[i]) - self.feature_importances_.append(mean_abs_score) - else: - scores_gen = EBMUtils.scores_by_feature_group( - X, X_pair, self.feature_groups_, self.additive_terms_ - ) - for set_idx, _, scores in scores_gen: - mean_abs_score = np.mean(np.abs(scores)) - self.feature_importances_.append(mean_abs_score) - - # Generate selector - # TODO PK v.3 shouldn't this be self._global_selector_ ?? - self.global_selector = gen_global_selector( - X_orig, self.feature_names, self.feature_types, None - ) - - self.has_fitted_ = True - return self - - # Select pairs from base models - def _merged_pair_score_fn(self, model_type, X, y, X_pair, feature_groups, model, intercept): - if model_type == "classification": - prob = EBMUtils.classifier_predict_proba( - X, X_pair, feature_groups, model, intercept - ) - return ( - 0 if len(y) == 0 else log_loss(y, prob) - ) # use logloss to conform consistnetly and for multiclass - elif model_type == "regression": - pred = EBMUtils.regressor_predict( - X, X_pair, feature_groups, model, intercept - ) - return 0 if len(y) == 0 else mean_squared_error(y, pred) - else: # pragma: no cover - msg = "Unknown model_type: '{}'.".format(model_type) - raise ValueError(msg) - - def decision_function(self, X): - """ Predict scores from model before calling the link function. - - Args: - X: Numpy array for samples. - - Returns: - The sum of the additive term contributions. - """ - check_is_fitted(self, "has_fitted_") - X_orig, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types, missing_data_allowed=True) - X = self.preprocessor_.transform(X_orig) - X = np.ascontiguousarray(X.T) - - if self.interactions != 0: - X_pair = self.pair_preprocessor_.transform(X_orig) - X_pair = np.ascontiguousarray(X_pair.T) - else: - X_pair = None - - decision_scores = EBMUtils.decision_function( - X, X_pair, self.feature_groups_, self.additive_terms_, self.intercept_ - ) - - return decision_scores - - def explain_global(self, name=None): - """ Provides global explanation for model. - - Args: - name: User-defined explanation name. - - Returns: - An explanation object, - visualizing feature-value pairs as horizontal bar chart. 
- """ - if name is None: - name = gen_name_from_class(self) - - check_is_fitted(self, "has_fitted_") - - # Obtain min/max for model scores - lower_bound = np.inf - upper_bound = -np.inf - for feature_group_index, _ in enumerate(self.feature_groups_): - errors = self.term_standard_deviations_[feature_group_index] - scores = self.additive_terms_[feature_group_index] - - lower_bound = min(lower_bound, np.min(scores - errors)) - upper_bound = max(upper_bound, np.max(scores + errors)) - - bounds = (lower_bound, upper_bound) - - # Add per feature graph - data_dicts = [] - feature_list = [] - density_list = [] - for feature_group_index, feature_indexes in enumerate( - self.feature_groups_ - ): - model_graph = self.additive_terms_[feature_group_index] - - # NOTE: This uses stddev. for bounds, consider issue warnings. - errors = self.term_standard_deviations_[feature_group_index] - - if len(feature_indexes) == 1: - # hack. remove the 0th index which is for missing values - model_graph = model_graph[1:] - errors = errors[1:] - - - bin_labels = self.preprocessor_._get_bin_labels(feature_indexes[0]) - # bin_counts = self.preprocessor_.get_bin_counts( - # feature_indexes[0] - # ) - scores = list(model_graph) - upper_bounds = list(model_graph + errors) - lower_bounds = list(model_graph - errors) - density_dict = { - "names": self.preprocessor_._get_hist_edges(feature_indexes[0]), - "scores": self.preprocessor_._get_hist_counts(feature_indexes[0]), - } - - feature_dict = { - "type": "univariate", - "names": bin_labels, - "scores": scores, - "scores_range": bounds, - "upper_bounds": upper_bounds, - "lower_bounds": lower_bounds, - } - feature_list.append(feature_dict) - density_list.append(density_dict) - - data_dict = { - "type": "univariate", - "names": bin_labels, - "scores": model_graph, - "scores_range": bounds, - "upper_bounds": model_graph + errors, - "lower_bounds": model_graph - errors, - "density": { - "names": self.preprocessor_._get_hist_edges(feature_indexes[0]), - "scores": self.preprocessor_._get_hist_counts( - feature_indexes[0] - ), - }, - } - if is_classifier(self): - data_dict["meta"] = { - "label_names": self.classes_.tolist() # Classes should be numpy array, convert to list. - } - - data_dicts.append(data_dict) - elif len(feature_indexes) == 2: - # hack. remove the 0th index which is for missing values - model_graph = model_graph[1:, 1:] - # errors = errors[1:, 1:] # NOTE: This is commented as it's not used in this branch. 
- - - bin_labels_left = self.pair_preprocessor_._get_bin_labels(feature_indexes[0]) - bin_labels_right = self.pair_preprocessor_._get_bin_labels(feature_indexes[1]) - - feature_dict = { - "type": "interaction", - "left_names": bin_labels_left, - "right_names": bin_labels_right, - "scores": model_graph, - "scores_range": bounds, - } - feature_list.append(feature_dict) - density_list.append({}) - - data_dict = { - "type": "interaction", - "left_names": bin_labels_left, - "right_names": bin_labels_right, - "scores": model_graph, - "scores_range": bounds, - } - data_dicts.append(data_dict) - else: # pragma: no cover - raise Exception("Interactions greater than 2 not supported.") - - overall_dict = { - "type": "univariate", - "names": self.feature_names, - "scores": self.feature_importances_, - } - internal_obj = { - "overall": overall_dict, - "specific": data_dicts, - "mli": [ - { - "explanation_type": "ebm_global", - "value": {"feature_list": feature_list}, - }, - {"explanation_type": "density", "value": {"density": density_list}}, - ], - } - - return EBMExplanation( - "global", - internal_obj, - feature_names=self.feature_names, - feature_types=self.feature_types, - name=name, - selector=self.global_selector, - ) - - def explain_local(self, X, y=None, name=None): - """ Provides local explanations for provided samples. - - Args: - X: Numpy array for X to explain. - y: Numpy vector for y to explain. - name: User-defined explanation name. - - Returns: - An explanation object, visualizing feature-value pairs - for each sample as horizontal bar charts. - """ - - # Produce feature value pairs for each sample. - # Values are the model graph score per respective feature group. - if name is None: - name = gen_name_from_class(self) - - check_is_fitted(self, "has_fitted_") - - X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types, missing_data_allowed=True) - - # Transform y if classifier - if is_classifier(self) and y is not None: - y = np.array([self._class_idx_[el] for el in y]) - - samples = self.preprocessor_.transform(X) - samples = np.ascontiguousarray(samples.T) - - if self.interactions != 0: - pair_samples = self.pair_preprocessor_.transform(X) - pair_samples = np.ascontiguousarray(pair_samples.T) - else: - pair_samples = None - - scores_gen = EBMUtils.scores_by_feature_group( - samples, pair_samples, self.feature_groups_, self.additive_terms_ - ) - - # TODO PK add a test to see if we handle X.ndim == 1 (or should we throw ValueError) - - n_rows = samples.shape[1] - data_dicts = [] - intercept = self.intercept_ - if not is_classifier(self) or len(self.classes_) <= 2: - if isinstance(self.intercept_, np.ndarray) or isinstance( - self.intercept_, list - ): - intercept = intercept[0] - - for _ in range(n_rows): - data_dict = { - "type": "univariate", - "names": [], - "scores": [], - "values": [], - "extra": {"names": ["Intercept"], "scores": [intercept], "values": [1]}, - } - if is_classifier(self): - data_dict["meta"] = { - "label_names": self.classes_.tolist() # Classes should be numpy array, convert to list. 
- } - data_dicts.append(data_dict) - - for set_idx, feature_group, scores in scores_gen: - for row_idx in range(n_rows): - feature_name = self.feature_names[set_idx] - data_dicts[row_idx]["names"].append(feature_name) - data_dicts[row_idx]["scores"].append(scores[row_idx]) - if len(feature_group) == 1: - data_dicts[row_idx]["values"].append( - X[row_idx, feature_group[0]] - ) - else: - data_dicts[row_idx]["values"].append("") - - is_classification = is_classifier(self) - if is_classification: - scores = EBMUtils.classifier_predict_proba( - samples, pair_samples, self.feature_groups_, self.additive_terms_, self.intercept_, - ) - else: - scores = EBMUtils.regressor_predict( - samples, pair_samples, self.feature_groups_, self.additive_terms_, self.intercept_, - ) - - perf_list = [] - perf_dicts = gen_perf_dicts(scores, y, is_classification) - for row_idx in range(n_rows): - perf = None if perf_dicts is None else perf_dicts[row_idx] - perf_list.append(perf) - data_dicts[row_idx]["perf"] = perf - - selector = gen_local_selector(data_dicts, is_classification=is_classification) - - - additive_terms = [] - for feature_group_index, feature_indexes in enumerate(self.feature_groups_): - if len(feature_indexes) == 1: - # hack. remove the 0th index which is for missing values - additive_terms.append(self.additive_terms_[feature_group_index][1:]) - elif len(feature_indexes) == 2: - # hack. remove the 0th index which is for missing values - additive_terms.append(self.additive_terms_[feature_group_index][1:, 1:]) - else: - raise ValueError("only handles 1D/2D") - - internal_obj = { - "overall": None, - "specific": data_dicts, - "mli": [ - { - "explanation_type": "ebm_local", - "value": { - "scores": additive_terms, - "intercept": self.intercept_, - "perf": perf_list, - }, - } - ], - } - internal_obj["mli"].append( - { - "explanation_type": "evaluation_dataset", - "value": {"dataset_x": X, "dataset_y": y}, - } - ) - - return EBMExplanation( - "local", - internal_obj, - feature_names=self.feature_names, - feature_types=self.feature_types, - name=name, - selector=selector, - ) - - -class ExplainableBoostingClassifier(BaseEBM, ClassifierMixin, ExplainerMixin): - """ Explainable Boosting Classifier. The arguments will change in a future release, watch the changelog. """ - - # TODO PK v.3 use underscores here like ClassifierMixin._estimator_type? - available_explanations = ["global", "local"] - explainer_type = "model" - - """ Public facing EBM classifier.""" - - def __init__( - self, - # Explainer - feature_names=None, - feature_types=None, - # Preprocessor - max_bins=256, - max_interaction_bins=32, - binning="quantile", - # Stages - mains="all", - interactions=10, - # Ensemble - outer_bags=8, - inner_bags=0, - # Boosting - learning_rate=0.01, - validation_size=0.15, - early_stopping_rounds=50, - early_stopping_tolerance=1e-4, - max_rounds=5000, - # Trees - min_samples_leaf=2, - max_leaves=3, - # Overall - n_jobs=-2, - random_state=42, - ): - """ Explainable Boosting Classifier. The arguments will change in a future release, watch the changelog. - - Args: - feature_names: List of feature names. - feature_types: List of feature types. - max_bins: Max number of bins per feature for pre-processing stage. - max_interaction_bins: Max number of bins per feature for pre-processing stage on interaction terms. Only used if interactions is non-zero. - binning: Method to bin values for pre-processing. Choose "uniform", "quantile" or "quantile_humanized". - mains: Features to be trained on in main effects stage. 
Either "all" or a list of feature indexes. - interactions: Interactions to be trained on. - Either a list of lists of feature indices, or an integer for number of automatically detected interactions. - Interactions are forcefully set to 0 for multiclass problems. - outer_bags: Number of outer bags. - inner_bags: Number of inner bags. - learning_rate: Learning rate for boosting. - validation_size: Validation set size for boosting. - early_stopping_rounds: Number of rounds of no improvement to trigger early stopping. - early_stopping_tolerance: Tolerance that dictates the smallest delta required to be considered an improvement. - max_rounds: Number of rounds for boosting. - min_samples_leaf: Minimum number of cases for tree splits used in boosting. - max_leaves: Maximum leaf nodes used in boosting. - n_jobs: Number of jobs to run in parallel. - random_state: Random state. - """ - super(ExplainableBoostingClassifier, self).__init__( - # Explainer - feature_names=feature_names, - feature_types=feature_types, - # Preprocessor - max_bins=max_bins, - max_interaction_bins=max_interaction_bins, - binning=binning, - # Stages - mains=mains, - interactions=interactions, - # Ensemble - outer_bags=outer_bags, - inner_bags=inner_bags, - # Boosting - learning_rate=learning_rate, - validation_size=validation_size, - early_stopping_rounds=early_stopping_rounds, - early_stopping_tolerance=early_stopping_tolerance, - max_rounds=max_rounds, - # Trees - min_samples_leaf=min_samples_leaf, - max_leaves=max_leaves, - # Overall - n_jobs=n_jobs, - random_state=random_state, - ) - - # TODO: Throw ValueError like scikit for 1d instead of 2d arrays - def predict_proba(self, X): - """ Probability estimates on provided samples. - - Args: - X: Numpy array for samples. - - Returns: - Probability estimate of sample for each class. - """ - check_is_fitted(self, "has_fitted_") - X_orig, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types, missing_data_allowed=True) - X = self.preprocessor_.transform(X_orig) - X = np.ascontiguousarray(X.T) - - if self.interactions != 0: - X_pair = self.pair_preprocessor_.transform(X_orig) - X_pair = np.ascontiguousarray(X_pair.T) - else: - X_pair = None - - # TODO PK add a test to see if we handle X.ndim == 1 (or should we throw ValueError) - - prob = EBMUtils.classifier_predict_proba( - X, X_pair, self.feature_groups_, self.additive_terms_, self.intercept_ - ) - return prob - - def predict(self, X): - """ Predicts on provided samples. - - Args: - X: Numpy array for samples. - - Returns: - Predicted class label per sample. - """ - check_is_fitted(self, "has_fitted_") - X_orig, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types, missing_data_allowed=True) - X = self.preprocessor_.transform(X_orig) - X = np.ascontiguousarray(X.T) - - if self.interactions != 0: - X_pair = self.pair_preprocessor_.transform(X_orig) - X_pair = np.ascontiguousarray(X_pair.T) - else: - X_pair = None - - # TODO PK add a test to see if we handle X.ndim == 1 (or should we throw ValueError) - - return EBMUtils.classifier_predict( - X, - X_pair, - self.feature_groups_, - self.additive_terms_, - self.intercept_, - self.classes_, - ) - - def predict_and_contrib(self, X, output='probabilities'): - """Predicts on provided samples, returning predictions and explanations for each sample. - - Args: - X: Numpy array for samples. - output: Prediction type to output (i.e. one of 'probabilities', 'logits', 'labels') - - Returns: - Predictions and local explanations for each sample. 
- """ - - allowed_outputs = ['probabilities', 'logits', 'labels'] - if output not in allowed_outputs: - msg = "Argument 'output' has invalid value. Got '{}', expected one of " - + repr(allowed_outputs) - raise ValueError(msg.format(output)) - - check_is_fitted(self, "has_fitted_") - X_orig, _, _, _ = unify_data( - X, None, self.feature_names, self.feature_types, missing_data_allowed=True - ) - X = self.preprocessor_.transform(X_orig) - X = np.ascontiguousarray(X.T) - - if self.interactions != 0: - X_pair = self.pair_preprocessor_.transform(X_orig) - X_pair = np.ascontiguousarray(X_pair.T) - else: - X_pair = None - - return EBMUtils.classifier_predict_and_contrib( - X, - X_pair, - self.feature_groups_, - self.additive_terms_, - self.intercept_, - self.classes_, - output) - - if __name__ == "__main__": main() diff --git a/XLabelDL.py b/XLabelDL.py index deba845..28548fd 100644 --- a/XLabelDL.py +++ b/XLabelDL.py @@ -10,33 +10,10 @@ import os import pickle as pkle -from interpret.privacy import (DPExplainableBoostingClassifier, - DPExplainableBoostingRegressor) -from interpret.utils import gen_perf_dicts -from interpret.glassbox.ebm.ebm import (BaseCoreEBM, - EBMExplanation, - EBMPreprocessor) -from interpret.glassbox.ebm.utils import DPUtils, EBMUtils -from interpret.glassbox.ebm.internal import Native -from interpret.glassbox.ebm.postprocessing import multiclass_postprocess -from interpret.utils import unify_data, unify_vector -from interpret.api.base import ExplainerMixin -from interpret.provider.compute import JobLibProvider -from interpret.utils import gen_name_from_class, gen_global_selector, gen_local_selector -import ctypes as ct +from interpret.glassbox.ebm.ebm import ExplainableBoostingClassifier import numpy as np import pandas as pd -from warnings import warn - -from sklearn.base import is_classifier -from sklearn.utils.validation import check_is_fitted -import heapq - -from sklearn.base import ( - BaseEstimator, - ClassifierMixin -) import altair as alt import streamlit as st @@ -50,6 +27,7 @@ st.set_page_config(layout="wide") + def main(): """The main Streamlit app.""" if "configs" not in _state: @@ -63,59 +41,48 @@ def main(): st.sidebar.write("Current database: " + _state.configs["db_filename"]) - st.sidebar.file_uploader( - "Upload a CSV or Excel file", - type=["csv", "xlsx", "xls"], - key="uploaded_files", - accept_multiple_files=False, - on_change=update_file - ) + st.sidebar.file_uploader("Upload a CSV or Excel file", + type=["csv", "xlsx", "xls"], + key="uploaded_files", + accept_multiple_files=False, + on_change=update_file) with st.sidebar.form("sidebar"): - st.slider( - "Number of labels", - min_value=1, - max_value=20, - value=_state.configs["sidebar"]["num_labels"], - step=1, - key="num_labels" - ) - - st.selectbox( - "Include data with label/prediction mismatches?", - ("Yes", "No"), - key="relabel", - index=("Yes", "No").index(_state.configs["sidebar"]["relabel"]) - ) - - st.selectbox( - "Sampling mode", - ("Fixed sample size", "Confidence threshold"), - key="mode", - index=("Fixed sample size", - "Confidence threshold").index(_state.configs["sidebar"]["mode"]) - ) - - st.slider( - "Sample size (for \"Fixed sample size\" mode)", - min_value=1, - max_value=500, - value=_state.configs["sidebar"]["n_samples"], - step=1, - key="n_samples" - ) - - st.slider( - "Threshold (for \"Confidence threshold\" mode)", - min_value=0.00, - max_value=1.00, - value=_state.configs["sidebar"]["threshold"], - step=0.01, - format="%.2f", - key="threshold" - ) - - form_cols = 
st.columns((2.2,1,4)) + st.slider("Number of labels", + min_value=1, + max_value=20, + value=_state.configs["sidebar"]["num_labels"], + step=1, + key="num_labels") + + st.selectbox("Include data with label/prediction mismatches?", + ("Yes", "No"), + key="relabel", + index=("Yes", + "No").index(_state.configs["sidebar"]["relabel"])) + + st.selectbox("Sampling mode", + ("Fixed sample size", "Confidence threshold"), + key="mode", + index=("Fixed sample size", "Confidence threshold").index( + _state.configs["sidebar"]["mode"])) + + st.slider("Sample size (for \"Fixed sample size\" mode)", + min_value=1, + max_value=500, + value=_state.configs["sidebar"]["n_samples"], + step=1, + key="n_samples") + + st.slider("Threshold (for \"Confidence threshold\" mode)", + min_value=0.00, + max_value=1.00, + value=_state.configs["sidebar"]["threshold"], + step=0.01, + format="%.2f", + key="threshold") + + form_cols = st.columns((2.2, 1, 4)) form_cols[1].form_submit_button("Sample", on_click=sample_and_predict) if "pages" in _state: @@ -123,8 +90,7 @@ def main(): _state.pages, key="label_page", index=_state.next_clicked, - on_change=update_counter - ) + on_change=update_counter) label = _state.label_page display_main_screen(label) @@ -134,12 +100,10 @@ def main(): if "database" in _state: data, mime = convert_to_downloadable(_state.database, file_ext) - st.sidebar.download_button( - label="Download labeled data", - data=data, - file_name=filename, - mime=mime - ) + st.sidebar.download_button(label="Download labeled data", + data=data, + file_name=filename, + mime=mime) def update_file(): @@ -205,12 +169,19 @@ def create_pages(): one label per page. """ _state["pages"] = _state.database.columns[-_state.num_labels:] - _state["classes"] = {label: sorted(list(_state.database[label].dropna().unique())) - for label in _state.pages} - _state["num_to_class"] = {label: dict(enumerate(_state.classes[label])) - for label in _state.pages} - _state["class_to_num"] = {label: {c: i for i, c in enumerate(_state.classes[label])} - for label in _state.pages} + _state["classes"] = { + label: sorted(list(_state.database[label].dropna().unique())) + for label in _state.pages + } + _state["num_to_class"] = { + label: dict(enumerate(_state.classes[label])) + for label in _state.pages + } + _state["class_to_num"] = { + label: {c: i + for i, c in enumerate(_state.classes[label])} + for label in _state.pages + } _state.update({ 'counter': 1, @@ -226,13 +197,13 @@ def create_pages(): file_pre, file_ext = os.path.splitext(_state.configs["db_filename"]) try: - with open(file_pre+str(_state.num_labels)+_MODEL - , 'rb') as _file: + with open(file_pre + str(_state.num_labels) + _MODEL, 'rb') as _file: _state["models_params"] = pkle.load(_file) _state["models"] = {} for label in _state.pages: _state.models[label] = ExplainableBoostingClassifier() - _state.models[label].__dict__.update(_state.models_params[label]) + _state.models[label].__dict__.update( + _state.models_params[label]) except FileNotFoundError: _state["models"], _state["models_params"] = initialize_models() @@ -252,7 +223,7 @@ def initialize_models(): y = _state.database[label].dropna().map(_state.class_to_num[label]) X = subset_features(_state.database, label) X = X.loc[y.index, :] - models[label] = ExplainableBoostingClassifier().fit(X,y) + models[label] = ExplainableBoostingClassifier().fit(X, y) models_params[label] = models[label].__dict__ return models, models_params @@ -283,11 +254,14 @@ def compute_unlabeled_index(new_labeled_index=None, label=None): label: The column 
name of the new labels. """ if new_labeled_index is not None: - _state.unlabeled_index[label] = _state.unlabeled_index[label].difference(new_labeled_index) + _state.unlabeled_index[label] = _state.unlabeled_index[ + label].difference(new_labeled_index) else: all_index = _state.database.index - _state.unlabeled_index = {label: all_index[_state.database[label].isna()] - for label in _state.pages} + _state.unlabeled_index = { + label: all_index[_state.database[label].isna()] + for label in _state.pages + } def create_config_file(): @@ -342,7 +316,7 @@ def display_main_screen(label): """ main_cols = st.columns((4, 4, 4)) if _state.unlabeled_index[label].empty: - main_cols[1].write("All "+label+" data are labeled.") + main_cols[1].write("All " + label + " data are labeled.") else: with st.form("Label form"): if _state.local_results[label] == {}: @@ -357,13 +331,14 @@ def display_main_screen(label): num_features = _state.database.shape[1] - _state.num_labels else: num_features = len(input_features[label]) - num_heatmap_rows = math.ceil(num_features/_NUM_FEAT_PER_ROW) + num_heatmap_rows = math.ceil(num_features / _NUM_FEAT_PER_ROW) for page in _state.local_results[label]: - current_plot = plot_all_features(_state.local_results[label][page]['data'], - title=str(page), - height=50, - num_rows=num_heatmap_rows) + current_plot = plot_all_features( + _state.local_results[label][page]['data'], + title=str(page), + height=50, + num_rows=num_heatmap_rows) cols = st.columns((6, 1)) #with cols[0]: # if _state.text1 is not None: @@ -371,47 +346,42 @@ def display_main_screen(label): # if _state.text2 is not None: # st.write(_state.data[_state.text2][page]) - cols[0].altair_chart(current_plot, use_container_width=True) + cols[0].altair_chart(current_plot, + use_container_width=True) - prediction = _state.local_results[label][page]['prediction'] - cols[1].radio( - "Label", - options=_state.classes[label], - key=label+str(page), - index=int(prediction)) + prediction = _state.local_results[label][page][ + 'prediction'] + cols[1].radio("Label", + options=_state.classes[label], + key=label + str(page), + index=int(prediction)) results = report_results(page, label) for result in results: cols[1].write(result) st.markdown("""---""") - label_from_cols = st.columns((4,4,4)) + label_from_cols = st.columns((4, 4, 4)) - label_from_cols[1].radio( - "Automatically label the remaining data?", - ("Yes", "No"), - index=1, - key="auto" - ) + label_from_cols[1].radio("Automatically label the remaining data?", + ("Yes", "No"), + index=1, + key="auto") label_from_cols[1].form_submit_button("Submit Labels", on_click=update_and_save, - args=(label,) - ) + args=(label, )) button_cols = st.columns((3, 1, 1, 4)) button_cols[1].button("Previous", on_click=update_previous_click) button_cols[2].button("Next", on_click=update_next_click) - - components.html( - f""" + components.html(f"""
            <p>{_state.counter}</p>
            <script>
                window.parent.document.querySelector('section.main').scrollTo(0, 0);
            </script>
""", - height=0 - ) + height=0) @st.experimental_memo @@ -427,29 +397,21 @@ def plot_all_features(data, title, height, num_rows): Returns: obj: An Altair plot object. """ - plot_list = [None]*num_rows + plot_list = [None] * num_rows if num_rows == 1: - plot_list[0] = plot(data, - title, - height) + plot_list[0] = plot(data, title, height) else: - plot_list[0] = plot(data.iloc[0: _NUM_FEAT_PER_ROW], - title, - height) - for i in range(1,num_rows-1): - plot_list[i] = plot(data.iloc[_NUM_FEAT_PER_ROW*i: _NUM_FEAT_PER_ROW*(i+1)], - "", - height) - plot_list[-1] = plot(data.iloc[_NUM_FEAT_PER_ROW*(num_rows-1):], - "", - height) + plot_list[0] = plot(data.iloc[0:_NUM_FEAT_PER_ROW], title, height) + for i in range(1, num_rows - 1): + plot_list[i] = plot( + data.iloc[_NUM_FEAT_PER_ROW * i:_NUM_FEAT_PER_ROW * (i + 1)], + "", height) + plot_list[-1] = plot(data.iloc[_NUM_FEAT_PER_ROW * (num_rows - 1):], + "", height) obj = alt.vconcat(*plot_list).configure_axis( - labelFontSize=13, - titleFontSize=16, - labelAngle=0, - title=None - ).configure_title(fontSize=16) + labelFontSize=13, titleFontSize=16, labelAngle=0, + title=None).configure_title(fontSize=16) return obj @@ -465,31 +427,21 @@ def plot(data, title, height): Returns: obj: An Altair plot object. """ - base = alt.Chart(data).encode( - x=alt.X('features', sort=None) - ) + base = alt.Chart(data).encode(x=alt.X('features', sort=None)) - heatmap = base.mark_rect().encode( - color=alt.Color('scores:Q', - scale=alt.Scale(scheme='redblue', reverse=True, domain=[0,1]), - legend=alt.Legend(direction='vertical') - ) - ) + heatmap = base.mark_rect().encode(color=alt.Color( + 'scores:Q', + scale=alt.Scale(scheme='redblue', reverse=True, domain=[0, 1]), + legend=alt.Legend(direction='vertical'))) # Configure text text = base.mark_text(baseline='middle', fontSize=14).encode( text='values:N', color=alt.condition( (alt.datum.scores > 0.8) | (alt.datum.scores < 0.2), - alt.value('white'), - alt.value('black') - ) - ) + alt.value('white'), alt.value('black'))) - obj = (heatmap+text).properties(height=height, - width=650, - title=title - ) + obj = (heatmap + text).properties(height=height, width=650, title=title) return obj @@ -520,7 +472,7 @@ def update_previous_click(): """Track the index of the previous page.""" _state.next_clicked -= 1 if _state.next_clicked == -1: - _state.next_clicked = len(_state.pages)-1 + _state.next_clicked = len(_state.pages) - 1 _state.counter += 1 @@ -591,8 +543,9 @@ def update_and_save(label): label: the column name of the label. 
""" new_labeled_index = list(_state.local_results[label].keys()) - _state.database.loc[new_labeled_index, label] = [_state[label+str(ix)] - for ix in new_labeled_index] + _state.database.loc[new_labeled_index, label] = [ + _state[label + str(ix)] for ix in new_labeled_index + ] compute_unlabeled_index(new_labeled_index, label) if _state.auto == "Yes": @@ -602,7 +555,8 @@ def update_and_save(label): _state.unlabeled_index[label] = pd.Index([]) labeled_index = _state.database.index else: - labeled_index = _state.database.index.difference(_state.unlabeled_index[label]) + labeled_index = _state.database.index.difference( + _state.unlabeled_index[label]) X = subset_features(_state.database, label) X_train = X.loc[labeled_index, :] @@ -614,7 +568,7 @@ def update_and_save(label): filename = _state.configs["db_filename"] file_pre, file_ext = os.path.splitext(filename) - with open(file_pre+str(_state.num_labels)+_MODEL, 'wb') as _file: + with open(file_pre + str(_state.num_labels) + _MODEL, 'wb') as _file: pkle.dump(_state.models_params, _file, protocol=pkle.HIGHEST_PROTOCOL) _state.local_results[label] = {} @@ -640,18 +594,21 @@ def generate_explanation(X, label, model): n_features = X.shape[1] localx = model.explain_local(X)._internal_obj['specific'] - ypred = np.array([_state.num_to_class[label][localx[j]['perf']['predicted']] - for j in range(n_samples)]) + ypred = np.array([ + _state.num_to_class[label][localx[j]['perf']['predicted']] + for j in range(n_samples) + ]) _state.predictions.loc[X.index, label] = ypred y = _state.database.loc[X.index, label] - p = np.array([localx[j]['perf']['predicted_score'] for j in range(n_samples)]) + p = np.array( + [localx[j]['perf']['predicted_score'] for j in range(n_samples)]) scores = np.minimum(p, (pd.isnull(y) | (ypred == y))) if _state.mode == "Confidence threshold": top_ind = np.where(scores <= _state.threshold)[0] else: - n_samples = np.minimum(_state.n_samples, scores.shape[0]-1) + n_samples = np.minimum(_state.n_samples, scores.shape[0] - 1) top_ind = np.argpartition(scores, n_samples)[:n_samples] X_ = X.iloc[top_ind, :].copy() @@ -660,7 +617,7 @@ def generate_explanation(X, label, model): id_idx_pair = dict(zip(X_.index, top_ind)) try: - data_by_class = [X_[ypred==c] for c in _state.classes[label]] + data_by_class = [X_[ypred == c] for c in _state.classes[label]] except KeyError: return @@ -674,1074 +631,29 @@ def generate_explanation(X, label, model): if len(_state.classes[label]) == 2: feature_contrib = localxi['scores'][:n_features] else: - feature_contrib = [localxi['scores'][k][localxi['perf']['predicted']] - for k in range(n_features)] - heatmap_data = pd.DataFrame({'features' : feature_names, - 'values' : localxi['values'][:n_features], - 'scores' : 1/(1+1/np.exp(feature_contrib))}) - heatmap_data = heatmap_data.astype({'features' : str, - 'values' : str, - 'scores' : float}) + feature_contrib = [ + localxi['scores'][k][localxi['perf']['predicted']] + for k in range(n_features) + ] + heatmap_data = pd.DataFrame({ + 'features': + feature_names, + 'values': + localxi['values'][:n_features], + 'scores': + 1 / (1 + 1 / np.exp(feature_contrib)) + }) + heatmap_data = heatmap_data.astype({ + 'features': str, + 'values': str, + 'scores': float + }) current_dict[j] = { 'actual': localxi['perf']['actual'], 'prediction': localxi['perf']['predicted'], 'confidence': localxi['perf']['predicted_score'], - 'data': heatmap_data} - - -# Copyright (c) 2019 Microsoft Corporation -# Distributed under the MIT software license - -# Modified the origina code to 
-
-class BaseEBM(BaseEstimator):
-    """Client facing SK EBM."""
-
-    # Interface modeled after:
-    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html
-    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
-    # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
-    # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
-    # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
-    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
-    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html
-
-    def __init__(
-        self,
-        # Explainer
-        #
-        # feature_names in scikit-learn convention should probably be passed in via the fit function. Also,
-        # we can get feature_names via pandas dataframes, and those would only be known at fit time, so
-        # we need a version of feature_names_out_ with the underscore to indicate items set at fit time.
-        # Despite this, we need to receive a list of feature_names here to be compatible with blackbox explanations
-        # where we still need to have feature_names, but we do not have a fit function since we explain existing
-        # models without fitting them ourselves. To conform to a common explanation API we get the feature_names
-        # here.
-        feature_names,
-        # other packages LightGBM, CatBoost, Scikit-Learn (future) are using categorical specific ways to indicate
-        # feature_types. The benefit to them is that they can accept multiple ways of specifying categoricals like:
-        # categorical = [true, false, true, true] OR categorical = [1, 4, 8] OR categorical = 'all'/'auto'/'none'
-        # We're choosing a different route because for visualization we want to be able to express multiple
-        # different types of data. For example, if the user has data with strings of "low", "medium", "high"
-        # We want to keep both the ordinal nature of this feature and we wish to preserve the text for visualization
-        # scikit-learn callers can pre-convert these things to [0, 1, 2] in the correct order because they don't
-        # need to worry about visualizing the data afterwards, but for us we need a way to specify the strings
-        # back anyways. So we need some way to express both the categorical nature of features and the order
-        # mapping. We can do this and more complicated conversions via:
-        # feature_types = ["categorical", ["low", "medium", "high"], "continuous", "time", "bool"]
-        feature_types,
-        # Data
-        #
-        # Ensemble
-        outer_bags,
-        inner_bags,
-        # Core
-        # TODO PK v.3 replace mains in favor of a "boosting stage plan"
-        mains,
-        interactions,
-        validation_size,
-        max_rounds,
-        early_stopping_tolerance,
-        early_stopping_rounds,
-        # Native
-        learning_rate,
-        # Holte, R. C. (1993) "Very simple classification rules perform well on most commonly used datasets"
-        # says use 6 as the minimum samples https://link.springer.com/content/pdf/10.1023/A:1022631118932.pdf
-        # TODO PK try setting this (not here, but in our caller) to 6 and run tests to verify the best value.
-        min_samples_leaf,
-        max_leaves,
-        # Overall
-        n_jobs,
-        random_state,
-        # Preprocessor
-        binning,
-        max_bins,
-        max_interaction_bins,
-        # Differential Privacy
-        epsilon=None,
-        delta=None,
-        composition=None,
-        bin_budget_frac=None,
-        privacy_schema=None,
-    ):
-        # NOTE: Per scikit-learn convention, we shouldn't attempt to sanity check these inputs here. We just
-        # store these values for future use. Validate inputs in the fit or other functions. More details in:
-        # https://scikit-learn.org/stable/developers/develop.html
-
-        # Arguments for explainer
-        self.feature_names = feature_names
-        self.feature_types = feature_types
-
-        # Arguments for ensemble
-        self.outer_bags = outer_bags
-        self.inner_bags = inner_bags
-
-        # Arguments for EBM beyond training a feature-step.
-        self.mains = mains
-        self.interactions = interactions
-        self.validation_size = validation_size
-        self.max_rounds = max_rounds
-        self.early_stopping_tolerance = early_stopping_tolerance
-        self.early_stopping_rounds = early_stopping_rounds
-
-        # Arguments for internal EBM.
-        self.learning_rate = learning_rate
-        self.min_samples_leaf = min_samples_leaf
-        self.max_leaves = max_leaves
-
-        # Arguments for overall
-        self.n_jobs = n_jobs
-        self.random_state = random_state
-
-        # Arguments for preprocessor
-        self.binning = binning
-        self.max_bins = max_bins
-        self.max_interaction_bins = max_interaction_bins
-
-        # Arguments for differential privacy
-        self.epsilon = epsilon
-        self.delta = delta
-        self.composition = composition
-        self.bin_budget_frac = bin_budget_frac
-        self.privacy_schema = privacy_schema
-
-    def fit(self, X, y, sample_weight=None):  # noqa: C901
-        """ Fits model to provided samples.
-
-        Args:
-            X: Numpy array for training samples.
-            y: Numpy array as training labels.
-            sample_weight: Optional array of weights per sample. Should be same length as X and y.
-
-        Returns:
-            Itself.
-        """
-
-        # NOTE: Generally, we want to keep parameters in the __init__ function, since scikit-learn
-        # doesn't like parameters in the fit function, other than ones like weights that have
-        # the same length as the number of samples. See:
-        # https://scikit-learn.org/stable/developers/develop.html
-        # https://github.com/microsoft/LightGBM/issues/2628#issue-536116395
-        #
-
-        # TODO PK sanity check all our inputs from the __init__ function, and this fit function
-
-        # TODO PK we shouldn't expose our internal state until we are 100% sure that we succeeded
-        # so move everything to local variables until the end when we assign them to self.*
-
-        # TODO PK we should do some basic checks here that X and y have the same dimensions and that
-        # they are well formed (look for NaNs, etc)
-
-        # TODO PK handle calls where X.dim == 1. This could occur if there was only 1 feature, or if
-        # there was only 1 sample? We can differentiate either condition via y.dim and reshape
-        # AND add some tests for the X.dim == 1 scenario
-
-        # TODO PK write an efficient striping converter for X that replaces unify_data for EBMs
-        # algorithm: grab N columns and convert them to rows then process those by sending them to C
-
-        # TODO: PK don't overwrite self.feature_names here (scikit-learn rules), and it's also confusing to
-        # user to have their fields overwritten. Use feature_names_out_ or something similar
-        X, y, self.feature_names, _ = unify_data(
-            X, y, self.feature_names, self.feature_types, missing_data_allowed=True
-        )
-
-        # NOTE: Temporary override -- replace before push
-        w = sample_weight if sample_weight is not None else np.ones_like(y, dtype=np.float64)
-        w = unify_vector(w).astype(np.float64, casting="unsafe", copy=False)
-
-        # Privacy calculations
-        if isinstance(self, (DPExplainableBoostingClassifier, DPExplainableBoostingRegressor)):
-            DPUtils.validate_eps_delta(self.epsilon, self.delta)
-            DPUtils.validate_DP_EBM(self)
-
-            if self.privacy_schema is None:
-                warn("Possible privacy violation: assuming min/max values per feature/target are public info."
-                     "Pass a privacy schema with known public ranges to avoid this warning.")
-                self.privacy_schema = DPUtils.build_privacy_schema(X, y)
-
-            self.domain_size_ = self.privacy_schema['target'][1] - self.privacy_schema['target'][0]
-
-            # Split epsilon, delta budget for binning and learning
-            bin_eps_ = self.epsilon * self.bin_budget_frac
-            training_eps_ = self.epsilon - bin_eps_
-            bin_delta_ = self.delta / 2
-            training_delta_ = self.delta / 2
-
-            # [DP] Calculate how much noise will be applied to each iteration of the algorithm
-            if self.composition == 'classic':
-                self.noise_scale_ = DPUtils.calc_classic_noise_multi(
-                    total_queries = self.max_rounds * X.shape[1] * self.outer_bags,
-                    target_epsilon = training_eps_,
-                    delta = training_delta_,
-                    sensitivity = self.domain_size_ * self.learning_rate * np.max(w)
-                )
-            elif self.composition == 'gdp':
-                self.noise_scale_ = DPUtils.calc_gdp_noise_multi(
-                    total_queries = self.max_rounds * X.shape[1] * self.outer_bags,
-                    target_epsilon = training_eps_,
-                    delta = training_delta_
-                )
-                self.noise_scale_ = self.noise_scale_ * self.domain_size_ * self.learning_rate * np.max(w)  # Alg Line 17
-            else:
-                raise NotImplementedError(f"Unknown composition method provided: {self.composition}. Please use 'gdp' or 'classic'.")
-        else:
-            bin_eps_, bin_delta_ = None, None
-            training_eps_, training_delta_ = None, None
-
-        # Build preprocessor
-        self.preprocessor_ = EBMPreprocessor(
-            feature_names=self.feature_names,
-            feature_types=self.feature_types,
-            max_bins=self.max_bins,
-            binning=self.binning,
-            epsilon=bin_eps_,  # Only defined during private training
-            delta=bin_delta_,
-            privacy_schema=getattr(self, 'privacy_schema', None)
-        )
-        self.preprocessor_.fit(X)
-        X_orig = X
-        X = self.preprocessor_.transform(X_orig)
-
-        features_categorical = np.array([x == "categorical" for x in self.preprocessor_.col_types_], dtype=ct.c_int64)
-        features_bin_count = np.array([len(x) for x in self.preprocessor_.col_bin_counts_], dtype=ct.c_int64)
-
-        # NOTE: [DP] Passthrough to lower level layers for noise addition
-        bin_data_counts = {i : self.preprocessor_.col_bin_counts_[i] for i in range(X.shape[1])}
-
-        if self.interactions != 0:
-            self.pair_preprocessor_ = EBMPreprocessor(
-                feature_names=self.feature_names,
-                feature_types=self.feature_types,
-                max_bins=self.max_interaction_bins,
-                binning=self.binning,
-            )
-            self.pair_preprocessor_.fit(X_orig)
-            X_pair = self.pair_preprocessor_.transform(X_orig)
-            pair_features_categorical = np.array([x == "categorical" for x in self.pair_preprocessor_.col_types_], dtype=ct.c_int64)
-            pair_features_bin_count = np.array([len(x) for x in self.pair_preprocessor_.col_bin_counts_], dtype=ct.c_int64)
-        else:
-            self.pair_preprocessor_, X_pair, pair_features_categorical, pair_features_bin_count = None, None, None, None
-
-        estimators = []
-        seed = EBMUtils.normalize_initial_random_seed(self.random_state)
-
-        native = Native.get_native_singleton()
-        if is_classifier(self):
-            self.classes_, y = np.unique(y, return_inverse=True)
-            self._class_idx_ = {x: index for index, x in enumerate(self.classes_)}
-
-            y = y.astype(np.int64, casting="unsafe", copy=False)
-            n_classes = len(self.classes_)
-            if n_classes > 2:  # pragma: no cover
-                warn("Multiclass is still experimental. Subject to change per release.")
-            if n_classes > 2 and self.interactions != 0:
-                self.interactions = 0
-                warn("Detected multiclass problem: forcing interactions to 0")
-            for i in range(self.outer_bags):
-                seed=native.generate_random_number(seed, 1416147523)
-                estimator = BaseCoreEBM(
-                    # Data
-                    model_type="classification",
-                    features_categorical=features_categorical,
-                    features_bin_count=features_bin_count,
-                    pair_features_categorical=pair_features_categorical,
-                    pair_features_bin_count=pair_features_bin_count,
-                    # Core
-                    main_features=self.mains,
-                    interactions=self.interactions,
-                    validation_size=self.validation_size,
-                    max_rounds=self.max_rounds,
-                    early_stopping_tolerance=self.early_stopping_tolerance,
-                    early_stopping_rounds=self.early_stopping_rounds,
-                    # Native
-                    inner_bags=self.inner_bags,
-                    learning_rate=self.learning_rate,
-                    min_samples_leaf=self.min_samples_leaf,
-                    max_leaves=self.max_leaves,
-                    # Overall
-                    random_state=seed,
-                    # Differential Privacy
-                    noise_scale=getattr(self, 'noise_scale_', None),
-                    bin_counts=bin_data_counts,
-                )
-                estimators.append(estimator)
-        else:
-            n_classes = -1
-            y = y.astype(np.float64, casting="unsafe", copy=False)
-            for i in range(self.outer_bags):
-                seed=native.generate_random_number(seed, 1416147523)
-                estimator = BaseCoreEBM(
-                    # Data
-                    model_type="regression",
-                    features_categorical=features_categorical,
-                    features_bin_count=features_bin_count,
-                    pair_features_categorical=pair_features_categorical,
-                    pair_features_bin_count=pair_features_bin_count,
-                    # Core
-                    main_features=self.mains,
-                    interactions=self.interactions,
-                    validation_size=self.validation_size,
-                    max_rounds=self.max_rounds,
-                    early_stopping_tolerance=self.early_stopping_tolerance,
-                    early_stopping_rounds=self.early_stopping_rounds,
-                    # Native
-                    inner_bags=self.inner_bags,
-                    learning_rate=self.learning_rate,
-                    min_samples_leaf=self.min_samples_leaf,
-                    max_leaves=self.max_leaves,
-                    # Overall
-                    random_state=seed,
-                    # Differential Privacy
-                    noise_scale=getattr(self, 'noise_scale_', None),
-                    bin_counts=bin_data_counts,
-                )
-                estimators.append(estimator)
-
-        # Train base models for main effects, pair detection.
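The two loops above differ only in model_type; both chain one seed per outer bag through the native PRNG so bags are decorrelated yet reproducible from a single random_state. A standalone sketch of that seeding pattern, with a hypothetical mixer standing in for Native.generate_random_number:

def derive_bag_seeds(initial_seed, n_bags, salt=1416147523):
    # mix() is a hypothetical stand-in for Native.generate_random_number;
    # any deterministic 64-bit mixer reproduces the chaining behavior.
    def mix(seed, s):
        return (seed * 6364136223846793005 + s) & 0xFFFFFFFFFFFFFFFF

    seeds, seed = [], initial_seed
    for _ in range(n_bags):
        seed = mix(seed, salt)   # each bag's seed derives from the previous
        seeds.append(seed)
    return seeds

print(derive_bag_seeds(42, 4))   # four distinct, reproducible bag seeds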
-
-        # scikit-learn returns an np.array for classification and
-        # a single float64 for regression, so we do the same
-        if is_classifier(self):
-            self.intercept_ = np.zeros(
-                Native.get_count_scores_c(n_classes), dtype=np.float64, order="C",
-            )
-        else:
-            self.intercept_ = np.float64(0)
-
-        provider = JobLibProvider(n_jobs=self.n_jobs)
-
-        train_model_args_iter = (
-            (estimators[i], X, y, w, X_pair, n_classes) for i in range(self.outer_bags)
-        )
-
-        estimators = provider.parallel(BaseCoreEBM.fit_parallel, train_model_args_iter)
-
-        def select_pairs_from_fast(estimators, n_interactions):
-            # Average rank from estimators
-            pair_ranks = {}
-
-            for n, estimator in enumerate(estimators):
-                for rank, indices in enumerate(estimator.inter_indices_):
-                    old_mean = pair_ranks.get(indices, 0)
-                    pair_ranks[indices] = old_mean + ((rank - old_mean) / (n + 1))
-
-            final_ranks = []
-            total_interactions = 0
-            for indices in pair_ranks:
-                heapq.heappush(final_ranks, (pair_ranks[indices], indices))
-                total_interactions += 1
-
-            n_interactions = min(n_interactions, total_interactions)
-            top_pairs = [heapq.heappop(final_ranks)[1] for _ in range(n_interactions)]
-            return top_pairs
-
-        if isinstance(self.interactions, int) and self.interactions > 0:
-            # Select merged pairs
-            pair_indices = select_pairs_from_fast(estimators, self.interactions)
-
-            for estimator in estimators:
-                # Discard initial interactions
-                new_model = []
-                new_feature_groups = []
-                for i, feature_group in enumerate(estimator.feature_groups_):
-                    if len(feature_group) != 1:
-                        continue
-                    new_model.append(estimator.model_[i])
-                    new_feature_groups.append(estimator.feature_groups_[i])
-                estimator.model_ = new_model
-                estimator.feature_groups_ = new_feature_groups
-                estimator.inter_episode_idx_ = 0
-
-            if len(pair_indices) != 0:
-                # Retrain interactions for base models
-
-                staged_fit_args_iter = (
-                    (estimators[i], X, y, w, X_pair, pair_indices) for i in range(self.outer_bags)
-                )
-
-                estimators = provider.parallel(BaseCoreEBM.staged_fit_interactions_parallel, staged_fit_args_iter)
-        elif isinstance(self.interactions, int) and self.interactions == 0:
-            pair_indices = []
-        elif isinstance(self.interactions, list):
-            pair_indices = self.interactions
-            if len(pair_indices) != 0:
-                # Check and remove duplicate interaction terms
-                existing_terms = set()
-                unique_terms = []
-
-                for i, term in enumerate(pair_indices):
-                    sorted_tuple = tuple(sorted(term))
-                    if sorted_tuple not in existing_terms:
-                        existing_terms.add(sorted_tuple)
-                        unique_terms.append(term)
-
-                # Warn the users that we have made changes to the interactions list
-                if len(unique_terms) != len(pair_indices):
-                    warn("Detected duplicate interaction terms: removing duplicate interaction terms")
-                    pair_indices = unique_terms
-                self.interactions = pair_indices
-
-                # Retrain interactions for base models
-                staged_fit_args_iter = (
-                    (estimators[i], X, y, w, X_pair, pair_indices) for i in range(self.outer_bags)
-                )
-
-                estimators = provider.parallel(BaseCoreEBM.staged_fit_interactions_parallel, staged_fit_args_iter)
-        else:  # pragma: no cover
-            raise RuntimeError("Argument 'interaction' has invalid value")
-
-        X = np.ascontiguousarray(X.T)
-        if X_pair is not None:
-            X_pair = np.ascontiguousarray(X_pair.T)  # I have no idea if we're supposed to do this.
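select_pairs_from_fast above merges the per-bag FAST rankings by keeping a running mean of each pair's rank and popping the best-ranked pairs off a heap. The same idea in a self-contained sketch (the pair tuples are illustrative):

import heapq

def merge_pair_ranks(per_bag_rankings, n_keep):
    # per_bag_rankings: one list per bag of candidate pairs, best rank first.
    mean_rank = {}
    for n, ranking in enumerate(per_bag_rankings):
        for rank, pair in enumerate(ranking):
            old = mean_rank.get(pair, 0)
            mean_rank[pair] = old + (rank - old) / (n + 1)  # running mean
    heap = [(r, pair) for pair, r in mean_rank.items()]
    heapq.heapify(heap)
    return [heapq.heappop(heap)[1] for _ in range(min(n_keep, len(heap)))]

bags = [[(0, 1), (2, 3), (1, 3)], [(2, 3), (0, 1), (0, 3)]]
print(merge_pair_ranks(bags, 2))  # -> [(0, 1), (2, 3)]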
-
-        if isinstance(self.mains, str) and self.mains == "all":
-            main_indices = [[x] for x in range(X.shape[0])]
-        elif isinstance(self.mains, list) and all(
-            isinstance(x, int) for x in self.mains
-        ):
-            main_indices = [[x] for x in self.mains]
-        else:  # pragma: no cover
-            msg = "Argument 'mains' has invalid value (valid values are 'all'|list): {}".format(
-                self.mains
-            )
-            raise RuntimeError(msg)
-
-        self.feature_groups_ = main_indices + pair_indices
-
-        self.bagged_models_ = estimators
-        # Merge estimators into one.
-        self.additive_terms_ = []
-        self.term_standard_deviations_ = []
-        for index, _ in enumerate(self.feature_groups_):
-            log_odds_tensors = []
-            for estimator in estimators:
-                log_odds_tensors.append(estimator.model_[index])
-
-            averaged_model = np.average(np.array(log_odds_tensors), axis=0)
-            model_errors = np.std(np.array(log_odds_tensors), axis=0)
-
-            self.additive_terms_.append(averaged_model)
-            self.term_standard_deviations_.append(model_errors)
-
-        # Get episode indexes for base estimators.
-        main_episode_idxs = []
-        inter_episode_idxs = []
-        for estimator in estimators:
-            main_episode_idxs.append(estimator.main_episode_idx_)
-            inter_episode_idxs.append(estimator.inter_episode_idx_)
-
-        self.breakpoint_iteration_ = [main_episode_idxs]
-        if len(pair_indices) != 0:
-            self.breakpoint_iteration_.append(inter_episode_idxs)
-
-        # Extract feature group names and feature group types.
-        # TODO PK v.3 don't overwrite feature_names and feature_types. Create new fields called feature_names_out and
-        # feature_types_out_ or feature_group_names_ and feature_group_types_
-        self.feature_names = []
-        self.feature_types = []
-        for index, feature_indices in enumerate(self.feature_groups_):
-            feature_group_name = EBMUtils.gen_feature_group_name(
-                feature_indices, self.preprocessor_.col_names_
-            )
-            feature_group_type = EBMUtils.gen_feature_group_type(
-                feature_indices, self.preprocessor_.col_types_
-            )
-            self.feature_types.append(feature_group_type)
-            self.feature_names.append(feature_group_name)
-
-        if n_classes <= 2:
-            if isinstance(self, (DPExplainableBoostingClassifier, DPExplainableBoostingRegressor)):
-                # DP method of centering graphs can generalize if we log pairwise densities
-                # No additional privacy loss from this step
-                # self.additive_terms_ and self.preprocessor_.col_bin_counts_ are noisy and published publicly
-                self._original_term_means_ = []
-                for set_idx in range(len(self.feature_groups_)):
-                    score_mean = np.average(self.additive_terms_[set_idx], weights=self.preprocessor_.col_bin_counts_[set_idx])
-                    self.additive_terms_[set_idx] = (
-                        self.additive_terms_[set_idx] - score_mean
-                    )
-
-                    # Add mean center adjustment back to intercept
-                    self.intercept_ += score_mean
-                    self._original_term_means_.append(score_mean)
-            else:
-                # Mean center graphs - only for binary classification and regression
-                scores_gen = EBMUtils.scores_by_feature_group(
-                    X, X_pair, self.feature_groups_, self.additive_terms_
-                )
-                self._original_term_means_ = []
-
-                for set_idx, _, scores in scores_gen:
-                    score_mean = np.average(scores, weights=w)
-
-                    self.additive_terms_[set_idx] = (
-                        self.additive_terms_[set_idx] - score_mean
-                    )
-
-                    # Add mean center adjustment back to intercept
-                    self.intercept_ += score_mean
-                    self._original_term_means_.append(score_mean)
-        else:
-            # Postprocess model graphs for multiclass
-
-            # Currently pairwise interactions are unsupported for multiclass-classification.
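The centering step above relies on a simple identity: subtracting a term's weighted mean score and folding that mean into the intercept leaves every prediction unchanged, since a prediction is the intercept plus the sum of term scores. A small sketch of that invariant:

import numpy as np

def center_term(term_scores, weights, intercept):
    shift = np.average(term_scores, weights=weights)
    return term_scores - shift, intercept + shift

scores = np.array([0.4, -0.2, 0.1])
weights = np.array([10, 5, 1])
centered, new_intercept = center_term(scores, weights, intercept=0.0)
# Per-bin score plus intercept is preserved exactly:
assert np.allclose(scores + 0.0, centered + new_intercept)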
-            binned_predict_proba = lambda x: EBMUtils.classifier_predict_proba(
-                x, None, self.feature_groups_, self.additive_terms_, self.intercept_
-            )
-
-            postprocessed = multiclass_postprocess(
-                X, self.additive_terms_, binned_predict_proba, self.feature_types
-            )
-            self.additive_terms_ = postprocessed["feature_graphs"]
-            self.intercept_ = postprocessed["intercepts"]
-
-        for feature_group_idx, feature_group in enumerate(self.feature_groups_):
-            entire_tensor = [slice(None, None, None) for i in range(self.additive_terms_[feature_group_idx].ndim)]
-            for dimension_idx, feature_idx in enumerate(feature_group):
-                if self.preprocessor_.col_bin_counts_[feature_idx][0] == 0:
-                    zero_dimension = entire_tensor.copy()
-                    zero_dimension[dimension_idx] = 0
-                    self.additive_terms_[feature_group_idx][tuple(zero_dimension)] = 0
-                    self.term_standard_deviations_[feature_group_idx][tuple(zero_dimension)] = 0
-
-        # Generate overall importance
-        self.feature_importances_ = []
-        if isinstance(self, (DPExplainableBoostingClassifier, DPExplainableBoostingRegressor)):
-            # DP method of generating feature importances can generalize to non-dp if preprocessors start tracking joint distributions
-            for i in range(len(self.feature_groups_)):
-                mean_abs_score = np.average(np.abs(self.additive_terms_[i]), weights=self.preprocessor_.col_bin_counts_[i])
-                self.feature_importances_.append(mean_abs_score)
-        else:
-            scores_gen = EBMUtils.scores_by_feature_group(
-                X, X_pair, self.feature_groups_, self.additive_terms_
-            )
-            for set_idx, _, scores in scores_gen:
-                mean_abs_score = np.mean(np.abs(scores))
-                self.feature_importances_.append(mean_abs_score)
-
-        # Generate selector
-        # TODO PK v.3 shouldn't this be self._global_selector_ ??
-        self.global_selector = gen_global_selector(
-            X_orig, self.feature_names, self.feature_types, None
-        )
-
-        self.has_fitted_ = True
-        return self
-
-    # Select pairs from base models
-    def _merged_pair_score_fn(self, model_type, X, y, X_pair, feature_groups, model, intercept):
-        if model_type == "classification":
-            prob = EBMUtils.classifier_predict_proba(
-                X, X_pair, feature_groups, model, intercept
-            )
-            return (
-                0 if len(y) == 0 else log_loss(y, prob)
-            )  # use logloss to conform consistently and for multiclass
-        elif model_type == "regression":
-            pred = EBMUtils.regressor_predict(
-                X, X_pair, feature_groups, model, intercept
-            )
-            return 0 if len(y) == 0 else mean_squared_error(y, pred)
-        else:  # pragma: no cover
-            msg = "Unknown model_type: '{}'.".format(model_type)
-            raise ValueError(msg)
-
-    def decision_function(self, X):
-        """ Predict scores from model before calling the link function.
-
-        Args:
-            X: Numpy array for samples.
-
-        Returns:
-            The sum of the additive term contributions.
-        """
-        check_is_fitted(self, "has_fitted_")
-        X_orig, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types, missing_data_allowed=True)
-        X = self.preprocessor_.transform(X_orig)
-        X = np.ascontiguousarray(X.T)
-
-        if self.interactions != 0:
-            X_pair = self.pair_preprocessor_.transform(X_orig)
-            X_pair = np.ascontiguousarray(X_pair.T)
-        else:
-            X_pair = None
-
-        decision_scores = EBMUtils.decision_function(
-            X, X_pair, self.feature_groups_, self.additive_terms_, self.intercept_
-        )
-
-        return decision_scores
-
-    def explain_global(self, name=None):
-        """ Provides global explanation for model.
-
-        Args:
-            name: User-defined explanation name.
-
-        Returns:
-            An explanation object,
-            visualizing feature-value pairs as horizontal bar chart.
- """ - if name is None: - name = gen_name_from_class(self) - - check_is_fitted(self, "has_fitted_") - - # Obtain min/max for model scores - lower_bound = np.inf - upper_bound = -np.inf - for feature_group_index, _ in enumerate(self.feature_groups_): - errors = self.term_standard_deviations_[feature_group_index] - scores = self.additive_terms_[feature_group_index] - - lower_bound = min(lower_bound, np.min(scores - errors)) - upper_bound = max(upper_bound, np.max(scores + errors)) - - bounds = (lower_bound, upper_bound) - - # Add per feature graph - data_dicts = [] - feature_list = [] - density_list = [] - for feature_group_index, feature_indexes in enumerate( - self.feature_groups_ - ): - model_graph = self.additive_terms_[feature_group_index] - - # NOTE: This uses stddev. for bounds, consider issue warnings. - errors = self.term_standard_deviations_[feature_group_index] - - if len(feature_indexes) == 1: - # hack. remove the 0th index which is for missing values - model_graph = model_graph[1:] - errors = errors[1:] - - - bin_labels = self.preprocessor_._get_bin_labels(feature_indexes[0]) - # bin_counts = self.preprocessor_.get_bin_counts( - # feature_indexes[0] - # ) - scores = list(model_graph) - upper_bounds = list(model_graph + errors) - lower_bounds = list(model_graph - errors) - density_dict = { - "names": self.preprocessor_._get_hist_edges(feature_indexes[0]), - "scores": self.preprocessor_._get_hist_counts(feature_indexes[0]), - } - - feature_dict = { - "type": "univariate", - "names": bin_labels, - "scores": scores, - "scores_range": bounds, - "upper_bounds": upper_bounds, - "lower_bounds": lower_bounds, - } - feature_list.append(feature_dict) - density_list.append(density_dict) - - data_dict = { - "type": "univariate", - "names": bin_labels, - "scores": model_graph, - "scores_range": bounds, - "upper_bounds": model_graph + errors, - "lower_bounds": model_graph - errors, - "density": { - "names": self.preprocessor_._get_hist_edges(feature_indexes[0]), - "scores": self.preprocessor_._get_hist_counts( - feature_indexes[0] - ), - }, - } - if is_classifier(self): - data_dict["meta"] = { - "label_names": self.classes_.tolist() # Classes should be numpy array, convert to list. - } - - data_dicts.append(data_dict) - elif len(feature_indexes) == 2: - # hack. remove the 0th index which is for missing values - model_graph = model_graph[1:, 1:] - # errors = errors[1:, 1:] # NOTE: This is commented as it's not used in this branch. 
-
-
-                bin_labels_left = self.pair_preprocessor_._get_bin_labels(feature_indexes[0])
-                bin_labels_right = self.pair_preprocessor_._get_bin_labels(feature_indexes[1])
-
-                feature_dict = {
-                    "type": "interaction",
-                    "left_names": bin_labels_left,
-                    "right_names": bin_labels_right,
-                    "scores": model_graph,
-                    "scores_range": bounds,
-                }
-                feature_list.append(feature_dict)
-                density_list.append({})
-
-                data_dict = {
-                    "type": "interaction",
-                    "left_names": bin_labels_left,
-                    "right_names": bin_labels_right,
-                    "scores": model_graph,
-                    "scores_range": bounds,
-                }
-                data_dicts.append(data_dict)
-            else:  # pragma: no cover
-                raise Exception("Interactions greater than 2 not supported.")
-
-        overall_dict = {
-            "type": "univariate",
-            "names": self.feature_names,
-            "scores": self.feature_importances_,
-        }
-        internal_obj = {
-            "overall": overall_dict,
-            "specific": data_dicts,
-            "mli": [
-                {
-                    "explanation_type": "ebm_global",
-                    "value": {"feature_list": feature_list},
-                },
-                {"explanation_type": "density", "value": {"density": density_list}},
-            ],
-        }
-
-        return EBMExplanation(
-            "global",
-            internal_obj,
-            feature_names=self.feature_names,
-            feature_types=self.feature_types,
-            name=name,
-            selector=self.global_selector,
-        )
-
-    def explain_local(self, X, y=None, name=None):
-        """ Provides local explanations for provided samples.
-
-        Args:
-            X: Numpy array for X to explain.
-            y: Numpy vector for y to explain.
-            name: User-defined explanation name.
-
-        Returns:
-            An explanation object, visualizing feature-value pairs
-            for each sample as horizontal bar charts.
-        """
-
-        # Produce feature value pairs for each sample.
-        # Values are the model graph score per respective feature group.
-        if name is None:
-            name = gen_name_from_class(self)
-
-        check_is_fitted(self, "has_fitted_")
-
-        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types, missing_data_allowed=True)
-
-        # Transform y if classifier
-        if is_classifier(self) and y is not None:
-            y = np.array([self._class_idx_[el] for el in y])
-
-        samples = self.preprocessor_.transform(X)
-        samples = np.ascontiguousarray(samples.T)
-
-        if self.interactions != 0:
-            pair_samples = self.pair_preprocessor_.transform(X)
-            pair_samples = np.ascontiguousarray(pair_samples.T)
-        else:
-            pair_samples = None
-
-        scores_gen = EBMUtils.scores_by_feature_group(
-            samples, pair_samples, self.feature_groups_, self.additive_terms_
-        )
-
-        # TODO PK add a test to see if we handle X.ndim == 1 (or should we throw ValueError)
-
-        n_rows = samples.shape[1]
-        data_dicts = []
-        intercept = self.intercept_
-        if not is_classifier(self) or len(self.classes_) <= 2:
-            if isinstance(self.intercept_, np.ndarray) or isinstance(
-                self.intercept_, list
-            ):
-                intercept = intercept[0]
-
-        for _ in range(n_rows):
-            data_dict = {
-                "type": "univariate",
-                "names": [],
-                "scores": [],
-                "values": [],
-                "extra": {"names": ["Intercept"], "scores": [intercept], "values": [1]},
-            }
-            if is_classifier(self):
-                data_dict["meta"] = {
-                    "label_names": self.classes_.tolist()  # Classes should be numpy array, convert to list.
-                }
-            data_dicts.append(data_dict)
-
-        for set_idx, feature_group, scores in scores_gen:
-            for row_idx in range(n_rows):
-                feature_name = self.feature_names[set_idx]
-                data_dicts[row_idx]["names"].append(feature_name)
-                data_dicts[row_idx]["scores"].append(scores[row_idx])
-                if len(feature_group) == 1:
-                    data_dicts[row_idx]["values"].append(
-                        X[row_idx, feature_group[0]]
-                    )
-                else:
-                    data_dicts[row_idx]["values"].append("")
-
-        is_classification = is_classifier(self)
-        if is_classification:
-            scores = EBMUtils.classifier_predict_proba(
-                samples, pair_samples, self.feature_groups_, self.additive_terms_, self.intercept_,
-            )
-        else:
-            scores = EBMUtils.regressor_predict(
-                samples, pair_samples, self.feature_groups_, self.additive_terms_, self.intercept_,
-            )
-
-        perf_list = []
-        perf_dicts = gen_perf_dicts(scores, y, is_classification)
-        for row_idx in range(n_rows):
-            perf = None if perf_dicts is None else perf_dicts[row_idx]
-            perf_list.append(perf)
-            data_dicts[row_idx]["perf"] = perf
-
-        selector = gen_local_selector(data_dicts, is_classification=is_classification)
-
-        additive_terms = []
-        for feature_group_index, feature_indexes in enumerate(self.feature_groups_):
-            if len(feature_indexes) == 1:
-                # hack. remove the 0th index which is for missing values
-                additive_terms.append(self.additive_terms_[feature_group_index][1:])
-            elif len(feature_indexes) == 2:
-                # hack. remove the 0th index which is for missing values
-                additive_terms.append(self.additive_terms_[feature_group_index][1:, 1:])
-            else:
-                raise ValueError("only handles 1D/2D")
-
-        internal_obj = {
-            "overall": None,
-            "specific": data_dicts,
-            "mli": [
-                {
-                    "explanation_type": "ebm_local",
-                    "value": {
-                        "scores": additive_terms,
-                        "intercept": self.intercept_,
-                        "perf": perf_list,
-                    },
-                }
-            ],
-        }
-        internal_obj["mli"].append(
-            {
-                "explanation_type": "evaluation_dataset",
-                "value": {"dataset_x": X, "dataset_y": y},
-            }
-        )
-
-        return EBMExplanation(
-            "local",
-            internal_obj,
-            feature_names=self.feature_names,
-            feature_types=self.feature_types,
-            name=name,
-            selector=selector,
-        )
-
-
-class ExplainableBoostingClassifier(BaseEBM, ClassifierMixin, ExplainerMixin):
-    """ Explainable Boosting Classifier. The arguments will change in a future release, watch the changelog. """
-
-    # TODO PK v.3 use underscores here like ClassifierMixin._estimator_type?
-    available_explanations = ["global", "local"]
-    explainer_type = "model"
-
-    """ Public facing EBM classifier."""
-
-    def __init__(
-        self,
-        # Explainer
-        feature_names=None,
-        feature_types=None,
-        # Preprocessor
-        max_bins=256,
-        max_interaction_bins=32,
-        binning="quantile",
-        # Stages
-        mains="all",
-        interactions=10,
-        # Ensemble
-        outer_bags=8,
-        inner_bags=0,
-        # Boosting
-        learning_rate=0.01,
-        validation_size=0.15,
-        early_stopping_rounds=50,
-        early_stopping_tolerance=1e-4,
-        max_rounds=5000,
-        # Trees
-        min_samples_leaf=2,
-        max_leaves=3,
-        # Overall
-        n_jobs=-2,
-        random_state=42,
-    ):
-        """ Explainable Boosting Classifier. The arguments will change in a future release, watch the changelog.
-
-        Args:
-            feature_names: List of feature names.
-            feature_types: List of feature types.
-            max_bins: Max number of bins per feature for pre-processing stage.
-            max_interaction_bins: Max number of bins per feature for pre-processing stage on interaction terms. Only used if interactions is non-zero.
-            binning: Method to bin values for pre-processing. Choose "uniform", "quantile" or "quantile_humanized".
-            mains: Features to be trained on in main effects stage. Either "all" or a list of feature indexes.
-            interactions: Interactions to be trained on.
-                Either a list of lists of feature indices, or an integer for number of automatically detected interactions.
-                Interactions are forcefully set to 0 for multiclass problems.
-            outer_bags: Number of outer bags.
-            inner_bags: Number of inner bags.
-            learning_rate: Learning rate for boosting.
-            validation_size: Validation set size for boosting.
-            early_stopping_rounds: Number of rounds of no improvement to trigger early stopping.
-            early_stopping_tolerance: Tolerance that dictates the smallest delta required to be considered an improvement.
-            max_rounds: Number of rounds for boosting.
-            min_samples_leaf: Minimum number of cases for tree splits used in boosting.
-            max_leaves: Maximum leaf nodes used in boosting.
-            n_jobs: Number of jobs to run in parallel.
-            random_state: Random state.
-        """
-        super(ExplainableBoostingClassifier, self).__init__(
-            # Explainer
-            feature_names=feature_names,
-            feature_types=feature_types,
-            # Preprocessor
-            max_bins=max_bins,
-            max_interaction_bins=max_interaction_bins,
-            binning=binning,
-            # Stages
-            mains=mains,
-            interactions=interactions,
-            # Ensemble
-            outer_bags=outer_bags,
-            inner_bags=inner_bags,
-            # Boosting
-            learning_rate=learning_rate,
-            validation_size=validation_size,
-            early_stopping_rounds=early_stopping_rounds,
-            early_stopping_tolerance=early_stopping_tolerance,
-            max_rounds=max_rounds,
-            # Trees
-            min_samples_leaf=min_samples_leaf,
-            max_leaves=max_leaves,
-            # Overall
-            n_jobs=n_jobs,
-            random_state=random_state,
-        )
-
-    # TODO: Throw ValueError like scikit for 1d instead of 2d arrays
-    def predict_proba(self, X):
-        """ Probability estimates on provided samples.
-
-        Args:
-            X: Numpy array for samples.
-
-        Returns:
-            Probability estimate of sample for each class.
-        """
-        check_is_fitted(self, "has_fitted_")
-        X_orig, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types, missing_data_allowed=True)
-        X = self.preprocessor_.transform(X_orig)
-        X = np.ascontiguousarray(X.T)
-
-        if self.interactions != 0:
-            X_pair = self.pair_preprocessor_.transform(X_orig)
-            X_pair = np.ascontiguousarray(X_pair.T)
-        else:
-            X_pair = None
-
-        # TODO PK add a test to see if we handle X.ndim == 1 (or should we throw ValueError)
-
-        prob = EBMUtils.classifier_predict_proba(
-            X, X_pair, self.feature_groups_, self.additive_terms_, self.intercept_
-        )
-        return prob
-
-    def predict(self, X):
-        """ Predicts on provided samples.
-
-        Args:
-            X: Numpy array for samples.
-
-        Returns:
-            Predicted class label per sample.
-        """
-        check_is_fitted(self, "has_fitted_")
-        X_orig, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types, missing_data_allowed=True)
-        X = self.preprocessor_.transform(X_orig)
-        X = np.ascontiguousarray(X.T)
-
-        if self.interactions != 0:
-            X_pair = self.pair_preprocessor_.transform(X_orig)
-            X_pair = np.ascontiguousarray(X_pair.T)
-        else:
-            X_pair = None
-
-        # TODO PK add a test to see if we handle X.ndim == 1 (or should we throw ValueError)
-
-        return EBMUtils.classifier_predict(
-            X,
-            X_pair,
-            self.feature_groups_,
-            self.additive_terms_,
-            self.intercept_,
-            self.classes_,
-        )
-
-    def predict_and_contrib(self, X, output='probabilities'):
-        """Predicts on provided samples, returning predictions and explanations for each sample.
-
-        Args:
-            X: Numpy array for samples.
-            output: Prediction type to output (i.e. one of 'probabilities', 'logits', 'labels')
-
-        Returns:
-            Predictions and local explanations for each sample.
-        """
-
-        allowed_outputs = ['probabilities', 'logits', 'labels']
-        if output not in allowed_outputs:
-            msg = "Argument 'output' has invalid value. Got '{}', expected one of "
-            + repr(allowed_outputs)
-            raise ValueError(msg.format(output))
-
-        check_is_fitted(self, "has_fitted_")
-        X_orig, _, _, _ = unify_data(
-            X, None, self.feature_names, self.feature_types, missing_data_allowed=True
-        )
-        X = self.preprocessor_.transform(X_orig)
-        X = np.ascontiguousarray(X.T)
-
-        if self.interactions != 0:
-            X_pair = self.pair_preprocessor_.transform(X_orig)
-            X_pair = np.ascontiguousarray(X_pair.T)
-        else:
-            X_pair = None
-
-        return EBMUtils.classifier_predict_and_contrib(
-            X,
-            X_pair,
-            self.feature_groups_,
-            self.additive_terms_,
-            self.intercept_,
-            self.classes_,
-            output)
+            'data': heatmap_data
+        }
 
 
 if __name__ == "__main__":