From be5ab69467cff9e347f01168b2917f061e02ee02 Mon Sep 17 00:00:00 2001 From: Eddie Janowicz Date: Mon, 17 Apr 2017 14:27:55 -0700 Subject: [PATCH 1/2] add relative probability influence method --- zone_model/evaluate.py | 5 +++ zone_model/utils.py | 91 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/zone_model/evaluate.py b/zone_model/evaluate.py index ede8bc1..9ec42e6 100644 --- a/zone_model/evaluate.py +++ b/zone_model/evaluate.py @@ -35,3 +35,8 @@ def correlate(observed, predicted): corr = model.score(scoring_function=correlate, aggregate=True) print(" Correlation is {}".format(corr)) + + relative_probabilities = pd.Series(model.relative_probabilities()) + print(" Variables by probability influence:") + print(relative_probabilities.sort_values(ascending=False)) + diff --git a/zone_model/utils.py b/zone_model/utils.py index ed5281c..8939c02 100644 --- a/zone_model/utils.py +++ b/zone_model/utils.py @@ -445,6 +445,97 @@ def score(self, scoring_function=accuracy_score, choosers=None, return scoring_function(observed_choices, predicted_choices) + def single_alternative_proba(self, alternative_data, choosers=None, + alternatives=None): + """ + Probability of a single alternative with user-supplied attributes + being selected. For use in diagnostic settings. + Parameters + ---------- + alternative_data : dict or pd.Series + The single alternative's attributes. A mapping between variable + name and variable value. Should contain key for each explanatory + variable in the model specification. + choosers : pandas.DataFrame, optional + DataFrame of choosers. + alternatives : pandas.DataFrame, optional + DataFrame of alternatives. + Returns + ------- + probability : float + Probability of alternative with user-supplied characteristics + being selected. 
+ """ + if choosers is None or alternatives is None: + choosers, alternatives = self.calculate_model_variables() + + alternatives_plus = alternatives.append(alternative_data, + ignore_index=True) + probabilities = self.calculate_probabilities(choosers, + alternatives_plus) + + probability = probabilities.iloc[-1] + + return probability + + def relative_probabilities(self, low_percentile=.05, high_percentile=.95, + choosers=None, alternatives=None): + """ + Indicator of explanatory variable influence. For each variable, + calculate relative variable probability contribution by holding all + other variables at their median value and having the variable of + interest take on its 5th and 95th percentile values, then calculating + the difference in resulting probabilities. + Parameters + ---------- + low_percentile : float, optional + The percentile that represents the value variable takes on in the + low end of its range. + high_percentile : float, optional + The percentile that represents the value variable takes on in the + high end of its range. + choosers : pandas.DataFrame, optional + DataFrame of choosers. + alternatives : pandas.DataFrame, optional + DataFrame of alternatives. + Returns + ------- + relative_probabilities : dict + Mapping between variable name and its contribution to + probability.
+ """ + if choosers is None or alternatives is None: + choosers, alternatives = self.calculate_model_variables() + + explanatory_variables = list(self.model_expression) + alternatives = alternatives[explanatory_variables] + + relative_probabilities = {} + for var_to_measure in explanatory_variables: + + low_percentile_value = alternatives[var_to_measure].quantile( + low_percentile) + high_percentile_value = alternatives[var_to_measure].quantile( + high_percentile) + + constant_vars = [var for var in explanatory_variables if + var != var_to_measure] + + mock_observation = alternatives[constant_vars].median() + + mock_observation[var_to_measure] = high_percentile_value + high_proba = self.single_alternative_proba(mock_observation, + choosers, alternatives) + + mock_observation[var_to_measure] = low_percentile_value + low_proba = self.single_alternative_proba(mock_observation, + choosers, alternatives) + + proba_difference = high_proba - low_proba + relative_probabilities[var_to_measure] = proba_difference + + return relative_probabilities + class SimpleEnsemble(SimulationChoiceModel): """ From aec9a991b51d88eb11a121a3d49046de4df1d0d1 Mon Sep 17 00:00:00 2001 From: Eddie Janowicz Date: Mon, 17 Apr 2017 14:50:17 -0700 Subject: [PATCH 2/2] pycodestyle formatting fixes --- zone_model/evaluate.py | 1 - zone_model/utils.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/zone_model/evaluate.py b/zone_model/evaluate.py index 9ec42e6..45f3c78 100644 --- a/zone_model/evaluate.py +++ b/zone_model/evaluate.py @@ -39,4 +39,3 @@ def correlate(observed, predicted): relative_probabilities = pd.Series(model.relative_probabilities()) print(" Variables by probability influence:") print(relative_probabilities.sort_values(ascending=False)) - diff --git a/zone_model/utils.py b/zone_model/utils.py index 8939c02..df4a11c 100644 --- a/zone_model/utils.py +++ b/zone_model/utils.py @@ -469,9 +469,9 @@ def single_alternative_proba(self, alternative_data,
choosers=None, if choosers is None or alternatives is None: choosers, alternatives = self.calculate_model_variables() - alternatives_plus = alternatives.append(alternative_data, + alternatives_plus = alternatives.append(alternative_data, ignore_index=True) - probabilities = self.calculate_probabilities(choosers, + probabilities = self.calculate_probabilities(choosers, alternatives_plus) probability = probabilities.iloc[-1] @@ -481,9 +481,9 @@ def single_alternative_proba(self, alternative_data, choosers=None, def relative_probabilities(self, low_percentile=.05, high_percentile=.95, choosers=None, alternatives=None): """ - Indicator of explanatory variable influence. For each variable, - calculate relative variable probability contribution by holding all - other variables at their median value and having the variable of + Indicator of explanatory variable influence. For each variable, + calculate relative variable probability contribution by holding all + other variables at their median value and having the variable of interest take on its 5th and 95th percentile values, then calculating the difference in resulting probabilities. Parameters @@ -501,7 +501,7 @@ def relative_probabilities(self, low_percentile=.05, high_percentile=.95, Returns ------- relative_probabilities : dict - Mapping between variable name and its contribution to + Mapping between variable name and its contribution to probability.
""" if choosers is None or alternatives is None: @@ -524,16 +524,16 @@ def relative_probabilities(self, low_percentile=.05, high_percentile=.95, mock_observation = alternatives[constant_vars].median() mock_observation[var_to_measure] = high_percentile_value - high_proba = self.single_alternative_proba(mock_observation, + high_proba = self.single_alternative_proba(mock_observation, choosers, alternatives) mock_observation[var_to_measure] = low_percentile_value - low_proba = self.single_alternative_proba(mock_observation, + low_proba = self.single_alternative_proba(mock_observation, choosers, alternatives) proba_difference = high_proba - low_proba relative_probabilities[var_to_measure] = proba_difference - + return relative_probabilities