From 83fe768b073ee6e2e23ecc3018d8b24846dd98da Mon Sep 17 00:00:00 2001 From: Vinay Kerai <39084585+VinayKerai@users.noreply.github.com> Date: Thu, 18 Nov 2021 11:33:29 +0800 Subject: [PATCH 1/5] add option to save plots --- table_evaluator/table_evaluator.py | 54 ++++++++++++++++++++++-------- table_evaluator/viz.py | 12 +++++-- 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/table_evaluator/table_evaluator.py b/table_evaluator/table_evaluator.py index bb24964..b1155df 100644 --- a/table_evaluator/table_evaluator.py +++ b/table_evaluator/table_evaluator.py @@ -1,4 +1,5 @@ import copy +import os import warnings import pandas as pd import numpy as np @@ -86,16 +87,18 @@ def __init__(self, real: pd.DataFrame, fake: pd.DataFrame, cat_cols=None, unique self.fake.loc[:, self.numerical_columns] = self.fake.loc[:, self.numerical_columns].fillna( self.fake[self.numerical_columns].mean()) - def plot_mean_std(self): + def plot_mean_std(self, fname=None): """ Class wrapper function for plotting the mean and std using `viz.plot_mean_std`. + :param fname: If not none, saves the plot with this file name. """ - plot_mean_std(self.real, self.fake) + plot_mean_std(self.real, self.fake, fname=fname) - def plot_cumsums(self, nr_cols=4): + def plot_cumsums(self, nr_cols=4, fname=None): """ Plot the cumulative sums for all columns in the real and fake dataset. Height of each row scales with the length of the labels. Each plot contains the values of a real columns and the corresponding fake column. + :param fname: If not none, saves the plot with this file name. """ nr_charts = len(self.real.columns) nr_rows = max(1, nr_charts // nr_cols) @@ -118,12 +121,17 @@ def plot_cumsums(self, nr_cols=4): f = self.fake.iloc[:, self.real.columns.tolist().index(col)] cdf(r, f, col, 'Cumsum', ax=axes[i]) plt.tight_layout(rect=[0, 0.02, 1, 0.98]) + + if fname is not None: + plt.savefig(fname) + plt.show() - def plot_distributions(self, nr_cols=3): + def plot_distributions(self, nr_cols=3, fname=None): """ Plot the distribution plots for all columns in the real and fake dataset. Height of each row of plots scales with the length of the labels. Each plot contains the values of a real columns and the corresponding fake column. + :param fname: If not none, saves the plot with this file name. """ nr_charts = len(self.real.columns) nr_rows = max(1, nr_charts // nr_cols) @@ -170,16 +178,21 @@ def plot_distributions(self, nr_cols=3): .pipe((sns.barplot, "data"), x=x, y=y, hue=hue, ax=axes[i], saturation=0.8, palette=palette)) ax.set_xticklabels(axes[i].get_xticklabels(), rotation='vertical') plt.tight_layout(rect=[0, 0.02, 1, 0.98]) + + if fname is not None: + plt.savefig(fname) + plt.show() - def plot_correlation_difference(self, plot_diff=True, **kwargs): + def plot_correlation_difference(self, plot_diff=True, fname=None, **kwargs): """ Plot the association matrices for each table and, if chosen, the difference between them. :param plot_diff: whether to plot the difference + :param fname: If not none, saves the plot with this file name. 
:param kwargs: kwargs for sns.heatmap """ - plot_correlation_difference(self.real, self.fake, cat_cols=self.categorical_columns, plot_diff=plot_diff, + plot_correlation_difference(self.real, self.fake, cat_cols=self.categorical_columns, plot_diff=plot_diff, fname=fname, **kwargs) def correlation_distance(self, how: str = 'euclidean') -> float: @@ -212,9 +225,10 @@ def custom_cosine(a, b): fake_corr.values ) - def plot_pca(self): + def plot_pca(self, fname=None): """ Plot the first two components of a PCA of real and fake data. + :param fname: If not none, saves the plot with this file name. """ real = numerical_encoding(self.real, nominal_columns=self.categorical_columns) fake = numerical_encoding(self.fake, nominal_columns=self.categorical_columns) @@ -230,6 +244,10 @@ def plot_pca(self): sns.scatterplot(ax=ax[1], x=fake_t[:, 0], y=fake_t[:, 1]) ax[0].set_title('Real data') ax[1].set_title('Fake data') + + if fname is not None: + plt.savefig(fname) + plt.show() def get_copies(self, return_len: bool = False) -> Union[pd.DataFrame, int]: @@ -354,17 +372,25 @@ def score_estimators(self): raise Exception(f'self.target_type should be either \'class\' or \'regr\', but is {self.target_type}.') return results - def visual_evaluation(self, **kwargs): + def visual_evaluation(self, save_dir=None, **kwargs): """ Plot all visual evaluation metrics. Includes plotting the mean and standard deviation, cumulative sums, correlation differences and the PCA transform. - + :save_dir: directory path to save images :param kwargs: any kwargs for matplotlib. """ - self.plot_mean_std() - self.plot_cumsums() - self.plot_distributions() - self.plot_correlation_difference(**kwargs) - self.plot_pca() + if save_dir is None: + self.plot_mean_std() + self.plot_cumsums() + self.plot_distributions() + self.plot_correlation_difference(**kwargs) + self.plot_pca() + else: + self.plot_mean_std(fname=os.path.join(save_dir, 'mean_std.png')) + self.plot_cumsums(fname=os.path.join(save_dir, 'cumsums.png')) + self.plot_distributions(fname=os.path.join(save_dir, 'distributions.png')) + self.plot_correlation_difference(fname=os.path.join(save_dir, 'correlation_difference.png'), **kwargs) + self.plot_pca(fname=os.path.join(save_dir, 'pca.png')) + def statistical_evaluation(self) -> float: """ diff --git a/table_evaluator/viz.py b/table_evaluator/viz.py index 3a57a79..b1ff7a0 100644 --- a/table_evaluator/viz.py +++ b/table_evaluator/viz.py @@ -40,7 +40,7 @@ def plot_var_cor(x: Union[pd.DataFrame, np.ndarray], ax=None, return_values: boo return corr -def plot_correlation_difference(real: pd.DataFrame, fake: pd.DataFrame, plot_diff: bool = True, cat_cols: list = None, annot=False): +def plot_correlation_difference(real: pd.DataFrame, fake: pd.DataFrame, plot_diff: bool = True, cat_cols: list = None, annot=False, fname=None): """ Plot the association matrices for the `real` dataframe, `fake` dataframe and plot the difference between them. Has support for continuous and Categorical (Male, Female) data types. All Object and Category dtypes are considered to be Categorical columns if `dis_cols` is not passed. 
@@ -82,6 +82,10 @@ def plot_correlation_difference(real: pd.DataFrame, fake: pd.DataFrame, plot_dif title_font = {'size': '18'} ax[i].set_title(label, **title_font) plt.tight_layout() + + if fname is not None: + plt.savefig(fname) + plt.show() @@ -183,13 +187,14 @@ def plot_mean_std_comparison(evaluators: List): plt.tight_layout() -def plot_mean_std(real: pd.DataFrame, fake: pd.DataFrame, ax=None): +def plot_mean_std(real: pd.DataFrame, fake: pd.DataFrame, ax=None, fname=None): """ Plot the means and standard deviations of each dataset. :param real: DataFrame containing the real data :param fake: DataFrame containing the fake data :param ax: Axis to plot on. If none, a new figure is made. + :param fname: If not none, saves the plot with this file name. """ if ax is None: fig, ax = plt.subplots(1, 2, figsize=(10, 5)) @@ -225,5 +230,8 @@ def plot_mean_std(real: pd.DataFrame, fake: pd.DataFrame, ax=None): ax[1].set_xlabel('real data std (log)') ax[1].set_ylabel('fake data std (log)') + if fname is not None: + plt.savefig(fname) + if ax is None: plt.show() From d6c38e13096dfa62e6753e5c5216785378e3994f Mon Sep 17 00:00:00 2001 From: Vinay Kerai <39084585+VinayKerai@users.noreply.github.com> Date: Thu, 18 Nov 2021 11:49:45 +0800 Subject: [PATCH 2/5] create save directory if does not exist --- table_evaluator/table_evaluator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/table_evaluator/table_evaluator.py b/table_evaluator/table_evaluator.py index b1155df..0ff1046 100644 --- a/table_evaluator/table_evaluator.py +++ b/table_evaluator/table_evaluator.py @@ -385,6 +385,9 @@ def visual_evaluation(self, save_dir=None, **kwargs): self.plot_correlation_difference(**kwargs) self.plot_pca() else: + if not os.path.isdir(save_dir): + os.makedirs(save_dir) + self.plot_mean_std(fname=os.path.join(save_dir, 'mean_std.png')) self.plot_cumsums(fname=os.path.join(save_dir, 'cumsums.png')) self.plot_distributions(fname=os.path.join(save_dir, 'distributions.png')) From f941cb5ff5cad4c388ed3be4fb8f291fd68a689c Mon Sep 17 00:00:00 2001 From: Vinay Kerai <39084585+VinayKerai@users.noreply.github.com> Date: Tue, 30 Nov 2021 17:18:11 +0800 Subject: [PATCH 3/5] use pathlib library instead of os library --- table_evaluator/table_evaluator.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/table_evaluator/table_evaluator.py b/table_evaluator/table_evaluator.py index 0ff1046..3334d73 100644 --- a/table_evaluator/table_evaluator.py +++ b/table_evaluator/table_evaluator.py @@ -5,6 +5,7 @@ import numpy as np import seaborn as sns import matplotlib.pyplot as plt +from pathlib import Path from tqdm import tqdm from scipy import stats from typing import Tuple, Dict, Union @@ -385,14 +386,14 @@ def visual_evaluation(self, save_dir=None, **kwargs): self.plot_correlation_difference(**kwargs) self.plot_pca() else: - if not os.path.isdir(save_dir): - os.makedirs(save_dir) - - self.plot_mean_std(fname=os.path.join(save_dir, 'mean_std.png')) - self.plot_cumsums(fname=os.path.join(save_dir, 'cumsums.png')) - self.plot_distributions(fname=os.path.join(save_dir, 'distributions.png')) - self.plot_correlation_difference(fname=os.path.join(save_dir, 'correlation_difference.png'), **kwargs) - self.plot_pca(fname=os.path.join(save_dir, 'pca.png')) + save_dir = Path(save_dir) + save_dir.mkdir(parents=True, exist_ok=True) + + self.plot_mean_std(fname=save_dir/'mean_std.png') + self.plot_cumsums(fname=save_dir/'cumsums.png') + self.plot_distributions(fname=save_dir/'distributions.png') + 
self.plot_correlation_difference(fname=save_dir/'correlation_difference.png', **kwargs) + self.plot_pca(fname=save_dir/'pca.png') def statistical_evaluation(self) -> float: From fd13704b25712635d267d39f7c2763f55a4519b0 Mon Sep 17 00:00:00 2001 From: Bauke Brenninkmeijer Date: Fri, 3 Dec 2021 21:27:06 +0100 Subject: [PATCH 4/5] add option to return outputs as a dict. Improve .evaluate function a little bit, but requires more work --- README.md | 13 +++-- example_table_evaluator.ipynb | 94 +++++++++++++++--------------- table_evaluator/table_evaluator.py | 56 +++++++++++------- 3 files changed, 89 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index 43222a6..f921dbb 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,7 @@ [![Supported versions](https://img.shields.io/pypi/pyversions/table_evaluator.svg)](https://pypi.python.org/pypi/table_evaluator) ![Package deployment](https://github.com/Baukebrenninkmeijer/table-evaluator/actions/workflows/python-publish.yml/badge.svg?branch=master) [![PyPI - Downloads](https://img.shields.io/pypi/dm/table_evaluator)](https://pypistats.org/packages/table_evaluator) - -[Official documentation](https://baukebrenninkmeijer.github.io/table-evaluator/) +[![Documentation](https://img.shields.io/badge/Documentation-%20-blue)](https://baukebrenninkmeijer.github.io/table-evaluator/) TableEvaluator is a library to evaluate how similar a synthesized dataset is to a real data. In other words, it tries to give an indication into how real your fake data is. With the rise of GANs, specifically designed for tabular data, many applications are becoming possibilities. For industries like finance, healthcare and goverments, having the capacity to create high quality synthetic data that does **not** have the privacy constraints of normal data is extremely valuable. Since this field is this quite young and developing, I created this library to have a consistent evaluation method for your models. @@ -19,9 +18,15 @@ The test can be run by cloning the repo and running: ``` pytest tests ``` +if this does not work, the package might not currently be findable. In that case, please install it locally with: + +``` +pip install -e . +``` ## Usage -**Please see the example notebook for the most up-to-date examples. The README example is just that notebook, but sometimes a bit outdated.** +**Please see the [example notebook](https://github.com/Baukebrenninkmeijer/table-evaluator/blob/master/example_table_evaluator.ipynb) for the most up-to-date examples. The README example is just that notebook as markdown.** + Start by importing the class ```Python from table_evaluator import load_data, TableEvaluator @@ -142,6 +147,6 @@ table_evaluator.evaluate(target_col='trans_type') Please see the full documentation on [https://baukebrenninkmeijer.github.io/table-evaluator/](https://baukebrenninkmeijer.github.io/table-evaluator/). ## Motivation -To see the motivation for my decisions, please have a look at my master thesis, found at [https://www.ru.nl/publish/pages/769526/z04_master_thesis_brenninkmeijer.pdf](https://www.ru.nl/publish/pages/769526/z04_master_thesis_brenninkmeijer.pdf) +To see the motivation for my decisions, please have a look at my master thesis, found at the [Radboud University](https://www.ru.nl/publish/pages/769526/z04_master_thesis_brenninkmeijer.pdf) If you have any tips or suggestions, please contact send me on email. 
diff --git a/example_table_evaluator.ipynb b/example_table_evaluator.ipynb index 10c1f33..ac33141 100644 --- a/example_table_evaluator.ipynb +++ b/example_table_evaluator.ipynb @@ -10,9 +10,18 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -20,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -29,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -38,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -148,7 +157,7 @@ "4 WITHDRAWAL_IN_CASH UNKNOWN 654 " ] }, - "execution_count": 4, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -159,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -269,7 +278,7 @@ "4 REMITTANCE_TO_OTHER_BANK HOUSEHOLD 1211 " ] }, - "execution_count": 5, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -280,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -289,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -305,49 +314,38 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 24, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Classifier F1-scores and their Jaccard similarities::\n", - " f1_real f1_fake jaccard_similarity\n", - "index \n", - "LogisticRegression_real_testset 0.7800 0.7750 0.9704\n", - "LogisticRegression_fake_testset 0.7550 0.7450 0.9048\n", - "RandomForestClassifier_real_testset 0.9850 0.9850 1.0000\n", - "RandomForestClassifier_fake_testset 0.9650 0.9650 1.0000\n", - "DecisionTreeClassifier_real_testset 0.9800 0.9650 0.9512\n", - "DecisionTreeClassifier_fake_testset 0.9600 0.9150 0.9139\n", - "MLPClassifier_real_testset 0.4000 0.5000 0.5326\n", - "MLPClassifier_fake_testset 0.4300 0.5450 0.4925\n", - "\n", - "Privacy results:\n", - " result\n", - "Duplicate rows between sets (real/fake) (0, 0)\n", - "nearest neighbor mean 0.5655\n", - "nearest neighbor std 0.3726\n", - "\n", - "Miscellaneous results:\n", - " Result\n", - "Column Correlation Distance RMSE 0.0399\n", - "Column Correlation distance MAE 0.0296\n", - "\n", - "Results:\n", - " result\n", - "Basic statistics 0.9940\n", - "Correlation column correlations 0.9904\n", - "Mean Correlation between fake and real columns 0.9566\n", - "1 - MAPE Estimator results 0.9251\n", - "Similarity Score 0.9665\n" - ] + "data": { + "text/html": [ + "
Synthetic Data Report
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5674247a319f428a96b21a6d4b2dc626", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Tab(children=(VBox(children=(Output(),)), VBox(children=(Output(),)), VBox(children=(Output(),)), VBox(childreā€¦" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "evaluator.evaluate(target_col='trans_type')" + "evaluator.evaluate(target_col='trans_type', notebook=True)" ] }, { @@ -437,7 +435,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.7.3" } }, "nbformat": 4, diff --git a/table_evaluator/table_evaluator.py b/table_evaluator/table_evaluator.py index 89821b1..1674657 100644 --- a/table_evaluator/table_evaluator.py +++ b/table_evaluator/table_evaluator.py @@ -565,7 +565,7 @@ def column_correlations(self): return column_correlations(self.real, self.fake, self.categorical_columns) def evaluate(self, target_col: str, target_type: str = 'class', metric: str = None, verbose: bool = None, - n_samples_distance: int = 20000, notebook: bool = False) -> Dict: + n_samples_distance: int = 20000, notebook: bool = False, return_outputs: bool = False) -> Dict: """ Determine correlation between attributes from the real and fake dataset using a given metric. All metrics from scipy.stats are available. @@ -583,8 +583,6 @@ def evaluate(self, target_col: str, target_type: str = 'class', metric: str = No warnings.filterwarnings(action='ignore', category=ConvergenceWarning) pd.options.display.float_format = '{:,.4f}'.format - # print(f'\nCorrelation metric: {self.comparison_metric.__name__}') - basic_statistical = self.basic_statistical_evaluation() correlation_correlation = self.correlation_correlation() column_correlation = self.column_correlations() @@ -598,42 +596,29 @@ def evaluate(self, target_col: str, target_type: str = 'class', metric: str = No miscellaneous = pd.DataFrame({'Result': list(miscellaneous_dict.values())}, index=list(miscellaneous_dict.keys())) - # print(f'\nMiscellaneous results:') privacy_metrics_dict = { 'Duplicate rows between sets (real/fake)': self.get_duplicates(), 'nearest neighbor mean': nearest_neighbor[0], 'nearest neighbor std': nearest_neighbor[1], } + privacy_report = EvaluationResult( name='Privacy Results', content=dict_to_df(privacy_metrics_dict), ) - all_results_dict = { - 'Basic statistics': basic_statistical, - 'Correlation column correlations': correlation_correlation, - 'Mean Correlation between fake and real columns': column_correlation, - f'{"1 - MAPE Estimator results" if self.target_type == "class" else "Correlation RMSE"}': estimators, - } - similarity_score = np.mean(list(all_results_dict.values())) - all_results_dict['Similarity Score'] = similarity_score - # all_results = pd.DataFrame({'Result': list(all_results_dict.values())}, index=list(all_results_dict.keys())) - all_results = EvaluationResult( - name='Overview Results', - content=dict_to_df(all_results_dict) - ) + privacy_tab = [privacy_report] + efficacy_title = 'Classifier F1-scores and their Jaccard similarities:' if self.target_type == 'class' \ else '\nRegressor MSE-scores' - overview_tab = [all_results, ] ml_efficacy_tab = [ - EvaluationResult(name='ML Efficacy', content=self.estimators_scores) + EvaluationResult(name=efficacy_title, content=self.estimators_scores) ] - privacy_tab = [privacy_report] js_df = 
js_distance_df(self.real, self.fake, self.numerical_columns) @@ -645,6 +630,34 @@ def evaluate(self, target_col: str, target_type: str = 'class', metric: str = No ) ] + + all_results_dict = { + 'Basic statistics': basic_statistical, + 'Correlation column correlations': correlation_correlation, + 'Mean Correlation between fake and real columns': column_correlation, + f'{"1 - MAPE Estimator results" if self.target_type == "class" else "Correlation RMSE"}': estimators, + } + all_results_dict['Similarity Score'] = np.mean(list(all_results_dict.values())) + + summary = EvaluationResult( + name='Overview Results', + content=dict_to_df(all_results_dict) + ) + + overview_tab = [summary, ] + + if return_outputs: + all_results = [ + *overview_tab, + *ml_efficacy_tab, + *privacy_tab, + *statistical_tab, + ] + + all_results = {x.name: x.content.to_dict(orient='index') for x in all_results} + + return all_results + if notebook: visualize_notebook( self, @@ -665,5 +678,4 @@ def evaluate(self, target_col: str, target_type: str = 'class', metric: str = No print(miscellaneous.to_string()) print(f'\nResults:') - print(all_results.content.to_string()) - # return all_results + print(summary.content.to_string()) From 59e5095d2e0706c07e6cde4ba3bfa3e9490e1a9f Mon Sep 17 00:00:00 2001 From: Bauke Brenninkmeijer Date: Fri, 3 Dec 2021 21:32:09 +0100 Subject: [PATCH 5/5] add docstring for return_results --- table_evaluator/table_evaluator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/table_evaluator/table_evaluator.py b/table_evaluator/table_evaluator.py index cbcf78a..02ef98a 100644 --- a/table_evaluator/table_evaluator.py +++ b/table_evaluator/table_evaluator.py @@ -606,6 +606,7 @@ def evaluate(self, target_col: str, target_type: str = 'class', metric: str = No By default Pearson's r is used. Alternatives include Spearman rho (scipy.stats.spearmanr) or Kendall Tau (scipy.stats.kendalltau). :param n_samples_distance: The number of samples to take for the row distance. See documentation of ``tableEvaluator.row_distance`` for details. :param verbose: whether to print verbose logging. + :param return_outputs: Will omit printing and instead return a dictionairy with all results. """ self.verbose = verbose if verbose is not None else self.verbose self.comparison_metric = metric if metric is not None else self.comparison_metric
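Taken together, the five patches add two conveniences: `visual_evaluation(save_dir=...)` saves every plot to a directory (creating it with `pathlib` if it does not exist), and `evaluate(..., return_outputs=True)` returns the result tables as a dictionary instead of printing them. Below is a minimal usage sketch of the new options; the CSV paths and the `cat_cols` list are illustrative placeholders, while `target_col='trans_type'` matches the example notebook.

```python
import pandas as pd
from table_evaluator import TableEvaluator

# Illustrative inputs: any two DataFrames with matching columns work.
real = pd.read_csv('real.csv')  # hypothetical path
fake = pd.read_csv('fake.csv')  # hypothetical path

evaluator = TableEvaluator(real, fake, cat_cols=['trans_type'])  # cat_cols list is illustrative

# Patches 1-3: write every visual-evaluation plot into save_dir
# (the directory is created with pathlib if it does not exist).
evaluator.visual_evaluation(save_dir='plots')

# Patches 4-5: get the evaluation tables back as a dict instead of printed output.
results = evaluator.evaluate(target_col='trans_type', return_outputs=True)
print(results['Overview Results'])
```

With `return_outputs=True` the returned dictionary maps each result name (e.g. 'Overview Results', 'Privacy Results') to its table serialized via `DataFrame.to_dict(orient='index')`, which makes the output easy to log or persist programmatically.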