diff --git a/README.md b/README.md
index 43222a6..f921dbb 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,7 @@
 [![Supported versions](https://img.shields.io/pypi/pyversions/table_evaluator.svg)](https://pypi.python.org/pypi/table_evaluator)
 ![Package deployment](https://github.com/Baukebrenninkmeijer/table-evaluator/actions/workflows/python-publish.yml/badge.svg?branch=master)
 [![PyPI - Downloads](https://img.shields.io/pypi/dm/table_evaluator)](https://pypistats.org/packages/table_evaluator)
-
-[Official documentation](https://baukebrenninkmeijer.github.io/table-evaluator/)
+[![Documentation](https://img.shields.io/badge/Documentation-%20-blue)](https://baukebrenninkmeijer.github.io/table-evaluator/)
 
 TableEvaluator is a library to evaluate how similar a synthesized dataset is to real data. In other words, it tries to give an indication of how real your fake data is. With the rise of GANs specifically designed for tabular data, many applications are becoming possible. For industries like finance, healthcare and governments, having the capacity to create high-quality synthetic data that does **not** have the privacy constraints of normal data is extremely valuable. Since this field is still quite young and developing, I created this library to provide a consistent evaluation method for your models.
@@ -19,9 +18,15 @@
 The tests can be run by cloning the repo and running:
 ```
 pytest tests
 ```
+If this does not work, the package might not currently be findable. In that case, please install it locally with:
+
+```
+pip install -e .
+```
 
 ## Usage
-**Please see the example notebook for the most up-to-date examples. The README example is just that notebook, but sometimes a bit outdated.**
+**Please see the [example notebook](https://github.com/Baukebrenninkmeijer/table-evaluator/blob/master/example_table_evaluator.ipynb) for the most up-to-date examples. The README example is just that notebook as markdown.**
+
 Start by importing the class
 ```Python
 from table_evaluator import load_data, TableEvaluator
 ```
@@ -142,6 +147,6 @@ table_evaluator.evaluate(target_col='trans_type')
 Please see the full documentation on [https://baukebrenninkmeijer.github.io/table-evaluator/](https://baukebrenninkmeijer.github.io/table-evaluator/).
 
 ## Motivation
-To see the motivation for my decisions, please have a look at my master thesis, found at [https://www.ru.nl/publish/pages/769526/z04_master_thesis_brenninkmeijer.pdf](https://www.ru.nl/publish/pages/769526/z04_master_thesis_brenninkmeijer.pdf)
+To see the motivation for my decisions, please have a look at my master thesis, found at [Radboud University](https://www.ru.nl/publish/pages/769526/z04_master_thesis_brenninkmeijer.pdf).
 
 If you have any tips or suggestions, please send me an email.
diff --git a/example_table_evaluator.ipynb b/example_table_evaluator.ipynb
index 10c1f33..ac33141 100644
--- a/example_table_evaluator.ipynb
+++ b/example_table_evaluator.ipynb
@@ -10,9 +10,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 7,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The autoreload extension is already loaded. To reload it, use:\n",
+      "  %reload_ext autoreload\n"
+     ]
+    }
+   ],
    "source": [
     "%load_ext autoreload\n",
     "%autoreload 2"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -29,7 +38,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -38,7 +47,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -148,7 +157,7 @@
       "4      WITHDRAWAL_IN_CASH       UNKNOWN     654 "
      ]
     },
-    "execution_count": 4,
+    "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -159,7 +168,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -269,7 +278,7 @@
       "4      REMITTANCE_TO_OTHER_BANK     HOUSEHOLD    1211 "
      ]
     },
-    "execution_count": 5,
+    "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -280,7 +289,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -289,7 +298,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -305,49 +314,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\n",
-      "Classifier F1-scores and their Jaccard similarities::\n",
-      "                                      f1_real  f1_fake  jaccard_similarity\n",
-      "index                                                                     \n",
-      "LogisticRegression_real_testset        0.7800   0.7750              0.9704\n",
-      "LogisticRegression_fake_testset        0.7550   0.7450              0.9048\n",
-      "RandomForestClassifier_real_testset    0.9850   0.9850              1.0000\n",
-      "RandomForestClassifier_fake_testset    0.9650   0.9650              1.0000\n",
-      "DecisionTreeClassifier_real_testset    0.9800   0.9650              0.9512\n",
-      "DecisionTreeClassifier_fake_testset    0.9600   0.9150              0.9139\n",
-      "MLPClassifier_real_testset             0.4000   0.5000              0.5326\n",
-      "MLPClassifier_fake_testset             0.4300   0.5450              0.4925\n",
-      "\n",
-      "Privacy results:\n",
-      "                                          result\n",
-      "Duplicate rows between sets (real/fake)  (0, 0)\n",
-      "nearest neighbor mean                    0.5655\n",
-      "nearest neighbor std                     0.3726\n",
-      "\n",
-      "Miscellaneous results:\n",
-      "                                   Result\n",
-      "Column Correlation Distance RMSE   0.0399\n",
-      "Column Correlation distance MAE    0.0296\n",
-      "\n",
-      "Results:\n",
-      "                                                result\n",
-      "Basic statistics                                0.9940\n",
-      "Correlation column correlations                 0.9904\n",
-      "Mean Correlation between fake and real columns  0.9566\n",
-      "1 - MAPE Estimator results                      0.9251\n",
-      "Similarity Score                                0.9665\n"
-     ]
+     "data": {
+      "text/html": [
+       "<h1>Synthetic Data Report</h1>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5674247a319f428a96b21a6d4b2dc626",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Tab(children=(VBox(children=(Output(),)), VBox(children=(Output(),)), VBox(children=(Output(),)), VBox(childre…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     }
    ],
    "source": [
-    "evaluator.evaluate(target_col='trans_type')"
+    "evaluator.evaluate(target_col='trans_type', notebook=True)"
    ]
   },
   {
@@ -437,7 +435,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.7.3"
   }
  },
 "nbformat": 4,
diff --git a/table_evaluator/table_evaluator.py b/table_evaluator/table_evaluator.py
index 8bd4880..a24c617 100644
--- a/table_evaluator/table_evaluator.py
+++ b/table_evaluator/table_evaluator.py
@@ -1,9 +1,11 @@
 import copy
+import os
 import warnings
 import pandas as pd
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
+from pathlib import Path
 from tqdm import tqdm
 from scipy import stats
 from typing import Tuple, Dict, Union
@@ -87,16 +89,18 @@ def __init__(self, real: pd.DataFrame, fake: pd.DataFrame, cat_cols=None, unique
         self.fake.loc[:, self.numerical_columns] = self.fake.loc[:, self.numerical_columns].fillna(
             self.fake[self.numerical_columns].mean())
 
-    def plot_mean_std(self):
+    def plot_mean_std(self, fname=None):
         """
         Class wrapper function for plotting the mean and std using `viz.plot_mean_std`.
+        :param fname: If not None, saves the plot with this file name.
         """
-        plot_mean_std(self.real, self.fake)
+        plot_mean_std(self.real, self.fake, fname=fname)
 
-    def plot_cumsums(self, nr_cols=4):
+    def plot_cumsums(self, nr_cols=4, fname=None):
         """
         Plot the cumulative sums for all columns in the real and fake dataset. Height of each row
         scales with the length of the labels. Each plot contains the values of a real column and
         the corresponding fake column.
+        :param fname: If not None, saves the plot with this file name.
         """
         nr_charts = len(self.real.columns)
         nr_rows = max(1, nr_charts // nr_cols)
@@ -119,12 +123,17 @@
             f = self.fake.iloc[:, self.real.columns.tolist().index(col)]
             cdf(r, f, col, 'Cumsum', ax=axes[i])
         plt.tight_layout(rect=[0, 0.02, 1, 0.98])
+
+        if fname is not None:
+            plt.savefig(fname)
+
         plt.show()
 
-    def plot_distributions(self, nr_cols=3):
+    def plot_distributions(self, nr_cols=3, fname=None):
         """
         Plot the distribution plots for all columns in the real and fake dataset. Height of each
         row of plots scales with the length of the labels. Each plot contains the values of a real
         column and the corresponding fake column.
+        :param fname: If not None, saves the plot with this file name.
         """
         nr_charts = len(self.real.columns)
         nr_rows = max(1, nr_charts // nr_cols)
@@ -165,16 +174,21 @@
                  .pipe((sns.barplot, "data"), x=x, y=y, hue=hue, ax=axes[i], saturation=0.8, palette=palette))
             ax.set_xticklabels(axes[i].get_xticklabels(), rotation='vertical')
         plt.tight_layout(rect=[0, 0.02, 1, 0.98])
+
+        if fname is not None:
+            plt.savefig(fname)
+
         plt.show()
 
-    def plot_correlation_difference(self, plot_diff=True, **kwargs):
+    def plot_correlation_difference(self, plot_diff=True, fname=None, **kwargs):
         """
         Plot the association matrices for each table and, if chosen, the difference between them.
 
         :param plot_diff: whether to plot the difference
+        :param fname: If not None, saves the plot with this file name.
         :param kwargs: kwargs for sns.heatmap
         """
-        plot_correlation_difference(self.real, self.fake, cat_cols=self.categorical_columns, plot_diff=plot_diff,
+        plot_correlation_difference(self.real, self.fake, cat_cols=self.categorical_columns, plot_diff=plot_diff, fname=fname,
                                     **kwargs)
 
     def correlation_distance(self, how: str = 'euclidean') -> float:
@@ -207,9 +221,10 @@ def custom_cosine(a, b):
                 fake_corr.values
             )
 
-    def plot_pca(self):
+    def plot_pca(self, fname=None):
         """
         Plot the first two components of a PCA of real and fake data.
+        :param fname: If not None, saves the plot with this file name.
         """
         real, fake = self.convert_numerical()
 
@@ -225,6 +240,10 @@
         sns.scatterplot(ax=ax[1], x=fake_t[:, 0], y=fake_t[:, 1])
         ax[0].set_title('Real data')
         ax[1].set_title('Fake data')
+
+        if fname is not None:
+            plt.savefig(fname)
+
         plt.show()
 
     def get_copies(self, return_len: bool = False) -> Union[pd.DataFrame, int]:
@@ -345,17 +364,28 @@ def score_estimators(self):
             raise Exception(f'self.target_type should be either \'class\' or \'regr\', but is {self.target_type}.')
         return results
 
-    def visual_evaluation(self, **kwargs):
+    def visual_evaluation(self, save_dir=None, **kwargs):
         """
         Plot all visual evaluation metrics. Includes plotting the mean and standard deviation, cumulative sums,
         correlation differences and the PCA transform.
-
+        :param save_dir: Directory path to save the images to.
         :param kwargs: any kwargs for matplotlib.
         """
-        self.plot_mean_std()
-        self.plot_cumsums()
-        self.plot_distributions()
-        self.plot_correlation_difference(**kwargs)
-        self.plot_pca()
+        if save_dir is None:
+            self.plot_mean_std()
+            self.plot_cumsums()
+            self.plot_distributions()
+            self.plot_correlation_difference(**kwargs)
+            self.plot_pca()
+        else:
+            save_dir = Path(save_dir)
+            save_dir.mkdir(parents=True, exist_ok=True)
+
+            self.plot_mean_std(fname=save_dir/'mean_std.png')
+            self.plot_cumsums(fname=save_dir/'cumsums.png')
+            self.plot_distributions(fname=save_dir/'distributions.png')
+            self.plot_correlation_difference(fname=save_dir/'correlation_difference.png', **kwargs)
+            self.plot_pca(fname=save_dir/'pca.png')
+
 
     def basic_statistical_evaluation(self) -> float:
         """
@@ -578,7 +608,7 @@ def column_correlations(self):
         return column_correlations(real, fake, self.categorical_columns)
 
     def evaluate(self, target_col: str, target_type: str = 'class', metric: str = None, verbose: bool = None,
-                 n_samples_distance: int = 20000, kfold: bool = False, notebook: bool = False) -> Dict:
+                 n_samples_distance: int = 20000, kfold: bool = False, notebook: bool = False, return_outputs: bool = False) -> Dict:
         """
         Determine correlation between attributes from the real and fake dataset using a given metric.
         All metrics from scipy.stats are available.
 
         :param target_col: column to use for predictions with estimators
         :param target_type: what kind of task to perform on the target_col. Can be either 'class' for classification or 'regr' for regression.
         :param metric: scoring metric for the attributes. By default Pearson's r. Alternatives include Spearman rho or Kendall Tau.
         :param n_samples_distance: The number of samples to take for the row distance. See documentation of ``tableevaluator.row_distance`` for details.
         :param kfold: Use a 5-fold CV for the ML estimators if set to True. Train/Test on 80%/20% of the data if set to False.
         :param notebook: Better visualization of the results in a Python notebook.
         :param verbose: whether to print verbose logging.
+        :param return_outputs: Omits printing and instead returns a dictionary with all results.
         """
         self.verbose = verbose if verbose is not None else self.verbose
         self.comparison_metric = metric if metric is not None else self.comparison_metric
@@ -598,8 +629,6 @@
         warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
         pd.options.display.float_format = '{:,.4f}'.format
 
-        # print(f'\nCorrelation metric: {self.comparison_metric.__name__}')
-
         basic_statistical = self.basic_statistical_evaluation()
         correlation_correlation = self.correlation_correlation()
         column_correlation = self.column_correlations()
@@ -613,42 +642,29 @@
         miscellaneous = pd.DataFrame({'Result': list(miscellaneous_dict.values())},
                                      index=list(miscellaneous_dict.keys()))
 
-        # print(f'\nMiscellaneous results:')
         privacy_metrics_dict = {
             'Duplicate rows between sets (real/fake)': self.get_duplicates(),
             'nearest neighbor mean': nearest_neighbor[0],
             'nearest neighbor std': nearest_neighbor[1],
         }
+
         privacy_report = EvaluationResult(
             name='Privacy Results',
             content=dict_to_df(privacy_metrics_dict),
         )
 
-        all_results_dict = {
-            'Basic statistics': basic_statistical,
-            'Correlation column correlations': correlation_correlation,
-            'Mean Correlation between fake and real columns': column_correlation,
-            f'{"1 - MAPE Estimator results" if self.target_type == "class" else "Correlation RMSE"}': estimators,
-        }
-        similarity_score = np.mean(list(all_results_dict.values()))
-        all_results_dict['Similarity Score'] = similarity_score
-        # all_results = pd.DataFrame({'Result': list(all_results_dict.values())}, index=list(all_results_dict.keys()))
-        all_results = EvaluationResult(
-            name='Overview Results',
-            content=dict_to_df(all_results_dict)
-        )
+        privacy_tab = [privacy_report]
+
+        efficacy_title = 'Classifier F1-scores and their Jaccard similarities:' if self.target_type == 'class' \
+            else '\nRegressor MSE-scores'
-        overview_tab = [all_results, ]
+
         ml_efficacy_tab = [
-            EvaluationResult(name='ML Efficacy', content=self.estimators_scores)
+            EvaluationResult(name=efficacy_title, content=self.estimators_scores)
         ]
-        privacy_tab = [privacy_report]
 
         js_df = js_distance_df(self.real, self.fake, self.numerical_columns)
@@ -660,6 +676,34 @@
             )
         ]
 
+        all_results_dict = {
+            'Basic statistics': basic_statistical,
+            'Correlation column correlations': correlation_correlation,
+            'Mean Correlation between fake and real columns': column_correlation,
+            f'{"1 - MAPE Estimator results" if self.target_type == "class" else "Correlation RMSE"}': estimators,
+        }
+        all_results_dict['Similarity Score'] = np.mean(list(all_results_dict.values()))
+
+        summary = EvaluationResult(
+            name='Overview Results',
+            content=dict_to_df(all_results_dict)
+        )
+
+        overview_tab = [summary, ]
+
+        if return_outputs:
+            all_results = [
+                *overview_tab,
+                *ml_efficacy_tab,
+                *privacy_tab,
+                *statistical_tab,
+            ]
+
+            all_results = {x.name: x.content.to_dict(orient='index') for x in all_results}
+
+            return all_results
+
         if notebook:
             visualize_notebook(
                 self,
@@ -680,5 +724,4 @@
             print(miscellaneous.to_string())
 
             print(f'\nResults:')
-            print(all_results.content.to_string())
-            #return all_results
+            print(summary.content.to_string())
diff --git a/table_evaluator/viz.py b/table_evaluator/viz.py
index 1fbb001..c1b0fa7 100644
--- a/table_evaluator/viz.py
+++ b/table_evaluator/viz.py
@@ -43,7 +43,7 @@ def plot_var_cor(x: Union[pd.DataFrame, np.ndarray], ax=None, return_values: boo
     return None
 
 
-def plot_correlation_difference(real: pd.DataFrame, fake: pd.DataFrame, plot_diff: bool = True, cat_cols: list = None, annot=False):
+def plot_correlation_difference(real: pd.DataFrame, fake: pd.DataFrame, plot_diff: bool = True, cat_cols: list = None, annot=False, fname=None):
     """
     Plot the association matrices for the `real` dataframe, `fake` dataframe and plot the difference between them.
     Has support for continuous and categorical (Male, Female) data types. All Object and Category dtypes are
     considered to be categorical columns if `cat_cols` is not passed.
@@ -85,6 +85,10 @@
         title_font = {'size': '18'}
         ax[i].set_title(label, **title_font)
     plt.tight_layout()
+
+    if fname is not None:
+        plt.savefig(fname)
+
     plt.show()
 
 
@@ -190,13 +194,14 @@ def plot_mean_std_comparison(evaluators: List):
     plt.tight_layout()
 
 
-def plot_mean_std(real: pd.DataFrame, fake: pd.DataFrame, ax=None):
+def plot_mean_std(real: pd.DataFrame, fake: pd.DataFrame, ax=None, fname=None):
     """
     Plot the means and standard deviations of each dataset.
 
     :param real: DataFrame containing the real data
     :param fake: DataFrame containing the fake data
     :param ax: Axis to plot on. If None, a new figure is made.
+    :param fname: If not None, saves the plot with this file name.
     """
     if ax is None:
         fig, ax = plt.subplots(1, 2, figsize=(10, 5))
@@ -232,5 +237,8 @@
     ax[1].set_xlabel('real data std (log)')
     ax[1].set_ylabel('fake data std (log)')
 
+    if fname is not None:
+        plt.savefig(fname)
+
     if ax is None:
         plt.show()
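
Usage sketch for the options introduced in this diff. This is an illustration rather than part of the patch: the CSV paths below are hypothetical placeholders, while `fname`, `save_dir`, `notebook`, and `return_outputs` are the parameters added in the changes above.

```Python
from table_evaluator import load_data, TableEvaluator

# Hypothetical file paths: point these at your own real and synthetic CSVs.
real, fake = load_data('data/real.csv', 'data/fake.csv')

evaluator = TableEvaluator(real, fake)

# Every plot_* method now accepts an optional fname to save the figure, and
# visual_evaluation(save_dir=...) saves all plots into the given directory.
evaluator.plot_correlation_difference(fname='correlation_difference.png')
evaluator.visual_evaluation(save_dir='results/plots')

# return_outputs=True suppresses printing and returns all result tables as a
# nested dict keyed by section name, e.g. 'Overview Results'.
results = evaluator.evaluate(target_col='trans_type', return_outputs=True)
print(results['Overview Results'])
```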