diff --git a/README.md b/README.md
index abe31d6..9c1f225 100644
--- a/README.md
+++ b/README.md
@@ -25,11 +25,18 @@ import syndat
 real = pd.read_csv("real.csv")
 synthetic = pd.read_csv("synthetic.csv")
 
-jsd = syndat.quality.jsd(real, synthetic)
-auc = syndat.quality.auc(real, synthetic)
-norm = syndat.quality.correlation(real, synthetic)
+# How similar are the statistical distributions of real and synthetic features
+distribution_similarity_score = syndat.scores.distribution(real, synthetic)
+
+# How hard is it for a classifier to discriminate between real and synthetic data
+discrimination_score = syndat.scores.discrimination(real, synthetic)
+
+# How well are pairwise feature correlations preserved
+correlation_score = syndat.scores.correlation(real, synthetic)
 ```
+Scores range from 0 to 100, with higher scores indicating better data fidelity.
+
 
 ## Visualization
 
 Visualize real vs. synthetic data distributions and summary statistics for each feature:
diff --git a/syndat/__init__.py b/syndat/__init__.py
index 5ec6e93..a4778ef 100644
--- a/syndat/__init__.py
+++ b/syndat/__init__.py
@@ -1,3 +1,3 @@
 from syndat import domain
-from syndat import quality
-from syndat import visualization
\ No newline at end of file
+from syndat import scores
+from syndat import visualization
diff --git a/syndat/quality.py b/syndat/scores.py
similarity index 83%
rename from syndat/quality.py
rename to syndat/scores.py
index 8641487..2171640 100644
--- a/syndat/quality.py
+++ b/syndat/scores.py
@@ -13,11 +13,13 @@
 from syndat.domain import AggregationMethod
 
+logger = logging.getLogger(__name__)
+
 
 def auc(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_folds=5,
         drop_na_threshold=0.9, score: bool = True) -> float:
     """
-    Computes the Differentiation Complexity Score / ROC AUC score of a classifier trained to differentiate between real
+    Computes the Discrimination Complexity Score / ROC AUC score of a classifier trained to differentiate between real
     and synthetic data.
 
     :param real: The real data.
@@ -27,18 +29,43 @@
     :param score: Return result in a normalized score in [0,100]. Default is True.
     :return: Differentiation Complexity Score / AUC ROC Score
     """
+
     warnings.warn(
-        "old_function is deprecated and will be removed in a future version. Please use discrimination_score instead.",
+        "auc is deprecated and will be removed in a future version. Please use discrimination instead.",
         DeprecationWarning,
         stacklevel=2
     )
-    return discrimination_score(real, synthetic, n_folds=n_folds, drop_na_threshold=drop_na_threshold, score=score)
+    return discrimination(real, synthetic, n_folds=n_folds, drop_na_threshold=drop_na_threshold, score=score)
 
 
-def discrimination_score(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_folds=5,
-                         drop_na_threshold=0.9, score: bool = True) -> float:
+def jsd(real: pd.DataFrame, synthetic: pd.DataFrame, aggregate_results: bool = True,
+        aggregation_method: AggregationMethod = AggregationMethod.AVERAGE, score: bool = True,
+        n_unique_threshold=10) -> Union[List[float], float]:
     """
-    Computes the Differentiation Complexity Score / ROC AUC score of a classifier trained to differentiate between real
+    Computes the feature distribution similarity using the Jensen-Shannon distance of real and synthetic data.
+
+    :param real: The real data.
+    :param synthetic: The synthetic data.
+    :param aggregate_results: Compute a single aggregated score for all features. Default is True.
+    :param aggregation_method: How the scores are aggregated. Default is the average of all feature scores.
+    :param score: Return result in a normalized score in [0,100]. Default is True.
+    :param n_unique_threshold: Threshold to determine at which number of unique values bins will span over several
+        values.
+    :return: Distribution Similarity / JSD
+    """
+
+    warnings.warn(
+        "jsd is deprecated and will be removed in a future version. Please use distribution instead.",
+        DeprecationWarning,
+        stacklevel=2
+    )
+    return distribution(real, synthetic, aggregate_results, aggregation_method, score, n_unique_threshold)
+
+
+def discrimination(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_folds=5,
+                   drop_na_threshold=0.9, score: bool = True) -> float:
+    """
+    Computes the Discrimination Complexity Score / ROC AUC score of a classifier trained to differentiate between real
     and synthetic data.
 
     :param real: The real data.
@@ -52,10 +79,10 @@ def discrimination_score(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_
     real_filtered, synthetic_filtered = __filter_rows_with_common_categories(real, synthetic)
     # check for missing values in real data
     real_clean = real_filtered.dropna(thresh=int(drop_na_threshold * len(real_filtered)), axis=1)
-    logging.info(f'Dropped {real_clean.shape[1] - real_clean.shape[1]} '
+    logger.info(f'Dropped {real_filtered.shape[1] - real_clean.shape[1]} columns '
                 f'due to high missingness (threshold is {drop_na_threshold}).')
     real_clean = real_clean.dropna()
-    logging.info(f'Removed {len(real) - len(real_clean)} entries due to missing values.')
+    logger.info(f'Removed {len(real) - len(real_clean)} entries due to missing values.')
     # assert that both real and synthetic have same columns
     synthetic_clean = synthetic_filtered[real_clean.columns]
     # one-hot-encode categorical columns
@@ -74,9 +101,9 @@ def discrimination_score(real: pandas.DataFrame, synthetic: pandas.DataFrame, n_
     return auc_score
 
 
-def jsd(real: pd.DataFrame, synthetic: pd.DataFrame, aggregate_results: bool = True,
-        aggregation_method: AggregationMethod = AggregationMethod.AVERAGE, score: bool = True,
-        n_unique_threshold=10) -> Union[List[float], float]:
+def distribution(real: pd.DataFrame, synthetic: pd.DataFrame, aggregate_results: bool = True,
+                 aggregation_method: AggregationMethod = AggregationMethod.AVERAGE, score: bool = True,
+                 n_unique_threshold=10) -> Union[List[float], float]:
     """
     Computes the feature distribution similarity using the Jensen-Shannon distance of real and synthetic data.
 
@@ -100,7 +127,7 @@ def jsd(real: pd.DataFrame, synthetic: pd.DataFrame, aggregate_results: bool = T
             col_dtype_real = real[col].dtype
             col_dtype_synthetic = synthetic[col].dtype
             if col_dtype_real != col_dtype_synthetic:
-                logging.warning(f'Real data at col {col} is dtype {col_dtype_real} but synthetic is {col_dtype_synthetic}. '
+                logger.warning(f'Real data at col {col} is dtype {col_dtype_real} but synthetic is {col_dtype_synthetic}. '
                                f'Evaluation will be done based on the assumed data type of the real data.')
                 synthetic[col] = synthetic[col].astype(col_dtype_real)
             # categorical column
@@ -216,21 +243,15 @@ def __filter_rows_with_common_categories(real: pd.DataFrame, synthetic: pd.DataF
     synthetic_categorical_cols = synthetic.select_dtypes(include=['object', 'category']).columns
     # Identify common categorical columns
     common_categorical_cols = set(real_categorical_cols) & set(synthetic_categorical_cols)
-    if not common_categorical_cols:
-        logging.warning("No common categorical columns found. Correlation will be computed on numeric data only.")
     # Filter rows with common categories in each column
     for col in common_categorical_cols:
         real_categories = set(real[col].unique())
         synthetic_categories = set(synthetic[col].unique())
         common_categories = real_categories & synthetic_categories
         if len(real_categories - common_categories) > 0:
-            logging.warning(
-                f"Categories {real_categories - common_categories} in column '{col}' "
-                f"are in real data but not in synthetic data and will be excluded.")
-        if len(synthetic_categories - common_categories) > 0:
-            logging.warning(
-                f"Categories {synthetic_categories - common_categories} in column '{col}' "
-                f"are in synthetic data but not in real data and will be excluded.")
+            logger.warning(
+                f"Categories {real_categories - common_categories} in column '{col}' are in real data but not in "
+                f"synthetic data. They will not be considered in the score computation.")
         # Filter rows to keep only common categories
         real = real[real[col].isin(common_categories)]
         synthetic = synthetic[synthetic[col].isin(common_categories)]
diff --git a/tests/test_auc.py b/tests/test_auc.py
index 9ffd9ae..d6d6ee3 100644
--- a/tests/test_auc.py
+++ b/tests/test_auc.py
@@ -43,13 +43,13 @@ def preprocess_categorical_data(self, real_data, synthetic_data):
         return real_data_encoded, synthetic_data_encoded
 
     def test_auc_score(self):
-        auc_score = syndat.quality.auc(self.real_data, self.synthetic_data)
+        auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data)
         self.assertTrue(isinstance(auc_score, float))
         self.assertGreaterEqual(auc_score, 0.0)
         self.assertLessEqual(auc_score, 100.0)
 
     def test_auc_score_normalized(self):
-        auc_score = syndat.quality.auc(self.real_data, self.synthetic_data)
+        auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data)
         self.assertTrue(isinstance(auc_score, float))
         self.assertGreaterEqual(auc_score, 0.0)
         self.assertLessEqual(auc_score, 100.0)
@@ -57,7 +57,7 @@ def test_auc_score_normalized(self):
     def test_auc_score_with_missing_values(self):
         # Introduce missing values in real data
         self.real_data.iloc[::10, 0] = np.nan  # 10% missing data
-        auc_score = syndat.quality.auc(self.real_data, self.synthetic_data)
+        auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data)
         self.assertTrue(isinstance(auc_score, float))
         self.assertGreaterEqual(auc_score, 0.0)
         self.assertLessEqual(auc_score, 100.0)
@@ -65,19 +65,19 @@ def test_auc_score_with_missing_values(self):
     def test_auc_score_with_missing_values_drop_col(self):
         # Introduce missing values in real data
         self.real_data.iloc[::2, 0] = np.nan  # 50% missing data -> col drop
-        auc_score = syndat.quality.auc(self.real_data, self.synthetic_data)
+        auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data)
         self.assertTrue(isinstance(auc_score, float))
         self.assertGreaterEqual(auc_score, 0.0)
         self.assertLessEqual(auc_score, 100.0)
 
     def test_auc_score_with_custom_folds(self):
-        auc_score = syndat.quality.auc(self.real_data, self.synthetic_data, n_folds=5)
+        auc_score = syndat.scores.discrimination(self.real_data, self.synthetic_data, n_folds=5)
         self.assertTrue(isinstance(auc_score, float))
         self.assertGreaterEqual(auc_score, 0.0)
         self.assertLessEqual(auc_score, 100.0)
 
     def test_auc_score_with_categorical_data(self):
-        auc_score = syndat.quality.auc(self.real_data_cat, self.synthetic_data_cat)
+        auc_score = syndat.scores.discrimination(self.real_data_cat, self.synthetic_data_cat)
         self.assertTrue(isinstance(auc_score, float))
         self.assertGreaterEqual(auc_score, 0.0)
         self.assertLessEqual(auc_score, 100.0)
diff --git a/tests/test_correlation.py b/tests/test_correlation.py
index 965656e..ef4b55f 100644
--- a/tests/test_correlation.py
+++ b/tests/test_correlation.py
@@ -2,7 +2,7 @@
 
 import pandas as pd
 
-from syndat.quality import correlation
+from syndat.scores import correlation
 
 
 class TestCorrelation(unittest.TestCase):
diff --git a/tests/test_jsd.py b/tests/test_jsd.py
index 60fbc0e..aa1de0e 100644
--- a/tests/test_jsd.py
+++ b/tests/test_jsd.py
@@ -17,8 +17,8 @@ def test_jsd_zero_int64(self):
             'feature1': [6, 7, 8, 9, 10],
             'feature2': [15, 16, 17, 18, 19]
         })
-        jsd = syndat.quality.jsd(real, synthetic)
-        self.assertEqual(0, jsd)
+        distribution = syndat.scores.distribution(real, synthetic)
+        self.assertEqual(0, distribution)
 
     def test_jsd_zero_int64_float64(self):
         synthetic = pd.DataFrame({
@@ -30,8 +30,8 @@ def test_jsd_zero_int64_float64(self):
            'feature1': [6, 7, 8, 9, 10],
            'feature2': [0.6, 0.7, 0.8, 0.9, 1.0]
         })
-        jsd = syndat.quality.jsd(real, synthetic)
-        self.assertEqual(jsd, 0)
+        distribution = syndat.scores.distribution(real, synthetic)
+        self.assertEqual(distribution, 0)
 
     def test_jsd_perfect_int64(self):
         synthetic = pd.DataFrame({
@@ -43,8 +43,8 @@ def test_jsd_perfect_int64(self):
            'feature1': [1, 2, 1, 2, 3],
            'feature2': [11, 12, 13, 14, 15]
         })
-        jsd = syndat.quality.jsd(real, synthetic)
-        self.assertEqual(jsd, 100)
+        distribution = syndat.scores.distribution(real, synthetic)
+        self.assertEqual(distribution, 100)
 
     def test_jsd_perfect_int64_and_float64(self):
         synthetic = pd.DataFrame({
@@ -56,8 +56,8 @@ def test_jsd_perfect_int64_and_float64(self):
            'feature1': [1, 2, 1, 2, 3],
            'feature2': [0.1, 0.2, 0.3, 0.4, 0.5]
         })
-        jsd = syndat.quality.jsd(real, synthetic)
-        self.assertEqual(jsd, 100)
+        distribution = syndat.scores.distribution(real, synthetic)
+        self.assertEqual(distribution, 100)
 
     def test_jsd_different_col_types(self):
         synthetic = pd.DataFrame({
@@ -69,7 +69,7 @@ def test_jsd_different_col_types(self):
            'feature1': [1.2, 2.1, 1.1, 2.1, 3.1],
            'feature2': [1, 2, 3, 4, 5]
         })
-        jsd = syndat.quality.jsd(real, synthetic, score=False)
+        distribution = syndat.scores.distribution(real, synthetic, score=False)
 
     def test_jsd_negative_int64(self):
         synthetic = pd.DataFrame({
@@ -81,7 +81,7 @@ def test_jsd_negative_int64(self):
            'feature1': [-1, 2, 3, 4, 5],
            'feature2': [1, 2, 3, 4, 5]
         })
-        jsd = syndat.quality.jsd(real, synthetic)
+        distribution = syndat.scores.distribution(real, synthetic)
 
     def test_jsd_single_outlier(self):
         synthetic = pd.DataFrame({
@@ -93,8 +93,8 @@ def test_jsd_single_outlier(self):
            'feature1': [1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9],
            'feature2': [1, 1, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 100],
         })
-        jsd = syndat.quality.jsd(real, synthetic)
-        self.assertTrue(jsd < 100)
+        distribution = syndat.scores.distribution(real, synthetic)
+        self.assertTrue(distribution < 100)
 
     def test_jsd_categorical_equal(self):
         synthetic = pd.DataFrame({
@@ -106,8 +106,8 @@ def test_jsd_categorical_equal(self):
            'feature1': ['A', 'B', 'A', 'B', 'C'],
            'feature2': ['X', 'Y', 'Y', 'X', 'Z']
         })
-        jsd = syndat.quality.jsd(real, synthetic)
-        self.assertEqual(jsd, 100)
+        distribution = syndat.scores.distribution(real, synthetic)
+        self.assertEqual(distribution, 100)
 
     def test_jsd_categorical_different(self):
         synthetic = pd.DataFrame({
@@ -119,8 +119,8 @@ def test_jsd_categorical_different(self):
            'feature1': ['A', 'B', 'A', 'B', 'D'],
            'feature2': ['X', 'Y', 'Z', 'X', 'W']
         })
-        jsd = syndat.quality.jsd(real, synthetic)
-        self.assertTrue(jsd < 100)
+        distribution = syndat.scores.distribution(real, synthetic)
+        self.assertTrue(distribution < 100)
 
     def test_jsd_categorical_mixed(self):
         synthetic = pd.DataFrame({
@@ -132,8 +132,8 @@ def test_jsd_categorical_mixed(self):
            'feature1': ['A', 'B', 'C', 'F', 'G'],
            'feature2': [1.0, 2.0, 3.0, 6.0, 7.0]
         })
-        jsd = syndat.quality.jsd(real, synthetic)
-        self.assertTrue(jsd < 100)
+        distribution = syndat.scores.distribution(real, synthetic)
+        self.assertTrue(distribution < 100)
 
     def test_jsd_categorical_with_numerical(self):
         synthetic = pd.DataFrame({
@@ -145,8 +145,8 @@ def test_jsd_categorical_with_numerical(self):
            'feature1': ['A', 'B', 'C', 'A', 'D'],
            'feature2': [1.0, 2.0, 3.0, 4.0, 6.0]
         })
-        jsd = syndat.quality.jsd(real, synthetic)
-        self.assertTrue(jsd < 100)
+        distribution = syndat.scores.distribution(real, synthetic)
+        self.assertTrue(distribution < 100)
 
     def test_jsd_categorical_with_nan(self):
         synthetic = pd.DataFrame({
@@ -158,8 +158,8 @@ def test_jsd_categorical_with_nan(self):
            'feature1': ['A', 'B', 'C', 'D', None],
            'feature2': [1.0, 2.0, None, 4.0, 5.0]
         })
-        jsd = syndat.quality.jsd(real, synthetic)
-        self.assertTrue(jsd < 100)
+        distribution = syndat.scores.distribution(real, synthetic)
+        self.assertTrue(distribution < 100)
 
     def test_jsd_categorical_all_nan(self):
         synthetic = pd.DataFrame({
@@ -171,5 +171,5 @@ def test_jsd_categorical_all_nan(self):
            'feature1': [None, None, None, None, None],
            'feature2': [None, None, None, None, None]
         })
-        jsd = syndat.quality.jsd(real, synthetic)
-        self.assertEqual(jsd, 100)
+        distribution = syndat.scores.distribution(real, synthetic)
+        self.assertEqual(distribution, 100)
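For reviewers, a quick sanity check of the renamed API once this diff is applied. This is a minimal sketch, not part of the change itself: it assumes two small CSV files with matching columns (the file names mirror the README snippet and are illustrative only), and it exercises the deprecation shim kept for `jsd`:

```python
import warnings

import pandas as pd

import syndat

# Illustrative inputs mirroring the README; any two frames with shared columns work.
real = pd.read_csv("real.csv")
synthetic = pd.read_csv("synthetic.csv")

# New entry points: each returns a score in [0, 100], higher = better fidelity.
print(syndat.scores.distribution(real, synthetic))    # distribution similarity (JSD-based)
print(syndat.scores.discrimination(real, synthetic))  # classifier-based discrimination
print(syndat.scores.correlation(real, synthetic))     # correlation preservation

# The old names still resolve but now emit a DeprecationWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    syndat.scores.jsd(real, synthetic)
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```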