Added KSD (#156)
* added KSD

* added test_two_column_map_with_ksd
marqueewinq authored Nov 22, 2023
1 parent 259a874 commit 68dc221
Showing 4 changed files with 108 additions and 14 deletions.
26 changes: 14 additions & 12 deletions src/insight/metrics/__init__.py
@@ -7,6 +7,7 @@
     HellingerDistance,
     JensenShannonDivergence,
     KendallTauCorrelation,
+    KolmogorovSmirnovDistance,
     KullbackLeiblerDivergence,
     Mean,
     Norm,
@@ -16,23 +17,24 @@
 from .metrics_usage import CorrMatrix, DiffCorrMatrix, OneColumnMap, TwoColumnMap
 
 __all__ = [
-    "OneColumnMetric",
-    "TwoColumnMetric",
-    "OneColumnMap",
-    "TwoColumnMap",
+    "BhattacharyyaCoefficient",
     "CorrMatrix",
-    "DiffCorrMatrix",
     "CramersV",
+    "DiffCorrMatrix",
     "EarthMoversDistance",
-    "Mean",
-    "StandardDeviation",
-    "KendallTauCorrelation",
-    "Norm",
-    "TwoDataFrameMetric",
     "EarthMoversDistanceBinned",
+    "HellingerDistance",
     "JensenShannonDivergence",
+    "KendallTauCorrelation",
+    "KolmogorovSmirnovDistance",
     "KullbackLeiblerDivergence",
-    "HellingerDistance",
-    "BhattacharyyaCoefficient",
+    "Mean",
+    "Norm",
+    "OneColumnMap",
+    "OneColumnMetric",
+    "StandardDeviation",
     "TotalVariationDistance",
+    "TwoColumnMap",
+    "TwoColumnMetric",
+    "TwoDataFrameMetric",
 ]
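
With the export above in place, the metric is importable directly from the package root. A minimal sanity check, assuming the insight package from this repository is installed:

import pandas as pd

from insight.metrics import KolmogorovSmirnovDistance

ksd = KolmogorovSmirnovDistance()
# Identical samples give a distance of 0; non-overlapping samples give 1,
# matching the tests added further down in this commit.
print(ksd(pd.Series([1, 2, 3]), pd.Series([1, 2, 3])))  # 0.0
print(ksd(pd.Series([1, 1, 1]), pd.Series([2, 2, 2])))  # 1.0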
34 changes: 33 additions & 1 deletion src/insight/metrics/metrics.py
@@ -4,7 +4,7 @@
 import numpy as np
 import pandas as pd
 from scipy.spatial.distance import jensenshannon
-from scipy.stats import entropy, wasserstein_distance
+from scipy.stats import entropy, ks_2samp, wasserstein_distance
 
 from ..check import Check, ColumnCheck
 from .base import OneColumnMetric, TwoColumnMetric
@@ -489,3 +489,35 @@ def check_column_types(
     def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series):
         (p, q) = zipped_hist((sr_a, sr_b), check=self.check)
         return np.linalg.norm(ty.cast(pd.Series, p) - ty.cast(pd.Series, q), ord=1) / 2
+
+
+class KolmogorovSmirnovDistance(TwoColumnMetric):
+    """Kolmogorov-Smirnov distance between two probability distributions.
+
+    The statistic ranges from 0 to 1: 0 indicates that the two samples follow
+    identical distributions, and 1 indicates that their supports do not overlap.
+    """
+
+    name = "kolmogorov_smirnov_distance"
+
+    @classmethod
+    def check_column_types(
+        cls, sr_a: pd.Series, sr_b: pd.Series, check: Check = ColumnCheck()
+    ) -> bool:
+        if check.continuous(sr_a) and check.continuous(sr_b):
+            return True
+        if check.categorical(sr_a) and check.categorical(sr_b):
+            return True
+        return False
+
+    def _compute_metric(self, sr_a: pd.Series, sr_b: pd.Series) -> float:
+        """Calculate the metric.
+        Args:
+            sr_a (pd.Series): values of a variable.
+            sr_b (pd.Series): values of another variable to compare.
+        Returns:
+            The Kolmogorov-Smirnov distance between sr_a and sr_b.
+        """
+        if sr_a.empty or sr_b.empty:
+            return 1.0
+        return ks_2samp(sr_a, sr_b)[0]  # The first element is the KS statistic
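
For intuition about what ks_2samp returns here: the KS statistic is the largest absolute gap between the two empirical CDFs, which is attained at one of the pooled sample points. A short sketch of that computation follows; ks_statistic is an illustrative name, not part of the library:

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp


def ks_statistic(sr_a: pd.Series, sr_b: pd.Series) -> float:
    # Evaluate both empirical CDFs at every pooled observation.
    pooled = np.concatenate([sr_a.to_numpy(), sr_b.to_numpy()])
    cdf_a = np.searchsorted(np.sort(sr_a), pooled, side="right") / len(sr_a)
    cdf_b = np.searchsorted(np.sort(sr_b), pooled, side="right") / len(sr_b)
    # The KS distance is the supremum of the pointwise CDF differences.
    return float(np.max(np.abs(cdf_a - cdf_b)))


rng = np.random.default_rng(0)
a = pd.Series(rng.normal(0.0, 1.0, 1000))
b = pd.Series(rng.normal(0.5, 1.0, 1000))
assert abs(ks_statistic(a, b) - ks_2samp(a, b)[0]) < 1e-12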
37 changes: 37 additions & 0 deletions tests/test_metrics/test_metrics.py
@@ -13,6 +13,7 @@
     HellingerDistance,
     JensenShannonDivergence,
     KendallTauCorrelation,
+    KolmogorovSmirnovDistance,
     KullbackLeiblerDivergence,
     Mean,
     Norm,
@@ -27,6 +28,7 @@
 emd = EarthMoversDistance()
 hellinger_distance = HellingerDistance()
 kl_divergence = KullbackLeiblerDivergence()
+kolmogorov_smirnov_distance = KolmogorovSmirnovDistance()
 js_divergence = JensenShannonDivergence()
 norm = Norm()
 norm_ord1 = Norm(ord=1)
@@ -286,6 +288,41 @@ def infer_dtype(self, sr: pd.Series) -> pd.Series:
     assert abs(kl_divergence_with_custom_check(sr_g, sr_h) - 0.3) < 0.01
 
 
+def test_kolmogorov_smirnov_distance(group1):
+    # Test with identical distributions
+    assert kolmogorov_smirnov_distance(pd.Series([1, 2, 3]), pd.Series([1, 2, 3])) == 0
+    assert kolmogorov_smirnov_distance(group1, group1) == 0
+
+    # Test with completely different distributions
+    assert kolmogorov_smirnov_distance(pd.Series([1, 1, 1]), pd.Series([2, 2, 2])) == 1
+
+    # Test with slightly different distributions
+    assert 0 < kolmogorov_smirnov_distance(pd.Series([1, 2, 3]), pd.Series([1, 2, 4])) < 1
+
+    # Test with random distributions
+    np.random.seed(0)
+    group2 = pd.Series(np.random.normal(0, 1, 1000))
+    group3 = pd.Series(np.random.normal(0.5, 1, 1000))
+    assert 0 < kolmogorov_smirnov_distance(group2, group3) < 1
+
+    # Test with distributions of different lengths
+    assert 0 < kolmogorov_smirnov_distance(pd.Series([1, 2, 3]), pd.Series([1, 2, 3, 4])) < 1
+
+    # Test with categorical data
+    cat1 = pd.Series(["a", "b", "c", "a"])
+    cat2 = pd.Series(["b", "c", "d"])
+    assert 0 < kolmogorov_smirnov_distance(cat1, cat2) < 1
+
+    # Edge cases
+    # Test with one or both series empty
+    assert kolmogorov_smirnov_distance(pd.Series([], dtype=float), pd.Series([1, 2, 3])) == 1
+    assert kolmogorov_smirnov_distance(pd.Series([1, 2, 3]), pd.Series([], dtype=float)) == 1
+    assert kolmogorov_smirnov_distance(pd.Series([], dtype=float), pd.Series([], dtype=float)) == 1
+
+    # Test with series containing NaN values
+    assert 0 <= kolmogorov_smirnov_distance(pd.Series([1, np.nan, 3]), pd.Series([1, 2, 3])) <= 1
+
+
 def test_js_divergence(group1, group2, group3):
     assert js_divergence(pd.Series([1, 0]), pd.Series([1, 0])) == 0
25 changes: 24 additions & 1 deletion tests/test_metrics/test_metrics_usage.py
@@ -3,7 +3,14 @@
 import pytest
 
 from insight.check import ColumnCheck
-from insight.metrics import CorrMatrix, CramersV, DiffCorrMatrix, EarthMoversDistance, TwoColumnMap
+from insight.metrics import (
+    CorrMatrix,
+    CramersV,
+    DiffCorrMatrix,
+    EarthMoversDistance,
+    KolmogorovSmirnovDistance,
+    TwoColumnMap,
+)
 
 
 @pytest.fixture(scope="module")
@@ -44,6 +51,22 @@ def test_two_column_map(data):
     assert all(np.isnan(emd_map_df["metric_val"][cont]) for cont in continuous_cols)
 
 
+def test_two_column_map_with_ksd(data):
+    df, categorical_cols, continuous_cols = data[0], data[1], data[2]
+    df1 = df.sample(1000).reset_index(drop=True)
+    df2 = df.sample(1000).reset_index(drop=True)
+
+    ksd = KolmogorovSmirnovDistance()
+
+    col_map = TwoColumnMap(ksd)
+    ksd_map_df = col_map(df1, df2)
+    assert col_map.name == f"{str(ksd)}_map"
+
+    assert set(ksd_map_df.columns.to_list()) == set(["metric_val"])
+    assert all(not np.isnan(ksd_map_df["metric_val"][cat]) for cat in categorical_cols)
+    assert all(not np.isnan(ksd_map_df["metric_val"][cont]) for cont in continuous_cols)
+
+
 def test_metric_matrix(data):
     df, categorical_cols, continuous_cols = data[0], data[1], data[2]
     df1 = df.sample(1000).reset_index(drop=True)
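
The assertions in the new test pin down the shape of TwoColumnMap's output: one row per shared column and a single metric_val column, with no NaNs for either categorical or continuous columns now that the wrapped metric accepts both. A rough standalone equivalent of that column-wise application, for illustration only (two_column_map_sketch is a hypothetical helper, not the library class):

import pandas as pd

from insight.metrics import KolmogorovSmirnovDistance


def two_column_map_sketch(df1: pd.DataFrame, df2: pd.DataFrame, metric) -> pd.DataFrame:
    # Apply the metric to each column pair, mirroring the shape the test asserts.
    values = {col: metric(df1[col], df2[col]) for col in df1.columns}
    return pd.DataFrame({"metric_val": pd.Series(values)})


df1 = pd.DataFrame({"x": [1.0, 2.0, 3.0], "c": ["a", "b", "a"]})
df2 = pd.DataFrame({"x": [1.0, 2.5, 4.0], "c": ["b", "b", "a"]})
print(two_column_map_sketch(df1, df2, KolmogorovSmirnovDistance()))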
