From 6232b51bb0b85891608a668a45efd1c1f9f79eca Mon Sep 17 00:00:00 2001
From: Hilly12
Date: Mon, 6 Sep 2021 17:52:58 +0100
Subject: [PATCH 01/11] speed up correlation matrix generation

---
 docs/user_guide/correlations.rst |  8 ++---
 src/fairlens/metrics/__init__.py |  2 ++
 src/fairlens/metrics/unified.py  | 58 +++++++++++++++-----------------
 src/fairlens/plot/__init__.py    |  4 +--
 src/fairlens/plot/heatmap.py     | 57 +++++++------------------------
 5 files changed, 47 insertions(+), 82 deletions(-)

diff --git a/docs/user_guide/correlations.rst b/docs/user_guide/correlations.rst
index 6404347f..a956953a 100644
--- a/docs/user_guide/correlations.rst
+++ b/docs/user_guide/correlations.rst
@@ -65,7 +65,7 @@ Correlation Heatmaps
 ^^^^^^^^^^^^^^^^^^^^
 
 The :code:`plot` module allows users to generate a correlation heatmap of any dataset by simply
-passing the dataframe to the :code:`two_column_heatmap()` function, which will plot a heatmap from the
+passing the dataframe to the :code:`heatmap()` function, which will plot a heatmap from the
 matrix of the correlation coefficients computed using the Pearson Coefficient, the Kruskal-Wallis
 Test and Cramer's V between each pair of columns (for numerical-numerical, categorical-numerical
 and categorical-categorical associations, respectively).
@@ -92,19 +92,17 @@ This will automatically choose different methods for different types of data, ho
 are configurable.
 
 .. ipython:: python
-    :okwarning:
 
     @savefig corr_heatmap_1.png
-    fl.plot.two_column_heatmap(df)
+    fl.plot.heatmap(df)
 
 Let's try generating a heatmap of the same dataset, but using some non-linear metrics for
 numerical-numerical and numerical-categorical associations for added precision.
 
 .. ipython:: python
-    :okwarning:
 
     from fairlens.metrics import distance_nn_correlation, distance_cn_correlation, cramers_v
 
     @savefig corr_heatmap_2.png
-    fl.plot.two_column_heatmap(df, distance_nn_correlation, distance_cn_correlation, cramers_v)
+    fl.plot.heatmap(df, distance_nn_correlation, distance_cn_correlation, cramers_v)
diff --git a/src/fairlens/metrics/__init__.py b/src/fairlens/metrics/__init__.py
index 27edbd41..5c4b2a35 100644
--- a/src/fairlens/metrics/__init__.py
+++ b/src/fairlens/metrics/__init__.py
@@ -23,6 +23,7 @@
     cramers_v,
     distance_cn_correlation,
     distance_nn_correlation,
+    pearson,
     r2_mcfadden,
     kruskal_wallis,
     kruskal_wallis_boolean,
@@ -58,6 +59,7 @@
     "cramers_v",
     "distance_cn_correlation",
     "distance_nn_correlation",
+    "pearson",
     "r2_mcfadden",
     "kruskal_wallis",
     "kruskal_wallis_boolean",
diff --git a/src/fairlens/metrics/unified.py b/src/fairlens/metrics/unified.py
index ce49c459..1bbcac8c 100644
--- a/src/fairlens/metrics/unified.py
+++ b/src/fairlens/metrics/unified.py
@@ -2,9 +2,9 @@
 Collection of helper methods which can be used as an interface to metrics.
 """
 
-import multiprocessing as mp
-from typing import Any, Callable, List, Mapping, Optional, Tuple, Type, Union
+from typing import Any, Callable, List, Mapping, Tuple, Type, Union
 
+import numpy as np
 import pandas as pd
 
 from .. import utils
@@ -118,8 +118,6 @@ def correlation_matrix(
     num_num_metric: Callable[[pd.Series, pd.Series], float] = pearson,
     cat_num_metric: Callable[[pd.Series, pd.Series], float] = kruskal_wallis,
     cat_cat_metric: Callable[[pd.Series, pd.Series], float] = cramers_v,
-    columns_x: Optional[List[str]] = None,
-    columns_y: Optional[List[str]] = None,
 ) -> pd.DataFrame:
     """This function creates a correlation matrix out of a dataframe, using a correlation metric for each
     possible type of pair of series (i.e.
     numerical-numerical, categorical-numerical, categorical-categorical).
@@ -135,51 +133,51 @@
         cat_cat_metric (Callable[[pd.Series, pd.Series], float], optional):
             The correlation metric used for categorical-categorical series pairs. Defaults to corrected
             Cramer's V statistic.
-        columns_x (Optional[List[str]]):
-            The column names that determine the rows of the matrix.
-        columns_y (Optional[List[str]]):
-            The column names that determine the columns of the matrix.
 
     Returns:
         pd.DataFrame:
             The correlation matrix to be used in heatmap generation.
     """
 
-    if columns_x is None:
-        columns_x = df.columns
+    df = df.copy()
 
-    if columns_y is None:
-        columns_y = df.columns
+    distr_types = [utils.infer_distr_type(df[col]) for col in df.columns]
 
-    pool = mp.Pool(mp.cpu_count())
+    for col in df.columns:
+        df[col] = utils.infer_dtype(df[col])
 
-    series_list = [
-        pd.Series(
-            pool.starmap(
-                _correlation_matrix_helper,
-                [(df[col_x], df[col_y], num_num_metric, cat_num_metric, cat_cat_metric) for col_x in columns_x],
-            ),
-            index=columns_x,
-            name=col_y,
-        )
-        for col_y in columns_y
-    ]
+        if df[col].dtype.kind == "O":
+            df[col] = pd.factorize(df[col])[0]
+
+    df = df.append(pd.DataFrame({col: [i] for i, col in enumerate(df.columns)}))
 
-    pool.close()
+    def corr(a: np.ndarray, b: np.ndarray):
+        return _correlation_matrix_helper(
+            a,
+            b,
+            distr_types=distr_types,
+            num_num_metric=num_num_metric,
+            cat_num_metric=cat_num_metric,
+            cat_cat_metric=cat_cat_metric,
+        )
 
-    return pd.concat(series_list, axis=1, keys=[series.name for series in series_list])
+    return df.corr(method=corr)
 
 
 def _correlation_matrix_helper(
-    sr_a: pd.Series,
-    sr_b: pd.Series,
+    a: np.ndarray,
+    b: np.ndarray,
+    distr_types: List[utils.DistrType],
     num_num_metric: Callable[[pd.Series, pd.Series], float] = pearson,
     cat_num_metric: Callable[[pd.Series, pd.Series], float] = kruskal_wallis,
     cat_cat_metric: Callable[[pd.Series, pd.Series], float] = cramers_v,
 ) -> float:
 
-    a_type = utils.infer_distr_type(sr_a)
-    b_type = utils.infer_distr_type(sr_b)
+    a_type = distr_types[int(a[-1])]
+    b_type = distr_types[int(b[-1])]
+
+    sr_a = pd.Series(a[:-1])
+    sr_b = pd.Series(b[:-1])
 
     if a_type.is_continuous() and b_type.is_continuous():
         return num_num_metric(sr_a, sr_b)
diff --git a/src/fairlens/plot/__init__.py b/src/fairlens/plot/__init__.py
index b7c81aed..e2bac2a5 100644
--- a/src/fairlens/plot/__init__.py
+++ b/src/fairlens/plot/__init__.py
@@ -4,7 +4,7 @@
 
 from .distr import attr_distr_plot, distr_plot, mult_distr_plot
-from .heatmap import two_column_heatmap
+from .heatmap import heatmap
 from .style import reset_style, use_style
 
-__all__ = ["use_style", "reset_style", "distr_plot", "attr_distr_plot", "mult_distr_plot", "two_column_heatmap"]
+__all__ = ["use_style", "reset_style", "distr_plot", "attr_distr_plot", "mult_distr_plot", "heatmap"]
diff --git a/src/fairlens/plot/heatmap.py b/src/fairlens/plot/heatmap.py
index 1223b0a6..8182388c 100644
--- a/src/fairlens/plot/heatmap.py
+++ b/src/fairlens/plot/heatmap.py
@@ -2,23 +2,21 @@
 Plot correlation heatmaps for datasets.
""" -from typing import Callable, List, Optional +from typing import Callable import matplotlib.pyplot as plt -import numpy as np import pandas as pd import seaborn as sns from ..metrics import correlation, unified -def two_column_heatmap( +def heatmap( df: pd.DataFrame, num_num_metric: Callable[[pd.Series, pd.Series], float] = correlation.pearson, cat_num_metric: Callable[[pd.Series, pd.Series], float] = correlation.kruskal_wallis, cat_cat_metric: Callable[[pd.Series, pd.Series], float] = correlation.cramers_v, - columns_x: Optional[List[str]] = None, - columns_y: Optional[List[str]] = None, + **kwargs ): """This function creates a correlation heatmap out of a dataframe, using user provided or default correlation metrics for all possible types of pairs of series (i.e. numerical-numerical, categorical-numerical, @@ -35,48 +33,17 @@ def two_column_heatmap( cat_cat_metric (Callable[[pd.Series, pd.Series], float], optional): The correlation metric used for categorical-categorical series pairs. Defaults to corrected Cramer's V statistic. - columns_x (Optional[List[str]]): - The sensitive dataframe column names that will be used in generating the correlation heatmap. - columns_y (Optional[List[str]]): - The non-sensitive dataframe column names that will be used in generating the correlation heatmap. + kwargs: + Key word arguments for sns.heatmap. """ - if columns_x is None: - columns_x = df.columns + corr_matrix = unified.correlation_matrix(df, num_num_metric, cat_num_metric, cat_cat_metric) - if columns_y is None: - columns_y = df.columns + if "cmap" not in kwargs: + kwargs["cmap"] = sns.cubehelix_palette(start=0.2, rot=-0.2, dark=0.3, as_cmap=True) - corr_matrix = unified.correlation_matrix( - df, num_num_metric, cat_num_metric, cat_cat_metric, columns_x, columns_y - ).round(2) + if "linewidth" not in kwargs: + kwargs["linewidth"] = 0.5 - fig_width = 20.0 - margin_top = 0.8 - margin_bot = 0.8 - margin_left = 0.8 - margin_right = 0.8 - - cell_size = (fig_width - margin_left - margin_right) / float(len(columns_y)) - fig_height = cell_size * len(columns_x) + margin_bot + margin_top - - plt.figure(figsize=(fig_width, fig_height), tight_layout=True) - plt.subplots_adjust( - bottom=margin_bot / fig_height, - top=1.0 - margin_top / fig_height, - left=margin_left / fig_width, - right=1.0 - margin_right / fig_width, - ) - - g = sns.heatmap( - corr_matrix, - vmin=0, - vmax=1, - annot=True, - annot_kws={"size": 35 / np.sqrt(len(corr_matrix))}, - square=True, - cbar=True, - ) - - g.set_xticklabels(g.get_xticklabels(), rotation=90, horizontalalignment="right", fontdict={"fontsize": 14}) - g.set_yticklabels(g.get_yticklabels(), rotation=0, horizontalalignment="right", fontdict={"fontsize": 14}) + sns.heatmap(corr_matrix, **kwargs) + plt.tight_layout() From 4ae6e9f8c720728ca4faf41df675e49a2ef5c9cd Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Tue, 7 Sep 2021 10:34:12 +0100 Subject: [PATCH 02/11] add kendall tau, spearman --- src/fairlens/metrics/__init__.py | 4 ++ src/fairlens/metrics/correlation.py | 56 ++++++++++++++++--- src/fairlens/plot/__init__.py | 2 +- .../plot/{heatmap.py => correlation.py} | 2 +- 4 files changed, 55 insertions(+), 9 deletions(-) rename src/fairlens/plot/{heatmap.py => correlation.py} (96%) diff --git a/src/fairlens/metrics/__init__.py b/src/fairlens/metrics/__init__.py index 5c4b2a35..60350238 100644 --- a/src/fairlens/metrics/__init__.py +++ b/src/fairlens/metrics/__init__.py @@ -24,6 +24,8 @@ distance_cn_correlation, distance_nn_correlation, pearson, + kendall_tau, + spearman, 
     r2_mcfadden,
     kruskal_wallis,
     kruskal_wallis_boolean,
@@ -60,6 +62,8 @@ "cramers_v",
     "distance_cn_correlation",
     "distance_nn_correlation",
     "pearson",
+    "kendall_tau",
+    "spearman",
     "r2_mcfadden",
     "kruskal_wallis",
     "kruskal_wallis_boolean",
diff --git a/src/fairlens/metrics/correlation.py b/src/fairlens/metrics/correlation.py
index 4b1ca0ba..a06cf2da 100644
--- a/src/fairlens/metrics/correlation.py
+++ b/src/fairlens/metrics/correlation.py
@@ -23,7 +23,8 @@ def cramers_v(sr_a: pd.Series, sr_b: pd.Series) -> float:
             Second categorical series to analyze.
 
     Returns:
-        float: Value of the statistic.
+        float:
+            Value of the statistic.
     """
 
     if len(sr_a.value_counts()) == 1:
@@ -49,17 +50,56 @@ def cramers_v(sr_a: pd.Series, sr_b: pd.Series) -> float:
     return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
 
 
 def pearson(sr_a: pd.Series, sr_b: pd.Series) -> float:
-    """Metric that calculates Pearson's correlation coefficient for numerical-numerical
+    """Calculates Pearson's correlation coefficient for numerical-numerical
     pairs of series, used in heatmap generation.
 
     Args:
-        sr_a (pd.Series): First numerical series to analyze.
-        sr_b (pd.Series): Second numerical series to analyze.
+        sr_a (pd.Series):
+            First numerical series to analyze.
+        sr_b (pd.Series):
+            Second numerical series to analyze.
 
     Returns:
-        float: Value of the coefficient.
+        float:
+            Value of the coefficient.
     """
-    return abs(sr_a.corr(sr_b))
+
+    return sr_a.corr(sr_b, method="pearson")
+
+
+def kendall_tau(sr_a: pd.Series, sr_b: pd.Series) -> float:
+    """Calculates the Kendall Tau correlation coefficient for pairs of series.
+
+    Args:
+        sr_a (pd.Series):
+            First numerical series to analyze.
+        sr_b (pd.Series):
+            Second numerical series to analyze.
+
+    Returns:
+        float:
+            Value of the coefficient.
+    """
+
+    return sr_a.corr(sr_b, method="kendall")
+
+
+def spearman(sr_a: pd.Series, sr_b: pd.Series) -> float:
+    """Calculates the Spearman Rank correlation coefficient for pairs of series.
+
+    Args:
+        sr_a (pd.Series):
+            First numerical series to analyze.
+        sr_b (pd.Series):
+            Second numerical series to analyze.
+
+    Returns:
+        float:
+            Value of the coefficient.
+    """
+
+
+    return sr_a.corr(sr_b, method="spearman")
 
 
 def r2_mcfadden(sr_a: pd.Series, sr_b: pd.Series) -> float:
@@ -78,6 +118,7 @@ def r2_mcfadden(sr_a: pd.Series, sr_b: pd.Series) -> float:
     Returns:
         float: Value of the pseudo-R2 McFadden score.
     """
+
     x = sr_b.to_numpy().reshape(-1, 1)
     x = StandardScaler().fit_transform(x)
     y = sr_a.to_numpy()
@@ -147,7 +188,8 @@ def kruskal_wallis_boolean(sr_a: pd.Series, sr_b: pd.Series, p_cutoff: float = 0
             The maximum admitted p-value for the distributions to be considered independent.
 
     Returns:
-        bool: Bool value representing whether or not the two series are correlated.
+        bool:
+            Bool value representing whether or not the two series are correlated.
""" sr_a = sr_a.astype("category").cat.codes diff --git a/src/fairlens/plot/__init__.py b/src/fairlens/plot/__init__.py index e2bac2a5..bdf3b29a 100644 --- a/src/fairlens/plot/__init__.py +++ b/src/fairlens/plot/__init__.py @@ -3,8 +3,8 @@ """ +from .correlation import heatmap from .distr import attr_distr_plot, distr_plot, mult_distr_plot -from .heatmap import heatmap from .style import reset_style, use_style __all__ = ["use_style", "reset_style", "distr_plot", "attr_distr_plot", "mult_distr_plot", "heatmap"] diff --git a/src/fairlens/plot/heatmap.py b/src/fairlens/plot/correlation.py similarity index 96% rename from src/fairlens/plot/heatmap.py rename to src/fairlens/plot/correlation.py index 8182388c..e3ab7160 100644 --- a/src/fairlens/plot/heatmap.py +++ b/src/fairlens/plot/correlation.py @@ -45,5 +45,5 @@ def heatmap( if "linewidth" not in kwargs: kwargs["linewidth"] = 0.5 - sns.heatmap(corr_matrix, **kwargs) + sns.heatmap(corr_matrix, vmin=0, vmax=1, square=True, **kwargs) plt.tight_layout() From 9ff13c43b327aad62c42fe4ff773215ff965e6a4 Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Thu, 9 Sep 2021 12:45:30 +0100 Subject: [PATCH 03/11] update cramers v --- src/fairlens/metrics/correlation.py | 42 ++++++++++++++++------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/src/fairlens/metrics/correlation.py b/src/fairlens/metrics/correlation.py index a06cf2da..3950d3f6 100644 --- a/src/fairlens/metrics/correlation.py +++ b/src/fairlens/metrics/correlation.py @@ -27,26 +27,31 @@ def cramers_v(sr_a: pd.Series, sr_b: pd.Series) -> float: Value of the statistic. """ - if len(sr_a.value_counts()) == 1: - return 0 - if len(sr_b.value_counts()) == 1: - return 0 - else: - confusion_matrix = pd.crosstab(sr_a, sr_b) + table_orig = pd.crosstab(sr_a.astype(str), sr_b.astype(str)) + table = np.asarray(table_orig, dtype=np.float64) + + if table.min() == 0: + table[table == 0] = 0.5 + + n = table.sum() + row = table.sum(1) / n + col = table.sum(0) / n - if confusion_matrix.shape[0] == 2: - correct = False - else: - correct = True + row = pd.Series(data=row, index=table_orig.index) + col = pd.Series(data=col, index=table_orig.columns) - chi2 = ss.chi2_contingency(confusion_matrix, correction=correct)[0] - n = sum(confusion_matrix.sum()) - phi2 = chi2 / n - r, k = confusion_matrix.shape - phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1)) - rcorr = r - ((r - 1) ** 2) / (n - 1) - kcorr = k - ((k - 1) ** 2) / (n - 1) - return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1))) + itab = np.outer(row, col) + probs = pd.DataFrame(data=itab, index=table_orig.index, columns=table_orig.columns) + + fit = table.sum() * probs + expected = fit.to_numpy() + + real = table + r, c = real.shape + n = np.sum(real) + v = np.sum((real - expected) ** 2 / (expected * n * min(r - 1, c - 1))) ** 0.5 + + return v def pearson(sr_a: pd.Series, sr_b: pd.Series) -> float: @@ -98,7 +103,6 @@ def spearman(sr_a: pd.Series, sr_b: pd.Series) -> float: Value of the coefficient. 
""" - return sr_a.corr(sr_b, method="spearman") From a18723148efc5b1691045dd39a5513404ad6f6f9 Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Thu, 9 Sep 2021 13:59:20 +0100 Subject: [PATCH 04/11] revert cramers v --- src/fairlens/metrics/correlation.py | 34 ++++++++++------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/src/fairlens/metrics/correlation.py b/src/fairlens/metrics/correlation.py index 3950d3f6..4dcb81ec 100644 --- a/src/fairlens/metrics/correlation.py +++ b/src/fairlens/metrics/correlation.py @@ -27,31 +27,19 @@ def cramers_v(sr_a: pd.Series, sr_b: pd.Series) -> float: Value of the statistic. """ - table_orig = pd.crosstab(sr_a.astype(str), sr_b.astype(str)) - table = np.asarray(table_orig, dtype=np.float64) - - if table.min() == 0: - table[table == 0] = 0.5 - - n = table.sum() - row = table.sum(1) / n - col = table.sum(0) / n - - row = pd.Series(data=row, index=table_orig.index) - col = pd.Series(data=col, index=table_orig.columns) - - itab = np.outer(row, col) - probs = pd.DataFrame(data=itab, index=table_orig.index, columns=table_orig.columns) - - fit = table.sum() * probs - expected = fit.to_numpy() + if sr_a.nunique() == 1 or sr_b.nunique() == 1: + return 0 - real = table - r, c = real.shape - n = np.sum(real) - v = np.sum((real - expected) ** 2 / (expected * n * min(r - 1, c - 1))) ** 0.5 + confusion_matrix = pd.crosstab(sr_a, sr_b) - return v + chi2 = ss.chi2_contingency(confusion_matrix, correction=(confusion_matrix.shape[0] != 2))[0] + n = sum(confusion_matrix.sum()) + phi2 = chi2 / n + r, k = confusion_matrix.shape + phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1)) + rcorr = r - ((r - 1) ** 2) / (n - 1) + kcorr = k - ((k - 1) ** 2) / (n - 1) + return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1))) def pearson(sr_a: pd.Series, sr_b: pd.Series) -> float: From b6f4edcd4fa98e1314c11452fbae7e42626e9a88 Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Thu, 9 Sep 2021 16:52:03 +0100 Subject: [PATCH 05/11] remove kendall tau, spearman rank --- src/fairlens/metrics/correlation.py | 36 +---------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/src/fairlens/metrics/correlation.py b/src/fairlens/metrics/correlation.py index 4dcb81ec..d5c40afd 100644 --- a/src/fairlens/metrics/correlation.py +++ b/src/fairlens/metrics/correlation.py @@ -60,40 +60,6 @@ def pearson(sr_a: pd.Series, sr_b: pd.Series) -> float: return sr_a.corr(sr_b, method="pearson") -def kendall_tau(sr_a: pd.Series, sr_b: pd.Series) -> float: - """Calculates the Kendall Tau correlation coefficent for pairs of series. - - Args: - sr_a (pd.Series): - First numerical series to analyze. - sr_b (pd.Series): - Second numerical series to analyze. - - Returns: - float: - Value of the coefficient. - """ - - return sr_a.corr(sr_b, method="kendall") - - -def spearman(sr_a: pd.Series, sr_b: pd.Series) -> float: - """Calculates the Spearman Rank correlation coefficent for pairs of series. - - Args: - sr_a (pd.Series): - First numerical series to analyze. - sr_b (pd.Series): - Second numerical series to analyze. - - Returns: - float: - Value of the coefficient. - """ - - return sr_a.corr(sr_b, method="spearman") - - def r2_mcfadden(sr_a: pd.Series, sr_b: pd.Series) -> float: """Metric used for categorical-numerical continuous. 
     It trains two multinomial logistic regression models on the data, one using the numerical series as the feature and the other
@@ -153,14 +119,14 @@ def kruskal_wallis(sr_a: pd.Series, sr_b: pd.Series) -> float:
         p-value is the probability that the two columns are not correlated.
     """
 
-    sr_a = sr_a.astype("category").cat.codes
     groups = sr_b.groupby(sr_a)
     arrays = [groups.get_group(category) for category in sr_a.unique()]
 
     args = [group.array for group in arrays]
     try:
         _, p_val = ss.kruskal(*args, nan_policy="omit")
     except ValueError:
+        # TODO: Warning
         return 0
 
     return p_val

From 45c7caf9342556977b29f47bda433621ad2ea8b0 Mon Sep 17 00:00:00 2001
From: Hilly12
Date: Thu, 9 Sep 2021 16:54:04 +0100
Subject: [PATCH 06/11] remove references to kendall tau, spearman

---
 src/fairlens/metrics/__init__.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/fairlens/metrics/__init__.py b/src/fairlens/metrics/__init__.py
index 60350238..5c4b2a35 100644
--- a/src/fairlens/metrics/__init__.py
+++ b/src/fairlens/metrics/__init__.py
@@ -24,8 +24,6 @@ cramers_v,
     distance_cn_correlation,
     distance_nn_correlation,
     pearson,
-    kendall_tau,
-    spearman,
     r2_mcfadden,
     kruskal_wallis,
     kruskal_wallis_boolean,
@@ -62,8 +60,6 @@ "cramers_v",
     "distance_cn_correlation",
     "distance_nn_correlation",
     "pearson",
-    "kendall_tau",
-    "spearman",
     "r2_mcfadden",
     "kruskal_wallis",
     "kruskal_wallis_boolean",

From b8fb0a3ed770b580aed0d846b9f6e51c7854b157 Mon Sep 17 00:00:00 2001
From: Hilly12
Date: Mon, 13 Sep 2021 14:21:31 +0100
Subject: [PATCH 07/11] add stress tests for correlation matrix, join columns before dropping nulls

---
 src/fairlens/metrics/correlation.py | 37 +++++++++++++--------
 src/fairlens/metrics/unified.py     | 12 ++++---
 tests/test_correlation.py           | 50 +++++++++++++++++++++++++++--
 tests/test_plot.py                  | 13 ++++++++
 4 files changed, 92 insertions(+), 20 deletions(-)

diff --git a/src/fairlens/metrics/correlation.py b/src/fairlens/metrics/correlation.py
index d5c40afd..4e12756c 100644
--- a/src/fairlens/metrics/correlation.py
+++ b/src/fairlens/metrics/correlation.py
@@ -11,6 +11,9 @@
 from sklearn import linear_model
 from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
 
+EPSILON = 1e-6
+MIN_MEAN_SAMPLE_SIZE = 20
+
 
 def cramers_v(sr_a: pd.Series, sr_b: pd.Series) -> float:
     """Metric that calculates the corrected Cramer's V statistic for categorical-categorical
@@ -27,7 +30,27 @@ def cramers_v(sr_a: pd.Series, sr_b: pd.Series) -> float:
             Value of the statistic.
""" - if sr_a.nunique() == 1 or sr_b.nunique() == 1: - return 0 + if sr_a.equals(sr_b): + return 1 confusion_matrix = pd.crosstab(sr_a, sr_b) + r, k = confusion_matrix.shape + n = confusion_matrix.to_numpy().sum() + + if r < 2 or k < 2: + return 0 chi2 = ss.chi2_contingency(confusion_matrix, correction=(confusion_matrix.shape[0] != 2))[0] - n = sum(confusion_matrix.sum()) phi2 = chi2 / n - r, k = confusion_matrix.shape - phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1)) + + phi2corr = phi2 - ((k - 1) * (r - 1)) / (n - 1) + + if phi2corr <= EPSILON: + return 0 + rcorr = r - ((r - 1) ** 2) / (n - 1) kcorr = k - ((k - 1) ** 2) / (n - 1) + return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1))) @@ -120,15 +132,16 @@ def kruskal_wallis(sr_a: pd.Series, sr_b: pd.Series) -> float: """ groups = sr_b.groupby(sr_a) - arrays = [groups.get_group(category) for category in sr_a.unique()] + if len(groups) < 2: + return 0 - args = [group.array for group in arrays] - try: - _, p_val = ss.kruskal(*args, nan_policy="omit") - except ValueError: - # TODO: Warning + args = [groups.get_group(category).array for category in sr_a.unique()] + + if np.mean([len(values) for values in args]) <= MIN_MEAN_SAMPLE_SIZE: return 0 + _, p_val = ss.kruskal(*args, nan_policy="omit") + return p_val @@ -181,8 +194,6 @@ def distance_nn_correlation(sr_a: pd.Series, sr_b: pd.Series) -> float: The correlation coefficient. """ - warnings.filterwarnings(action="ignore", category=UserWarning) - if sr_a.size < sr_b.size: sr_a = sr_a.append(pd.Series(sr_a.mean()).repeat(sr_b.size - sr_a.size), ignore_index=True) elif sr_a.size > sr_b.size: diff --git a/src/fairlens/metrics/unified.py b/src/fairlens/metrics/unified.py index 1bbcac8c..df059104 100644 --- a/src/fairlens/metrics/unified.py +++ b/src/fairlens/metrics/unified.py @@ -147,7 +147,7 @@ def correlation_matrix( df[col] = utils.infer_dtype(df[col]) if df[col].dtype.kind == "O": - df[col] = pd.factorize(df[col])[0] + df[col] = pd.Series(pd.factorize(df[col], na_sentinel=-1)[0]).replace(-1, np.nan) df = df.append(pd.DataFrame({col: [i] for i, col in enumerate(df.columns)})) @@ -179,14 +179,16 @@ def _correlation_matrix_helper( sr_a = pd.Series(a[:-1]) sr_b = pd.Series(b[:-1]) + df = pd.DataFrame({"a": sr_a, "b": sr_b}).dropna().reset_index() + if a_type.is_continuous() and b_type.is_continuous(): - return num_num_metric(sr_a, sr_b) + return num_num_metric(df["a"], df["b"]) elif b_type.is_continuous(): - return cat_num_metric(sr_a, sr_b) + return cat_num_metric(df["a"], df["b"]) elif a_type.is_continuous(): - return cat_num_metric(sr_b, sr_a) + return cat_num_metric(df["b"], df["a"]) else: - return cat_cat_metric(sr_a, sr_b) + return cat_cat_metric(df["a"], df["b"]) diff --git a/tests/test_correlation.py b/tests/test_correlation.py index e6213ddd..c925032d 100644 --- a/tests/test_correlation.py +++ b/tests/test_correlation.py @@ -1,6 +1,15 @@ import pandas as pd - -from fairlens.metrics.correlation import distance_cn_correlation, distance_nn_correlation +import pytest + +from fairlens import utils +from fairlens.metrics.correlation import ( + cramers_v, + distance_cn_correlation, + distance_nn_correlation, + kruskal_wallis, + pearson, +) +from fairlens.metrics.unified import correlation_matrix from fairlens.sensitive.correlation import find_column_correlation, find_sensitive_correlations pair_race = "race", "Ethnicity" @@ -9,6 +18,8 @@ pair_gender = "gender", "Gender" pair_nationality = "nationality", "Nationality" +epsilon = 1e-6 + def test_correlation(): col_names = ["gender", 
"random", "score"] @@ -133,3 +144,38 @@ def test_cn_unequal_series_corr(): sr_b = pd.Series([100, 200, 99, 101, 201, 199, 299, 300, 301, 500, 501, 505, 10, 12, 1001, 1050]) assert distance_cn_correlation(sr_a, sr_b) > 0.7 + + +@pytest.mark.parametrize("dataset", ["titanic", "german_credit_data"]) +def test_correlation_matrix(dataset): + df = pd.read_csv(f"datasets/{dataset}.csv") + num_num_metric = pearson + cat_num_metric = kruskal_wallis + cat_cat_metric = cramers_v + + matrix = correlation_matrix( + df, num_num_metric=num_num_metric, cat_num_metric=cat_num_metric, cat_cat_metric=cat_cat_metric + ).to_numpy() + + for i, r in enumerate(df.columns): + for j, c in enumerate(df.columns): + sr_a = utils.infer_dtype(df[r]) + sr_b = utils.infer_dtype(df[c]) + a_type = utils.infer_distr_type(sr_a) + b_type = utils.infer_distr_type(sr_b) + + d = pd.DataFrame({"a": sr_a, "b": sr_b}).dropna().reset_index() + + if a_type.is_continuous() and b_type.is_continuous(): + corr = num_num_metric(d["a"], d["b"]) + + elif b_type.is_continuous(): + corr = cat_num_metric(d["a"], d["b"]) + + elif a_type.is_continuous(): + corr = cat_num_metric(d["b"], d["a"]) + + else: + corr = cat_cat_metric(d["a"], d["b"]) + + assert matrix[i][j] - corr < epsilon diff --git a/tests/test_plot.py b/tests/test_plot.py index 47c8e007..112ccb0b 100644 --- a/tests/test_plot.py +++ b/tests/test_plot.py @@ -1,6 +1,7 @@ import pandas as pd import seaborn as sns +from fairlens.plot.correlation import heatmap from fairlens.plot.distr import attr_distr_plot, distr_plot, mult_distr_plot dfa = pd.read_csv("datasets/adult.csv") @@ -33,3 +34,15 @@ def test_mult_distr_plot_german(): def test_mult_distr_plot_titanic(): mult_distr_plot(dft, "Survived", ["Sex", "Age"]) + + +def test_heatmap_adult(): + heatmap(dfa) + + +def test_heatmap_german(): + heatmap(dfg) + + +def test_heatmap_titanic(): + heatmap(dft) From a00f23726f09463eba9e8b9e8f89a410493fbcbc Mon Sep 17 00:00:00 2001 From: bogdansurdu Date: Thu, 16 Sep 2021 11:16:48 +0300 Subject: [PATCH 08/11] update first proxy detection example --- docs/user_guide/correlations.rst | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/user_guide/correlations.rst b/docs/user_guide/correlations.rst index a956953a..e9d3c61f 100644 --- a/docs/user_guide/correlations.rst +++ b/docs/user_guide/correlations.rst @@ -32,8 +32,16 @@ Let's first look at how we would go about detecting correlations inside a datafr import fairlens as fl columns = ["gender", "random", "score"] - data = [["male", 10, 50], ["female", 20, 80], ["male", 20, 60], ["female", 10, 90]] - + data = [ + ["male", 10, 60], + ["female", 10, 80], + ["male", 10, 60], + ["female", 10, 80], + ["male", 9, 59], + ["female", 11, 80], + ["male", 12, 61], + ["female", 10, 83], + ] df = pd.DataFrame(data, columns=columns) Here the score seems to be correlated with gender, with females leaning towards somewhat higher scores. From 2eb380d3a6656a624cb0b8e5616838976003525e Mon Sep 17 00:00:00 2001 From: Hilly12 Date: Thu, 16 Sep 2021 11:21:40 +0100 Subject: [PATCH 09/11] remove kwargs from heatmap --- src/fairlens/plot/correlation.py | 37 ++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/src/fairlens/plot/correlation.py b/src/fairlens/plot/correlation.py index e3ab7160..e897e748 100644 --- a/src/fairlens/plot/correlation.py +++ b/src/fairlens/plot/correlation.py @@ -2,11 +2,12 @@ Plot correlation heatmaps for datasets. 
""" -from typing import Callable +from typing import Callable, Optional, Sequence, Tuple import matplotlib.pyplot as plt import pandas as pd import seaborn as sns +from matplotlib.axes import Axes from ..metrics import correlation, unified @@ -16,8 +17,9 @@ def heatmap( num_num_metric: Callable[[pd.Series, pd.Series], float] = correlation.pearson, cat_num_metric: Callable[[pd.Series, pd.Series], float] = correlation.kruskal_wallis, cat_cat_metric: Callable[[pd.Series, pd.Series], float] = correlation.cramers_v, - **kwargs -): + cmap: Optional[Sequence[Tuple[float, float, float]]] = None, + annotate: bool = False, +) -> Axes: """This function creates a correlation heatmap out of a dataframe, using user provided or default correlation metrics for all possible types of pairs of series (i.e. numerical-numerical, categorical-numerical, categorical-categorical). @@ -33,17 +35,30 @@ def heatmap( cat_cat_metric (Callable[[pd.Series, pd.Series], float], optional): The correlation metric used for categorical-categorical series pairs. Defaults to corrected Cramer's V statistic. - kwargs: - Key word arguments for sns.heatmap. + cmap (Optional[Sequence[Tuple[float, float, float]]], optional): + A sequence of RGB tuples used to colour the histograms. If None seaborn's default pallete + will be used. Defaults to None. + annotate (bool, optional): + Annotate the heatmap. + + Returns: + matplotlib.axes.Axes: + The matplotlib axis containing the plot. + + Examples: + >>> df = pd.read_csv("datasets/german_credit_data.csv") + >>> heatmap(df) + >>> plt.show() + + .. image:: ../../savefig/corr_heatmap_1.png """ corr_matrix = unified.correlation_matrix(df, num_num_metric, cat_num_metric, cat_cat_metric) - if "cmap" not in kwargs: - kwargs["cmap"] = sns.cubehelix_palette(start=0.2, rot=-0.2, dark=0.3, as_cmap=True) + cmap = cmap or sns.cubehelix_palette(start=0.2, rot=-0.2, dark=0.3, as_cmap=True) + annot = annotate or None - if "linewidth" not in kwargs: - kwargs["linewidth"] = 0.5 - - sns.heatmap(corr_matrix, vmin=0, vmax=1, square=True, **kwargs) + ax = sns.heatmap(corr_matrix, vmin=0, vmax=1, square=True, cmap=cmap, linewidth=0.5, annot=annot, fmt=".1f") plt.tight_layout() + + return ax From 23dbbcb908e4599cb5443bfdf8aec7714980a5b2 Mon Sep 17 00:00:00 2001 From: bogdansurdu Date: Thu, 16 Sep 2021 19:54:31 +0300 Subject: [PATCH 10/11] use infer_distr_type instead of old check, fix order of type checks in proxies --- src/fairlens/sensitive/correlation.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/fairlens/sensitive/correlation.py b/src/fairlens/sensitive/correlation.py index e72f2416..7d330a84 100644 --- a/src/fairlens/sensitive/correlation.py +++ b/src/fairlens/sensitive/correlation.py @@ -7,6 +7,7 @@ import pandas as pd +from .. import utils from ..metrics import correlation as cm from ..sensitive import detection as dt @@ -148,18 +149,17 @@ def find_column_correlation( def _compute_series_correlation( sr_a: pd.Series, sr_b: pd.Series, corr_cutoff: float = 0.75, p_cutoff: float = 0.1 ) -> bool: - a_categorical = sr_a.map(type).eq(str).all() - b_categorical = sr_b.map(type).eq(str).all() - - if a_categorical and b_categorical: - # If both columns are categorical, we use Cramer's V. - if cm.cramers_v(sr_a, sr_b) > corr_cutoff: - return True - elif not a_categorical and b_categorical: - # If just one column is categorical, we can group by it and use Kruskal-Wallis H Test. 
-        return cm.kruskal_wallis_boolean(sr_b, sr_a, p_cutoff=p_cutoff)
-    elif a_categorical and not b_categorical:
+    a_type = utils.infer_distr_type(sr_a)
+    b_type = utils.infer_distr_type(sr_b)
+
+    if a_type.is_continuous() and b_type.is_continuous():
+        return cm.pearson(sr_a, sr_b) > corr_cutoff
+
+    elif b_type.is_continuous():
         return cm.kruskal_wallis_boolean(sr_a, sr_b, p_cutoff=p_cutoff)
 
-    # If both columns are numeric, we use standard Pearson correlation and the correlation cutoff.
-    return cm.pearson(sr_a, sr_b) > corr_cutoff
+    elif a_type.is_continuous():
+        return cm.kruskal_wallis_boolean(sr_b, sr_a, p_cutoff=p_cutoff)
+
+    else:
+        return cm.cramers_v(sr_a, sr_b) > corr_cutoff

From 4f48f5eb7028bd591b4b8a58a050df33159487b6 Mon Sep 17 00:00:00 2001
From: bogdansurdu
Date: Thu, 16 Sep 2021 19:55:07 +0300
Subject: [PATCH 11/11] extend proxy tests, add correct results

---
 tests/test_correlation.py | 35 ++++++++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/tests/test_correlation.py b/tests/test_correlation.py
index c925032d..f055d2b4 100644
--- a/tests/test_correlation.py
+++ b/tests/test_correlation.py
@@ -32,6 +32,14 @@ def test_correlation():
         ["female", 11, 80],
         ["male", 12, 61],
         ["female", 10, 83],
+        ["male", 10, 60],
+        ["female", 10, 80],
+        ["male", 10, 60],
+        ["female", 10, 80],
+        ["male", 9, 59],
+        ["female", 11, 80],
+        ["male", 12, 61],
+        ["female", 10, 83],
     ]
     df = pd.DataFrame(data, columns=col_names)
     res = {"score": [pair_gender]}
@@ -75,11 +83,18 @@ def test_common_correlation():
         ["carribean", 40, 10, 2000, "single", 10, 90, 220],
         ["indo-european", 42, 10, 2500, "widowed", 10, 120, 200],
         ["arabian", 19, 10, 2200, "married", 10, 60, 115],
+        ["arabian", 21, 10, 2000, "married", 10, 60, 120],
+        ["carribean", 20, 10, 3000, "single", 10, 90, 130],
+        ["indo-european", 41, 10, 1900, "widowed", 10, 120, 210],
+        ["carribean", 40, 10, 2000, "single", 10, 90, 220],
+        ["indo-european", 42, 10, 2500, "widowed", 10, 120, 200],
+        ["arabian", 19, 10, 2200, "married", 10, 60, 115],
     ]
     df = pd.DataFrame(data, columns=col_names)
     res = {
         "corr1": [pair_race, pair_age, pair_marital],
-        "corr2": [pair_age],
+        "corr2": [pair_race, pair_age, pair_marital],
+        "entries": [pair_age],
     }
 
     assert find_sensitive_correlations(df) == res
@@ -108,14 +123,20 @@ def test_series_correlation():
         ["carribean", 40, 10, 2000, "single", 10],
         ["indo-european", 42, 10, 2500, "widowed", 10],
        ["arabian", 19, 10, 2200, "married", 10],
+        ["arabian", 21, 10, 2000, "married", 10],
+        ["carribean", 20, 10, 3000, "single", 10],
+        ["indo-european", 41, 10, 1900, "widowed", 10],
+        ["carribean", 40, 10, 2000, "single", 10],
+        ["indo-european", 42, 10, 2500, "widowed", 10],
+        ["arabian", 19, 10, 2200, "married", 10],
     ]
     df = pd.DataFrame(data, columns=col_names)
-    s1 = pd.Series([60, 90, 120, 90, 120, 60])
-    s2 = pd.Series([120, 130, 210, 220, 200, 115])
-    res1 = [pair_race, pair_marital]
-    res2 = [pair_age]
-    assert set(find_column_correlation(s1, df, corr_cutoff=0.9)) == set(res1)
-    assert set(find_column_correlation(s2, df, corr_cutoff=0.9)) == set(res2)
+    s1 = pd.Series([60, 90, 120, 90, 120, 60, 60, 90, 120, 90, 120, 60])
+    s2 = pd.Series([120, 130, 210, 220, 200, 115, 120, 130, 210, 220, 200, 115])
+    res1 = [pair_age, pair_race, pair_marital]
+    res2 = [pair_age, pair_race, pair_marital]
+    assert set(find_column_correlation(s1, df)) == set(res1)
+    assert set(find_column_correlation(s2, df)) == set(res2)
 
 
 def test_basic_nn_distance_corr():
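
The core speed-up in this series (patches 01 and 07) replaces the multiprocessing starmap with a single pass through pandas' own pairwise machinery: DataFrame.corr accepts a callable as its method, and an appended row holding each column's positional index lets that callable recover which columns (and hence which inferred types) it was handed, since pandas masks NaNs pairwise and the index row is never NaN. Below is a minimal, self-contained sketch of the trick for reference; the is_continuous heuristic is a hypothetical stand-in for fairlens's utils.infer_distr_type, and the non-numerical branches are stubbed out::

    import numpy as np
    import pandas as pd


    def correlation_matrix_sketch(df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()

        # Hypothetical type inference: treat low-cardinality columns as categorical.
        is_continuous = [df[col].nunique() > 10 for col in df.columns]

        # Encode object columns numerically so .corr() keeps them.
        for col in df.columns:
            if df[col].dtype.kind == "O":
                df[col] = pd.factorize(df[col])[0]

        # Append one row holding each column's positional index; it survives
        # pandas' pairwise NaN masking as the last element of both arrays.
        index_row = pd.DataFrame({col: [i] for i, col in enumerate(df.columns)})
        df = pd.concat([df, index_row], ignore_index=True)

        def corr(a: np.ndarray, b: np.ndarray) -> float:
            i, j = int(a[-1]), int(b[-1])
            sr_a, sr_b = pd.Series(a[:-1]), pd.Series(b[:-1])
            if is_continuous[i] and is_continuous[j]:
                return abs(sr_a.corr(sr_b))
            return 0.0  # the real helper dispatches to Kruskal-Wallis / Cramer's V here

        return df.corr(method=corr)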
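
A quick sanity check of the corrected Cramer's V guards added in patch 07: identical series short-circuit to 1, and a constant column yields 0 because its contingency table has fewer than two rows or columns. This assumes the final fairlens.metrics.cramers_v from this series is importable::

    import pandas as pd

    from fairlens.metrics import cramers_v

    sr = pd.Series(["a", "b", "a", "b", "a", "b"] * 10)
    print(cramers_v(sr, sr))                          # 1 via the equality fast path
    print(cramers_v(sr, pd.Series(["x"] * len(sr))))  # 0, since k < 2 in the crosstab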
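
For reference, a usage sketch of the plotting entry point after the rename in patch 02 (plot/heatmap.py to plot/correlation.py) and the explicit cmap/annotate signature from patch 09; the dataset path follows the repo's docs and tests::

    import matplotlib.pyplot as plt
    import pandas as pd

    import fairlens as fl

    df = pd.read_csv("datasets/german_credit_data.csv")

    # annotate=True overlays the coefficients; the Axes is returned per patch 09.
    ax = fl.plot.heatmap(df, annotate=True)
    plt.show()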
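
Finally, a usage sketch of the proxy-detection path exercised by the extended tests: after patch 10, _compute_series_correlation dispatches on the inferred distribution type (Pearson for numerical-numerical, Kruskal-Wallis for categorical-numerical, Cramer's V for categorical-categorical). The input rows and the expected result below are taken from test_correlation::

    import pandas as pd

    from fairlens.sensitive.correlation import find_sensitive_correlations

    columns = ["gender", "random", "score"]
    data = [
        ["male", 10, 60], ["female", 10, 80], ["male", 10, 60], ["female", 10, 80],
        ["male", 9, 59], ["female", 11, 80], ["male", 12, 61], ["female", 10, 83],
    ] * 2  # the test duplicates these eight rows

    df = pd.DataFrame(data, columns=columns)

    # Expected per the tests: {"score": [("gender", "Gender")]}
    print(find_sensitive_correlations(df))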