Add bias detection to preprocessing #690

Merged
merged 35 commits into main from feature/bias_detection on May 4, 2024
Changes from 6 commits
Commits
35 commits
744216d
Added correlation calculation
Lilly-May Apr 11, 2024
a1e6b2a
Standard. Mean Differences
Lilly-May Apr 11, 2024
68b1104
Added feature importances
Lilly-May Apr 12, 2024
0536586
Doc string improvements
Lilly-May Apr 13, 2024
c41ad45
Added correlations parameter
Lilly-May Apr 13, 2024
7233f96
Merge branch 'main' into feature/bias_detection
Zethson Apr 14, 2024
97b004b
PR Revisions
Lilly-May Apr 15, 2024
778c0c3
Added categorical value count calculation
Lilly-May Apr 15, 2024
7ad07ec
Added first test
Lilly-May Apr 16, 2024
7d483a3
docs clarifications
Lilly-May Apr 16, 2024
138860b
Test improvements
Lilly-May Apr 16, 2024
22f45ef
Merge branch 'main' into feature/bias_detection
Lilly-May Apr 25, 2024
c0bdcb1
Incorporate feature type detection
Lilly-May Apr 25, 2024
031808d
Finished tests
Lilly-May Apr 25, 2024
a863306
SMD improvements
Lilly-May Apr 25, 2024
cd44284
Merge branch 'main' into feature/bias_detection
Zethson Apr 25, 2024
efe6885
Merge branch 'main' into feature/bias_detection
Zethson Apr 25, 2024
eea9772
Test fixes
Lilly-May May 1, 2024
2895b41
Merge remote-tracking branch 'origin/feature/bias_detection' into fea…
Lilly-May May 1, 2024
ed9d8be
Merge branch 'main' into feature/bias_detection
Lilly-May May 1, 2024
5347c35
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 1, 2024
f1f4b4d
Save SMD in uns subdict
Lilly-May May 1, 2024
6688bf6
Fix tests and silence test warnings
Lilly-May May 1, 2024
381b8b1
Introduced copy parameter
Lilly-May May 1, 2024
3ff2c65
Added encoding check
Lilly-May May 1, 2024
bcfe3a4
Fixed sensitive_features dtype
Lilly-May May 1, 2024
b11f5ea
Feature importances return docstring
Lilly-May May 1, 2024
e1aaaae
Improved docs explanations
Lilly-May May 2, 2024
c1d3916
Sort feature importances results
Lilly-May May 2, 2024
f2d11f8
Apply suggestions from code review
Lilly-May May 2, 2024
2e8d630
Review comments
Lilly-May May 2, 2024
9d8b74e
doc formating
eroell May 3, 2024
daef606
Apply suggestions from code review
Lilly-May May 4, 2024
c22ee85
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 4, 2024
5ec7f8a
Fixed error raising
Lilly-May May 4, 2024
1 change: 1 addition & 0 deletions ehrapy/preprocessing/__init__.py
@@ -1,3 +1,4 @@
+from ehrapy.preprocessing._bias import bias_detection
 from ehrapy.preprocessing._encoding import encode, undo_encoding
 from ehrapy.preprocessing._highly_variable_features import highly_variable_features
 from ehrapy.preprocessing._imputation import (
119 changes: 119 additions & 0 deletions ehrapy/preprocessing/_bias.py
@@ -0,0 +1,119 @@
from collections.abc import Iterable
from typing import Literal

import numpy as np
import pandas as pd
from anndata import AnnData

from ehrapy import logging as logg
from ehrapy.anndata import anndata_to_df


def bias_detection(
    adata: AnnData,
    sensitive_features: Iterable[str] | Literal["all"],
    corr_threshold: float = 0.5,
    smd_threshold: float = 0.5,
    feature_importance_threshold: float = 0.1,
    prediction_confidence_threshold: float = 0.5,
    corr_method: Literal["pearson", "spearman"] = "spearman",
):
    """Detects bias in the data by computing feature correlations, standardized mean differences, and feature importances.

    Args:
        adata: An annotated data matrix containing patient data.
        sensitive_features: A list of sensitive features to check for bias, or "all" to check all features.
        corr_threshold: The threshold for the correlation coefficient between two features to be considered of interest. Defaults to 0.5.
        smd_threshold: The threshold for the standardized mean difference between two features to be considered of interest. Defaults to 0.5.
        feature_importance_threshold: The threshold for the feature importance of a sensitive feature for predicting another feature to be considered
            of interest. Defaults to 0.1.
        prediction_confidence_threshold: The threshold for the prediction confidence (R2 or accuracy) of a sensitive feature for predicting another
            feature to be considered of interest. Defaults to 0.5.
        corr_method: The correlation method to use. Choose between "pearson" and "spearman". Defaults to "spearman".
    """
    from ehrapy.tools import rank_features_supervised

    if sensitive_features == "all":
        sensitive_features = adata.var_names

    correlations = _feature_correlations(adata, method=corr_method)
    adata.varp["correlation"] = correlations

    for feature in sensitive_features:
        for comp_feature in adata.var_names:
            # Skip self-comparison; every feature is perfectly correlated with itself.
            if feature != comp_feature and correlations.loc[feature, comp_feature] > corr_threshold:
                logg.warning(
                    f"Feature {feature} is highly correlated with {comp_feature} (correlation coefficient ≈{correlations.loc[feature, comp_feature]:.3f})."
                )  # TODO: How do we print results?

    smd_dict = _standardized_mean_differences(adata, sensitive_features)
    for feature in sensitive_features:
        abs_smd = smd_dict[feature].abs()
        for comp_feature in adata.var_names:
            # Skip self-comparison; the grouping feature is constant within each group.
            if feature != comp_feature and abs_smd[comp_feature].max() > smd_threshold:
                logg.warning(
                    f"Feature {comp_feature} has a high standardized mean difference with {feature}."
                )  # TODO: Do we look at / print groups individually?

    for prediction_feature in adata.var_names:
        prediction_score = rank_features_supervised(
            adata,
            prediction_feature,
            input_features="all",
            model="rf",
            key_added=f"{prediction_feature}_feature_importances",
            percent_output=True,
            logging=False,
            return_score=True,
        )
        for feature in sensitive_features:
            feature_importance = adata.var[f"{prediction_feature}_feature_importances"][feature] / 100
            if feature_importance > feature_importance_threshold and prediction_score > prediction_confidence_threshold:
                logg.warning(
                    f"Feature {feature} has a high feature importance for predicting {prediction_feature} (importance: {feature_importance:.1%}, prediction score: {prediction_score:.3f})."
                )


def _feature_correlations(adata: AnnData, method: Literal["pearson", "spearman"] = "spearman"):
    """Computes pairwise correlations between features in the AnnData object.

    Args:
        adata: An annotated data matrix containing patient data.
        method: The correlation method to use. Choose between "pearson" and "spearman". Defaults to "spearman".

    Returns:
        A pandas DataFrame containing the correlation matrix.
    """
    corr_matrix = anndata_to_df(adata).corr(method=method)
    return corr_matrix


def _standardized_mean_differences(adata: AnnData, features: Iterable[str]) -> dict:
    """Computes the standardized mean differences between sensitive features.

    Args:
        adata: An annotated data matrix containing patient data.
        features: A list of features to compute the standardized mean differences (SMD) for. For each listed feature, the SMD is computed for each
            feature, comparing one group to the rest. Thus, we obtain an n_groups_in_feature x n_features matrix of SMDs for each listed feature.

    Returns:
        A dictionary mapping each feature to a pandas DataFrame containing the standardized mean differences.
    """
    df = anndata_to_df(adata)
    smd_results = {}  # type: ignore

    for group_feature in features:  # TODO: Restrict to categorical features (wait for other PR)
        smd_results[group_feature] = {}
        for group in df[group_feature].unique():
            group_mean = df[df[group_feature] == group].mean()
            group_std = df[df[group_feature] == group].std()

            comparison_mean = df[df[group_feature] != group].mean()
            comparison_std = df[df[group_feature] != group].std()

            smd = (group_mean - comparison_mean) / np.sqrt((group_std**2 + comparison_std**2) / 2)
            smd_results[group_feature][group] = smd

        smd_results[group_feature] = pd.DataFrame(smd_results[group_feature]).T[adata.var_names]

    return smd_results
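
For orientation, a minimal usage sketch of the new function, assuming it is exposed as `ep.pp.bias_detection` via the `__init__.py` change above; the dataset loader and column names are illustrative assumptions, not part of this PR:

```python
import ehrapy as ep

# Illustrative dataset and column names (assumptions, not from this PR).
adata = ep.dt.mimic_2(encoded=True)

# Logs a warning for every feature pair whose correlation, standardized
# mean difference, or feature importance exceeds the respective threshold.
ep.pp.bias_detection(
    adata,
    sensitive_features=["age", "gender_num"],
    corr_threshold=0.5,
    smd_threshold=0.5,
)

# The pairwise correlation matrix is stored by the function itself:
print(adata.varp["correlation"])
```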
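The SMD in `_standardized_mean_differences` is the usual two-sample form, SMD = (mean_group - mean_rest) / sqrt((std_group² + std_rest²) / 2). A standalone check with plain pandas on synthetic toy data (assumed for illustration):

```python
import numpy as np
import pandas as pd

# Synthetic toy data: is `bmi` shifted between the two `sex` groups?
df = pd.DataFrame({
    "sex": [0, 0, 0, 1, 1, 1],
    "bmi": [22.0, 24.0, 23.0, 28.0, 30.0, 29.0],
})

group = df.loc[df["sex"] == 0, "bmi"]
rest = df.loc[df["sex"] != 0, "bmi"]

# Same formula as in _standardized_mean_differences above.
smd = (group.mean() - rest.mean()) / np.sqrt((group.std() ** 2 + rest.std() ** 2) / 2)
print(f"SMD = {smd:.2f}")  # -6.00 here; |SMD| > 0.5 would be flagged
```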
2 changes: 1 addition & 1 deletion ehrapy/preprocessing/_imputation.py
@@ -202,7 +202,7 @@ def knn_impute(
     imputation ran successfully.

     Args:
-        adata: An annotated data matrix containing gene expression values.
+        adata: An annotated data matrix containing patient data.
         var_names: A list of variable names indicating which columns to impute.
             If `None`, all columns are imputed. Default is `None`.
         n_neighbours: Number of neighbors to use when performing the imputation. Defaults to 5.
24 changes: 16 additions & 8 deletions ehrapy/tools/feature_ranking/_feature_importances.py
@@ -18,15 +18,17 @@ def rank_features_supervised(
     adata: AnnData,
     predicted_feature: str,
     prediction_type: Literal["continuous", "categorical", "auto"] = "auto",
-    model: Literal["regression", "svm", "rf"] = "regression",
+    model: Literal["regression", "svm", "rf"] = "rf",
     input_features: Iterable[str] | Literal["all"] = "all",
     layer: str | None = None,
     test_split_size: float = 0.2,
     key_added: str = "feature_importances",
     feature_scaling: Literal["standard", "minmax"] | None = "standard",
     percent_output: bool = False,
+    logging: bool = True,
+    return_score: bool = False,
     **kwargs,
-):
+) -> float | None:
     """Calculate feature importances for predicting a specified feature in adata.var.

     Args:
@@ -49,6 +51,8 @@
             for each feature individually. Defaults to 'standard'.
         percent_output: Set to True to output the feature importances as percentages. Note that information about positive or negative
             coefficients for regression models will be lost. Defaults to False.
+        logging: Set to False to disable logging. Defaults to True.
+        return_score: Set to True to return the R2 score / the accuracy of the model. Defaults to False.
         **kwargs: Additional keyword arguments to pass to the model. See the documentation of the respective model in scikit-learn for details.

     Examples:
@@ -92,9 +96,10 @@
             prediction_type = "categorical"
         else:
             prediction_type = "continuous"
-        logg.info(
-            f"Predicted feature {predicted_feature} was detected as {prediction_type}. If this is incorrect, please specify in the prediction_type argument."
-        )
+        if logging:
+            logg.info(
+                f"Predicted feature {predicted_feature} was detected as {prediction_type}. If this is incorrect, please specify in the prediction_type argument."
+            )

     elif prediction_type == "continuous":
         if pd.api.types.is_categorical_dtype(data[predicted_feature].dtype):
@@ -167,9 +172,10 @@

     score = predictor.score(x_test, y_test)
     evaluation_metric = "R2 score" if prediction_type == "continuous" else "accuracy"
-    logg.info(
-        f"Training completed. The model achieved an {evaluation_metric} of {score:.2f} on the test set, consisting of {len(y_test)} samples."
-    )
+    if logging:
+        logg.info(
+            f"Training completed. The model achieved an {evaluation_metric} of {score:.2f} on the test set, consisting of {len(y_test)} samples."
+        )

     if model == "regression" or model == "svm":
         feature_importances = pd.Series(predictor.coef_.squeeze(), index=input_data.columns)
@@ -182,3 +188,5 @@
     # Reorder feature importances to match adata.var order and save importances in adata.var
     feature_importances = feature_importances.reindex(adata.var_names)
     adata.var[key_added] = feature_importances
+
+    return score if return_score else None
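
A sketch of how the new `logging` and `return_score` parameters are consumed by `bias_detection` above, mirroring its internal call; the `adata` object and the predicted feature name are assumptions for illustration:

```python
from ehrapy.tools import rank_features_supervised

# logging=False suppresses the info messages, which matters when
# bias_detection fits one model per feature; return_score=True returns
# the model's R2 score (continuous) or accuracy (categorical) on the
# held-out test split.
score = rank_features_supervised(
    adata,                      # an AnnData object, assumed to exist
    "bmi",                      # assumed feature name
    input_features="all",
    model="rf",
    key_added="bmi_feature_importances",
    percent_output=True,
    logging=False,
    return_score=True,
)
print(f"Prediction score: {score:.2f}")
```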