From 0c52f13c24e1f535dc3c4fca23b975fdd1e78e1f Mon Sep 17 00:00:00 2001
From: eljas <eljas.roellin@ikmail.com>
Date: Wed, 6 Dec 2023 10:11:27 +0100
Subject: [PATCH 01/17] tests for rank features groups with obs

---
 .../test_data_features_ranking/dataset1.csv   | 13 ++++
 tests/tools/test_features_ranking.py          | 63 +++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 tests/tools/test_data_features_ranking/dataset1.csv

diff --git a/tests/tools/test_data_features_ranking/dataset1.csv b/tests/tools/test_data_features_ranking/dataset1.csv
new file mode 100644
index 00000000..25f83ab2
--- /dev/null
+++ b/tests/tools/test_data_features_ranking/dataset1.csv
@@ -0,0 +1,13 @@
+idx,sys_bp_entry,dia_bp_entry,glucose,weight,disease,station
+1,138,78,80,77,A,ICU
+2,139,79,90,76,A,ICU
+3,140,80,120,60,A,MICU
+4,141,81,130,90,A,MICU
+5,148,77,80,110,B,ICU
+6,149,78,130,78,B,ICU
+7,150,79,120,56,B,MICU
+8,151,80,90,76,B,MICU
+9,158,55,80,67,C,ICU
+10,159,56,90,82,C,ICU
+11,160,57,120,59,C,MICU
+12,161,58,130,81,C,MICU
diff --git a/tests/tools/test_features_ranking.py b/tests/tools/test_features_ranking.py
index fefb6b02..e7ed708e 100644
--- a/tests/tools/test_features_ranking.py
+++ b/tests/tools/test_features_ranking.py
@@ -1,9 +1,15 @@
+from pathlib import Path
+
 import numpy as np
 import pandas as pd
 import pytest
 
 import ehrapy as ep
 import ehrapy.tools.feature_ranking._rank_features_groups as _utils
+from ehrapy.io._read import read_csv
+
+CURRENT_DIR = Path(__file__).parent
+_TEST_PATH = f"{CURRENT_DIR}/test_data_features_ranking"
 
 
 class TestHelperFunctions:
@@ -270,3 +276,60 @@ def test_only_cat_features(self):
         assert "scores" in adata.uns["rank_features_groups"]
         assert "logfoldchanges" in adata.uns["rank_features_groups"]
         assert "pvals_adj" in adata.uns["rank_features_groups"]
+
+    def test_rank_obs(
+        self,
+    ):
+        # prepare data with some interesting features in .obs
+        adata_features_in_obs = read_csv(
+            dataset_path=f"{_TEST_PATH}/dataset1.csv",
+            columns_obs_only=["disease", "station", "sys_bp_entry", "dia_bp_entry"],
+        )
+
+        # prepare data with these features in .X
+        adata_features_in_x = read_csv(
+            dataset_path=f"{_TEST_PATH}/dataset1.csv", columns_x_only=["station", "sys_bp_entry", "dia_bp_entry"]
+        )
+        adata_features_in_x = ep.pp.encode(adata_features_in_x, encodings={"label_encoding": ["station"]})
+
+        # rank_features_groups on .obs
+        ep.tl.rank_features_groups(adata_features_in_obs, groupby="disease", rank_obs_columns="all")
+
+        # rank features groups on .X
+        ep.tl.rank_features_groups(adata_features_in_x, groupby="disease")
+
+        # check standard rank_features_groups entries
+        assert "names" in adata_features_in_obs.uns["rank_features_groups"]
+        assert "pvals" in adata_features_in_obs.uns["rank_features_groups"]
+        assert "scores" in adata_features_in_obs.uns["rank_features_groups"]
+        assert "pvals_adj" in adata_features_in_obs.uns["rank_features_groups"]
+        assert "log2foldchanges" not in adata_features_in_obs.uns["rank_features_groups"]
+        assert "pts" not in adata_features_in_obs.uns["rank_features_groups"]
+        assert (
+            len(adata_features_in_obs.uns["rank_features_groups"]["names"]) == 3
+        )  # It only captures the length of each group
+        assert len(adata_features_in_obs.uns["rank_features_groups"]["pvals"]) == 3
+        assert len(adata_features_in_obs.uns["rank_features_groups"]["scores"]) == 3
+
+        # check the obs are used indeed
+        assert "sys_bp_entry" in adata_features_in_obs.uns["rank_features_groups"]["names"][0]
+        assert "sys_bp_entry" in adata_features_in_obs.uns["rank_features_groups"]["names"][1]
+        assert "ehrapycat_station" in adata_features_in_obs.uns["rank_features_groups"]["names"][2]
+
+        # check the X are not used
+        assert "glucose" not in adata_features_in_obs.uns["rank_features_groups"]["names"][0]
+
+        # check the results are the same
+        for record in adata_features_in_obs.uns["rank_features_groups"]["names"].dtype.names:
+            assert np.allclose(
+                adata_features_in_obs.uns["rank_features_groups"]["scores"][record],
+                adata_features_in_x.uns["rank_features_groups"]["scores"][record],
+            )
+            assert np.allclose(
+                np.array(adata_features_in_obs.uns["rank_features_groups"]["pvals"][record]),
+                np.array(adata_features_in_x.uns["rank_features_groups"]["pvals"][record]),
+            )
+            assert np.array_equal(
+                np.array(adata_features_in_obs.uns["rank_features_groups"]["names"][record]),
+                np.array(adata_features_in_x.uns["rank_features_groups"]["names"][record]),
+            )

From dd022eca545650ab8634850318ecac1e8c540552 Mon Sep 17 00:00:00 2001
From: eljas <eljas.roellin@ikmail.com>
Date: Wed, 6 Dec 2023 10:52:34 +0100
Subject: [PATCH 02/17] first drafted feature ranking using obs

---
 .../feature_ranking/_rank_features_groups.py  | 67 +++++++++++++++++--
 1 file changed, 63 insertions(+), 4 deletions(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index 6bf27057..932eedda 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -6,6 +6,8 @@
 import scanpy as sc
 from anndata import AnnData
 
+from ehrapy.anndata import move_to_x
+from ehrapy.preprocessing import encode
 from ehrapy.tools import _method_options
 
 
@@ -255,11 +257,12 @@ def rank_features_groups(
     correction_method: _method_options._correction_method = "benjamini-hochberg",
     tie_correct: bool = False,
     layer: Optional[str] = None,
+    rank_obs_columns: Optional[Union[list[str], str]] = None,
     **kwds,
 ) -> None:  # pragma: no cover
     """Rank features for characterizing groups.
 
-    Expects logarithmized data.
+    Expects logarithmized data. # TODO: should this line be removed? log-transform may be a valid transformation for probably many types of data, but not a necessity for this method right?
 
     Args:
         adata: Annotated data matrix.
@@ -288,6 +291,8 @@ def rank_features_groups(
                             Used only for statistical tests (e.g. doesn't work for "logreg" `num_cols_method`)
         tie_correct: Use tie correction for `'wilcoxon'` scores. Used only for `'wilcoxon'`.
         layer: Key from `adata.layers` whose value will be used to perform tests on.
+        rank_obs_columns: Whether to rank `adata.obs` columns instead of features in `adata.layer`. If `True`, all observation columns are ranked. If list of column names, only those are ranked.
+                    layer needs to be None if this is used.
         **kwds: Are passed to test methods. Currently this affects only parameters that
                 are passed to :class:`sklearn.linear_model.LogisticRegression`.
                 For instance, you can pass `penalty='l1'` to try to come up with a
@@ -314,19 +319,67 @@ def rank_features_groups(
                     Only if `reference` is set to `'rest'`.
                     Fraction of observations from the union of the rest of each group containing the features.
 
-     Examples:
+     Examples: # TODO: there is an inf. value here. I think due to a bug considering the one-hot encoded feature of service_unit.
          >>> import ehrapy as ep
          >>> adata = ep.dt.mimic_2(encoded=True)
          >>> ep.tl.rank_features_groups(adata, "service_unit")
          >>> ep.pl.rank_features_groups(adata)
     """
+    # if rank_obs_columns is indicated, layer must be None
+    if layer is not None and rank_obs_columns is not None:
+        raise ValueError("Only one of layer and rank_obs_columns can be specified.")
+
     adata = adata.copy() if copy else adata
 
+    # TODO: check if need to remove the "actual" column according which to group
+    if rank_obs_columns is not None:
+        # keep reference to original adata, needed if copy=False
+        adata_orig = adata
+        # copy adata to work on
+        adata = adata.copy()
+
+        if isinstance(rank_obs_columns, str):
+            if rank_obs_columns == "all":
+                rank_obs_columns = adata.obs.keys().to_list()
+            else:
+                raise ValueError(
+                    f"rank_obs_columns should be 'all' or Iterable of column names, not {rank_obs_columns}."
+                )
+
+        # consider adata where all columns from obs become the features, and the other features are dropped
+        if not all(elem in adata.obs.columns.values for elem in rank_obs_columns):
+            raise ValueError(
+                f"Columns `{[col for col in rank_obs_columns if col not in adata.obs.columns.values]}` are not in obs."
+            )
+
+        # if groupby in rank_obs_columns:
+        #     rank_obs_columns.remove(groupby)
+
+        # move obs columns to X
+        # TODO: check if we want to take care of strange stuff (e.g. dates/times) or allow user to only specify columns that make sense
+        adata_with_moved_columns = move_to_x(adata, rank_obs_columns)
+
+        # remove columns previously in X
+        columns_to_select = adata_with_moved_columns.var_names.difference(adata.var_names)
+        adata_with_moved_columns = adata_with_moved_columns[:, columns_to_select]
+
+        # encode categoricals
+        adata_with_moved_columns = encode(adata_with_moved_columns, autodetect=True, encodings="label_encoding")
+
+        # assign numeric and categorical columns
+        adata_with_moved_columns.uns[
+            "non_numerical_columns"
+        ] = []  # this should be empty, as have only numeric and encoded
+        adata_with_moved_columns.uns["numerical_columns"] = adata_with_moved_columns.var_names.difference(
+            adata_with_moved_columns.uns["encoded_non_numerical_columns"]
+        ).to_list()  # this is sensitive to `encode` really detecting what it should
+        adata = adata_with_moved_columns
+
     if not adata.obs[groupby].dtype == "category":
         adata.obs[groupby] = pd.Categorical(adata.obs[groupby])
 
     adata.uns[key_added] = {}
-    adata.uns[key_added]["params"] = {
+    adata.uns[key_added]["params"] = {  # TODO: add additional parameters
         "groupby": groupby,
         "reference": reference,
         "method": num_cols_method,
@@ -404,14 +457,20 @@ def rank_features_groups(
         )
 
     # Adjust p values
-    if "pvals" in adata.uns[key_added]:
+    if "pvals" in adata.uns[key_added]:  # todo: in what scenarios can they not be there?
         adata.uns[key_added]["pvals_adj"] = _adjust_pvalues(
             adata.uns[key_added]["pvals"], corr_method=correction_method
         )
 
+    # For some reason, pts should be a DataFrame
     if "pts" in adata.uns[key_added]:
         adata.uns[key_added]["pts"] = pd.DataFrame(adata.uns[key_added]["pts"])
 
     _sort_features(adata, key_added)
 
+    # TODO: return adata_orig with rank features results
+    if rank_obs_columns is not None:
+        adata_orig.uns[key_added] = adata.uns[key_added]
+        adata = adata_orig
+
     return adata if copy else None

From a8187287321033443738fad91afaac04b4c800fc Mon Sep 17 00:00:00 2001
From: eljas <eljas.roellin@ikmail.com>
Date: Wed, 6 Dec 2023 11:39:12 +0100
Subject: [PATCH 03/17] fixed encoding names

---
 ehrapy/tools/feature_ranking/_rank_features_groups.py | 11 ++++-------
 tests/tools/test_features_ranking.py                  |  2 +-
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index 932eedda..02e3ea33 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -262,7 +262,7 @@ def rank_features_groups(
 ) -> None:  # pragma: no cover
     """Rank features for characterizing groups.
 
-    Expects logarithmized data. # TODO: should this line be removed? log-transform may be a valid transformation for probably many types of data, but not a necessity for this method right?
+    Expects logarithmized data.
 
     Args:
         adata: Annotated data matrix.
@@ -319,7 +319,7 @@ def rank_features_groups(
                     Only if `reference` is set to `'rest'`.
                     Fraction of observations from the union of the rest of each group containing the features.
 
-     Examples: # TODO: there is an inf. value here. I think due to a bug considering the one-hot encoded feature of service_unit.
+     Examples:
          >>> import ehrapy as ep
          >>> adata = ep.dt.mimic_2(encoded=True)
          >>> ep.tl.rank_features_groups(adata, "service_unit")
@@ -331,7 +331,6 @@ def rank_features_groups(
 
     adata = adata.copy() if copy else adata
 
-    # TODO: check if need to remove the "actual" column according which to group
     if rank_obs_columns is not None:
         # keep reference to original adata, needed if copy=False
         adata_orig = adata
@@ -356,7 +355,6 @@ def rank_features_groups(
         #     rank_obs_columns.remove(groupby)
 
         # move obs columns to X
-        # TODO: check if we want to take care of strange stuff (e.g. dates/times) or allow user to only specify columns that make sense
         adata_with_moved_columns = move_to_x(adata, rank_obs_columns)
 
         # remove columns previously in X
@@ -364,7 +362,7 @@ def rank_features_groups(
         adata_with_moved_columns = adata_with_moved_columns[:, columns_to_select]
 
         # encode categoricals
-        adata_with_moved_columns = encode(adata_with_moved_columns, autodetect=True, encodings="label_encoding")
+        adata_with_moved_columns = encode(adata_with_moved_columns, autodetect=True, encodings="label")
 
         # assign numeric and categorical columns
         adata_with_moved_columns.uns[
@@ -379,7 +377,7 @@ def rank_features_groups(
         adata.obs[groupby] = pd.Categorical(adata.obs[groupby])
 
     adata.uns[key_added] = {}
-    adata.uns[key_added]["params"] = {  # TODO: add additional parameters
+    adata.uns[key_added]["params"] = {
         "groupby": groupby,
         "reference": reference,
         "method": num_cols_method,
@@ -468,7 +466,6 @@ def rank_features_groups(
 
     _sort_features(adata, key_added)
 
-    # TODO: return adata_orig with rank features results
     if rank_obs_columns is not None:
         adata_orig.uns[key_added] = adata.uns[key_added]
         adata = adata_orig
diff --git a/tests/tools/test_features_ranking.py b/tests/tools/test_features_ranking.py
index e7ed708e..79ef2b6b 100644
--- a/tests/tools/test_features_ranking.py
+++ b/tests/tools/test_features_ranking.py
@@ -290,7 +290,7 @@ def test_rank_obs(
         adata_features_in_x = read_csv(
             dataset_path=f"{_TEST_PATH}/dataset1.csv", columns_x_only=["station", "sys_bp_entry", "dia_bp_entry"]
         )
-        adata_features_in_x = ep.pp.encode(adata_features_in_x, encodings={"label_encoding": ["station"]})
+        adata_features_in_x = ep.pp.encode(adata_features_in_x, encodings={"label": ["station"]})
 
         # rank_features_groups on .obs
         ep.tl.rank_features_groups(adata_features_in_obs, groupby="disease", rank_obs_columns="all")

From adaa53b0acbe490f3223070f0738c76c1f3a62c8 Mon Sep 17 00:00:00 2001
From: eljas <eljas.roellin@ikmail.com>
Date: Wed, 6 Dec 2023 11:42:56 +0100
Subject: [PATCH 04/17] remove comment

---
 ehrapy/tools/feature_ranking/_rank_features_groups.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index 02e3ea33..31ab66b2 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -455,7 +455,7 @@ def rank_features_groups(
         )
 
     # Adjust p values
-    if "pvals" in adata.uns[key_added]:  # todo: in what scenarios can they not be there?
+    if "pvals" in adata.uns[key_added]:
         adata.uns[key_added]["pvals_adj"] = _adjust_pvalues(
             adata.uns[key_added]["pvals"], corr_method=correction_method
         )

From 37c6a9ae8cd5699c2ec22f22ce2ea860654e503d Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Wed, 6 Dec 2023 12:46:17 +0100
Subject: [PATCH 05/17] Remove comment

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
---
 ehrapy/tools/feature_ranking/_rank_features_groups.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index 31ab66b2..9be05f09 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -325,7 +325,6 @@ def rank_features_groups(
          >>> ep.tl.rank_features_groups(adata, "service_unit")
          >>> ep.pl.rank_features_groups(adata)
     """
-    # if rank_obs_columns is indicated, layer must be None
     if layer is not None and rank_obs_columns is not None:
         raise ValueError("Only one of layer and rank_obs_columns can be specified.")
 

From 06c2a009ea216cb200e11b6d71f151b7ab7c6933 Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Wed, 6 Dec 2023 12:47:27 +0100
Subject: [PATCH 06/17] Remove comment

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
---
 ehrapy/tools/feature_ranking/_rank_features_groups.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index 9be05f09..8a7894fb 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -353,7 +353,6 @@ def rank_features_groups(
         # if groupby in rank_obs_columns:
         #     rank_obs_columns.remove(groupby)
 
-        # move obs columns to X
         adata_with_moved_columns = move_to_x(adata, rank_obs_columns)
 
         # remove columns previously in X

From 6fa3de16a646c1d132ddf271e4fb0b87fc2bf322 Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Wed, 6 Dec 2023 12:47:47 +0100
Subject: [PATCH 07/17] Remove comment

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
---
 ehrapy/tools/feature_ranking/_rank_features_groups.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index 8a7894fb..cfaf9b8b 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -333,7 +333,6 @@ def rank_features_groups(
     if rank_obs_columns is not None:
         # keep reference to original adata, needed if copy=False
         adata_orig = adata
-        # copy adata to work on
         adata = adata.copy()
 
         if isinstance(rank_obs_columns, str):

From 458520ec1a232d390170072795ff2ee99959f600 Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Wed, 6 Dec 2023 12:49:28 +0100
Subject: [PATCH 08/17] Update
 ehrapy/tools/feature_ranking/_rank_features_groups.py

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
---
 ehrapy/tools/feature_ranking/_rank_features_groups.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index cfaf9b8b..21d119a2 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -326,7 +326,7 @@ def rank_features_groups(
          >>> ep.pl.rank_features_groups(adata)
     """
     if layer is not None and rank_obs_columns is not None:
-        raise ValueError("Only one of layer and rank_obs_columns can be specified.")
+        raise ValueError("Only one of 'layer' and 'rank_obs_columns' can be specified.")
 
     adata = adata.copy() if copy else adata
 

From 210bac6f4b001cdae8de220fb0c46f4fc32c34c4 Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Wed, 6 Dec 2023 12:54:28 +0100
Subject: [PATCH 09/17] Remove comment

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
---
 ehrapy/tools/feature_ranking/_rank_features_groups.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index 21d119a2..d7e36e3e 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -354,7 +354,6 @@ def rank_features_groups(
 
         adata_with_moved_columns = move_to_x(adata, rank_obs_columns)
 
-        # remove columns previously in X
         columns_to_select = adata_with_moved_columns.var_names.difference(adata.var_names)
         adata_with_moved_columns = adata_with_moved_columns[:, columns_to_select]
 

From ffadf71d7c53e36fa2b0e912e7936cccf5627cc8 Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Wed, 6 Dec 2023 12:54:43 +0100
Subject: [PATCH 10/17] Remove comment

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
---
 ehrapy/tools/feature_ranking/_rank_features_groups.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index d7e36e3e..ebb802fe 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -357,7 +357,6 @@ def rank_features_groups(
         columns_to_select = adata_with_moved_columns.var_names.difference(adata.var_names)
         adata_with_moved_columns = adata_with_moved_columns[:, columns_to_select]
 
-        # encode categoricals
         adata_with_moved_columns = encode(adata_with_moved_columns, autodetect=True, encodings="label")
 
         # assign numeric and categorical columns

From f0f4867a1474a05b707ae8cef62ca4fa2f1711d6 Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Wed, 6 Dec 2023 12:56:31 +0100
Subject: [PATCH 11/17] Update
 ehrapy/tools/feature_ranking/_rank_features_groups.py

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
---
 ehrapy/tools/feature_ranking/_rank_features_groups.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index ebb802fe..3b3cf0f9 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -359,7 +359,6 @@ def rank_features_groups(
 
         adata_with_moved_columns = encode(adata_with_moved_columns, autodetect=True, encodings="label")
 
-        # assign numeric and categorical columns
         adata_with_moved_columns.uns[
             "non_numerical_columns"
         ] = []  # this should be empty, as have only numeric and encoded

From 02148f5ee5cf3e9a2077a2d1869579b3be6d7a89 Mon Sep 17 00:00:00 2001
From: eljas <eljas.roellin@ikmail.com>
Date: Wed, 6 Dec 2023 13:00:34 +0100
Subject: [PATCH 12/17] Iterable to list and import from future

---
 ehrapy/tools/feature_ranking/_rank_features_groups.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index 31ab66b2..763f9a2f 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from collections.abc import Iterable
 from typing import Literal, Optional, Union
 
@@ -257,7 +259,7 @@ def rank_features_groups(
     correction_method: _method_options._correction_method = "benjamini-hochberg",
     tie_correct: bool = False,
     layer: Optional[str] = None,
-    rank_obs_columns: Optional[Union[list[str], str]] = None,
+    rank_obs_columns: Optional[Union[Iterable[str], str]] = None,
     **kwds,
 ) -> None:  # pragma: no cover
     """Rank features for characterizing groups.
@@ -355,7 +357,7 @@ def rank_features_groups(
         #     rank_obs_columns.remove(groupby)
 
         # move obs columns to X
-        adata_with_moved_columns = move_to_x(adata, rank_obs_columns)
+        adata_with_moved_columns = move_to_x(adata, list(rank_obs_columns))
 
         # remove columns previously in X
         columns_to_select = adata_with_moved_columns.var_names.difference(adata.var_names)

From bf02fff76652d0189cbc0d71ae37e1faae91ef4b Mon Sep 17 00:00:00 2001
From: eljas <eljas.roellin@ikmail.com>
Date: Thu, 7 Dec 2023 11:15:40 +0100
Subject: [PATCH 13/17] upated to use layer, obs, or both

---
 .../feature_ranking/_rank_features_groups.py  | 165 +++++++++++++-----
 tests/tools/test_features_ranking.py          | 121 +++++++++----
 2 files changed, 210 insertions(+), 76 deletions(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index 6d91ef34..2da75c5a 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -244,6 +244,51 @@ def _evaluate_categorical_features(
     )
 
 
+def _check_no_datetime_columns(df):
+    datetime_cols = [col for col in df.columns if df[col].dtype == "datetime64[ns]"]
+    if datetime_cols:
+        raise ValueError(f"Columns with datetime format found: {datetime_cols}")
+
+
+def _get_intersection(adata_uns, key, selection):
+    """Get intersection of adata_uns[key] and selection"""
+    if key in adata_uns:
+        uns_enc_to_keep = list(set(adata_uns["encoded_non_numerical_columns"]) & set(selection))
+    else:
+        uns_enc_to_keep = []
+    return uns_enc_to_keep
+
+
+def _check_columns_to_rank_dict(columns_to_rank):
+    if isinstance(columns_to_rank, str):
+        if columns_to_rank == "all":
+            _var_subset = _obs_subset = False
+        else:
+            raise ValueError("If columns_to_rank is a string, it must be 'all'.")
+
+    elif isinstance(columns_to_rank, dict):
+        allowed_keys = {"var_names", "obs_names"}
+        for key in columns_to_rank.keys():
+            if key not in allowed_keys:
+                raise ValueError(
+                    f"columns_to_rank dictionary must have only keys 'var_names' and/or 'obs_names', not {key}."
+                )
+            if not isinstance(key, str):
+                raise ValueError(f"columns_to_rank dictionary keys must be strings, not {type(key)}.")
+
+        for key, value in columns_to_rank.items():
+            if not isinstance(value, Iterable) or any(not isinstance(item, str) for item in value):
+                raise ValueError(f"The value associated with key '{key}' must be an iterable of strings.")
+
+        _var_subset = "var_names" in columns_to_rank.keys()
+        _obs_subset = "obs_names" in columns_to_rank.keys()
+
+    else:
+        raise ValueError("columns_to_rank must be either 'all' or a dictionary.")
+
+    return _var_subset, _obs_subset
+
+
 def rank_features_groups(
     adata: AnnData,
     groupby: str,
@@ -259,7 +304,8 @@ def rank_features_groups(
     correction_method: _method_options._correction_method = "benjamini-hochberg",
     tie_correct: bool = False,
     layer: Optional[str] = None,
-    rank_obs_columns: Optional[Union[Iterable[str], str]] = None,
+    field_to_rank: Union[Literal["layer"], Literal["obs"], Literal["layer_and_obs"]] = "layer",
+    columns_to_rank: Union[dict[str, Iterable[str]], Literal["all"]] = "all",
     **kwds,
 ) -> None:  # pragma: no cover
     """Rank features for characterizing groups.
@@ -293,8 +339,8 @@ def rank_features_groups(
                             Used only for statistical tests (e.g. doesn't work for "logreg" `num_cols_method`)
         tie_correct: Use tie correction for `'wilcoxon'` scores. Used only for `'wilcoxon'`.
         layer: Key from `adata.layers` whose value will be used to perform tests on.
-        rank_obs_columns: Whether to rank `adata.obs` columns instead of features in `adata.layer`. If `True`, all observation columns are ranked. If list of column names, only those are ranked.
-                    layer needs to be None if this is used.
+        field_to_rank: Set to `layer` to rank variables in `adata.X` or `adata.layers[layer]` (default), `obs` to rank `adata.obs`, or `layer_and_obs` to rank both. Layer needs to be None if this is not 'layer'.
+        columns_to_rank: Subset of columns to rank. If 'all', all columns are used. If a dictionary, it must have keys 'var_names' and/or 'obs_names' and values must be iterables of strings. E.g. {'var_names': ['glucose'], 'obs_names': ['age', 'height']}.
         **kwds: Are passed to test methods. Currently this affects only parameters that
                 are passed to :class:`sklearn.linear_model.LogisticRegression`.
                 For instance, you can pass `penalty='l1'` to try to come up with a
@@ -327,50 +373,87 @@ def rank_features_groups(
          >>> ep.tl.rank_features_groups(adata, "service_unit")
          >>> ep.pl.rank_features_groups(adata)
     """
-    if layer is not None and rank_obs_columns is not None:
-        raise ValueError("Only one of 'layer' and 'rank_obs_columns' can be specified.")
+    if layer is not None and field_to_rank == "obs":
+        raise ValueError("If 'layer' is not None, 'field_to_rank' cannot be 'obs'.")
 
-    adata = adata.copy() if copy else adata
+    if field_to_rank not in ["layer", "obs", "layer_and_obs"]:
+        raise ValueError(f"layer must be one of 'layer', 'obs', 'layer_and_obs', not {field_to_rank}")
 
-    if rank_obs_columns is not None:
-        # keep reference to original adata, needed if copy=False
-        adata_orig = adata
-        adata = sc.AnnData(
-            X=np.zeros((len(adata), 1)),
-            obs=adata.obs.copy(),
-            uns={"numerical_columns": [], "non_numerical_columns": [], "encoded_non_numerical_columns": []},
-        )
+    # to give better error messages, check if columns_to_rank have valid keys and values here
+    _var_subset, _obs_subset = _check_columns_to_rank_dict(columns_to_rank)
 
-        if isinstance(rank_obs_columns, str):
-            if rank_obs_columns == "all":
-                rank_obs_columns = adata.obs.keys().to_list()
-            else:
-                raise ValueError(
-                    f"rank_obs_columns should be 'all' or Iterable of column names, not {rank_obs_columns}."
-                )
+    adata = adata.copy() if copy else adata
 
-        # consider adata where all columns from obs become the features, and the other features are dropped
-        if not all(elem in adata.obs.columns.values for elem in rank_obs_columns):
-            raise ValueError(
-                f"Columns `{[col for col in rank_obs_columns if col not in adata.obs.columns.values]}` are not in obs."
+    # to create a minimal adata object below, grab a reference to X/layer of the original adata,
+    # subsetted to the specified columns
+    if field_to_rank in ["layer", "layer_and_obs"]:
+        # for some reason ruff insists on this type check. columns_to_rank is always a dict with key "var_names" if _var_subset is True
+        if _var_subset and isinstance(columns_to_rank, dict):
+            X_to_keep = (
+                adata[:, columns_to_rank["var_names"]].X
+                if layer is None
+                else adata[:, columns_to_rank["var_names"]].layers[layer]
+            )
+            var_to_keep = adata[:, columns_to_rank["var_names"]].var
+            uns_num_to_keep = _get_intersection(
+                adata_uns=adata.uns, key="numerical_columns", selection=columns_to_rank["var_names"]
+            )
+            uns_non_num_to_keep = _get_intersection(
+                adata_uns=adata.uns, key="non_numerical_columns", selection=columns_to_rank["var_names"]
+            )
+            uns_enc_to_keep = _get_intersection(
+                adata_uns=adata.uns, key="encoded_non_numerical_columns", selection=columns_to_rank["var_names"]
             )
 
+        else:
+            X_to_keep = adata.X if layer is None else adata.layers[layer]
+            var_to_keep = adata.var
+            uns_num_to_keep = adata.uns["numerical_columns"] if "numerical_columns" in adata.uns else []
+            uns_enc_to_keep = (
+                adata.uns["encoded_non_numerical_columns"] if "encoded_non_numerical_columns" in adata.uns else []
+            )
+            uns_non_num_to_keep = adata.uns["non_numerical_columns"] if "non_numerical_columns" in adata.uns else []
+
+    else:
+        X_to_keep = np.zeros((len(adata), 1))
+        var_to_keep = pd.DataFrame({"dummy": [0]})
+        uns_num_to_keep = []
+        uns_enc_to_keep = []
+        uns_non_num_to_keep = []
+
+    adata_minimal = sc.AnnData(
+        X=X_to_keep,
+        obs=adata.obs,
+        var=var_to_keep,
+        uns={
+            "numerical_columns": uns_num_to_keep,
+            "encoded_non_numerical_columns": uns_enc_to_keep,
+            "non_numerical_columns": uns_non_num_to_keep,
+        },
+    )
+
+    if field_to_rank in ["obs", "layer_and_obs"]:
         # want columns of obs to become variables in X to be able to use rank_features_groups
-        adata_with_moved_columns = move_to_x(adata, list(rank_obs_columns))
+        # for some reason ruff insists on this type check. columns_to_rank is always a dict with key "obs_names" if _obs_subset is True
+        if _obs_subset and isinstance(columns_to_rank, dict):
+            obs_to_move = adata.obs[columns_to_rank["obs_names"]].keys()
+        else:
+            obs_to_move = adata.obs.keys()
+        _check_no_datetime_columns(adata.obs[obs_to_move])
+        adata_minimal = move_to_x(adata_minimal, list(obs_to_move))
+
+        if field_to_rank == "obs":
+            # the 0th column is a dummy of zeros and is meaningless in this case, and needs to be removed
+            adata_minimal = adata_minimal[:, 1:]
 
-        # don't want columns that have been in X before, as only consider columns from obs
-        columns_to_select = adata_with_moved_columns.var_names.difference(adata.var_names)
-        adata_with_moved_columns = adata_with_moved_columns[:, columns_to_select]
+        adata_minimal = encode(adata_minimal, autodetect=True, encodings="label")
 
-        adata_with_moved_columns = encode(adata_with_moved_columns, autodetect=True, encodings="label")
+    if layer is not None:
+        adata_minimal.layers[layer] = adata_minimal.X
 
-        adata_with_moved_columns.uns[
-            "non_numerical_columns"
-        ] = []  # this should be empty, as have only numeric and encoded
-        adata_with_moved_columns.uns["numerical_columns"] = adata_with_moved_columns.var_names.difference(
-            adata_with_moved_columns.uns["encoded_non_numerical_columns"]
-        ).to_list()  # this is sensitive to `encode` really detecting what it should
-        adata = adata_with_moved_columns
+    # save the reference to the original adata, because we will need to access it later
+    adata_orig = adata
+    adata = adata_minimal
 
     if not adata.obs[groupby].dtype == "category":
         adata.obs[groupby] = pd.Categorical(adata.obs[groupby])
@@ -453,6 +536,10 @@ def rank_features_groups(
             groups_order=group_names,
         )
 
+    # if field_to_rank was obs or layer_and_obs, the adata object we have been working with is adata_minimal
+    adata_orig.uns[key_added] = adata.uns[key_added]
+    adata = adata_orig
+
     # Adjust p values
     if "pvals" in adata.uns[key_added]:
         adata.uns[key_added]["pvals_adj"] = _adjust_pvalues(
@@ -465,8 +552,4 @@ def rank_features_groups(
 
     _sort_features(adata, key_added)
 
-    if rank_obs_columns is not None:
-        adata_orig.uns[key_added] = adata.uns[key_added]
-        adata = adata_orig
-
     return adata if copy else None
diff --git a/tests/tools/test_features_ranking.py b/tests/tools/test_features_ranking.py
index 79ef2b6b..8e10ead9 100644
--- a/tests/tools/test_features_ranking.py
+++ b/tests/tools/test_features_ranking.py
@@ -277,59 +277,110 @@ def test_only_cat_features(self):
         assert "logfoldchanges" in adata.uns["rank_features_groups"]
         assert "pvals_adj" in adata.uns["rank_features_groups"]
 
-    def test_rank_obs(
-        self,
-    ):
-        # prepare data with some interesting features in .obs
-        adata_features_in_obs = read_csv(
+    @pytest.mark.parametrize("field_to_rank", ["layer", "obs", "layer_and_obs"])
+    def test_rank_adata_immutability_property(self, field_to_rank):
+        """
+        Test that rank_features_group does not modify the adata object passed to it,
+        except for the desired .uns field.
+        This test is important because to save memory, copies are made conservatively in rank_features_groups
+        """
+        adata = read_csv(
+            dataset_path=f"{_TEST_PATH}/dataset1.csv", columns_x_only=["station", "sys_bp_entry", "dia_bp_entry"]
+        )
+        adata = ep.pp.encode(adata, encodings={"label": ["station"]})
+        adata_orig = adata.copy()
+
+        ep.tl.rank_features_groups(adata, groupby="disease", field_to_rank=field_to_rank)
+
+        assert adata_orig.shape == adata.shape
+        assert adata_orig.X.shape == adata.X.shape
+        assert adata_orig.obs.shape == adata.obs.shape
+        assert adata_orig.var.shape == adata.var.shape
+
+        assert np.allclose(adata_orig.X, adata.X)
+        assert np.array_equal(adata_orig.obs, adata.obs)
+
+        assert "rank_features_groups" in adata.uns
+
+    @pytest.mark.parametrize("field_to_rank", ["layer", "obs", "layer_and_obs"])
+    def test_rank_features_groups_generates_outputs(self, field_to_rank):
+        """
+        Test that the desired output is generated
+        """
+
+        adata = read_csv(
             dataset_path=f"{_TEST_PATH}/dataset1.csv",
             columns_obs_only=["disease", "station", "sys_bp_entry", "dia_bp_entry"],
         )
 
-        # prepare data with these features in .X
+        ep.tl.rank_features_groups(adata, groupby="disease", field_to_rank=field_to_rank)
+
+        # check standard rank_features_groups entries
+        assert "names" in adata.uns["rank_features_groups"]
+        assert "pvals" in adata.uns["rank_features_groups"]
+        assert "scores" in adata.uns["rank_features_groups"]
+        assert "pvals_adj" in adata.uns["rank_features_groups"]
+        assert "logfoldchanges" in adata.uns["rank_features_groups"]
+        assert "log2foldchanges" not in adata.uns["rank_features_groups"]
+        assert "pts" not in adata.uns["rank_features_groups"]
+
+        if field_to_rank == "layer" or field_to_rank == "obs":
+            assert len(adata.uns["rank_features_groups"]["names"]) == 3  # It only captures the length of each group
+            assert len(adata.uns["rank_features_groups"]["pvals"]) == 3
+            assert len(adata.uns["rank_features_groups"]["scores"]) == 3
+
+        elif field_to_rank == "layer_and_obs":
+            assert len(adata.uns["rank_features_groups"]["names"]) == 6  # It only captures the length of each group
+            assert len(adata.uns["rank_features_groups"]["pvals"]) == 6
+            assert len(adata.uns["rank_features_groups"]["scores"]) == 6
+
+    def test_rank_features_groups_consistent_results(self):
         adata_features_in_x = read_csv(
-            dataset_path=f"{_TEST_PATH}/dataset1.csv", columns_x_only=["station", "sys_bp_entry", "dia_bp_entry"]
+            dataset_path=f"{_TEST_PATH}/dataset1.csv",
+            columns_x_only=["station", "sys_bp_entry", "dia_bp_entry", "glucose"],
         )
         adata_features_in_x = ep.pp.encode(adata_features_in_x, encodings={"label": ["station"]})
 
-        # rank_features_groups on .obs
-        ep.tl.rank_features_groups(adata_features_in_obs, groupby="disease", rank_obs_columns="all")
-
-        # rank features groups on .X
-        ep.tl.rank_features_groups(adata_features_in_x, groupby="disease")
-
-        # check standard rank_features_groups entries
-        assert "names" in adata_features_in_obs.uns["rank_features_groups"]
-        assert "pvals" in adata_features_in_obs.uns["rank_features_groups"]
-        assert "scores" in adata_features_in_obs.uns["rank_features_groups"]
-        assert "pvals_adj" in adata_features_in_obs.uns["rank_features_groups"]
-        assert "log2foldchanges" not in adata_features_in_obs.uns["rank_features_groups"]
-        assert "pts" not in adata_features_in_obs.uns["rank_features_groups"]
-        assert (
-            len(adata_features_in_obs.uns["rank_features_groups"]["names"]) == 3
-        )  # It only captures the length of each group
-        assert len(adata_features_in_obs.uns["rank_features_groups"]["pvals"]) == 3
-        assert len(adata_features_in_obs.uns["rank_features_groups"]["scores"]) == 3
+        adata_features_in_obs = read_csv(
+            dataset_path=f"{_TEST_PATH}/dataset1.csv",
+            columns_obs_only=["disease", "station", "sys_bp_entry", "dia_bp_entry", "glucose"],
+        )
 
-        # check the obs are used indeed
-        assert "sys_bp_entry" in adata_features_in_obs.uns["rank_features_groups"]["names"][0]
-        assert "sys_bp_entry" in adata_features_in_obs.uns["rank_features_groups"]["names"][1]
-        assert "ehrapycat_station" in adata_features_in_obs.uns["rank_features_groups"]["names"][2]
+        adata_features_in_x_and_obs = read_csv(
+            dataset_path=f"{_TEST_PATH}/dataset1.csv",
+            columns_obs_only=["disease", "station"],
+        )
+        # to keep the same variables as in the datsets above, in order to make the comparison of consistency
+        adata_features_in_x_and_obs = adata_features_in_x_and_obs[:, ["sys_bp_entry", "dia_bp_entry", "glucose"]]
+        adata_features_in_x_and_obs.uns["numerical_columns"] = ["sys_bp_entry", "dia_bp_entry", "glucose"]
 
-        # check the X are not used
-        assert "glucose" not in adata_features_in_obs.uns["rank_features_groups"]["names"][0]
+        ep.tl.rank_features_groups(adata_features_in_x, groupby="disease")
+        ep.tl.rank_features_groups(adata_features_in_obs, groupby="disease", field_to_rank="obs")
+        ep.tl.rank_features_groups(adata_features_in_x_and_obs, groupby="disease", field_to_rank="layer_and_obs")
 
-        # check the results are the same
-        for record in adata_features_in_obs.uns["rank_features_groups"]["names"].dtype.names:
+        for record in adata_features_in_x.uns["rank_features_groups"]["names"].dtype.names:
             assert np.allclose(
-                adata_features_in_obs.uns["rank_features_groups"]["scores"][record],
                 adata_features_in_x.uns["rank_features_groups"]["scores"][record],
+                adata_features_in_obs.uns["rank_features_groups"]["scores"][record],
             )
             assert np.allclose(
-                np.array(adata_features_in_obs.uns["rank_features_groups"]["pvals"][record]),
                 np.array(adata_features_in_x.uns["rank_features_groups"]["pvals"][record]),
+                np.array(adata_features_in_obs.uns["rank_features_groups"]["pvals"][record]),
             )
             assert np.array_equal(
+                np.array(adata_features_in_x.uns["rank_features_groups"]["names"][record]),
                 np.array(adata_features_in_obs.uns["rank_features_groups"]["names"][record]),
+            )
+        for record in adata_features_in_x.uns["rank_features_groups"]["names"].dtype.names:
+            assert np.allclose(
+                adata_features_in_x.uns["rank_features_groups"]["scores"][record],
+                adata_features_in_x_and_obs.uns["rank_features_groups"]["scores"][record],
+            )
+            assert np.allclose(
+                np.array(adata_features_in_x.uns["rank_features_groups"]["pvals"][record]),
+                np.array(adata_features_in_x_and_obs.uns["rank_features_groups"]["pvals"][record]),
+            )
+            assert np.array_equal(
                 np.array(adata_features_in_x.uns["rank_features_groups"]["names"][record]),
+                np.array(adata_features_in_x_and_obs.uns["rank_features_groups"]["names"][record]),
             )

From 213d8d5b5a0a54d7fc0047f33a56e4dcf628ab9e Mon Sep 17 00:00:00 2001
From: eljas <eljas.roellin@ikmail.com>
Date: Thu, 7 Dec 2023 12:44:00 +0100
Subject: [PATCH 14/17] this test data should be more stable

---
 .../tools/test_data_features_ranking/dataset1.csv  | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/tools/test_data_features_ranking/dataset1.csv b/tests/tools/test_data_features_ranking/dataset1.csv
index 25f83ab2..1569f780 100644
--- a/tests/tools/test_data_features_ranking/dataset1.csv
+++ b/tests/tools/test_data_features_ranking/dataset1.csv
@@ -4,10 +4,10 @@ idx,sys_bp_entry,dia_bp_entry,glucose,weight,disease,station
 3,140,80,120,60,A,MICU
 4,141,81,130,90,A,MICU
 5,148,77,80,110,B,ICU
-6,149,78,130,78,B,ICU
-7,150,79,120,56,B,MICU
-8,151,80,90,76,B,MICU
-9,158,55,80,67,C,ICU
-10,159,56,90,82,C,ICU
-11,160,57,120,59,C,MICU
-12,161,58,130,81,C,MICU
+6,149,78,135,78,B,ICU
+7,150,79,125,56,B,MICU
+8,151,80,95,76,B,MICU
+9,158,55,70,67,C,ICU
+10,159,56,85,82,C,ICU
+11,160,57,125,59,C,MICU
+12,161,58,125,81,C,MICU

From 034f8209c596ae2d97dc543a57048fac32d6be4e Mon Sep 17 00:00:00 2001
From: Eljas Roellin <65244425+eroell@users.noreply.github.com>
Date: Thu, 7 Dec 2023 13:20:55 +0100
Subject: [PATCH 15/17] Update
 ehrapy/tools/feature_ranking/_rank_features_groups.py

Co-authored-by: Lukas Heumos <lukas.heumos@posteo.net>
---
 ehrapy/tools/feature_ranking/_rank_features_groups.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index 2da75c5a..35a4c27d 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -245,7 +245,7 @@ def _evaluate_categorical_features(
 
 
 def _check_no_datetime_columns(df):
-    datetime_cols = [col for col in df.columns if df[col].dtype == "datetime64[ns]"]
+datetime_cols = [col for col in df.columns if pd.api.types.is_datetime64_any_dtype(df[col]) or pd.api.types.is_timedelta64_dtype(df[col])]
     if datetime_cols:
         raise ValueError(f"Columns with datetime format found: {datetime_cols}")
 

From 645826525165c483da215a53870ba4be1f3db9b5 Mon Sep 17 00:00:00 2001
From: eljas <eljas.roellin@ikmail.com>
Date: Thu, 7 Dec 2023 13:29:32 +0100
Subject: [PATCH 16/17] correct indent of previous commit and added comment on
 dummy X

---
 ehrapy/tools/feature_ranking/_rank_features_groups.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index 35a4c27d..4fe8d52e 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -245,7 +245,11 @@ def _evaluate_categorical_features(
 
 
 def _check_no_datetime_columns(df):
-datetime_cols = [col for col in df.columns if pd.api.types.is_datetime64_any_dtype(df[col]) or pd.api.types.is_timedelta64_dtype(df[col])]
+    datetime_cols = [
+        col
+        for col in df.columns
+        if pd.api.types.is_datetime64_any_dtype(df[col]) or pd.api.types.is_timedelta64_dtype(df[col])
+    ]
     if datetime_cols:
         raise ValueError(f"Columns with datetime format found: {datetime_cols}")
 
@@ -415,6 +419,7 @@ def rank_features_groups(
             uns_non_num_to_keep = adata.uns["non_numerical_columns"] if "non_numerical_columns" in adata.uns else []
 
     else:
+        # dummy 1-dimensional X to be used by move_to_x, and removed again afterwards
         X_to_keep = np.zeros((len(adata), 1))
         var_to_keep = pd.DataFrame({"dummy": [0]})
         uns_num_to_keep = []

From f444a5905fcd9c223847f8930cae7b755e3e4873 Mon Sep 17 00:00:00 2001
From: eljas <eljas.roellin@ikmail.com>
Date: Thu, 7 Dec 2023 18:42:31 +0100
Subject: [PATCH 17/17] bug fixes, more tests and (fixed) examples in docstring

---
 .../feature_ranking/_rank_features_groups.py  | 36 ++++++++++++----
 tests/tools/test_features_ranking.py          | 43 +++++++++++++++++++
 2 files changed, 71 insertions(+), 8 deletions(-)

diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py
index 4fe8d52e..5625d95f 100644
--- a/ehrapy/tools/feature_ranking/_rank_features_groups.py
+++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py
@@ -257,7 +257,7 @@ def _check_no_datetime_columns(df):
 def _get_intersection(adata_uns, key, selection):
     """Get intersection of adata_uns[key] and selection"""
     if key in adata_uns:
-        uns_enc_to_keep = list(set(adata_uns["encoded_non_numerical_columns"]) & set(selection))
+        uns_enc_to_keep = list(set(adata_uns[key]) & set(selection))
     else:
         uns_enc_to_keep = []
     return uns_enc_to_keep
@@ -351,31 +351,48 @@ def rank_features_groups(
                 minimal set of genes that are good predictors (sparse solution meaning few non-zero fitted coefficients).
 
     Returns:
-        *names*: structured `np.ndarray` (`.uns['rank_features_groups']`)
+        *names* structured `np.ndarray` (`.uns['rank_features_groups']`)
                   Structured array to be indexed by group id storing the gene
                   names. Ordered according to scores.
-        *scores*: structured `np.ndarray` (`.uns['rank_features_groups']`)
+        *scores* structured `np.ndarray` (`.uns['rank_features_groups']`)
                   Structured array to be indexed by group id storing the z-score
                   underlying the computation of a p-value for each gene for each group.
                   Ordered according to scores.
-        *logfoldchanges*: structured `np.ndarray` (`.uns['rank_features_groups']`)
+        *logfoldchanges* structured `np.ndarray` (`.uns['rank_features_groups']`)
                           Structured array to be indexed by group id storing the log2
                           fold change for each gene for each group. Ordered according to scores.
                           Only provided if method is 't-test' like.
                           Note: this is an approximation calculated from mean-log values.
-        *pvals*: structured `np.ndarray` (`.uns['rank_features_groups']`) p-values.
-        *pvals_adj* : structured `np.ndarray` (`.uns['rank_features_groups']`) Corrected p-values.
+        *pvals* structured `np.ndarray` (`.uns['rank_features_groups']`) p-values.
+        *pvals_adj* structured `np.ndarray` (`.uns['rank_features_groups']`) Corrected p-values.
         *pts*: `pandas.DataFrame` (`.uns['rank_features_groups']`)
                Fraction of cells expressing the genes for each group.
-        *pts_rest*: `pandas.DataFrame` (`.uns['rank_features_groups']`)
+        *pts_rest* `pandas.DataFrame` (`.uns['rank_features_groups']`)
                     Only if `reference` is set to `'rest'`.
                     Fraction of observations from the union of the rest of each group containing the features.
 
      Examples:
          >>> import ehrapy as ep
-         >>> adata = ep.dt.mimic_2(encoded=True)
+         >>> adata = ep.dt.mimic_2(encoded=False)
+         >>> # want to move some metadata to the obs field
+         >>> ep.anndata.move_to_obs(adata, to_obs=["service_unit", "service_num", "age", "mort_day_censored"])
          >>> ep.tl.rank_features_groups(adata, "service_unit")
          >>> ep.pl.rank_features_groups(adata)
+
+         >>> import ehrapy as ep
+         >>> adata = ep.dt.mimic_2(encoded=False)
+         >>> # want to move some metadata to the obs field
+         >>> ep.anndata.move_to_obs(adata, to_obs=["service_unit", "service_num", "age", "mort_day_censored"])
+         >>> ep.tl.rank_features_groups(adata, "service_unit", field_to_rank="obs", columns_to_rank={"obs_names": ["age", "mort_day_censored"]})
+         >>> ep.pl.rank_features_groups(adata)
+
+         >>> import ehrapy as ep
+         >>> adata = ep.dt.mimic_2(encoded=False)
+         >>> # want to move some metadata to the obs field
+         >>> ep.anndata.move_to_obs(adata, to_obs=["service_unit", "service_num", "age", "mort_day_censored"])
+         >>> ep.tl.rank_features_groups(adata, "service_unit", field_to_rank="layer_and_obs", columns_to_rank={"var_names": ['copd_flg', 'renal_flg'], "obs_names": ["age", "mort_day_censored"]})
+         >>> ep.pl.rank_features_groups(adata)
+
     """
     if layer is not None and field_to_rank == "obs":
         raise ValueError("If 'layer' is not None, 'field_to_rank' cannot be 'obs'.")
@@ -452,6 +469,9 @@ def rank_features_groups(
             adata_minimal = adata_minimal[:, 1:]
 
         adata_minimal = encode(adata_minimal, autodetect=True, encodings="label")
+        # this is needed because encode() doesn't add this key if there are no categorical columns to encode
+        if "encoded_non_numerical_columns" not in adata_minimal.uns:
+            adata_minimal.uns["encoded_non_numerical_columns"] = []
 
     if layer is not None:
         adata_minimal.layers[layer] = adata_minimal.X
diff --git a/tests/tools/test_features_ranking.py b/tests/tools/test_features_ranking.py
index 8e10ead9..cc9c3c22 100644
--- a/tests/tools/test_features_ranking.py
+++ b/tests/tools/test_features_ranking.py
@@ -384,3 +384,46 @@ def test_rank_features_groups_consistent_results(self):
                 np.array(adata_features_in_x.uns["rank_features_groups"]["names"][record]),
                 np.array(adata_features_in_x_and_obs.uns["rank_features_groups"]["names"][record]),
             )
+
+    def test_rank_features_group_column_to_rank(self):
+        adata = read_csv(
+            dataset_path=f"{_TEST_PATH}/dataset1.csv",
+            columns_obs_only=["disease", "station", "sys_bp_entry", "dia_bp_entry"],
+            index_column="idx",
+        )
+
+        # get a fresh adata for every test to not have any side effects
+        adata_copy = adata.copy()
+
+        ep.tl.rank_features_groups(adata, groupby="disease", columns_to_rank="all")
+        assert len(adata.uns["rank_features_groups"]["names"]) == 2
+
+        # want to check a "complete selection" works
+        adata = adata_copy.copy()
+        ep.tl.rank_features_groups(adata, groupby="disease", columns_to_rank={"var_names": ["glucose", "weight"]})
+        assert len(adata.uns["rank_features_groups"]["names"]) == 2
+
+        # want to check a "sub-selection" works
+        adata = adata_copy.copy()
+        ep.tl.rank_features_groups(adata, groupby="disease", columns_to_rank={"var_names": ["glucose"]})
+        assert len(adata.uns["rank_features_groups"]["names"]) == 1
+
+        # want to check a "complete" selection works
+        adata = adata_copy.copy()
+        ep.tl.rank_features_groups(
+            adata,
+            groupby="disease",
+            field_to_rank="obs",
+            columns_to_rank={"obs_names": ["station", "sys_bp_entry", "dia_bp_entry"]},
+        )
+        assert len(adata.uns["rank_features_groups"]["names"]) == 3
+
+        # want to check a "sub-selection" selection works
+        adata = adata_copy.copy()
+        ep.tl.rank_features_groups(
+            adata,
+            groupby="disease",
+            field_to_rank="obs",
+            columns_to_rank={"obs_names": ["sys_bp_entry", "dia_bp_entry"]},
+        )
+        assert len(adata.uns["rank_features_groups"]["names"]) == 2