From 0c52f13c24e1f535dc3c4fca23b975fdd1e78e1f Mon Sep 17 00:00:00 2001 From: eljas Date: Wed, 6 Dec 2023 10:11:27 +0100 Subject: [PATCH 01/17] tests for rank features groups with obs --- .../test_data_features_ranking/dataset1.csv | 13 ++++ tests/tools/test_features_ranking.py | 63 +++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 tests/tools/test_data_features_ranking/dataset1.csv diff --git a/tests/tools/test_data_features_ranking/dataset1.csv b/tests/tools/test_data_features_ranking/dataset1.csv new file mode 100644 index 00000000..25f83ab2 --- /dev/null +++ b/tests/tools/test_data_features_ranking/dataset1.csv @@ -0,0 +1,13 @@ +idx,sys_bp_entry,dia_bp_entry,glucose,weight,disease,station +1,138,78,80,77,A,ICU +2,139,79,90,76,A,ICU +3,140,80,120,60,A,MICU +4,141,81,130,90,A,MICU +5,148,77,80,110,B,ICU +6,149,78,130,78,B,ICU +7,150,79,120,56,B,MICU +8,151,80,90,76,B,MICU +9,158,55,80,67,C,ICU +10,159,56,90,82,C,ICU +11,160,57,120,59,C,MICU +12,161,58,130,81,C,MICU diff --git a/tests/tools/test_features_ranking.py b/tests/tools/test_features_ranking.py index fefb6b02..e7ed708e 100644 --- a/tests/tools/test_features_ranking.py +++ b/tests/tools/test_features_ranking.py @@ -1,9 +1,15 @@ +from pathlib import Path + import numpy as np import pandas as pd import pytest import ehrapy as ep import ehrapy.tools.feature_ranking._rank_features_groups as _utils +from ehrapy.io._read import read_csv + +CURRENT_DIR = Path(__file__).parent +_TEST_PATH = f"{CURRENT_DIR}/test_data_features_ranking" class TestHelperFunctions: @@ -270,3 +276,60 @@ def test_only_cat_features(self): assert "scores" in adata.uns["rank_features_groups"] assert "logfoldchanges" in adata.uns["rank_features_groups"] assert "pvals_adj" in adata.uns["rank_features_groups"] + + def test_rank_obs( + self, + ): + # prepare data with some interesting features in .obs + adata_features_in_obs = read_csv( + dataset_path=f"{_TEST_PATH}/dataset1.csv", + columns_obs_only=["disease", "station", "sys_bp_entry", "dia_bp_entry"], + ) + + # prepare data with these features in .X + adata_features_in_x = read_csv( + dataset_path=f"{_TEST_PATH}/dataset1.csv", columns_x_only=["station", "sys_bp_entry", "dia_bp_entry"] + ) + adata_features_in_x = ep.pp.encode(adata_features_in_x, encodings={"label_encoding": ["station"]}) + + # rank_features_groups on .obs + ep.tl.rank_features_groups(adata_features_in_obs, groupby="disease", rank_obs_columns="all") + + # rank features groups on .X + ep.tl.rank_features_groups(adata_features_in_x, groupby="disease") + + # check standard rank_features_groups entries + assert "names" in adata_features_in_obs.uns["rank_features_groups"] + assert "pvals" in adata_features_in_obs.uns["rank_features_groups"] + assert "scores" in adata_features_in_obs.uns["rank_features_groups"] + assert "pvals_adj" in adata_features_in_obs.uns["rank_features_groups"] + assert "log2foldchanges" not in adata_features_in_obs.uns["rank_features_groups"] + assert "pts" not in adata_features_in_obs.uns["rank_features_groups"] + assert ( + len(adata_features_in_obs.uns["rank_features_groups"]["names"]) == 3 + ) # It only captures the length of each group + assert len(adata_features_in_obs.uns["rank_features_groups"]["pvals"]) == 3 + assert len(adata_features_in_obs.uns["rank_features_groups"]["scores"]) == 3 + + # check the obs are used indeed + assert "sys_bp_entry" in adata_features_in_obs.uns["rank_features_groups"]["names"][0] + assert "sys_bp_entry" in adata_features_in_obs.uns["rank_features_groups"]["names"][1] + assert "ehrapycat_station" in adata_features_in_obs.uns["rank_features_groups"]["names"][2] + + # check the X are not used + assert "glucose" not in adata_features_in_obs.uns["rank_features_groups"]["names"][0] + + # check the results are the same + for record in adata_features_in_obs.uns["rank_features_groups"]["names"].dtype.names: + assert np.allclose( + adata_features_in_obs.uns["rank_features_groups"]["scores"][record], + adata_features_in_x.uns["rank_features_groups"]["scores"][record], + ) + assert np.allclose( + np.array(adata_features_in_obs.uns["rank_features_groups"]["pvals"][record]), + np.array(adata_features_in_x.uns["rank_features_groups"]["pvals"][record]), + ) + assert np.array_equal( + np.array(adata_features_in_obs.uns["rank_features_groups"]["names"][record]), + np.array(adata_features_in_x.uns["rank_features_groups"]["names"][record]), + ) From dd022eca545650ab8634850318ecac1e8c540552 Mon Sep 17 00:00:00 2001 From: eljas Date: Wed, 6 Dec 2023 10:52:34 +0100 Subject: [PATCH 02/17] first drafted feature ranking using obs --- .../feature_ranking/_rank_features_groups.py | 67 +++++++++++++++++-- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index 6bf27057..932eedda 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -6,6 +6,8 @@ import scanpy as sc from anndata import AnnData +from ehrapy.anndata import move_to_x +from ehrapy.preprocessing import encode from ehrapy.tools import _method_options @@ -255,11 +257,12 @@ def rank_features_groups( correction_method: _method_options._correction_method = "benjamini-hochberg", tie_correct: bool = False, layer: Optional[str] = None, + rank_obs_columns: Optional[Union[list[str], str]] = None, **kwds, ) -> None: # pragma: no cover """Rank features for characterizing groups. - Expects logarithmized data. + Expects logarithmized data. # TODO: should this line be removed? log-transform may be a valid transformation for probably many types of data, but not a necessity for this method right? Args: adata: Annotated data matrix. @@ -288,6 +291,8 @@ def rank_features_groups( Used only for statistical tests (e.g. doesn't work for "logreg" `num_cols_method`) tie_correct: Use tie correction for `'wilcoxon'` scores. Used only for `'wilcoxon'`. layer: Key from `adata.layers` whose value will be used to perform tests on. + rank_obs_columns: Whether to rank `adata.obs` columns instead of features in `adata.layer`. If `True`, all observation columns are ranked. If list of column names, only those are ranked. + layer needs to be None if this is used. **kwds: Are passed to test methods. Currently this affects only parameters that are passed to :class:`sklearn.linear_model.LogisticRegression`. For instance, you can pass `penalty='l1'` to try to come up with a @@ -314,19 +319,67 @@ def rank_features_groups( Only if `reference` is set to `'rest'`. Fraction of observations from the union of the rest of each group containing the features. - Examples: + Examples: # TODO: there is an inf. value here. I think due to a bug considering the one-hot encoded feature of service_unit. >>> import ehrapy as ep >>> adata = ep.dt.mimic_2(encoded=True) >>> ep.tl.rank_features_groups(adata, "service_unit") >>> ep.pl.rank_features_groups(adata) """ + # if rank_obs_columns is indicated, layer must be None + if layer is not None and rank_obs_columns is not None: + raise ValueError("Only one of layer and rank_obs_columns can be specified.") + adata = adata.copy() if copy else adata + # TODO: check if need to remove the "actual" column according which to group + if rank_obs_columns is not None: + # keep reference to original adata, needed if copy=False + adata_orig = adata + # copy adata to work on + adata = adata.copy() + + if isinstance(rank_obs_columns, str): + if rank_obs_columns == "all": + rank_obs_columns = adata.obs.keys().to_list() + else: + raise ValueError( + f"rank_obs_columns should be 'all' or Iterable of column names, not {rank_obs_columns}." + ) + + # consider adata where all columns from obs become the features, and the other features are dropped + if not all(elem in adata.obs.columns.values for elem in rank_obs_columns): + raise ValueError( + f"Columns `{[col for col in rank_obs_columns if col not in adata.obs.columns.values]}` are not in obs." + ) + + # if groupby in rank_obs_columns: + # rank_obs_columns.remove(groupby) + + # move obs columns to X + # TODO: check if we want to take care of strange stuff (e.g. dates/times) or allow user to only specify columns that make sense + adata_with_moved_columns = move_to_x(adata, rank_obs_columns) + + # remove columns previously in X + columns_to_select = adata_with_moved_columns.var_names.difference(adata.var_names) + adata_with_moved_columns = adata_with_moved_columns[:, columns_to_select] + + # encode categoricals + adata_with_moved_columns = encode(adata_with_moved_columns, autodetect=True, encodings="label_encoding") + + # assign numeric and categorical columns + adata_with_moved_columns.uns[ + "non_numerical_columns" + ] = [] # this should be empty, as have only numeric and encoded + adata_with_moved_columns.uns["numerical_columns"] = adata_with_moved_columns.var_names.difference( + adata_with_moved_columns.uns["encoded_non_numerical_columns"] + ).to_list() # this is sensitive to `encode` really detecting what it should + adata = adata_with_moved_columns + if not adata.obs[groupby].dtype == "category": adata.obs[groupby] = pd.Categorical(adata.obs[groupby]) adata.uns[key_added] = {} - adata.uns[key_added]["params"] = { + adata.uns[key_added]["params"] = { # TODO: add additional parameters "groupby": groupby, "reference": reference, "method": num_cols_method, @@ -404,14 +457,20 @@ def rank_features_groups( ) # Adjust p values - if "pvals" in adata.uns[key_added]: + if "pvals" in adata.uns[key_added]: # todo: in what scenarios can they not be there? adata.uns[key_added]["pvals_adj"] = _adjust_pvalues( adata.uns[key_added]["pvals"], corr_method=correction_method ) + # For some reason, pts should be a DataFrame if "pts" in adata.uns[key_added]: adata.uns[key_added]["pts"] = pd.DataFrame(adata.uns[key_added]["pts"]) _sort_features(adata, key_added) + # TODO: return adata_orig with rank features results + if rank_obs_columns is not None: + adata_orig.uns[key_added] = adata.uns[key_added] + adata = adata_orig + return adata if copy else None From a8187287321033443738fad91afaac04b4c800fc Mon Sep 17 00:00:00 2001 From: eljas Date: Wed, 6 Dec 2023 11:39:12 +0100 Subject: [PATCH 03/17] fixed encoding names --- ehrapy/tools/feature_ranking/_rank_features_groups.py | 11 ++++------- tests/tools/test_features_ranking.py | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index 932eedda..02e3ea33 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -262,7 +262,7 @@ def rank_features_groups( ) -> None: # pragma: no cover """Rank features for characterizing groups. - Expects logarithmized data. # TODO: should this line be removed? log-transform may be a valid transformation for probably many types of data, but not a necessity for this method right? + Expects logarithmized data. Args: adata: Annotated data matrix. @@ -319,7 +319,7 @@ def rank_features_groups( Only if `reference` is set to `'rest'`. Fraction of observations from the union of the rest of each group containing the features. - Examples: # TODO: there is an inf. value here. I think due to a bug considering the one-hot encoded feature of service_unit. + Examples: >>> import ehrapy as ep >>> adata = ep.dt.mimic_2(encoded=True) >>> ep.tl.rank_features_groups(adata, "service_unit") @@ -331,7 +331,6 @@ def rank_features_groups( adata = adata.copy() if copy else adata - # TODO: check if need to remove the "actual" column according which to group if rank_obs_columns is not None: # keep reference to original adata, needed if copy=False adata_orig = adata @@ -356,7 +355,6 @@ def rank_features_groups( # rank_obs_columns.remove(groupby) # move obs columns to X - # TODO: check if we want to take care of strange stuff (e.g. dates/times) or allow user to only specify columns that make sense adata_with_moved_columns = move_to_x(adata, rank_obs_columns) # remove columns previously in X @@ -364,7 +362,7 @@ def rank_features_groups( adata_with_moved_columns = adata_with_moved_columns[:, columns_to_select] # encode categoricals - adata_with_moved_columns = encode(adata_with_moved_columns, autodetect=True, encodings="label_encoding") + adata_with_moved_columns = encode(adata_with_moved_columns, autodetect=True, encodings="label") # assign numeric and categorical columns adata_with_moved_columns.uns[ @@ -379,7 +377,7 @@ def rank_features_groups( adata.obs[groupby] = pd.Categorical(adata.obs[groupby]) adata.uns[key_added] = {} - adata.uns[key_added]["params"] = { # TODO: add additional parameters + adata.uns[key_added]["params"] = { "groupby": groupby, "reference": reference, "method": num_cols_method, @@ -468,7 +466,6 @@ def rank_features_groups( _sort_features(adata, key_added) - # TODO: return adata_orig with rank features results if rank_obs_columns is not None: adata_orig.uns[key_added] = adata.uns[key_added] adata = adata_orig diff --git a/tests/tools/test_features_ranking.py b/tests/tools/test_features_ranking.py index e7ed708e..79ef2b6b 100644 --- a/tests/tools/test_features_ranking.py +++ b/tests/tools/test_features_ranking.py @@ -290,7 +290,7 @@ def test_rank_obs( adata_features_in_x = read_csv( dataset_path=f"{_TEST_PATH}/dataset1.csv", columns_x_only=["station", "sys_bp_entry", "dia_bp_entry"] ) - adata_features_in_x = ep.pp.encode(adata_features_in_x, encodings={"label_encoding": ["station"]}) + adata_features_in_x = ep.pp.encode(adata_features_in_x, encodings={"label": ["station"]}) # rank_features_groups on .obs ep.tl.rank_features_groups(adata_features_in_obs, groupby="disease", rank_obs_columns="all") From adaa53b0acbe490f3223070f0738c76c1f3a62c8 Mon Sep 17 00:00:00 2001 From: eljas Date: Wed, 6 Dec 2023 11:42:56 +0100 Subject: [PATCH 04/17] remove comment --- ehrapy/tools/feature_ranking/_rank_features_groups.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index 02e3ea33..31ab66b2 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -455,7 +455,7 @@ def rank_features_groups( ) # Adjust p values - if "pvals" in adata.uns[key_added]: # todo: in what scenarios can they not be there? + if "pvals" in adata.uns[key_added]: adata.uns[key_added]["pvals_adj"] = _adjust_pvalues( adata.uns[key_added]["pvals"], corr_method=correction_method ) From 37c6a9ae8cd5699c2ec22f22ce2ea860654e503d Mon Sep 17 00:00:00 2001 From: Eljas Roellin <65244425+eroell@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:46:17 +0100 Subject: [PATCH 05/17] Remove comment Co-authored-by: Lukas Heumos --- ehrapy/tools/feature_ranking/_rank_features_groups.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index 31ab66b2..9be05f09 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -325,7 +325,6 @@ def rank_features_groups( >>> ep.tl.rank_features_groups(adata, "service_unit") >>> ep.pl.rank_features_groups(adata) """ - # if rank_obs_columns is indicated, layer must be None if layer is not None and rank_obs_columns is not None: raise ValueError("Only one of layer and rank_obs_columns can be specified.") From 06c2a009ea216cb200e11b6d71f151b7ab7c6933 Mon Sep 17 00:00:00 2001 From: Eljas Roellin <65244425+eroell@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:47:27 +0100 Subject: [PATCH 06/17] Remove comment Co-authored-by: Lukas Heumos --- ehrapy/tools/feature_ranking/_rank_features_groups.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index 9be05f09..8a7894fb 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -353,7 +353,6 @@ def rank_features_groups( # if groupby in rank_obs_columns: # rank_obs_columns.remove(groupby) - # move obs columns to X adata_with_moved_columns = move_to_x(adata, rank_obs_columns) # remove columns previously in X From 6fa3de16a646c1d132ddf271e4fb0b87fc2bf322 Mon Sep 17 00:00:00 2001 From: Eljas Roellin <65244425+eroell@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:47:47 +0100 Subject: [PATCH 07/17] Remove comment Co-authored-by: Lukas Heumos --- ehrapy/tools/feature_ranking/_rank_features_groups.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index 8a7894fb..cfaf9b8b 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -333,7 +333,6 @@ def rank_features_groups( if rank_obs_columns is not None: # keep reference to original adata, needed if copy=False adata_orig = adata - # copy adata to work on adata = adata.copy() if isinstance(rank_obs_columns, str): From 458520ec1a232d390170072795ff2ee99959f600 Mon Sep 17 00:00:00 2001 From: Eljas Roellin <65244425+eroell@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:49:28 +0100 Subject: [PATCH 08/17] Update ehrapy/tools/feature_ranking/_rank_features_groups.py Co-authored-by: Lukas Heumos --- ehrapy/tools/feature_ranking/_rank_features_groups.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index cfaf9b8b..21d119a2 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -326,7 +326,7 @@ def rank_features_groups( >>> ep.pl.rank_features_groups(adata) """ if layer is not None and rank_obs_columns is not None: - raise ValueError("Only one of layer and rank_obs_columns can be specified.") + raise ValueError("Only one of 'layer' and 'rank_obs_columns' can be specified.") adata = adata.copy() if copy else adata From 210bac6f4b001cdae8de220fb0c46f4fc32c34c4 Mon Sep 17 00:00:00 2001 From: Eljas Roellin <65244425+eroell@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:54:28 +0100 Subject: [PATCH 09/17] Remove comment Co-authored-by: Lukas Heumos --- ehrapy/tools/feature_ranking/_rank_features_groups.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index 21d119a2..d7e36e3e 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -354,7 +354,6 @@ def rank_features_groups( adata_with_moved_columns = move_to_x(adata, rank_obs_columns) - # remove columns previously in X columns_to_select = adata_with_moved_columns.var_names.difference(adata.var_names) adata_with_moved_columns = adata_with_moved_columns[:, columns_to_select] From ffadf71d7c53e36fa2b0e912e7936cccf5627cc8 Mon Sep 17 00:00:00 2001 From: Eljas Roellin <65244425+eroell@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:54:43 +0100 Subject: [PATCH 10/17] Remove comment Co-authored-by: Lukas Heumos --- ehrapy/tools/feature_ranking/_rank_features_groups.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index d7e36e3e..ebb802fe 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -357,7 +357,6 @@ def rank_features_groups( columns_to_select = adata_with_moved_columns.var_names.difference(adata.var_names) adata_with_moved_columns = adata_with_moved_columns[:, columns_to_select] - # encode categoricals adata_with_moved_columns = encode(adata_with_moved_columns, autodetect=True, encodings="label") # assign numeric and categorical columns From f0f4867a1474a05b707ae8cef62ca4fa2f1711d6 Mon Sep 17 00:00:00 2001 From: Eljas Roellin <65244425+eroell@users.noreply.github.com> Date: Wed, 6 Dec 2023 12:56:31 +0100 Subject: [PATCH 11/17] Update ehrapy/tools/feature_ranking/_rank_features_groups.py Co-authored-by: Lukas Heumos --- ehrapy/tools/feature_ranking/_rank_features_groups.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index ebb802fe..3b3cf0f9 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -359,7 +359,6 @@ def rank_features_groups( adata_with_moved_columns = encode(adata_with_moved_columns, autodetect=True, encodings="label") - # assign numeric and categorical columns adata_with_moved_columns.uns[ "non_numerical_columns" ] = [] # this should be empty, as have only numeric and encoded From 02148f5ee5cf3e9a2077a2d1869579b3be6d7a89 Mon Sep 17 00:00:00 2001 From: eljas Date: Wed, 6 Dec 2023 13:00:34 +0100 Subject: [PATCH 12/17] Iterable to list and import from future --- ehrapy/tools/feature_ranking/_rank_features_groups.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index 31ab66b2..763f9a2f 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from collections.abc import Iterable from typing import Literal, Optional, Union @@ -257,7 +259,7 @@ def rank_features_groups( correction_method: _method_options._correction_method = "benjamini-hochberg", tie_correct: bool = False, layer: Optional[str] = None, - rank_obs_columns: Optional[Union[list[str], str]] = None, + rank_obs_columns: Optional[Union[Iterable[str], str]] = None, **kwds, ) -> None: # pragma: no cover """Rank features for characterizing groups. @@ -355,7 +357,7 @@ def rank_features_groups( # rank_obs_columns.remove(groupby) # move obs columns to X - adata_with_moved_columns = move_to_x(adata, rank_obs_columns) + adata_with_moved_columns = move_to_x(adata, list(rank_obs_columns)) # remove columns previously in X columns_to_select = adata_with_moved_columns.var_names.difference(adata.var_names) From bf02fff76652d0189cbc0d71ae37e1faae91ef4b Mon Sep 17 00:00:00 2001 From: eljas Date: Thu, 7 Dec 2023 11:15:40 +0100 Subject: [PATCH 13/17] upated to use layer, obs, or both --- .../feature_ranking/_rank_features_groups.py | 165 +++++++++++++----- tests/tools/test_features_ranking.py | 121 +++++++++---- 2 files changed, 210 insertions(+), 76 deletions(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index 6d91ef34..2da75c5a 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -244,6 +244,51 @@ def _evaluate_categorical_features( ) +def _check_no_datetime_columns(df): + datetime_cols = [col for col in df.columns if df[col].dtype == "datetime64[ns]"] + if datetime_cols: + raise ValueError(f"Columns with datetime format found: {datetime_cols}") + + +def _get_intersection(adata_uns, key, selection): + """Get intersection of adata_uns[key] and selection""" + if key in adata_uns: + uns_enc_to_keep = list(set(adata_uns["encoded_non_numerical_columns"]) & set(selection)) + else: + uns_enc_to_keep = [] + return uns_enc_to_keep + + +def _check_columns_to_rank_dict(columns_to_rank): + if isinstance(columns_to_rank, str): + if columns_to_rank == "all": + _var_subset = _obs_subset = False + else: + raise ValueError("If columns_to_rank is a string, it must be 'all'.") + + elif isinstance(columns_to_rank, dict): + allowed_keys = {"var_names", "obs_names"} + for key in columns_to_rank.keys(): + if key not in allowed_keys: + raise ValueError( + f"columns_to_rank dictionary must have only keys 'var_names' and/or 'obs_names', not {key}." + ) + if not isinstance(key, str): + raise ValueError(f"columns_to_rank dictionary keys must be strings, not {type(key)}.") + + for key, value in columns_to_rank.items(): + if not isinstance(value, Iterable) or any(not isinstance(item, str) for item in value): + raise ValueError(f"The value associated with key '{key}' must be an iterable of strings.") + + _var_subset = "var_names" in columns_to_rank.keys() + _obs_subset = "obs_names" in columns_to_rank.keys() + + else: + raise ValueError("columns_to_rank must be either 'all' or a dictionary.") + + return _var_subset, _obs_subset + + def rank_features_groups( adata: AnnData, groupby: str, @@ -259,7 +304,8 @@ def rank_features_groups( correction_method: _method_options._correction_method = "benjamini-hochberg", tie_correct: bool = False, layer: Optional[str] = None, - rank_obs_columns: Optional[Union[Iterable[str], str]] = None, + field_to_rank: Union[Literal["layer"], Literal["obs"], Literal["layer_and_obs"]] = "layer", + columns_to_rank: Union[dict[str, Iterable[str]], Literal["all"]] = "all", **kwds, ) -> None: # pragma: no cover """Rank features for characterizing groups. @@ -293,8 +339,8 @@ def rank_features_groups( Used only for statistical tests (e.g. doesn't work for "logreg" `num_cols_method`) tie_correct: Use tie correction for `'wilcoxon'` scores. Used only for `'wilcoxon'`. layer: Key from `adata.layers` whose value will be used to perform tests on. - rank_obs_columns: Whether to rank `adata.obs` columns instead of features in `adata.layer`. If `True`, all observation columns are ranked. If list of column names, only those are ranked. - layer needs to be None if this is used. + field_to_rank: Set to `layer` to rank variables in `adata.X` or `adata.layers[layer]` (default), `obs` to rank `adata.obs`, or `layer_and_obs` to rank both. Layer needs to be None if this is not 'layer'. + columns_to_rank: Subset of columns to rank. If 'all', all columns are used. If a dictionary, it must have keys 'var_names' and/or 'obs_names' and values must be iterables of strings. E.g. {'var_names': ['glucose'], 'obs_names': ['age', 'height']}. **kwds: Are passed to test methods. Currently this affects only parameters that are passed to :class:`sklearn.linear_model.LogisticRegression`. For instance, you can pass `penalty='l1'` to try to come up with a @@ -327,50 +373,87 @@ def rank_features_groups( >>> ep.tl.rank_features_groups(adata, "service_unit") >>> ep.pl.rank_features_groups(adata) """ - if layer is not None and rank_obs_columns is not None: - raise ValueError("Only one of 'layer' and 'rank_obs_columns' can be specified.") + if layer is not None and field_to_rank == "obs": + raise ValueError("If 'layer' is not None, 'field_to_rank' cannot be 'obs'.") - adata = adata.copy() if copy else adata + if field_to_rank not in ["layer", "obs", "layer_and_obs"]: + raise ValueError(f"layer must be one of 'layer', 'obs', 'layer_and_obs', not {field_to_rank}") - if rank_obs_columns is not None: - # keep reference to original adata, needed if copy=False - adata_orig = adata - adata = sc.AnnData( - X=np.zeros((len(adata), 1)), - obs=adata.obs.copy(), - uns={"numerical_columns": [], "non_numerical_columns": [], "encoded_non_numerical_columns": []}, - ) + # to give better error messages, check if columns_to_rank have valid keys and values here + _var_subset, _obs_subset = _check_columns_to_rank_dict(columns_to_rank) - if isinstance(rank_obs_columns, str): - if rank_obs_columns == "all": - rank_obs_columns = adata.obs.keys().to_list() - else: - raise ValueError( - f"rank_obs_columns should be 'all' or Iterable of column names, not {rank_obs_columns}." - ) + adata = adata.copy() if copy else adata - # consider adata where all columns from obs become the features, and the other features are dropped - if not all(elem in adata.obs.columns.values for elem in rank_obs_columns): - raise ValueError( - f"Columns `{[col for col in rank_obs_columns if col not in adata.obs.columns.values]}` are not in obs." + # to create a minimal adata object below, grab a reference to X/layer of the original adata, + # subsetted to the specified columns + if field_to_rank in ["layer", "layer_and_obs"]: + # for some reason ruff insists on this type check. columns_to_rank is always a dict with key "var_names" if _var_subset is True + if _var_subset and isinstance(columns_to_rank, dict): + X_to_keep = ( + adata[:, columns_to_rank["var_names"]].X + if layer is None + else adata[:, columns_to_rank["var_names"]].layers[layer] + ) + var_to_keep = adata[:, columns_to_rank["var_names"]].var + uns_num_to_keep = _get_intersection( + adata_uns=adata.uns, key="numerical_columns", selection=columns_to_rank["var_names"] + ) + uns_non_num_to_keep = _get_intersection( + adata_uns=adata.uns, key="non_numerical_columns", selection=columns_to_rank["var_names"] + ) + uns_enc_to_keep = _get_intersection( + adata_uns=adata.uns, key="encoded_non_numerical_columns", selection=columns_to_rank["var_names"] ) + else: + X_to_keep = adata.X if layer is None else adata.layers[layer] + var_to_keep = adata.var + uns_num_to_keep = adata.uns["numerical_columns"] if "numerical_columns" in adata.uns else [] + uns_enc_to_keep = ( + adata.uns["encoded_non_numerical_columns"] if "encoded_non_numerical_columns" in adata.uns else [] + ) + uns_non_num_to_keep = adata.uns["non_numerical_columns"] if "non_numerical_columns" in adata.uns else [] + + else: + X_to_keep = np.zeros((len(adata), 1)) + var_to_keep = pd.DataFrame({"dummy": [0]}) + uns_num_to_keep = [] + uns_enc_to_keep = [] + uns_non_num_to_keep = [] + + adata_minimal = sc.AnnData( + X=X_to_keep, + obs=adata.obs, + var=var_to_keep, + uns={ + "numerical_columns": uns_num_to_keep, + "encoded_non_numerical_columns": uns_enc_to_keep, + "non_numerical_columns": uns_non_num_to_keep, + }, + ) + + if field_to_rank in ["obs", "layer_and_obs"]: # want columns of obs to become variables in X to be able to use rank_features_groups - adata_with_moved_columns = move_to_x(adata, list(rank_obs_columns)) + # for some reason ruff insists on this type check. columns_to_rank is always a dict with key "obs_names" if _obs_subset is True + if _obs_subset and isinstance(columns_to_rank, dict): + obs_to_move = adata.obs[columns_to_rank["obs_names"]].keys() + else: + obs_to_move = adata.obs.keys() + _check_no_datetime_columns(adata.obs[obs_to_move]) + adata_minimal = move_to_x(adata_minimal, list(obs_to_move)) + + if field_to_rank == "obs": + # the 0th column is a dummy of zeros and is meaningless in this case, and needs to be removed + adata_minimal = adata_minimal[:, 1:] - # don't want columns that have been in X before, as only consider columns from obs - columns_to_select = adata_with_moved_columns.var_names.difference(adata.var_names) - adata_with_moved_columns = adata_with_moved_columns[:, columns_to_select] + adata_minimal = encode(adata_minimal, autodetect=True, encodings="label") - adata_with_moved_columns = encode(adata_with_moved_columns, autodetect=True, encodings="label") + if layer is not None: + adata_minimal.layers[layer] = adata_minimal.X - adata_with_moved_columns.uns[ - "non_numerical_columns" - ] = [] # this should be empty, as have only numeric and encoded - adata_with_moved_columns.uns["numerical_columns"] = adata_with_moved_columns.var_names.difference( - adata_with_moved_columns.uns["encoded_non_numerical_columns"] - ).to_list() # this is sensitive to `encode` really detecting what it should - adata = adata_with_moved_columns + # save the reference to the original adata, because we will need to access it later + adata_orig = adata + adata = adata_minimal if not adata.obs[groupby].dtype == "category": adata.obs[groupby] = pd.Categorical(adata.obs[groupby]) @@ -453,6 +536,10 @@ def rank_features_groups( groups_order=group_names, ) + # if field_to_rank was obs or layer_and_obs, the adata object we have been working with is adata_minimal + adata_orig.uns[key_added] = adata.uns[key_added] + adata = adata_orig + # Adjust p values if "pvals" in adata.uns[key_added]: adata.uns[key_added]["pvals_adj"] = _adjust_pvalues( @@ -465,8 +552,4 @@ def rank_features_groups( _sort_features(adata, key_added) - if rank_obs_columns is not None: - adata_orig.uns[key_added] = adata.uns[key_added] - adata = adata_orig - return adata if copy else None diff --git a/tests/tools/test_features_ranking.py b/tests/tools/test_features_ranking.py index 79ef2b6b..8e10ead9 100644 --- a/tests/tools/test_features_ranking.py +++ b/tests/tools/test_features_ranking.py @@ -277,59 +277,110 @@ def test_only_cat_features(self): assert "logfoldchanges" in adata.uns["rank_features_groups"] assert "pvals_adj" in adata.uns["rank_features_groups"] - def test_rank_obs( - self, - ): - # prepare data with some interesting features in .obs - adata_features_in_obs = read_csv( + @pytest.mark.parametrize("field_to_rank", ["layer", "obs", "layer_and_obs"]) + def test_rank_adata_immutability_property(self, field_to_rank): + """ + Test that rank_features_group does not modify the adata object passed to it, + except for the desired .uns field. + This test is important because to save memory, copies are made conservatively in rank_features_groups + """ + adata = read_csv( + dataset_path=f"{_TEST_PATH}/dataset1.csv", columns_x_only=["station", "sys_bp_entry", "dia_bp_entry"] + ) + adata = ep.pp.encode(adata, encodings={"label": ["station"]}) + adata_orig = adata.copy() + + ep.tl.rank_features_groups(adata, groupby="disease", field_to_rank=field_to_rank) + + assert adata_orig.shape == adata.shape + assert adata_orig.X.shape == adata.X.shape + assert adata_orig.obs.shape == adata.obs.shape + assert adata_orig.var.shape == adata.var.shape + + assert np.allclose(adata_orig.X, adata.X) + assert np.array_equal(adata_orig.obs, adata.obs) + + assert "rank_features_groups" in adata.uns + + @pytest.mark.parametrize("field_to_rank", ["layer", "obs", "layer_and_obs"]) + def test_rank_features_groups_generates_outputs(self, field_to_rank): + """ + Test that the desired output is generated + """ + + adata = read_csv( dataset_path=f"{_TEST_PATH}/dataset1.csv", columns_obs_only=["disease", "station", "sys_bp_entry", "dia_bp_entry"], ) - # prepare data with these features in .X + ep.tl.rank_features_groups(adata, groupby="disease", field_to_rank=field_to_rank) + + # check standard rank_features_groups entries + assert "names" in adata.uns["rank_features_groups"] + assert "pvals" in adata.uns["rank_features_groups"] + assert "scores" in adata.uns["rank_features_groups"] + assert "pvals_adj" in adata.uns["rank_features_groups"] + assert "logfoldchanges" in adata.uns["rank_features_groups"] + assert "log2foldchanges" not in adata.uns["rank_features_groups"] + assert "pts" not in adata.uns["rank_features_groups"] + + if field_to_rank == "layer" or field_to_rank == "obs": + assert len(adata.uns["rank_features_groups"]["names"]) == 3 # It only captures the length of each group + assert len(adata.uns["rank_features_groups"]["pvals"]) == 3 + assert len(adata.uns["rank_features_groups"]["scores"]) == 3 + + elif field_to_rank == "layer_and_obs": + assert len(adata.uns["rank_features_groups"]["names"]) == 6 # It only captures the length of each group + assert len(adata.uns["rank_features_groups"]["pvals"]) == 6 + assert len(adata.uns["rank_features_groups"]["scores"]) == 6 + + def test_rank_features_groups_consistent_results(self): adata_features_in_x = read_csv( - dataset_path=f"{_TEST_PATH}/dataset1.csv", columns_x_only=["station", "sys_bp_entry", "dia_bp_entry"] + dataset_path=f"{_TEST_PATH}/dataset1.csv", + columns_x_only=["station", "sys_bp_entry", "dia_bp_entry", "glucose"], ) adata_features_in_x = ep.pp.encode(adata_features_in_x, encodings={"label": ["station"]}) - # rank_features_groups on .obs - ep.tl.rank_features_groups(adata_features_in_obs, groupby="disease", rank_obs_columns="all") - - # rank features groups on .X - ep.tl.rank_features_groups(adata_features_in_x, groupby="disease") - - # check standard rank_features_groups entries - assert "names" in adata_features_in_obs.uns["rank_features_groups"] - assert "pvals" in adata_features_in_obs.uns["rank_features_groups"] - assert "scores" in adata_features_in_obs.uns["rank_features_groups"] - assert "pvals_adj" in adata_features_in_obs.uns["rank_features_groups"] - assert "log2foldchanges" not in adata_features_in_obs.uns["rank_features_groups"] - assert "pts" not in adata_features_in_obs.uns["rank_features_groups"] - assert ( - len(adata_features_in_obs.uns["rank_features_groups"]["names"]) == 3 - ) # It only captures the length of each group - assert len(adata_features_in_obs.uns["rank_features_groups"]["pvals"]) == 3 - assert len(adata_features_in_obs.uns["rank_features_groups"]["scores"]) == 3 + adata_features_in_obs = read_csv( + dataset_path=f"{_TEST_PATH}/dataset1.csv", + columns_obs_only=["disease", "station", "sys_bp_entry", "dia_bp_entry", "glucose"], + ) - # check the obs are used indeed - assert "sys_bp_entry" in adata_features_in_obs.uns["rank_features_groups"]["names"][0] - assert "sys_bp_entry" in adata_features_in_obs.uns["rank_features_groups"]["names"][1] - assert "ehrapycat_station" in adata_features_in_obs.uns["rank_features_groups"]["names"][2] + adata_features_in_x_and_obs = read_csv( + dataset_path=f"{_TEST_PATH}/dataset1.csv", + columns_obs_only=["disease", "station"], + ) + # to keep the same variables as in the datsets above, in order to make the comparison of consistency + adata_features_in_x_and_obs = adata_features_in_x_and_obs[:, ["sys_bp_entry", "dia_bp_entry", "glucose"]] + adata_features_in_x_and_obs.uns["numerical_columns"] = ["sys_bp_entry", "dia_bp_entry", "glucose"] - # check the X are not used - assert "glucose" not in adata_features_in_obs.uns["rank_features_groups"]["names"][0] + ep.tl.rank_features_groups(adata_features_in_x, groupby="disease") + ep.tl.rank_features_groups(adata_features_in_obs, groupby="disease", field_to_rank="obs") + ep.tl.rank_features_groups(adata_features_in_x_and_obs, groupby="disease", field_to_rank="layer_and_obs") - # check the results are the same - for record in adata_features_in_obs.uns["rank_features_groups"]["names"].dtype.names: + for record in adata_features_in_x.uns["rank_features_groups"]["names"].dtype.names: assert np.allclose( - adata_features_in_obs.uns["rank_features_groups"]["scores"][record], adata_features_in_x.uns["rank_features_groups"]["scores"][record], + adata_features_in_obs.uns["rank_features_groups"]["scores"][record], ) assert np.allclose( - np.array(adata_features_in_obs.uns["rank_features_groups"]["pvals"][record]), np.array(adata_features_in_x.uns["rank_features_groups"]["pvals"][record]), + np.array(adata_features_in_obs.uns["rank_features_groups"]["pvals"][record]), ) assert np.array_equal( + np.array(adata_features_in_x.uns["rank_features_groups"]["names"][record]), np.array(adata_features_in_obs.uns["rank_features_groups"]["names"][record]), + ) + for record in adata_features_in_x.uns["rank_features_groups"]["names"].dtype.names: + assert np.allclose( + adata_features_in_x.uns["rank_features_groups"]["scores"][record], + adata_features_in_x_and_obs.uns["rank_features_groups"]["scores"][record], + ) + assert np.allclose( + np.array(adata_features_in_x.uns["rank_features_groups"]["pvals"][record]), + np.array(adata_features_in_x_and_obs.uns["rank_features_groups"]["pvals"][record]), + ) + assert np.array_equal( np.array(adata_features_in_x.uns["rank_features_groups"]["names"][record]), + np.array(adata_features_in_x_and_obs.uns["rank_features_groups"]["names"][record]), ) From 213d8d5b5a0a54d7fc0047f33a56e4dcf628ab9e Mon Sep 17 00:00:00 2001 From: eljas Date: Thu, 7 Dec 2023 12:44:00 +0100 Subject: [PATCH 14/17] this test data should be more stable --- .../tools/test_data_features_ranking/dataset1.csv | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/tools/test_data_features_ranking/dataset1.csv b/tests/tools/test_data_features_ranking/dataset1.csv index 25f83ab2..1569f780 100644 --- a/tests/tools/test_data_features_ranking/dataset1.csv +++ b/tests/tools/test_data_features_ranking/dataset1.csv @@ -4,10 +4,10 @@ idx,sys_bp_entry,dia_bp_entry,glucose,weight,disease,station 3,140,80,120,60,A,MICU 4,141,81,130,90,A,MICU 5,148,77,80,110,B,ICU -6,149,78,130,78,B,ICU -7,150,79,120,56,B,MICU -8,151,80,90,76,B,MICU -9,158,55,80,67,C,ICU -10,159,56,90,82,C,ICU -11,160,57,120,59,C,MICU -12,161,58,130,81,C,MICU +6,149,78,135,78,B,ICU +7,150,79,125,56,B,MICU +8,151,80,95,76,B,MICU +9,158,55,70,67,C,ICU +10,159,56,85,82,C,ICU +11,160,57,125,59,C,MICU +12,161,58,125,81,C,MICU From 034f8209c596ae2d97dc543a57048fac32d6be4e Mon Sep 17 00:00:00 2001 From: Eljas Roellin <65244425+eroell@users.noreply.github.com> Date: Thu, 7 Dec 2023 13:20:55 +0100 Subject: [PATCH 15/17] Update ehrapy/tools/feature_ranking/_rank_features_groups.py Co-authored-by: Lukas Heumos --- ehrapy/tools/feature_ranking/_rank_features_groups.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index 2da75c5a..35a4c27d 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -245,7 +245,7 @@ def _evaluate_categorical_features( def _check_no_datetime_columns(df): - datetime_cols = [col for col in df.columns if df[col].dtype == "datetime64[ns]"] +datetime_cols = [col for col in df.columns if pd.api.types.is_datetime64_any_dtype(df[col]) or pd.api.types.is_timedelta64_dtype(df[col])] if datetime_cols: raise ValueError(f"Columns with datetime format found: {datetime_cols}") From 645826525165c483da215a53870ba4be1f3db9b5 Mon Sep 17 00:00:00 2001 From: eljas Date: Thu, 7 Dec 2023 13:29:32 +0100 Subject: [PATCH 16/17] correct indent of previous commit and added comment on dummy X --- ehrapy/tools/feature_ranking/_rank_features_groups.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index 35a4c27d..4fe8d52e 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -245,7 +245,11 @@ def _evaluate_categorical_features( def _check_no_datetime_columns(df): -datetime_cols = [col for col in df.columns if pd.api.types.is_datetime64_any_dtype(df[col]) or pd.api.types.is_timedelta64_dtype(df[col])] + datetime_cols = [ + col + for col in df.columns + if pd.api.types.is_datetime64_any_dtype(df[col]) or pd.api.types.is_timedelta64_dtype(df[col]) + ] if datetime_cols: raise ValueError(f"Columns with datetime format found: {datetime_cols}") @@ -415,6 +419,7 @@ def rank_features_groups( uns_non_num_to_keep = adata.uns["non_numerical_columns"] if "non_numerical_columns" in adata.uns else [] else: + # dummy 1-dimensional X to be used by move_to_x, and removed again afterwards X_to_keep = np.zeros((len(adata), 1)) var_to_keep = pd.DataFrame({"dummy": [0]}) uns_num_to_keep = [] From f444a5905fcd9c223847f8930cae7b755e3e4873 Mon Sep 17 00:00:00 2001 From: eljas Date: Thu, 7 Dec 2023 18:42:31 +0100 Subject: [PATCH 17/17] bug fixes, more tests and (fixed) examples in docstring --- .../feature_ranking/_rank_features_groups.py | 36 ++++++++++++---- tests/tools/test_features_ranking.py | 43 +++++++++++++++++++ 2 files changed, 71 insertions(+), 8 deletions(-) diff --git a/ehrapy/tools/feature_ranking/_rank_features_groups.py b/ehrapy/tools/feature_ranking/_rank_features_groups.py index 4fe8d52e..5625d95f 100644 --- a/ehrapy/tools/feature_ranking/_rank_features_groups.py +++ b/ehrapy/tools/feature_ranking/_rank_features_groups.py @@ -257,7 +257,7 @@ def _check_no_datetime_columns(df): def _get_intersection(adata_uns, key, selection): """Get intersection of adata_uns[key] and selection""" if key in adata_uns: - uns_enc_to_keep = list(set(adata_uns["encoded_non_numerical_columns"]) & set(selection)) + uns_enc_to_keep = list(set(adata_uns[key]) & set(selection)) else: uns_enc_to_keep = [] return uns_enc_to_keep @@ -351,31 +351,48 @@ def rank_features_groups( minimal set of genes that are good predictors (sparse solution meaning few non-zero fitted coefficients). Returns: - *names*: structured `np.ndarray` (`.uns['rank_features_groups']`) + *names* structured `np.ndarray` (`.uns['rank_features_groups']`) Structured array to be indexed by group id storing the gene names. Ordered according to scores. - *scores*: structured `np.ndarray` (`.uns['rank_features_groups']`) + *scores* structured `np.ndarray` (`.uns['rank_features_groups']`) Structured array to be indexed by group id storing the z-score underlying the computation of a p-value for each gene for each group. Ordered according to scores. - *logfoldchanges*: structured `np.ndarray` (`.uns['rank_features_groups']`) + *logfoldchanges* structured `np.ndarray` (`.uns['rank_features_groups']`) Structured array to be indexed by group id storing the log2 fold change for each gene for each group. Ordered according to scores. Only provided if method is 't-test' like. Note: this is an approximation calculated from mean-log values. - *pvals*: structured `np.ndarray` (`.uns['rank_features_groups']`) p-values. - *pvals_adj* : structured `np.ndarray` (`.uns['rank_features_groups']`) Corrected p-values. + *pvals* structured `np.ndarray` (`.uns['rank_features_groups']`) p-values. + *pvals_adj* structured `np.ndarray` (`.uns['rank_features_groups']`) Corrected p-values. *pts*: `pandas.DataFrame` (`.uns['rank_features_groups']`) Fraction of cells expressing the genes for each group. - *pts_rest*: `pandas.DataFrame` (`.uns['rank_features_groups']`) + *pts_rest* `pandas.DataFrame` (`.uns['rank_features_groups']`) Only if `reference` is set to `'rest'`. Fraction of observations from the union of the rest of each group containing the features. Examples: >>> import ehrapy as ep - >>> adata = ep.dt.mimic_2(encoded=True) + >>> adata = ep.dt.mimic_2(encoded=False) + >>> # want to move some metadata to the obs field + >>> ep.anndata.move_to_obs(adata, to_obs=["service_unit", "service_num", "age", "mort_day_censored"]) >>> ep.tl.rank_features_groups(adata, "service_unit") >>> ep.pl.rank_features_groups(adata) + + >>> import ehrapy as ep + >>> adata = ep.dt.mimic_2(encoded=False) + >>> # want to move some metadata to the obs field + >>> ep.anndata.move_to_obs(adata, to_obs=["service_unit", "service_num", "age", "mort_day_censored"]) + >>> ep.tl.rank_features_groups(adata, "service_unit", field_to_rank="obs", columns_to_rank={"obs_names": ["age", "mort_day_censored"]}) + >>> ep.pl.rank_features_groups(adata) + + >>> import ehrapy as ep + >>> adata = ep.dt.mimic_2(encoded=False) + >>> # want to move some metadata to the obs field + >>> ep.anndata.move_to_obs(adata, to_obs=["service_unit", "service_num", "age", "mort_day_censored"]) + >>> ep.tl.rank_features_groups(adata, "service_unit", field_to_rank="layer_and_obs", columns_to_rank={"var_names": ['copd_flg', 'renal_flg'], "obs_names": ["age", "mort_day_censored"]}) + >>> ep.pl.rank_features_groups(adata) + """ if layer is not None and field_to_rank == "obs": raise ValueError("If 'layer' is not None, 'field_to_rank' cannot be 'obs'.") @@ -452,6 +469,9 @@ def rank_features_groups( adata_minimal = adata_minimal[:, 1:] adata_minimal = encode(adata_minimal, autodetect=True, encodings="label") + # this is needed because encode() doesn't add this key if there are no categorical columns to encode + if "encoded_non_numerical_columns" not in adata_minimal.uns: + adata_minimal.uns["encoded_non_numerical_columns"] = [] if layer is not None: adata_minimal.layers[layer] = adata_minimal.X diff --git a/tests/tools/test_features_ranking.py b/tests/tools/test_features_ranking.py index 8e10ead9..cc9c3c22 100644 --- a/tests/tools/test_features_ranking.py +++ b/tests/tools/test_features_ranking.py @@ -384,3 +384,46 @@ def test_rank_features_groups_consistent_results(self): np.array(adata_features_in_x.uns["rank_features_groups"]["names"][record]), np.array(adata_features_in_x_and_obs.uns["rank_features_groups"]["names"][record]), ) + + def test_rank_features_group_column_to_rank(self): + adata = read_csv( + dataset_path=f"{_TEST_PATH}/dataset1.csv", + columns_obs_only=["disease", "station", "sys_bp_entry", "dia_bp_entry"], + index_column="idx", + ) + + # get a fresh adata for every test to not have any side effects + adata_copy = adata.copy() + + ep.tl.rank_features_groups(adata, groupby="disease", columns_to_rank="all") + assert len(adata.uns["rank_features_groups"]["names"]) == 2 + + # want to check a "complete selection" works + adata = adata_copy.copy() + ep.tl.rank_features_groups(adata, groupby="disease", columns_to_rank={"var_names": ["glucose", "weight"]}) + assert len(adata.uns["rank_features_groups"]["names"]) == 2 + + # want to check a "sub-selection" works + adata = adata_copy.copy() + ep.tl.rank_features_groups(adata, groupby="disease", columns_to_rank={"var_names": ["glucose"]}) + assert len(adata.uns["rank_features_groups"]["names"]) == 1 + + # want to check a "complete" selection works + adata = adata_copy.copy() + ep.tl.rank_features_groups( + adata, + groupby="disease", + field_to_rank="obs", + columns_to_rank={"obs_names": ["station", "sys_bp_entry", "dia_bp_entry"]}, + ) + assert len(adata.uns["rank_features_groups"]["names"]) == 3 + + # want to check a "sub-selection" selection works + adata = adata_copy.copy() + ep.tl.rank_features_groups( + adata, + groupby="disease", + field_to_rank="obs", + columns_to_rank={"obs_names": ["sys_bp_entry", "dia_bp_entry"]}, + ) + assert len(adata.uns["rank_features_groups"]["names"]) == 2