From 21a5f72532f4ef558e7d1a2ccb0177f06a9dda15 Mon Sep 17 00:00:00 2001 From: Gert Hulselmans Date: Fri, 7 May 2021 12:10:46 +0200 Subject: [PATCH] Add support for reading rankings databases created by create_cisTarget_databases. Add support for reading rankings databases created by create_cisTarget_databases as the index column in the Feather file is "motifs" or "tracks" instead of "features". --- src/ctxcore/rnkdb.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/src/ctxcore/rnkdb.py b/src/ctxcore/rnkdb.py index ca61335..ef914fa 100644 --- a/src/ctxcore/rnkdb.py +++ b/src/ctxcore/rnkdb.py @@ -258,9 +258,6 @@ def quote(value): return pd.DataFrame(index=self.features, columns=genes, data=rankings) -INDEX_NAME = "features" - - class FeatherRankingDatabase(RankingDatabase): def __init__(self, fname: str, name: str): """ @@ -272,13 +269,24 @@ def __init__(self, fname: str, name: str): super().__init__(name=name) assert os.path.isfile(fname), "Database {0:s} doesn't exist.".format(fname) + # FeatherReader cannot be pickle (important for dask framework) so filename is field instead. self._fname = fname + if fname.endswith('.genes_vs_motifs.rankings.feather') or fname.endswith('.regions_vs_motifs.rankings.feather'): + self._index_name = 'motifs' + elif fname.endswith('.genes_vs_tracks.rankings.feather') or fname.endswith( + 'regions_vs_tracks.rankings.feather' + ): + self._index_name = 'tracks' + else: + self._index_name = 'features' + @property @memoize def total_genes(self) -> int: - # Do not count column 1 as it contains the index with the name of the features. + # Do not count column 1 as it contains the index with the name of the index column ("motifs", "tracks" or + # "features"). return FeatherReader(self._fname).num_columns - 1 @property @@ -286,11 +294,11 @@ def total_genes(self) -> int: def genes(self) -> Tuple[str]: # noinspection PyTypeChecker reader = FeatherReader(self._fname) - # Get all gene names (exclude "features" column). + # Get all gene names (exclude index column: "motifs", "tracks" or "features"). return tuple( reader.get_column_name(idx) for idx in range(reader.num_columns) - if reader.get_column_name(idx) != INDEX_NAME + if reader.get_column_name(idx) != self._index_name ) @property @@ -302,7 +310,7 @@ def load_full(self) -> pd.DataFrame: df = FeatherReader(self._fname).read_pandas() # Avoid copying the whole dataframe by replacing the index in place. # This makes loading a database twice as fast in case the database file is already in the filesystem cache. - df.set_index(INDEX_NAME, inplace=True) + df.set_index(self._index_name, inplace=True) return df def load(self, gs: Type[GeneSignature]) -> pd.DataFrame: @@ -310,14 +318,17 @@ def load(self, gs: Type[GeneSignature]) -> pd.DataFrame: gene_set = self.geneset.intersection(set(gs.genes)) # Read ranking columns for genes in order they appear in the Feather file. df = FeatherReader(self._fname).read_pandas( - columns=(INDEX_NAME,) + tuple(sorted(gene_set, key=lambda gene: self.genes2idx[gene])) + columns=(self._index_name,) + tuple(sorted(gene_set, key=lambda gene: self.genes2idx[gene])) ) # Avoid copying the whole dataframe by replacing the index in place. # This makes loading a database twice as fast in case the database file is already in the filesystem cache. - df.set_index(INDEX_NAME, inplace=True) + df.set_index(self._index_name, inplace=True) return df +INDEX_NAME = "features" + + class ParquetRankingDatabase(RankingDatabase): def __init__(self, fname: str, name: str): """