Skip to content

Commit

Permalink
Merge pull request #1 from aertslab/support_create_cistarget_database…
Browse files Browse the repository at this point in the history
…s_index_names

Add support for reading rankings databases created by create_cisTarge…
  • Loading branch information
cflerin authored May 7, 2021
2 parents a31e7e8 + 21a5f72 commit 8c7b942
Showing 1 changed file with 20 additions and 9 deletions.
29 changes: 20 additions & 9 deletions src/ctxcore/rnkdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,9 +258,6 @@ def quote(value):
return pd.DataFrame(index=self.features, columns=genes, data=rankings)


INDEX_NAME = "features"


class FeatherRankingDatabase(RankingDatabase):
def __init__(self, fname: str, name: str):
"""
Expand All @@ -272,25 +269,36 @@ def __init__(self, fname: str, name: str):
super().__init__(name=name)

assert os.path.isfile(fname), "Database {0:s} doesn't exist.".format(fname)

# FeatherReader cannot be pickled (important for the dask framework), so the filename is stored as a field instead.
self._fname = fname

if fname.endswith('.genes_vs_motifs.rankings.feather') or fname.endswith('.regions_vs_motifs.rankings.feather'):
self._index_name = 'motifs'
elif fname.endswith('.genes_vs_tracks.rankings.feather') or fname.endswith(
'regions_vs_tracks.rankings.feather'
):
self._index_name = 'tracks'
else:
self._index_name = 'features'

@property
@memoize
def total_genes(self) -> int:
# Do not count column 1 as it contains the index with the name of the features.
# Do not count column 1 as it contains the index with the name of the index column ("motifs", "tracks" or
# "features").
return FeatherReader(self._fname).num_columns - 1

@property
@memoize
def genes(self) -> Tuple[str]:
# noinspection PyTypeChecker
reader = FeatherReader(self._fname)
# Get all gene names (exclude "features" column).
# Get all gene names (exclude index column: "motifs", "tracks" or "features").
return tuple(
reader.get_column_name(idx)
for idx in range(reader.num_columns)
if reader.get_column_name(idx) != INDEX_NAME
if reader.get_column_name(idx) != self._index_name
)

@property
Expand All @@ -302,22 +310,25 @@ def load_full(self) -> pd.DataFrame:
df = FeatherReader(self._fname).read_pandas()
# Avoid copying the whole dataframe by replacing the index in place.
# This makes loading a database twice as fast in case the database file is already in the filesystem cache.
df.set_index(INDEX_NAME, inplace=True)
df.set_index(self._index_name, inplace=True)
return df

def load(self, gs: Type[GeneSignature]) -> pd.DataFrame:
# For some genes in the signature there might not be a rank available in the database.
gene_set = self.geneset.intersection(set(gs.genes))
# Read ranking columns for genes in order they appear in the Feather file.
df = FeatherReader(self._fname).read_pandas(
columns=(INDEX_NAME,) + tuple(sorted(gene_set, key=lambda gene: self.genes2idx[gene]))
columns=(self._index_name,) + tuple(sorted(gene_set, key=lambda gene: self.genes2idx[gene]))
)
# Avoid copying the whole dataframe by replacing the index in place.
# This makes loading a database twice as fast in case the database file is already in the filesystem cache.
df.set_index(INDEX_NAME, inplace=True)
df.set_index(self._index_name, inplace=True)
return df


INDEX_NAME = "features"


class ParquetRankingDatabase(RankingDatabase):
def __init__(self, fname: str, name: str):
"""
Expand Down

0 comments on commit 8c7b942

Please sign in to comment.