Skip to content

Commit

Permalink
adds AdjacencySlicer and KeySlicer
Browse files Browse the repository at this point in the history
  • Loading branch information
johentsch committed Nov 7, 2023
1 parent cfa64a8 commit d4cfd6f
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 0 deletions.
11 changes: 11 additions & 0 deletions src/dimcat/dc_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,3 +387,14 @@ class ResourceNotProcessableError(DimcatError):
2: lambda name, step: f"{step!r} cannot process Resource {name!r}.",
3: lambda name, step, resource_type: f"{step!r} cannot process Resource {name!r} of type {resource_type!r}.",
}


class SlicerNotSetUpError(DimcatError):
    """Signals that a slicer is being applied although it has no slice intervals yet.

    optional args: (slicer_name,)
    """

    # Message templates keyed by the number of arguments the error was created with
    # (presumably resolved by DimcatError -- confirm against the base class).
    nargs2message = {
        0: (
            "The slicer has not been setup. "
            "Applying it would result in empty features. "
            "Set the attribute 'slice_intervals'."
        ),
        1: lambda name: (
            f"The {name!r} has not been setup. "
            "Applying it would result in empty features. "
            "Set the attribute 'slice_intervals'."
        ),
    }
2 changes: 2 additions & 0 deletions src/dimcat/steps/slicers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import logging

from .value_ranges import AdjacencyGroupSlicer, KeySlicer

# Logger of the slicers package, named after the package's import path (__name__).
logger = logging.getLogger(__name__)
87 changes: 87 additions & 0 deletions src/dimcat/steps/slicers/value_ranges.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from typing import ClassVar, Optional, Type

import marshmallow as mm
import pandas as pd
from dimcat import Dataset
from dimcat.data.resources import Feature
from dimcat.data.resources.dc import SliceIntervals
from dimcat.dc_exceptions import SlicerNotSetUpError
from dimcat.steps.slicers.base import Slicer


class AdjacencyGroupSlicer(Slicer):
    """This slicer and its subclasses slice resources by adjacency groups, that is, segments where a particular
    column (or combination thereof) has the same value over all rows.

    The intervals delimiting the slices are held in :attr:`slice_intervals`. They can be passed to the
    constructor, assigned to the property, taken from a dataset via :meth:`fit_to_dataset`, or picked up lazily
    by :meth:`get_slice_intervals` when the slicer is applied to the feature that provides them.
    """

    feature_providing_slice_intervals: ClassVar[Type[Feature] | str]
    """Mandatory class variable that specifies which feature provides the slice intervals.
    It is only declared here, not assigned; concrete subclasses need to set it.
    """
    adjacency_group_column_name: ClassVar[Optional[str]] = None
    """Optional class variable that specifies the name of the column that contains the adjacency group.
    Defaults to each row, i.e., no extra grouping.
    """

    class Schema(Slicer.Schema):
        # None is the legitimate value of a slicer that has not been set up yet, so it must be accepted
        # during validation/deserialization instead of failing with "Field may not be null."
        slice_intervals = mm.fields.Nested(SliceIntervals.Schema, allow_none=True)

    def __init__(
        self,
        level_name: str = "adjacency_group",
        slice_intervals: Optional[SliceIntervals] = None,
        **kwargs,
    ):
        """Initialize the slicer, optionally with ready-made slice intervals.

        Args:
            level_name:
                Passed on to the parent initializer and used as ``level_name`` when slice intervals are
                requested from a feature.
            slice_intervals:
                Intervals to slice by. If None (default), the slicer remains un-set-up until intervals are
                assigned or derived.
            **kwargs: Passed on to the parent initializer.

        Raises:
            TypeError: If ``slice_intervals`` is neither None, nor SliceIntervals, nor a pd.MultiIndex.
        """
        super().__init__(level_name=level_name, **kwargs)
        self._slice_intervals: Optional[SliceIntervals] = None
        if slice_intervals is not None:
            # go through the property setter for validation/conversion
            self.slice_intervals = slice_intervals

    @property
    def slice_intervals(self) -> Optional[SliceIntervals]:
        """The intervals used for slicing, or None if the slicer has not been set up."""
        return self._slice_intervals

    @slice_intervals.setter
    def slice_intervals(self, slice_intervals: SliceIntervals | pd.MultiIndex):
        if isinstance(slice_intervals, pd.MultiIndex):
            slice_intervals = SliceIntervals.from_index(slice_intervals)
        elif not isinstance(slice_intervals, SliceIntervals):
            raise TypeError(
                f"Expected SliceIntervals or pd.MultiIndex, got {type(slice_intervals)}"
            )
        self._slice_intervals = slice_intervals

    def fit_to_dataset(self, dataset: Dataset) -> None:
        """Set the slice intervals to the intervals provided by the relevant feature.

        The feature is requested from ``dataset`` using the class variable
        ``feature_providing_slice_intervals``, which therefore needs to be defined.
        """
        feature = dataset.get_feature(self.feature_providing_slice_intervals)
        self.slice_intervals = feature.get_slice_intervals(level_name=self.level_name)

    def get_slice_intervals(self, resource: Feature) -> SliceIntervals:
        """Get the slice intervals to be applied to ``resource``.

        If slice intervals have been set, they are returned as they are. Otherwise, if ``resource`` is exactly
        the feature named by ``feature_providing_slice_intervals``, the intervals are retrieved from it,
        stored on this slicer, and returned.

        Raises:
            SlicerNotSetUpError:
                If no slice intervals have been set and they cannot be derived from ``resource``. This
                includes the case that the class does not define ``feature_providing_slice_intervals``.
        """
        if self.slice_intervals is not None:
            return self.slice_intervals
        # The class variable is merely annotated on this class, so accessing it directly raises an
        # AttributeError for classes that do not define it; treat that case as "not set up".
        provider = getattr(self, "feature_providing_slice_intervals", None)
        feature_name = provider.name if isinstance(provider, type) else provider
        # strict test for the exact feature, not subclasses
        if feature_name is None or resource.name != feature_name:
            raise SlicerNotSetUpError(self.dtype)
        self.slice_intervals = resource.get_slice_intervals(level_name=self.level_name)
        return self.slice_intervals


class KeySlicer(AdjacencyGroupSlicer):
    """Slices resources by key, using the slice intervals provided by the "KeyAnnotations" feature."""

    feature_providing_slice_intervals = "KeyAnnotations"

    def __init__(
        self,
        level_name: str = "localkey_slice",
        slice_intervals: Optional[SliceIntervals] = None,
        **kwargs,
    ):
        """Same as the parent initializer, except that ``level_name`` defaults to "localkey_slice".

        Args:
            level_name: Forwarded to the parent initializer.
            slice_intervals: Forwarded to the parent initializer.
            **kwargs: Forwarded to the parent initializer.
        """
        super().__init__(
            level_name=level_name,
            slice_intervals=slice_intervals,
            **kwargs,
        )

0 comments on commit d4cfd6f

Please sign in to comment.