From d4cfd6ff5cbdf2ef80ce3353f84c91991447a4a0 Mon Sep 17 00:00:00 2001
From: Johannes Hentschel
Date: Tue, 7 Nov 2023 03:30:23 +0100
Subject: [PATCH] adds AdjacencyGroupSlicer and KeySlicer

---
 src/dimcat/dc_exceptions.py              |  11 +++
 src/dimcat/steps/slicers/__init__.py     |   2 +
 src/dimcat/steps/slicers/value_ranges.py | 108 ++++++++++++++++++++++++
 3 files changed, 121 insertions(+)
 create mode 100644 src/dimcat/steps/slicers/value_ranges.py

diff --git a/src/dimcat/dc_exceptions.py b/src/dimcat/dc_exceptions.py
index 9cbe6e62..6c8c444d 100644
--- a/src/dimcat/dc_exceptions.py
+++ b/src/dimcat/dc_exceptions.py
@@ -387,3 +387,14 @@ class ResourceNotProcessableError(DimcatError):
         2: lambda name, step: f"{step!r} cannot process Resource {name!r}.",
         3: lambda name, step, resource_type: f"{step!r} cannot process Resource {name!r} of type {resource_type!r}.",
     }
+
+
+class SlicerNotSetUpError(DimcatError):
+    """optional args: (slicer_name,)"""
+
+    nargs2message = {
+        0: "The slicer has not been setup. Applying it would result in empty features. Set the attribute "
+        "'slice_intervals'.",
+        1: lambda name: f"The {name!r} has not been setup. Applying it would result in empty features. "
+        f"Set the attribute 'slice_intervals'.",
+    }
diff --git a/src/dimcat/steps/slicers/__init__.py b/src/dimcat/steps/slicers/__init__.py
index eea436a3..7c032083 100644
--- a/src/dimcat/steps/slicers/__init__.py
+++ b/src/dimcat/steps/slicers/__init__.py
@@ -1,3 +1,5 @@
 import logging
 
+from .value_ranges import AdjacencyGroupSlicer, KeySlicer
+
 logger = logging.getLogger(__name__)
diff --git a/src/dimcat/steps/slicers/value_ranges.py b/src/dimcat/steps/slicers/value_ranges.py
new file mode 100644
index 00000000..5b8746ab
--- /dev/null
+++ b/src/dimcat/steps/slicers/value_ranges.py
@@ -0,0 +1,108 @@
+from typing import ClassVar, Optional, Type
+
+import marshmallow as mm
+import pandas as pd
+from dimcat import Dataset
+from dimcat.data.resources import Feature
+from dimcat.data.resources.dc import SliceIntervals
+from dimcat.dc_exceptions import SlicerNotSetUpError
+from dimcat.steps.slicers.base import Slicer
+
+
+class AdjacencyGroupSlicer(Slicer):
+    """This slicer and its subclasses slice resources by adjacency groups, that is, segments where a particular
+    column (or combination thereof) has the same value over all rows.
+
+    The slice intervals can be passed at initialization, assigned to :attr:`slice_intervals`, fetched from a
+    dataset via :meth:`fit_to_dataset`, or picked up lazily by :meth:`get_slice_intervals`.
+    """
+
+    feature_providing_slice_intervals: ClassVar[Type[Feature] | str]
+    """Mandatory class variable that specifies which feature provides the slice intervals."""
+    adjacency_group_column_name: ClassVar[Optional[str]] = None
+    """Optional class variable that specifies the name of the column that contains the adjacency group.
+    Defaults to each row, i.e., no extra grouping.
+    """
+
+    class Schema(Slicer.Schema):
+        # The constructor default is None (slicer not set up yet), so None has to pass validation, too.
+        slice_intervals = mm.fields.Nested(SliceIntervals.Schema, allow_none=True)
+
+    def __init__(
+        self,
+        level_name: str = "adjacency_group",
+        slice_intervals: Optional[SliceIntervals] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            level_name: Forwarded to the parent initializer.
+            slice_intervals:
+                Intervals to slice by (SliceIntervals or pd.MultiIndex). If None, they need to be set before the
+                slicer can be applied, e.g. via :meth:`fit_to_dataset`.
+            **kwargs: Forwarded to the parent initializer.
+        """
+        super().__init__(level_name=level_name, **kwargs)
+        self._slice_intervals: Optional[SliceIntervals] = None
+        if slice_intervals is not None:
+            self.slice_intervals = slice_intervals
+
+    @property
+    def slice_intervals(self) -> Optional[SliceIntervals]:
+        """The intervals to slice by, or None if the slicer has not been set up yet."""
+        return self._slice_intervals
+
+    @slice_intervals.setter
+    def slice_intervals(self, slice_intervals: SliceIntervals | pd.MultiIndex):
+        # A bare MultiIndex is wrapped; anything else that is not SliceIntervals (including None) is rejected.
+        if isinstance(slice_intervals, pd.MultiIndex):
+            slice_intervals = SliceIntervals.from_index(slice_intervals)
+        elif not isinstance(slice_intervals, SliceIntervals):
+            raise TypeError(
+                f"Expected SliceIntervals or pd.MultiIndex, got {type(slice_intervals)}"
+            )
+        self._slice_intervals = slice_intervals
+
+    def fit_to_dataset(self, dataset: Dataset) -> None:
+        """Set the slice intervals to the intervals provided by the relevant feature."""
+        feature = dataset.get_feature(self.feature_providing_slice_intervals)
+        self.slice_intervals = feature.get_slice_intervals(level_name=self.level_name)
+
+    def get_slice_intervals(self, resource: Feature) -> SliceIntervals:
+        """Get the slice intervals, taking them from ``resource`` if none have been set yet.
+
+        Args:
+            resource:
+                The resource for which slice intervals are requested. It is only consulted when no slice intervals
+                have been set, and only if its name equals that of the feature providing the slice intervals; the
+                intervals it yields are then stored on the slicer.
+
+        Raises:
+            SlicerNotSetUpError: If no slice intervals have been set and ``resource`` is not the providing feature.
+        """
+        if self.slice_intervals is not None:
+            return self.slice_intervals
+        provider = self.feature_providing_slice_intervals
+        feature_name = provider.name if isinstance(provider, type) else provider
+        # strict test for the exact feature, not subclasses
+        if resource.name != feature_name:
+            raise SlicerNotSetUpError(self.dtype)
+        self.slice_intervals = resource.get_slice_intervals(level_name=self.level_name)
+        return self.slice_intervals
+
+
+class KeySlicer(AdjacencyGroupSlicer):
+    """Slices resources by key."""
+
+    feature_providing_slice_intervals = "KeyAnnotations"
+
+    def __init__(
+        self,
+        level_name: str = "localkey_slice",
+        slice_intervals: Optional[SliceIntervals] = None,
+        **kwargs,
+    ):
+        """Same as the parent's initializer, except that ``level_name`` defaults to ``localkey_slice``."""
+        super().__init__(
+            level_name=level_name, slice_intervals=slice_intervals, **kwargs
+        )