Skip to content

Commit

Permalink
feat: adds 'dimension_column' as argument for all Analyzers; enables …
Browse files Browse the repository at this point in the history
…default_analyzer for Metadata
  • Loading branch information
johentsch committed Dec 7, 2023
1 parent 024bf65 commit d192a07
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 16 deletions.
2 changes: 1 addition & 1 deletion src/dimcat/data/resources/dc.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ class DimcatResource(Resource, Generic[D]):
creating the resource, any row containing a missing value in one of the feature columns is dropped."""
# endregion column name class variables
# region associated object types
_default_analyzer: ClassVar[str] = "Proportions"
_default_analyzer: ClassVar[StepSpecs] = "Proportions"
"""Name of the Analyzer that is used by default for plotting the resource. Needs to return a :obj:`Result`."""
_extractable_features: ClassVar[Optional[Tuple[FeatureName, ...]]] = None
"""Tuple of :obj:`FeatureNames <FeatureName>` corresponding to the features that can be extracted from this
Expand Down
3 changes: 2 additions & 1 deletion src/dimcat/data/resources/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@


class Metadata(Feature):
    """Feature subclass that overrides the default analyzer and value column."""

    # Analyzer spec given as a dict: the "Proportions" analyzer with its
    # dimension column set to "length_qb".
    _default_analyzer = dict(dtype="Proportions", dimension_column="length_qb")
    # Column used as the value column by default.
    _default_value_column = "piece"


# region Annotations
Expand Down
2 changes: 1 addition & 1 deletion src/dimcat/data/resources/results.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def value_column(self, value_column: str):
@property
def x_column(self) -> str:
    """Name of the result column from which to create one marker per distinct value to show over the x-axis.

    Returns:
        ``value_column`` when line-of-fifths colors are in use or when no
        formatted column is set (``None`` or empty); otherwise ``formatted_column``.
    """
    # Falling back to the value column when formatted_column is falsy avoids
    # returning None/"" as a column name.
    if self.uses_line_of_fifths_colors or not self.formatted_column:
        return self.value_column
    return self.formatted_column
Expand Down
27 changes: 23 additions & 4 deletions src/dimcat/steps/analyzers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class Analyzer(FeatureProcessingStep):
The base class performs no analysis, instantiating it serves mere testing purpose.
"""

_dimension_column_name: ClassVar[Optional[str]] = None
_default_dimension_column: ClassVar[Optional[str]] = None
"""Name of a column, contained in the Results produced by this analyzer, containing some dimension,
e.g. one to be interpreted as quantity (durations, counts, etc.) or as color."""
_enum_type: ClassVar[Type[Enum]] = AnalyzerName
Expand Down Expand Up @@ -140,14 +140,32 @@ def __init__(
features: Optional[FeatureSpecs | Iterable[FeatureSpecs]] = None,
strategy: DispatchStrategy = DispatchStrategy.GROUPBY_APPLY,
smallest_unit: UnitOfAnalysis = UnitOfAnalysis.SLICE,
fill_na: Any = None,
dimension_column: str = None,
):
super().__init__(features=features)
self._strategy: DispatchStrategy = None
self.strategy = strategy
self._smallest_unit: UnitOfAnalysis = None
self.smallest_unit = smallest_unit
self.fill_na: Any = fill_na
self._dimension_column = None
self.dimension_column = dimension_column

@property
def dimension_column(self) -> Optional[str]:
    """Name of a column, contained in the Results produced by this analyzer, containing some dimension,
    e.g. one to be interpreted as quantity (durations, counts, etc.) or as color."""
    return self._dimension_column

@dimension_column.setter
def dimension_column(self, dimension_column: Optional[str]):
    """Set the dimension column name.

    Args:
        dimension_column: Column name to use. ``None`` resets the value to the
            class-level ``_default_dimension_column``.

    Raises:
        TypeError: If ``dimension_column`` is neither ``None`` nor a string.
    """
    if dimension_column is None:
        # No explicit choice: fall back to the class default.
        self._dimension_column = self._default_dimension_column
        return
    if not isinstance(dimension_column, str):
        raise TypeError(
            f"dimension_column must be a string, not {type(dimension_column)}"
        )
    self._dimension_column = dimension_column

@property
def strategy(self) -> DispatchStrategy:
Expand Down Expand Up @@ -183,10 +201,11 @@ def _make_new_resource(self, resource: Feature) -> Result:
formatted_column = resource.formatted_column
else:
formatted_column = None
print("DIMCOL: ", self.dimension_column)
result = result_constructor.from_dataframe(
analyzed_resource=resource,
value_column=value_column,
dimension_column=self._dimension_column_name,
dimension_column=self.dimension_column,
formatted_column=formatted_column,
df=results,
resource_name=result_name,
Expand Down
15 changes: 9 additions & 6 deletions src/dimcat/steps/analyzers/counters.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


class Counter(Analyzer):
_dimension_column_name = "count"
_default_dimension_column = "count"
_new_resource_type = Counts

@staticmethod
Expand All @@ -25,12 +25,15 @@ def compute(feature: Feature, **kwargs) -> D:
and feature.formatted_column not in groupby
):
groupby.append(feature.formatted_column)
result = feature.groupby(groupby)[Counter._dimension_column_name].value_counts(
dropna=False
)
result = result.to_frame(Counter._dimension_column_name)
result = feature.groupby(groupby)[
Counter._default_dimension_column
].value_counts(dropna=False)
result = result.to_frame(Counter._default_dimension_column)
return result

class Schema(Analyzer.Schema):
# Value used for dimension_column when the field is missing on load. It repeats the
# "count" literal assigned to Counter._default_dimension_column; keep the two in sync.
dimension_column = mm.fields.Str(load_default="count")

def groupby_apply(self, feature: Feature, groupby: SomeSeries = None, **kwargs):
"""Performs the computation on a groupby. The value of ``groupby`` needs to be
a Series of the same length as ``feature`` or otherwise work as positional argument to feature.groupby().
Expand All @@ -47,7 +50,7 @@ def groupby_apply(self, feature: Feature, groupby: SomeSeries = None, **kwargs):
):
groupby.append(feature.formatted_column)
result = feature.groupby(groupby).size()
result = result.to_frame(self._dimension_column_name)
result = result.to_frame(self.dimension_column)

return result

Expand Down
10 changes: 7 additions & 3 deletions src/dimcat/steps/analyzers/proportions.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging

import marshmallow as mm
from dimcat.data.resources.base import D, FeatureName, SomeSeries
from dimcat.data.resources.dc import DimcatResource, Feature
from dimcat.data.resources.results import Durations
Expand All @@ -10,7 +11,7 @@


class Proportions(Analyzer):
_dimension_column_name = "duration_qb"
_default_dimension_column = "duration_qb"
_new_resource_type = Durations

@staticmethod
Expand All @@ -22,13 +23,16 @@ def compute(feature: Feature, **kwargs) -> D:
):
groupby.append(feature.formatted_column)
result = (
feature.groupby(groupby)[Proportions._dimension_column_name]
feature.groupby(groupby)[Proportions._default_dimension_column]
.sum()
.astype(float)
)
result = result.to_frame()
return result

class Schema(Analyzer.Schema):
# Value used for dimension_column when the field is missing on load. It repeats the
# "duration_qb" literal assigned to Proportions._default_dimension_column; keep the two in sync.
dimension_column = mm.fields.Str(load_default="duration_qb")

def check_resource(self, resource: DimcatResource) -> None:
"""Check if the resource has a value column."""
super().check_resource(resource)
Expand All @@ -53,7 +57,7 @@ def groupby_apply(self, feature: Feature, groupby: SomeSeries = None, **kwargs):
):
groupby.append(feature.formatted_column)
result = (
feature.groupby(groupby, group_keys=False)[self._dimension_column_name]
feature.groupby(groupby, group_keys=False)[self.dimension_column]
.sum()
.astype(float)
)
Expand Down

0 comments on commit d192a07

Please sign in to comment.