From 906b58774eca0bb1fba68c5866e07d2533791222 Mon Sep 17 00:00:00 2001 From: Paul Yang Date: Thu, 27 Jul 2023 16:46:06 -0700 Subject: [PATCH 1/2] Add Primary Entity Prefix When Specifying Dimensions in the WhereFilter With the invariant that all dimensions are associated with a primary entity, require the specification of the entity when using dimensions in the where filter. e.g. ``` dimension('capacity_latest') > 10 -> dimension('listing__capacity_latest') > 10 ``` --- dbt_semantic_interfaces/naming/__init__.py | 0 dbt_semantic_interfaces/naming/dundered.py | 138 ++++++++++++++++++ dbt_semantic_interfaces/naming/keywords.py | 11 ++ .../parsing/where_filter_parser.py | 58 +++++++- .../simple_semantic_manifest/metrics.yaml | 24 +-- .../where_filter/test_parse_calls.py | 37 ++++- 6 files changed, 247 insertions(+), 21 deletions(-) create mode 100644 dbt_semantic_interfaces/naming/__init__.py create mode 100644 dbt_semantic_interfaces/naming/dundered.py create mode 100644 dbt_semantic_interfaces/naming/keywords.py diff --git a/dbt_semantic_interfaces/naming/__init__.py b/dbt_semantic_interfaces/naming/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/dbt_semantic_interfaces/naming/dundered.py b/dbt_semantic_interfaces/naming/dundered.py new file mode 100644 index 00000000..3425305b --- /dev/null +++ b/dbt_semantic_interfaces/naming/dundered.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +import logging +from dataclasses import dataclass +from typing import Optional, Tuple + +from dbt_semantic_interfaces.naming.keywords import DUNDER +from dbt_semantic_interfaces.references import EntityReference +from dbt_semantic_interfaces.type_enums.time_granularity import TimeGranularity + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class StructuredDunderedName: + """Group by items (e.g. dimensions / entities) in a query that are named using a double underscore as a seperator. + + e.g. listing__ds__week -> + entity_links: ["listing"] + element_name: "ds" + granularity: TimeGranularity.WEEK + + The time granularity is part of legacy query syntax and there are plans to migrate away from this format. + """ + + entity_links: Tuple[EntityReference, ...] + element_name: str + time_granularity: Optional[TimeGranularity] = None + + @staticmethod + def parse_name(name: str) -> StructuredDunderedName: + """Construct from a string like 'listing__ds__month'.""" + name_parts = name.split(DUNDER) + + # No dunder, e.g. "ds" + if len(name_parts) == 1: + return StructuredDunderedName((), name_parts[0]) + + associated_granularity = None + granularity: TimeGranularity + for granularity in TimeGranularity: + if name_parts[-1] == granularity.value: + associated_granularity = granularity + + # Has a time granularity + if associated_granularity: + # e.g. "ds__month" + if len(name_parts) == 2: + return StructuredDunderedName((), name_parts[0], associated_granularity) + # e.g. "messages__ds__month" + return StructuredDunderedName( + entity_links=tuple(EntityReference(element_name=entity_name) for entity_name in name_parts[:-2]), + element_name=name_parts[-2], + time_granularity=associated_granularity, + ) + # e.g. "messages__ds" + else: + return StructuredDunderedName( + entity_links=tuple(EntityReference(element_name=entity_name) for entity_name in name_parts[:-1]), + element_name=name_parts[-1], + ) + + @property + def dundered_name(self) -> str: + """Return the full name form. e.g. ds or listing__ds__month.""" + items = [entity_reference.element_name for entity_reference in self.entity_links] + [self.element_name] + if self.time_granularity and self.time_granularity != TimeGranularity.DAY: + items.append(self.time_granularity.value) + return DUNDER.join(items) + + @property + def dundered_name_without_granularity(self) -> str: + """Return the name without the time granularity. e.g. listing__ds__month -> listing__ds.""" + return DUNDER.join( + tuple(entity_reference.element_name for entity_reference in self.entity_links) + (self.element_name,) + ) + + @property + def dundered_name_without_entity(self) -> str: + """Return the name without the entity. e.g. listing__ds__month -> ds__month.""" + return DUNDER.join((self.element_name,) + ((self.time_granularity.value,) if self.time_granularity else ())) + + @property + def entity_prefix(self) -> Optional[str]: + """Return the entity prefix. e.g. listing__ds__month -> listing.""" + if len(self.entity_links) > 0: + return DUNDER.join(tuple(entity_reference.element_name for entity_reference in self.entity_links)) + + return None + + +class DunderedNameFormatter: + """Helps to parse names into StructuredDunderedName and vice versa.""" + + @staticmethod + def parse_name(name: str) -> StructuredDunderedName: + """Construct from a string like 'listing__ds__month'.""" + name_parts = name.split(DUNDER) + + # No dunder, e.g. "ds" + if len(name_parts) == 1: + return StructuredDunderedName((), name_parts[0]) + + associated_granularity = None + granularity: TimeGranularity + for granularity in TimeGranularity: + if name_parts[-1] == granularity.value: + associated_granularity = granularity + + # Has a time granularity + if associated_granularity: + # e.g. "ds__month" + if len(name_parts) == 2: + return StructuredDunderedName((), name_parts[0], associated_granularity) + # e.g. "messages__ds__month" + return StructuredDunderedName( + entity_links=tuple(EntityReference(element_name=entity_name) for entity_name in name_parts[:-2]), + element_name=name_parts[-2], + time_granularity=associated_granularity, + ) + # e.g. "messages__ds" + else: + return StructuredDunderedName( + entity_links=tuple(EntityReference(element_name=entity_name) for entity_name in name_parts[:-1]), + element_name=name_parts[-1], + ) + + @staticmethod + def create_structured_name( # noqa: D + element_name: str, + entity_links: Tuple[EntityReference, ...] = (), + time_granularity: Optional[TimeGranularity] = None, + ) -> StructuredDunderedName: + return StructuredDunderedName( + entity_links=entity_links, + element_name=element_name, + time_granularity=time_granularity, + ) diff --git a/dbt_semantic_interfaces/naming/keywords.py b/dbt_semantic_interfaces/naming/keywords.py new file mode 100644 index 00000000..b898809f --- /dev/null +++ b/dbt_semantic_interfaces/naming/keywords.py @@ -0,0 +1,11 @@ +# A double underscore used as a seperator in group by item names. +# e.g. user__country +DUNDER = "__" + +# The name for the time dimension used to tabulate / plot metrics. +METRIC_TIME_ELEMENT_NAME = "metric_time" + + +def is_metric_time_name(element_name: str) -> bool: + """Returns True if the given element name corresponds to metric time.""" + return element_name == METRIC_TIME_ELEMENT_NAME diff --git a/dbt_semantic_interfaces/parsing/where_filter_parser.py b/dbt_semantic_interfaces/parsing/where_filter_parser.py index 8e5f3e9d..71bc893a 100644 --- a/dbt_semantic_interfaces/parsing/where_filter_parser.py +++ b/dbt_semantic_interfaces/parsing/where_filter_parser.py @@ -13,6 +13,11 @@ ParseWhereFilterException, TimeDimensionCallParameterSet, ) +from dbt_semantic_interfaces.naming.dundered import DunderedNameFormatter +from dbt_semantic_interfaces.naming.keywords import ( + METRIC_TIME_ELEMENT_NAME, + is_metric_time_name, +) from dbt_semantic_interfaces.references import ( DimensionReference, EntityReference, @@ -24,6 +29,13 @@ class WhereFilterParser: """Parses the template in the WhereFilter into FilterCallParameterSets.""" + @staticmethod + def _exception_message_for_incorrect_format(element_name: str) -> str: + return ( + f"Name is in an incorrect format: '{element_name}'. It should be of the form: " + f"__" + ) + @staticmethod def parse_call_parameter_sets(where_sql_template: str) -> FilterCallParameterSets: """Return the result of extracting the semantic objects referenced in the where SQL template string.""" @@ -38,10 +50,19 @@ def parse_call_parameter_sets(where_sql_template: str) -> FilterCallParameterSet def _dimension_call(dimension_name: str, entity_path: Sequence[str] = ()) -> str: """Gets called by Jinja when rendering {{ dimension(...) }}.""" + group_by_item_name = DunderedNameFormatter.parse_name(dimension_name) + if len(group_by_item_name.entity_links) != 1: + raise ParseWhereFilterException( + WhereFilterParser._exception_message_for_incorrect_format(dimension_name) + ) + dimension_call_parameter_sets.append( DimensionCallParameterSet( - dimension_reference=DimensionReference(element_name=dimension_name), - entity_path=tuple(EntityReference(element_name=arg) for arg in entity_path), + dimension_reference=DimensionReference(element_name=group_by_item_name.element_name), + entity_path=( + tuple(EntityReference(element_name=arg) for arg in entity_path) + + group_by_item_name.entity_links + ), ) ) return _DUMMY_PLACEHOLDER @@ -50,10 +71,32 @@ def _time_dimension_call( time_dimension_name: str, time_granularity_name: str, entity_path: Sequence[str] = () ) -> str: """Gets called by Jinja when rendering {{ time_dimension(...) }}.""" + group_by_item_name = DunderedNameFormatter.parse_name(time_dimension_name) + + # metric_time is the only time dimension that does not have an associated primary entity, so the + # GroupByItemName would not have any entity links. + if is_metric_time_name(group_by_item_name.element_name): + if len(group_by_item_name.entity_links) != 0 or group_by_item_name.time_granularity is not None: + raise ParseWhereFilterException( + WhereFilterParser._exception_message_for_incorrect_format( + f"Name is in an incorrect format: {time_dimension_name} " + f"When referencing {METRIC_TIME_ELEMENT_NAME}, the name should not have any dunders." + ) + ) + + else: + if len(group_by_item_name.entity_links) != 1 or group_by_item_name.time_granularity is not None: + raise ParseWhereFilterException( + WhereFilterParser._exception_message_for_incorrect_format(time_dimension_name) + ) + time_dimension_call_parameter_sets.append( TimeDimensionCallParameterSet( - time_dimension_reference=TimeDimensionReference(element_name=time_dimension_name), - entity_path=tuple(EntityReference(element_name=arg) for arg in entity_path), + time_dimension_reference=TimeDimensionReference(element_name=group_by_item_name.element_name), + entity_path=( + tuple(EntityReference(element_name=arg) for arg in entity_path) + + group_by_item_name.entity_links + ), time_granularity=TimeGranularity(time_granularity_name), ) ) @@ -61,6 +104,13 @@ def _time_dimension_call( def _entity_call(entity_name: str, entity_path: Sequence[str] = ()) -> str: """Gets called by Jinja when rendering {{ entity(...) }}.""" + group_by_item_name = DunderedNameFormatter.parse_name(entity_name) + if len(group_by_item_name.entity_links) > 0 or group_by_item_name.time_granularity is not None: + WhereFilterParser._exception_message_for_incorrect_format( + f"Name is in an incorrect format: {entity_name} " + f"When referencing entities, the name should not have any dunders." + ) + entity_call_parameter_sets.append( EntityCallParameterSet( entity_path=tuple(EntityReference(element_name=arg) for arg in entity_path), diff --git a/tests/fixtures/semantic_manifest_yamls/simple_semantic_manifest/metrics.yaml b/tests/fixtures/semantic_manifest_yamls/simple_semantic_manifest/metrics.yaml index 3c2ae835..57555077 100644 --- a/tests/fixtures/semantic_manifest_yamls/simple_semantic_manifest/metrics.yaml +++ b/tests/fixtures/semantic_manifest_yamls/simple_semantic_manifest/metrics.yaml @@ -54,7 +54,7 @@ metric: type_params: measure: name: booking_value - filter: "{{ dimension('is_instant') }}" + filter: "{{ dimension('booking__is_instant') }}" --- metric: name: "average_instant_booking_value" @@ -63,7 +63,7 @@ metric: type_params: measure: name: average_booking_value - filter: "{{ dimension('is_instant') }}" + filter: "{{ dimension('booking__is_instant') }}" --- metric: name: "booking_value_for_non_null_listing_id" @@ -113,7 +113,7 @@ metric: type_params: measure: name: listings - filter: "{{ dimension('is_lux_latest') }}" + filter: "{{ dimension('listing__is_lux_latest') }}" --- metric: name: "smallest_listing" @@ -276,7 +276,7 @@ metric: type_params: numerator: name: average_booking_value - filter: "{{ dimension('is_instant') }}" + filter: "{{ dimension('booking__is_instant') }}" denominator: name: max_booking_value --- @@ -289,7 +289,7 @@ metric: type_params: numerator: name: average_booking_value - filter: "{{ dimension('is_lux_latest', entity_path=['listing']) }}" + filter: "{{ dimension('listing__is_lux_latest', entity_path=['listing']) }}" denominator: name: max_booking_value --- @@ -303,9 +303,9 @@ metric: expr: "average_booking_value * bookings / NULLIF(booking_value, 0)" metrics: - name: average_booking_value - filter: "{{ dimension('is_lux_latest', entity_path=['listing']) }}" + filter: "{{ dimension('listing__is_lux_latest', entity_path=['listing']) }}" - name: bookings - filter: "{{ dimension('is_lux_latest', entity_path=['listing']) }}" + filter: "{{ dimension('listing__is_lux_latest', entity_path=['listing']) }}" - name: booking_value --- metric: @@ -317,7 +317,7 @@ metric: type_params: numerator: name: booking_value - filter: "{{ dimension('is_instant') }}" + filter: "{{ dimension('booking__is_instant') }}" alias: booking_value_with_is_instant_constraint denominator: name: booking_value @@ -331,11 +331,11 @@ metric: type_params: numerator: name: total_account_balance_first_day - filter: "{{ dimension('home_state_latest', entity_path=['user']) }} IN ('CA', 'HI', 'WA')" + filter: "{{ dimension('user__home_state_latest') }} IN ('CA', 'HI', 'WA')" alias: west_coast_balance_first_day denominator: name: total_account_balance_first_day - filter: "{{ dimension('home_state_latest', entity_path=['user']) }} IN ('MD', 'NY', 'TX')" + filter: "{{ dimension('user__home_state_latest') }} IN ('MD', 'NY', 'TX')" alias: east_coast_balance_first_dat --- metric: @@ -347,7 +347,7 @@ metric: expr: delayed_bookings * 2 metrics: - name: bookings - filter: "NOT {{ dimension('is_instant') }}" + filter: "NOT {{ dimension('booking__is_instant') }}" alias: delayed_bookings --- metric: @@ -398,7 +398,7 @@ metric: - name: bookings - name: listings alias: lux_listing - filter: "{{ dimension('is_lux_latest') }}" + filter: "{{ dimension('listing__is_lux_latest') }}" --- metric: name: "instant_plus_non_referred_bookings_pct" diff --git a/tests/implementations/where_filter/test_parse_calls.py b/tests/implementations/where_filter/test_parse_calls.py index 2729de41..7c113b53 100644 --- a/tests/implementations/where_filter/test_parse_calls.py +++ b/tests/implementations/where_filter/test_parse_calls.py @@ -22,7 +22,10 @@ def test_extract_dimension_call_parameter_sets() -> None: # noqa: D parse_result = PydanticWhereFilter( where_sql_template=( - """{{ dimension('is_instant') }} AND {{ dimension('country', entity_path=['listing']) }} == 'US'""" + """\ + {{ dimension('booking__is_instant') }} \ + AND {{ dimension('user__country', entity_path=['listing']) }} == 'US'\ + """ ) ).call_parameter_sets @@ -30,11 +33,14 @@ def test_extract_dimension_call_parameter_sets() -> None: # noqa: D dimension_call_parameter_sets=( DimensionCallParameterSet( dimension_reference=DimensionReference(element_name="is_instant"), - entity_path=(), + entity_path=(EntityReference("booking"),), ), DimensionCallParameterSet( - entity_path=(EntityReference(element_name="listing"),), dimension_reference=DimensionReference(element_name="country"), + entity_path=( + EntityReference("listing"), + EntityReference("user"), + ), ), ), entity_call_parameter_sets=(), @@ -43,14 +49,35 @@ def test_extract_dimension_call_parameter_sets() -> None: # noqa: D def test_extract_time_dimension_call_parameter_sets() -> None: # noqa: D parse_result = PydanticWhereFilter( - where_sql_template="""{{ time_dimension('created_at', 'month', entity_path=['listing']) }} = '2020-01-01'""" + where_sql_template=( + """{{ time_dimension('user__created_at', 'month', entity_path=['listing']) }} = '2020-01-01'""" + ) ).call_parameter_sets assert parse_result == FilterCallParameterSets( time_dimension_call_parameter_sets=( TimeDimensionCallParameterSet( time_dimension_reference=TimeDimensionReference(element_name="created_at"), - entity_path=(EntityReference(element_name="listing"),), + entity_path=( + EntityReference("listing"), + EntityReference("user"), + ), + time_granularity=TimeGranularity.MONTH, + ), + ) + ) + + +def test_extract_metric_time_dimension_call_parameter_sets() -> None: # noqa: D + parse_result = PydanticWhereFilter( + where_sql_template=("""{{ time_dimension('metric_time', 'month') }} = '2020-01-01'""") + ).call_parameter_sets + + assert parse_result == FilterCallParameterSets( + time_dimension_call_parameter_sets=( + TimeDimensionCallParameterSet( + time_dimension_reference=TimeDimensionReference(element_name="metric_time"), + entity_path=(), time_granularity=TimeGranularity.MONTH, ), ) From f322968dd1901fd98d98ebf11b58e2cc2d1391e2 Mon Sep 17 00:00:00 2001 From: Paul Yang Date: Thu, 27 Jul 2023 16:50:18 -0700 Subject: [PATCH 2/2] Add change log for #123. --- .changes/unreleased/Breaking Changes-20230727-165005.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .changes/unreleased/Breaking Changes-20230727-165005.yaml diff --git a/.changes/unreleased/Breaking Changes-20230727-165005.yaml b/.changes/unreleased/Breaking Changes-20230727-165005.yaml new file mode 100644 index 00000000..0b68f488 --- /dev/null +++ b/.changes/unreleased/Breaking Changes-20230727-165005.yaml @@ -0,0 +1,6 @@ +kind: Breaking Changes +body: Require primary entities when specifying dimensions in the WhereFilter. +time: 2023-07-27T16:50:05.653782-07:00 +custom: + Author: plypaul + Issue: "123"