From af045aa72a0bcbe4dc61025857fcb3ba7f499af0 Mon Sep 17 00:00:00 2001 From: Lucas Valente Date: Mon, 14 Oct 2024 14:31:51 +0200 Subject: [PATCH] Add validation for missing `metric_time` when querying to/through SCD (#1451) ## Summary This PR changes `MetricQueryValidationRule` to raise `ScdRequiresMetricTimeIssue` in case the user submits a query that uses an SCD but does not group by time. ## Implementation To make this happen, I: - refactored how `MetricQueryValidationRule` works to return a new `QueryAnalysisResult` and avoid creating a bunch of separate methods that return booleans - changed `MetricQueryValidationRule` to make it get the specs for all the joinable SCDs for the given metric, match the group by input against it and raise an issue if there's no `metric_time` in the query - created a new `LinkableElementProperty` called `SCD_HOP`, which indicates that you need to join to an SCD somewhere along the path to join to that element - changed the `ValidLinkableSpecResolver` to add `SCD_HOP` to all the linkable elements which go to/through SCDs - added tests for joins to an SCD and through an SCD --- .../unreleased/Fixes-20241011-161926.yaml | 6 + .../model/linkable_element_property.py | 2 + .../model/semantics/linkable_spec_resolver.py | 43 ++++++- .../model/semantics/metric_lookup.py | 13 +++ .../parsing/scd_requires_metric_time.py | 52 +++++++++ .../validation_rules/base_validation_rule.py | 5 +- .../metric_time_requirements.py | 106 ++++++++++++------ .../semantics/test_linkable_spec_resolver.py | 2 +- .../query/test_query_parser.py | 66 +++++++++-- ..._set_from_join_path_multi_hop__result0.txt | 96 ++++++++-------- ...cd_no_time_dimension_validation__error.txt | 17 +++ ...cd_no_time_dimension_validation__error.txt | 17 +++ 12 files changed, 326 insertions(+), 99 deletions(-) create mode 100644 .changes/unreleased/Fixes-20241011-161926.yaml create mode 100644 metricflow-semantics/metricflow_semantics/query/issues/parsing/scd_requires_metric_time.py create mode 100644 metricflow-semantics/tests_metricflow_semantics/snapshots/test_query_parser.py/str/test_join_through_scd_no_time_dimension_validation__error.txt create mode 100644 metricflow-semantics/tests_metricflow_semantics/snapshots/test_query_parser.py/str/test_join_to_scd_no_time_dimension_validation__error.txt diff --git a/.changes/unreleased/Fixes-20241011-161926.yaml b/.changes/unreleased/Fixes-20241011-161926.yaml new file mode 100644 index 0000000000..568b39b9ef --- /dev/null +++ b/.changes/unreleased/Fixes-20241011-161926.yaml @@ -0,0 +1,6 @@ +kind: Fixes +body: Add new validation that checks for SCDs in the join path to make grouping by `metric_time` required in this case. +time: 2024-10-11T16:19:26.928496+02:00 +custom: + Author: serramatutu + Issue: "1451" diff --git a/metricflow-semantics/metricflow_semantics/model/linkable_element_property.py b/metricflow-semantics/metricflow_semantics/model/linkable_element_property.py index 642a9b3449..ab43a1adff 100644 --- a/metricflow-semantics/metricflow_semantics/model/linkable_element_property.py +++ b/metricflow-semantics/metricflow_semantics/model/linkable_element_property.py @@ -29,6 +29,8 @@ class LinkableElementProperty(Enum): METRIC = "metric" # A time dimension with a DatePart. DATE_PART = "date_part" + # A linkable element that is itself part of an SCD model, or a linkable element that gets joined through another SCD model. + SCD_HOP = "scd_hop" @staticmethod def all_properties() -> FrozenSet[LinkableElementProperty]: # noqa: D102 diff --git a/metricflow-semantics/metricflow_semantics/model/semantics/linkable_spec_resolver.py b/metricflow-semantics/metricflow_semantics/model/semantics/linkable_spec_resolver.py index ceb230a73d..e9b142b4c0 100644 --- a/metricflow-semantics/metricflow_semantics/model/semantics/linkable_spec_resolver.py +++ b/metricflow-semantics/metricflow_semantics/model/semantics/linkable_spec_resolver.py @@ -193,6 +193,10 @@ def __init__( logger.debug(LazyFormat(lambda: f"Building valid group-by-item indexes took: {time.time() - start_time:.2f}s")) + def _is_semantic_model_scd(self, semantic_model: SemanticModel) -> bool: + """Whether the semantic model's underlying table is an SCD.""" + return any(dim.validity_params is not None for dim in semantic_model.dimensions) + def _generate_linkable_time_dimensions( self, semantic_model_origin: SemanticModelReference, @@ -294,6 +298,8 @@ def get_joinable_metrics_for_semantic_model( necessary. """ properties = frozenset({LinkableElementProperty.METRIC, LinkableElementProperty.JOINED}) + if self._is_semantic_model_scd(semantic_model): + properties = properties.union({LinkableElementProperty.SCD_HOP}) join_path_has_path_links = len(using_join_path.path_elements) > 0 if join_path_has_path_links: @@ -331,8 +337,15 @@ def _get_elements_in_semantic_model(self, semantic_model: SemanticModel) -> Link Elements related to metric_time are handled separately in _get_metric_time_elements(). Linkable metrics are not considered local to the semantic model since they always require a join. """ + semantic_model_is_scd = self._is_semantic_model_scd(semantic_model) + linkable_dimensions = [] linkable_entities = [] + + entity_properties = frozenset({LinkableElementProperty.LOCAL, LinkableElementProperty.ENTITY}) + if semantic_model_is_scd: + entity_properties = entity_properties.union({LinkableElementProperty.SCD_HOP}) + for entity in semantic_model.entities: linkable_entities.append( LinkableEntity.create( @@ -342,7 +355,7 @@ def _get_elements_in_semantic_model(self, semantic_model: SemanticModel) -> Link join_path=SemanticModelJoinPath( left_semantic_model_reference=semantic_model.reference, ), - properties=frozenset({LinkableElementProperty.LOCAL, LinkableElementProperty.ENTITY}), + properties=entity_properties, ) ) for entity_link in self._semantic_model_lookup.entity_links_for_local_elements(semantic_model): @@ -357,12 +370,15 @@ def _get_elements_in_semantic_model(self, semantic_model: SemanticModel) -> Link join_path=SemanticModelJoinPath( left_semantic_model_reference=semantic_model.reference, ), - properties=frozenset({LinkableElementProperty.LOCAL, LinkableElementProperty.ENTITY}), + properties=entity_properties, ) ) + dimension_properties = frozenset({LinkableElementProperty.LOCAL}) + if semantic_model_is_scd: + dimension_properties = dimension_properties.union({LinkableElementProperty.SCD_HOP}) + for entity_link in self._semantic_model_lookup.entity_links_for_local_elements(semantic_model): - dimension_properties = frozenset({LinkableElementProperty.LOCAL}) for dimension in semantic_model.dimensions: dimension_type = dimension.type if dimension_type is DimensionType.CATEGORICAL: @@ -464,6 +480,7 @@ def _get_metric_time_elements(self, measure_reference: Optional[MeasureReference defined_granularity: Optional[ExpandedTimeGranularity] = None if measure_reference: measure_semantic_model = self._get_semantic_model_for_measure(measure_reference) + semantic_model_is_scd = self._is_semantic_model_scd(measure_semantic_model) measure_agg_time_dimension_reference = measure_semantic_model.checked_agg_time_dimension_for_measure( measure_reference=measure_reference ) @@ -476,6 +493,7 @@ def _get_metric_time_elements(self, measure_reference: Optional[MeasureReference # If querying metric_time without metrics, will query from time spines. # Defaults to DAY granularity if available in time spines, else smallest available granularity. min_granularity = min(self._time_spine_sources.keys()) + semantic_model_is_scd = False possible_metric_time_granularities = tuple( ExpandedTimeGranularity.from_time_granularity(time_granularity) for time_granularity in TimeGranularity @@ -506,6 +524,8 @@ def _get_metric_time_elements(self, measure_reference: Optional[MeasureReference properties.add(LinkableElementProperty.DERIVED_TIME_GRANULARITY) if date_part: properties.add(LinkableElementProperty.DATE_PART) + if semantic_model_is_scd: + properties.add(LinkableElementProperty.SCD_HOP) linkable_dimension = LinkableDimension.create( defined_in_semantic_model=measure_semantic_model.reference if measure_semantic_model else None, element_name=MetricFlowReservedKeywords.METRIC_TIME.value, @@ -717,12 +737,25 @@ def create_linkable_element_set_from_join_path( join_path: SemanticModelJoinPath, ) -> LinkableElementSet: """Given the current path, generate the respective linkable elements from the last semantic model in the path.""" + semantic_model = self._semantic_model_lookup.get_by_reference(join_path.last_semantic_model_reference) + assert ( + semantic_model + ), f"Semantic model {join_path.last_semantic_model_reference.semantic_model_name} is in join path but does not exist in SemanticModelLookup" + properties = frozenset({LinkableElementProperty.JOINED}) if len(join_path.path_elements) > 1: properties = properties.union({LinkableElementProperty.MULTI_HOP}) - semantic_model = self._semantic_model_lookup.get_by_reference(join_path.last_semantic_model_reference) - assert semantic_model + # If any of the semantic models in the join path is an SCD, add SCD_HOP + for reference_to_derived_model in join_path.derived_from_semantic_models: + derived_model = self._semantic_model_lookup.get_by_reference(reference_to_derived_model) + assert ( + derived_model + ), f"Semantic model {reference_to_derived_model.semantic_model_name} is in join path but does not exist in SemanticModelLookup" + + if self._is_semantic_model_scd(derived_model): + properties = properties.union({LinkableElementProperty.SCD_HOP}) + break linkable_dimensions: List[LinkableDimension] = [] linkable_entities: List[LinkableEntity] = [] diff --git a/metricflow-semantics/metricflow_semantics/model/semantics/metric_lookup.py b/metricflow-semantics/metricflow_semantics/model/semantics/metric_lookup.py index 3af03fc209..d6c4fe3fe3 100644 --- a/metricflow-semantics/metricflow_semantics/model/semantics/metric_lookup.py +++ b/metricflow-semantics/metricflow_semantics/model/semantics/metric_lookup.py @@ -21,6 +21,7 @@ ) from metricflow_semantics.model.semantics.semantic_model_join_evaluator import MAX_JOIN_HOPS from metricflow_semantics.model.semantics.semantic_model_lookup import SemanticModelLookup +from metricflow_semantics.specs.instance_spec import LinkableInstanceSpec from metricflow_semantics.specs.time_dimension_spec import TimeDimensionSpec from metricflow_semantics.time.granularity import ExpandedTimeGranularity @@ -256,3 +257,15 @@ def _get_min_queryable_time_granularity(self, metric_reference: MetricReference) minimum_queryable_granularity = defined_time_granularity return minimum_queryable_granularity + + def get_joinable_scd_specs_for_metric(self, metric_reference: MetricReference) -> Sequence[LinkableInstanceSpec]: + """Get the SCDs that can be joined to a metric.""" + filter = LinkableElementFilter( + with_any_of=frozenset([LinkableElementProperty.SCD_HOP]), + ) + scd_elems = self.linkable_elements_for_metrics( + metric_references=(metric_reference,), + element_set_filter=filter, + ) + + return scd_elems.specs diff --git a/metricflow-semantics/metricflow_semantics/query/issues/parsing/scd_requires_metric_time.py b/metricflow-semantics/metricflow_semantics/query/issues/parsing/scd_requires_metric_time.py new file mode 100644 index 0000000000..0aabd5fc24 --- /dev/null +++ b/metricflow-semantics/metricflow_semantics/query/issues/parsing/scd_requires_metric_time.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +from collections.abc import Sequence +from dataclasses import dataclass + +from typing_extensions import override + +from metricflow_semantics.query.group_by_item.resolution_path import MetricFlowQueryResolutionPath +from metricflow_semantics.query.issues.issues_base import ( + MetricFlowQueryIssueType, + MetricFlowQueryResolutionIssue, +) +from metricflow_semantics.query.resolver_inputs.base_resolver_inputs import MetricFlowQueryResolverInput +from metricflow_semantics.specs.instance_spec import InstanceSpec + + +@dataclass(frozen=True) +class ScdRequiresMetricTimeIssue(MetricFlowQueryResolutionIssue): + """Describes an issue with a query that includes a SCD group by but does not include metric_time.""" + + scds_in_query: Sequence[InstanceSpec] + + @override + def ui_description(self, associated_input: MetricFlowQueryResolverInput) -> str: + dim_str = ", ".join(f"'{scd.qualified_name}'" for scd in self.scds_in_query) + return ( + "Your query contains the following group bys, which are SCDs or contain SCDs " + f"in the join path: [{dim_str}].\n\nA query containing SCDs must also contain " + "the metric_time dimension in order to join the SCD table to the valid time " + "range. Please add metric_time to the query and try again. If you're " + "using agg_time_dimension, use metric_time instead." + ) + + @override + def with_path_prefix(self, path_prefix: MetricFlowQueryResolutionPath) -> ScdRequiresMetricTimeIssue: + return ScdRequiresMetricTimeIssue( + issue_type=self.issue_type, + parent_issues=self.parent_issues, + query_resolution_path=self.query_resolution_path.with_path_prefix(path_prefix), + scds_in_query=self.scds_in_query, + ) + + @staticmethod + def from_parameters( # noqa: D102 + scds_in_query: Sequence[InstanceSpec], query_resolution_path: MetricFlowQueryResolutionPath + ) -> ScdRequiresMetricTimeIssue: + return ScdRequiresMetricTimeIssue( + issue_type=MetricFlowQueryIssueType.ERROR, + parent_issues=(), + query_resolution_path=query_resolution_path, + scds_in_query=scds_in_query, + ) diff --git a/metricflow-semantics/metricflow_semantics/query/validation_rules/base_validation_rule.py b/metricflow-semantics/metricflow_semantics/query/validation_rules/base_validation_rule.py index 7de6ebc161..ec3587472f 100644 --- a/metricflow-semantics/metricflow_semantics/query/validation_rules/base_validation_rule.py +++ b/metricflow-semantics/metricflow_semantics/query/validation_rules/base_validation_rule.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod from typing import Sequence -from dbt_semantic_interfaces.protocols import Metric, WhereFilterIntersection +from dbt_semantic_interfaces.protocols import WhereFilterIntersection from dbt_semantic_interfaces.references import MetricReference from metricflow_semantics.model.semantic_manifest_lookup import SemanticManifestLookup @@ -21,9 +21,6 @@ def __init__( # noqa: D107 self._manifest_lookup = manifest_lookup self._resolver_input_for_query = resolver_input_for_query - def _get_metric(self, metric_reference: MetricReference) -> Metric: - return self._manifest_lookup.metric_lookup.get_metric(metric_reference) - @abstractmethod def validate_metric_in_resolution_dag( self, diff --git a/metricflow-semantics/metricflow_semantics/query/validation_rules/metric_time_requirements.py b/metricflow-semantics/metricflow_semantics/query/validation_rules/metric_time_requirements.py index bbb0fd37c7..5b215562d8 100644 --- a/metricflow-semantics/metricflow_semantics/query/validation_rules/metric_time_requirements.py +++ b/metricflow-semantics/metricflow_semantics/query/validation_rules/metric_time_requirements.py @@ -1,12 +1,16 @@ from __future__ import annotations -from functools import cached_property -from typing import Sequence +from dataclasses import dataclass +from functools import lru_cache +from typing import List, Sequence from dbt_semantic_interfaces.enum_extension import assert_values_exhausted from dbt_semantic_interfaces.naming.keywords import METRIC_TIME_ELEMENT_NAME from dbt_semantic_interfaces.protocols import WhereFilterIntersection -from dbt_semantic_interfaces.references import MetricReference, TimeDimensionReference +from dbt_semantic_interfaces.references import ( + MetricReference, + TimeDimensionReference, +) from dbt_semantic_interfaces.type_enums import MetricType from typing_extensions import override @@ -19,18 +23,34 @@ from metricflow_semantics.query.issues.parsing.offset_metric_requires_metric_time import ( OffsetMetricRequiresMetricTimeIssue, ) -from metricflow_semantics.query.resolver_inputs.query_resolver_inputs import ResolverInputForQuery +from metricflow_semantics.query.issues.parsing.scd_requires_metric_time import ( + ScdRequiresMetricTimeIssue, +) +from metricflow_semantics.query.resolver_inputs.query_resolver_inputs import ( + ResolverInputForQuery, +) from metricflow_semantics.query.validation_rules.base_validation_rule import PostResolutionQueryValidationRule +from metricflow_semantics.specs.instance_spec import InstanceSpec from metricflow_semantics.specs.time_dimension_spec import TimeDimensionSpec +@dataclass(frozen=True) +class QueryItemsAnalysis: + """Contains data about which items a query contains.""" + + scds: Sequence[InstanceSpec] + has_metric_time: bool + has_agg_time_dimension: bool + + class MetricTimeQueryValidationRule(PostResolutionQueryValidationRule): """Validates cases where a query requires metric_time to be specified as a group-by-item. Currently, known cases are: * Cumulative metrics. - * Derived metrics with an offset time.g + * Derived metrics with an offset time. + * Slowly changing dimensions """ def __init__( # noqa: D107 @@ -46,30 +66,35 @@ def __init__( # noqa: D107 ) ) - @cached_property - def _group_by_items_include_metric_time(self) -> bool: - for group_by_item_input in self._resolver_input_for_query.group_by_item_inputs: - if group_by_item_input.spec_pattern.matches_any(self._metric_time_specs): - return True - - return False - - def _query_includes_metric_time_or_agg_time_dimension(self, metric_reference: MetricReference) -> bool: - return self._group_by_items_include_metric_time or self._group_by_items_include_agg_time_dimension( - query_resolver_input=self._resolver_input_for_query, metric_reference=metric_reference - ) - - def _group_by_items_include_agg_time_dimension( + @lru_cache + def _get_query_items_analysis( self, query_resolver_input: ResolverInputForQuery, metric_reference: MetricReference - ) -> bool: + ) -> QueryItemsAnalysis: + has_agg_time_dimension = False + has_metric_time = False + scds: List[InstanceSpec] = [] + valid_agg_time_dimension_specs = self._manifest_lookup.metric_lookup.get_valid_agg_time_dimensions_for_metric( metric_reference ) + + scd_specs = self._manifest_lookup.metric_lookup.get_joinable_scd_specs_for_metric(metric_reference) + for group_by_item_input in query_resolver_input.group_by_item_inputs: + if group_by_item_input.spec_pattern.matches_any(self._metric_time_specs): + has_metric_time = True + if group_by_item_input.spec_pattern.matches_any(valid_agg_time_dimension_specs): - return True + has_agg_time_dimension = True - return False + scd_matches = group_by_item_input.spec_pattern.match(scd_specs) + scds.extend(scd_matches) + + return QueryItemsAnalysis( + scds=scds, + has_metric_time=has_metric_time, + has_agg_time_dimension=has_agg_time_dimension, + ) @override def validate_metric_in_resolution_dag( @@ -77,11 +102,23 @@ def validate_metric_in_resolution_dag( metric_reference: MetricReference, resolution_path: MetricFlowQueryResolutionPath, ) -> MetricFlowQueryResolutionIssueSet: - metric = self._get_metric(metric_reference) + metric = self._manifest_lookup.metric_lookup.get_metric(metric_reference) + + query_items_analysis = self._get_query_items_analysis(self._resolver_input_for_query, metric_reference) + + issues = MetricFlowQueryResolutionIssueSet.empty_instance() - if metric.type is MetricType.SIMPLE or metric.type is MetricType.CONVERSION: - return MetricFlowQueryResolutionIssueSet.empty_instance() - elif metric.type is MetricType.CUMULATIVE: + # Queries that join to an SCD don't support direct references to agg_time_dimension, so we + # only check for metric_time. If we decide to support agg_time_dimension, we should add a check + if len(query_items_analysis.scds) > 0 and not query_items_analysis.has_metric_time: + issues = issues.add_issue( + ScdRequiresMetricTimeIssue.from_parameters( + scds_in_query=query_items_analysis.scds, + query_resolution_path=resolution_path, + ) + ) + + if metric.type is MetricType.CUMULATIVE: if ( metric.type_params is not None and metric.type_params.cumulative_type_params is not None @@ -89,34 +126,35 @@ def validate_metric_in_resolution_dag( metric.type_params.cumulative_type_params.window is not None or metric.type_params.cumulative_type_params.grain_to_date is not None ) - and not self._query_includes_metric_time_or_agg_time_dimension(metric_reference) + and not (query_items_analysis.has_metric_time or query_items_analysis.has_agg_time_dimension) ): - return MetricFlowQueryResolutionIssueSet.from_issue( + issues = issues.add_issue( CumulativeMetricRequiresMetricTimeIssue.from_parameters( metric_reference=metric_reference, query_resolution_path=resolution_path, ) ) - return MetricFlowQueryResolutionIssueSet.empty_instance() - elif metric.type is MetricType.RATIO or metric.type is MetricType.DERIVED: has_time_offset = any( input_metric.offset_window is not None or input_metric.offset_to_grain is not None for input_metric in metric.input_metrics ) - if has_time_offset and not self._query_includes_metric_time_or_agg_time_dimension(metric_reference): - return MetricFlowQueryResolutionIssueSet.from_issue( + if has_time_offset and not ( + query_items_analysis.has_metric_time or query_items_analysis.has_agg_time_dimension + ): + issues = issues.add_issue( OffsetMetricRequiresMetricTimeIssue.from_parameters( metric_reference=metric_reference, input_metrics=metric.input_metrics, query_resolution_path=resolution_path, ) ) - return MetricFlowQueryResolutionIssueSet.empty_instance() - else: + elif metric.type is not MetricType.SIMPLE and metric.type is not MetricType.CONVERSION: assert_values_exhausted(metric.type) + return issues + @override def validate_query_in_resolution_dag( self, diff --git a/metricflow-semantics/tests_metricflow_semantics/model/semantics/test_linkable_spec_resolver.py b/metricflow-semantics/tests_metricflow_semantics/model/semantics/test_linkable_spec_resolver.py index b322a591a8..e22dd99d17 100644 --- a/metricflow-semantics/tests_metricflow_semantics/model/semantics/test_linkable_spec_resolver.py +++ b/metricflow-semantics/tests_metricflow_semantics/model/semantics/test_linkable_spec_resolver.py @@ -172,7 +172,7 @@ def test_create_linkable_element_set_from_join_path_multi_hop( # noqa: D103 left_semantic_model_reference=SemanticModelReference("views_source"), path_elements=( SemanticModelJoinPathElement( - semantic_model_reference=SemanticModelReference("bookings"), + semantic_model_reference=SemanticModelReference("bookings_source"), join_on_entity=EntityReference("guest"), ), SemanticModelJoinPathElement( diff --git a/metricflow-semantics/tests_metricflow_semantics/query/test_query_parser.py b/metricflow-semantics/tests_metricflow_semantics/query/test_query_parser.py index 9b8ebbabf1..c1b62ed1b2 100644 --- a/metricflow-semantics/tests_metricflow_semantics/query/test_query_parser.py +++ b/metricflow-semantics/tests_metricflow_semantics/query/test_query_parser.py @@ -6,6 +6,7 @@ import pytest from _pytest.fixtures import FixtureRequest +from dbt_semantic_interfaces.implementations.semantic_manifest import PydanticSemanticManifest from dbt_semantic_interfaces.parsing.dir_to_model import parse_yaml_files_to_validation_ready_semantic_manifest from dbt_semantic_interfaces.parsing.objects import YamlConfigFile from dbt_semantic_interfaces.protocols import SemanticManifest @@ -29,7 +30,7 @@ EXAMPLE_PROJECT_CONFIGURATION_YAML_CONFIG_FILE, ) from metricflow_semantics.test_helpers.metric_time_dimension import MTD -from metricflow_semantics.test_helpers.snapshot_helpers import assert_object_snapshot_equal +from metricflow_semantics.test_helpers.snapshot_helpers import assert_object_snapshot_equal, assert_str_snapshot_equal logger = logging.getLogger(__name__) @@ -191,6 +192,12 @@ def revenue_query_parser() -> MetricFlowQueryParser: # noqa return query_parser_from_yaml([EXAMPLE_PROJECT_CONFIGURATION_YAML_CONFIG_FILE, revenue_yaml_file]) +@pytest.fixture +def scd_query_parser(scd_semantic_manifest: PydanticSemanticManifest) -> MetricFlowQueryParser: # noqa + semantic_manifest_lookup = SemanticManifestLookup(scd_semantic_manifest) + return MetricFlowQueryParser(semantic_manifest_lookup) + + def test_query_parser( # noqa: D103 request: FixtureRequest, mf_test_configuration: MetricFlowTestConfiguration, @@ -459,6 +466,52 @@ def test_cumulative_metric_agg_time_dimension_name_validation( assert_object_snapshot_equal(request=request, mf_test_configuration=mf_test_configuration, obj=result) +def test_join_to_scd_no_time_dimension_validation( + request: FixtureRequest, + mf_test_configuration: MetricFlowTestConfiguration, + scd_query_parser: MetricFlowQueryParser, +) -> None: + """Test that queries that join to SCD semantic models fail if no time dimensions are selected.""" + with pytest.raises(InvalidQueryException) as exc_info: + scd_query_parser.parse_and_validate_query( + metric_names=["bookings"], + group_by_names=["listing__country"], + ) + + assert len(exc_info.value.args) == 1 + assert isinstance(exc_info.value.args[0], str) + assert_str_snapshot_equal( + request=request, + mf_test_configuration=mf_test_configuration, + snapshot_id="error", + snapshot_str=exc_info.value.args[0], + ) + + +def test_join_through_scd_no_time_dimension_validation( + request: FixtureRequest, + mf_test_configuration: MetricFlowTestConfiguration, + scd_query_parser: MetricFlowQueryParser, +) -> None: + """Test that queries that join through SCDs semantic models fail if no time dimensions are selected.""" + with pytest.raises(InvalidQueryException) as exc_info: + # "user__home_state_latest" is not an SCD itself, but since we go through + # "listing" and that is an SCD, we should raise an exception here as well + scd_query_parser.parse_and_validate_query( + metric_names=["bookings"], + group_by_names=["listing__user__home_state_latest"], + ) + + assert len(exc_info.value.args) == 1 + assert isinstance(exc_info.value.args[0], str) + assert_str_snapshot_equal( + request=request, + mf_test_configuration=mf_test_configuration, + snapshot_id="error", + snapshot_str=exc_info.value.args[0], + ) + + def test_derived_metric_query_parsing( request: FixtureRequest, mf_test_configuration: MetricFlowTestConfiguration, @@ -607,12 +660,11 @@ def test_offset_metric_with_diff_agg_time_dims_error() -> None: # noqa: D103 def query_parser_from_yaml(yaml_contents: List[YamlConfigFile]) -> MetricFlowQueryParser: """Given yaml files, return a query parser using default source nodes, resolvers and time spine source.""" - semantic_manifest_lookup = SemanticManifestLookup( - parse_yaml_files_to_validation_ready_semantic_manifest( - yaml_contents, apply_transformations=True - ).semantic_manifest - ) - SemanticManifestValidator[SemanticManifest]().checked_validations(semantic_manifest_lookup.semantic_manifest) + semantic_manifest = parse_yaml_files_to_validation_ready_semantic_manifest( + yaml_contents, apply_transformations=True + ).semantic_manifest + semantic_manifest_lookup = SemanticManifestLookup(semantic_manifest) + SemanticManifestValidator[SemanticManifest]().checked_validations(semantic_manifest) return MetricFlowQueryParser( semantic_manifest_lookup=semantic_manifest_lookup, ) diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_linkable_spec_resolver.py/str/test_create_linkable_element_set_from_join_path_multi_hop__result0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_linkable_spec_resolver.py/str/test_create_linkable_element_set_from_join_path_multi_hop__result0.txt index ec9b2bff4b..d7235a54b4 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_linkable_spec_resolver.py/str/test_create_linkable_element_set_from_join_path_multi_hop__result0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_linkable_spec_resolver.py/str/test_create_linkable_element_set_from_join_path_multi_hop__result0.txt @@ -1,48 +1,48 @@ -Model Join-Path Entity Links Name Time Granularity Date Part Properties ------------------------------------------------ -------------------- --------------- ------------------ ----------- ---------------------------------------------------------------- -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') capacity_latest ['JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') country_latest ['JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at day ['JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at day DAY ['DATE_PART', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at day DOW ['DATE_PART', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at day DOY ['DATE_PART', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at day MONTH ['DATE_PART', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at day QUARTER ['DATE_PART', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at day YEAR ['DATE_PART', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at martian_day ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at month ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at month MONTH ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at month QUARTER ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at month YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at quarter ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at quarter QUARTER ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at quarter YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at week ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at week MONTH ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at week QUARTER ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at week YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at year ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') created_at year YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds day ['JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds day DAY ['DATE_PART', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds day DOW ['DATE_PART', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds day DOY ['DATE_PART', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds day MONTH ['DATE_PART', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds day QUARTER ['DATE_PART', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds day YEAR ['DATE_PART', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds martian_day ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds month ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds month MONTH ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds month QUARTER ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds month YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds quarter ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds quarter QUARTER ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds quarter YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds week ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds week MONTH ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds week QUARTER ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds week YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds year ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') ds year YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') is_lux_latest ['JOINED', 'MULTI_HOP'] -('views_source', 'bookings', 'listings_latest') ('guest', 'listing') user ['ENTITY', 'JOINED', 'MULTI_HOP'] +Model Join-Path Entity Links Name Time Granularity Date Part Properties +------------------------------------------------------ -------------------- --------------- ------------------ ----------- ---------------------------------------------------------------- +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') capacity_latest ['JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') country_latest ['JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at day ['JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at day DAY ['DATE_PART', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at day DOW ['DATE_PART', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at day DOY ['DATE_PART', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at day MONTH ['DATE_PART', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at day QUARTER ['DATE_PART', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at day YEAR ['DATE_PART', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at martian_day ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at month ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at month MONTH ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at month QUARTER ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at month YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at quarter ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at quarter QUARTER ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at quarter YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at week ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at week MONTH ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at week QUARTER ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at week YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at year ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') created_at year YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds day ['JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds day DAY ['DATE_PART', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds day DOW ['DATE_PART', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds day DOY ['DATE_PART', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds day MONTH ['DATE_PART', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds day QUARTER ['DATE_PART', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds day YEAR ['DATE_PART', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds martian_day ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds month ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds month MONTH ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds month QUARTER ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds month YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds quarter ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds quarter QUARTER ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds quarter YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds week ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds week MONTH ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds week QUARTER ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds week YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds year ['DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') ds year YEAR ['DATE_PART', 'DERIVED_TIME_GRANULARITY', 'JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') is_lux_latest ['JOINED', 'MULTI_HOP'] +('views_source', 'bookings_source', 'listings_latest') ('guest', 'listing') user ['ENTITY', 'JOINED', 'MULTI_HOP'] diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_query_parser.py/str/test_join_through_scd_no_time_dimension_validation__error.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_query_parser.py/str/test_join_through_scd_no_time_dimension_validation__error.txt new file mode 100644 index 0000000000..26dd08b74c --- /dev/null +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_query_parser.py/str/test_join_through_scd_no_time_dimension_validation__error.txt @@ -0,0 +1,17 @@ +Got errors while resolving the query. + +Error #1: + Message: + + Your query contains the following group bys, which are SCDs or contain SCDs in the join path: ['listing__user__home_state_latest']. + + A query containing SCDs must also contain the metric_time dimension in order to join the SCD table to the valid time range. Please add metric_time to the query and try again. If you're using agg_time_dimension, use metric_time instead. + + Query Input: + + Query(['bookings'], ['listing__user__home_state_latest'] + + Issue Location: + + [Resolve Query(['bookings'])] + -> [Resolve Metric('bookings')] diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_query_parser.py/str/test_join_to_scd_no_time_dimension_validation__error.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_query_parser.py/str/test_join_to_scd_no_time_dimension_validation__error.txt new file mode 100644 index 0000000000..39b0d7ee3f --- /dev/null +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_query_parser.py/str/test_join_to_scd_no_time_dimension_validation__error.txt @@ -0,0 +1,17 @@ +Got errors while resolving the query. + +Error #1: + Message: + + Your query contains the following group bys, which are SCDs or contain SCDs in the join path: ['listing__country']. + + A query containing SCDs must also contain the metric_time dimension in order to join the SCD table to the valid time range. Please add metric_time to the query and try again. If you're using agg_time_dimension, use metric_time instead. + + Query Input: + + Query(['bookings'], ['listing__country'] + + Issue Location: + + [Resolve Query(['bookings'])] + -> [Resolve Metric('bookings')]