From f640e2cf16124a5677971a8d8747315ccfb51113 Mon Sep 17 00:00:00 2001
From: Courtney Holcomb
Date: Fri, 26 Apr 2024 11:44:07 -0700
Subject: [PATCH] WIP - testing. separate out logic for deduping path keys

---
 .../model/semantics/linkable_spec_resolver.py | 62 ++++++++++++++++---
 .../builder/test_dataflow_plan_builder.py     | 54 ++++++++++++++++
 2 files changed, 107 insertions(+), 9 deletions(-)

diff --git a/metricflow/model/semantics/linkable_spec_resolver.py b/metricflow/model/semantics/linkable_spec_resolver.py
index 26f36af5a0..8794144943 100644
--- a/metricflow/model/semantics/linkable_spec_resolver.py
+++ b/metricflow/model/semantics/linkable_spec_resolver.py
@@ -120,8 +120,14 @@ def element_name(self) -> str:  # noqa: D102
         return self.reference.element_name
 
     @property
-    def path_key(self) -> ElementPathKey:  # noqa: D102
-        return ElementPathKey(element_name=self.element_name, entity_links=self.join_path.entity_links)
+    def path_key(self) -> ElementPathKey:
+        """Key that can uniquely identify an element and the joins used to realize the element.
+
+        Use the qualified name to guarantee uniqueness.
+        """
+        return ElementPathKey(
+            element_name=self.as_group_by_metric_spec.qualified_name, entity_links=self.join_path.entity_links
+        )
 
     @property
     def reference(self) -> MetricReference:  # noqa: D102
@@ -141,6 +147,14 @@ def metric_subquery_entity_links(self) -> Tuple[EntityReference, ...]:
         """Entity links used to join the metric to the entity it's grouped by in the metric subquery."""
         return self.join_path.metric_subquery_join_path_element.entity_links
 
+    @property
+    def as_group_by_metric_spec(self) -> GroupByMetricSpec:  # noqa: D102
+        return GroupByMetricSpec(
+            element_name=self.element_name,
+            entity_links=self.join_path.entity_links,
+            metric_subquery_entity_links=self.metric_subquery_entity_links,
+        )
+
 
 @dataclass(frozen=True)
 class LinkableElementSet:
@@ -256,7 +270,9 @@ def intersection_by_path_key(linkable_element_sets: Sequence[LinkableElementSet]
                 for path_key, entities in join_path_to_linkable_entities.items()
             },
             path_key_to_linkable_metrics={
-                path_key: tuple(sorted(metrics, key=lambda linkable_metric: linkable_metric.element_name))
+                path_key: tuple(
+                    sorted(metrics, key=lambda linkable_metric: linkable_metric.as_group_by_metric_spec.qualified_name)
+                )
                 for path_key, metrics in join_path_to_linkable_metrics.items()
             },
         )
@@ -354,11 +370,7 @@ def as_spec_set(self) -> LinkableSpecSet:  # noqa: D102
                 for path_key in self.path_key_to_linkable_entities
             ),
             group_by_metric_specs=tuple(
-                GroupByMetricSpec(
-                    element_name=linkable_metric.element_name,
-                    entity_links=linkable_metric.join_path.entity_links,
-                    metric_subquery_entity_links=linkable_metric.metric_subquery_entity_links,
-                )
+                linkable_metric.as_group_by_metric_spec
                 for linkable_metrics in self.path_key_to_linkable_metrics.values()
                 for linkable_metric in linkable_metrics
            ),
@@ -627,10 +639,17 @@ def __init__(
             linkable_element_set_for_metric = self.get_linkable_elements_for_metrics([metric_reference])
             for linkable_entities in linkable_element_set_for_metric.path_key_to_linkable_entities.values():
                 for linkable_entity in linkable_entities:
+                    if (
+                        metric_reference.element_name == "bookings_per_listing"
+                        and linkable_entity.element_name == "user"
+                    ):
+                        print("metric bookings_per_listing can be queried with:", linkable_entity)
                     metric_subquery_join_path_element = MetricSubqueryJoinPathElement(
                         metric_reference=metric_reference,
                         join_on_entity=linkable_entity.reference,
-                        metric_to_entity_join_path=SemanticModelJoinPath(linkable_entity.join_path),
+                        metric_to_entity_join_path=(
+                            SemanticModelJoinPath(linkable_entity.join_path) if linkable_entity.join_path else None
+                        ),
                     )
                     self._joinable_metrics_for_entities[linkable_entity.reference].add(
                         metric_subquery_join_path_element
@@ -653,6 +672,23 @@ def __init__(
 
         logger.info(f"Building valid group-by-item indexes took: {time.time() - start_time:.2f}s")
 
+        print(
+            [
+                subq
+                for subq in self._joinable_metrics_for_entities[EntityReference("user")]
+                if subq.metric_reference.element_name == "bookings_per_listing"
+            ]
+        )
+        # One of these has an empty join path. Why? Would expect None in that case.
+
+        # When succeeding, we have:
+        # 1. MetricSubqueryJoinPathElement(metric_reference=MetricReference(element_name='bookings_per_listing'), join_on_entity=EntityReference(element_name='user'), metric_to_entity_join_path=SemanticModelJoinPath(path_elements=()))
+        # 2. MetricSubqueryJoinPathElement(metric_reference=MetricReference(element_name='bookings_per_listing'), join_on_entity=EntityReference(element_name='user'), metric_to_entity_join_path=SemanticModelJoinPath(path_elements=(SemanticModelJoinPathElement(semantic_model_reference=SemanticModelReference(semantic_model_name='listings_latest'), join_on_entity=EntityReference(element_name='listing')),)))
+        # A DIFFERENT ORDER FAILS!
+        # 1. Why does order matter? -> Are linkable specs overriding each other?
+        # 2. Why is there no metric_to_entity_join_path in the first? Is that expected, and does that matter?
+        # 3. Am I misreading the error? Is it on the subquery?
+
     def _metric_requires_metric_time(self, metric: Metric) -> bool:
         """Checks if the metric can only be queried with metric_time. Also checks input metrics.
 
@@ -1008,6 +1044,14 @@ def get_linkable_elements_for_metrics(
         without_any_of: FrozenSet[LinkableElementProperty] = frozenset(),
     ) -> LinkableElementSet:
         """Gets the valid linkable elements that are common to all requested metrics."""
+
+        # Is this working? Seems like maybe not for derived metrics.
+        # It's broken! It's only used in metadata queries, but it's wrong.
+        # Start a new PR to fix this.
+        # High level: we need to align these things:
+        # 1. DFP uses the same logic as group-by resolution.
+        # 2. Metadata queries - test that if you list dimensions + list entities for a given metric, you can successfully query each of them.
+        # That's a good place to start the next PR!
         linkable_element_sets = []
         for metric_reference in metric_references:
             element_sets = self._metric_to_linkable_element_sets.get(metric_reference.element_name)
diff --git a/tests/dataflow/builder/test_dataflow_plan_builder.py b/tests/dataflow/builder/test_dataflow_plan_builder.py
index 1d319ea2e3..2b415a441c 100644
--- a/tests/dataflow/builder/test_dataflow_plan_builder.py
+++ b/tests/dataflow/builder/test_dataflow_plan_builder.py
@@ -2,17 +2,20 @@
 
 import datetime
 import logging
+import string
 
 import pytest
 from _pytest.fixtures import FixtureRequest
 from dbt_semantic_interfaces.implementations.filters.where_filter import PydanticWhereFilter
 from dbt_semantic_interfaces.naming.keywords import METRIC_TIME_ELEMENT_NAME
+from dbt_semantic_interfaces.references import MeasureReference
 from dbt_semantic_interfaces.type_enums.time_granularity import TimeGranularity
 
 from metricflow.dataflow.builder.dataflow_plan_builder import DataflowPlanBuilder
 from metricflow.dataset.dataset import DataSet
 from metricflow.errors.errors import UnableToSatisfyQueryError
 from metricflow.filters.time_constraint import TimeRangeConstraint
+from metricflow.model.semantics.linkable_element_properties import LinkableElementProperty
 from metricflow.query.query_parser import MetricFlowQueryParser
 from metricflow.specs.column_assoc import ColumnAssociationResolver
 from metricflow.specs.specs import (
@@ -1319,3 +1322,54 @@ def test_metric_in_metric_where_filter(
         mf_test_configuration=mf_test_configuration,
         dag_graph=dataflow_plan,
     )
+
+
+def test_all_available_single_join_metric_filters(
+    dataflow_plan_builder: DataflowPlanBuilder, query_parser: MetricFlowQueryParser
+) -> None:
+    """Checks that all allowed metric filters build a dataflow plan without erroring (single-hop for both the inner and outer query)."""
+    for group_by_metric in dataflow_plan_builder._metric_lookup.linkable_elements_for_measure(
+        MeasureReference("listings"), without_any_of={LinkableElementProperty.MULTI_HOP}
+    ).as_spec_set.group_by_metric_specs:
+        entity_spec = group_by_metric.metric_subquery_entity_spec
+        if entity_spec.entity_links:
+            # TODO: Will handle in a different test.
+            continue
+        print("testing:", group_by_metric)
+        query_spec = query_parser.parse_and_validate_query(
+            metric_names=("listings",),
+            where_constraint=PydanticWhereFilter(
+                where_sql_template=string.Template("{{ Metric('$metric_name', ['$entity_name']) }} > 2").substitute(
+                    metric_name=group_by_metric.element_name, entity_name=entity_spec.element_name
+                ),
+            ),
+        )
+        dataflow_plan_builder.build_plan(query_spec)
+    assert 0
+
+
+def test_temp(dataflow_plan_builder: DataflowPlanBuilder, query_parser: MetricFlowQueryParser) -> None:
+    query_spec = query_parser.parse_and_validate_query(
+        metric_names=("listings",),
+        where_constraint=PydanticWhereFilter(
+            where_sql_template=string.Template("{{ Metric('$metric_name', ['$entity_name']) }} > 2").substitute(
+                metric_name="bookings_per_listing", entity_name="user"
+            ),
+        ),
+    )
+    dataflow_plan_builder.build_plan(query_spec)
+
+    # FLAKE
+    # 1. This combo should never be allowed - why is it?
+    # 2. But also, why does it not fail every time?
+    #    Let's investigate 2 first. Why does the order matter? Or does it? Is that a bad assumption?
+
+    assert 0
+
+
+def test_temp2(dataflow_plan_builder: DataflowPlanBuilder, query_parser: MetricFlowQueryParser) -> None:
+    # does this pass? does it flake?
+    query_spec = query_parser.parse_and_validate_query(metric_names=("bookings_per_listing",), group_by_names=("user",))
+    dataflow_plan_builder.build_plan(query_spec)
+    assert 0
+    # fails consistently - this combo should not be allowed
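
Note (not part of the patch): the subject line and the in-code investigation notes both circle the same question, namely why the surviving entry for a group-by metric depends on insertion order. Below is a minimal standalone sketch of that dedup problem; SketchLinkableMetric and its double-underscore qualified-name format are assumptions for illustration only, not MetricFlow's real types.

from dataclasses import dataclass
from typing import Dict, Tuple


@dataclass(frozen=True)
class SketchLinkableMetric:
    """Hypothetical stand-in for a linkable group-by metric; not MetricFlow's real class."""

    element_name: str
    entity_links: Tuple[str, ...]  # joins from the outer query to the entity
    metric_subquery_entity_links: Tuple[str, ...]  # joins inside the metric subquery

    @property
    def qualified_name(self) -> str:
        # Assumed dunder-style qualified name; the real format may differ.
        return "__".join(self.entity_links + self.metric_subquery_entity_links + (self.element_name,))


def index_metrics(
    metrics: Tuple[SketchLinkableMetric, ...], use_qualified_name: bool
) -> Dict[str, SketchLinkableMetric]:
    """Index metrics by key; on a key collision, later entries silently overwrite earlier ones."""
    return {(m.qualified_name if use_qualified_name else m.element_name): m for m in metrics}


# The same metric, reachable through two different join paths.
candidates = (
    SketchLinkableMetric("bookings_per_listing", ("user",), ()),
    SketchLinkableMetric("bookings_per_listing", ("user",), ("listing",)),
)

# Keyed on the bare element name, one path clobbers the other, and which one survives
# depends on iteration order, which is the flaky behavior the in-patch notes describe.
assert len(index_metrics(candidates, use_qualified_name=False)) == 1

# Keyed on the qualified name, both join paths stay addressable regardless of order.
assert len(index_metrics(candidates, use_qualified_name=True)) == 2

The patched path_key takes the second approach by keying on as_group_by_metric_spec.qualified_name, so two join paths to the same metric no longer collapse onto a single dictionary slot.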