Skip to content

Commit

Permalink
Allow ambiguous entity-paths in group-by-item inputs.
Browse files Browse the repository at this point in the history
This allows an input like "entity_0__country" to match group-by-items with
longer paths like "entity_1__entity_0__country" if there's a single
group-by-item with the shortest entity-path in the set of available items.
  • Loading branch information
plypaul committed Jan 10, 2024
1 parent d7df8cd commit 5bff396
Show file tree
Hide file tree
Showing 18 changed files with 359 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def ui_description(self, associated_input: MetricFlowQueryResolverInput) -> str:
candidates_str.append(str(spec))
return (
f"The given input is ambiguous and can't be resolved. The input could match:\n\n"
f"{indent(mf_pformat(candidates_str))}"
f"{indent(mf_pformat(sorted(candidates_str)))}"
)

@override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Sequence

from typing_extensions import override

from metricflow.mf_logging.formatting import indent
from metricflow.mf_logging.pretty_print import mf_pformat
from metricflow.naming.object_builder_scheme import ObjectBuilderNamingScheme
from metricflow.query.group_by_item.candidate_push_down.group_by_item_candidate import GroupByItemCandidateSet
from metricflow.query.group_by_item.resolution_path import MetricFlowQueryResolutionPath
from metricflow.query.issues.issues_base import (
MetricFlowQueryIssueType,
MetricFlowQueryResolutionIssue,
)
from metricflow.query.resolver_inputs.base_resolver_inputs import MetricFlowQueryResolverInput


@dataclass(frozen=True)
class MultipleMatchIssue(MetricFlowQueryResolutionIssue):
"""Describes an issue during group-by-item resolution the input pattern matches multiple specs."""

candidate_set: GroupByItemCandidateSet

@staticmethod
def from_parameters( # noqa: D
query_resolution_path: MetricFlowQueryResolutionPath,
candidate_set: GroupByItemCandidateSet,
parent_issues: Sequence[MetricFlowQueryResolutionIssue],
) -> MultipleMatchIssue:
return MultipleMatchIssue(
issue_type=MetricFlowQueryIssueType.ERROR,
query_resolution_path=query_resolution_path,
candidate_set=candidate_set,
parent_issues=tuple(parent_issues),
)

@override
def ui_description(self, associated_input: MetricFlowQueryResolverInput) -> str:
last_path_item = self.query_resolution_path.last_item
naming_scheme = (
associated_input.input_pattern_description.naming_scheme
if associated_input.input_pattern_description is not None
else ObjectBuilderNamingScheme()
)

specs_as_strs = []
for spec in self.candidate_set.specs:
input_str = naming_scheme.input_str(spec)
if input_str is not None:
specs_as_strs.append(input_str)
else:
specs_as_strs.append(f"<{repr(spec)}>")

return (
f"The given input matches multiple group-by-items for {last_path_item.ui_description}:\n\n"
f"{indent(mf_pformat(specs_as_strs))}\n\n"
f"Please use a more specific input to resolve the ambiguity."
)

@override
def with_path_prefix(self, path_prefix: MetricFlowQueryResolutionPath) -> MultipleMatchIssue:
return MultipleMatchIssue(
issue_type=self.issue_type,
parent_issues=tuple(issue.with_path_prefix(path_prefix) for issue in self.parent_issues),
query_resolution_path=self.query_resolution_path.with_path_prefix(path_prefix),
candidate_set=self.candidate_set.with_path_prefix(path_prefix),
)
38 changes: 32 additions & 6 deletions metricflow/specs/patterns/entity_link_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,23 +81,49 @@ class EntityLinkPattern(SpecPattern):
item using a specified entity. Additional semantic models can be joined using additional entities to obtain the
group-by-item. The series of entities that are used form the entity path. Since the entity path does not specify
which semantic models need to be used, additional resolution is done in later stages to generate the necessary SQL.
The entity links that are specified is used as a suffix match.
"""

parameter_set: EntityLinkPatternParameterSet

def _match_entity_links(self, candidate_specs: Sequence[LinkableInstanceSpec]) -> Sequence[LinkableInstanceSpec]:
assert self.parameter_set.entity_links is not None
num_links_to_check = len(self.parameter_set.entity_links)
matching_specs: Sequence[LinkableInstanceSpec] = tuple(
candidate_spec
for candidate_spec in candidate_specs
if (
self.parameter_set.entity_links[-num_links_to_check:]
== candidate_spec.entity_links[-num_links_to_check:]
)
)

if len(matching_specs) <= 1:
return matching_specs

# If multiple match, then return only the ones with the shortest entity link path. There could be multiple
# e.g. booking__listing__country and listing__country will match with listing__country.
shortest_entity_link_length = min(len(matching_spec.entity_links) for matching_spec in matching_specs)
return tuple(spec for spec in matching_specs if len(spec.entity_links) == shortest_entity_link_length)

@override
def match(self, candidate_specs: Sequence[InstanceSpec]) -> Sequence[LinkableInstanceSpec]:
filtered_candidate_specs = InstanceSpecSet.from_specs(candidate_specs).linkable_specs
# Checks that EntityLinkPatternParameterSetField is valid wrt to the parameter set.

matching_specs: List[LinkableInstanceSpec] = []
# Entity links could be a partial match, so it's handled separately.
if ParameterSetField.ENTITY_LINKS in self.parameter_set.fields_to_compare:
filtered_candidate_specs = self._match_entity_links(filtered_candidate_specs)

# Using some Python introspection magic to figure out specs that match the listed fields.
keys_to_check = set(field_to_compare.value for field_to_compare in self.parameter_set.fields_to_compare)
# Checks that EntityLinkPatternParameterSetField is valid wrt to the parameter set.
parameter_set_values = tuple(getattr(self.parameter_set, key_to_check) for key_to_check in keys_to_check)
other_keys_to_check = set(
field_to_compare.value for field_to_compare in self.parameter_set.fields_to_compare
).difference({ParameterSetField.ENTITY_LINKS.value})

matching_specs: List[LinkableInstanceSpec] = []
parameter_set_values = tuple(getattr(self.parameter_set, key_to_check) for key_to_check in other_keys_to_check)
for spec in filtered_candidate_specs:
spec_values = tuple(getattr(spec, key_to_check, None) for key_to_check in keys_to_check)
spec_values = tuple(getattr(spec, key_to_check, None) for key_to_check in other_keys_to_check)
if spec_values == parameter_set_values:
matching_specs.append(spec)

Expand Down
14 changes: 14 additions & 0 deletions metricflow/test/fixtures/model_fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,3 +276,17 @@ def ambiguous_resolution_manifest_lookup( # noqa: D
ambiguous_resolution_manifest: PydanticSemanticManifest,
) -> SemanticManifestLookup:
return SemanticManifestLookup(ambiguous_resolution_manifest)


@pytest.fixture(scope="session")
def simple_multi_hop_join_manifest(template_mapping: Dict[str, str]) -> PydanticSemanticManifest: # noqa: D
build_result = load_semantic_manifest("simple_multi_hop_join_manifest", template_mapping)
return build_result.semantic_manifest


@pytest.fixture(scope="session")
def simple_multi_hop_join_manifest_lookup( # noqa: D
simple_multi_hop_join_manifest: PydanticSemanticManifest,
) -> SemanticManifestLookup:
"""Manifest used to test ambiguous resolution of group-by-items."""
return SemanticManifestLookup(simple_multi_hop_join_manifest)
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
semantic_model:
name: entity_0_dimension_source
description: Contains dimensions for "entity_0"

node_relation:
schema_name: $source_schema
alias: entity_0_dimension_table

dimensions:
- name: country
type: categorical

entities:
- name: entity_0
type: primary
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
---
semantic_model:
name: entity_1_and_entity_2_measure_source
description: A measure source associated with "entity_1" and "entity_2".

node_relation:
schema_name: $source_schema
alias: entity_1_and_entity_2_measure_table

defaults:
agg_time_dimension: ds

measures:
- name: entity_1_and_entity_2_measure
agg: sum
expr: "1"

dimensions:
- name: ds
type: time
type_params:
time_granularity: day

entities:
- name: composite_entity
type: primary
expr: entity_1 || entity_2
- name: entity_1
type: foreign
- name: entity_2
type: foreign
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
semantic_model:
name: entity_1_measure_source
description: A measure source associated with "entity_1".

node_relation:
schema_name: $source_schema
alias: entity_1_measure_table

defaults:
agg_time_dimension: ds

measures:
- name: entity_1_measure
agg: sum

dimensions:
- name: ds
type: time
type_params:
time_granularity: day

entities:
- name: entity_1
type: primary
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
semantic_model:
name: entity_1_to_entity_0_mapping_source
description: Maps "entity_1" to "entity_0"

node_relation:
schema_name: $source_schema
alias: entity_1_to_entity_0_mapping_table

entities:
- name: entity_1
type: primary
- name: entity_0
type: foreign
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
semantic_model:
name: entity_2_to_entity_0_mapping_source
description: Maps "entity_2" to "entity_0"

node_relation:
schema_name: $source_schema
alias: entity_2_to_entity_0_mapping_table

entities:
- name: entity_2
type: primary
- name: entity_0
type: foreign
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
---
metric:
name: entity_1_metric
description: Metric based on a measure defined with "entity_1"
type: simple
type_params:
measure: entity_1_measure
---
metric:
name: entity_1_and_entity_2_metric
description: Metric based on a measure defined with "entity_1" and "entity_2"
type: simple
type_params:
measure: entity_1_and_entity_2_measure
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
table_snapshot:
table_name: entity_0_dimension_source
column_definitions:
- name: entity_0
type: STRING
- name: country
type: STRING
rows:
- ["E0_0", "us"]
- ["E0_1", "ca"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
table_snapshot:
table_name: entity_1_and_entity_2_measure_table
column_definitions:
- name: entity_1
type: STRING
- name: entity_2
type: STRING
- name: ds
type: TIME
rows:
- ["E0_0", "E1_0", "2020-01-01"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
table_snapshot:
table_name: entity_1_measure_table
column_definitions:
- name: entity_1
type: STRING
- name: ds
type: TIME
rows:
- ["E1_0", "2020-01-01"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
table_snapshot:
table_name: entity_1_to_entity_0_table
column_definitions:
- name: entity_1
type: STRING
- name: entity_0
type: STRING
rows:
- ["E1_0", "E0_0"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
table_snapshot:
table_name: entity_2_measure_table
column_definitions:
- name: entity_2
type: STRING
- name: ds
type: TIME
rows:
- ["E2_0", "2020-01-01"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
table_snapshot:
table_name: entity_2_to_entity_0_mapping_table
column_definitions:
- name: entity_2
type: STRING
- name: entity_0
type: STRING
rows:
- ["E2_0", "E0_1"]
Loading

0 comments on commit 5bff396

Please sign in to comment.