From 5023f345f10549e0fecaf8bb33fcb73a7ccb9cd9 Mon Sep 17 00:00:00 2001 From: Courtney Holcomb Date: Mon, 13 May 2024 08:48:32 -0700 Subject: [PATCH 1/6] Handle group by metrics in NoneDatePartPattern This pattern was filtering out group by metrics unintentionally. --- .../metricflow_semantics/specs/patterns/none_date_part.py | 1 + 1 file changed, 1 insertion(+) diff --git a/metricflow-semantics/metricflow_semantics/specs/patterns/none_date_part.py b/metricflow-semantics/metricflow_semantics/specs/patterns/none_date_part.py index e451948f5a..94879175d4 100644 --- a/metricflow-semantics/metricflow_semantics/specs/patterns/none_date_part.py +++ b/metricflow-semantics/metricflow_semantics/specs/patterns/none_date_part.py @@ -27,5 +27,6 @@ def match(self, candidate_specs: Sequence[InstanceSpec]) -> Sequence[LinkableIns specs_to_return.append(time_dimension_spec) specs_to_return.extend(spec_set.dimension_specs) specs_to_return.extend(spec_set.entity_specs) + specs_to_return.extend(spec_set.group_by_metric_specs) return specs_to_return From 5b45d3b0c3bd8b30a630f5430f82a1f2a5b362ca Mon Sep 17 00:00:00 2001 From: Courtney Holcomb Date: Mon, 13 May 2024 09:26:27 -0700 Subject: [PATCH 2/6] Dedupe filter suggestions Currently, metric filters have a parameter that's not captured in their input string (the outer query entity links). This means that you might end up with multiple input strings that look the same, so we need to dedupe those. These duplicates will look different once we add a new parameter to metric filters that allows users to specify the outer entity links. But there should be no harm in using a set here, either way. --- .../metricflow_semantics/query/suggestion_generator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metricflow-semantics/metricflow_semantics/query/suggestion_generator.py b/metricflow-semantics/metricflow_semantics/query/suggestion_generator.py index f89784ced7..bab7efd8bd 100644 --- a/metricflow-semantics/metricflow_semantics/query/suggestion_generator.py +++ b/metricflow-semantics/metricflow_semantics/query/suggestion_generator.py @@ -42,12 +42,12 @@ def input_suggestions( candidate_specs = candidate_filter.match(candidate_specs) # Use edit distance to figure out the closest matches, so convert the specs to strings. - candidate_strs = [] + candidate_strs = set() for candidate_spec in candidate_specs: candidate_str = self._input_naming_scheme.input_str(candidate_spec) if candidate_str is not None: - candidate_strs.append(candidate_str) + candidate_strs.add(candidate_str) fuzzy_matches = top_fuzzy_matches( item=self._input_str, From f2c78bd55c9cc26a78fbda7f8ecc7d38f5807930 Mon Sep 17 00:00:00 2001 From: Courtney Holcomb Date: Mon, 13 May 2024 09:01:07 -0700 Subject: [PATCH 3/6] Test metric filter suggestions --- .../tests_metricflow_semantics/query/test_query_parser.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/metricflow-semantics/tests_metricflow_semantics/query/test_query_parser.py b/metricflow-semantics/tests_metricflow_semantics/query/test_query_parser.py index d5f82d4927..ecbf9ef0d3 100644 --- a/metricflow-semantics/tests_metricflow_semantics/query/test_query_parser.py +++ b/metricflow-semantics/tests_metricflow_semantics/query/test_query_parser.py @@ -640,3 +640,11 @@ def query_parser_from_yaml(yaml_contents: List[YamlConfigFile]) -> MetricFlowQue return MetricFlowQueryParser( semantic_manifest_lookup=semantic_manifest_lookup, ) + + +def test_invalid_group_by_metric(bookings_query_parser: MetricFlowQueryParser) -> None: + """Tests that a query for an invalid group by metric gives an appropriate group by metric suggestion.""" + with pytest.raises(InvalidQueryException, match="Metric\\('bookings', group_by=\\['listing'\\]\\)"): + bookings_query_parser.parse_and_validate_query( + metric_names=("bookings",), where_constraint_str="{{ Metric('listings', ['garbage']) }} > 1" + ) From 49ff11f19bf6fe36a18ad47031f157c4a1656d6e Mon Sep 17 00:00:00 2001 From: Courtney Holcomb Date: Mon, 13 May 2024 09:17:18 -0700 Subject: [PATCH 4/6] Handle group by metrics in DunderNameTransform --- .../metricflow_semantics/naming/dunder_scheme.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/metricflow-semantics/metricflow_semantics/naming/dunder_scheme.py b/metricflow-semantics/metricflow_semantics/naming/dunder_scheme.py index 53a56acd14..992b87d2f2 100644 --- a/metricflow-semantics/metricflow_semantics/naming/dunder_scheme.py +++ b/metricflow-semantics/metricflow_semantics/naming/dunder_scheme.py @@ -159,7 +159,9 @@ def transform(self, spec_set: InstanceSpecSet) -> Sequence[str]: items.append(time_dimension_spec.time_granularity.value) names_to_return.append(DUNDER.join(items)) - for other_group_by_item_specs in spec_set.entity_specs + spec_set.dimension_specs: + for other_group_by_item_specs in ( + spec_set.entity_specs + spec_set.dimension_specs + spec_set.group_by_metric_specs + ): items = list(entity_link.element_name for entity_link in other_group_by_item_specs.entity_links) + [ other_group_by_item_specs.element_name ] From 347c23158fa2320e05fe4843f47f98a567d0e67d Mon Sep 17 00:00:00 2001 From: Courtney Holcomb Date: Mon, 13 May 2024 10:58:48 -0700 Subject: [PATCH 5/6] Add NoGroupByMetricPattern GroupByMetrics are allowed in filters, but not in group by items. Use new pattern to distinguish the two. --- .../group_by_item/group_by_item_resolver.py | 10 +++---- .../query/suggestion_generator.py | 8 ++++- .../specs/patterns/no_group_by_metric.py | 29 +++++++++++++++++++ 3 files changed, 40 insertions(+), 7 deletions(-) create mode 100644 metricflow-semantics/metricflow_semantics/specs/patterns/no_group_by_metric.py diff --git a/metricflow-semantics/metricflow_semantics/query/group_by_item/group_by_item_resolver.py b/metricflow-semantics/metricflow_semantics/query/group_by_item/group_by_item_resolver.py index 5bda159f03..dbc861467e 100644 --- a/metricflow-semantics/metricflow_semantics/query/group_by_item/group_by_item_resolver.py +++ b/metricflow-semantics/metricflow_semantics/query/group_by_item/group_by_item_resolver.py @@ -28,6 +28,7 @@ ) from metricflow_semantics.query.suggestion_generator import QueryItemSuggestionGenerator from metricflow_semantics.specs.patterns.base_time_grain import BaseTimeGrainPattern +from metricflow_semantics.specs.patterns.no_group_by_metric import NoGroupByMetricPattern from metricflow_semantics.specs.patterns.spec_pattern import SpecPattern from metricflow_semantics.specs.patterns.typed_patterns import TimeDimensionPattern from metricflow_semantics.specs.spec_classes import LinkableInstanceSpec @@ -87,7 +88,7 @@ def resolve_matching_item_for_querying( """ push_down_visitor = _PushDownGroupByItemCandidatesVisitor( manifest_lookup=self._manifest_lookup, - source_spec_patterns=(spec_pattern,), + source_spec_patterns=(spec_pattern, NoGroupByMetricPattern()), suggestion_generator=suggestion_generator, ) @@ -146,15 +147,12 @@ def resolve_matching_item_for_filters( suggestion_generator = QueryItemSuggestionGenerator( input_naming_scheme=ObjectBuilderNamingScheme(), input_str=input_str, - candidate_filters=QueryItemSuggestionGenerator.GROUP_BY_ITEM_CANDIDATE_FILTERS, + candidate_filters=QueryItemSuggestionGenerator.FILTER_ITEM_CANDIDATE_FILTERS, ) push_down_visitor = _PushDownGroupByItemCandidatesVisitor( manifest_lookup=self._manifest_lookup, - source_spec_patterns=( - spec_pattern, - BaseTimeGrainPattern(), - ), + source_spec_patterns=(spec_pattern, BaseTimeGrainPattern()), suggestion_generator=suggestion_generator, ) diff --git a/metricflow-semantics/metricflow_semantics/query/suggestion_generator.py b/metricflow-semantics/metricflow_semantics/query/suggestion_generator.py index bab7efd8bd..ce53c50b74 100644 --- a/metricflow-semantics/metricflow_semantics/query/suggestion_generator.py +++ b/metricflow-semantics/metricflow_semantics/query/suggestion_generator.py @@ -6,6 +6,7 @@ from metricflow_semantics.naming.naming_scheme import QueryItemNamingScheme from metricflow_semantics.query.similarity import top_fuzzy_matches from metricflow_semantics.specs.patterns.base_time_grain import BaseTimeGrainPattern +from metricflow_semantics.specs.patterns.no_group_by_metric import NoGroupByMetricPattern from metricflow_semantics.specs.patterns.none_date_part import NoneDatePartPattern from metricflow_semantics.specs.patterns.spec_pattern import SpecPattern from metricflow_semantics.specs.spec_classes import InstanceSpec @@ -23,7 +24,12 @@ class QueryItemSuggestionGenerator: # Adding these filters so that we don't get multiple suggestions that are similar, but with different # grains. Some additional thought is needed to tweak this as the base grain may not be the best suggestion. - GROUP_BY_ITEM_CANDIDATE_FILTERS: Tuple[SpecPattern, ...] = (BaseTimeGrainPattern(), NoneDatePartPattern()) + FILTER_ITEM_CANDIDATE_FILTERS: Tuple[SpecPattern, ...] = (BaseTimeGrainPattern(), NoneDatePartPattern()) + GROUP_BY_ITEM_CANDIDATE_FILTERS: Tuple[SpecPattern, ...] = ( + BaseTimeGrainPattern(), + NoneDatePartPattern(), + NoGroupByMetricPattern(), + ) def __init__( # noqa: D107 self, input_naming_scheme: QueryItemNamingScheme, input_str: str, candidate_filters: Sequence[SpecPattern] diff --git a/metricflow-semantics/metricflow_semantics/specs/patterns/no_group_by_metric.py b/metricflow-semantics/metricflow_semantics/specs/patterns/no_group_by_metric.py new file mode 100644 index 0000000000..1cbeb1fed9 --- /dev/null +++ b/metricflow-semantics/metricflow_semantics/specs/patterns/no_group_by_metric.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from typing import List, Sequence + +from typing_extensions import override + +from metricflow_semantics.specs.patterns.spec_pattern import SpecPattern +from metricflow_semantics.specs.spec_classes import ( + InstanceSpec, + LinkableInstanceSpec, +) +from metricflow_semantics.specs.spec_set import group_specs_by_type + + +class NoGroupByMetricPattern(SpecPattern): + """Matches to linkable specs, but only if they're not group by metrics. + + Group by metrics are allowed in filters but not in the query input group by. + """ + + @override + def match(self, candidate_specs: Sequence[InstanceSpec]) -> Sequence[LinkableInstanceSpec]: + specs_to_return: List[LinkableInstanceSpec] = [] + spec_set = group_specs_by_type(candidate_specs) + specs_to_return.extend(spec_set.time_dimension_specs) + specs_to_return.extend(spec_set.dimension_specs) + specs_to_return.extend(spec_set.entity_specs) + + return specs_to_return From fd3657e001e79d88368a3a4d30e1c471abaa9c7f Mon Sep 17 00:00:00 2001 From: Courtney Holcomb Date: Mon, 13 May 2024 10:55:05 -0700 Subject: [PATCH 6/6] Sort candidate items before getting similarity scores for consistent results You'll notice some of the snapshots here have changed. The fuzzy match scores for these items have not changed. What changed was the order of candidate items passed into the similarity scorer once group by metrics were added to the list of candidate items. The change in order for these snapshots is due to ties in score that are returned in the order they came in. Sort inputs to ensure consistent results in the future. --- .../metricflow_semantics/query/similarity.py | 6 +++++- ..._for_defined_filters_in_multi_metric_query__result_0.txt | 2 +- .../test_suggestions_for_defined_where_filter__result_0.txt | 2 +- .../str/test_suggestions_for_metric__result_0.txt | 2 +- .../str/test_suggestions_for_multiple_metrics__result_0.txt | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/metricflow-semantics/metricflow_semantics/query/similarity.py b/metricflow-semantics/metricflow_semantics/query/similarity.py index d7b01657a1..c1eb6f4e24 100644 --- a/metricflow-semantics/metricflow_semantics/query/similarity.py +++ b/metricflow-semantics/metricflow_semantics/query/similarity.py @@ -23,6 +23,10 @@ def top_fuzzy_matches( Return scores from -1 -> 0 inclusive. """ + # In the case of a tie in score, items will be returned in the order they were passed in. + # Sort candidate item inputs first for consistent results. + sorted_candidate_items = sorted(candidate_items) + scored_items = [] # Rank choices by edit distance score. @@ -31,7 +35,7 @@ def top_fuzzy_matches( rapidfuzz.process.extract( # This scorer seems to return the best results. item, - list(candidate_items), + sorted_candidate_items, limit=max_matches, scorer=rapidfuzz.fuzz.token_set_ratio, ), diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_filters_in_multi_metric_query__result_0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_filters_in_multi_metric_query__result_0.txt index 5e57d22dab..c2032697c8 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_filters_in_multi_metric_query__result_0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_filters_in_multi_metric_query__result_0.txt @@ -16,8 +16,8 @@ Error #1: [ "Dimension('listing__capacity_latest')", "TimeDimension('listing__created_at', 'day')", - "TimeDimension('listing__ds', 'day')", "Dimension('listing__is_lux_latest')", + "TimeDimension('listing__ds', 'day')", "TimeDimension('user__created_at', 'day')", "TimeDimension('user__ds_latest', 'day')", ] diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_where_filter__result_0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_where_filter__result_0.txt index 8a22260997..0bbbe1a1a0 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_where_filter__result_0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_where_filter__result_0.txt @@ -16,8 +16,8 @@ Error #1: [ "Dimension('listing__capacity_latest')", "TimeDimension('listing__created_at', 'day')", - "TimeDimension('listing__ds', 'day')", "Dimension('listing__is_lux_latest')", + "TimeDimension('listing__ds', 'day')", "Dimension('listing__country_latest')", "TimeDimension('user__created_at', 'day')", ] diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_metric__result_0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_metric__result_0.txt index 938b2778c6..3455c5270e 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_metric__result_0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_metric__result_0.txt @@ -6,7 +6,7 @@ Error #1: The given input does not exactly match any known metrics. Suggestions: - ['bookings', 'booking_fees', 'booking_value', 'instant_bookings', 'booking_payments', 'max_booking_value'] + ['bookings', 'booking_fees', 'booking_value', 'booking_payments', 'instant_bookings', 'booking_value_p99'] Query Input: diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_multiple_metrics__result_0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_multiple_metrics__result_0.txt index a07132f9fa..727cec2b57 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_multiple_metrics__result_0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_multiple_metrics__result_0.txt @@ -19,7 +19,7 @@ Error #1: 'listing__lux_listing', 'listing__is_lux_latest', 'listing__country_latest', - 'listing__created_at__day', + 'listing__capacity_latest', ] Query Input: