diff --git a/metricflow-semantics/metricflow_semantics/naming/dunder_scheme.py b/metricflow-semantics/metricflow_semantics/naming/dunder_scheme.py index 53a56acd14..992b87d2f2 100644 --- a/metricflow-semantics/metricflow_semantics/naming/dunder_scheme.py +++ b/metricflow-semantics/metricflow_semantics/naming/dunder_scheme.py @@ -159,7 +159,9 @@ def transform(self, spec_set: InstanceSpecSet) -> Sequence[str]: items.append(time_dimension_spec.time_granularity.value) names_to_return.append(DUNDER.join(items)) - for other_group_by_item_specs in spec_set.entity_specs + spec_set.dimension_specs: + for other_group_by_item_specs in ( + spec_set.entity_specs + spec_set.dimension_specs + spec_set.group_by_metric_specs + ): items = list(entity_link.element_name for entity_link in other_group_by_item_specs.entity_links) + [ other_group_by_item_specs.element_name ] diff --git a/metricflow-semantics/metricflow_semantics/query/group_by_item/group_by_item_resolver.py b/metricflow-semantics/metricflow_semantics/query/group_by_item/group_by_item_resolver.py index 5bda159f03..dbc861467e 100644 --- a/metricflow-semantics/metricflow_semantics/query/group_by_item/group_by_item_resolver.py +++ b/metricflow-semantics/metricflow_semantics/query/group_by_item/group_by_item_resolver.py @@ -28,6 +28,7 @@ ) from metricflow_semantics.query.suggestion_generator import QueryItemSuggestionGenerator from metricflow_semantics.specs.patterns.base_time_grain import BaseTimeGrainPattern +from metricflow_semantics.specs.patterns.no_group_by_metric import NoGroupByMetricPattern from metricflow_semantics.specs.patterns.spec_pattern import SpecPattern from metricflow_semantics.specs.patterns.typed_patterns import TimeDimensionPattern from metricflow_semantics.specs.spec_classes import LinkableInstanceSpec @@ -87,7 +88,7 @@ def resolve_matching_item_for_querying( """ push_down_visitor = _PushDownGroupByItemCandidatesVisitor( manifest_lookup=self._manifest_lookup, - source_spec_patterns=(spec_pattern,), + source_spec_patterns=(spec_pattern, NoGroupByMetricPattern()), suggestion_generator=suggestion_generator, ) @@ -146,15 +147,12 @@ def resolve_matching_item_for_filters( suggestion_generator = QueryItemSuggestionGenerator( input_naming_scheme=ObjectBuilderNamingScheme(), input_str=input_str, - candidate_filters=QueryItemSuggestionGenerator.GROUP_BY_ITEM_CANDIDATE_FILTERS, + candidate_filters=QueryItemSuggestionGenerator.FILTER_ITEM_CANDIDATE_FILTERS, ) push_down_visitor = _PushDownGroupByItemCandidatesVisitor( manifest_lookup=self._manifest_lookup, - source_spec_patterns=( - spec_pattern, - BaseTimeGrainPattern(), - ), + source_spec_patterns=(spec_pattern, BaseTimeGrainPattern()), suggestion_generator=suggestion_generator, ) diff --git a/metricflow-semantics/metricflow_semantics/query/similarity.py b/metricflow-semantics/metricflow_semantics/query/similarity.py index d7b01657a1..c1eb6f4e24 100644 --- a/metricflow-semantics/metricflow_semantics/query/similarity.py +++ b/metricflow-semantics/metricflow_semantics/query/similarity.py @@ -23,6 +23,10 @@ def top_fuzzy_matches( Return scores from -1 -> 0 inclusive. """ + # In the case of a tie in score, items will be returned in the order they were passed in. + # Sort candidate item inputs first for consistent results. + sorted_candidate_items = sorted(candidate_items) + scored_items = [] # Rank choices by edit distance score. @@ -31,7 +35,7 @@ def top_fuzzy_matches( rapidfuzz.process.extract( # This scorer seems to return the best results. item, - list(candidate_items), + sorted_candidate_items, limit=max_matches, scorer=rapidfuzz.fuzz.token_set_ratio, ), diff --git a/metricflow-semantics/metricflow_semantics/query/suggestion_generator.py b/metricflow-semantics/metricflow_semantics/query/suggestion_generator.py index f89784ced7..ce53c50b74 100644 --- a/metricflow-semantics/metricflow_semantics/query/suggestion_generator.py +++ b/metricflow-semantics/metricflow_semantics/query/suggestion_generator.py @@ -6,6 +6,7 @@ from metricflow_semantics.naming.naming_scheme import QueryItemNamingScheme from metricflow_semantics.query.similarity import top_fuzzy_matches from metricflow_semantics.specs.patterns.base_time_grain import BaseTimeGrainPattern +from metricflow_semantics.specs.patterns.no_group_by_metric import NoGroupByMetricPattern from metricflow_semantics.specs.patterns.none_date_part import NoneDatePartPattern from metricflow_semantics.specs.patterns.spec_pattern import SpecPattern from metricflow_semantics.specs.spec_classes import InstanceSpec @@ -23,7 +24,12 @@ class QueryItemSuggestionGenerator: # Adding these filters so that we don't get multiple suggestions that are similar, but with different # grains. Some additional thought is needed to tweak this as the base grain may not be the best suggestion. - GROUP_BY_ITEM_CANDIDATE_FILTERS: Tuple[SpecPattern, ...] = (BaseTimeGrainPattern(), NoneDatePartPattern()) + FILTER_ITEM_CANDIDATE_FILTERS: Tuple[SpecPattern, ...] = (BaseTimeGrainPattern(), NoneDatePartPattern()) + GROUP_BY_ITEM_CANDIDATE_FILTERS: Tuple[SpecPattern, ...] = ( + BaseTimeGrainPattern(), + NoneDatePartPattern(), + NoGroupByMetricPattern(), + ) def __init__( # noqa: D107 self, input_naming_scheme: QueryItemNamingScheme, input_str: str, candidate_filters: Sequence[SpecPattern] @@ -42,12 +48,12 @@ def input_suggestions( candidate_specs = candidate_filter.match(candidate_specs) # Use edit distance to figure out the closest matches, so convert the specs to strings. - candidate_strs = [] + candidate_strs = set() for candidate_spec in candidate_specs: candidate_str = self._input_naming_scheme.input_str(candidate_spec) if candidate_str is not None: - candidate_strs.append(candidate_str) + candidate_strs.add(candidate_str) fuzzy_matches = top_fuzzy_matches( item=self._input_str, diff --git a/metricflow-semantics/metricflow_semantics/specs/patterns/no_group_by_metric.py b/metricflow-semantics/metricflow_semantics/specs/patterns/no_group_by_metric.py new file mode 100644 index 0000000000..1cbeb1fed9 --- /dev/null +++ b/metricflow-semantics/metricflow_semantics/specs/patterns/no_group_by_metric.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from typing import List, Sequence + +from typing_extensions import override + +from metricflow_semantics.specs.patterns.spec_pattern import SpecPattern +from metricflow_semantics.specs.spec_classes import ( + InstanceSpec, + LinkableInstanceSpec, +) +from metricflow_semantics.specs.spec_set import group_specs_by_type + + +class NoGroupByMetricPattern(SpecPattern): + """Matches to linkable specs, but only if they're not group by metrics. + + Group by metrics are allowed in filters but not in the query input group by. + """ + + @override + def match(self, candidate_specs: Sequence[InstanceSpec]) -> Sequence[LinkableInstanceSpec]: + specs_to_return: List[LinkableInstanceSpec] = [] + spec_set = group_specs_by_type(candidate_specs) + specs_to_return.extend(spec_set.time_dimension_specs) + specs_to_return.extend(spec_set.dimension_specs) + specs_to_return.extend(spec_set.entity_specs) + + return specs_to_return diff --git a/metricflow-semantics/metricflow_semantics/specs/patterns/none_date_part.py b/metricflow-semantics/metricflow_semantics/specs/patterns/none_date_part.py index e451948f5a..94879175d4 100644 --- a/metricflow-semantics/metricflow_semantics/specs/patterns/none_date_part.py +++ b/metricflow-semantics/metricflow_semantics/specs/patterns/none_date_part.py @@ -27,5 +27,6 @@ def match(self, candidate_specs: Sequence[InstanceSpec]) -> Sequence[LinkableIns specs_to_return.append(time_dimension_spec) specs_to_return.extend(spec_set.dimension_specs) specs_to_return.extend(spec_set.entity_specs) + specs_to_return.extend(spec_set.group_by_metric_specs) return specs_to_return diff --git a/metricflow-semantics/tests_metricflow_semantics/query/test_query_parser.py b/metricflow-semantics/tests_metricflow_semantics/query/test_query_parser.py index d5f82d4927..ecbf9ef0d3 100644 --- a/metricflow-semantics/tests_metricflow_semantics/query/test_query_parser.py +++ b/metricflow-semantics/tests_metricflow_semantics/query/test_query_parser.py @@ -640,3 +640,11 @@ def query_parser_from_yaml(yaml_contents: List[YamlConfigFile]) -> MetricFlowQue return MetricFlowQueryParser( semantic_manifest_lookup=semantic_manifest_lookup, ) + + +def test_invalid_group_by_metric(bookings_query_parser: MetricFlowQueryParser) -> None: + """Tests that a query for an invalid group by metric gives an appropriate group by metric suggestion.""" + with pytest.raises(InvalidQueryException, match="Metric\\('bookings', group_by=\\['listing'\\]\\)"): + bookings_query_parser.parse_and_validate_query( + metric_names=("bookings",), where_constraint_str="{{ Metric('listings', ['garbage']) }} > 1" + ) diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_filters_in_multi_metric_query__result_0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_filters_in_multi_metric_query__result_0.txt index 5e57d22dab..c2032697c8 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_filters_in_multi_metric_query__result_0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_filters_in_multi_metric_query__result_0.txt @@ -16,8 +16,8 @@ Error #1: [ "Dimension('listing__capacity_latest')", "TimeDimension('listing__created_at', 'day')", - "TimeDimension('listing__ds', 'day')", "Dimension('listing__is_lux_latest')", + "TimeDimension('listing__ds', 'day')", "TimeDimension('user__created_at', 'day')", "TimeDimension('user__ds_latest', 'day')", ] diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_where_filter__result_0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_where_filter__result_0.txt index 8a22260997..0bbbe1a1a0 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_where_filter__result_0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_where_filter__result_0.txt @@ -16,8 +16,8 @@ Error #1: [ "Dimension('listing__capacity_latest')", "TimeDimension('listing__created_at', 'day')", - "TimeDimension('listing__ds', 'day')", "Dimension('listing__is_lux_latest')", + "TimeDimension('listing__ds', 'day')", "Dimension('listing__country_latest')", "TimeDimension('user__created_at', 'day')", ] diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_metric__result_0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_metric__result_0.txt index 938b2778c6..3455c5270e 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_metric__result_0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_metric__result_0.txt @@ -6,7 +6,7 @@ Error #1: The given input does not exactly match any known metrics. Suggestions: - ['bookings', 'booking_fees', 'booking_value', 'instant_bookings', 'booking_payments', 'max_booking_value'] + ['bookings', 'booking_fees', 'booking_value', 'booking_payments', 'instant_bookings', 'booking_value_p99'] Query Input: diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_multiple_metrics__result_0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_multiple_metrics__result_0.txt index a07132f9fa..727cec2b57 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_multiple_metrics__result_0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_multiple_metrics__result_0.txt @@ -19,7 +19,7 @@ Error #1: 'listing__lux_listing', 'listing__is_lux_latest', 'listing__country_latest', - 'listing__created_at__day', + 'listing__capacity_latest', ] Query Input: