From 223b0c1501b6b2829946b81f5f980fbbab225175 Mon Sep 17 00:00:00 2001 From: Courtney Holcomb Date: Mon, 13 May 2024 10:55:05 -0700 Subject: [PATCH] Sort candidate items before getting similarity scores for consistent results You'll notice some of the snapshots here have changed. The fuzzy match scores for these items have not changed. What changed was the order of candidate items passed into the similarity scorer once group by metrics were added to the list of candidate items. The change in order for these snapshots is due to ties in score that are returned in the order they came in. Sort inputs to ensure consistent results in the future. --- .../metricflow_semantics/query/similarity.py | 6 +++++- ..._for_defined_filters_in_multi_metric_query__result_0.txt | 2 +- .../test_suggestions_for_defined_where_filter__result_0.txt | 2 +- .../str/test_suggestions_for_metric__result_0.txt | 2 +- .../str/test_suggestions_for_multiple_metrics__result_0.txt | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/metricflow-semantics/metricflow_semantics/query/similarity.py b/metricflow-semantics/metricflow_semantics/query/similarity.py index d7b01657a1..c1eb6f4e24 100644 --- a/metricflow-semantics/metricflow_semantics/query/similarity.py +++ b/metricflow-semantics/metricflow_semantics/query/similarity.py @@ -23,6 +23,10 @@ def top_fuzzy_matches( Return scores from -1 -> 0 inclusive. """ + # In the case of a tie in score, items will be returned in the order they were passed in. + # Sort candidate item inputs first for consistent results. + sorted_candidate_items = sorted(candidate_items) + scored_items = [] # Rank choices by edit distance score. @@ -31,7 +35,7 @@ def top_fuzzy_matches( rapidfuzz.process.extract( # This scorer seems to return the best results. item, - list(candidate_items), + sorted_candidate_items, limit=max_matches, scorer=rapidfuzz.fuzz.token_set_ratio, ), diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_filters_in_multi_metric_query__result_0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_filters_in_multi_metric_query__result_0.txt index 5e57d22dab..c2032697c8 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_filters_in_multi_metric_query__result_0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_filters_in_multi_metric_query__result_0.txt @@ -16,8 +16,8 @@ Error #1: [ "Dimension('listing__capacity_latest')", "TimeDimension('listing__created_at', 'day')", - "TimeDimension('listing__ds', 'day')", "Dimension('listing__is_lux_latest')", + "TimeDimension('listing__ds', 'day')", "TimeDimension('user__created_at', 'day')", "TimeDimension('user__ds_latest', 'day')", ] diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_where_filter__result_0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_where_filter__result_0.txt index 8a22260997..0bbbe1a1a0 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_where_filter__result_0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_defined_where_filter__result_0.txt @@ -16,8 +16,8 @@ Error #1: [ "Dimension('listing__capacity_latest')", "TimeDimension('listing__created_at', 'day')", - "TimeDimension('listing__ds', 'day')", "Dimension('listing__is_lux_latest')", + "TimeDimension('listing__ds', 'day')", "Dimension('listing__country_latest')", "TimeDimension('user__created_at', 'day')", ] diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_metric__result_0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_metric__result_0.txt index 938b2778c6..3455c5270e 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_metric__result_0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_metric__result_0.txt @@ -6,7 +6,7 @@ Error #1: The given input does not exactly match any known metrics. Suggestions: - ['bookings', 'booking_fees', 'booking_value', 'instant_bookings', 'booking_payments', 'max_booking_value'] + ['bookings', 'booking_fees', 'booking_value', 'booking_payments', 'instant_bookings', 'booking_value_p99'] Query Input: diff --git a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_multiple_metrics__result_0.txt b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_multiple_metrics__result_0.txt index a07132f9fa..727cec2b57 100644 --- a/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_multiple_metrics__result_0.txt +++ b/metricflow-semantics/tests_metricflow_semantics/snapshots/test_suggestions.py/str/test_suggestions_for_multiple_metrics__result_0.txt @@ -19,7 +19,7 @@ Error #1: 'listing__lux_listing', 'listing__is_lux_latest', 'listing__country_latest', - 'listing__created_at__day', + 'listing__capacity_latest', ] Query Input: