From 0d20f372331ccd1e2fd08e9ed3f7bec2aca21371 Mon Sep 17 00:00:00 2001 From: William Deng Date: Thu, 8 Dec 2022 17:08:00 -0500 Subject: [PATCH] tests for conversion metrics --- metricflow/test/fixtures/model_fixtures.py | 2 + .../data_sources/buys_source.yaml | 30 +++ .../data_sources/visits_source.yaml | 33 +++ .../model_yamls/simple_model/metrics.yaml | 28 +++ metricflow/test/fixtures/table_fixtures.py | 55 +++++ .../test_cases/itest_conversion_metrics.yaml | 230 ++++++++++++++++++ .../test/integration/test_configured_cases.py | 7 + .../test/model/test_data_source_container.py | 7 +- .../test_dataflow_to_sql_plan.py | 47 +++- 9 files changed, 437 insertions(+), 2 deletions(-) create mode 100644 metricflow/test/fixtures/model_yamls/simple_model/data_sources/buys_source.yaml create mode 100644 metricflow/test/fixtures/model_yamls/simple_model/data_sources/visits_source.yaml create mode 100644 metricflow/test/integration/test_cases/itest_conversion_metrics.yaml diff --git a/metricflow/test/fixtures/model_fixtures.py b/metricflow/test/fixtures/model_fixtures.py index bf5e94e6b2..97db4293d5 100644 --- a/metricflow/test/fixtures/model_fixtures.py +++ b/metricflow/test/fixtures/model_fixtures.py @@ -170,6 +170,8 @@ def template_mapping(mf_test_session_state: MetricFlowTestSessionState) -> Dict[ "source_schema": schema, "accounts_source_table": f"{schema}.fct_accounts", "primary_accounts_table": f"{schema}.dim_primary_accounts", + "buys_source_table": f"{schema}.fct_buys", + "visits_source_table": f"{schema}.fct_visits", } diff --git a/metricflow/test/fixtures/model_yamls/simple_model/data_sources/buys_source.yaml b/metricflow/test/fixtures/model_yamls/simple_model/data_sources/buys_source.yaml new file mode 100644 index 0000000000..5609232d9c --- /dev/null +++ b/metricflow/test/fixtures/model_yamls/simple_model/data_sources/buys_source.yaml @@ -0,0 +1,30 @@ +--- +data_source: + name: buys_source + description: buys_source + owners: + - support@transformdata.io + + sql_query: | + -- User Defined SQL Query + SELECT * FROM $buys_source_table + + measures: + - name: buys + expr: 1 + agg: count + - name: buyers + expr: user_id + agg: count_distinct + + dimensions: + - name: ds + type: time + type_params: + is_primary: True + time_granularity: day + + identifiers: + - name: user + type: foreign + expr: user_id diff --git a/metricflow/test/fixtures/model_yamls/simple_model/data_sources/visits_source.yaml b/metricflow/test/fixtures/model_yamls/simple_model/data_sources/visits_source.yaml new file mode 100644 index 0000000000..9a517b1785 --- /dev/null +++ b/metricflow/test/fixtures/model_yamls/simple_model/data_sources/visits_source.yaml @@ -0,0 +1,33 @@ +--- +data_source: + name: visits_source + description: visits_source + owners: + - support@transformdata.io + + sql_query: | + -- User Defined SQL Query + SELECT * FROM $visits_source_table + + measures: + - name: visits + expr: 1 + agg: count + - name: visitors + expr: user_id + agg: count_distinct + + + dimensions: + - name: ds + type: time + type_params: + is_primary: True + time_granularity: day + - name: referrer_id + type: categorical + + identifiers: + - name: user + type: foreign + expr: user_id \ No newline at end of file diff --git a/metricflow/test/fixtures/model_yamls/simple_model/metrics.yaml b/metricflow/test/fixtures/model_yamls/simple_model/metrics.yaml index 69b6f06fb1..b9b603be1b 100644 --- a/metricflow/test/fixtures/model_yamls/simple_model/metrics.yaml +++ b/metricflow/test/fixtures/model_yamls/simple_model/metrics.yaml @@ -682,3 +682,31 @@ metric: - name: bookings offset_window: 5 days alias: bookings_5_days_ago +--- +metric: + name: visit_buy_conversion_rate + description: conversion rate on visits-buys on a 7 day window + owners: + - support@transformdata.io + type: conversion + type_params: + conversion_type_params: + base_measure: visits + conversion_measure: buys + window: 7 days + entity: user + calculation: conversion_rate +--- +metric: + name: visit_buy_conversions + description: conversion count on visits-buys on a 7 day window + owners: + - support@transformdata.io + type: conversion + type_params: + conversion_type_params: + base_measure: visits + conversion_measure: buys + window: 7 days + entity: user + calculation: conversions diff --git a/metricflow/test/fixtures/table_fixtures.py b/metricflow/test/fixtures/table_fixtures.py index ace1d9e41b..73922c1e3c 100644 --- a/metricflow/test/fixtures/table_fixtures.py +++ b/metricflow/test/fixtures/table_fixtures.py @@ -360,6 +360,61 @@ def create_simple_model_tables(mf_test_session_state: MetricFlowTestSessionState ), ) + # Events data + visits_data = [ + ("u0004114", "2020-01-01", "fb_ad_1"), + ("u0004214", "2020-01-01", "fb_ad_2"), + ("u0003141", "2020-01-01", "homepage_1"), + ("u0003154", "2020-01-01", "homepage_2"), + ("u1612112", "2020-01-02", "homepage_1"), + ("u0042324", "2020-01-02", "homepage_1"), + ("u0005432", "2020-01-02", "fb_ad_3"), + ("u0003452", "2020-01-02", "fb_ad_1"), + ("u0003452", "2020-01-02", "user_2"), + ("u0042324", "2020-01-03", "fb_ad_1"), + ("u0005432", "2020-01-03", "google_ad_1"), + ("u0005472", "2020-01-03", "google_ad_2"), + ("u0005414", "2020-01-04", "google_ad_1"), + ("u0004114", "2020-01-06", "homepage_1"), + ("u0004114", "2020-01-07", "fb_ad_2"), + ("u0004117", "2020-01-10", "google_ad_1"), + ("u0003141", "2020-01-12", "user_1"), + ] + + create_table( + sql_client=sql_client, + sql_table=SqlTable(schema_name=schema, table_name="fct_visits"), + df=make_df( + sql_client=sql_client, + columns=["user_id", DEFAULT_DS, "referrer_id"], + time_columns={DEFAULT_DS}, + data=visits_data, + ), + ) + + buy_data = [ + ("u0004114", "2020-01-02"), + ("u0042324", "2020-01-03"), + ("u0042324", "2020-01-03"), + ("u0005432", "2020-01-04"), + ("u1612112", "2020-01-07"), + ("u0004114", "2020-01-07"), + ("u0004117", "2020-01-10"), + ("u0003141", "2020-01-07"), + ("u0003452", "2020-01-04"), + ] + + create_table( + sql_client=sql_client, + sql_table=SqlTable(schema_name=schema, table_name="fct_buys"), + df=make_df( + sql_client=sql_client, + columns=["user_id", DEFAULT_DS], + time_columns={DEFAULT_DS}, + data=buy_data, + ), + ) + return True diff --git a/metricflow/test/integration/test_cases/itest_conversion_metrics.yaml b/metricflow/test/integration/test_cases/itest_conversion_metrics.yaml new file mode 100644 index 0000000000..9a186bc650 --- /dev/null +++ b/metricflow/test/integration/test_cases/itest_conversion_metrics.yaml @@ -0,0 +1,230 @@ +--- +integration_test: + name: conversion_rate_metric + description: Query a conversion metric that calculates the conversion rate + model: SIMPLE_MODEL + metrics: ["visit_buy_conversion_rate"] + group_bys: ["metric_time"] + check_query: | + SELECT + opportunities.metric_time AS metric_time + , CAST(conversions.buys AS {{ double_data_type_name }}) / CAST(NULLIF(opportunities.visits, 0) AS {{ double_data_type_name }}) AS visit_buy_conversion_rate + FROM ( + SELECT + metric_time, SUM(a.visits) AS visits + FROM ( + SELECT + ds AS metric_time, 1 AS visits + FROM {{ source_schema }}.fct_visits visits + ) a + GROUP BY + a.metric_time + ) opportunities + INNER JOIN ( + SELECT + b.ds AS metric_time, SUM(b.buys) AS buys + FROM ( + SELECT DISTINCT + first_value(v.ds) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS ds + , first_value(v.user_id) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS user_id + , first_value(v.referrer_id) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS referrer_id + , buy_source.uuid + , 1 AS buys + FROM {{ source_schema }}.fct_visits v + INNER JOIN + ( + SELECT *, {{ generate_random_uuid() }} AS uuid FROM {{ source_schema }}.fct_buys + ) buy_source + ON + v.user_id = buy_source.user_id AND v.ds <= buy_source.ds AND v.ds > {{ render_date_sub("buy_source", "ds", 7, TimeGranularity.DAY) }} + ) b + GROUP BY + b.ds + ) conversions + ON opportunities.metric_time = conversions.metric_time +--- +integration_test: + name: conversion_rate_metric_with_dimension + description: Query a conversion metric that calculates the conversion rate without time dimension + model: SIMPLE_MODEL + metrics: ["visit_buy_conversion_rate"] + group_bys: ["referrer_id"] + check_query: | + SELECT + opportunities.referrer_id AS referrer_id + , CAST(conversions.buys AS {{ double_data_type_name }}) / CAST(NULLIF(opportunities.visits, 0) AS {{ double_data_type_name }}) AS visit_buy_conversion_rate + FROM ( + SELECT + referrer_id, SUM(a.visits) AS visits + FROM ( + SELECT + referrer_id, 1 AS visits + FROM {{ source_schema }}.fct_visits visits + ) a + GROUP BY + a.referrer_id + ) opportunities + INNER JOIN ( + SELECT + referrer_id AS referrer_id, SUM(b.buys) AS buys + FROM ( + SELECT DISTINCT + first_value(v.ds) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS ds + , first_value(v.user_id) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS user_id + , first_value(v.referrer_id) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS referrer_id + , buy_source.uuid + , 1 AS buys + FROM {{ source_schema }}.fct_visits v + INNER JOIN + ( + SELECT *, {{ generate_random_uuid() }} AS uuid FROM {{ source_schema }}.fct_buys + ) buy_source + ON + v.user_id = buy_source.user_id AND v.ds <= buy_source.ds AND v.ds > buy_source.ds - INTERVAL 7 day + ) b + GROUP BY + b.referrer_id + ) conversions + ON opportunities.referrer_id = conversions.referrer_id +--- +integration_test: + name: conversion_rate_metric_with_multiple_dimension + description: Query a conversion metric that calculates the conversion rate with multiple dimension + model: SIMPLE_MODEL + metrics: ["visit_buy_conversion_rate"] + group_bys: ["metric_time", "referrer_id"] + check_query: | + SELECT + opportunities.referrer_id AS referrer_id + , opportunities.metric_time AS metric_time + , CAST(conversions.buys AS {{ double_data_type_name }}) / CAST(NULLIF(opportunities.visits, 0) AS {{ double_data_type_name }}) AS visit_buy_conversion_rate + FROM ( + SELECT + metric_time, referrer_id, SUM(a.visits) AS visits + FROM ( + SELECT + ds AS metric_time, referrer_id, 1 AS visits + FROM {{ source_schema }}.fct_visits visits + ) a + GROUP BY + a.referrer_id, a.metric_time + ) opportunities + INNER JOIN ( + SELECT + b.ds AS metric_time, referrer_id AS referrer_id, SUM(b.buys) AS buys + FROM ( + SELECT DISTINCT + first_value(v.ds) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS ds + , first_value(v.user_id) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS user_id + , first_value(v.referrer_id) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS referrer_id + , buy_source.uuid + , 1 AS buys + FROM {{ source_schema }}.fct_visits v + INNER JOIN + ( + SELECT *, {{ generate_random_uuid() }} AS uuid FROM {{ source_schema }}.fct_buys + ) buy_source + ON + v.user_id = buy_source.user_id AND v.ds <= buy_source.ds AND v.ds > buy_source.ds - INTERVAL 7 day + ) b + GROUP BY + b.referrer_id, b.ds + ) conversions + ON opportunities.referrer_id = conversions.referrer_id AND opportunities.metric_time = conversions.metric_time +--- +integration_test: + name: conversion_count_metric + description: Query a conversion metric that calculates the conversion count + model: SIMPLE_MODEL + metrics: ["visit_buy_conversions"] + group_bys: ["metric_time"] + check_query: | + SELECT + conversions.metric_time AS metric_time + , conversions.buys AS visit_buy_conversions + FROM ( + SELECT + b.ds AS metric_time, SUM(b.buys) AS buys + FROM ( + SELECT DISTINCT + first_value(v.ds) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS ds + , first_value(v.user_id) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS user_id + , first_value(v.referrer_id) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS referrer_id + , buy_source.uuid + , 1 AS buys + FROM {{ source_schema }}.fct_visits v + INNER JOIN + ( + SELECT *, {{ generate_random_uuid() }} AS uuid FROM {{ source_schema }}.fct_buys + ) buy_source + ON + v.user_id = buy_source.user_id AND v.ds <= buy_source.ds AND v.ds > buy_source.ds - INTERVAL 7 day + ) b + GROUP BY + b.ds + ) conversions +--- +integration_test: + name: conversion_count_metric_with_dimension + description: Query a conversion metric that calculates the conversion count without time dimension + model: SIMPLE_MODEL + metrics: ["visit_buy_conversions"] + group_bys: ["referrer_id"] + check_query: | + SELECT + conversions.referrer_id AS referrer_id + , conversions.buys AS visit_buy_conversions + FROM ( + SELECT + referrer_id AS referrer_id, SUM(b.buys) AS buys + FROM ( + SELECT DISTINCT + first_value(v.ds) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS ds + , first_value(v.user_id) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS user_id + , first_value(v.referrer_id) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS referrer_id + , buy_source.uuid + , 1 AS buys + FROM {{ source_schema }}.fct_visits v + INNER JOIN + ( + SELECT *, {{ generate_random_uuid() }} AS uuid FROM {{ source_schema }}.fct_buys + ) buy_source + ON + v.user_id = buy_source.user_id AND v.ds <= buy_source.ds AND v.ds > buy_source.ds - INTERVAL 7 day + ) b + GROUP BY + b.referrer_id + ) conversions +--- +integration_test: + name: conversion_count_metric_with_multiple_dimension + description: Query a conversion metric that calculates the conversion count with multiple dimension + model: SIMPLE_MODEL + metrics: ["visit_buy_conversions"] + group_bys: ["metric_time", "referrer_id"] + check_query: | + SELECT + conversions.referrer_id AS referrer_id + , conversions.metric_time AS metric_time + , conversions.buys AS visit_buy_conversions + FROM ( + SELECT + b.ds AS metric_time, referrer_id AS referrer_id, SUM(b.buys) AS buys + FROM ( + SELECT DISTINCT + first_value(v.ds) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS ds + , first_value(v.user_id) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS user_id + , first_value(v.referrer_id) OVER (PARTITION BY buy_source.ds, buy_source.user_id ORDER BY v.ds DESC NULLS FIRST) AS referrer_id + , buy_source.uuid + , 1 AS buys + FROM {{ source_schema }}.fct_visits v + INNER JOIN + ( + SELECT *, {{ generate_random_uuid() }} AS uuid FROM {{ source_schema }}.fct_buys + ) buy_source + ON + v.user_id = buy_source.user_id AND v.ds <= buy_source.ds AND v.ds > buy_source.ds - INTERVAL 7 day + ) b + GROUP BY + b.referrer_id, b.ds + ) conversions diff --git a/metricflow/test/integration/test_configured_cases.py b/metricflow/test/integration/test_configured_cases.py index 8944d3f0ea..0c6a50b4f9 100644 --- a/metricflow/test/integration/test_configured_cases.py +++ b/metricflow/test/integration/test_configured_cases.py @@ -19,6 +19,7 @@ from metricflow.sql.sql_exprs import ( SqlPercentileExpression, SqlPercentileExpressionArgument, + SqlGenerateUuidExpression, SqlTimeDeltaExpression, SqlColumnReferenceExpression, SqlColumnReference, @@ -134,6 +135,11 @@ def double_data_type_name(self) -> str: """Return the name of the double data type for the relevant SQL engine""" return self._sql_client.sql_engine_attributes.double_data_type_name + def generate_random_uuid(self) -> str: + """Returns the generate random UUID SQL function.""" + expr = SqlGenerateUuidExpression() + return self._sql_client.sql_engine_attributes.sql_query_plan_renderer.expr_renderer.render_sql_expr(expr).sql + def filter_not_supported_features( sql_client: SqlClient, required_features: Tuple[RequiredDwEngineFeatures, ...] @@ -261,6 +267,7 @@ def test_case( render_percentile_expr=check_query_helpers.render_percentile_expr, mf_time_spine_source=time_spine_source.spine_table.sql, double_data_type_name=check_query_helpers.double_data_type_name, + generate_random_uuid=check_query_helpers.generate_random_uuid, ) ) # If we sort, it's effectively not checking the order whatever order that the output was would be overwritten. diff --git a/metricflow/test/model/test_data_source_container.py b/metricflow/test/model/test_data_source_container.py index 7ed3a99c38..b31c1ff476 100644 --- a/metricflow/test/model/test_data_source_container.py +++ b/metricflow/test/model/test_data_source_container.py @@ -45,6 +45,7 @@ def test_get_names(new_data_source_semantics: DataSourceSemantics) -> None: # n "home_state_latest", "is_instant", "is_lux_latest", + "referrer_id", "verification_type", ] assert sorted([d.element_name for d in new_data_source_semantics.get_dimension_references()]) == expected @@ -59,6 +60,8 @@ def test_get_names(new_data_source_semantics: DataSourceSemantics) -> None: # n "booking_value", "booking_value_p99", "bookings", + "buyers", + "buys", "current_account_balance_by_user", "discrete_booking_value_p99", "identity_verifications", @@ -73,6 +76,8 @@ def test_get_names(new_data_source_semantics: DataSourceSemantics) -> None: # n "total_account_balance_first_day", "txn_revenue", "views", + "visitors", + "visits", ] assert sorted([m.element_name for m in new_data_source_semantics.measure_references]) == expected @@ -202,4 +207,4 @@ def test_get_data_sources_for_identifier(new_data_source_semantics: DataSourceSe linked_data_sources = new_data_source_semantics.get_data_sources_for_identifier( identifier_reference=identifier_reference ) - assert len(linked_data_sources) == 9 + assert len(linked_data_sources) == 11 diff --git a/metricflow/test/plan_conversion/test_dataflow_to_sql_plan.py b/metricflow/test/plan_conversion/test_dataflow_to_sql_plan.py index c241785c00..66bd0257bc 100644 --- a/metricflow/test/plan_conversion/test_dataflow_to_sql_plan.py +++ b/metricflow/test/plan_conversion/test_dataflow_to_sql_plan.py @@ -7,10 +7,12 @@ from metricflow.constraints.time_constraint import TimeRangeConstraint from metricflow.dataflow.builder.dataflow_plan_builder import DataflowPlanBuilder from metricflow.dataflow.dataflow_plan import ( + AddGeneratedUuidColumnNode, DataflowPlan, WriteToResultDataframeNode, FilterElementsNode, AggregateMeasuresNode, + JoinConversionEventsNode, JoinDescription, JoinToBaseOutputNode, ComputeMetricsNode, @@ -26,6 +28,7 @@ from metricflow.dataset.data_source_adapter import DataSourceDataSet from metricflow.dataset.dataset import DataSet from metricflow.model.semantic_model import SemanticModel +from metricflow.model.objects.metric import MetricTimeWindow from metricflow.plan_conversion.column_resolver import DefaultColumnAssociationResolver from metricflow.plan_conversion.dataflow_to_sql import DataflowToSqlQueryPlanConverter from metricflow.plan_conversion.time_spine import TimeSpineSource @@ -34,6 +37,7 @@ from metricflow.specs import ( DimensionSpec, IdentifierSpec, + MetadataSpec, MeasureSpec, MetricInputMeasureSpec, MetricSpec, @@ -46,6 +50,7 @@ LinkableSpecSet, InstanceSpecSet, ) +from metricflow.model.validations.unique_valid_name import MetricFlowReservedKeywords from metricflow.sql.optimizer.optimization_levels import SqlQueryOptimizationLevel from metricflow.sql.sql_bind_parameters import SqlBindParameters from metricflow.test.dataflow_plan_to_svg import display_graph_if_requested @@ -57,7 +62,6 @@ from metricflow.test.test_utils import as_datetime from metricflow.test.time.metric_time_dimension import MTD_SPEC_DAY from metricflow.time.time_granularity import TimeGranularity -from metricflow.model.objects.metric import MetricTimeWindow @pytest.fixture(scope="session") @@ -1784,6 +1788,47 @@ def test_common_data_source( # noqa: D ) +def test_join_conversion_events_node( # noqa: D + request: FixtureRequest, + mf_test_session_state: MetricFlowTestSessionState, + dataflow_plan_builder: DataflowPlanBuilder[DataSourceDataSet], + dataflow_to_sql_converter: DataflowToSqlQueryPlanConverter[DataSourceDataSet], + sql_client: SqlClient, + consistent_id_object_repository: ConsistentIdObjectRepository, +) -> None: + buys_source = AddGeneratedUuidColumnNode( + parent_node=consistent_id_object_repository.simple_model_read_nodes["buys_source"] + ) + visits_source = consistent_id_object_repository.simple_model_read_nodes["visits_source"] + + measure_spec = MeasureSpec( + element_name="buys", + ) + base_time_dimension = TimeDimensionSpec.from_name("ds") + conversion_time_dimension = TimeDimensionSpec.from_name("ds") + window = MetricTimeWindow(count=7, granularity=TimeGranularity.DAY) + entity = IdentifierSpec.from_name("user") + conversion_primary_keys = (MetadataSpec.from_name(MetricFlowReservedKeywords.MF_INTERNAL_UUID.value),) + join_node = JoinConversionEventsNode[DataSourceDataSet]( + base_node=visits_source, + base_time_dimension_spec=base_time_dimension, + conversion_node=buys_source, + conversion_measure_spec=measure_spec, + conversion_time_dimension_spec=conversion_time_dimension, + conversion_primary_key_specs=conversion_primary_keys, + entity_spec=entity, + window=window, + ) + + convert_and_check( + request=request, + mf_test_session_state=mf_test_session_state, + dataflow_to_sql_converter=dataflow_to_sql_converter, + sql_client=sql_client, + node=join_node, + ) + + def test_derived_metric_with_offset_window( # noqa: D request: FixtureRequest, mf_test_session_state: MetricFlowTestSessionState,