From 76b60c38ef7d051a39b6cd8f34688089ba361772 Mon Sep 17 00:00:00 2001 From: Jordan Stein Date: Fri, 19 Jan 2024 13:00:59 -0800 Subject: [PATCH 1/2] fix typo and re-format sql --- website/docs/docs/build/conversion-metrics.md | 160 ++++++++---------- 1 file changed, 74 insertions(+), 86 deletions(-) diff --git a/website/docs/docs/build/conversion-metrics.md b/website/docs/docs/build/conversion-metrics.md index 2238655fbe0..8d5f7eb24bf 100644 --- a/website/docs/docs/build/conversion-metrics.md +++ b/website/docs/docs/build/conversion-metrics.md @@ -105,19 +105,18 @@ This step joins the `BUYS` table to the `VISITS` table and gets all combinations The SQL generated in these steps looks like the following: ```sql -select - v.ds, - v.user_id, - v.referrer_id, - b.ds, - b.uuid, - 1 as buys -from visits v -inner join ( - select *, uuid_string() as uuid from buys -- Adds a uuid column to uniquely identify the different rows -) b -on -v.user_id = b.user_id and v.ds <= b.ds and v.ds > b.ds - interval '7 day' +SELECT v.ds, + v.user_id, + v.referrer_id, + b.ds, + b.uuid, + 1 as buys +FROM visits v + INNER JOIN (SLECT *, uuid_string() as uuid + FROM buys -- Adds a uuid column to uniquely identify the different rows + ) b + ON + v.user_id = b.user_id and v.ds <= b.ds and v.ds > b.ds - interval '7 day' ``` The dataset returns the following (note that there are two potential conversion events for the first visit): @@ -134,19 +133,17 @@ The dataset returns the following (note that there are two potential conversion Instead of returning the raw visit values, use window functions to link conversions to the closest base event. 
You can partition by the conversion source and get the `first_value` ordered by `visit ds`, descending to get the closest base event from the conversion event: ```sql -select - first_value(v.ds) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as v_ds, - first_value(v.user_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as user_id, - first_value(v.referrer_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as referrer_id, - b.ds, - b.uuid, - 1 as buys -from visits v -inner join ( - select *, uuid_string() as uuid from buys -) b -on -v.user_id = b.user_id and v.ds <= b.ds and v.ds > b.ds - interval '7 day' +SELECT first_value(v.ds) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as v_ds, + first_value(v.user_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as user_id, + first_value(v.referrer_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as referrer_id, + b.ds, + b.uuid, + 1 as buys +FROM visits v + INNER JOIN (select *, uuid_string() as uuid + from buys) b + ON + v.user_id = b.user_id and v.ds <= b.ds and v.ds > b.ds - interval '7 day' ``` @@ -168,19 +165,18 @@ To resolve this and eliminate duplicates, use a distinct select. 
The UUID also h Instead of regular select used in the [Step 2](#step-2-refine-with-window-function), use a distinct select to remove the duplicates: ```sql -select distinct - first_value(v.ds) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as v_ds, - first_value(v.user_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as user_id, - first_value(v.referrer_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as referrer_id, - b.ds, - b.uuid, - 1 as buys -from visits v -inner join ( - select *, uuid_string() as uuid from buys -) b -on -v.user_id = b.user_id and v.ds <= b.ds and v.ds > b.ds - interval '7 day'; +SELECT DISTINCT first_value(v.ds) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as v_ds, + first_value(v.user_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as user_id, + first_value(v.referrer_id) + over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as referrer_id, + b.ds, + b.uuid, + 1 as buys +FROM visits v + INNER JOIN (select *, uuid_string() as uuid + from buys) b + ON + v.user_id = b.user_id and v.ds <= b.ds and v.ds > b.ds - interval '7 day'; ``` The dataset returns the following: @@ -201,38 +197,28 @@ You now have a dataset where every conversion is connected to a visit event. To Now that you’ve tied each conversion event to a visit, you can calculate the aggregated conversions and opportunities measures. Then, you can join them to calculate the actual conversion rate. 
The SQL to calculate the conversion rate is as follows: ```sql -select - coalesce(subq_3.metric_time__day, subq_13.metric_time__day) as metric_time__day, - cast(max(subq_13.buys) as double) / cast(nullif(max(subq_3.visits), 0) as double) as visit_to_buy_conversion_rate_7d -from ( -- base measure - select - metric_time__day, - sum(visits) as mqls - from ( - select - date_trunc('day', first_contact_date) as metric_time__day, - 1 as visits - from visits - ) subq_2 - group by - metric_time__day -) subq_3 -full outer join ( -- conversion measure - select - metric_time__day, - sum(buys) as sellers - from ( - -- ... - -- The output of this subquery is the table produced in Step 3. The SQL is hidden for legibility. - -- To see the full SQL output, add --explain to your conversion metric query. - ) subq_10 - group by - metric_time__day -) subq_13 -on - subq_3.metric_time__day = subq_13.metric_time__day -group by - metric_time__day +SELECT coalesce(subq_3.metric_time__day, subq_13.metric_time__day) as metric_time__day, + cast(max(subq_13.buys) as double) / + cast(nullif(max(subq_3.visits), 0) as double) as visit_to_buy_conversion_rate_7d +FROM ( -- base measure + SELECT metric_time__day, + sum(visits) as mqls + FROM (SELECT date_trunc('day', first_contact_date) as metric_time__day, + 1 as visits + FROM visits) subq_2 + GROUP BY metric_time__day) subq_3 + FULL OUTER JOIN ( -- conversion measure + SELECT metric_time__day, + sum(buys) as sellers + FROM ( + -- ... + -- The output of this subquery is the table produced in Step 3. The SQL is hidden for legibility. + -- To see the full SQL output, add --explain to your conversion metric query. 
+ ) subq_10 + GROUP BY metric_time__day) subq_13 + ON + subq_3.metric_time__day = subq_13.metric_time__day +GROUP BY metric_time__day ``` ### Additional settings @@ -249,7 +235,7 @@ Use the following additional settings to customize your conversion metrics: To return zero in the final data set, you can set the value of a null conversion event to zero instead of null. You can add the `fill_nulls_with` parameter to your conversion metric definition like this: ```yaml -- name: vist_to_buy_conversion_rate_7_day_window +- name: visit_to_buy_conversion_rate_7_day_window description: "Conversion rate from viewing a page to making a purchase" type: conversion label: Visit to Seller Conversion Rate (7 day window) @@ -329,22 +315,24 @@ In this case, you want to set `product_id` as the constant property. You can spe You will add an additional condition to the join to make sure the constant property is the same across conversions. ```sql -select distinct - first_value(v.ds) over (partition by buy_source.ds, buy_source.user_id, buy_source.session_id order by v.ds desc rows between unbounded preceding and unbounded following) as ds, - first_value(v.user_id) over (partition by buy_source.ds, buy_source.user_id, buy_source.session_id order by v.ds desc rows between unbounded preceding and unbounded following) as user_id, - first_value(v.referrer_id) over (partition by buy_source.ds, buy_source.user_id, buy_source.session_id order by v.ds desc rows between unbounded preceding and unbounded following) as referrer_id, - buy_source.uuid, - 1 as buys -from {{ source_schema }}.fct_view_item_details v -inner join +SELECT DISTINCT first_value(v.ds) + over (partition by buy_source.ds, buy_source.user_id, buy_source.session_id order by v.ds desc rows between unbounded preceding and unbounded following) as ds, + first_value(v.user_id) + over (partition by buy_source.ds, buy_source.user_id, buy_source.session_id order by v.ds desc rows between unbounded preceding and unbounded following) as 
user_id, + first_value(v.referrer_id) + over (partition by buy_source.ds, buy_source.user_id, buy_source.session_id order by v.ds desc rows between unbounded preceding and unbounded following) as referrer_id, + buy_source.uuid, + 1 as buys +FROM {{ source_schema }}.fct_view_item_details v +INNER JOIN ( - select *, {{ generate_random_uuid() }} as uuid from {{ source_schema }}.fct_purchases + SELECT *, {{ generate_random_uuid() }} as uuid FROM {{ source_schema }}.fct_purchases ) buy_source -on - v.user_id = buy_source.user_id - and v.ds <= buy_source.ds - and v.ds > buy_source.ds - interval '7 day' - and buy_source.product_id = v.product_id --Joining on the constant property product_id +ON + v.user_id = buy_source.user_id + AND v.ds <= buy_source.ds + AND v.ds > buy_source.ds - INTERVAL '7 day' + AND buy_source.product_id = v.product_id --Joining on the constant property product_id ``` From 7fef1110ba5a6681a46ea47b5ba5cc713cedbeec Mon Sep 17 00:00:00 2001 From: mirnawong1 Date: Tue, 23 Jan 2024 11:58:39 +0000 Subject: [PATCH 2/2] update sql --- website/docs/docs/build/conversion-metrics.md | 160 ++++++++++-------- 1 file changed, 85 insertions(+), 75 deletions(-) diff --git a/website/docs/docs/build/conversion-metrics.md b/website/docs/docs/build/conversion-metrics.md index 8d5f7eb24bf..5b63a6bbbf1 100644 --- a/website/docs/docs/build/conversion-metrics.md +++ b/website/docs/docs/build/conversion-metrics.md @@ -105,18 +105,19 @@ This step joins the `BUYS` table to the `VISITS` table and gets all combinations The SQL generated in these steps looks like the following: ```sql -SELECT v.ds, - v.user_id, - v.referrer_id, - b.ds, - b.uuid, - 1 as buys -FROM visits v - INNER JOIN (SLECT *, uuid_string() as uuid - FROM buys -- Adds a uuid column to uniquely identify the different rows - ) b - ON - v.user_id = b.user_id and v.ds <= b.ds and v.ds > b.ds - interval '7 day' +select + v.ds, + v.user_id, + v.referrer_id, + b.ds, + b.uuid, + 1 as buys +from visits v +inner join ( + 
select *, uuid_string() as uuid from buys -- Adds a uuid column to uniquely identify the different rows +) b +on +v.user_id = b.user_id and v.ds <= b.ds and v.ds > b.ds - interval '7 days' ``` The dataset returns the following (note that there are two potential conversion events for the first visit): @@ -133,18 +134,19 @@ The dataset returns the following (note that there are two potential conversion Instead of returning the raw visit values, use window functions to link conversions to the closest base event. You can partition by the conversion source and get the `first_value` ordered by `visit ds`, descending to get the closest base event from the conversion event: ```sql -SELECT first_value(v.ds) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as v_ds, - first_value(v.user_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as user_id, - first_value(v.referrer_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as referrer_id, - b.ds, - b.uuid, - 1 as buys -FROM visits v - INNER JOIN (select *, uuid_string() as uuid - from buys) b - ON - v.user_id = b.user_id and v.ds <= b.ds and v.ds > b.ds - interval '7 day' - +select + first_value(v.ds) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as v_ds, + first_value(v.user_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as user_id, + first_value(v.referrer_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as referrer_id, + b.ds, + b.uuid, + 1 as buys +from visits v +inner join ( + select *, uuid_string() as uuid from buys +) b +on +v.user_id = b.user_id and v.ds <= b.ds and v.ds > b.ds - interval '7 day' ``` The dataset returns the following: @@ -165,18 +167,19 @@ To resolve this and eliminate duplicates, use a distinct select. 
The UUID also h Instead of regular select used in the [Step 2](#step-2-refine-with-window-function), use a distinct select to remove the duplicates: ```sql -SELECT DISTINCT first_value(v.ds) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as v_ds, - first_value(v.user_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as user_id, - first_value(v.referrer_id) - over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as referrer_id, - b.ds, - b.uuid, - 1 as buys -FROM visits v - INNER JOIN (select *, uuid_string() as uuid - from buys) b - ON - v.user_id = b.user_id and v.ds <= b.ds and v.ds > b.ds - interval '7 day'; +select distinct + first_value(v.ds) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as v_ds, + first_value(v.user_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as user_id, + first_value(v.referrer_id) over (partition by b.ds, b.user_id, b.uuid order by v.ds desc) as referrer_id, + b.ds, + b.uuid, + 1 as buys +from visits v +inner join ( + select *, uuid_string() as uuid from buys +) b +on +v.user_id = b.user_id and v.ds <= b.ds and v.ds > b.ds - interval '7 day'; ``` The dataset returns the following: @@ -197,28 +200,38 @@ You now have a dataset where every conversion is connected to a visit event. To Now that you’ve tied each conversion event to a visit, you can calculate the aggregated conversions and opportunities measures. Then, you can join them to calculate the actual conversion rate. 
The SQL to calculate the conversion rate is as follows: ```sql -SELECT coalesce(subq_3.metric_time__day, subq_13.metric_time__day) as metric_time__day, - cast(max(subq_13.buys) as double) / - cast(nullif(max(subq_3.visits), 0) as double) as visit_to_buy_conversion_rate_7d -FROM ( -- base measure - SELECT metric_time__day, - sum(visits) as mqls - FROM (SELECT date_trunc('day', first_contact_date) as metric_time__day, - 1 as visits - FROM visits) subq_2 - GROUP BY metric_time__day) subq_3 - FULL OUTER JOIN ( -- conversion measure - SELECT metric_time__day, - sum(buys) as sellers - FROM ( - -- ... - -- The output of this subquery is the table produced in Step 3. The SQL is hidden for legibility. - -- To see the full SQL output, add --explain to your conversion metric query. - ) subq_10 - GROUP BY metric_time__day) subq_13 - ON - subq_3.metric_time__day = subq_13.metric_time__day -GROUP BY metric_time__day +select + coalesce(subq_3.metric_time__day, subq_13.metric_time__day) as metric_time__day, + cast(max(subq_13.buys) as double) / cast(nullif(max(subq_3.visits), 0) as double) as visit_to_buy_conversion_rate_7d +from ( -- base measure + select + metric_time__day, + sum(visits) as visits + from ( + select + date_trunc('day', first_contact_date) as metric_time__day, + 1 as visits + from visits + ) subq_2 + group by + metric_time__day +) subq_3 +full outer join ( -- conversion measure + select + metric_time__day, + sum(buys) as buys + from ( + -- ... + -- The output of this subquery is the table produced in Step 3. The SQL is hidden for legibility. + -- To see the full SQL output, add --explain to your conversion metric query. + ) subq_10 + group by + metric_time__day +) subq_13 +on + subq_3.metric_time__day = subq_13.metric_time__day +group by + metric_time__day ``` ### Additional settings @@ -315,25 +328,22 @@ In this case, you want to set `product_id` as the constant property.
You can spe You will add an additional condition to the join to make sure the constant property is the same across conversions. ```sql -SELECT DISTINCT first_value(v.ds) - over (partition by buy_source.ds, buy_source.user_id, buy_source.session_id order by v.ds desc rows between unbounded preceding and unbounded following) as ds, - first_value(v.user_id) - over (partition by buy_source.ds, buy_source.user_id, buy_source.session_id order by v.ds desc rows between unbounded preceding and unbounded following) as user_id, - first_value(v.referrer_id) - over (partition by buy_source.ds, buy_source.user_id, buy_source.session_id order by v.ds desc rows between unbounded preceding and unbounded following) as referrer_id, - buy_source.uuid, - 1 as buys -FROM {{ source_schema }}.fct_view_item_details v -INNER JOIN +select distinct + first_value(v.ds) over (partition by buy_source.ds, buy_source.user_id, buy_source.session_id order by v.ds desc rows between unbounded preceding and unbounded following) as ds, + first_value(v.user_id) over (partition by buy_source.ds, buy_source.user_id, buy_source.session_id order by v.ds desc rows between unbounded preceding and unbounded following) as user_id, + first_value(v.referrer_id) over (partition by buy_source.ds, buy_source.user_id, buy_source.session_id order by v.ds desc rows between unbounded preceding and unbounded following) as referrer_id, + buy_source.uuid, + 1 as buys +from {{ source_schema }}.fct_view_item_details v +inner join ( - SELECT *, {{ generate_random_uuid() }} as uuid FROM {{ source_schema }}.fct_purchases + select *, {{ generate_random_uuid() }} as uuid from {{ source_schema }}.fct_purchases ) buy_source -ON - v.user_id = buy_source.user_id - AND v.ds <= buy_source.ds - AND v.ds > buy_source.ds - INTERVAL '7 day' - AND buy_source.product_id = v.product_id --Joining on the constant property product_id - +on + v.user_id = buy_source.user_id + and v.ds <= buy_source.ds + and v.ds > buy_source.ds - interval '7 day' 
+ and buy_source.product_id = v.product_id --Joining on the constant property product_id ```