From 26503d24a9afeea327f6a693cec4bbdb56ee7136 Mon Sep 17 00:00:00 2001
From: kik-kik
Date: Thu, 17 Oct 2024 11:45:29 +0200
Subject: [PATCH 1/6] feat: create ETL for computing KPIs using the new dau_reporting ping

---
 sql_generators/kpi_reporting/__init__.py | 196 ++++++++++++++++++
 .../active_user_aggregates.query.sql | 59 ++++++
 .../templates/active_user_aggregates.view.sql | 8 +
 .../templates/reporting_users.view.sql | 51 +++++
 .../kpi_reporting/templates/union.view.sql | 14 ++
 5 files changed, 328 insertions(+)
 create mode 100644 sql_generators/kpi_reporting/__init__.py
 create mode 100644 sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql
 create mode 100644 sql_generators/kpi_reporting/templates/active_user_aggregates.view.sql
 create mode 100644 sql_generators/kpi_reporting/templates/reporting_users.view.sql
 create mode 100644 sql_generators/kpi_reporting/templates/union.view.sql

diff --git a/sql_generators/kpi_reporting/__init__.py b/sql_generators/kpi_reporting/__init__.py
new file mode 100644
index 00000000000..140fca47a75
--- /dev/null
+++ b/sql_generators/kpi_reporting/__init__.py
@@ -0,0 +1,196 @@
+"""Generate active users aggregates per app."""
+
+from dataclasses import asdict, dataclass
+from enum import Enum
+from os import path
+from pathlib import Path
+
+import click
+from jinja2 import Environment, FileSystemLoader
+
+from bigquery_etl.cli.utils import use_cloud_function_option
+from bigquery_etl.format_sql.formatter import reformat
+from bigquery_etl.util.common import write_sql
+
+GENERATOR_ROOT = Path(path.dirname(__file__))
+
+HEADER = f"-- Query generated via `{GENERATOR_ROOT.name}` SQL generator."
+VERSION = "v1"
+# Templates rendered once per product. Each name must be a file in ./templates
+# called "<target_name>.<query|view>.<extension>"; generate() splits it on ".".
+# NOTE(review): this previously listed "active_users.view.sql", which no commit
+# in this series adds. The active_user_aggregates templates are left out on
+# purpose: the query branch of generate() also loads "<target_name>.checks.sql"
+# and "<target_name>.schema.yaml", and neither exists in ./templates yet.
+TEMPLATES = ("reporting_users.view.sql",)
+BIGEYE_COLLECTION = "Browser Metrics KPI"
+
+
+@dataclass
+class Product:
+    """Encapsulation of what we expect a 'Product' to look like in this generator."""
+
+    # Display name; reporting_users.view.sql uses it to build the app_name column.
+    friendly_name: str
+    # Passed to templates; reporting_users.view.sql sets is_mobile to FALSE when unset.
+    is_mobile_kpi: bool = False
+
+
+class MobileProducts(Enum):
+    """Enumeration with browser names and equivalent dataset names."""
+
+    fenix = Product(
+        friendly_name="Fenix",
+        is_mobile_kpi=True,
+    )
+    # focus_android = Product(
+    #     friendly_name="Focus Android",
+    #     is_mobile_kpi=True,
+    # )
+    # klar_android = Product(
+    #     friendly_name="Klar Android",
+    # )
+    # firefox_ios = Product(
+    #     friendly_name="Firefox iOS",
+    #     is_mobile_kpi=True,
+    # )
+    # focus_ios = Product(
+    #     friendly_name="Focus iOS",
+    #     is_mobile_kpi=True,
+    # )
+    # klar_ios = Product(
+    #     friendly_name="Klar iOS",
+    # )
+
+
+@click.command()
+@click.option(
+    "--output-dir",
+    "--output_dir",
+    help="Output directory generated SQL is written to",
+    type=click.Path(file_okay=False),
+    default="sql",
+)
+@click.option(
+    "--target-project",
+    "--target_project",
+    help="Google Cloud project ID",
+    default="moz-fx-data-shared-prod",
+)
+@use_cloud_function_option
+def generate(target_project, output_dir, use_cloud_function):
+    """Render every template in TEMPLATES for every product in MobileProducts.
+
+    Query templates also get the files listed in query_support_configs; view
+    templates also get one `telemetry.mobile_<name>` view that unions all products.
+
+    The parent folders will be created if not existing and existing files will be overwritten.
+
+    use_cloud_function is supplied by @use_cloud_function_option and is not used here.
+    """
+    env = Environment(loader=FileSystemLoader(str(GENERATOR_ROOT / "templates")))
+    output_dir = Path(output_dir) / target_project
+
+    default_template_args = {
+        "header": HEADER,
+        "version": VERSION,
+        "project_id": target_project,
+        "bigeye_collection": BIGEYE_COLLECTION,
+    }
+
+    query_support_configs = (
+        "checks.sql",
+        "metadata.yaml",
+        "schema.yaml",
+        "bigconfig.yml",
+    )
+
+    for template in TEMPLATES:
+        # Resolved once per template rather than leaked out of the product loop,
+        # so the union step below never depends on MobileProducts being non-empty.
+        target_name, target_filename, target_extension = template.split(".")
+        is_query = target_filename == "query"
+        basename = f"{target_filename}.{target_extension}"
+
+        for product in MobileProducts:
+            # Queries write a versioned table in <app>_derived; views live in <app>.
+            target_dataset = f"{product.name}_derived" if is_query else product.name
+            table_id = f"{target_project}.{target_dataset}.{target_name}"
+            full_table_id = f"{table_id}_{VERSION}" if is_query else table_id
+
+            product_args = {
+                "dataset": product.name,
+                "target_name": target_name,
+                "app_name": product.name,
+                "name": target_name,
+            }
+
+            sql_template = env.get_template(template)
+            rendered_sql = reformat(
+                sql_template.render(
+                    **default_template_args,
+                    **asdict(product.value),
+                    **product_args,
+                )
+            )
+
+            write_sql(
+                output_dir=output_dir,
+                full_table_id=full_table_id,
+                basename=basename,
+                sql=rendered_sql,
+                skip_existing=False,
+            )
+
+            # we only want to copy files defined in query_support_configs for query files.
+            if not is_query:
+                continue
+
+            for query_support_config in query_support_configs:
+                support_config_template = env.get_template(
+                    f"{target_name}.{query_support_config}"
+                )
+                support_config_rendered = support_config_template.render(
+                    **default_template_args,
+                    **asdict(product.value),
+                    **product_args,
+                    format=False,
+                )
+
+                write_sql(
+                    output_dir=output_dir,
+                    full_table_id=full_table_id,
+                    basename=query_support_config,
+                    sql=(
+                        reformat(support_config_rendered)
+                        if query_support_config.endswith(".sql")
+                        else support_config_rendered
+                    ),
+                    skip_existing=False,
+                )
+
+        # we only want to generate a union view inside telemetry for views
+        if target_filename != "view":
+            continue
+
+        union_dataset = "telemetry"
+        union_target_name = f"mobile_{target_name}"
+
+        union_sql_template = env.get_template("union.view.sql")
+        union_sql_rendered = union_sql_template.render(
+            **default_template_args,
+            dataset=union_dataset,
+            name=target_name,
+            target_name=union_target_name,
+            target_filename=target_filename,
+            format=False,
+            products=[{"name": product.name} for product in MobileProducts],
+        )
+
+        write_sql(
+            output_dir=output_dir,
+            full_table_id=f"{target_project}.{union_dataset}.{union_target_name}",
+            basename=basename,
+            sql=reformat(union_sql_rendered),
+            skip_existing=False,
+        )
diff --git a/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql b/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql new file mode 100644 index 00000000000..83aac1b8685 --- /dev/null +++ b/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql @@ -0,0 +1,59 @@ +{{ header }} +SELECT + submission_date, + client_id, + app_name, + normalized_channel, + EXTRACT(YEAR FROM first_seen_date) AS first_seen_year, + submission_date = first_seen_date AS is_new_profile, + IFNULL(country, '??') country, + city, + COALESCE(REGEXP_EXTRACT(locale, r'^(.+?)-'), locale, NULL) 
AS locale, + normalized_os, + -- normalized_os_version, + COALESCE( + SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(0)], "") AS INTEGER), + 0 + ) AS os_version_major, + COALESCE( + SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(1)], "") AS INTEGER), + 0 + ) AS os_version_minor, + COALESCE( + SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(2)], "") AS INTEGER), + 0 + ) AS os_version_patch, + app_display_version AS app_version, + device_model, + distribution_id, + activity_segment AS segment, + COUNTIF(is_daily_user) AS daily_users, + COUNTIF(is_weekly_user) AS weekly_users, + COUNTIF(is_monthly_user) AS monthly_users, + COUNTIF(is_dau) AS dau, + COUNTIF(is_wau) AS wau, + COUNTIF(is_mau) AS mau, + -- SUM(uri_count) AS uri_count, + -- SUM(active_hours_sum) AS active_hours, +FROM + `{{ project_id }}.{{ dataset }}.reporting_users` +WHERE + submission_date = @submission_date +GROUP BY + submission_date, + client_id, + app_name, + normalized_channel, + first_seen_year, + is_new_profile, + country, + city, + locale, + normalized_os, + os_version_major, + os_version_minor, + os_version_patch, + app_version, + device_model, + distribution_id, + segment diff --git a/sql_generators/kpi_reporting/templates/active_user_aggregates.view.sql b/sql_generators/kpi_reporting/templates/active_user_aggregates.view.sql new file mode 100644 index 00000000000..17a7925ddee --- /dev/null +++ b/sql_generators/kpi_reporting/templates/active_user_aggregates.view.sql @@ -0,0 +1,8 @@ +{{ header }} +CREATE OR REPLACE VIEW + `{{ project_id }}.{{ dataset }}.{{ name }}` +AS +SELECT + * +FROM + `{{ project_id }}.{{ dataset }}.{{ name }}_{{ version }}` diff --git a/sql_generators/kpi_reporting/templates/reporting_users.view.sql b/sql_generators/kpi_reporting/templates/reporting_users.view.sql new file mode 100644 index 00000000000..2fe7c99e259 --- /dev/null +++ b/sql_generators/kpi_reporting/templates/reporting_users.view.sql @@ -0,0 +1,51 @@ +{{ header 
}} +CREATE OR REPLACE VIEW + `{{ project_id }}.{{ dataset }}.{{ name }}` +AS +SELECT + * EXCEPT (isp), + CASE + WHEN LOWER(isp) = "browserstack" + THEN CONCAT("{{ friendly_name }}", " ", isp) + WHEN LOWER(distribution_id) = "mozillaonline" + THEN CONCAT("{{ friendly_name }}", " ", distribution_id) + ELSE "{{ friendly_name }}" + END AS app_name, + -- Activity fields to support metrics built on top of activity + CASE + WHEN BIT_COUNT(days_active_bits) + BETWEEN 1 + AND 6 + THEN "infrequent_user" + WHEN BIT_COUNT(days_active_bits) + BETWEEN 7 + AND 13 + THEN "casual_user" + WHEN BIT_COUNT(days_active_bits) + BETWEEN 14 + AND 20 + THEN "regular_user" + WHEN BIT_COUNT(days_active_bits) >= 21 + THEN "core_user" + ELSE "other" + END AS activity_segment, + IFNULL(mozfun.bits28.days_since_seen(days_active_bits) = 0, FALSE) AS is_dau, + IFNULL(mozfun.bits28.days_since_seen(days_active_bits) < 7, FALSE) AS is_wau, + IFNULL(mozfun.bits28.days_since_seen(days_active_bits) < 28, FALSE) AS is_mau, + -- Metrics based on pings sent + IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) = 0, FALSE) AS is_daily_user, + IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) < 7, FALSE) AS is_weekly_user, + IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) < 28, FALSE) AS is_monthly_user, + {% if is_mobile_kpi %} + ( + LOWER(IFNULL(isp, "")) <> "browserstack" + AND LOWER(IFNULL(distribution_id, "")) <> "mozillaonline" + ) + {% else %} + FALSE + {% endif %} AS is_mobile, + -- Adding isp at the end because it's in different column index in baseline table for some products. + -- This is to make sure downstream union works as intended. 
+ isp, +FROM + `{{ project_id }}.{{ dataset }}.dau_reporting_last_seen` diff --git a/sql_generators/kpi_reporting/templates/union.view.sql b/sql_generators/kpi_reporting/templates/union.view.sql new file mode 100644 index 00000000000..805612916f1 --- /dev/null +++ b/sql_generators/kpi_reporting/templates/union.view.sql @@ -0,0 +1,14 @@ +{{ header }} +CREATE OR REPLACE VIEW + `{{ project_id }}.{{ dataset }}.{{ target_name }}` +AS +{% for product in products %} +{% if not loop.first %} +UNION ALL +{% endif %} +SELECT + *, + "{{ product.name }}" AS product_name, +FROM + `{{ project_id }}.{{ product.name }}.{{ name }}` +{% endfor %} From aafae0874a5837f93e81dfc5e52d7b57558f94e1 Mon Sep 17 00:00:00 2001 From: kik-kik Date: Thu, 17 Oct 2024 11:50:20 +0200 Subject: [PATCH 2/6] feat: add basic bigeye config for active_user_aggregates --- .../active_user_aggregates.bigconfig.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 sql_generators/kpi_reporting/templates/active_user_aggregates.bigconfig.yml diff --git a/sql_generators/kpi_reporting/templates/active_user_aggregates.bigconfig.yml b/sql_generators/kpi_reporting/templates/active_user_aggregates.bigconfig.yml new file mode 100644 index 00000000000..6588c3d9387 --- /dev/null +++ b/sql_generators/kpi_reporting/templates/active_user_aggregates.bigconfig.yml @@ -0,0 +1,19 @@ +type: BIGCONFIG_FILE +table_deployments: +- collection: + name: {{ bigeye_collection }} + deployments: + - fq_table_name: {{ project_id }}.{{ project_id }}.{{ dataset }}_derived.{{ name }}_{{ version }} + table_metrics: + - metric_type: + type: PREDEFINED + predefined_metric: FRESHNESS + metric_schedule: + named_schedule: + name: Default Schedule - 13:00 UTC + - metric_type: + type: PREDEFINED + predefined_metric: VOLUME + metric_schedule: + named_schedule: + name: Default Schedule - 13:00 UTC From 50e6d8556d1a3c22570d54909a8bf99e09a38998 Mon Sep 17 00:00:00 2001 From: kik-kik Date: Thu, 17 Oct 2024 11:54:07 +0200 Subject: 
[PATCH 3/6] feat: remove client_id from the final select --- .../kpi_reporting/templates/active_user_aggregates.query.sql | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql b/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql index 83aac1b8685..0ece556db33 100644 --- a/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql +++ b/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql @@ -1,7 +1,7 @@ {{ header }} SELECT submission_date, - client_id, + -- client_id, app_name, normalized_channel, EXTRACT(YEAR FROM first_seen_date) AS first_seen_year, @@ -41,7 +41,6 @@ WHERE submission_date = @submission_date GROUP BY submission_date, - client_id, app_name, normalized_channel, first_seen_year, From e97ef5bceba0395ffe49f7aa62496aef325bdf85 Mon Sep 17 00:00:00 2001 From: kik-kik Date: Thu, 17 Oct 2024 11:59:54 +0200 Subject: [PATCH 4/6] feat: add metadata to active_user_aggregates --- .../active_user_aggregates.metadata.yaml | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 sql_generators/kpi_reporting/templates/active_user_aggregates.metadata.yaml diff --git a/sql_generators/kpi_reporting/templates/active_user_aggregates.metadata.yaml b/sql_generators/kpi_reporting/templates/active_user_aggregates.metadata.yaml new file mode 100644 index 00000000000..1f159f329ea --- /dev/null +++ b/sql_generators/kpi_reporting/templates/active_user_aggregates.metadata.yaml @@ -0,0 +1,28 @@ +friendly_name: Active User Aggregates / KPI reporting - {{ friendly_name }} +description: |- + Used for reporting our core KPI metrics ({{ friendly_name }}) + +owners: + - mozilla/kpi_table_reviewers + - kik@mozilla.com +labels: + schedule: daily + incremental: true + change_controlled: true +scheduling: + dag_name: bqetl_analytics_aggregations + depends_on_past: false + task_group: {{ app_name }} +bigquery: + time_partitioning: + 
type: day + field: submission_date + require_partition_filter: false + clustering: + fields: + - app_name + - normalized_channel + - country +monitoring: + enabled: true + collection: {{ bigeye_collection }} From b22f43685d56416a20ff301380515430fc122304 Mon Sep 17 00:00:00 2001 From: kik-kik Date: Thu, 17 Oct 2024 12:00:32 +0200 Subject: [PATCH 5/6] feat: add schema for the new active_user_aggregates --- .../kpi_reporting/templates/schema.yaml | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 sql_generators/kpi_reporting/templates/schema.yaml diff --git a/sql_generators/kpi_reporting/templates/schema.yaml b/sql_generators/kpi_reporting/templates/schema.yaml new file mode 100644 index 00000000000..39cc7678724 --- /dev/null +++ b/sql_generators/kpi_reporting/templates/schema.yaml @@ -0,0 +1,110 @@ +fields: +- name: submission_date + type: DATE + mode: NULLABLE + description: + +- name: app_name + type: STRING + mode: NULLABLE + description: + +- name: normalized_channel + type: STRING + mode: NULLABLE + description: + +- name: first_seen_year + type: INTEGER + mode: NULLABLE + description: + +- name: is_new_profile + type: BOOLEAN + mode: NULLABLE + description: + +- name: country + type: STRING + mode: NULLABLE + description: + +- name: city + type: STRING + mode: NULLABLE + description: + +- name: locale + type: STRING + mode: NULLABLE + description: + +- name: normalized_os + type: STRING + mode: NULLABLE + description: + +- name: os_version_major + type: INTEGER + mode: NULLABLE + description: + +- name: os_version_minor + type: INTEGER + mode: NULLABLE + description: + +- name: os_version_patch + type: INTEGER + mode: NULLABLE + description: + +- name: app_version + type: STRING + mode: NULLABLE + description: + +- name: device_model + type: STRING + mode: NULLABLE + description: + +- name: distribution_id + type: STRING + mode: NULLABLE + description: + +- name: segment + type: STRING + mode: NULLABLE + description: + +- name: 
daily_users + type: INTEGER + mode: NULLABLE + description: + +- name: weekly_users + type: INTEGER + mode: NULLABLE + description: + +- name: monthly_users + type: INTEGER + mode: NULLABLE + description: + +- name: dau + type: INTEGER + mode: NULLABLE + description: + +- name: wau + type: INTEGER + mode: NULLABLE + description: + +- name: mau + type: INTEGER + mode: NULLABLE + description: From a1956dce7a956c796098d2319bbf0fa3423c89f0 Mon Sep 17 00:00:00 2001 From: kik-kik Date: Thu, 17 Oct 2024 14:44:29 +0200 Subject: [PATCH 6/6] feat: remove fields we do not need for kpi reporting --- .../active_user_aggregates.query.sql | 36 ++++++------------- .../templates/reporting_users.view.sql | 23 ++++++++++-- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql b/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql index 0ece556db33..08f436b1f26 100644 --- a/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql +++ b/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql @@ -1,32 +1,19 @@ {{ header }} SELECT submission_date, - -- client_id, app_name, normalized_channel, - EXTRACT(YEAR FROM first_seen_date) AS first_seen_year, - submission_date = first_seen_date AS is_new_profile, - IFNULL(country, '??') country, - city, - COALESCE(REGEXP_EXTRACT(locale, r'^(.+?)-'), locale, NULL) AS locale, - normalized_os, - -- normalized_os_version, - COALESCE( - SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(0)], "") AS INTEGER), - 0 - ) AS os_version_major, - COALESCE( - SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(1)], "") AS INTEGER), - 0 - ) AS os_version_minor, - COALESCE( - SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(2)], "") AS INTEGER), - 0 - ) AS os_version_patch, - app_display_version AS app_version, - device_model, + first_seen_year, + is_new_profile, + country, + locale, + -- 
normalized_os, + os_version_major, + os_version_minor, + os_version_patch, + app_version, distribution_id, - activity_segment AS segment, + segment, COUNTIF(is_daily_user) AS daily_users, COUNTIF(is_weekly_user) AS weekly_users, COUNTIF(is_monthly_user) AS monthly_users, @@ -46,13 +33,10 @@ GROUP BY first_seen_year, is_new_profile, country, - city, locale, - normalized_os, os_version_major, os_version_minor, os_version_patch, app_version, - device_model, distribution_id, segment diff --git a/sql_generators/kpi_reporting/templates/reporting_users.view.sql b/sql_generators/kpi_reporting/templates/reporting_users.view.sql index 2fe7c99e259..11996a6c760 100644 --- a/sql_generators/kpi_reporting/templates/reporting_users.view.sql +++ b/sql_generators/kpi_reporting/templates/reporting_users.view.sql @@ -3,7 +3,11 @@ CREATE OR REPLACE VIEW `{{ project_id }}.{{ dataset }}.{{ name }}` AS SELECT - * EXCEPT (isp), + * EXCEPT (isp, normalized_os_version, app_display_version) + REPLACE( + COALESCE(REGEXP_EXTRACT(locale, r'^(.+?)-'), locale, NULL) AS locale, + IFNULL(country, '??') AS country + ), CASE WHEN LOWER(isp) = "browserstack" THEN CONCAT("{{ friendly_name }}", " ", isp) @@ -28,7 +32,22 @@ SELECT WHEN BIT_COUNT(days_active_bits) >= 21 THEN "core_user" ELSE "other" - END AS activity_segment, + END AS segment, + COALESCE( + SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(0)], "") AS INTEGER), + 0 + ) AS os_version_major, + COALESCE( + SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(1)], "") AS INTEGER), + 0 + ) AS os_version_minor, + COALESCE( + SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(2)], "") AS INTEGER), + 0 + ) AS os_version_patch, + app_display_version AS app_version, + EXTRACT(YEAR FROM first_seen_date) AS first_seen_year, + submission_date = first_seen_date AS is_new_profile, IFNULL(mozfun.bits28.days_since_seen(days_active_bits) = 0, FALSE) AS is_dau, IFNULL(mozfun.bits28.days_since_seen(days_active_bits) < 7, 
FALSE) AS is_wau, IFNULL(mozfun.bits28.days_since_seen(days_active_bits) < 28, FALSE) AS is_mau,