From 26503d24a9afeea327f6a693cec4bbdb56ee7136 Mon Sep 17 00:00:00 2001 From: kik-kik Date: Thu, 17 Oct 2024 11:45:29 +0200 Subject: [PATCH] feat: create ETL for computing KPIs using the new dau_reporting ping --- sql_generators/kpi_reporting/__init__.py | 193 ++++++++++++++++++ .../active_user_aggregates.query.sql | 59 ++++++ .../templates/active_user_aggregates.view.sql | 8 + .../templates/reporting_users.view.sql | 51 +++++ .../kpi_reporting/templates/union.view.sql | 14 ++ 5 files changed, 325 insertions(+) create mode 100644 sql_generators/kpi_reporting/__init__.py create mode 100644 sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql create mode 100644 sql_generators/kpi_reporting/templates/active_user_aggregates.view.sql create mode 100644 sql_generators/kpi_reporting/templates/reporting_users.view.sql create mode 100644 sql_generators/kpi_reporting/templates/union.view.sql diff --git a/sql_generators/kpi_reporting/__init__.py b/sql_generators/kpi_reporting/__init__.py new file mode 100644 index 00000000000..140fca47a75 --- /dev/null +++ b/sql_generators/kpi_reporting/__init__.py @@ -0,0 +1,193 @@ +"""Generate active users aggregates per app.""" + +from dataclasses import asdict, dataclass +from enum import Enum +from os import path +from pathlib import Path + +import click +from jinja2 import Environment, FileSystemLoader + +from bigquery_etl.cli.utils import use_cloud_function_option +from bigquery_etl.format_sql.formatter import reformat +from bigquery_etl.util.common import write_sql + +GENERATOR_ROOT = Path(path.dirname(__file__)) + +HEADER = f"-- Query generated via `{GENERATOR_ROOT.name}` SQL generator." +VERSION = "v1" +TEMPLATES = ("active_users.view.sql",) +BIGEYE_COLLECTION = "Browser Metrics KPI" + + +@dataclass +class Product: + """Encapsulation of what we expect a 'Product' to look like in this generator.""" + + friendly_name: str + is_mobile_kpi: bool = False + + +class MobileProducts(Enum): + """Enumeration with browser names and equivalent dataset names.""" + + fenix = Product( + friendly_name="Fenix", + is_mobile_kpi=True, + ) + # focus_android = Product( + # friendly_name="Focus Android", + # is_mobile_kpi=True, + # ) + # klar_android = Product( + # friendly_name="Klar Android", + # ) + # firefox_ios = Product( + # friendly_name="Firefox iOS", + # is_mobile_kpi=True, + # ) + # focus_ios = Product( + # friendly_name="Focus iOS", + # is_mobile_kpi=True, + # ) + # klar_ios = Product( + # friendly_name="Klar iOS", + # ) + + +@click.command() +@click.option( + "--output-dir", + "--output_dir", + help="Output directory generated SQL is written to", + type=click.Path(file_okay=False), + default="sql", +) +@click.option( + "--target-project", + "--target_project", + help="Google Cloud project ID", + default="moz-fx-data-shared-prod", +) +@use_cloud_function_option +def generate(target_project, output_dir, use_cloud_function): + """Generate per-app queries, views and metadata for active users and search counts aggregates. + + The parent folders will be created if not existing and existing files will be overwritten. + """ + env = Environment(loader=FileSystemLoader(str(GENERATOR_ROOT / "templates"))) + output_dir = Path(output_dir) / target_project + + default_template_args = { + "header": HEADER, + "version": VERSION, + "project_id": target_project, + "bigeye_collection": BIGEYE_COLLECTION, + } + + query_support_configs = ( + "checks.sql", + "metadata.yaml", + "schema.yaml", + "bigconfig.yml", + ) + + for template in TEMPLATES: + for product in MobileProducts: + + target_name, target_filename, target_extension = template.split(".") + + target_dataset = ( + product.name + "_derived" + if target_filename == "query" + else product.name + ) + + table_id = f"{target_project}.{target_dataset}.{target_name}" + + full_table_id = ( + table_id + f"_{VERSION}" if target_filename == "query" else table_id + ) + + product_args = { + "dataset": product.name, + "target_name": target_name, + "app_name": product.name, + "name": target_name, + } + + sql_template = env.get_template(template) + rendered_sql = reformat( + sql_template.render( + **default_template_args, + **asdict(product.value), + **product_args, + ) + ) + + write_sql( + output_dir=output_dir, + full_table_id=full_table_id, + basename=f"{target_filename}.{target_extension}", + sql=rendered_sql, + skip_existing=False, + ) + + # we only want to copy files defined in query_support_configs for query files. + if target_filename != "query": + continue + + for query_support_config in query_support_configs: + support_config_template = env.get_template( + f"{target_name}.{query_support_config}" + ) + support_config_rendered = support_config_template.render( + **default_template_args, + **asdict(product.value), + **product_args, + format=False, + ) + + write_sql( + output_dir=output_dir, + full_table_id=full_table_id, + basename=query_support_config, + sql=( + reformat(support_config_rendered) + if query_support_config.endswith(".sql") + else support_config_rendered + ), + skip_existing=False, + ) + + # we only want to generate a union view inside telemetry for views + if target_filename != "view": + continue + + target_dataset = "telemetry" + + union_target_name = f"mobile_{target_name}" + + union_sql_template = env.get_template("union.view.sql") + union_sql_rendered = union_sql_template.render( + **default_template_args, + dataset=target_dataset, + name=target_name, + target_name=union_target_name, + target_filename=target_filename, + format=False, + products=[ + { + "name": product.name, + } + for product in MobileProducts + ], + ) + + write_sql( + output_dir=output_dir, + full_table_id=f"{target_project}.{target_dataset}.{union_target_name}", + basename=f"{target_filename}.{target_extension}", + sql=(reformat(union_sql_rendered)), + skip_existing=False, + ) diff --git a/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql b/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql new file mode 100644 index 00000000000..83aac1b8685 --- /dev/null +++ b/sql_generators/kpi_reporting/templates/active_user_aggregates.query.sql @@ -0,0 +1,59 @@ +{{ header }} +SELECT + submission_date, + client_id, + app_name, + normalized_channel, + EXTRACT(YEAR FROM first_seen_date) AS first_seen_year, + submission_date = first_seen_date AS is_new_profile, + IFNULL(country, '??') country, + city, + COALESCE(REGEXP_EXTRACT(locale, r'^(.+?)-'), locale, NULL) AS locale, + normalized_os, + -- normalized_os_version, + COALESCE( + SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(0)], "") AS INTEGER), + 0 + ) AS os_version_major, + COALESCE( + SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(1)], "") AS INTEGER), + 0 + ) AS os_version_minor, + COALESCE( + SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(2)], "") AS INTEGER), + 0 + ) AS os_version_patch, + app_display_version AS app_version, + device_model, + distribution_id, + activity_segment AS segment, + COUNTIF(is_daily_user) AS daily_users, + COUNTIF(is_weekly_user) AS weekly_users, + COUNTIF(is_monthly_user) AS monthly_users, + COUNTIF(is_dau) AS dau, + COUNTIF(is_wau) AS wau, + COUNTIF(is_mau) AS mau, + -- SUM(uri_count) AS uri_count, + -- SUM(active_hours_sum) AS active_hours, +FROM + `{{ project_id }}.{{ dataset }}.reporting_users` +WHERE + submission_date = @submission_date +GROUP BY + submission_date, + client_id, + app_name, + normalized_channel, + first_seen_year, + is_new_profile, + country, + city, + locale, + normalized_os, + os_version_major, + os_version_minor, + os_version_patch, + app_version, + device_model, + distribution_id, + segment diff --git a/sql_generators/kpi_reporting/templates/active_user_aggregates.view.sql b/sql_generators/kpi_reporting/templates/active_user_aggregates.view.sql new file mode 100644 index 00000000000..17a7925ddee --- /dev/null +++ b/sql_generators/kpi_reporting/templates/active_user_aggregates.view.sql @@ -0,0 +1,8 @@ +{{ header }} +CREATE OR REPLACE VIEW + `{{ project_id }}.{{ dataset }}.{{ name }}` +AS +SELECT + * +FROM + `{{ project_id }}.{{ dataset }}.{{ name }}_{{ version }}` diff --git a/sql_generators/kpi_reporting/templates/reporting_users.view.sql b/sql_generators/kpi_reporting/templates/reporting_users.view.sql new file mode 100644 index 00000000000..2fe7c99e259 --- /dev/null +++ b/sql_generators/kpi_reporting/templates/reporting_users.view.sql @@ -0,0 +1,51 @@ +{{ header }} +CREATE OR REPLACE VIEW + `{{ project_id }}.{{ dataset }}.{{ name }}` +AS +SELECT + * EXCEPT (isp), + CASE + WHEN LOWER(isp) = "browserstack" + THEN CONCAT("{{ friendly_name }}", " ", isp) + WHEN LOWER(distribution_id) = "mozillaonline" + THEN CONCAT("{{ friendly_name }}", " ", distribution_id) + ELSE "{{ friendly_name }}" + END AS app_name, + -- Activity fields to support metrics built on top of activity + CASE + WHEN BIT_COUNT(days_active_bits) + BETWEEN 1 + AND 6 + THEN "infrequent_user" + WHEN BIT_COUNT(days_active_bits) + BETWEEN 7 + AND 13 + THEN "casual_user" + WHEN BIT_COUNT(days_active_bits) + BETWEEN 14 + AND 20 + THEN "regular_user" + WHEN BIT_COUNT(days_active_bits) >= 21 + THEN "core_user" + ELSE "other" + END AS activity_segment, + IFNULL(mozfun.bits28.days_since_seen(days_active_bits) = 0, FALSE) AS is_dau, + IFNULL(mozfun.bits28.days_since_seen(days_active_bits) < 7, FALSE) AS is_wau, + IFNULL(mozfun.bits28.days_since_seen(days_active_bits) < 28, FALSE) AS is_mau, + -- Metrics based on pings sent + IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) = 0, FALSE) AS is_daily_user, + IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) < 7, FALSE) AS is_weekly_user, + IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) < 28, FALSE) AS is_monthly_user, + {% if is_mobile_kpi %} + ( + LOWER(IFNULL(isp, "")) <> "browserstack" + AND LOWER(IFNULL(distribution_id, "")) <> "mozillaonline" + ) + {% else %} + FALSE + {% endif %} AS is_mobile, + -- Adding isp at the end because it's in different column index in baseline table for some products. + -- This is to make sure downstream union works as intended. + isp, +FROM + `{{ project_id }}.{{ dataset }}.dau_reporting_last_seen` diff --git a/sql_generators/kpi_reporting/templates/union.view.sql b/sql_generators/kpi_reporting/templates/union.view.sql new file mode 100644 index 00000000000..805612916f1 --- /dev/null +++ b/sql_generators/kpi_reporting/templates/union.view.sql @@ -0,0 +1,14 @@ +{{ header }} +CREATE OR REPLACE VIEW + `{{ project_id }}.{{ dataset }}.{{ target_name }}` +AS +{% for product in products %} +{% if not loop.first %} +UNION ALL +{% endif %} +SELECT + *, + "{{ product.name }}" AS product_name, +FROM + `{{ project_id }}.{{ product.name }}.{{ name }}` +{% endfor %}