Skip to content

Commit

Permalink
feat: create ETL for computing KPIs using the new dau_reporting ping
Browse files Browse the repository at this point in the history
  • Loading branch information
kik-kik committed Oct 17, 2024
1 parent d7c8a12 commit 26503d2
Show file tree
Hide file tree
Showing 5 changed files with 325 additions and 0 deletions.
193 changes: 193 additions & 0 deletions sql_generators/kpi_reporting/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
"""Generate active users aggregates per app."""

from dataclasses import asdict, dataclass
from enum import Enum
from os import path
from pathlib import Path

import click
from jinja2 import Environment, FileSystemLoader

from bigquery_etl.cli.utils import use_cloud_function_option
from bigquery_etl.format_sql.formatter import reformat
from bigquery_etl.util.common import write_sql

GENERATOR_ROOT = Path(path.dirname(__file__))

HEADER = f"-- Query generated via `{GENERATOR_ROOT.name}` SQL generator."
VERSION = "v1"
TEMPLATES = ("active_users.view.sql",)
BIGEYE_COLLECTION = "Browser Metrics KPI"


@dataclass
class Product:
"""Encapsulation of what we expect a 'Product' to look like in this generator."""

friendly_name: str
is_mobile_kpi: bool = False


class MobileProducts(Enum):
"""Enumeration with browser names and equivalent dataset names."""

fenix = Product(
friendly_name="Fenix",
is_mobile_kpi=True,
)
# focus_android = Product(
# friendly_name="Focus Android",
# is_mobile_kpi=True,
# )
# klar_android = Product(
# friendly_name="Klar Android",
# )
# firefox_ios = Product(
# friendly_name="Firefox iOS",
# is_mobile_kpi=True,
# )
# focus_ios = Product(
# friendly_name="Focus iOS",
# is_mobile_kpi=True,
# )
# klar_ios = Product(
# friendly_name="Klar iOS",
# )


@click.command()
@click.option(
"--output-dir",
"--output_dir",
help="Output directory generated SQL is written to",
type=click.Path(file_okay=False),
default="sql",
)
@click.option(
"--target-project",
"--target_project",
help="Google Cloud project ID",
default="moz-fx-data-shared-prod",
)
@use_cloud_function_option
def generate(target_project, output_dir, use_cloud_function):
"""Generate per-app queries, views and metadata for active users and search counts aggregates.
The parent folders will be created if not existing and existing files will be overwritten.
"""
env = Environment(loader=FileSystemLoader(str(GENERATOR_ROOT / "templates")))
output_dir = Path(output_dir) / target_project

default_template_args = {
"header": HEADER,
"version": VERSION,
"project_id": target_project,
"bigeye_collection": BIGEYE_COLLECTION,
}

query_support_configs = (
"checks.sql",
"metadata.yaml",
"schema.yaml",
"bigconfig.yml",
)

for template in TEMPLATES:
for product in MobileProducts:

target_name, target_filename, target_extension = template.split(".")

target_dataset = (
product.name + "_derived"
if target_filename == "query"
else product.name
)

table_id = f"{target_project}.{target_dataset}.{target_name}"

full_table_id = (
table_id + f"_{VERSION}" if target_filename == "query" else table_id
)

product_args = {
"dataset": product.name,
"target_name": target_name,
"app_name": product.name,
"name": target_name,
}

sql_template = env.get_template(template)
rendered_sql = reformat(
sql_template.render(
**default_template_args,
**asdict(product.value),
**product_args,
)
)

write_sql(
output_dir=output_dir,
full_table_id=full_table_id,
basename=f"{target_filename}.{target_extension}",
sql=rendered_sql,
skip_existing=False,
)

# we only want to copy files defined in query_support_configs for query files.
if target_filename != "query":
continue

for query_support_config in query_support_configs:
support_config_template = env.get_template(
f"{target_name}.{query_support_config}"
)
support_config_rendered = support_config_template.render(
**default_template_args,
**asdict(product.value),
**product_args,
format=False,
)

write_sql(
output_dir=output_dir,
full_table_id=full_table_id,
basename=query_support_config,
sql=(
reformat(support_config_rendered)
if query_support_config.endswith(".sql")
else support_config_rendered
),
skip_existing=False,
)

# we only want to generate a union view inside telemetry for views
if target_filename != "view":
continue

target_dataset = "telemetry"

union_target_name = f"mobile_{target_name}"

union_sql_template = env.get_template("union.view.sql")
union_sql_rendered = union_sql_template.render(
**default_template_args,
dataset=target_dataset,
name=target_name,
target_name=union_target_name,
target_filename=target_filename,
format=False,
products=[
{
"name": product.name,
}
for product in MobileProducts
],
)

write_sql(
output_dir=output_dir,
full_table_id=f"{target_project}.{target_dataset}.{union_target_name}",
basename=f"{target_filename}.{target_extension}",
sql=(reformat(union_sql_rendered)),
skip_existing=False,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{{ header }}
SELECT
submission_date,
client_id,
app_name,
normalized_channel,
EXTRACT(YEAR FROM first_seen_date) AS first_seen_year,
submission_date = first_seen_date AS is_new_profile,
IFNULL(country, '??') country,
city,
COALESCE(REGEXP_EXTRACT(locale, r'^(.+?)-'), locale, NULL) AS locale,
normalized_os,
-- normalized_os_version,
COALESCE(
SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(0)], "") AS INTEGER),
0
) AS os_version_major,
COALESCE(
SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(1)], "") AS INTEGER),
0
) AS os_version_minor,
COALESCE(
SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(2)], "") AS INTEGER),
0
) AS os_version_patch,
app_display_version AS app_version,
device_model,
distribution_id,
activity_segment AS segment,
COUNTIF(is_daily_user) AS daily_users,
COUNTIF(is_weekly_user) AS weekly_users,
COUNTIF(is_monthly_user) AS monthly_users,
COUNTIF(is_dau) AS dau,
COUNTIF(is_wau) AS wau,
COUNTIF(is_mau) AS mau,
-- SUM(uri_count) AS uri_count,
-- SUM(active_hours_sum) AS active_hours,
FROM
`{{ project_id }}.{{ dataset }}.reporting_users`
WHERE
submission_date = @submission_date
GROUP BY
submission_date,
client_id,
app_name,
normalized_channel,
first_seen_year,
is_new_profile,
country,
city,
locale,
normalized_os,
os_version_major,
os_version_minor,
os_version_patch,
app_version,
device_model,
distribution_id,
segment
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{{ header }}
CREATE OR REPLACE VIEW
`{{ project_id }}.{{ dataset }}.{{ name }}`
AS
SELECT
*
FROM
`{{ project_id }}.{{ dataset }}.{{ name }}_{{ version }}`
51 changes: 51 additions & 0 deletions sql_generators/kpi_reporting/templates/reporting_users.view.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
{{ header }}
CREATE OR REPLACE VIEW
`{{ project_id }}.{{ dataset }}.{{ name }}`
AS
SELECT
* EXCEPT (isp),
CASE
WHEN LOWER(isp) = "browserstack"
THEN CONCAT("{{ friendly_name }}", " ", isp)
WHEN LOWER(distribution_id) = "mozillaonline"
THEN CONCAT("{{ friendly_name }}", " ", distribution_id)
ELSE "{{ friendly_name }}"
END AS app_name,
-- Activity fields to support metrics built on top of activity
CASE
WHEN BIT_COUNT(days_active_bits)
BETWEEN 1
AND 6
THEN "infrequent_user"
WHEN BIT_COUNT(days_active_bits)
BETWEEN 7
AND 13
THEN "casual_user"
WHEN BIT_COUNT(days_active_bits)
BETWEEN 14
AND 20
THEN "regular_user"
WHEN BIT_COUNT(days_active_bits) >= 21
THEN "core_user"
ELSE "other"
END AS activity_segment,
IFNULL(mozfun.bits28.days_since_seen(days_active_bits) = 0, FALSE) AS is_dau,
IFNULL(mozfun.bits28.days_since_seen(days_active_bits) < 7, FALSE) AS is_wau,
IFNULL(mozfun.bits28.days_since_seen(days_active_bits) < 28, FALSE) AS is_mau,
-- Metrics based on pings sent
IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) = 0, FALSE) AS is_daily_user,
IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) < 7, FALSE) AS is_weekly_user,
IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) < 28, FALSE) AS is_monthly_user,
{% if is_mobile_kpi %}
(
LOWER(IFNULL(isp, "")) <> "browserstack"
AND LOWER(IFNULL(distribution_id, "")) <> "mozillaonline"
)
{% else %}
FALSE
{% endif %} AS is_mobile,
-- Adding isp at the end because it's in different column index in baseline table for some products.
-- This is to make sure downstream union works as intended.
isp,
FROM
`{{ project_id }}.{{ dataset }}.dau_reporting_last_seen`
14 changes: 14 additions & 0 deletions sql_generators/kpi_reporting/templates/union.view.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
{{ header }}
CREATE OR REPLACE VIEW
`{{ project_id }}.{{ dataset }}.{{ target_name }}`
AS
{% for product in products %}
{% if not loop.first %}
UNION ALL
{% endif %}
SELECT
*,
"{{ product.name }}" AS product_name,
FROM
`{{ project_id }}.{{ product.name }}.{{ name }}`
{% endfor %}

0 comments on commit 26503d2

Please sign in to comment.