Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(DENG-4998): create ETL for computing KPIs using the new dau_reporting ping #6365

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 193 additions & 0 deletions sql_generators/kpi_reporting/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
"""Generate active users aggregates per app."""

from dataclasses import asdict, dataclass
from enum import Enum
from os import path
from pathlib import Path

import click
from jinja2 import Environment, FileSystemLoader

from bigquery_etl.cli.utils import use_cloud_function_option
from bigquery_etl.format_sql.formatter import reformat
from bigquery_etl.util.common import write_sql

GENERATOR_ROOT = Path(path.dirname(__file__))

HEADER = f"-- Query generated via `{GENERATOR_ROOT.name}` SQL generator."
VERSION = "v1"
TEMPLATES = ("active_users.view.sql",)
BIGEYE_COLLECTION = "Browser Metrics KPI"


@dataclass
class Product:
"""Encapsulation of what we expect a 'Product' to look like in this generator."""

friendly_name: str
is_mobile_kpi: bool = False


class MobileProducts(Enum):
"""Enumeration with browser names and equivalent dataset names."""

fenix = Product(
friendly_name="Fenix",
is_mobile_kpi=True,
)
# focus_android = Product(
# friendly_name="Focus Android",
# is_mobile_kpi=True,
# )
# klar_android = Product(
# friendly_name="Klar Android",
# )
# firefox_ios = Product(
# friendly_name="Firefox iOS",
# is_mobile_kpi=True,
# )
# focus_ios = Product(
# friendly_name="Focus iOS",
# is_mobile_kpi=True,
# )
# klar_ios = Product(
# friendly_name="Klar iOS",
# )


@click.command()
@click.option(
"--output-dir",
"--output_dir",
help="Output directory generated SQL is written to",
type=click.Path(file_okay=False),
default="sql",
)
@click.option(
"--target-project",
"--target_project",
help="Google Cloud project ID",
default="moz-fx-data-shared-prod",
)
@use_cloud_function_option
def generate(target_project, output_dir, use_cloud_function):
"""Generate per-app queries, views and metadata for active users and search counts aggregates.

The parent folders will be created if not existing and existing files will be overwritten.
"""
env = Environment(loader=FileSystemLoader(str(GENERATOR_ROOT / "templates")))
output_dir = Path(output_dir) / target_project

default_template_args = {
"header": HEADER,
"version": VERSION,
"project_id": target_project,
"bigeye_collection": BIGEYE_COLLECTION,
}

query_support_configs = (
"checks.sql",
"metadata.yaml",
"schema.yaml",
"bigconfig.yml",
)

for template in TEMPLATES:
for product in MobileProducts:

target_name, target_filename, target_extension = template.split(".")

target_dataset = (
product.name + "_derived"
if target_filename == "query"
else product.name
)

table_id = f"{target_project}.{target_dataset}.{target_name}"

full_table_id = (
table_id + f"_{VERSION}" if target_filename == "query" else table_id
)

product_args = {
"dataset": product.name,
"target_name": target_name,
"app_name": product.name,
"name": target_name,
}

sql_template = env.get_template(template)
rendered_sql = reformat(
sql_template.render(
**default_template_args,
**asdict(product.value),
**product_args,
)
)

write_sql(
output_dir=output_dir,
full_table_id=full_table_id,
basename=f"{target_filename}.{target_extension}",
sql=rendered_sql,
skip_existing=False,
)

# we only want to copy files defined in query_support_configs for query files.
if target_filename != "query":
continue

for query_support_config in query_support_configs:
support_config_template = env.get_template(
f"{target_name}.{query_support_config}"
)
support_config_rendered = support_config_template.render(
**default_template_args,
**asdict(product.value),
**product_args,
format=False,
)

write_sql(
output_dir=output_dir,
full_table_id=full_table_id,
basename=query_support_config,
sql=(
reformat(support_config_rendered)
if query_support_config.endswith(".sql")
else support_config_rendered
),
skip_existing=False,
)

# we only want to generate a union view inside telemetry for views
if target_filename != "view":
continue

target_dataset = "telemetry"

union_target_name = f"mobile_{target_name}"

union_sql_template = env.get_template("union.view.sql")
union_sql_rendered = union_sql_template.render(
**default_template_args,
dataset=target_dataset,
name=target_name,
target_name=union_target_name,
target_filename=target_filename,
format=False,
products=[
{
"name": product.name,
}
for product in MobileProducts
],
)

write_sql(
output_dir=output_dir,
full_table_id=f"{target_project}.{target_dataset}.{union_target_name}",
basename=f"{target_filename}.{target_extension}",
sql=(reformat(union_sql_rendered)),
skip_existing=False,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
type: BIGCONFIG_FILE
table_deployments:
- collection:
name: {{ bigeye_collection }}
deployments:
- fq_table_name: {{ project_id }}.{{ project_id }}.{{ dataset }}_derived.{{ name }}_{{ version }}
table_metrics:
- metric_type:
type: PREDEFINED
predefined_metric: FRESHNESS
metric_schedule:
named_schedule:
name: Default Schedule - 13:00 UTC
- metric_type:
type: PREDEFINED
predefined_metric: VOLUME
metric_schedule:
named_schedule:
name: Default Schedule - 13:00 UTC
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
friendly_name: Active User Aggregates / KPI reporting - {{ friendly_name }}
description: |-
Used for reporting our core KPI metrics ({{ friendly_name }})

owners:
- mozilla/kpi_table_reviewers
- [email protected]
labels:
schedule: daily
incremental: true
change_controlled: true
scheduling:
dag_name: bqetl_analytics_aggregations
depends_on_past: false
task_group: {{ app_name }}
bigquery:
time_partitioning:
type: day
field: submission_date
require_partition_filter: false
clustering:
fields:
- app_name
- normalized_channel
- country
monitoring:
enabled: true
collection: {{ bigeye_collection }}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{{ header }}
SELECT
submission_date,
app_name,
normalized_channel,
first_seen_year,
is_new_profile,
country,
locale,
-- normalized_os,
os_version_major,
os_version_minor,
os_version_patch,
app_version,
distribution_id,
segment,
COUNTIF(is_daily_user) AS daily_users,
COUNTIF(is_weekly_user) AS weekly_users,
COUNTIF(is_monthly_user) AS monthly_users,
COUNTIF(is_dau) AS dau,
COUNTIF(is_wau) AS wau,
COUNTIF(is_mau) AS mau,
-- SUM(uri_count) AS uri_count,
-- SUM(active_hours_sum) AS active_hours,
FROM
`{{ project_id }}.{{ dataset }}.reporting_users`
WHERE
submission_date = @submission_date
GROUP BY
submission_date,
app_name,
normalized_channel,
first_seen_year,
is_new_profile,
country,
locale,
os_version_major,
os_version_minor,
os_version_patch,
app_version,
distribution_id,
segment
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{{ header }}
CREATE OR REPLACE VIEW
`{{ project_id }}.{{ dataset }}.{{ name }}`
AS
SELECT
*
FROM
`{{ project_id }}.{{ dataset }}.{{ name }}_{{ version }}`
70 changes: 70 additions & 0 deletions sql_generators/kpi_reporting/templates/reporting_users.view.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{{ header }}
CREATE OR REPLACE VIEW
`{{ project_id }}.{{ dataset }}.{{ name }}`
AS
SELECT
* EXCEPT (isp, normalized_os_version, app_display_version)
REPLACE(
COALESCE(REGEXP_EXTRACT(locale, r'^(.+?)-'), locale, NULL) AS locale,
IFNULL(country, '??') AS country
),
CASE
WHEN LOWER(isp) = "browserstack"
THEN CONCAT("{{ friendly_name }}", " ", isp)
WHEN LOWER(distribution_id) = "mozillaonline"
THEN CONCAT("{{ friendly_name }}", " ", distribution_id)
ELSE "{{ friendly_name }}"
END AS app_name,
-- Activity fields to support metrics built on top of activity
CASE
WHEN BIT_COUNT(days_active_bits)
BETWEEN 1
AND 6
THEN "infrequent_user"
WHEN BIT_COUNT(days_active_bits)
BETWEEN 7
AND 13
THEN "casual_user"
WHEN BIT_COUNT(days_active_bits)
BETWEEN 14
AND 20
THEN "regular_user"
WHEN BIT_COUNT(days_active_bits) >= 21
THEN "core_user"
ELSE "other"
END AS segment,
COALESCE(
SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(0)], "") AS INTEGER),
0
) AS os_version_major,
COALESCE(
SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(1)], "") AS INTEGER),
0
) AS os_version_minor,
COALESCE(
SAFE_CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(2)], "") AS INTEGER),
0
) AS os_version_patch,
app_display_version AS app_version,
EXTRACT(YEAR FROM first_seen_date) AS first_seen_year,
submission_date = first_seen_date AS is_new_profile,
IFNULL(mozfun.bits28.days_since_seen(days_active_bits) = 0, FALSE) AS is_dau,
IFNULL(mozfun.bits28.days_since_seen(days_active_bits) < 7, FALSE) AS is_wau,
IFNULL(mozfun.bits28.days_since_seen(days_active_bits) < 28, FALSE) AS is_mau,
-- Metrics based on pings sent
IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) = 0, FALSE) AS is_daily_user,
IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) < 7, FALSE) AS is_weekly_user,
IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) < 28, FALSE) AS is_monthly_user,
{% if is_mobile_kpi %}
(
LOWER(IFNULL(isp, "")) <> "browserstack"
AND LOWER(IFNULL(distribution_id, "")) <> "mozillaonline"
)
{% else %}
FALSE
{% endif %} AS is_mobile,
-- Adding isp at the end because it's in different column index in baseline table for some products.
-- This is to make sure downstream union works as intended.
isp,
FROM
`{{ project_id }}.{{ dataset }}.dau_reporting_last_seen`
Loading