Skip to content

Commit

Permalink
feat: initial commit with all usage_reporting templates and configura…
Browse files Browse the repository at this point in the history
…tion
  • Loading branch information
kik-kik committed Nov 29, 2024
1 parent 6a77eac commit 0b452ba
Show file tree
Hide file tree
Showing 21 changed files with 497 additions and 8 deletions.
12 changes: 9 additions & 3 deletions sql_generators/glean_usage/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
glean_app_ping_views,
metrics_clients_daily,
metrics_clients_last_seen,
usage_reporting_clients_daily,
usage_reporting_clients_first_seen,
usage_reporting_clients_last_seen,
)
from sql_generators.glean_usage.common import get_app_info, list_tables

Expand All @@ -43,6 +46,9 @@
event_error_monitoring.EventErrorMonitoring(),
event_flow_monitoring.EventFlowMonitoring(),
events_stream.EventsStreamTable(),
usage_reporting_clients_daily.DauReportingClientsDailyTable(),
usage_reporting_clients_first_seen.DauReportingClientsFirstSeenTable(),
usage_reporting_clients_last_seen.DauReportingClientsLastSeenTable(),
]


Expand Down Expand Up @@ -136,7 +142,7 @@ def get_tables(table_name="baseline_v1"):
not in ConfigLoader.get("generate", "glean_usage", "skip_apps", fallback=[])
]

id_token=get_id_token()
id_token = get_id_token()

# Prepare parameters so that generation of all Glean datasets can be done in parallel

Expand All @@ -151,7 +157,7 @@ def get_tables(table_name="baseline_v1"):
use_cloud_function=use_cloud_function,
app_info=app_info,
parallelism=parallelism,
id_token=id_token
id_token=id_token,
),
baseline_table,
)
Expand All @@ -169,7 +175,7 @@ def get_tables(table_name="baseline_v1"):
output_dir=output_dir,
use_cloud_function=use_cloud_function,
parallelism=parallelism,
id_token=id_token
id_token=id_token,
),
info,
)
Expand Down
13 changes: 10 additions & 3 deletions sql_generators/glean_usage/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,13 @@ def table_names_from_baseline(baseline_table, include_project_id=True):
events_view=f"{prefix}.events",
events_stream_table=f"{prefix}_derived.events_stream_v1",
events_stream_view=f"{prefix}.events_stream",
usage_reporting_stable_table=f"{prefix}_stable.usage_reporting_v1",
usage_reporting_clients_daily_table=f"{prefix}_derived.usage_reporting_clients_daily_v1",
usage_reporting_clients_first_seen_table=f"{prefix}_derived.usage_reporting_clients_first_seen_v1",
usage_reporting_clients_last_seen_table=f"{prefix}_derived.usage_reporting_clients_last_seen_v1",
usage_reporting_clients_daily_view=f"{prefix}.usage_reporting_clients_daily",
usage_reporting_clients_first_seen_view=f"{prefix}.usage_reporting_clients_first_seen",
usage_reporting_clients_last_seen_view=f"{prefix}.usage_reporting_clients_last_seen",
)


Expand Down Expand Up @@ -234,7 +241,7 @@ def generate_per_app_id(
use_cloud_function=True,
app_info=[],
parallelism=8,
id_token=None
id_token=None,
):
"""Generate the baseline table query per app_id."""
if not self.per_app_id_enabled:
Expand Down Expand Up @@ -268,7 +275,7 @@ def generate_per_app_id(
derived_dataset=derived_dataset,
app_name=app_name,
has_distribution_id=app_name in APPS_WITH_DISTRIBUTION_ID,
has_profile_group_id= app_name in APPS_WITH_PROFILE_GROUP_ID,
has_profile_group_id=app_name in APPS_WITH_PROFILE_GROUP_ID,
)

render_kwargs.update(self.custom_render_kwargs)
Expand Down Expand Up @@ -364,7 +371,7 @@ def generate_per_app(
output_dir=None,
use_cloud_function=True,
parallelism=8,
id_token=None
id_token=None,
):
"""Generate the baseline table query per app_name."""
if not self.per_app_enabled:
Expand Down
4 changes: 2 additions & 2 deletions sql_generators/glean_usage/templates/cross_channel.view.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ AS
UNION ALL
{% endif -%}
{% if app_name == "fenix" -%}
SELECT
SELECT
"{{ dataset }}" AS normalized_app_id,
* REPLACE(mozfun.norm.fenix_app_info("{{ dataset }}", app_build).channel AS normalized_channel),
{% else -%}
SELECT
SELECT
"{{ dataset }}" AS normalized_app_id,
* REPLACE("{{ channel }}" AS normalized_channel)
{% endif -%}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{{ header_yaml }}
friendly_name: Clients Daily Based on the DAU Reporting Ping.
description: |-
A daily aggregate of usage_reporting pings per `usage_profile_id`.
Cluster by: `normalized_channel`, `locale`
owners:
- [email protected]
labels:
incremental: true
schedule: daily
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{{ header }}
-- View over the usage_reporting clients_daily table: selects every column
-- from the table unchanged, with no filtering or transformation.
CREATE OR REPLACE VIEW
`{{ project_id }}.{{ usage_reporting_clients_daily_view }}`
AS
SELECT
*
FROM
`{{ project_id }}.{{ usage_reporting_clients_daily_table }}`
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{{ header_yaml }}
friendly_name: Clients Daily Based on the DAU Reporting Ping.
description: |-
A daily aggregate of usage_reporting pings per `usage_profile_id`.
Cluster by: `normalized_channel`, `locale`
owners:
- [email protected]
labels:
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_glean_usage
task_group: {{ app_name }}
bigquery:
time_partitioning:
type: day
field: submission_date
require_partition_filter: true
clustering:
fields:
- normalized_channel
- locale
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
{{ header }}

-- Daily aggregate of usage_reporting pings: emits one row per
-- (usage_profile_id, submission_date). Dimensions are reduced with
-- udf.mode_last over all of the profile's pings for that day (window w1),
-- and QUALIFY keeps a single row per profile and day.
WITH base AS (
  SELECT
    submission_timestamp,
    DATE(submission_timestamp) AS submission_date,
    metrics.uuid.usage_profile_id,
    normalized_channel,
    client_info.app_display_version,
    client_info.app_build,
    normalized_os,
    normalized_os_version,
    client_info.locale,
    {% if has_distribution_id %}
    metrics.string.metrics_distribution_id AS distribution_id,
    {% else %}
    CAST(NULL AS STRING) AS distribution_id,
    {% endif %}
    {% if "_desktop" in app_name %}
    metrics.counter.browser_engagement_uri_count,
    metrics.counter.browser_engagement_active_ticks,
    {% endif %}
    -- Placeholder: always NULL here, so the COALESCE in the outer query
    -- falls through to the computed activity expression.
    CAST(NULL AS BOOLEAN) AS is_active,
    SAFE.PARSE_DATE('%F', SUBSTR(client_info.first_run_date, 1, 10)) AS first_run_date,
  FROM
    `{{ project_id }}.{{ usage_reporting_stable_table }}`
  WHERE
    -- WHERE cannot reference SELECT-list aliases, so filter on the source
    -- field rather than on the `usage_profile_id` alias.
    metrics.uuid.usage_profile_id IS NOT NULL
)
SELECT
  submission_date,
  usage_profile_id,
  --
  -- Take the earliest first_run_date if ambiguous.
  MIN(first_run_date) OVER w1 AS first_run_date,
  -- For all other dimensions, we use the mode of observed values in the day.
  udf.mode_last(ARRAY_AGG(normalized_channel) OVER w1) AS normalized_channel,
  udf.mode_last(ARRAY_AGG(normalized_os) OVER w1) AS normalized_os,
  udf.mode_last(ARRAY_AGG(normalized_os_version) OVER w1) AS normalized_os_version,
  udf.mode_last(ARRAY_AGG(locale) OVER w1) AS locale,
  udf.mode_last(ARRAY_AGG(app_build) OVER w1) AS app_build,
  udf.mode_last(ARRAY_AGG(app_display_version) OVER w1) AS app_display_version,
  udf.mode_last(ARRAY_AGG(distribution_id) OVER w1) AS distribution_id,
  {% if "_desktop" in app_name %}
  -- The sums must be window functions (OVER w1): this query has no GROUP BY,
  -- so bare aggregates cannot be mixed with the per-row columns above.
  COALESCE(
    is_active,
    SUM(browser_engagement_uri_count) OVER w1 > 0
    AND SUM(browser_engagement_active_ticks) OVER w1 > 0,
    FALSE
  ) AS is_active,
  {% else %}
  -- At the moment we do not have duration, default to True.
  -- COALESCE(is_active, SUM(IF(duration BETWEEN 0 AND 100000, duration, 0)) OVER w1 > 0, False) AS is_active,
  TRUE AS is_active,
  {% endif %}
FROM
  base
WHERE
  {# The raw block keeps the is_init() branch unrendered by this template pass. #}
  {% raw %}
  {% if is_init() %}
  submission_date >= '2024-10-10'
  {% else %}
  submission_date = @submission_date
  {% endif %}
  {% endraw %}
QUALIFY
  -- Keep one output row per profile and day; the window expressions above
  -- already span the full partition, so any single row carries the aggregates.
  ROW_NUMBER() OVER (
    PARTITION BY
      usage_profile_id,
      submission_date
    ORDER BY
      submission_timestamp
  ) = 1

WINDOW
  w1 AS (
    PARTITION BY
      usage_profile_id,
      submission_date
    ORDER BY
      submission_timestamp
    ROWS BETWEEN
      UNBOUNDED PRECEDING
      AND UNBOUNDED FOLLOWING
  )
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
fields:
- mode: NULLABLE
name: submission_date
type: DATE
description: |
Logical date used for processing and partitioning.
- mode: NULLABLE
name: usage_profile_id
type: STRING
description: |
Profile identifier taken from `metrics.uuid.usage_profile_id` in the usage_reporting ping.

- mode: NULLABLE
name: first_run_date
type: DATE
description: |
The date of the first run of the application.
- mode: NULLABLE
name: normalized_channel
type: STRING
description: |
The channel the application is being distributed on.
- mode: NULLABLE
name: normalized_os
type: STRING
description: |
The name of the operating system.
- mode: NULLABLE
name: normalized_os_version
type: STRING
description: |
The user-visible version of the operating system (e.g. "1.2.3").
If the version detection fails, this metric gets set to Unknown.
- mode: NULLABLE
name: locale
type: STRING
description: |
The locale of the application during initialization (e.g. "es-ES").
If the locale can't be determined on the system, the value is "und", to indicate "undetermined".
- mode: NULLABLE
name: app_build
type: STRING
description: |
The build identifier generated by the CI system (e.g. "1234/A").
If the value was not provided through configuration, this metric gets set to Unknown.
- mode: NULLABLE
name: app_display_version
type: STRING
description: |
The user visible version string (e.g. "1.0.3").
If the value was not provided through configuration, this metric gets set to Unknown.
- mode: NULLABLE
name: distribution_id
type: STRING
description: |
A string containing the distribution identifier. This was used to identify installs
from Mozilla Online, but now also identifies partnership deal distributions.
- mode: NULLABLE
name: is_active
type: BOOLEAN
description: |
A flag field indicating whether the specific client was active.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{{ header_yaml }}
friendly_name: Clients First Seen Based on the DAU Reporting Ping.
description: |-
A representation of when we saw each `usage_profile_id`
for the first time based on the usage_reporting ping.
owners:
- [email protected]
labels:
incremental: true
schedule: daily
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{{ header }}
-- View over the usage_reporting clients_first_seen table: selects every
-- column from the table unchanged, with no filtering or transformation.
CREATE OR REPLACE VIEW
`{{ project_id }}.{{ usage_reporting_clients_first_seen_view }}`
AS
SELECT
*
FROM
`{{ project_id }}.{{ usage_reporting_clients_first_seen_table }}`
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{{ header_yaml }}
friendly_name: Clients First Seen Based on the DAU Reporting Ping.
description: |-
A representation of when we saw each `usage_profile_id`
for the first time based on the usage_reporting ping.
owners:
- [email protected]
labels:
incremental: true
schedule: daily
scheduling:
dag_name: bqetl_glean_usage
task_group: {{ app_name }}
bigquery:
time_partitioning:
type: day
field: first_seen_date
require_partition_filter: false
Loading

0 comments on commit 0b452ba

Please sign in to comment.