Skip to content

Commit

Permalink
Remove unneeded group bys from time spine dataset (#1453)
Browse files Browse the repository at this point in the history
We've been applying an unnecessary group by to the time spine dataset.
Previously, we were applying group bys whenever a `DATE_TRUNC` column
was included in the select statement, _even if_ the base grain was
included in the select statement. These group bys are unnecessary,
because if at least one select column has no `DATE_TRUNC` applied (i.e. it
is selected at the base grain), every row is already distinct, so none of
the rows will be changed by the group by. For example:
```
SELECT
  ds
  , DATE_TRUNC(ds, month)
FROM time_spine
GROUP BY
  ds, DATE_TRUNC(ds, month)
```
will return the same result as:
```
SELECT
  ds
  , DATE_TRUNC(ds, month)
FROM time_spine
```
The results are the same, but the first query is inefficient because of
the unnecessary group by. This PR removes the unnecessary group bys.
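Note that if the base grain is not included, the group by is still
necessary, because multiple time spine rows truncate to the same value and
would otherwise produce duplicate rows.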

I recommend reviewing by commit because the number of snapshot changes
is very large.
  • Loading branch information
courtneyholcomb authored Oct 10, 2024
1 parent 9bc8910 commit 10138b0
Show file tree
Hide file tree
Showing 142 changed files with 505 additions and 1,402 deletions.
6 changes: 6 additions & 0 deletions .changes/unreleased/Fixes-20241009-174346.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
kind: Fixes
body: Remove unnecessary group bys that make queries less efficient.
time: 2024-10-09T17:43:46.011252-07:00
custom:
Author: courtneyholcomb
Issue: "1453"
8 changes: 4 additions & 4 deletions metricflow/plan_conversion/dataflow_to_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,18 +263,18 @@ def _make_time_spine_data_set(
table_alias=time_spine_table_alias, column_name=time_spine_source.base_column
)
select_columns: Tuple[SqlSelectColumn, ...] = ()
apply_group_by = False
apply_group_by = True
for agg_time_dimension_instance in agg_time_dimension_instances:
column_alias = self.column_association_resolver.resolve_spec(agg_time_dimension_instance.spec).column_name
# If the requested granularity is the same as the granularity of the spine, do a direct select.
# TODO: also handle date part.
agg_time_grain = agg_time_dimension_instance.spec.time_granularity
assert (
not agg_time_grain.is_custom_granularity
), "Custom time granularities are not yet supported for all queries."
if agg_time_grain.base_granularity == time_spine_source.base_granularity:
select_columns += (SqlSelectColumn(expr=column_expr, column_alias=column_alias),)
# If any columns have a different granularity, apply a DATE_TRUNC() and aggregate via group_by.
apply_group_by = False
# If any columns have a different granularity, apply a DATE_TRUNC().
else:
select_columns += (
SqlSelectColumn(
Expand All @@ -284,7 +284,7 @@ def _make_time_spine_data_set(
column_alias=column_alias,
),
)
apply_group_by = True
# TODO: also handle date part.

return SqlDataSet(
instance_set=time_spine_instance_set,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,6 @@ FROM (
, DATETIME_TRUNC(subq_3.ds, isoweek) AS metric_time__week
, DATETIME_TRUNC(subq_3.ds, quarter) AS metric_time__quarter
FROM ***************************.mf_time_spine subq_3
GROUP BY
metric_time__day
, metric_time__week
, metric_time__quarter
) subq_2
INNER JOIN (
-- Metric Time Dimension 'ds'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,27 +21,16 @@ FROM (
-- Aggregate Measures
-- Compute Metrics via Expressions
SELECT
subq_11.metric_time__day AS metric_time__day
, subq_11.metric_time__week AS metric_time__week
, subq_11.metric_time__quarter AS metric_time__quarter
subq_12.ds AS metric_time__day
, DATETIME_TRUNC(subq_12.ds, isoweek) AS metric_time__week
, DATETIME_TRUNC(subq_12.ds, quarter) AS metric_time__quarter
, SUM(revenue_src_28000.revenue) AS revenue_all_time
FROM (
-- Time Spine
SELECT
ds AS metric_time__day
, DATETIME_TRUNC(ds, isoweek) AS metric_time__week
, DATETIME_TRUNC(ds, quarter) AS metric_time__quarter
FROM ***************************.mf_time_spine subq_12
GROUP BY
metric_time__day
, metric_time__week
, metric_time__quarter
) subq_11
FROM ***************************.mf_time_spine subq_12
INNER JOIN
***************************.fct_revenue revenue_src_28000
ON
(
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_11.metric_time__day
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_12.ds
)
GROUP BY
metric_time__day
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,6 @@ FROM (
DATETIME_TRUNC(subq_3.ds, month) AS revenue_instance__ds__month
, subq_3.ds AS metric_time__day
FROM ***************************.mf_time_spine subq_3
GROUP BY
revenue_instance__ds__month
, metric_time__day
) subq_2
INNER JOIN (
-- Metric Time Dimension 'ds'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,17 @@
-- Aggregate Measures
-- Compute Metrics via Expressions
SELECT
subq_9.revenue_instance__ds__month AS revenue_instance__ds__month
, subq_9.metric_time__day AS metric_time__day
DATETIME_TRUNC(subq_10.ds, month) AS revenue_instance__ds__month
, subq_10.ds AS metric_time__day
, SUM(revenue_src_28000.revenue) AS trailing_2_months_revenue
FROM (
-- Time Spine
SELECT
DATETIME_TRUNC(ds, month) AS revenue_instance__ds__month
, ds AS metric_time__day
FROM ***************************.mf_time_spine subq_10
GROUP BY
revenue_instance__ds__month
, metric_time__day
) subq_9
FROM ***************************.mf_time_spine subq_10
INNER JOIN
***************************.fct_revenue revenue_src_28000
ON
(
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_9.metric_time__day
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_10.ds
) AND (
DATETIME_TRUNC(revenue_src_28000.created_at, day) > DATE_SUB(CAST(subq_9.metric_time__day AS DATETIME), INTERVAL 2 month)
DATETIME_TRUNC(revenue_src_28000.created_at, day) > DATE_SUB(CAST(subq_10.ds AS DATETIME), INTERVAL 2 month)
)
GROUP BY
revenue_instance__ds__month
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,6 @@ FROM (
subq_3.ds AS revenue_instance__ds__day
, DATETIME_TRUNC(subq_3.ds, month) AS revenue_instance__ds__month
FROM ***************************.mf_time_spine subq_3
GROUP BY
revenue_instance__ds__day
, revenue_instance__ds__month
) subq_2
INNER JOIN (
-- Metric Time Dimension 'ds'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,17 @@
-- Aggregate Measures
-- Compute Metrics via Expressions
SELECT
subq_9.revenue_instance__ds__day AS revenue_instance__ds__day
, subq_9.revenue_instance__ds__month AS revenue_instance__ds__month
subq_10.ds AS revenue_instance__ds__day
, DATETIME_TRUNC(subq_10.ds, month) AS revenue_instance__ds__month
, SUM(revenue_src_28000.revenue) AS trailing_2_months_revenue
FROM (
-- Time Spine
SELECT
ds AS revenue_instance__ds__day
, DATETIME_TRUNC(ds, month) AS revenue_instance__ds__month
FROM ***************************.mf_time_spine subq_10
GROUP BY
revenue_instance__ds__day
, revenue_instance__ds__month
) subq_9
FROM ***************************.mf_time_spine subq_10
INNER JOIN
***************************.fct_revenue revenue_src_28000
ON
(
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_9.revenue_instance__ds__day
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_10.ds
) AND (
DATETIME_TRUNC(revenue_src_28000.created_at, day) > DATE_SUB(CAST(subq_9.revenue_instance__ds__day AS DATETIME), INTERVAL 2 month)
DATETIME_TRUNC(revenue_src_28000.created_at, day) > DATE_SUB(CAST(subq_10.ds AS DATETIME), INTERVAL 2 month)
)
GROUP BY
revenue_instance__ds__day
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,6 @@ FROM (
subq_3.ds AS metric_time__day
, DATETIME_TRUNC(subq_3.ds, month) AS metric_time__month
FROM ***************************.mf_time_spine subq_3
GROUP BY
metric_time__day
, metric_time__month
) subq_2
INNER JOIN (
-- Metric Time Dimension 'ds'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,17 @@
-- Aggregate Measures
-- Compute Metrics via Expressions
SELECT
subq_9.metric_time__day AS metric_time__day
, subq_9.metric_time__month AS metric_time__month
subq_10.ds AS metric_time__day
, DATETIME_TRUNC(subq_10.ds, month) AS metric_time__month
, SUM(revenue_src_28000.revenue) AS trailing_2_months_revenue
FROM (
-- Time Spine
SELECT
ds AS metric_time__day
, DATETIME_TRUNC(ds, month) AS metric_time__month
FROM ***************************.mf_time_spine subq_10
GROUP BY
metric_time__day
, metric_time__month
) subq_9
FROM ***************************.mf_time_spine subq_10
INNER JOIN
***************************.fct_revenue revenue_src_28000
ON
(
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_9.metric_time__day
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_10.ds
) AND (
DATETIME_TRUNC(revenue_src_28000.created_at, day) > DATE_SUB(CAST(subq_9.metric_time__day AS DATETIME), INTERVAL 2 month)
DATETIME_TRUNC(revenue_src_28000.created_at, day) > DATE_SUB(CAST(subq_10.ds AS DATETIME), INTERVAL 2 month)
)
GROUP BY
metric_time__day
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,6 @@ FROM (
subq_3.ds AS metric_time__day
, DATETIME_TRUNC(subq_3.ds, isoweek) AS metric_time__week
FROM ***************************.mf_time_spine subq_3
GROUP BY
metric_time__day
, metric_time__week
) subq_2
INNER JOIN (
-- Metric Time Dimension 'ds'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,15 @@ FROM (
-- Aggregate Measures
-- Compute Metrics via Expressions
SELECT
subq_11.metric_time__day AS metric_time__day
, subq_11.metric_time__week AS metric_time__week
subq_12.ds AS metric_time__day
, DATETIME_TRUNC(subq_12.ds, isoweek) AS metric_time__week
, SUM(revenue_src_28000.revenue) AS revenue_all_time
FROM (
-- Time Spine
SELECT
ds AS metric_time__day
, DATETIME_TRUNC(ds, isoweek) AS metric_time__week
FROM ***************************.mf_time_spine subq_12
GROUP BY
metric_time__day
, metric_time__week
) subq_11
FROM ***************************.mf_time_spine subq_12
INNER JOIN
***************************.fct_revenue revenue_src_28000
ON
(
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_11.metric_time__day
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_12.ds
)
GROUP BY
metric_time__day
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,6 @@ FROM (
subq_3.ds AS metric_time__day
, DATETIME_TRUNC(subq_3.ds, isoweek) AS metric_time__week
FROM ***************************.mf_time_spine subq_3
GROUP BY
metric_time__day
, metric_time__week
) subq_2
INNER JOIN (
-- Metric Time Dimension 'ds'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,26 +18,17 @@ FROM (
-- Pass Only Elements: ['txn_revenue', 'metric_time__week', 'metric_time__day']
-- Aggregate Measures
SELECT
subq_12.metric_time__day AS metric_time__day
, subq_12.metric_time__week AS metric_time__week
subq_13.ds AS metric_time__day
, DATETIME_TRUNC(subq_13.ds, isoweek) AS metric_time__week
, SUM(revenue_src_28000.revenue) AS txn_revenue
FROM (
-- Time Spine
SELECT
ds AS metric_time__day
, DATETIME_TRUNC(ds, isoweek) AS metric_time__week
FROM ***************************.mf_time_spine subq_13
GROUP BY
metric_time__day
, metric_time__week
) subq_12
FROM ***************************.mf_time_spine subq_13
INNER JOIN
***************************.fct_revenue revenue_src_28000
ON
(
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_12.metric_time__day
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_13.ds
) AND (
DATETIME_TRUNC(revenue_src_28000.created_at, day) > DATE_SUB(CAST(subq_12.metric_time__day AS DATETIME), INTERVAL 2 month)
DATETIME_TRUNC(revenue_src_28000.created_at, day) > DATE_SUB(CAST(subq_13.ds AS DATETIME), INTERVAL 2 month)
)
GROUP BY
metric_time__day
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,6 @@ FROM (
subq_3.ds AS metric_time__day
, DATETIME_TRUNC(subq_3.ds, month) AS metric_time__month
FROM ***************************.mf_time_spine subq_3
GROUP BY
metric_time__day
, metric_time__month
) subq_2
INNER JOIN (
-- Metric Time Dimension 'ds'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,17 @@ FROM (
-- Aggregate Measures
-- Compute Metrics via Expressions
SELECT
subq_11.metric_time__day AS metric_time__day
, subq_11.metric_time__month AS metric_time__month
subq_12.ds AS metric_time__day
, DATETIME_TRUNC(subq_12.ds, month) AS metric_time__month
, SUM(revenue_src_28000.revenue) AS revenue_mtd
FROM (
-- Time Spine
SELECT
ds AS metric_time__day
, DATETIME_TRUNC(ds, month) AS metric_time__month
FROM ***************************.mf_time_spine subq_12
GROUP BY
metric_time__day
, metric_time__month
) subq_11
FROM ***************************.mf_time_spine subq_12
INNER JOIN
***************************.fct_revenue revenue_src_28000
ON
(
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_11.metric_time__day
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_12.ds
) AND (
DATETIME_TRUNC(revenue_src_28000.created_at, day) >= DATETIME_TRUNC(subq_11.metric_time__day, month)
DATETIME_TRUNC(revenue_src_28000.created_at, day) >= DATETIME_TRUNC(subq_12.ds, month)
)
GROUP BY
metric_time__day
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,6 @@ FROM (
, DATETIME_TRUNC(subq_3.ds, year) AS revenue_instance__ds__year
, subq_3.ds AS metric_time__day
FROM ***************************.mf_time_spine subq_3
GROUP BY
revenue_instance__ds__quarter
, revenue_instance__ds__year
, metric_time__day
) subq_2
INNER JOIN (
-- Metric Time Dimension 'ds'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,29 +21,18 @@ FROM (
-- Aggregate Measures
-- Compute Metrics via Expressions
SELECT
subq_11.revenue_instance__ds__quarter AS revenue_instance__ds__quarter
, subq_11.revenue_instance__ds__year AS revenue_instance__ds__year
, subq_11.metric_time__day AS metric_time__day
DATETIME_TRUNC(subq_12.ds, quarter) AS revenue_instance__ds__quarter
, DATETIME_TRUNC(subq_12.ds, year) AS revenue_instance__ds__year
, subq_12.ds AS metric_time__day
, SUM(revenue_src_28000.revenue) AS revenue_mtd
FROM (
-- Time Spine
SELECT
DATETIME_TRUNC(ds, quarter) AS revenue_instance__ds__quarter
, DATETIME_TRUNC(ds, year) AS revenue_instance__ds__year
, ds AS metric_time__day
FROM ***************************.mf_time_spine subq_12
GROUP BY
revenue_instance__ds__quarter
, revenue_instance__ds__year
, metric_time__day
) subq_11
FROM ***************************.mf_time_spine subq_12
INNER JOIN
***************************.fct_revenue revenue_src_28000
ON
(
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_11.metric_time__day
DATETIME_TRUNC(revenue_src_28000.created_at, day) <= subq_12.ds
) AND (
DATETIME_TRUNC(revenue_src_28000.created_at, day) >= DATETIME_TRUNC(subq_11.metric_time__day, month)
DATETIME_TRUNC(revenue_src_28000.created_at, day) >= DATETIME_TRUNC(subq_12.ds, month)
)
GROUP BY
revenue_instance__ds__quarter
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,6 @@ FROM (
subq_3.ds AS metric_time__day
, DATETIME_TRUNC(subq_3.ds, year) AS metric_time__year
FROM ***************************.mf_time_spine subq_3
GROUP BY
metric_time__day
, metric_time__year
) subq_2
INNER JOIN (
-- Metric Time Dimension 'ds'
Expand Down
Loading

0 comments on commit 10138b0

Please sign in to comment.