From 98a08fd07155577e33a25a6da0b9527eb717b790 Mon Sep 17 00:00:00 2001
From: Michael Greenberg <michael.greenberg@materialize.com>
Date: Mon, 23 Dec 2024 15:24:03 -0500
Subject: [PATCH 1/5] explain LIR attribution

---
 .../content/transform-data/troubleshooting.md | 304 ++++++++++++++++++
 1 file changed, 304 insertions(+)

diff --git a/doc/user/content/transform-data/troubleshooting.md b/doc/user/content/transform-data/troubleshooting.md
index 9f697bf05b251..d14dbd505550b 100644
--- a/doc/user/content/transform-data/troubleshooting.md
+++ b/doc/user/content/transform-data/troubleshooting.md
@@ -277,6 +277,310 @@ If you've gone through the dataflow troubleshooting and do not want to make any
 changes to your query, consider [sizing up your cluster](/sql/create-cluster/#size).
 A larger size cluster will provision more memory and CPU resources.
 
+## Which part of my query runs slowly or uses a lot of memory?
+
+{{< public-preview />}}
+
+You can [`EXPLAIN`](/sql/explain-plan/) a query to see how it will be
+run as a dataflow. In particular, `EXPLAIN PHYSICAL PLAN` will show
+the concrete, fully optimized plan that Materialize will run.  (That
+plan is written in our "low-level intermediate representation", which
+we call _LIR_.)  If you have [created an index](/concepts/indexes) or
+[materialized view](/concepts/views#materialized-views), you can use
+[`mz_introspection.mz_lir_mapping`](/sql/system-catalog/mz_introspection/#mz_lir_mapping)
+to attribute various performance characteristics to the operators
+inside your query.
+
+Every time you create an index or materialized view, Materialize uses
+[`mz_introspection.mz_lir_mapping`](/sql/system-catalog/mz_introspection/#mz_lir_mapping)
+to map the higher-level LIR operators to zero or more lower-level
+dataflow operators. You can construct queries that will combine
+information from
+[`mz_introspection.mz_lir_mapping`](/sql/system-catalog/mz_introspection/#mz_lir_mapping)
+and other internal views to discover which parts of your query are
+computationally expensive (e.g.,
+[`mz_introspection.mz_compute_operator_durations_histogram`](/sql/system-catalog/mz_introspection/#mz_compute_operator_durations_histogram), [`mz_introspection.mz_scheduling_elapsed`](/sql/system-catalog/mz_introspection/#mz_scheduling_elapsed))
+or consuming excessive memory (e.g., ).
+
+We show how to use
+[`mz_introspection.mz_lir_mapping`](/sql/system-catalog/mz_introspection/#mz_lir_mapping)
+by example based on the [`Auction` load
+generator](/sql/create-source/load-generator/#auction) from the
+[quickstart guide](/get-started/quickstart/#step-2-create-the-source):
+
+```sql
+CREATE SOURCE auction_house
+FROM LOAD GENERATOR AUCTION
+(TICK INTERVAL '1s', AS OF 100000)
+FOR ALL TABLES;
+
+CREATE VIEW winning_bids AS
+  SELECT DISTINCT ON (a.id) b.*, a.item, a.seller
+    FROM auctions AS a
+    JOIN bids AS b
+      ON a.id = b.auction_id
+   WHERE b.bid_time < a.end_time
+     AND mz_now() >= a.end_time
+   ORDER BY a.id, b.amount DESC, b.bid_time, b.buyer;
+
+CREATE INDEX wins_by_item ON winning_bids (item);
+```
+
+We attribute four different kinds of performance data to parts of the
+`wins_by_item` query: computation time, memory usage, [top-k](/transform-data/idiomatic-materialize-sql/top-k/) query [sizing hints](/transform-data/optimization/#query-hints), and worker
+skew.
+
+### Attributing computation time
+
+One way to understand which parts of your query are 'expensive' is to
+look at the overall time spent computing in each part. Materialize
+reports the time spent in each _dataflow operator_ in
+[`mz_introspection.mz_compute_operator_durations_histogram`](/sql/system-catalog/mz_introspection/#mz_compute_operator_durations_histogram). By
+joining it with
+[`mz_introspection.mz_lir_mapping`](/sql/system-catalog/mz_introspection/#mz_lir_mapping),
+we can attribute the time spent in each operator to the higher-level,
+more intelligible LIR operators:
+
+```sql
+SELECT mo.name AS name, global_id, lir_id, parent_lir_id, REPEAT(' ', nesting * 2) || operator AS operator,
+       SUM(duration_ns)/1000 * '1 microsecond'::INTERVAL AS duration, SUM(count) AS count
+    FROM           mz_introspection.mz_lir_mapping mlm
+         LEFT JOIN mz_introspection.mz_compute_operator_durations_histogram mcodh
+                ON (mlm.operator_id_start <= mcodh.id AND mcodh.id < mlm.operator_id_end)
+              JOIN mz_catalog.mz_objects mo
+                ON (mlm.global_id = mo.id)
+GROUP BY mo.name, global_id, lir_id, operator, parent_lir_id, nesting
+ORDER BY global_id, lir_id DESC;
+```
+
+Running this query on an auction generator will produce results that look something like the following (though the specific numbers will vary, of course):
+
+
+| name         | global_id | lir_id | parent_lir_id | operator                         | duration        | count  |
+| ------------ | --------- | ------ | ------------- | -------------------------------- | --------------- | ------ |
+| winning_bids | u148      | 6      | null          | TopK::Basic 5                    | 00:00:00.74516  | 108650 |
+| winning_bids | u148      | 5      | 6             |   Join::Differential 2 » 4       | 00:00:00.017005 | 19099  |
+| winning_bids | u148      | 4      | 5             |     Arrange 3                    | 00:00:00.058835 | 11506  |
+| winning_bids | u148      | 3      | 4             |       Get::PassArrangements u145 | null            | null   |
+| winning_bids | u148      | 2      | 5             |     Arrange 1                    | 00:00:00.013885 | 7413   |
+| winning_bids | u148      | 1      | 2             |       Get::Collection u144       | null            | null   |
+| wins_by_item | u149      | 8      | null          | Arrange 7                        | 00:00:00.013887 | 9347   |
+| wins_by_item | u149      | 7      | 8             |   Get::PassArrangements u148     | null            | null   |
+
+The `duration` column lets us see that the `TopK` operator is where we
+spend the bulk of the query's computation time.
+
+The query works by finding every dataflow operator in the range
+(`mz_lir_mapping.operator_id_start` inclusive to
+`mz_lir_mapping.operator_id_end` exclusive) and summing up the time
+spent (`SUM(duration_ns)`). The query joins with
+[`mz_catalog.mz_objects`](/sql/system-catalog/mz_catalog/#mz_objects)
+to find the actual name corresponding to the `global_id`.
+
+Notice that the index is actually _two_ `global_id`s: `u148` is the
+`winning_bids` view, while `u149` is the `wins_by_item` index.
+
+The `operator` is indented using [`REPEAT`](/sql/functions/#repeat) and
+`mz_lir_mapping.nesting`). The indenting, combined with ordering by
+`mz_lir_mapping.lir_id` descending, gives us a tree-shaped view of the
+LIR plan.
+
+The LIR operators reported in `mz_lir_mapping.operator` are terser
+than those in `EXPLAIN PHYSICAL PLAN`. Each operator is restricted to
+a single line, of the form `OPERATOR [INPUT_LIR_ID ...]`. That is,
+`lir_id` 4 is the operator `Arrange 3`, the operator that will arrange
+in memory the results of reading dataflow `u145` (here, the `bids`
+table from the source).
+
+### Attributing memory usage
+
+(If you have not read about [attributing computation time](#attributing-computation-time), please do so first, as it explains some core concepts.)
+
+We can attribute memory usage to our query by joining
+[`mz_introspection.mz_lir_mapping`](/sql/system-catalog/mz_introspection/#mz_lir_mapping)
+with
+[`mz_introspection.mz_arrangement_sizes`](/sql/system-catalog/mz_introspection/#mz_arrangement_sizes):
+
+```sql
+  SELECT mo.name AS name, global_id, lir_id, parent_lir_id, REPEAT(' ', nesting * 2) || operator AS operator,
+         pg_size_pretty(SUM(size)) AS size
+    FROM           mz_introspection.mz_lir_mapping mlm
+         LEFT JOIN mz_introspection.mz_arrangement_sizes mas
+                ON (mlm.operator_id_start <= mas.operator_id AND mas.operator_id < mlm.operator_id_end)
+              JOIN mz_catalog.mz_objects mo
+                ON (mlm.global_id = mo.id)
+GROUP BY mo.name, global_id, lir_id, operator, parent_lir_id, nesting
+ORDER BY global_id, lir_id DESC;
+```
+
+| name         | global_id | lir_id | parent_lir_id | operator                         | size    |
+| ------------ | --------- | ------ | ------------- | -------------------------------- | ------- |
+| winning_bids | u148      | 6      | null          | TopK::Basic 5                    | 38 MB   |
+| winning_bids | u148      | 5      | 6             |   Join::Differential 2 » 4       | null    |
+| winning_bids | u148      | 4      | 5             |     Arrange 3                    | 2008 kB |
+| winning_bids | u148      | 3      | 4             |       Get::PassArrangements u145 | null    |
+| winning_bids | u148      | 2      | 5             |     Arrange 1                    | 900 kB  |
+| winning_bids | u148      | 1      | 2             |       Get::Collection u144       | null    |
+| wins_by_item | u149      | 8      | null          | Arrange 7                        | 707 kB  |
+| wins_by_item | u149      | 7      | 8             |   Get::PassArrangements u148     | null    |
+
+The `TopK` is overwhelmingly responsible for memory usage: arranging
+the `bids` table (`lir_id` 4) and `auctions` table (`lir_id` 2) are
+cheap in comparison, as is arranging the final output in the
+`wins_by_item` index (`lir_id` 8).
+
+### Attributing `TopK` hints
+
+The
+[`mz_introspection.mz_expected_group_size_advice`](/sql/system-catalog/mz_introspection/#mz_expected_group_size_advice)
+looks at your running dataflow and suggests parameters you can set. We
+can attribute this to particular parts of our query using
+[`mz_introspection.mz_lir_mapping`](/sql/system-catalog/mz_introspection/#mz_lir_mapping):
+
+```sql
+  SELECT mo.name AS name, mlm.global_id AS global_id, lir_id, parent_lir_id, REPEAT(' ', nesting * 2) || operator AS operator,
+         levels, to_cut, pg_size_pretty(savings), hint
+    FROM           mz_introspection.mz_lir_mapping mlm
+              JOIN mz_introspection.mz_dataflow_global_ids mdgi
+                ON (mlm.global_id = mdgi.global_id)
+         LEFT JOIN mz_introspection.mz_expected_group_size_advice megsa
+                ON (megsa.dataflow_id = mdgi.id AND
+                    mlm.operator_id_start <= megsa.region_id AND megsa.region_id < mlm.operator_id_end)
+              JOIN mz_catalog.mz_objects mo
+                ON (mlm.global_id = mo.id)
+ORDER BY mlm.global_id, lir_id DESC;
+```
+
+Each `TopK` operator will have associated hints:
+
+| name         | global_id | lir_id | parent_lir_id | operator                         | levels | to_cut | pg_size_pretty | hint  |
+| ------------ | --------- | ------ | ------------- | -------------------------------- | ------ | ------ | -------------- | ----- |
+| winning_bids | u148      | 6      | null          | TopK::Basic 5                    | 8      | 6      | 27 MB          | 255.0 |
+| winning_bids | u148      | 5      | 6             |   Join::Differential 2 » 4       | null   | null   | null           | null  |
+| winning_bids | u148      | 4      | 5             |     Arrange 3                    | null   | null   | null           | null  |
+| winning_bids | u148      | 3      | 4             |       Get::PassArrangements u145 | null   | null   | null           | null  |
+| winning_bids | u148      | 2      | 5             |     Arrange 1                    | null   | null   | null           | null  |
+| winning_bids | u148      | 1      | 2             |       Get::Collection u144       | null   | null   | null           | null  |
+| wins_by_item | u149      | 8      | null          | Arrange 7                        | null   | null   | null           | null  |
+| wins_by_item | u149      | 7      | 8             |   Get::PassArrangements u148     | null   | null   | null           | null  |
+
+Here, the hinted `DISTINCT ON INPUT GROUP SIZE` is `255.0`. We can re-create our view and index using the hint as follows:
+
+```sql
+DROP VIEW winning_bids CASCADE;
+
+CREATE VIEW winning_bids AS
+SELECT DISTINCT ON (a.id) b.*, a.item, a.seller
+ FROM auctions AS a
+ JOIN bids AS b
+   ON a.id = b.auction_id
+WHERE b.bid_time < a.end_time
+  AND mz_now() >= a.end_time
+OPTIONS (DISTINCT ON INPUT GROUP SIZE = 255) -- use hint!
+ORDER BY a.id,
+  b.amount DESC,
+  b.bid_time,
+  b.buyer;
+
+CREATE INDEX wins_by_item ON winning_bids (item);
+```
+
+Re-running the `TopK`-hints query will show only `null` hints---there
+are no hints because our `TopK` is now appropriately sized. But if we
+re-run our [query for attributing memory
+usage](#attributing-memory-usage), we can see that our `TopK` operator
+uses a third of the memory it was using before:
+
+| name         | global_id | lir_id | parent_lir_id | operator                         | size    |
+| ------------ | --------- | ------ | ------------- | -------------------------------- | ------- |
+| winning_bids | u150      | 6      | null          | TopK::Basic 5                    | 11 MB   |
+| winning_bids | u150      | 5      | 6             |   Join::Differential 2 » 4       | null    |
+| winning_bids | u150      | 4      | 5             |     Arrange 3                    | 1996 kB |
+| winning_bids | u150      | 3      | 4             |       Get::PassArrangements u145 | null    |
+| winning_bids | u150      | 2      | 5             |     Arrange 1                    | 575 kB  |
+| winning_bids | u150      | 1      | 2             |       Get::Collection u144       | null    |
+| wins_by_item | u151      | 8      | null          | Arrange 7                        | 402 kB  |
+| wins_by_item | u151      | 7      | 8             |   Get::PassArrangements u150     | null    |
+
+### Localizing worker skew
+
+[Worker skew](/transform-data/dataflow-troubleshooting/#is-work-distributed-equally-across-workers) occurs when your data do not end up getting evenly
+partitioned between workers.  Worker skew can only happen when your
+cluster has more than one worker. (You can query
+[`mz_catalog.mz_cluster_replica_sizes`](/sql/system-catalog/mz_catalog/#mz_cluster_replica_sizes)
+to determine how many workers a given cluster size has.)
+
+You can identify worker skew by comparing a worker's time spent to the
+overall time spent across all workers:
+
+```sql
+              SELECT mo.name AS name, global_id, lir_id, REPEAT(' ', 2 * nesting) || operator as operator,
+                     worker_id,
+                     SUM(elapsed_ns) / 1000 * '1 microsecond'::INTERVAL AS elapsed_ns,
+                     SUM(aebi.total_ns) / 1000 * '1 microsecond'::INTERVAL  as avg_ns,
+                     SUM(elapsed_ns) / SUM(aebi.total_ns) as ratio
+                FROM mz_introspection.mz_lir_mapping mlm
+  CROSS JOIN LATERAL (  SELECT SUM(elapsed_ns) AS total_ns
+                          FROM mz_introspection.mz_scheduling_elapsed_per_worker mse
+                         WHERE mlm.operator_id_start <= id AND id < mlm.operator_id_end
+                      GROUP BY worker_id) aebi
+  CROSS JOIN LATERAL (  SELECT worker_id, SUM(elapsed_ns) as elapsed_ns
+                          FROM mz_introspection.mz_scheduling_elapsed_per_worker mse
+                         WHERE mlm.operator_id_start <= id AND id < mlm.operator_id_end
+                      GROUP BY worker_id) epw
+                JOIN mz_catalog.mz_objects mo
+                  ON (mlm.global_id = mo.id)
+            GROUP BY mo.name, global_id, lir_id, nesting, operator, worker_id
+            ORDER BY global_id, lir_id DESC;
+```
+
+| name         | global_id | lir_id | operator                   | worker_id | elapsed_ns      | avg_ns          | ratio                                     |
+| ------------ | --------- | ------ | -------------------------- | --------- | --------------- | --------------- | ----------------------------------------- |
+| winning_bids | u150      | 6      | TopK::Basic 5              | 0         | 00:00:02.359193 | 00:00:02.338105 | 1.00901943619301904208932433787196422606  |
+| winning_bids | u150      | 6      | TopK::Basic 5              | 1         | 00:00:02.342375 | 00:00:02.338105 | 1.0018262379840754650921842785877521019   |
+| winning_bids | u150      | 6      | TopK::Basic 5              | 2         | 00:00:02.314143 | 00:00:02.338105 | 0.989751590156994710647908375633282330915 |
+| winning_bids | u150      | 6      | TopK::Basic 5              | 3         | 00:00:02.336708 | 00:00:02.338105 | 0.999402735665910782170583007907001341123 |
+| winning_bids | u150      | 5      |   Join::Differential 2 » 4 | 0         | 00:00:00.557381 | 00:00:00.548652 | 1.01590883682808615066920883751285427977  |
+| winning_bids | u150      | 5      |   Join::Differential 2 » 4 | 1         | 00:00:00.542891 | 00:00:00.548652 | 0.98949878273761917487532841143193434597  |
+| winning_bids | u150      | 5      |   Join::Differential 2 » 4 | 2         | 00:00:00.546565 | 00:00:00.548652 | 0.99619568016440763325559996932924096555  |
+| winning_bids | u150      | 5      |   Join::Differential 2 » 4 | 3         | 00:00:00.547773 | 00:00:00.548652 | 0.99839670026988704119986278172597040871  |
+| winning_bids | u150      | 4      |     Arrange 3              | 0         | 00:00:00.314206 | 00:00:00.338216 | 0.929009114588746221704744632262170530522 |
+| winning_bids | u150      | 4      |     Arrange 3              | 1         | 00:00:00.306153 | 00:00:00.338216 | 0.905199632876776517170415418238708660282 |
+| winning_bids | u150      | 4      |     Arrange 3              | 2         | 00:00:00.432691 | 00:00:00.338216 | 1.27933123914530934762928875720815264567  |
+| winning_bids | u150      | 4      |     Arrange 3              | 3         | 00:00:00.299815 | 00:00:00.338216 | 0.886460013389167913495551192290968163528 |
+| winning_bids | u150      | 2      |     Arrange 1              | 0         | 00:00:00.258171 | 00:00:00.244214 | 1.05715143156633465139502061854023757049  |
+| winning_bids | u150      | 2      |     Arrange 1              | 1         | 00:00:00.234322 | 00:00:00.244214 | 0.959493923931760528554652347461354389562 |
+| winning_bids | u150      | 2      |     Arrange 1              | 2         | 00:00:00.230453 | 00:00:00.244214 | 0.943651272055032566225444196288453924574 |
+| winning_bids | u150      | 2      |     Arrange 1              | 3         | 00:00:00.25391  | 00:00:00.244214 | 1.03970337244687225382488283770995411537  |
+| wins_by_item | u151      | 8      | Arrange 7                  | 0         | 00:00:00.202201 | 00:00:00.204736 | 0.987618403431703110437750824708179947656 |
+| wins_by_item | u151      | 8      | Arrange 7                  | 1         | 00:00:00.216731 | 00:00:00.204736 | 1.05858895341593917863900275768799815376  |
+| wins_by_item | u151      | 8      | Arrange 7                  | 2         | 00:00:00.220717 | 00:00:00.204736 | 1.07805793303181678638721908696246766799  |
+| wins_by_item | u151      | 8      | Arrange 7                  | 3         | 00:00:00.179294 | 00:00:00.204736 | 0.875734710120540924536027330641354230589 |
+
+The `ratio` column tells you whether a worker is particularly over- or under-loaded.
+
+### Writing your own attribution queries
+
+Since Materialize maps LIR nodes to ranges of dataflow operators in
+[`mz_introspection.mz_lir_mapping`](/sql/system-catalog/mz_introspection/#mz_lir_mapping),
+you can freely combine information from
+[`mz_catalog`](/sql/system-catalog/mz_catalog/) and
+[`mz_introspection`](/sql/system-catalog/mz_introspection/) to
+understand your dataflows' behavior. We offer the queries above as a
+starting point. When building your own, keep the following in mind.
+
+  1. `mz_lir_mapping.operator` is not stable and **should not be
+     parsed**. If you want to traverse the LIR tree, use
+     `mz_lir_mapping.parent_lir_id`.
+  2. Make sure you `GROUP BY global_id`---mixing `lir_id`s from
+     different `global_id`s will produce nonsense.
+  3. Use `REPEAT(' ', 2 * nesting) || operator` and `ORDER BY lir_id
+     DESC` to correctly render the LIR tree.
+  4. `mz_lir_mapping.operator_id_start` is inclusive;
+     `mz_lir_mapping.operator_id_end` is exclusive. If they are equal
+     to each other, that LIR operator does not correspond to any
+     dataflow operators.
+
 ## How do I troubleshoot slow queries?
 
 Materialize stores a (sampled) log of the SQL statements that are issued against

From e6b4f117f5a9b6d57be155b7aead6262fbafde5f Mon Sep 17 00:00:00 2001
From: Michael Greenberg <michael.greenberg@materialize.com>
Date: Thu, 26 Dec 2024 09:47:02 -0500
Subject: [PATCH 2/5] kay's ce

Co-authored-by: Kay Kim <kaykim00@gmail.com>
---
 doc/user/content/transform-data/troubleshooting.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/user/content/transform-data/troubleshooting.md b/doc/user/content/transform-data/troubleshooting.md
index d14dbd505550b..175cb532a976b 100644
--- a/doc/user/content/transform-data/troubleshooting.md
+++ b/doc/user/content/transform-data/troubleshooting.md
@@ -381,7 +381,7 @@ Notice that the index is actually _two_ `global_id`s: `u148` is the
 `winning_bids` view, while `u149` is the `wins_by_item` index.
 
 The `operator` is indented using [`REPEAT`](/sql/functions/#repeat) and
-`mz_lir_mapping.nesting`). The indenting, combined with ordering by
+`mz_lir_mapping.nesting`. The indenting, combined with ordering by
 `mz_lir_mapping.lir_id` descending, gives us a tree-shaped view of the
 LIR plan.
 

From 31b6e6326cb5a78663d91ffbd5e9e0c4b9bf9aa0 Mon Sep 17 00:00:00 2001
From: Michael Greenberg <michael.greenberg@materialize.com>
Date: Thu, 26 Dec 2024 09:47:28 -0500
Subject: [PATCH 3/5] kay's fancy tip template

Co-authored-by: Kay Kim <kaykim00@gmail.com>
---
 doc/user/content/transform-data/troubleshooting.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/doc/user/content/transform-data/troubleshooting.md b/doc/user/content/transform-data/troubleshooting.md
index 175cb532a976b..c5eb27980a7fe 100644
--- a/doc/user/content/transform-data/troubleshooting.md
+++ b/doc/user/content/transform-data/troubleshooting.md
@@ -394,7 +394,11 @@ table from the source).
 
 ### Attributing memory usage
 
-(If you have not read about [attributing computation time](#attributing-computation-time), please do so first, as it explains some core concepts.)
+{{< tip >}}
+
+If you have not read about [attributing computation time](#attributing-computation-time), please do so first, as it explains some core concepts.
+
+{{< /tip >}}
 
 We can attribute memory usage to our query by joining
 [`mz_introspection.mz_lir_mapping`](/sql/system-catalog/mz_introspection/#mz_lir_mapping)

From b9ed4cb90773566586eebeaf1cec2763cd6fc351 Mon Sep 17 00:00:00 2001
From: Michael Greenberg <michael.greenberg@materialize.com>
Date: Thu, 26 Dec 2024 10:02:57 -0500
Subject: [PATCH 4/5] add WHERE clause, some clarifications; thanks @kay-kim!

---
 .../content/transform-data/troubleshooting.md | 55 ++++++++++++-------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/doc/user/content/transform-data/troubleshooting.md b/doc/user/content/transform-data/troubleshooting.md
index c5eb27980a7fe..9917543a452b8 100644
--- a/doc/user/content/transform-data/troubleshooting.md
+++ b/doc/user/content/transform-data/troubleshooting.md
@@ -349,6 +349,7 @@ SELECT mo.name AS name, global_id, lir_id, parent_lir_id, REPEAT(' ', nesting *
                 ON (mlm.operator_id_start <= mcodh.id AND mcodh.id < mlm.operator_id_end)
               JOIN mz_catalog.mz_objects mo
                 ON (mlm.global_id = mo.id)
+   WHERE mo.name IN ('wins_by_item', 'winning_bids')
 GROUP BY mo.name, global_id, lir_id, operator, parent_lir_id, nesting
 ORDER BY global_id, lir_id DESC;
 ```
@@ -367,18 +368,22 @@ Running this query on an auction generator will produce results that look someth
 | wins_by_item | u149      | 8      | null          | Arrange 7                        | 00:00:00.013887 | 9347   |
 | wins_by_item | u149      | 7      | 8             |   Get::PassArrangements u148     | null            | null   |
 
-The `duration` column lets us see that the `TopK` operator is where we
-spend the bulk of the query's computation time.
+The `duration` column shows that the `TopK` operator is where we
+spend the bulk of the query's computation time.  Notice that creating
+our index installs _two_ `global_id`s: `u148` is the dataflow for the
+`winning_bids` view uitself, while `u149` is the dataflow for the
+`wins_by_item` index on `winning_bids`.
 
 The query works by finding every dataflow operator in the range
 (`mz_lir_mapping.operator_id_start` inclusive to
 `mz_lir_mapping.operator_id_end` exclusive) and summing up the time
 spent (`SUM(duration_ns)`). The query joins with
 [`mz_catalog.mz_objects`](/sql/system-catalog/mz_catalog/#mz_objects)
-to find the actual name corresponding to the `global_id`.
-
-Notice that the index is actually _two_ `global_id`s: `u148` is the
-`winning_bids` view, while `u149` is the `wins_by_item` index.
+to find the actual name corresponding to the `global_id`.  The `WHERE
+mo.name IN ...`  clause of the query ensures we only see information
+about this index and view. If you leave this `WHERE` clause out, you
+will see information on _every_ view, materialized view, and index on
+your current cluster.
 
 The `operator` is indented using [`REPEAT`](/sql/functions/#repeat) and
 `mz_lir_mapping.nesting`. The indenting, combined with ordering by
@@ -413,6 +418,7 @@ with
                 ON (mlm.operator_id_start <= mas.operator_id AND mas.operator_id < mlm.operator_id_end)
               JOIN mz_catalog.mz_objects mo
                 ON (mlm.global_id = mo.id)
+   WHERE mo.name IN ('wins_by_item', 'winning_bids')
 GROUP BY mo.name, global_id, lir_id, operator, parent_lir_id, nesting
 ORDER BY global_id, lir_id DESC;
 ```
@@ -452,6 +458,7 @@ can attribute this to particular parts of our query using
                     mlm.operator_id_start <= megsa.region_id AND megsa.region_id < mlm.operator_id_end)
               JOIN mz_catalog.mz_objects mo
                 ON (mlm.global_id = mo.id)
+   WHERE mo.name IN ('wins_by_item', 'winning_bids')
 ORDER BY mlm.global_id, lir_id DESC;
 ```
 
@@ -518,24 +525,25 @@ You can identify worker skew by comparing a worker's time spent to the
 overall time spent across all workers:
 
 ```sql
-              SELECT mo.name AS name, global_id, lir_id, REPEAT(' ', 2 * nesting) || operator as operator,
+ SELECT mo.name AS name, global_id, lir_id, REPEAT(' ', 2 * nesting) || operator as operator,
                      worker_id,
                      SUM(elapsed_ns) / 1000 * '1 microsecond'::INTERVAL AS elapsed_ns,
                      SUM(aebi.total_ns) / 1000 * '1 microsecond'::INTERVAL  as avg_ns,
                      SUM(elapsed_ns) / SUM(aebi.total_ns) as ratio
-                FROM mz_introspection.mz_lir_mapping mlm
-  CROSS JOIN LATERAL (  SELECT SUM(elapsed_ns) AS total_ns
-                          FROM mz_introspection.mz_scheduling_elapsed_per_worker mse
-                         WHERE mlm.operator_id_start <= id AND id < mlm.operator_id_end
-                      GROUP BY worker_id) aebi
-  CROSS JOIN LATERAL (  SELECT worker_id, SUM(elapsed_ns) as elapsed_ns
-                          FROM mz_introspection.mz_scheduling_elapsed_per_worker mse
-                         WHERE mlm.operator_id_start <= id AND id < mlm.operator_id_end
-                      GROUP BY worker_id) epw
-                JOIN mz_catalog.mz_objects mo
-                  ON (mlm.global_id = mo.id)
-            GROUP BY mo.name, global_id, lir_id, nesting, operator, worker_id
-            ORDER BY global_id, lir_id DESC;
+   FROM                    mz_introspection.mz_lir_mapping mlm
+        CROSS JOIN LATERAL (  SELECT SUM(elapsed_ns) AS total_ns
+                                FROM mz_introspection.mz_scheduling_elapsed_per_worker mse
+                               WHERE mlm.operator_id_start <= id AND id < mlm.operator_id_end
+                            GROUP BY worker_id) aebi
+        CROSS JOIN LATERAL (  SELECT worker_id, SUM(elapsed_ns) as elapsed_ns
+                                FROM mz_introspection.mz_scheduling_elapsed_per_worker mse
+                               WHERE mlm.operator_id_start <= id AND id < mlm.operator_id_end
+                            GROUP BY worker_id) epw
+                      JOIN mz_catalog.mz_objects mo
+                        ON (mlm.global_id = mo.id)
+   WHERE mo.name IN ('wins_by_item', 'winning_bids')
+GROUP BY mo.name, global_id, lir_id, nesting, operator, worker_id
+ORDER BY global_id, lir_id DESC;
 ```
 
 | name         | global_id | lir_id | operator                   | worker_id | elapsed_ns      | avg_ns          | ratio                                     |
@@ -575,7 +583,8 @@ starting point. When building your own, keep the following in mind.
 
   1. `mz_lir_mapping.operator` is not stable and **should not be
      parsed**. If you want to traverse the LIR tree, use
-     `mz_lir_mapping.parent_lir_id`.
+     `mz_lir_mapping.parent_lir_id`. If there are metadata that would
+     be useful for us to provide that we do not, please let us know.
   2. Make sure you `GROUP BY global_id`---mixing `lir_id`s from
      different `global_id`s will produce nonsense.
   3. Use `REPEAT(' ', 2 * nesting) || operator` and `ORDER BY lir_id
@@ -584,6 +593,10 @@ starting point. When building your own, keep the following in mind.
      `mz_lir_mapping.operator_id_end` is exclusive. If they are equal
      to each other, that LIR operator does not correspond to any
      dataflow operators.
+  5. Join with `mz_catalog.mz_objects` and restrict based on
+     `mz_objects.name` to only see output for views, materialized
+     views, and indexes you're interested in. Otherwise you will see
+     information on everything installed in your current cluster.
 
 ## How do I troubleshoot slow queries?
 

From 169cd80dc472b73351bdad9508bc1b2702a448d5 Mon Sep 17 00:00:00 2001
From: Michael Greenberg <michael.greenberg@materialize.com>
Date: Thu, 26 Dec 2024 15:42:59 -0500
Subject: [PATCH 5/5] improve worker skew query, clarify

---
 .../content/transform-data/troubleshooting.md | 70 ++++++++++++-------
 1 file changed, 44 insertions(+), 26 deletions(-)

diff --git a/doc/user/content/transform-data/troubleshooting.md b/doc/user/content/transform-data/troubleshooting.md
index 9917543a452b8..66fb9c6739c29 100644
--- a/doc/user/content/transform-data/troubleshooting.md
+++ b/doc/user/content/transform-data/troubleshooting.md
@@ -441,6 +441,12 @@ cheap in comparison, as is arranging the final output in the
 
 ### Attributing `TopK` hints
 
+{{< tip >}}
+
+If you have not read about [attributing computation time](#attributing-computation-time), please do so first, as it explains some core concepts.
+
+{{< /tip >}}
+
 The
 [`mz_introspection.mz_expected_group_size_advice`](/sql/system-catalog/mz_introspection/#mz_expected_group_size_advice)
 looks at your running dataflow and suggests parameters you can set. We
@@ -515,6 +521,13 @@ uses a third of the memory it was using before:
 
 ### Localizing worker skew
 
+{{< tip >}}
+
+If you have not read about [attributing computation time](#attributing-computation-time), please do so first, as it explains some core concepts.
+
+{{< /tip >}}
+
+
 [Worker skew](/transform-data/dataflow-troubleshooting/#is-work-distributed-equally-across-workers) occurs when your data do not end up getting evenly
 partitioned between workers.  Worker skew can only happen when your
 cluster has more than one worker. (You can query
@@ -527,9 +540,10 @@ overall time spent across all workers:
 ```sql
  SELECT mo.name AS name, global_id, lir_id, REPEAT(' ', 2 * nesting) || operator as operator,
                      worker_id,
+                     ROUND(SUM(elapsed_ns) / SUM(aebi.total_ns), 2) as ratio,
                      SUM(elapsed_ns) / 1000 * '1 microsecond'::INTERVAL AS elapsed_ns,
-                     SUM(aebi.total_ns) / 1000 * '1 microsecond'::INTERVAL  as avg_ns,
-                     SUM(elapsed_ns) / SUM(aebi.total_ns) as ratio
+                     SUM(aebi.total_ns) / 1000 * '1 microsecond'::INTERVAL  as avg_ns
+
    FROM                    mz_introspection.mz_lir_mapping mlm
         CROSS JOIN LATERAL (  SELECT SUM(elapsed_ns) AS total_ns
                                 FROM mz_introspection.mz_scheduling_elapsed_per_worker mse
@@ -546,30 +560,34 @@ GROUP BY mo.name, global_id, lir_id, nesting, operator, worker_id
 ORDER BY global_id, lir_id DESC;
 ```
 
-| name         | global_id | lir_id | operator                   | worker_id | elapsed_ns      | avg_ns          | ratio                                     |
-| ------------ | --------- | ------ | -------------------------- | --------- | --------------- | --------------- | ----------------------------------------- |
-| winning_bids | u150      | 6      | TopK::Basic 5              | 0         | 00:00:02.359193 | 00:00:02.338105 | 1.00901943619301904208932433787196422606  |
-| winning_bids | u150      | 6      | TopK::Basic 5              | 1         | 00:00:02.342375 | 00:00:02.338105 | 1.0018262379840754650921842785877521019   |
-| winning_bids | u150      | 6      | TopK::Basic 5              | 2         | 00:00:02.314143 | 00:00:02.338105 | 0.989751590156994710647908375633282330915 |
-| winning_bids | u150      | 6      | TopK::Basic 5              | 3         | 00:00:02.336708 | 00:00:02.338105 | 0.999402735665910782170583007907001341123 |
-| winning_bids | u150      | 5      |   Join::Differential 2 » 4 | 0         | 00:00:00.557381 | 00:00:00.548652 | 1.01590883682808615066920883751285427977  |
-| winning_bids | u150      | 5      |   Join::Differential 2 » 4 | 1         | 00:00:00.542891 | 00:00:00.548652 | 0.98949878273761917487532841143193434597  |
-| winning_bids | u150      | 5      |   Join::Differential 2 » 4 | 2         | 00:00:00.546565 | 00:00:00.548652 | 0.99619568016440763325559996932924096555  |
-| winning_bids | u150      | 5      |   Join::Differential 2 » 4 | 3         | 00:00:00.547773 | 00:00:00.548652 | 0.99839670026988704119986278172597040871  |
-| winning_bids | u150      | 4      |     Arrange 3              | 0         | 00:00:00.314206 | 00:00:00.338216 | 0.929009114588746221704744632262170530522 |
-| winning_bids | u150      | 4      |     Arrange 3              | 1         | 00:00:00.306153 | 00:00:00.338216 | 0.905199632876776517170415418238708660282 |
-| winning_bids | u150      | 4      |     Arrange 3              | 2         | 00:00:00.432691 | 00:00:00.338216 | 1.27933123914530934762928875720815264567  |
-| winning_bids | u150      | 4      |     Arrange 3              | 3         | 00:00:00.299815 | 00:00:00.338216 | 0.886460013389167913495551192290968163528 |
-| winning_bids | u150      | 2      |     Arrange 1              | 0         | 00:00:00.258171 | 00:00:00.244214 | 1.05715143156633465139502061854023757049  |
-| winning_bids | u150      | 2      |     Arrange 1              | 1         | 00:00:00.234322 | 00:00:00.244214 | 0.959493923931760528554652347461354389562 |
-| winning_bids | u150      | 2      |     Arrange 1              | 2         | 00:00:00.230453 | 00:00:00.244214 | 0.943651272055032566225444196288453924574 |
-| winning_bids | u150      | 2      |     Arrange 1              | 3         | 00:00:00.25391  | 00:00:00.244214 | 1.03970337244687225382488283770995411537  |
-| wins_by_item | u151      | 8      | Arrange 7                  | 0         | 00:00:00.202201 | 00:00:00.204736 | 0.987618403431703110437750824708179947656 |
-| wins_by_item | u151      | 8      | Arrange 7                  | 1         | 00:00:00.216731 | 00:00:00.204736 | 1.05858895341593917863900275768799815376  |
-| wins_by_item | u151      | 8      | Arrange 7                  | 2         | 00:00:00.220717 | 00:00:00.204736 | 1.07805793303181678638721908696246766799  |
-| wins_by_item | u151      | 8      | Arrange 7                  | 3         | 00:00:00.179294 | 00:00:00.204736 | 0.875734710120540924536027330641354230589 |
-
-The `ratio` column tells you whether a worker is particularly over- or under-loaded.
+| name         | global_id | lir_id | operator                   | worker_id | ratio | elapsed_ns      | avg_ns          |
+| ------------ | --------- | ------ | -------------------------- | --------- | ----- | --------------- | --------------- |
+| winning_bids | u186      | 6      | TopK::Basic 5              | 0         | 1     | 00:00:03.172611 | 00:00:03.177245 |
+| winning_bids | u186      | 6      | TopK::Basic 5              | 1         | 1     | 00:00:03.175515 | 00:00:03.177245 |
+| winning_bids | u186      | 6      | TopK::Basic 5              | 2         | 1     | 00:00:03.174291 | 00:00:03.177245 |
+| winning_bids | u186      | 6      | TopK::Basic 5              | 3         | 1     | 00:00:03.186564 | 00:00:03.177245 |
+| winning_bids | u186      | 5      |   Join::Differential 2 » 4 | 0         | 0.97  | 00:00:00.157787 | 00:00:00.162148 |
+| winning_bids | u186      | 5      |   Join::Differential 2 » 4 | 1         | 1.05  | 00:00:00.170231 | 00:00:00.162148 |
+| winning_bids | u186      | 5      |   Join::Differential 2 » 4 | 2         | 1     | 00:00:00.162352 | 00:00:00.162148 |
+| winning_bids | u186      | 5      |   Join::Differential 2 » 4 | 3         | 0.98  | 00:00:00.158224 | 00:00:00.162148 |
+| winning_bids | u186      | 4      |     Arrange 3              | 0         | 0.67  | 00:00:00.059754 | 00:00:00.088972 |
+| winning_bids | u186      | 4      |     Arrange 3              | 1         | 0.64  | 00:00:00.057283 | 00:00:00.088972 |
+| winning_bids | u186      | 4      |     Arrange 3              | 2         | 2.02  | 00:00:00.179739 | 00:00:00.088972 |
+| winning_bids | u186      | 4      |     Arrange 3              | 3         | 0.66  | 00:00:00.059112 | 00:00:00.088972 |
+| winning_bids | u186      | 2      |     Arrange 1              | 0         | 0.82  | 00:00:00.023081 | 00:00:00.028271 |
+| winning_bids | u186      | 2      |     Arrange 1              | 1         | 1.61  | 00:00:00.045394 | 00:00:00.028271 |
+| winning_bids | u186      | 2      |     Arrange 1              | 2         | 0.77  | 00:00:00.021894 | 00:00:00.028271 |
+| winning_bids | u186      | 2      |     Arrange 1              | 3         | 0.8   | 00:00:00.022717 | 00:00:00.028271 |
+| wins_by_item | u187      | 8      | Arrange 7                  | 0         | 0.85  | 00:00:00.02085  | 00:00:00.024526 |
+| wins_by_item | u187      | 8      | Arrange 7                  | 1         | 1.27  | 00:00:00.031028 | 00:00:00.024526 |
+| wins_by_item | u187      | 8      | Arrange 7                  | 2         | 1.44  | 00:00:00.035279 | 00:00:00.024526 |
+| wins_by_item | u187      | 8      | Arrange 7                  | 3         | 0.45  | 00:00:00.010946 | 00:00:00.024526 |
+
+The `ratio` column tells you whether a worker is particularly over- or
+under-loaded: a `ratio` below 1 indicates a worker doing a below
+average amount of work, while a `ratio` above 1 indicates a worker
+doing an above average amount of work. There will always be some
+amount of variation, but very high ratios indicate a skewed workload.
 
 ### Writing your own attribution queries