From cb7a365d8641f13e0483dcd966a50fe3c428104d Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@grinaker.org>
Date: Sun, 14 Jul 2024 17:27:59 +0200
Subject: [PATCH] wip sql: eliminate noop projections in short_circuit
 optimizer

---
 src/sql/execution/execute.rs          |  4 +-
 src/sql/planner/optimizer.rs          | 20 ++++--
 src/sql/planner/plan.rs               | 30 ++++++++-
 src/sql/testscripts/queries/aggregate | 88 +++++++++++----------------
 src/sql/testscripts/queries/group_by  | 59 ++++++++----------
 src/sql/testscripts/queries/having    | 15 ++---
 src/sql/testscripts/queries/order     |  5 +-
 7 files changed, 111 insertions(+), 110 deletions(-)
diff --git a/src/sql/execution/execute.rs b/src/sql/execution/execute.rs
index d52509821..e17899c44 100644
--- a/src/sql/execution/execute.rs
+++ b/src/sql/execution/execute.rs
@@ -86,10 +86,12 @@ pub fn execute(node: Node, txn: &impl Transaction) -> Result<Rows> {
         }
 
         Node::IndexLookup { table, column, values, alias: _ } => {
+            let column = table.columns.into_iter().nth(column).expect("invalid column").name;
+            let table = table.name;
             source::lookup_index(txn, table, column, values)
         }
 
-        Node::KeyLookup { table, keys, alias: _ } => source::lookup_key(txn, table, keys),
+        Node::KeyLookup { table, keys, alias: _ } => source::lookup_key(txn, table.name, keys),
 
         Node::Limit { source, limit } => {
             let source = execute(*source, txn)?;
diff --git a/src/sql/planner/optimizer.rs b/src/sql/planner/optimizer.rs
index 87a802a9d..8b699d21d 100644
--- a/src/sql/planner/optimizer.rs
+++ b/src/sql/planner/optimizer.rs
@@ -238,14 +238,13 @@ pub(super) fn index_lookup(node: Node) -> Result<Node> {
         };
 
         // Extract the lookup values and expression from the cnf vector.
-        let (field, values) = cnf.remove(i).into_field_values().expect("field lookup failed");
+        let (column, values) = cnf.remove(i).into_field_values().expect("field lookup failed");
 
         // Build the primary key or secondary index lookup node.
-        if field == table.primary_key {
-            node = Node::KeyLookup { table: table.name, keys: values, alias };
+        if column == table.primary_key {
+            node = Node::KeyLookup { table, keys: values, alias };
         } else {
-            let column = table.columns.into_iter().nth(field).unwrap().name;
-            node = Node::IndexLookup { table: table.name, column, values, alias };
+            node = Node::IndexLookup { table, column, values, alias };
         }
 
         // If there's any remaining CNF expressions add a filter node for them.
@@ -356,6 +355,17 @@ pub(super) fn short_circuit(node: Node) -> Result<Node> {
             Node::Nothing
         }
 
+        // Remove projections that simply pass through the original columns.
+        Node::Projection { source, expressions, .. }
+            if source.size() == expressions.len()
+                && expressions
+                    .iter()
+                    .enumerate()
+                    .all(|(i, e)| matches!(e, Expression::Field(f, _) if i == *f)) =>
+        {
+            *source
+        }
+
         node => node,
     };
 
diff --git a/src/sql/planner/plan.rs b/src/sql/planner/plan.rs
index 3331e874b..a35a11af6 100644
--- a/src/sql/planner/plan.rs
+++ b/src/sql/planner/plan.rs
@@ -109,9 +109,9 @@ pub enum Node {
     /// Looks up the given values in a secondary index and emits matching rows.
     /// NULL and NaN values are considered equal, to allow IS NULL and IS NAN
     /// index lookups, as is -0.0 and 0.0.
-    IndexLookup { table: String, column: String, values: Vec<Value>, alias: Option<String> },
+    IndexLookup { table: Table, column: usize, values: Vec<Value>, alias: Option<String> },
     /// Looks up the given primary keys and emits their rows.
-    KeyLookup { table: String, keys: Vec<Value>, alias: Option<String> },
+    KeyLookup { table: Table, keys: Vec<Value>, alias: Option<String> },
     /// Only emits the first limit rows from the source, discards the rest.
     Limit { source: Box<Node>, limit: usize },
     /// Joins the left and right sources on the given predicate by buffering the
@@ -278,6 +278,28 @@ impl Node {
             | Self::Scan { filter: None, .. }) => node,
         })
     }
+
+    /// Returns the node's column width, i.e. how many columns it has.
+    /// TODO: this can replace various size parameters, e.g. left_size and
+    /// right_size in NestedLoopJoin. Possibly also Scope.len().
+    pub fn size(&self) -> usize {
+        match self {
+            Self::Aggregate { aggregates, group_by, .. } => group_by.len() + aggregates.len(),
+            Self::Filter { source, .. }
+            | Self::Limit { source, .. }
+            | Self::Offset { source, .. }
+            | Self::Order { source, .. } => source.size(),
+            Self::HashJoin { left, right, .. } | Self::NestedLoopJoin { left, right, .. } => {
+                left.size() + right.size()
+            }
+            Self::IndexLookup { table, .. }
+            | Self::KeyLookup { table, .. }
+            | Self::Scan { table, .. } => table.columns.len(),
+            Self::Nothing | Self::EmptyRow => 0, // TODO: EmptyRow is incorrect, will be removed
+            Self::Projection { expressions, .. } => expressions.len(),
+            Self::Values { rows } => rows.first().map_or(0, |row| row.len()),
+        }
+    }
 }
 
 /// Formats the plan as an EXPLAIN tree.
@@ -375,6 +397,8 @@ impl Node {
                 right.format(f, prefix, false, true)?;
             }
             Self::IndexLookup { table, column, alias, values } => {
+                let column = &table.columns[*column].name;
+                let table = &table.name;
                 write!(f, "IndexLookup: {table}.{column}")?;
                 if let Some(alias) = alias {
                     write!(f, " as {alias}.{column}")?;
@@ -386,7 +410,7 @@ impl Node {
                 }
             }
             Self::KeyLookup { table, alias, keys } => {
-                write!(f, "KeyLookup: {table}")?;
+                write!(f, "KeyLookup: {}", table.name)?;
                 if let Some(alias) = alias {
                     write!(f, " as {alias}")?;
                 }
diff --git a/src/sql/testscripts/queries/aggregate b/src/sql/testscripts/queries/aggregate
index 350406372..4dc8f4509 100644
--- a/src/sql/testscripts/queries/aggregate
+++ b/src/sql/testscripts/queries/aggregate
@@ -17,64 +17,54 @@
 ok
 
 # COUNT(*) returns the row count.
-# TODO: revisit the plan here. This can be eliminated by short-circuiting optimizer.
 [plan]> SELECT COUNT(*) FROM test
 ---
-Projection: #0
-└─ Aggregate: count(TRUE)
-   └─ Scan: test
+Aggregate: count(TRUE)
+└─ Scan: test
 6
 
 # COUNT works on constant values.
-# TODO: revisit the plan here. This can be eliminated by short-circuiting optimizer.
 [plan,header]> SELECT COUNT(NULL), COUNT(TRUE), COUNT(1), COUNT(3.14), COUNT(NAN), COUNT('')
 ---
-Projection: #0, #1, #2, #3, #4, #5
-└─ Aggregate: count(NULL), count(TRUE), count(1), count(3.14), count(NaN), count()
-   └─ EmptyRow
+Aggregate: count(NULL), count(TRUE), count(1), count(3.14), count(NaN), count()
+└─ EmptyRow
 , , , , , 
 0, 1, 1, 1, 1, 1
 
 # COUNT works on no rows.
 [plan]> SELECT COUNT(id), COUNT("bool"), COUNT("float"), COUNT("string") FROM test WHERE false
 ---
-Projection: #0, #1, #2, #3
-└─ Aggregate: count(id), count(bool), count(float), count(string)
-   └─ Nothing
+Aggregate: count(id), count(bool), count(float), count(string)
+└─ Nothing
 0, 0, 0, 0
 
 # COUNT returns number of non-NULL values.
-# TODO: revisit the plan here. This can be eliminated by short-circuiting optimizer.
 [plan,header]> SELECT COUNT(id), COUNT("bool"), COUNT("float"), COUNT("string") FROM test
 ---
-Projection: #0, #1, #2, #3
-└─ Aggregate: count(id), count(bool), count(float), count(string)
-   └─ Scan: test
+Aggregate: count(id), count(bool), count(float), count(string)
+└─ Scan: test
 , , , 
 6, 3, 5, 4
 
 # MAX works on constant values.
 [plan]> SELECT MAX(NULL), MAX(TRUE), MAX(1), MAX(3.14), MAX(NAN), MAX('foo') FROM test
 ---
-Projection: #0, #1, #2, #3, #4, #5
-└─ Aggregate: max(NULL), max(TRUE), max(1), max(3.14), max(NaN), max(foo)
-   └─ Scan: test
+Aggregate: max(NULL), max(TRUE), max(1), max(3.14), max(NaN), max(foo)
+└─ Scan: test
 NULL, TRUE, 1, 3.14, NaN, foo
 
 # MAX works on no rows.
 [plan]> SELECT MAX(id), MAX("bool"), MAX("float"), MAX("string") FROM test WHERE false
 ---
-Projection: #0, #1, #2, #3
-└─ Aggregate: max(id), max(bool), max(float), max(string)
-   └─ Nothing
+Aggregate: max(id), max(bool), max(float), max(string)
+└─ Nothing
 NULL, NULL, NULL, NULL
 
 # MAX returns the max value, or NULL if any value is NULL.
 [plan]> SELECT MAX(id) FROM test
 ---
-Projection: #0
-└─ Aggregate: max(id)
-   └─ Scan: test
+Aggregate: max(id)
+└─ Scan: test
 5
 
 > SELECT MAX("bool") FROM test
@@ -98,25 +88,22 @@ inf
 # MIN works on constant values.
 [plan]> SELECT MIN(NULL), MIN(TRUE), MIN(1), MIN(3.14), MIN(NAN), MIN('foo') FROM test
 ---
-Projection: #0, #1, #2, #3, #4, #5
-└─ Aggregate: min(NULL), min(TRUE), min(1), min(3.14), min(NaN), min(foo)
-   └─ Scan: test
+Aggregate: min(NULL), min(TRUE), min(1), min(3.14), min(NaN), min(foo)
+└─ Scan: test
 NULL, TRUE, 1, 3.14, NaN, foo
 
 # MIN works on no rows.
 [plan]> SELECT MIN(id), MIN("bool"), MIN("float"), MIN("string") FROM test WHERE false
 ---
-Projection: #0, #1, #2, #3
-└─ Aggregate: min(id), min(bool), min(float), min(string)
-   └─ Nothing
+Aggregate: min(id), min(bool), min(float), min(string)
+└─ Nothing
 NULL, NULL, NULL, NULL
 
 # MIN returns the min value, or NULL if any value is NULL.
 [plan]> SELECT MIN(id) FROM test
 ---
-Projection: #0
-└─ Aggregate: min(id)
-   └─ Scan: test
+Aggregate: min(id)
+└─ Scan: test
 0
 
 > SELECT MIN("bool") FROM test
@@ -138,9 +125,8 @@ FALSE
 # SUM works on constant values, but only numbers.
 [plan]> SELECT SUM(NULL), SUM(1), SUM(3.14), SUM(NAN) FROM test
 ---
-Projection: #0, #1, #2, #3
-└─ Aggregate: sum(NULL), sum(1), sum(3.14), sum(NaN)
-   └─ Scan: test
+Aggregate: sum(NULL), sum(1), sum(3.14), sum(NaN)
+└─ Scan: test
 NULL, 6, 18.84, NaN
 
 !> SELECT SUM(TRUE)
@@ -152,18 +138,16 @@ Error: invalid input: can't add 0 and foo
 # SUM works on no rows.
 [plan]> SELECT SUM(id), SUM("bool"), SUM("float"), SUM("string") FROM test WHERE false
 ---
-Projection: #0, #1, #2, #3
-└─ Aggregate: sum(id), sum(bool), sum(float), sum(string)
-   └─ Nothing
+Aggregate: sum(id), sum(bool), sum(float), sum(string)
+└─ Nothing
 NULL, NULL, NULL, NULL
 
 # SUM returns the sum, or NULL if any value is NULL. Errors
 # on booleans or strings.
 [plan]> SELECT SUM(id) FROM test
 ---
-Projection: #0
-└─ Aggregate: sum(id)
-   └─ Scan: test
+Aggregate: sum(id)
+└─ Scan: test
 15
 
 !> SELECT SUM("bool") FROM test
@@ -187,9 +171,8 @@ Error: invalid input: can't add 0 and
 # AVG works on constant values, but only numbers.
 [plan]> SELECT AVG(NULL), AVG(1), AVG(3.14), AVG(NAN) FROM test
 ---
-Projection: #0, #1, #2, #3
-└─ Aggregate: avg(NULL), avg(1), avg(3.14), avg(NaN)
-   └─ Scan: test
+Aggregate: avg(NULL), avg(1), avg(3.14), avg(NaN)
+└─ Scan: test
 NULL, 1, 3.14, NaN
 
 !> SELECT AVG(TRUE)
@@ -201,18 +184,16 @@ Error: invalid input: can't add 0 and foo
 # AVG works on no rows.
 [plan]> SELECT AVG(id), AVG("bool"), AVG("float"), AVG("string") FROM test WHERE false
 ---
-Projection: #0, #1, #2, #3
-└─ Aggregate: avg(id), avg(bool), avg(float), avg(string)
-   └─ Nothing
+Aggregate: avg(id), avg(bool), avg(float), avg(string)
+└─ Nothing
 NULL, NULL, NULL, NULL
 
 # AVG returns the average, or NULL if any value is NULL. Errors
 # on booleans or strings.
 [plan]> SELECT AVG(id) FROM test
 ---
-Projection: #0
-└─ Aggregate: avg(id)
-   └─ Scan: test
+Aggregate: avg(id)
+└─ Scan: test
 2
 
 !> SELECT AVG("bool") FROM test
@@ -236,9 +217,8 @@ Error: invalid input: can't add 0 and
 # Constant aggregates can be used with rows.
 [plan]> SELECT COUNT(1), MIN(1), MAX(1), SUM(1), AVG(1) FROM test
 ---
-Projection: #0, #1, #2, #3, #4
-└─ Aggregate: count(1), min(1), max(1), sum(1), avg(1)
-   └─ Scan: test
+Aggregate: count(1), min(1), max(1), sum(1), avg(1)
+└─ Scan: test
 6, 1, 1, 6, 1
 
 # Constant aggregates can't be used with value rows.
diff --git a/src/sql/testscripts/queries/group_by b/src/sql/testscripts/queries/group_by
index 28a516237..468015fec 100644
--- a/src/sql/testscripts/queries/group_by
+++ b/src/sql/testscripts/queries/group_by
@@ -31,9 +31,8 @@ Projection: #1, #2, #3, #4, #5
 # Simple GROUP BY, including NULL group.
 [plan]> SELECT "group", COUNT(*) FROM test GROUP BY "group"
 ---
-Projection: test.group, #1
-└─ Aggregate: count(TRUE) group by group
-   └─ Scan: test
+Aggregate: count(TRUE) group by group
+└─ Scan: test
 NULL, 1
 a, 3
 b, 3
@@ -41,9 +40,8 @@ b, 3
 [plan]> SELECT "group", COUNT(*), MIN("bool"), MAX("string"), SUM("int"), AVG("float") \
     FROM test GROUP BY "group"
 ---
-Projection: test.group, #1, #2, #3, #4, #5
-└─ Aggregate: count(TRUE), min(bool), max(string), sum(int), avg(float) group by group
-   └─ Scan: test
+Aggregate: count(TRUE), min(bool), max(string), sum(int), avg(float) group by group
+└─ Scan: test
 NULL, 1, NULL, NULL, NULL, NULL
 a, 3, FALSE, AB, 9, NaN
 b, 3, FALSE, 👋, 41, NaN
@@ -51,9 +49,8 @@ b, 3, FALSE, 👋, 41, NaN
 # GROUP BY works on booleans.
 [plan]> SELECT "bool", COUNT(*) FROM test GROUP BY "bool"
 ---
-Projection: test.bool, #1
-└─ Aggregate: count(TRUE) group by bool
-   └─ Scan: test
+Aggregate: count(TRUE) group by bool
+└─ Scan: test
 NULL, 1
 FALSE, 3
 TRUE, 3
@@ -61,9 +58,8 @@ TRUE, 3
 # GROUP BY works on integers.
 [plan]> SELECT "int", COUNT(*) FROM test GROUP BY "int"
 ---
-Projection: test.int, #1
-└─ Aggregate: count(TRUE) group by int
-   └─ Scan: test
+Aggregate: count(TRUE) group by int
+└─ Scan: test
 NULL, 1
 -1, 2
 0, 1
@@ -74,9 +70,8 @@ NULL, 1
 # GROUP BY works with floats, including a NAN group and -0.0 and 0.0 being equal.
 [plan]> SELECT "float", COUNT(*) FROM test GROUP BY "float"
 ---
-Projection: test.float, #1
-└─ Aggregate: count(TRUE) group by float
-   └─ Scan: test
+Aggregate: count(TRUE) group by float
+└─ Scan: test
 NULL, 1
 0, 2
 3.14, 1
@@ -86,9 +81,8 @@ NaN, 2
 # GROUP BY works on strings.
 [plan]> SELECT "string", COUNT(*) FROM test GROUP BY "string"
 ---
-Projection: test.string, #1
-└─ Aggregate: count(TRUE) group by string
-   └─ Scan: test
+Aggregate: count(TRUE) group by string
+└─ Scan: test
 NULL, 1
 , 2
 AB, 1
@@ -108,9 +102,8 @@ Projection: #1
 # GROUP BY works when there is no aggregate function.
 [plan]> SELECT "group" FROM test GROUP BY "group"
 ---
-Projection: test.group
-└─ Aggregate:  group by group
-   └─ Scan: test
+Aggregate:  group by group
+└─ Scan: test
 NULL
 a
 b
@@ -122,9 +115,8 @@ Error: invalid input: unknown field g
 
 [plan]> SELECT "group", COUNT(*) FROM test AS t GROUP BY t."group"
 ---
-Projection: group, #1
-└─ Aggregate: count(TRUE) group by t.group
-   └─ Scan: test as t
+Aggregate: count(TRUE) group by t.group
+└─ Scan: test as t
 NULL, 1
 a, 3
 b, 3
@@ -164,9 +156,8 @@ Projection: #1
 # GROUP BY can use an expression also used in the SELECT.
 [plan]> SELECT id % 2, COUNT(*) FROM test GROUP BY id % 2
 ---
-Projection: #0, #1
-└─ Aggregate: count(TRUE) group by id % 2
-   └─ Scan: test
+Aggregate: count(TRUE) group by id % 2
+└─ Scan: test
 0, 4
 1, 3
 
@@ -183,9 +174,8 @@ Error: invalid input: unknown function min
 # GROUP BY works with multiple groups.
 [plan]> SELECT "group", "bool", COUNT(*) FROM test GROUP BY "group", "bool"
 ---
-Projection: test.group, test.bool, #2
-└─ Aggregate: count(TRUE) group by group, bool
-   └─ Scan: test
+Aggregate: count(TRUE) group by group, bool
+└─ Scan: test
 NULL, NULL, 1
 a, FALSE, 1
 a, TRUE, 2
@@ -195,9 +185,8 @@ b, TRUE, 1
 # GROUP BY works with joins.
 [plan]> SELECT t.id % 2, COUNT(*) FROM test t JOIN other o ON t.id % 2 = o.id GROUP BY t.id % 2
 ---
-Projection: #0, #1
-└─ Aggregate: count(TRUE) group by t.id % 2
-   └─ NestedLoopJoin: inner on t.id % 2 = o.id
-      ├─ Scan: test as t
-      └─ Scan: other as o
+Aggregate: count(TRUE) group by t.id % 2
+└─ NestedLoopJoin: inner on t.id % 2 = o.id
+   ├─ Scan: test as t
+   └─ Scan: other as o
 1, 3
diff --git a/src/sql/testscripts/queries/having b/src/sql/testscripts/queries/having
index 45a36c2c2..7928c6da8 100644
--- a/src/sql/testscripts/queries/having
+++ b/src/sql/testscripts/queries/having
@@ -52,9 +52,8 @@ b, 42
 ---
 Projection: #0
 └─ Filter: #1 > 10
-   └─ Projection: test.group, #1
-      └─ Aggregate: max(int) group by group
-         └─ Scan: test
+   └─ Aggregate: max(int) group by group
+      └─ Scan: test
 b
 
 [plan]> SELECT "group", MAX("int") FROM test GROUP BY "group" HAVING MAX("int") - MIN("int") > 10
@@ -70,9 +69,8 @@ b, 42
 [plan]> SELECT "group", MAX("int") AS m FROM test GROUP BY "group" HAVING m > 10
 ---
 Filter: m > 10
-└─ Projection: test.group, #1
-   └─ Aggregate: max(int) group by group
-      └─ Scan: test
+└─ Aggregate: max(int) group by group
+   └─ Scan: test
 b, 42
 
 # Having works with an aggregate function not in the SELECT clause.
@@ -80,9 +78,8 @@ b, 42
 ---
 Projection: #0, #1
 └─ Filter: #2 > 10
-   └─ Projection: test.group, #1, #2
-      └─ Aggregate: count(TRUE), max(int) group by group
-         └─ Scan: test
+   └─ Aggregate: count(TRUE), max(int) group by group
+      └─ Scan: test
 b, 3
 
 # Having works with compound expressions.
diff --git a/src/sql/testscripts/queries/order b/src/sql/testscripts/queries/order
index 2ebf1a23b..716926492 100644
--- a/src/sql/testscripts/queries/order
+++ b/src/sql/testscripts/queries/order
@@ -485,9 +485,8 @@ FALSE, -1
 ---
 Projection: #0
 └─ Order: #1 desc
-   └─ Projection: test.bool, #1
-      └─ Aggregate: max(int) group by bool
-         └─ Scan: test
+   └─ Aggregate: max(int) group by bool
+      └─ Scan: test
 NULL
 TRUE
 FALSE