From cb7a365d8641f13e0483dcd966a50fe3c428104d Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sun, 14 Jul 2024 17:27:59 +0200 Subject: [PATCH] wip sql: eliminate noop projections in short_circuit optimizer --- src/sql/execution/execute.rs | 4 +- src/sql/planner/optimizer.rs | 20 ++++-- src/sql/planner/plan.rs | 30 ++++++++- src/sql/testscripts/queries/aggregate | 88 +++++++++++---------------- src/sql/testscripts/queries/group_by | 59 ++++++++---------- src/sql/testscripts/queries/having | 15 ++--- src/sql/testscripts/queries/order | 5 +- 7 files changed, 111 insertions(+), 110 deletions(-) diff --git a/src/sql/execution/execute.rs b/src/sql/execution/execute.rs index d52509821..e17899c44 100644 --- a/src/sql/execution/execute.rs +++ b/src/sql/execution/execute.rs @@ -86,10 +86,12 @@ pub fn execute(node: Node, txn: &impl Transaction) -> Result { } Node::IndexLookup { table, column, values, alias: _ } => { + let column = table.columns.into_iter().nth(column).expect("invalid column").name; + let table = table.name; source::lookup_index(txn, table, column, values) } - Node::KeyLookup { table, keys, alias: _ } => source::lookup_key(txn, table, keys), + Node::KeyLookup { table, keys, alias: _ } => source::lookup_key(txn, table.name, keys), Node::Limit { source, limit } => { let source = execute(*source, txn)?; diff --git a/src/sql/planner/optimizer.rs b/src/sql/planner/optimizer.rs index 87a802a9d..8b699d21d 100644 --- a/src/sql/planner/optimizer.rs +++ b/src/sql/planner/optimizer.rs @@ -238,14 +238,13 @@ pub(super) fn index_lookup(node: Node) -> Result { }; // Extract the lookup values and expression from the cnf vector. - let (field, values) = cnf.remove(i).into_field_values().expect("field lookup failed"); + let (column, values) = cnf.remove(i).into_field_values().expect("field lookup failed"); // Build the primary key or secondary index lookup node. - if field == table.primary_key { - node = Node::KeyLookup { table: table.name, keys: values, alias }; + if column == table.primary_key { + node = Node::KeyLookup { table, keys: values, alias }; } else { - let column = table.columns.into_iter().nth(field).unwrap().name; - node = Node::IndexLookup { table: table.name, column, values, alias }; + node = Node::IndexLookup { table, column, values, alias }; } // If there's any remaining CNF expressions add a filter node for them. @@ -356,6 +355,17 @@ pub(super) fn short_circuit(node: Node) -> Result { Node::Nothing } + // Remove projections that simply pass through the original columns. + Node::Projection { source, expressions, .. } + if source.size() == expressions.len() + && expressions + .iter() + .enumerate() + .all(|(i, e)| matches!(e, Expression::Field(f, _) if i == *f)) => + { + *source + } + node => node, }; diff --git a/src/sql/planner/plan.rs b/src/sql/planner/plan.rs index 3331e874b..a35a11af6 100644 --- a/src/sql/planner/plan.rs +++ b/src/sql/planner/plan.rs @@ -109,9 +109,9 @@ pub enum Node { /// Looks up the given values in a secondary index and emits matching rows. /// NULL and NaN values are considered equal, to allow IS NULL and IS NAN /// index lookups, as is -0.0 and 0.0. - IndexLookup { table: String, column: String, values: Vec, alias: Option }, + IndexLookup { table: Table, column: usize, values: Vec, alias: Option }, /// Looks up the given primary keys and emits their rows. - KeyLookup { table: String, keys: Vec, alias: Option }, + KeyLookup { table: Table, keys: Vec, alias: Option }, /// Only emits the first limit rows from the source, discards the rest. Limit { source: Box, limit: usize }, /// Joins the left and right sources on the given predicate by buffering the @@ -278,6 +278,28 @@ impl Node { | Self::Scan { filter: None, .. }) => node, }) } + + /// Returns the column width of the node. + /// TODO: this can replace various size parameters, e.g. left_size and + /// right_size in NestedLoopJoin. Possibly also Scope.len(). + pub fn size(&self) -> usize { + match &self { + Node::Aggregate { aggregates, group_by, .. } => aggregates.len() + group_by.len(), + Node::Filter { source, .. } => source.size(), + Node::HashJoin { left, right, .. } => left.size() + right.size(), + Node::IndexLookup { table, .. } => table.columns.len(), + Node::KeyLookup { table, .. } => table.columns.len(), + Node::Limit { source, .. } => source.size(), + Node::NestedLoopJoin { left, right, .. } => left.size() + right.size(), + Node::Nothing => 0, + Node::EmptyRow => 0, // TODO: incorrect, but will be removed + Node::Offset { source, .. } => source.size(), + Node::Order { source, .. } => source.size(), + Node::Projection { expressions, .. } => expressions.len(), + Node::Scan { table, .. } => table.columns.len(), + Node::Values { rows } => rows.first().map(|row| row.len()).unwrap_or(0), + } + } } /// Formats the plan as an EXPLAIN tree. @@ -375,6 +397,8 @@ impl Node { right.format(f, prefix, false, true)?; } Self::IndexLookup { table, column, alias, values } => { + let column = &table.columns[*column].name; + let table = &table.name; write!(f, "IndexLookup: {table}.{column}")?; if let Some(alias) = alias { write!(f, " as {alias}.{column}")?; @@ -386,7 +410,7 @@ impl Node { } } Self::KeyLookup { table, alias, keys } => { - write!(f, "KeyLookup: {table}")?; + write!(f, "KeyLookup: {}", table.name)?; if let Some(alias) = alias { write!(f, " as {alias}")?; } diff --git a/src/sql/testscripts/queries/aggregate b/src/sql/testscripts/queries/aggregate index 350406372..4dc8f4509 100644 --- a/src/sql/testscripts/queries/aggregate +++ b/src/sql/testscripts/queries/aggregate @@ -17,64 +17,54 @@ ok # COUNT(*) returns the row count. -# TODO: revisit the plan here. This can be eliminated by short-circuiting optimizer. [plan]> SELECT COUNT(*) FROM test --- -Projection: #0 -└─ Aggregate: count(TRUE) - └─ Scan: test +Aggregate: count(TRUE) +└─ Scan: test 6 # COUNT works on constant values. -# TODO: revisit the plan here. This can be eliminated by short-circuiting optimizer. [plan,header]> SELECT COUNT(NULL), COUNT(TRUE), COUNT(1), COUNT(3.14), COUNT(NAN), COUNT('') --- -Projection: #0, #1, #2, #3, #4, #5 -└─ Aggregate: count(NULL), count(TRUE), count(1), count(3.14), count(NaN), count() - └─ EmptyRow +Aggregate: count(NULL), count(TRUE), count(1), count(3.14), count(NaN), count() +└─ EmptyRow , , , , , 0, 1, 1, 1, 1, 1 # COUNT works on no rows. [plan]> SELECT COUNT(id), COUNT("bool"), COUNT("float"), COUNT("string") FROM test WHERE false --- -Projection: #0, #1, #2, #3 -└─ Aggregate: count(id), count(bool), count(float), count(string) - └─ Nothing +Aggregate: count(id), count(bool), count(float), count(string) +└─ Nothing 0, 0, 0, 0 # COUNT returns number of non-NULL values. -# TODO: revisit the plan here. This can be eliminated by short-circuiting optimizer. [plan,header]> SELECT COUNT(id), COUNT("bool"), COUNT("float"), COUNT("string") FROM test --- -Projection: #0, #1, #2, #3 -└─ Aggregate: count(id), count(bool), count(float), count(string) - └─ Scan: test +Aggregate: count(id), count(bool), count(float), count(string) +└─ Scan: test , , , 6, 3, 5, 4 # MAX works on constant values. [plan]> SELECT MAX(NULL), MAX(TRUE), MAX(1), MAX(3.14), MAX(NAN), MAX('foo') FROM test --- -Projection: #0, #1, #2, #3, #4, #5 -└─ Aggregate: max(NULL), max(TRUE), max(1), max(3.14), max(NaN), max(foo) - └─ Scan: test +Aggregate: max(NULL), max(TRUE), max(1), max(3.14), max(NaN), max(foo) +└─ Scan: test NULL, TRUE, 1, 3.14, NaN, foo # MAX works on no rows. [plan]> SELECT MAX(id), MAX("bool"), MAX("float"), MAX("string") FROM test WHERE false --- -Projection: #0, #1, #2, #3 -└─ Aggregate: max(id), max(bool), max(float), max(string) - └─ Nothing +Aggregate: max(id), max(bool), max(float), max(string) +└─ Nothing NULL, NULL, NULL, NULL # MAX returns the max value, or NULL if any value is NULL. [plan]> SELECT MAX(id) FROM test --- -Projection: #0 -└─ Aggregate: max(id) - └─ Scan: test +Aggregate: max(id) +└─ Scan: test 5 > SELECT MAX("bool") FROM test @@ -98,25 +88,22 @@ inf # MIN works on constant values. [plan]> SELECT MIN(NULL), MIN(TRUE), MIN(1), MIN(3.14), MIN(NAN), MIN('foo') FROM test --- -Projection: #0, #1, #2, #3, #4, #5 -└─ Aggregate: min(NULL), min(TRUE), min(1), min(3.14), min(NaN), min(foo) - └─ Scan: test +Aggregate: min(NULL), min(TRUE), min(1), min(3.14), min(NaN), min(foo) +└─ Scan: test NULL, TRUE, 1, 3.14, NaN, foo # MIN works on no rows. [plan]> SELECT MIN(id), MIN("bool"), MIN("float"), MIN("string") FROM test WHERE false --- -Projection: #0, #1, #2, #3 -└─ Aggregate: min(id), min(bool), min(float), min(string) - └─ Nothing +Aggregate: min(id), min(bool), min(float), min(string) +└─ Nothing NULL, NULL, NULL, NULL # MIN returns the min value, or NULL if any value is NULL. [plan]> SELECT MIN(id) FROM test --- -Projection: #0 -└─ Aggregate: min(id) - └─ Scan: test +Aggregate: min(id) +└─ Scan: test 0 > SELECT MIN("bool") FROM test @@ -138,9 +125,8 @@ FALSE # SUM works on constant values, but only numbers. [plan]> SELECT SUM(NULL), SUM(1), SUM(3.14), SUM(NAN) FROM test --- -Projection: #0, #1, #2, #3 -└─ Aggregate: sum(NULL), sum(1), sum(3.14), sum(NaN) - └─ Scan: test +Aggregate: sum(NULL), sum(1), sum(3.14), sum(NaN) +└─ Scan: test NULL, 6, 18.84, NaN !> SELECT SUM(TRUE) @@ -152,18 +138,16 @@ Error: invalid input: can't add 0 and foo # SUM works on no rows. [plan]> SELECT SUM(id), SUM("bool"), SUM("float"), SUM("string") FROM test WHERE false --- -Projection: #0, #1, #2, #3 -└─ Aggregate: sum(id), sum(bool), sum(float), sum(string) - └─ Nothing +Aggregate: sum(id), sum(bool), sum(float), sum(string) +└─ Nothing NULL, NULL, NULL, NULL # SUM returns the sum, or NULL if any value is NULL. Errors # on booleans or strings. [plan]> SELECT SUM(id) FROM test --- -Projection: #0 -└─ Aggregate: sum(id) - └─ Scan: test +Aggregate: sum(id) +└─ Scan: test 15 !> SELECT SUM("bool") FROM test @@ -187,9 +171,8 @@ Error: invalid input: can't add 0 and # AVG works on constant values, but only numbers. [plan]> SELECT AVG(NULL), AVG(1), AVG(3.14), AVG(NAN) FROM test --- -Projection: #0, #1, #2, #3 -└─ Aggregate: avg(NULL), avg(1), avg(3.14), avg(NaN) - └─ Scan: test +Aggregate: avg(NULL), avg(1), avg(3.14), avg(NaN) +└─ Scan: test NULL, 1, 3.14, NaN !> SELECT AVG(TRUE) @@ -201,18 +184,16 @@ Error: invalid input: can't add 0 and foo # AVG works on no rows. [plan]> SELECT AVG(id), AVG("bool"), AVG("float"), AVG("string") FROM test WHERE false --- -Projection: #0, #1, #2, #3 -└─ Aggregate: avg(id), avg(bool), avg(float), avg(string) - └─ Nothing +Aggregate: avg(id), avg(bool), avg(float), avg(string) +└─ Nothing NULL, NULL, NULL, NULL # AVG returns the average, or NULL if any value is NULL. Errors # on booleans or strings. [plan]> SELECT AVG(id) FROM test --- -Projection: #0 -└─ Aggregate: avg(id) - └─ Scan: test +Aggregate: avg(id) +└─ Scan: test 2 !> SELECT AVG("bool") FROM test @@ -236,9 +217,8 @@ Error: invalid input: can't add 0 and # Constant aggregates can be used with rows. [plan]> SELECT COUNT(1), MIN(1), MAX(1), SUM(1), AVG(1) FROM test --- -Projection: #0, #1, #2, #3, #4 -└─ Aggregate: count(1), min(1), max(1), sum(1), avg(1) - └─ Scan: test +Aggregate: count(1), min(1), max(1), sum(1), avg(1) +└─ Scan: test 6, 1, 1, 6, 1 # Constant aggregates can't be used with value rows. diff --git a/src/sql/testscripts/queries/group_by b/src/sql/testscripts/queries/group_by index 28a516237..468015fec 100644 --- a/src/sql/testscripts/queries/group_by +++ b/src/sql/testscripts/queries/group_by @@ -31,9 +31,8 @@ Projection: #1, #2, #3, #4, #5 # Simple GROUP BY, including NULL group. [plan]> SELECT "group", COUNT(*) FROM test GROUP BY "group" --- -Projection: test.group, #1 -└─ Aggregate: count(TRUE) group by group - └─ Scan: test +Aggregate: count(TRUE) group by group +└─ Scan: test NULL, 1 a, 3 b, 3 @@ -41,9 +40,8 @@ b, 3 [plan]> SELECT "group", COUNT(*), MIN("bool"), MAX("string"), SUM("int"), AVG("float") \ FROM test GROUP BY "group" --- -Projection: test.group, #1, #2, #3, #4, #5 -└─ Aggregate: count(TRUE), min(bool), max(string), sum(int), avg(float) group by group - └─ Scan: test +Aggregate: count(TRUE), min(bool), max(string), sum(int), avg(float) group by group +└─ Scan: test NULL, 1, NULL, NULL, NULL, NULL a, 3, FALSE, AB, 9, NaN b, 3, FALSE, 👋, 41, NaN @@ -51,9 +49,8 @@ b, 3, FALSE, 👋, 41, NaN # GROUP BY works on booleans. [plan]> SELECT "bool", COUNT(*) FROM test GROUP BY "bool" --- -Projection: test.bool, #1 -└─ Aggregate: count(TRUE) group by bool - └─ Scan: test +Aggregate: count(TRUE) group by bool +└─ Scan: test NULL, 1 FALSE, 3 TRUE, 3 @@ -61,9 +58,8 @@ TRUE, 3 # GROUP BY works on integers. [plan]> SELECT "int", COUNT(*) FROM test GROUP BY "int" --- -Projection: test.int, #1 -└─ Aggregate: count(TRUE) group by int - └─ Scan: test +Aggregate: count(TRUE) group by int +└─ Scan: test NULL, 1 -1, 2 0, 1 @@ -74,9 +70,8 @@ NULL, 1 # GROUP BY works with floats, including a NAN group and -0.0 and 0.0 being equal. [plan]> SELECT "float", COUNT(*) FROM test GROUP BY "float" --- -Projection: test.float, #1 -└─ Aggregate: count(TRUE) group by float - └─ Scan: test +Aggregate: count(TRUE) group by float +└─ Scan: test NULL, 1 0, 2 3.14, 1 @@ -86,9 +81,8 @@ NaN, 2 # GROUP BY works on strings. [plan]> SELECT "string", COUNT(*) FROM test GROUP BY "string" --- -Projection: test.string, #1 -└─ Aggregate: count(TRUE) group by string - └─ Scan: test +Aggregate: count(TRUE) group by string +└─ Scan: test NULL, 1 , 2 AB, 1 @@ -108,9 +102,8 @@ Projection: #1 # GROUP BY works when there is no aggregate function. [plan]> SELECT "group" FROM test GROUP BY "group" --- -Projection: test.group -└─ Aggregate: group by group - └─ Scan: test +Aggregate: group by group +└─ Scan: test NULL a b @@ -122,9 +115,8 @@ Error: invalid input: unknown field g [plan]> SELECT "group", COUNT(*) FROM test AS t GROUP BY t."group" --- -Projection: group, #1 -└─ Aggregate: count(TRUE) group by t.group - └─ Scan: test as t +Aggregate: count(TRUE) group by t.group +└─ Scan: test as t NULL, 1 a, 3 b, 3 @@ -164,9 +156,8 @@ Projection: #1 # GROUP BY can use an expression also used in the SELECT. [plan]> SELECT id % 2, COUNT(*) FROM test GROUP BY id % 2 --- -Projection: #0, #1 -└─ Aggregate: count(TRUE) group by id % 2 - └─ Scan: test +Aggregate: count(TRUE) group by id % 2 +└─ Scan: test 0, 4 1, 3 @@ -183,9 +174,8 @@ Error: invalid input: unknown function min # GROUP BY works with multiple groups. [plan]> SELECT "group", "bool", COUNT(*) FROM test GROUP BY "group", "bool" --- -Projection: test.group, test.bool, #2 -└─ Aggregate: count(TRUE) group by group, bool - └─ Scan: test +Aggregate: count(TRUE) group by group, bool +└─ Scan: test NULL, NULL, 1 a, FALSE, 1 a, TRUE, 2 @@ -195,9 +185,8 @@ b, TRUE, 1 # GROUP BY works with joins. [plan]> SELECT t.id % 2, COUNT(*) FROM test t JOIN other o ON t.id % 2 = o.id GROUP BY t.id % 2 --- -Projection: #0, #1 -└─ Aggregate: count(TRUE) group by t.id % 2 - └─ NestedLoopJoin: inner on t.id % 2 = o.id - ├─ Scan: test as t - └─ Scan: other as o +Aggregate: count(TRUE) group by t.id % 2 +└─ NestedLoopJoin: inner on t.id % 2 = o.id + ├─ Scan: test as t + └─ Scan: other as o 1, 3 diff --git a/src/sql/testscripts/queries/having b/src/sql/testscripts/queries/having index 45a36c2c2..7928c6da8 100644 --- a/src/sql/testscripts/queries/having +++ b/src/sql/testscripts/queries/having @@ -52,9 +52,8 @@ b, 42 --- Projection: #0 └─ Filter: #1 > 10 - └─ Projection: test.group, #1 - └─ Aggregate: max(int) group by group - └─ Scan: test + └─ Aggregate: max(int) group by group + └─ Scan: test b [plan]> SELECT "group", MAX("int") FROM test GROUP BY "group" HAVING MAX("int") - MIN("int") > 10 @@ -70,9 +69,8 @@ b, 42 [plan]> SELECT "group", MAX("int") AS m FROM test GROUP BY "group" HAVING m > 10 --- Filter: m > 10 -└─ Projection: test.group, #1 - └─ Aggregate: max(int) group by group - └─ Scan: test +└─ Aggregate: max(int) group by group + └─ Scan: test b, 42 # Having works with an aggregate function not in the SELECT clause. @@ -80,9 +78,8 @@ b, 42 --- Projection: #0, #1 └─ Filter: #2 > 10 - └─ Projection: test.group, #1, #2 - └─ Aggregate: count(TRUE), max(int) group by group - └─ Scan: test + └─ Aggregate: count(TRUE), max(int) group by group + └─ Scan: test b, 3 # Having works with compound expressions. diff --git a/src/sql/testscripts/queries/order b/src/sql/testscripts/queries/order index 2ebf1a23b..716926492 100644 --- a/src/sql/testscripts/queries/order +++ b/src/sql/testscripts/queries/order @@ -485,9 +485,8 @@ FALSE, -1 --- Projection: #0 └─ Order: #1 desc - └─ Projection: test.bool, #1 - └─ Aggregate: max(int) group by bool - └─ Scan: test + └─ Aggregate: max(int) group by bool + └─ Scan: test NULL TRUE FALSE