From ed4c7ee22eb09027a540744398fcc0292e21641e Mon Sep 17 00:00:00 2001 From: Kould <2435992353@qq.com> Date: Mon, 29 Jan 2024 02:28:14 +0800 Subject: [PATCH] Refer to SparkSQL to implement Physical Selection of CBO based on RBO. (#118) * bench: add SQLite for comparative testing * feat: impl Memo of CBO * feat: impl Histogram of CBO * feat: add TableMeta of `ShowTable` * feat: impl `Histogram` loading on `Memo::new` to calculate the cost of `Expression` * feat: impl `AnalyzeTable` to build the histograms that `Memo::new` loads * feat: complete the integration of CBO into the optimizer (currently only single-column index selection is supported) * perf: optimize row count estimation - use a Count-Min Sketch to estimate equivalence conditions - refer to BaikalDB for more accurate row count estimation when a range condition intersects a histogram bucket's range * style: Histogram -> ColumnMeta * fix: the optimizer does not use CBO by default --- .gitignore | 3 +- Cargo.toml | 11 +- README.md | 58 +- benchmarks/query_benchmark.rs | 129 ++- rust-toolchain | 2 +- src/binder/alter_table.rs | 2 + src/binder/analyze.rs | 42 + src/binder/copy.rs | 2 + src/binder/create_table.rs | 1 + src/binder/delete.rs | 32 +- src/binder/drop_table.rs | 1 + src/binder/insert.rs | 2 + src/binder/mod.rs | 4 +- src/binder/select.rs | 11 +- src/binder/show.rs | 1 + src/binder/truncate.rs | 1 + src/binder/update.rs | 1 + src/catalog/table.rs | 25 +- src/db.rs | 82 +- src/execution/codegen/mod.rs | 5 +- src/execution/mod.rs | 9 + .../ddl/{alter_table => }/add_column.rs | 0 src/execution/volcano/ddl/alter_table/mod.rs | 2 - .../ddl/{alter_table => }/drop_column.rs | 14 +- src/execution/volcano/ddl/mod.rs | 3 +- src/execution/volcano/dml/analyze.rs | 125 +++ src/execution/volcano/dml/delete.rs | 2 +- src/execution/volcano/dml/mod.rs | 1 + src/execution/volcano/dql/index_scan.rs | 19 +- src/execution/volcano/dql/join/hash_join.rs | 5 +- src/execution/volcano/dql/sort.rs | 2 +- src/execution/volcano/mod.rs | 22 +- src/execution/volcano/show/show_table.rs | 21 +- src/expression/simplify.rs | 51 +- src/lib.rs | 2 +- src/main.rs | 6 +- src/optimizer/core/cm_sketch.rs | 211 +++++ src/optimizer/core/column_meta.rs | 181 ++++ src/optimizer/core/histogram.rs | 810 ++++++++++++++++++ src/optimizer/core/memo.rs | 163 ++++ src/optimizer/core/mod.rs | 4 + src/optimizer/core/opt_expr.rs | 1 + src/optimizer/core/rule.rs | 20 +- src/optimizer/heuristic/batch.rs | 10 +- src/optimizer/heuristic/graph.rs | 62 +- src/optimizer/heuristic/matcher.rs | 4 + src/optimizer/heuristic/optimizer.rs | 69 +- src/optimizer/mod.rs | 31 +- .../rule/implementation/ddl/add_column.rs | 27 + .../rule/implementation/ddl/create_table.rs | 27 + .../rule/implementation/ddl/drop_column.rs | 27 + .../rule/implementation/ddl/drop_table.rs | 27 + src/optimizer/rule/implementation/ddl/mod.rs | 5 + .../rule/implementation/ddl/truncate.rs | 27 + .../rule/implementation/dml/analyze.rs | 27 + .../rule/implementation/dml/copy_from_file.rs | 27 + .../rule/implementation/dml/copy_to_file.rs | 27 + .../rule/implementation/dml/delete.rs | 23 + .../rule/implementation/dml/insert.rs | 23 + src/optimizer/rule/implementation/dml/mod.rs | 6 + .../rule/implementation/dml/update.rs | 23 + .../rule/implementation/dql/aggregate.rs | 51 ++ .../rule/implementation/dql/dummy.rs | 23 + .../rule/implementation/dql/filter.rs | 23 + src/optimizer/rule/implementation/dql/join.rs | 27 + .../rule/implementation/dql/limit.rs | 23 + src/optimizer/rule/implementation/dql/mod.rs | 9 +
.../rule/implementation/dql/projection.rs | 27 + src/optimizer/rule/implementation/dql/scan.rs | 111 +++ src/optimizer/rule/implementation/dql/sort.rs | 23 + .../rule/implementation/dql/values.rs | 23 + src/optimizer/rule/implementation/marcos.rs | 27 + src/optimizer/rule/implementation/mod.rs | 174 ++++ src/optimizer/rule/mod.rs | 81 +- .../{ => normalization}/column_pruning.rs | 31 +- .../{ => normalization}/combine_operators.rs | 58 +- src/optimizer/rule/normalization/mod.rs | 91 ++ .../{ => normalization}/pushdown_limit.rs | 105 ++- .../pushdown_predicates.rs | 81 +- .../{ => normalization}/simplification.rs | 108 ++- src/parser/mod.rs | 7 +- src/planner/mod.rs | 3 +- src/planner/operator/aggregate.rs | 1 + src/planner/operator/analyze.rs | 7 + src/planner/operator/delete.rs | 4 +- src/planner/operator/filter.rs | 1 + src/planner/operator/join.rs | 1 + src/planner/operator/limit.rs | 1 + src/planner/operator/mod.rs | 34 + src/planner/operator/scan.rs | 32 +- src/storage/kip.rs | 119 +-- src/storage/mod.rs | 11 +- src/storage/table_codec.rs | 42 +- src/types/errors.rs | 10 +- src/types/index.rs | 7 + src/types/value.rs | 38 +- tests/slt/analyze.slt | 48 ++ 97 files changed, 3497 insertions(+), 526 deletions(-) create mode 100644 src/binder/analyze.rs rename src/execution/volcano/ddl/{alter_table => }/add_column.rs (100%) delete mode 100644 src/execution/volcano/ddl/alter_table/mod.rs rename src/execution/volcano/ddl/{alter_table => }/drop_column.rs (85%) create mode 100644 src/execution/volcano/dml/analyze.rs create mode 100644 src/optimizer/core/cm_sketch.rs create mode 100644 src/optimizer/core/column_meta.rs create mode 100644 src/optimizer/core/histogram.rs create mode 100644 src/optimizer/core/memo.rs create mode 100644 src/optimizer/rule/implementation/ddl/add_column.rs create mode 100644 src/optimizer/rule/implementation/ddl/create_table.rs create mode 100644 src/optimizer/rule/implementation/ddl/drop_column.rs create mode 100644 src/optimizer/rule/implementation/ddl/drop_table.rs create mode 100644 src/optimizer/rule/implementation/ddl/mod.rs create mode 100644 src/optimizer/rule/implementation/ddl/truncate.rs create mode 100644 src/optimizer/rule/implementation/dml/analyze.rs create mode 100644 src/optimizer/rule/implementation/dml/copy_from_file.rs create mode 100644 src/optimizer/rule/implementation/dml/copy_to_file.rs create mode 100644 src/optimizer/rule/implementation/dml/delete.rs create mode 100644 src/optimizer/rule/implementation/dml/insert.rs create mode 100644 src/optimizer/rule/implementation/dml/mod.rs create mode 100644 src/optimizer/rule/implementation/dml/update.rs create mode 100644 src/optimizer/rule/implementation/dql/aggregate.rs create mode 100644 src/optimizer/rule/implementation/dql/dummy.rs create mode 100644 src/optimizer/rule/implementation/dql/filter.rs create mode 100644 src/optimizer/rule/implementation/dql/join.rs create mode 100644 src/optimizer/rule/implementation/dql/limit.rs create mode 100644 src/optimizer/rule/implementation/dql/mod.rs create mode 100644 src/optimizer/rule/implementation/dql/projection.rs create mode 100644 src/optimizer/rule/implementation/dql/scan.rs create mode 100644 src/optimizer/rule/implementation/dql/sort.rs create mode 100644 src/optimizer/rule/implementation/dql/values.rs create mode 100644 src/optimizer/rule/implementation/marcos.rs create mode 100644 src/optimizer/rule/implementation/mod.rs rename src/optimizer/rule/{ => normalization}/column_pruning.rs (87%) rename src/optimizer/rule/{ => 
normalization}/combine_operators.rs (76%) create mode 100644 src/optimizer/rule/normalization/mod.rs rename src/optimizer/rule/{ => normalization}/pushdown_limit.rs (70%) rename src/optimizer/rule/{ => normalization}/pushdown_predicates.rs (86%) rename src/optimizer/rule/{ => normalization}/simplification.rs (81%) create mode 100644 src/planner/operator/analyze.rs create mode 100644 tests/slt/analyze.slt diff --git a/.gitignore b/.gitignore index 5e475b7e..362d1e8d 100644 --- a/.gitignore +++ b/.gitignore @@ -23,4 +23,5 @@ Cargo.lock /hello_world /transaction -query_bench_data/ \ No newline at end of file +kipsql_bench +sqlite_bench \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index f93fdf2d..3215198f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ [package] name = "kip-sql" -version = "0.0.1-alpha.8" +version = "0.0.1-alpha.9" edition = "2021" authors = ["Kould ", "Xwg "] description = "build the SQL layer of KipDB database" @@ -24,7 +24,6 @@ codegen_execute = ["dep:mlua"] name = "query_bench" path = "benchmarks/query_benchmark.rs" harness = false -required-features = ["codegen_execute"] [dependencies] sqlparser = "0.34.0" @@ -46,11 +45,14 @@ ahash = "0.8.3" lazy_static = "1.4.0" comfy-table = "7.0.1" bytes = "1.5.0" -kip_db = "0.1.2-alpha.21" +kip_db = "0.1.2-alpha.23.fix5" rust_decimal = "1" csv = "1" regex = "1.10.2" clap = "4.4.11" +rand = "0.8.5" +dirs = "5.0.1" +siphasher = { version = "0.3.11", features = ["serde"] } mlua = { version = "0.9.1", features = ["luajit", "vendored", "macros", "async"], optional = true } @@ -64,6 +66,9 @@ env_logger = "0.10" paste = "^1.0" rstest = "0.17" tempfile = "3.0.7" +rand_distr = "0.4.3" + +sqlite = "0.32.0" [workspace] members = [ diff --git a/README.md b/README.md index 2d10c0c1..f14a3aed 100755 --- a/README.md +++ b/README.md @@ -34,14 +34,6 @@ Embedded SQL DBMS KipSQL is designed to allow small Rust projects to reduce external dependencies and get rid of heavy database maintenance, so that the Rust application itself can provide SQL storage capabilities. - -If you are a developer of the following applications, we very much welcome you to try using KipSQL -and provide your experience and opinions on using it. 
-- personal website -- desktop/mobile application -- learning database -- platform bot - Welcome to our WebSite, Power By KipSQL: **http://www.kipdata.site/** ### Quick Started @@ -84,31 +76,30 @@ Storage Support: ### Features - ORM Mapping: `features = ["marcos"]` ```rust -#[derive(Debug, Clone, Default)] -pub struct Post { - pub post_title: String, - pub post_date: NaiveDateTime, - pub post_body: String, +#[derive(Default, Debug, PartialEq)] +struct MyStruct { + c1: i32, + c2: String, } -implement_from_tuple!(Post, ( - post_title: String => |post: &mut Post, value: DataValue| { - if let Some(title) = value.utf8() { - post.post_title = title; - } - }, - post_date: NaiveDateTime => |post: &mut Post, value: DataValue| { - if let Some(date_time) = value.datetime() { - post.post_date = date_time; - } - }, - post_body: String => |post: &mut Post, value: DataValue| { - if let Some(body) = value.utf8() { - post.post_body = body; +implement_from_tuple!( + MyStruct, ( + c1: i32 => |inner: &mut MyStruct, value| { + if let DataValue::Int32(Some(val)) = value { + inner.c1 = val; + } + }, + c2: String => |inner: &mut MyStruct, value| { + if let DataValue::Utf8(Some(val)) = value { + inner.c2 = val; + } } - } -)); + ) +); ``` +- Optimizer + - RBO + - CBO based on RBO (Physical Selection) - Execute - Volcano - Codegen on LuaJIT: `features = ["codegen_execute"]` @@ -165,6 +156,7 @@ implement_from_tuple!(Post, ( - [x] Insert Overwrite - [x] Update - [x] Delete + - [x] Analyze - DataTypes - Invalid - SqlNull @@ -182,14 +174,6 @@ implement_from_tuple!(Post, ( - Varchar - Date - DateTime -- Optimizer rules - - Limit Project Transpose - - Eliminate Limits - - Push Limit Through Join - - Push Limit Into Scan - - Combine Filters - - Column Pruning - - Collapse Project ## License diff --git a/benchmarks/query_benchmark.rs b/benchmarks/query_benchmark.rs index 903c0a04..a15b181f 100644 --- a/benchmarks/query_benchmark.rs +++ b/benchmarks/query_benchmark.rs @@ -1,18 +1,23 @@ use criterion::{criterion_group, criterion_main, Criterion}; use indicatif::{ProgressBar, ProgressStyle}; +use itertools::Itertools; use kip_sql::db::{Database, DatabaseError}; -use kip_sql::execution::{codegen, volcano}; +use kip_sql::execution::volcano; use kip_sql::storage::kip::KipStorage; use kip_sql::storage::Storage; +use sqlite::Error; use std::cell::RefCell; use std::fs; +use std::path::Path; use std::sync::Arc; -const QUERY_BENCH_PATH: &'static str = "./query_bench_data"; +const QUERY_CASE: &'static str = "select * from t1 where c1 = 1000"; +const QUERY_BENCH_KIPSQL_PATH: &'static str = "./kipsql_bench"; +const QUERY_BENCH_SQLITE_PATH: &'static str = "./sqlite_bench"; const TABLE_ROW_NUM: u64 = 2_00_000; -async fn init_query_bench() -> Result<(), DatabaseError> { - let database = Database::with_kipdb(QUERY_BENCH_PATH).await.unwrap(); +async fn init_kipsql_query_bench() -> Result<(), DatabaseError> { + let database = Database::with_kipdb(QUERY_BENCH_KIPSQL_PATH).await.unwrap(); database .run("create table t1 (c1 int primary key, c2 int)") .await?; @@ -31,6 +36,29 @@ async fn init_query_bench() -> Result<(), DatabaseError> { } pb.finish_with_message("Insert completed!"); + let _ = database.run("analyze table t1").await?; + Ok(()) } +fn init_sqlite_query_bench() -> Result<(), Error> { + let connection = sqlite::open(QUERY_BENCH_SQLITE_PATH.to_owned())?; + + let _ = connection.execute("create table t1 (c1 int primary key, c2 int)")?; + + let pb = ProgressBar::new(TABLE_ROW_NUM); + pb.set_style( + ProgressStyle::default_bar() +
.template("[{elapsed_precise}] {bar:40.cyan/white} {pos}/{len} {msg}") + .unwrap(), + ); + + for i in 0..TABLE_ROW_NUM { + let _ = connection.execute(format!("insert into t1 values({}, {})", i, i + 1))?; + pb.set_position(i + 1); + } + pb.finish_with_message("Insert completed!"); + Ok(()) } @@ -48,56 +76,85 @@ fn query_on_execute(c: &mut Criterion) { .build() .unwrap(); let database = rt.block_on(async { - if !path_exists_and_is_directory(QUERY_BENCH_PATH) { + if !Path::new(QUERY_BENCH_SQLITE_PATH).exists() { + println!( + "SQLITE: The table is not initialized and data insertion is started. => {}", + TABLE_ROW_NUM + ); + + init_sqlite_query_bench().unwrap(); + } + if !path_exists_and_is_directory(QUERY_BENCH_KIPSQL_PATH) { println!( - "The table is not initialized and data insertion is started. => {}", + "KipSQL: The table is not initialized and data insertion is started. => {}", TABLE_ROW_NUM ); - init_query_bench().await.unwrap(); + init_kipsql_query_bench().await.unwrap(); } - Database::::with_kipdb(QUERY_BENCH_PATH) + Database::::with_kipdb(QUERY_BENCH_KIPSQL_PATH) .await .unwrap() }); println!("Table initialization completed"); - let (codegen_transaction, plan) = rt.block_on(async { - let transaction = database.storage.transaction().await.unwrap(); - let (plan, _) = - Database::::build_plan("select * from t1", &transaction).unwrap(); - - (Arc::new(transaction), plan) - }); + #[cfg(feature = "codegen_execute")] + { + use kip_sql::execution::codegen; + + let (codegen_transaction, plan) = rt.block_on(async { + let transaction = database.storage.transaction().await.unwrap(); + let (plan, _) = Database::::build_plan(QUERY_CASE, &transaction).unwrap(); + + (Arc::new(transaction), plan) + }); + + c.bench_function(format!("Codegen: {}", QUERY_CASE).as_str(), |b| { + b.to_async(&rt).iter(|| async { + let tuples = codegen::execute(plan.clone(), codegen_transaction.clone()) + .await + .unwrap(); + if tuples.len() as u64 != TABLE_ROW_NUM { + panic!("{}", tuples.len()); + } + }) + }); + + let (volcano_transaction, plan) = rt.block_on(async { + let transaction = database.storage.transaction().await.unwrap(); + let (plan, _) = Database::::build_plan(QUERY_CASE, &transaction).unwrap(); + + (RefCell::new(transaction), plan) + }); + + c.bench_function(format!("Volcano: {}", QUERY_CASE).as_str(), |b| { + b.to_async(&rt).iter(|| async { + let mut stream = volcano::build_stream(plan.clone(), &volcano_transaction); + let tuples = volcano::try_collect(&mut stream).await.unwrap(); + if tuples.len() as u64 != TABLE_ROW_NUM { + panic!("{}", tuples.len()); + } + }) + }); + } - c.bench_function("Codegen: select all", |b| { + c.bench_function(format!("KipSQL: {}", QUERY_CASE).as_str(), |b| { b.to_async(&rt).iter(|| async { - let tuples = codegen::execute(plan.clone(), codegen_transaction.clone()) - .await - .unwrap(); - if tuples.len() as u64 != TABLE_ROW_NUM { - panic!("{}", tuples.len()); - } + let _tuples = database.run(QUERY_CASE).await.unwrap(); }) }); - let (volcano_transaction, plan) = rt.block_on(async { - let transaction = database.storage.transaction().await.unwrap(); - let (plan, _) = - Database::::build_plan("select * from t1", &transaction).unwrap(); - - (RefCell::new(transaction), plan) - }); - - c.bench_function("Volcano: select all", |b| { + let connection = sqlite::open(QUERY_BENCH_SQLITE_PATH.to_owned()).unwrap(); + c.bench_function(format!("SQLite: {}", QUERY_CASE).as_str(), |b| { b.to_async(&rt).iter(|| async { - let mut stream = volcano::build_stream(plan.clone(), &volcano_transaction); - let 
tuples = volcano::try_collect(&mut stream).await.unwrap(); - if tuples.len() as u64 != TABLE_ROW_NUM { - panic!("{}", tuples.len()); - } + let _tuples = connection + .prepare(QUERY_CASE) + .unwrap() + .into_iter() + .map(|row| row.unwrap()) + .collect_vec(); }) }); } diff --git a/rust-toolchain b/rust-toolchain index 07ade694..8d5e8e76 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1 +1 @@ -nightly \ No newline at end of file +nightly-2024-01-18 \ No newline at end of file diff --git a/src/binder/alter_table.rs b/src/binder/alter_table.rs index 1e4b9948..827fae1e 100644 --- a/src/binder/alter_table.rs +++ b/src/binder/alter_table.rs @@ -41,6 +41,7 @@ impl<'a, T: Transaction> Binder<'a, T> { column, }), childrens: vec![plan], + physical_option: None, } } AlterTableOperation::DropColumn { @@ -58,6 +59,7 @@ impl<'a, T: Transaction> Binder<'a, T> { column_name, }), childrens: vec![plan], + physical_option: None, } } AlterTableOperation::DropPrimaryKey => todo!(), diff --git a/src/binder/analyze.rs b/src/binder/analyze.rs new file mode 100644 index 00000000..22f08b04 --- /dev/null +++ b/src/binder/analyze.rs @@ -0,0 +1,42 @@ +use crate::binder::{lower_case_name, split_name, BindError, Binder}; +use crate::planner::operator::analyze::AnalyzeOperator; +use crate::planner::operator::scan::ScanOperator; +use crate::planner::operator::Operator; +use crate::planner::LogicalPlan; +use crate::storage::Transaction; +use itertools::Itertools; +use sqlparser::ast::ObjectName; +use std::sync::Arc; + +impl<'a, T: Transaction> Binder<'a, T> { + pub(crate) fn bind_analyze(&mut self, name: &ObjectName) -> Result { + let name = lower_case_name(name); + let name = split_name(&name)?; + let table_name = Arc::new(name.to_string()); + + let table_catalog = self + .context + .table(table_name.clone()) + .cloned() + .ok_or_else(|| BindError::InvalidTable(format!("bind table {}", name)))?; + let columns = table_catalog + .all_columns() + .into_iter() + .filter_map(|column| column.desc.is_index().then(|| column)) + .collect_vec(); + + let scan_op = ScanOperator::build(table_name.clone(), &table_catalog); + self.context + .add_bind_table(table_name.clone(), table_catalog, None)?; + + let plan = LogicalPlan { + operator: Operator::Analyze(AnalyzeOperator { + table_name, + columns, + }), + childrens: vec![scan_op], + physical_option: None, + }; + Ok(plan) + } +} diff --git a/src/binder/copy.rs b/src/binder/copy.rs index 350e35b7..f4f26e98 100644 --- a/src/binder/copy.rs +++ b/src/binder/copy.rs @@ -85,6 +85,7 @@ impl<'a, T: Transaction> Binder<'a, T> { Ok(LogicalPlan { operator: Operator::CopyToFile(CopyToFileOperator { source: ext_source }), childrens: vec![], + physical_option: None, }) } else { // COPY FROM @@ -95,6 +96,7 @@ impl<'a, T: Transaction> Binder<'a, T> { table: table_name.to_string(), }), childrens: vec![], + physical_option: None, }) } } else { diff --git a/src/binder/create_table.rs b/src/binder/create_table.rs index e4a976c1..883812bf 100644 --- a/src/binder/create_table.rs +++ b/src/binder/create_table.rs @@ -86,6 +86,7 @@ impl<'a, T: Transaction> Binder<'a, T> { if_not_exists, }), childrens: vec![], + physical_option: None, }; Ok(plan) } diff --git a/src/binder/delete.rs b/src/binder/delete.rs index 7657a77d..bac496b3 100644 --- a/src/binder/delete.rs +++ b/src/binder/delete.rs @@ -1,9 +1,11 @@ use crate::binder::{lower_case_name, split_name, BindError, Binder}; use crate::planner::operator::delete::DeleteOperator; +use crate::planner::operator::scan::ScanOperator; use 
crate::planner::operator::Operator; use crate::planner::LogicalPlan; use crate::storage::Transaction; use sqlparser::ast::{Expr, TableFactor, TableWithJoins}; +use std::sync::Arc; impl<'a, T: Transaction> Binder<'a, T> { pub(crate) fn bind_delete( @@ -14,16 +16,40 @@ impl<'a, T: Transaction> Binder<'a, T> { if let TableFactor::Table { name, alias, .. } = &from.relation { let name = lower_case_name(name); let name = split_name(&name)?; - let (table_name, mut plan) = - self._bind_single_table_ref(None, name, Self::trans_alias(alias))?; + let table_name = Arc::new(name.to_string()); + + let table_catalog = self + .context + .table(table_name.clone()) + .cloned() + .ok_or_else(|| BindError::InvalidTable(format!("bind table {}", name)))?; + let primary_key_column = table_catalog + .all_columns_with_id() + .iter() + .find(|(_, column)| column.desc.is_primary) + .map(|(_, column)| Arc::clone(column)) + .unwrap(); + let mut plan = ScanOperator::build(table_name.clone(), &table_catalog); + + self.context + .add_bind_table(table_name.clone(), table_catalog, None)?; + + if let Some(alias) = alias { + self.context + .add_table_alias(alias.to_string(), table_name.clone())?; + } if let Some(predicate) = selection { plan = self.bind_where(plan, predicate)?; } Ok(LogicalPlan { - operator: Operator::Delete(DeleteOperator { table_name }), + operator: Operator::Delete(DeleteOperator { + table_name, + primary_key_column, + }), childrens: vec![plan], + physical_option: None, }) } else { unreachable!("only table") diff --git a/src/binder/drop_table.rs b/src/binder/drop_table.rs index 2eec4562..ad878f95 100644 --- a/src/binder/drop_table.rs +++ b/src/binder/drop_table.rs @@ -22,6 +22,7 @@ impl<'a, T: Transaction> Binder<'a, T> { if_exists: *if_exists, }), childrens: vec![], + physical_option: None, }; Ok(plan) } diff --git a/src/binder/insert.rs b/src/binder/insert.rs index 2a4e6c36..e31bac1c 100644 --- a/src/binder/insert.rs +++ b/src/binder/insert.rs @@ -86,6 +86,7 @@ impl<'a, T: Transaction> Binder<'a, T> { is_overwrite, }), childrens: vec![values_plan], + physical_option: None, }) } else { Err(BindError::InvalidTable(format!( @@ -103,6 +104,7 @@ impl<'a, T: Transaction> Binder<'a, T> { LogicalPlan { operator: Operator::Values(ValuesOperator { rows, columns }), childrens: vec![], + physical_option: None, } } } diff --git a/src/binder/mod.rs b/src/binder/mod.rs index f2261527..2181f180 100644 --- a/src/binder/mod.rs +++ b/src/binder/mod.rs @@ -1,5 +1,6 @@ pub mod aggregate; mod alter_table; +mod analyze; pub mod copy; mod create_table; mod delete; @@ -173,6 +174,7 @@ impl<'a, T: Transaction> Binder<'a, T> { self.bind_delete(table, selection)? } } + Statement::Analyze { table_name, .. } => self.bind_analyze(table_name)?, Statement::Truncate { table_name, .. } => self.bind_truncate(table_name)?, Statement::ShowTables { .. 
} => self.bind_show_tables()?, Statement::Copy { @@ -302,7 +304,7 @@ pub mod test { Ok(storage) } - pub async fn select_sql_run(sql: &str) -> Result { + pub async fn select_sql_run>(sql: S) -> Result { let temp_dir = TempDir::new().expect("unable to create temporary working directory"); let storage = build_test_catalog(temp_dir.path()).await?; let transaction = storage.transaction().await?; diff --git a/src/binder/select.rs b/src/binder/select.rs index c77eb435..9a1a2877 100644 --- a/src/binder/select.rs +++ b/src/binder/select.rs @@ -118,6 +118,7 @@ impl<'a, T: Transaction> Binder<'a, T> { return Ok(LogicalPlan { operator: Operator::Dummy, childrens: vec![], + physical_option: None, }); } @@ -205,19 +206,17 @@ impl<'a, T: Transaction> Binder<'a, T> { .table(table_name.clone()) .cloned() .ok_or_else(|| BindError::InvalidTable(format!("bind table {}", table)))?; + let scan_op = ScanOperator::build(table_name.clone(), &table_catalog); self.context - .add_bind_table(table_name.clone(), table_catalog.clone(), join_type)?; + .add_bind_table(table_name.clone(), table_catalog, join_type)?; if let Some(alias) = alias { self.context .add_table_alias(alias.to_string(), table_name.clone())?; } - Ok(( - table_name.clone(), - ScanOperator::build(table_name, &table_catalog), - )) + Ok((table_name, scan_op)) } /// Normalize select item. @@ -342,6 +341,7 @@ impl<'a, T: Transaction> Binder<'a, T> { LogicalPlan { operator: Operator::Project(ProjectOperator { exprs: select_list }), childrens: vec![children], + physical_option: None, } } @@ -352,6 +352,7 @@ impl<'a, T: Transaction> Binder<'a, T> { limit: None, }), childrens: vec![children], + physical_option: None, } } diff --git a/src/binder/show.rs b/src/binder/show.rs index f5855a5d..4b1da3be 100644 --- a/src/binder/show.rs +++ b/src/binder/show.rs @@ -9,6 +9,7 @@ impl<'a, T: Transaction> Binder<'a, T> { let plan = LogicalPlan { operator: Operator::Show(ShowTablesOperator {}), childrens: vec![], + physical_option: None, }; Ok(plan) } diff --git a/src/binder/truncate.rs b/src/binder/truncate.rs index 4dfb920c..d3a0e1a8 100644 --- a/src/binder/truncate.rs +++ b/src/binder/truncate.rs @@ -15,6 +15,7 @@ impl<'a, T: Transaction> Binder<'a, T> { let plan = LogicalPlan { operator: Operator::Truncate(TruncateOperator { table_name }), childrens: vec![], + physical_option: None, }; Ok(plan) } diff --git a/src/binder/update.rs b/src/binder/update.rs index 9703cf43..e0743c6a 100644 --- a/src/binder/update.rs +++ b/src/binder/update.rs @@ -58,6 +58,7 @@ impl<'a, T: Transaction> Binder<'a, T> { Ok(LogicalPlan { operator: Operator::Update(UpdateOperator { table_name }), childrens: vec![plan, values_plan], + physical_option: None, }) } else { unreachable!("only table") diff --git a/src/catalog/table.rs b/src/catalog/table.rs index 01db1e93..e663c81c 100644 --- a/src/catalog/table.rs +++ b/src/catalog/table.rs @@ -1,3 +1,4 @@ +use serde::{Deserialize, Serialize}; use std::collections::BTreeMap; use std::sync::Arc; @@ -16,6 +17,12 @@ pub struct TableCatalog { pub(crate) indexes: Vec, } +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct TableMeta { + pub(crate) colum_meta_paths: Vec, + pub(crate) table_name: TableName, +} + impl TableCatalog { pub(crate) fn get_unique_index(&self, col_id: &ColumnId) -> Option<&IndexMetaRef> { self.indexes @@ -56,7 +63,12 @@ impl TableCatalog { return Err(CatalogError::Duplicated("column", col.name().to_string())); } - let col_id = self.columns.len() as u32; + let col_id = self + .columns + .iter() + .last() + 
.map(|(column_id, _)| column_id + 1) + .unwrap_or(0); col.summary.table_name = Some(self.name.clone()); col.summary.id = Some(col_id); @@ -108,7 +120,7 @@ impl TableCatalog { Ok(table_catalog) } - pub(crate) fn new_with_indexes( + pub(crate) fn reload( name: TableName, columns: Vec<ColumnCatalog>, indexes: Vec<IndexMetaRef>, @@ -120,6 +132,15 @@ impl TableCatalog { } } +impl TableMeta { + pub(crate) fn empty(table_name: TableName) -> Self { + TableMeta { + colum_meta_paths: vec![], + table_name, + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/db.rs b/src/db.rs index 1356636e..2aeedb3e 100644 --- a/src/db.rs +++ b/src/db.rs @@ -8,7 +8,8 @@ use crate::execution::volcano::{build_stream, try_collect}; use crate::execution::ExecutorError; use crate::optimizer::heuristic::batch::HepBatchStrategy; use crate::optimizer::heuristic::optimizer::HepOptimizer; -use crate::optimizer::rule::RuleImpl; +use crate::optimizer::rule::implementation::ImplementationRuleImpl; +use crate::optimizer::rule::normalization::NormalizationRuleImpl; use crate::optimizer::OptimizerError; use crate::parser::parse_sql; use crate::planner::LogicalPlan; @@ -36,9 +37,9 @@ impl Database { } impl Database<KipStorage> { - pub async fn run_on_query( + pub async fn run_on_query<S: AsRef<str>>( &self, - sql: &str, + sql: S, query_execute: QueryExecute, ) -> Result<Vec<Tuple>, DatabaseError> { match query_execute { @@ -79,9 +80,9 @@ impl Database { } /// Run SQL queries. - pub async fn run(&self, sql: &str) -> Result<Vec<Tuple>, DatabaseError> { + pub async fn run<T: AsRef<str>>(&self, sql: T) -> Result<Vec<Tuple>, DatabaseError> { let transaction = self.storage.transaction().await?; - let (plan, _) = Self::build_plan(sql, &transaction)?; + let (plan, _) = Self::build_plan::<T, S::TransactionType>(sql, &transaction)?; Self::run_volcano(transaction, plan).await } @@ -107,8 +108,8 @@ impl Database { }) } - pub fn build_plan( - sql: &str, + pub fn build_plan<V: AsRef<str>, T: Transaction>( + sql: V, transaction: &<S as Storage>::TransactionType, ) -> Result<(LogicalPlan, Statement), DatabaseError> { // parse @@ -127,7 +128,8 @@ impl Database { let source_plan = binder.bind(&stmts[0])?; // println!("source_plan plan: {:#?}", source_plan); - let best_plan = Self::default_optimizer(source_plan).find_best()?; + let best_plan = + Self::default_optimizer(source_plan).find_best(Some(&transaction.meta_loader()))?; // println!("best_plan plan: {:#?}", best_plan); Ok((best_plan, stmts.remove(0))) } @@ -138,36 +140,69 @@ impl Database { .batch( "Column Pruning".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::ColumnPruning], + vec![NormalizationRuleImpl::ColumnPruning], ) .batch( "Simplify Filter".to_string(), HepBatchStrategy::fix_point_topdown(10), - vec![RuleImpl::SimplifyFilter, RuleImpl::ConstantCalculation], + vec![ + NormalizationRuleImpl::SimplifyFilter, + NormalizationRuleImpl::ConstantCalculation, + ], ) .batch( "Predicate Pushdown".to_string(), HepBatchStrategy::fix_point_topdown(10), vec![ - RuleImpl::PushPredicateThroughJoin, - RuleImpl::PushPredicateIntoScan, + NormalizationRuleImpl::PushPredicateThroughJoin, + NormalizationRuleImpl::PushPredicateIntoScan, ], ) .batch( "Combine Operators".to_string(), HepBatchStrategy::fix_point_topdown(10), - vec![RuleImpl::CollapseProject, RuleImpl::CombineFilter], + vec![ + NormalizationRuleImpl::CollapseProject, + NormalizationRuleImpl::CombineFilter, + ], ) .batch( "Limit Pushdown".to_string(), HepBatchStrategy::fix_point_topdown(10), vec![ - RuleImpl::LimitProjectTranspose, - RuleImpl::PushLimitThroughJoin, - RuleImpl::PushLimitIntoTableScan, - RuleImpl::EliminateLimits, + 
NormalizationRuleImpl::LimitProjectTranspose, + NormalizationRuleImpl::PushLimitThroughJoin, + NormalizationRuleImpl::PushLimitIntoTableScan, + NormalizationRuleImpl::EliminateLimits, ], ) + .implementations(vec![ + // DQL + ImplementationRuleImpl::SimpleAggregate, + ImplementationRuleImpl::GroupByAggregate, + ImplementationRuleImpl::Dummy, + ImplementationRuleImpl::Filter, + ImplementationRuleImpl::HashJoin, + ImplementationRuleImpl::Limit, + ImplementationRuleImpl::Projection, + ImplementationRuleImpl::SeqScan, + ImplementationRuleImpl::IndexScan, + ImplementationRuleImpl::Sort, + ImplementationRuleImpl::Values, + // DML + ImplementationRuleImpl::Analyze, + ImplementationRuleImpl::CopyFromFile, + ImplementationRuleImpl::CopyToFile, + ImplementationRuleImpl::Delete, + ImplementationRuleImpl::Insert, + ImplementationRuleImpl::Update, + // DDL + ImplementationRuleImpl::AddColumn, + ImplementationRuleImpl::CreateTable, + ImplementationRuleImpl::DropColumn, + ImplementationRuleImpl::DropTable, + ImplementationRuleImpl::Truncate, + ]) } } @@ -176,9 +211,10 @@ pub struct DBTransaction { } impl<S: Storage> DBTransaction<S> { - pub async fn run(&mut self, sql: &str) -> Result<Vec<Tuple>, DatabaseError> { - let (plan, _) = - Database::<S>::build_plan(sql, unsafe { self.inner.as_ptr().as_ref().unwrap() })?; + pub async fn run<T: AsRef<str>>(&mut self, sql: T) -> Result<Vec<Tuple>, DatabaseError> { + let (plan, _) = Database::<S>::build_plan::<T, S::TransactionType>(sql, unsafe { + self.inner.as_ptr().as_ref().unwrap() + })?; let mut stream = build_stream(plan, &self.inner); Ok(try_collect(&mut stream).await?) } @@ -340,9 +376,13 @@ mod test { #[tokio::test] async fn test_crud_sql() -> Result<(), DatabaseError> { - let mut results_1 = _test_crud_sql(QueryExecute::Volcano).await?; + #[cfg(not(feature = "codegen_execute"))] + { + let _ = crate::db::test::_test_crud_sql(QueryExecute::Volcano).await?; + } #[cfg(feature = "codegen_execute")] { + let mut results_1 = _test_crud_sql(QueryExecute::Volcano).await?; + let mut results_2 = _test_crud_sql(QueryExecute::Codegen).await?; assert_eq!(results_1.len(), results_2.len()); diff --git a/src/execution/codegen/mod.rs b/src/execution/codegen/mod.rs index b8911f4f..2784293c 100644 --- a/src/execution/codegen/mod.rs +++ b/src/execution/codegen/mod.rs @@ -302,7 +302,7 @@ mod test { use crate::execution::codegen::execute; use crate::parser::parse_sql; use crate::storage::kip::KipStorage; - use crate::storage::Storage; + use crate::storage::{Storage, Transaction}; use crate::types::tuple::create_table; use std::sync::Arc; use tempfile::TempDir; @@ -343,7 +343,8 @@ mod test { let source_plan = binder.bind(&stmts[0])?; // println!("source_plan plan: {:#?}", source_plan); - let best_plan = Database::<KipStorage>::default_optimizer(source_plan).find_best()?; + let best_plan = Database::<KipStorage>::default_optimizer(source_plan) + .find_best(Some(&transaction.meta_loader()))?; // println!("{:#?}", best_plan); let tuples = execute(best_plan, Arc::new(transaction)).await?; diff --git a/src/execution/mod.rs b/src/execution/mod.rs index effe6bf8..5621fcd1 100644 --- a/src/execution/mod.rs +++ b/src/execution/mod.rs @@ -4,6 +4,7 @@ pub mod volcano; use crate::binder::BindError; use crate::catalog::CatalogError; +use crate::optimizer::OptimizerError; use crate::storage::StorageError; use crate::types::errors::TypeError; #[cfg(feature = "codegen_execute")] @@ -36,6 +37,12 @@ pub enum ExecutorError { #[from] BindError, ), + #[error("optimizer error: {0}")] + Optimizer( + #[source] + #[from] + OptimizerError, + ), #[error("parser error: {0}")] ParserError( #[source] #[from] @@ -73,4 +80,6
@@ pub enum ExecutorError { ), #[error("channel close")] ChannelClose, + #[error("invalid index")] + InvalidIndex, } diff --git a/src/execution/volcano/ddl/alter_table/add_column.rs b/src/execution/volcano/ddl/add_column.rs similarity index 100% rename from src/execution/volcano/ddl/alter_table/add_column.rs rename to src/execution/volcano/ddl/add_column.rs diff --git a/src/execution/volcano/ddl/alter_table/mod.rs b/src/execution/volcano/ddl/alter_table/mod.rs deleted file mode 100644 index 413c5a01..00000000 --- a/src/execution/volcano/ddl/alter_table/mod.rs +++ /dev/null @@ -1,2 +0,0 @@ -pub mod add_column; -pub mod drop_column; diff --git a/src/execution/volcano/ddl/alter_table/drop_column.rs b/src/execution/volcano/ddl/drop_column.rs similarity index 85% rename from src/execution/volcano/ddl/alter_table/drop_column.rs rename to src/execution/volcano/ddl/drop_column.rs index 27b47074..aef267d0 100644 --- a/src/execution/volcano/ddl/alter_table/drop_column.rs +++ b/src/execution/volcano/ddl/drop_column.rs @@ -33,13 +33,13 @@ impl DropColumn { column_name, if_exists, } = &self.op; - let mut option_column_index = None; + let mut option_column_i = None; #[for_await] for tuple in self.input { let mut tuple: Tuple = tuple?; - if option_column_index.is_none() { + if option_column_i.is_none() { if let Some((column_index, is_primary)) = tuple .columns .iter() @@ -52,17 +52,17 @@ impl DropColumn { "drop of primary key column is not allowed.".to_owned(), ))?; } - option_column_index = Some(column_index); + option_column_i = Some(column_index); } } - if option_column_index.is_none() && *if_exists { + if option_column_i.is_none() && *if_exists { return Ok(()); } - let column_index = option_column_index + let column_i = option_column_i .ok_or_else(|| BindError::InvalidColumn("not found column".to_string()))?; - let _ = tuple.columns.remove(column_index); - let _ = tuple.values.remove(column_index); + let _ = tuple.columns.remove(column_i); + let _ = tuple.values.remove(column_i); transaction.append(table_name, tuple, true)?; } diff --git a/src/execution/volcano/ddl/mod.rs b/src/execution/volcano/ddl/mod.rs index 4ec4ceef..9faa1f6e 100644 --- a/src/execution/volcano/ddl/mod.rs +++ b/src/execution/volcano/ddl/mod.rs @@ -1,4 +1,5 @@ -pub(crate) mod alter_table; +pub mod add_column; pub(crate) mod create_table; +pub mod drop_column; pub(crate) mod drop_table; pub(crate) mod truncate; diff --git a/src/execution/volcano/dml/analyze.rs b/src/execution/volcano/dml/analyze.rs new file mode 100644 index 00000000..98d75148 --- /dev/null +++ b/src/execution/volcano/dml/analyze.rs @@ -0,0 +1,125 @@ +use crate::catalog::{ColumnCatalog, ColumnRef, TableMeta, TableName}; +use crate::execution::volcano::{BoxedExecutor, Executor}; +use crate::execution::ExecutorError; +use crate::optimizer::core::column_meta::ColumnMeta; +use crate::optimizer::core::histogram::HistogramBuilder; +use crate::optimizer::OptimizerError; +use crate::planner::operator::analyze::AnalyzeOperator; +use crate::storage::Transaction; +use crate::types::tuple::Tuple; +use crate::types::value::DataValue; +use futures_async_stream::try_stream; +use itertools::Itertools; +use std::cell::RefCell; +use std::collections::HashMap; +use std::fs; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; + +const DEFAULT_NUM_OF_BUCKETS: usize = 100; +const DEFAULT_COLUMN_METAS_PATH: &'static str = "kipsql_column_metas"; + +pub struct Analyze { + table_name: TableName, + input: BoxedExecutor, + columns: Vec, +} + +impl From<(AnalyzeOperator, 
BoxedExecutor)> for Analyze { + fn from( + ( + AnalyzeOperator { + table_name, + columns, + }, + input, + ): (AnalyzeOperator, BoxedExecutor), + ) -> Self { + Analyze { + table_name, + input, + columns, + } + } +} + +impl Executor for Analyze { + fn execute(self, transaction: &RefCell) -> BoxedExecutor { + unsafe { self._execute(transaction.as_ptr().as_mut().unwrap()) } + } +} + +impl Analyze { + #[try_stream(boxed, ok = Tuple, error = ExecutorError)] + pub async fn _execute(self, transaction: &mut T) { + let Analyze { + table_name, + input, + columns, + } = self; + + let mut builders = HashMap::with_capacity(columns.len()); + + for column in &columns { + builders.insert(column.id(), HistogramBuilder::new(column, None)?); + } + + #[for_await] + for tuple in input { + let Tuple { + columns, values, .. + } = tuple?; + + for (i, column) in columns.iter().enumerate() { + if !column.desc.is_index() { + continue; + } + + if let Some(builder) = builders.get_mut(&column.id()) { + builder.append(&values[i])? + } + } + } + let ts = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("It's the end of the world!") + .as_secs(); + let dir_path = dirs::config_dir() + .expect("Your system does not have a Config directory!") + .join(DEFAULT_COLUMN_METAS_PATH) + .join(table_name.as_str()) + .join(ts.to_string()); + fs::create_dir_all(&dir_path)?; + + let mut meta = TableMeta::empty(table_name.clone()); + + for (column_id, builder) in builders { + let path = dir_path.join(column_id.unwrap().to_string()); + let (histogram, sketch) = match builder.build(DEFAULT_NUM_OF_BUCKETS) { + Ok(build) => build, + Err(OptimizerError::TooManyBuckets) => continue, + err => err?, + }; + + ColumnMeta::new(histogram, sketch).to_file(&path)?; + + meta.colum_meta_paths.push(path.to_string_lossy().into()); + } + transaction.save_table_meta(&meta)?; + + let columns: Vec = vec![Arc::new(ColumnCatalog::new_dummy( + "COLUMN_META_PATH".to_string(), + ))]; + let values = meta + .colum_meta_paths + .into_iter() + .map(|path| Arc::new(DataValue::Utf8(Some(path)))) + .collect_vec(); + + yield Tuple { + id: None, + columns, + values, + }; + } +} diff --git a/src/execution/volcano/dml/delete.rs b/src/execution/volcano/dml/delete.rs index d96d785f..b7c7e24d 100644 --- a/src/execution/volcano/dml/delete.rs +++ b/src/execution/volcano/dml/delete.rs @@ -15,7 +15,7 @@ pub struct Delete { } impl From<(DeleteOperator, BoxedExecutor)> for Delete { - fn from((DeleteOperator { table_name }, input): (DeleteOperator, BoxedExecutor)) -> Self { + fn from((DeleteOperator { table_name, .. 
}, input): (DeleteOperator, BoxedExecutor)) -> Self { Delete { table_name, input } } } diff --git a/src/execution/volcano/dml/mod.rs b/src/execution/volcano/dml/mod.rs index fac541fe..094edda3 100644 --- a/src/execution/volcano/dml/mod.rs +++ b/src/execution/volcano/dml/mod.rs @@ -1,3 +1,4 @@ +pub(crate) mod analyze; pub(crate) mod copy_from_file; pub(crate) mod copy_to_file; pub(crate) mod delete; diff --git a/src/execution/volcano/dql/index_scan.rs b/src/execution/volcano/dql/index_scan.rs index 8bc68033..c3e9a9ae 100644 --- a/src/execution/volcano/dql/index_scan.rs +++ b/src/execution/volcano/dql/index_scan.rs @@ -1,19 +1,26 @@ use crate::execution::volcano::{BoxedExecutor, Executor}; use crate::execution::ExecutorError; +use crate::expression::simplify::ConstantBinary; use crate::planner::operator::scan::ScanOperator; use crate::storage::{Iter, Transaction}; -use crate::types::errors::TypeError; +use crate::types::index::IndexMetaRef; use crate::types::tuple::Tuple; use futures_async_stream::try_stream; use std::cell::RefCell; pub(crate) struct IndexScan { op: ScanOperator, + index_by: IndexMetaRef, + binaries: Vec, } -impl From for IndexScan { - fn from(op: ScanOperator) -> Self { - IndexScan { op } +impl From<(ScanOperator, IndexMetaRef, Vec)> for IndexScan { + fn from((op, index_by, binaries): (ScanOperator, IndexMetaRef, Vec)) -> Self { + IndexScan { + op, + index_by, + binaries, + } } } @@ -30,12 +37,10 @@ impl IndexScan { table_name, columns, limit, - index_by, .. } = self.op; - let (index_meta, binaries) = index_by.ok_or(TypeError::InvalidType)?; let mut iter = - transaction.read_by_index(table_name, limit, columns, index_meta, binaries)?; + transaction.read_by_index(table_name, limit, columns, self.index_by, self.binaries)?; while let Some(tuple) = iter.next_tuple()? 
{ yield tuple; diff --git a/src/execution/volcano/dql/join/hash_join.rs b/src/execution/volcano/dql/join/hash_join.rs index b5606260..1fb1f805 100644 --- a/src/execution/volcano/dql/join/hash_join.rs +++ b/src/execution/volcano/dql/join/hash_join.rs @@ -112,7 +112,10 @@ impl HashJoinStatus { let _ = mem::replace(left_init_flag, true); } - build_map.entry(hash).or_insert(Vec::new()).push(tuple); + build_map + .entry(hash) + .or_insert_with(|| Vec::new()) + .push(tuple); Ok(()) } diff --git a/src/execution/volcano/dql/sort.rs b/src/execution/volcano/dql/sort.rs index 3767f899..98b33a21 100644 --- a/src/execution/volcano/dql/sort.rs +++ b/src/execution/volcano/dql/sort.rs @@ -12,7 +12,7 @@ use std::mem; const BUCKET_SIZE: usize = u8::MAX as usize + 1; // LSD Radix Sort -fn radix_sort(mut tuples: Vec<(T, Vec)>) -> Vec { +pub(crate) fn radix_sort(mut tuples: Vec<(T, Vec)>) -> Vec { if let Some(max_len) = tuples.iter().map(|(_, bytes)| bytes.len()).max() { // init buckets let mut temp_buckets = Vec::with_capacity(BUCKET_SIZE); diff --git a/src/execution/volcano/mod.rs b/src/execution/volcano/mod.rs index 8335d948..6e927172 100644 --- a/src/execution/volcano/mod.rs +++ b/src/execution/volcano/mod.rs @@ -3,10 +3,11 @@ pub(crate) mod dml; pub(crate) mod dql; pub(crate) mod show; -use crate::execution::volcano::ddl::alter_table::drop_column::DropColumn; use crate::execution::volcano::ddl::create_table::CreateTable; +use crate::execution::volcano::ddl::drop_column::DropColumn; use crate::execution::volcano::ddl::drop_table::DropTable; use crate::execution::volcano::ddl::truncate::Truncate; +use crate::execution::volcano::dml::analyze::Analyze; use crate::execution::volcano::dml::copy_from_file::CopyFromFile; use crate::execution::volcano::dml::delete::Delete; use crate::execution::volcano::dml::insert::Insert; @@ -24,15 +25,16 @@ use crate::execution::volcano::dql::sort::Sort; use crate::execution::volcano::dql::values::Values; use crate::execution::volcano::show::show_table::ShowTables; use crate::execution::ExecutorError; -use crate::planner::operator::Operator; +use crate::planner::operator::{Operator, PhysicalOption}; use crate::planner::LogicalPlan; use crate::storage::Transaction; +use crate::types::index::IndexInfo; use crate::types::tuple::Tuple; use futures::stream::BoxStream; use futures::TryStreamExt; use std::cell::RefCell; -use self::ddl::alter_table::add_column::AddColumn; +use self::ddl::add_column::AddColumn; pub type BoxedExecutor = BoxStream<'static, Result>; @@ -44,6 +46,7 @@ pub fn build_stream(plan: LogicalPlan, transaction: &RefCell) let LogicalPlan { operator, mut childrens, + .. 
} = plan; match operator { @@ -74,8 +77,12 @@ pub fn build_stream(plan: LogicalPlan, transaction: &RefCell) Projection::from((op, input)).execute(transaction) } Operator::Scan(op) => { - if op.index_by.is_some() { - IndexScan::from(op).execute(transaction) + if let Some(PhysicalOption::IndexScan(IndexInfo { + meta, + binaries: Some(binaries), + })) = plan.physical_option + { + IndexScan::from((op, meta, binaries)).execute(transaction) } else { SeqScan::from(op).execute(transaction) } @@ -124,6 +131,11 @@ pub fn build_stream(plan: LogicalPlan, transaction: &RefCell) Operator::CopyToFile(_op) => { todo!() } + Operator::Analyze(op) => { + let input = build_stream(childrens.remove(0), transaction); + + Analyze::from((op, input)).execute(transaction) + } } } diff --git a/src/execution/volcano/show/show_table.rs b/src/execution/volcano/show/show_table.rs index 98751732..a2a0aa9a 100644 --- a/src/execution/volcano/show/show_table.rs +++ b/src/execution/volcano/show/show_table.rs @@ -1,5 +1,5 @@ -use crate::catalog::ColumnCatalog; use crate::catalog::ColumnRef; +use crate::catalog::{ColumnCatalog, TableMeta}; use crate::execution::volcano::{BoxedExecutor, Executor}; use crate::execution::ExecutorError; use crate::planner::operator::show::ShowTablesOperator; @@ -29,12 +29,21 @@ impl Executor for ShowTables { impl ShowTables { #[try_stream(boxed, ok = Tuple, error = ExecutorError)] pub async fn _execute(self, transaction: &T) { - let tables = transaction.show_tables()?; + let metas = transaction.table_metas()?; - for table in tables { - let columns: Vec = - vec![Arc::new(ColumnCatalog::new_dummy("TABLES".to_string()))]; - let values: Vec = vec![Arc::new(DataValue::Utf8(Some(table)))]; + for TableMeta { + table_name, + colum_meta_paths: histogram_paths, + } in metas + { + let columns: Vec = vec![ + Arc::new(ColumnCatalog::new_dummy("TABLE".to_string())), + Arc::new(ColumnCatalog::new_dummy("COLUMN_METAS_LEN".to_string())), + ]; + let values: Vec = vec![ + Arc::new(DataValue::Utf8(Some(table_name.to_string()))), + Arc::new(DataValue::UInt32(Some(histogram_paths.len() as u32))), + ]; yield Tuple { id: None, diff --git a/src/expression/simplify.rs b/src/expression/simplify.rs index 93849538..07008ca7 100644 --- a/src/expression/simplify.rs +++ b/src/expression/simplify.rs @@ -97,24 +97,28 @@ impl ConstantBinary { let mut is_push = merged_binaries.is_empty(); for binary in merged_binaries.iter_mut().rev() { - if let ConstantBinary::Scope { max, .. } = binary { - let (condition_min, condition_max) = op(&condition); - let is_lt_min = Self::bound_compared(max, &condition_min, false) - .unwrap_or(Ordering::Equal) - .is_lt(); - let is_lt_max = Self::bound_compared(max, &condition_max, false) - .unwrap_or(Ordering::Equal) - .is_lt(); - - if !is_lt_min && is_lt_max { - let _ = mem::replace(max, condition_max); - } else if !matches!(condition, ConstantBinary::Scope { .. }) { - is_push = is_lt_max; - } else if is_lt_min && is_lt_max { - is_push = true - } + match binary { + ConstantBinary::Scope { max, .. } => { + let (condition_min, condition_max) = op(&condition); + let is_lt_min = Self::bound_compared(max, &condition_min, false) + .unwrap_or(Ordering::Equal) + .is_lt(); + let is_lt_max = Self::bound_compared(max, &condition_max, false) + .unwrap_or(Ordering::Equal) + .is_lt(); + + if !is_lt_min && is_lt_max { + let _ = mem::replace(max, condition_max); + } else if !matches!(condition, ConstantBinary::Scope { .. 
}) { + is_push = is_lt_max; + } else if is_lt_min && is_lt_max { + is_push = true + } - break; + break; + } + ConstantBinary::Eq(_) => is_push = true, + _ => (), } } @@ -828,11 +832,14 @@ impl ScalarExpression { (None, Some(binary)) => Ok(Self::check_or(col_id, left_expr, op, binary)), } } - ScalarExpression::Alias { expr, .. } => expr.convert_binary(col_id), - ScalarExpression::TypeCast { expr, .. } => expr.convert_binary(col_id), - ScalarExpression::IsNull { expr, .. } => expr.convert_binary(col_id), - ScalarExpression::Unary { expr, .. } => expr.convert_binary(col_id), - _ => Ok(None), + ScalarExpression::Alias { expr, .. } + | ScalarExpression::TypeCast { expr, .. } + | ScalarExpression::IsNull { expr, .. } + | ScalarExpression::Unary { expr, .. } + | ScalarExpression::In { expr, .. } => expr.convert_binary(col_id), + ScalarExpression::Constant(_) + | ScalarExpression::ColumnRef(_) + | ScalarExpression::AggCall { .. } => Ok(None), } } diff --git a/src/lib.rs b/src/lib.rs index eade08e9..d158657c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,7 +4,7 @@ #![feature(coroutines)] #![feature(iterator_try_collect)] #![feature(slice_pattern)] -#![feature(bound_map)] +#![feature(is_sorted)] extern crate core; pub mod binder; pub mod catalog; diff --git a/src/main.rs b/src/main.rs index a50da2da..ecfe085a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -63,11 +63,7 @@ async fn server_run(path: String) -> Result<(), Box> { match db.run(&input).await { Ok(tuples) => { - if tuples.is_empty() { - println!("\nEmpty\n"); - } else { - println!("\n{}\n", create_table(&tuples)); - } + println!("\n{}\nRow len: {}\n", create_table(&tuples), tuples.len()); } Err(err) => { println!("Oops!: {}", err); diff --git a/src/optimizer/core/cm_sketch.rs b/src/optimizer/core/cm_sketch.rs new file mode 100644 index 00000000..15d9e78b --- /dev/null +++ b/src/optimizer/core/cm_sketch.rs @@ -0,0 +1,211 @@ +use crate::expression::simplify::ConstantBinary; +use crate::types::value::DataValue; +use rand::RngCore; +use serde::{Deserialize, Serialize}; +use siphasher::sip::SipHasher13; +use std::borrow::Borrow; +use std::hash::{Hash, Hasher}; +use std::marker::PhantomData; +use std::{cmp, mem}; + +type FastHasher = SipHasher13; + +// https://github.com/jedisct1/rust-count-min-sketch +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CountMinSketch { + counters: Vec>, + offsets: Vec, + hashers: [FastHasher; 2], + mask: usize, + k_num: usize, + phantom_k: PhantomData, +} + +impl CountMinSketch { + /// Tips: + /// - binaries must be used `ConstantBinary::scope_aggregation` and `ConstantBinary::rearrange` + /// - just count with `ConstantBinary::Eq` + pub fn collect_count(&self, binaries: &[ConstantBinary]) -> usize { + let mut count = 0; + + for binary in binaries { + count += match binary { + ConstantBinary::Eq(value) => self.estimate(value), + ConstantBinary::NotEq(_) | ConstantBinary::Scope { .. 
} => 0, + ConstantBinary::And(binaries) | ConstantBinary::Or(binaries) => { + self.collect_count(binaries) + } + } + } + + count + } +} + +impl CountMinSketch { + pub fn new(capacity: usize, probability: f64, tolerance: f64) -> Self { + let width = Self::optimal_width(capacity, tolerance); + let k_num = Self::optimal_k_num(probability); + let counters = vec![vec![0; width]; k_num]; + let offsets = vec![0; k_num]; + let hashers = [Self::sip_new(), Self::sip_new()]; + CountMinSketch { + counters, + offsets, + hashers, + mask: Self::mask(width), + k_num, + phantom_k: PhantomData, + } + } + + pub fn add(&mut self, key: &Q, value: usize) + where + K: Borrow, + { + let mut hashes = [0u64, 0u64]; + let lowest = (0..self.k_num) + .map(|k_i| { + let offset = self.offset(&mut hashes, key, k_i); + self.offsets[k_i] = offset; + self.counters[k_i][offset] + }) + .min() + .unwrap(); + for k_i in 0..self.k_num { + let offset = self.offsets[k_i]; + if self.counters[k_i][offset] == lowest { + self.counters[k_i][offset] = self.counters[k_i][offset].saturating_add(value); + } + } + } + + pub fn increment(&mut self, key: &Q) + where + K: Borrow, + { + self.add(key, 1) + } + + pub fn estimate(&self, key: &Q) -> usize + where + K: Borrow, + { + let mut hashes = [0u64, 0u64]; + (0..self.k_num) + .map(|k_i| { + let offset = self.offset(&mut hashes, key, k_i); + self.counters[k_i][offset] + }) + .min() + .unwrap() as usize + } + + #[allow(dead_code)] + pub fn estimate_memory( + capacity: usize, + probability: f64, + tolerance: f64, + ) -> Result { + let width = Self::optimal_width(capacity, tolerance); + let k_num = Self::optimal_k_num(probability); + Ok(width * mem::size_of::() * k_num) + } + + #[allow(dead_code)] + pub fn clear(&mut self) { + for k_i in 0..self.k_num { + for counter in &mut self.counters[k_i] { + *counter = 0 + } + } + self.hashers = [Self::sip_new(), Self::sip_new()]; + } + + fn optimal_width(capacity: usize, tolerance: f64) -> usize { + let e = tolerance / (capacity as f64); + let width = (2.0 / e).round() as usize; + cmp::max(2, width) + .checked_next_power_of_two() + .expect("Width would be way too large") + } + + fn mask(width: usize) -> usize { + assert!(width > 1); + assert_eq!(width & (width - 1), 0); + width - 1 + } + + fn optimal_k_num(probability: f64) -> usize { + cmp::max(1, ((1.0 - probability).ln() / 0.5f64.ln()) as usize) + } + + fn sip_new() -> FastHasher { + let mut rng = rand::thread_rng(); + FastHasher::new_with_keys(rng.next_u64(), rng.next_u64()) + } + + fn offset(&self, hashes: &mut [u64; 2], key: &Q, k_i: usize) -> usize + where + K: Borrow, + { + if k_i < 2 { + let sip = &mut self.hashers[k_i].clone(); + key.hash(sip); + let hash = sip.finish(); + hashes[k_i] = hash; + hash as usize & self.mask + } else { + hashes[0].wrapping_add((k_i as u64).wrapping_mul(hashes[1]) % 0xffffffffffffffc5) + as usize + & self.mask + } + } +} + +#[cfg(test)] +mod tests { + use crate::expression::simplify::ConstantBinary; + use crate::optimizer::core::cm_sketch::CountMinSketch; + use crate::types::value::DataValue; + use std::collections::Bound; + use std::sync::Arc; + + #[test] + fn test_increment() { + let mut cms = CountMinSketch::<&str>::new(100, 0.95, 10.0); + for _ in 0..300 { + cms.increment("key"); + } + assert_eq!(cms.estimate("key"), 300); + } + + #[test] + fn test_increment_multi() { + let mut cms = CountMinSketch::::new(100, 0.99, 2.0); + for i in 0..1_000_000 { + cms.increment(&(i % 100)); + } + for key in 0..100 { + assert!(cms.estimate(&key) >= 9_000); + } + } + + #[test] + fn 
test_collect_count() { + let mut cms = CountMinSketch::<DataValue>::new(100, 0.95, 10.0); + for _ in 0..300 { + cms.increment(&DataValue::Int32(Some(300))); + } + assert_eq!( + cms.collect_count(&vec![ + ConstantBinary::Eq(Arc::new(DataValue::Int32(Some(300)))), + ConstantBinary::Scope { + min: Bound::Unbounded, + max: Bound::Unbounded, + } + ]), + 300 + ); + } +} diff --git a/src/optimizer/core/column_meta.rs b/src/optimizer/core/column_meta.rs new file mode 100644 index 00000000..6b8aa18e --- /dev/null +++ b/src/optimizer/core/column_meta.rs @@ -0,0 +1,181 @@ +use crate::catalog::TableName; +use crate::expression::simplify::ConstantBinary; +use crate::optimizer::core::cm_sketch::CountMinSketch; +use crate::optimizer::core::histogram::Histogram; +use crate::optimizer::OptimizerError; +use crate::storage::Transaction; +use crate::types::value::DataValue; +use crate::types::{ColumnId, LogicalType}; +use kip_db::kernel::utils::lru_cache::ShardingLruCache; +use serde::{Deserialize, Serialize}; +use std::fs::OpenOptions; +use std::io::{Read, Write}; +use std::path::Path; + +pub struct ColumnMetaLoader<'a, T: Transaction> { + cache: &'a ShardingLruCache<TableName, Vec<ColumnMeta>>, + tx: &'a T, +} + +impl<'a, T: Transaction> ColumnMetaLoader<'a, T> { + pub fn new( + tx: &'a T, + cache: &'a ShardingLruCache<TableName, Vec<ColumnMeta>>, + ) -> ColumnMetaLoader<'a, T> { + ColumnMetaLoader { cache, tx } + } + + pub fn load(&self, table_name: TableName) -> Result<&Vec<ColumnMeta>, OptimizerError> { + let option = self.cache.get(&table_name); + + return if let Some(column_metas) = option { + Ok(column_metas) + } else { + let paths = self.tx.column_meta_paths(&table_name)?; + let mut column_metas = Vec::with_capacity(paths.len()); + + for path in paths { + column_metas.push(ColumnMeta::from_file(path)?); + } + + Ok(self.cache.get_or_insert(table_name, |_| Ok(column_metas))?) + }; + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct ColumnMeta { + column_id: ColumnId, + data_type: LogicalType, + histogram: Histogram, + cm_sketch: CountMinSketch<DataValue>, +} + +impl ColumnMeta { + pub fn new(histogram: Histogram, cm_sketch: CountMinSketch<DataValue>) -> Self { + ColumnMeta { + column_id: histogram.column_id(), + data_type: histogram.data_type(), + histogram, + cm_sketch, + } + } + + pub fn column_id(&self) -> ColumnId { + self.column_id + } + pub fn data_type(&self) -> LogicalType { + self.data_type + } + + pub fn histogram(&self) -> &Histogram { + &self.histogram + } + + /// Tips: + /// - binaries must be used `ConstantBinary::scope_aggregation` and `ConstantBinary::rearrange` + pub fn collect_count(&self, binaries: &[ConstantBinary]) -> usize { + let mut count = 0; + + count += self.histogram.collect_count(binaries, &self.cm_sketch); + count + } + + pub fn to_file(&self, path: impl AsRef<Path>) -> Result<(), OptimizerError> { + let mut file = OpenOptions::new() + .create(true) + .write(true) + .read(true) + .open(path)?; + let _ = file.write_all(&bincode::serialize(self)?)?; + file.flush()?; + + Ok(()) + } + + pub fn from_file(path: impl AsRef<Path>) -> Result<Self, OptimizerError> { + let mut file = OpenOptions::new() + .create(true) + .write(true) + .read(true) + .open(path)?; + + let mut bytes = Vec::new(); + let _ = file.read_to_end(&mut bytes)?; + + Ok(bincode::deserialize(&bytes)?)
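+        // Minimal usage sketch (the `path` and predicate value are illustrative; per the
+        // tips on `collect_count`, the binaries are assumed to be pre-processed with
+        // `ConstantBinary::scope_aggregation` and `ConstantBinary::rearrange`):
+        //
+        //     let meta = ColumnMeta::from_file(&path)?;
+        //     let rows = meta.collect_count(&[ConstantBinary::Eq(Arc::new(DataValue::Int32(Some(42))))]);
+        //
+        // `Eq` conditions are answered by the Count-Min Sketch, range conditions by the histogram buckets.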
+ } +} + +#[cfg(test)] +mod tests { + use crate::catalog::{ColumnCatalog, ColumnDesc, ColumnSummary}; + use crate::optimizer::core::column_meta::ColumnMeta; + use crate::optimizer::core::histogram::HistogramBuilder; + use crate::optimizer::OptimizerError; + use crate::types::value::DataValue; + use crate::types::LogicalType; + use std::sync::Arc; + use tempfile::TempDir; + + fn int32_column() -> ColumnCatalog { + ColumnCatalog { + summary: ColumnSummary { + id: Some(1), + name: "c1".to_string(), + table_name: None, + }, + nullable: false, + desc: ColumnDesc { + column_datatype: LogicalType::UInteger, + is_primary: false, + is_unique: false, + default: None, + }, + ref_expr: None, + } + } + + #[test] + fn test_to_file_and_from_file() -> Result<(), OptimizerError> { + let temp_dir = TempDir::new().expect("unable to create temporary working directory"); + let column = int32_column(); + + let mut builder = HistogramBuilder::new(&column, Some(15))?; + + builder.append(&Arc::new(DataValue::Int32(Some(14))))?; + builder.append(&Arc::new(DataValue::Int32(Some(13))))?; + builder.append(&Arc::new(DataValue::Int32(Some(12))))?; + builder.append(&Arc::new(DataValue::Int32(Some(11))))?; + builder.append(&Arc::new(DataValue::Int32(Some(10))))?; + + builder.append(&Arc::new(DataValue::Int32(Some(4))))?; + builder.append(&Arc::new(DataValue::Int32(Some(3))))?; + builder.append(&Arc::new(DataValue::Int32(Some(2))))?; + builder.append(&Arc::new(DataValue::Int32(Some(1))))?; + builder.append(&Arc::new(DataValue::Int32(Some(0))))?; + + builder.append(&Arc::new(DataValue::Int32(Some(9))))?; + builder.append(&Arc::new(DataValue::Int32(Some(8))))?; + builder.append(&Arc::new(DataValue::Int32(Some(7))))?; + builder.append(&Arc::new(DataValue::Int32(Some(6))))?; + builder.append(&Arc::new(DataValue::Int32(Some(5))))?; + + builder.append(&Arc::new(DataValue::Null))?; + builder.append(&Arc::new(DataValue::Int32(None)))?; + + let (histogram, sketch) = builder.build(4)?; + let path = temp_dir.path().join("meta"); + + ColumnMeta::new(histogram.clone(), sketch.clone()).to_file(path.clone())?; + let column_meta = ColumnMeta::from_file(path)?; + + assert_eq!(histogram, column_meta.histogram); + assert_eq!( + sketch.estimate(&DataValue::Null), + column_meta.cm_sketch.estimate(&DataValue::Null) + ); + + Ok(()) + } +} diff --git a/src/optimizer/core/histogram.rs b/src/optimizer/core/histogram.rs new file mode 100644 index 00000000..ebcddf40 --- /dev/null +++ b/src/optimizer/core/histogram.rs @@ -0,0 +1,810 @@ +use crate::catalog::ColumnCatalog; +use crate::execution::volcano::dql::sort::radix_sort; +use crate::expression::simplify::ConstantBinary; +use crate::optimizer::core::cm_sketch::CountMinSketch; +use crate::optimizer::OptimizerError; +use crate::types::value::{DataValue, ValueRef}; +use crate::types::{ColumnId, LogicalType}; +use ordered_float::OrderedFloat; +use serde::{Deserialize, Serialize}; +use std::cmp::Ordering; +use std::collections::Bound; +use std::sync::Arc; +use std::{cmp, mem}; + +pub struct HistogramBuilder { + column_id: ColumnId, + data_type: LogicalType, + + null_count: usize, + values: Vec<((usize, ValueRef), Vec)>, + + value_index: usize, +} + +// Equal depth histogram +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Histogram { + column_id: ColumnId, + data_type: LogicalType, + + number_of_distinct_value: usize, + null_count: usize, + values_len: usize, + + buckets: Vec, + // TODO: How to use? 
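    // (One candidate use: index-ordered scans are cheaper when correlation is
    // close to ±1. `calc_correlation` below computes Pearson's coefficient with
    // x = position in sorted order and y = original row ordinal:
    //     corr = (n * Σxy - Σx * Σy) / (n * Σx² - (Σx)²),  where Σx = Σy = n(n-1)/2
    // so fully ascending input yields 1.0 and fully descending input -1.0, as
    // the tests below assert.)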
+ // Correlation is the statistical correlation between physical row ordering and logical ordering of + // the column values + correlation: f64, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +struct Bucket { + lower: ValueRef, + upper: ValueRef, + count: u64, + // repeat: u64, +} + +impl HistogramBuilder { + pub fn new(column: &ColumnCatalog, capacity: Option) -> Result { + Ok(Self { + column_id: column.id().ok_or(OptimizerError::OwnerLessColumn)?, + data_type: *column.datatype(), + null_count: 0, + values: capacity + .map(Vec::with_capacity) + .unwrap_or_else(|| Vec::new()), + value_index: 0, + }) + } + + pub fn append(&mut self, value: &ValueRef) -> Result<(), OptimizerError> { + if value.is_null() { + self.null_count += 1; + } else { + let mut bytes = Vec::new(); + + value.memcomparable_encode(&mut bytes)?; + self.values.push(((self.value_index, value.clone()), bytes)); + } + + self.value_index += 1; + + Ok(()) + } + + pub fn build( + self, + number_of_buckets: usize, + ) -> Result<(Histogram, CountMinSketch), OptimizerError> { + if number_of_buckets > self.values.len() { + return Err(OptimizerError::TooManyBuckets); + } + + let tolerance = if self.values.len() > 10_000 { + (self.values.len() / 100) as f64 + } else { + 1.0 + }; + let mut sketch = CountMinSketch::new(self.values.len(), 0.95, tolerance); + let HistogramBuilder { + column_id, + data_type, + null_count, + values, + .. + } = self; + let mut buckets = Vec::with_capacity(number_of_buckets); + let values_len = values.len(); + let bucket_len = if values_len % number_of_buckets == 0 { + values_len / number_of_buckets + } else { + (values_len + number_of_buckets) / number_of_buckets + }; + let sorted_values = radix_sort(values); + + for i in 0..number_of_buckets { + let mut bucket = Bucket::empty(&data_type); + let j = (i + 1) * bucket_len; + + bucket.upper = sorted_values[cmp::min(j, values_len) - 1].1.clone(); + buckets.push(bucket); + } + let mut corr_xy_sum = 0.0; + let mut number_of_distinct_value = 0; + let mut last_value: Option = None; + + for (i, (ordinal, value)) in sorted_values.into_iter().enumerate() { + sketch.increment(value.as_ref()); + + if let None | Some(true) = last_value.as_ref().map(|last_value| last_value != &value) { + last_value = Some(value.clone()); + number_of_distinct_value += 1; + } + + let bucket = &mut buckets[i / bucket_len]; + + if bucket.lower.is_null() { + bucket.lower = value; + } + bucket.count += 1; + + corr_xy_sum += i as f64 * ordinal as f64; + } + sketch.add(&DataValue::Null, self.null_count); + + Ok(( + Histogram { + column_id, + data_type, + number_of_distinct_value, + null_count, + values_len, + buckets, + correlation: Self::calc_correlation(corr_xy_sum, values_len), + }, + sketch, + )) + } + + // https://github.com/pingcap/tidb/blob/6957170f1147e96958e63db48148445a7670328e/pkg/statistics/builder.go#L210 + fn calc_correlation(corr_xy_sum: f64, values_len: usize) -> f64 { + if values_len == 1 { + return 1.0; + } + let item_count = values_len as f64; + let corr_x_sum = (item_count - 1.0) * item_count / 2.0; + let corr_x2_sum = (item_count - 1.0) * item_count * (2.0 * item_count - 1.0) / 6.0; + (item_count * corr_xy_sum - corr_x_sum * corr_x_sum) + / (item_count * corr_x2_sum - corr_x_sum * corr_x_sum) + } +} + +fn is_under(value: &ValueRef, target: &Bound, is_min: bool) -> bool { + let _is_under = |value: &ValueRef, target: &ValueRef, is_min: bool| { + value + .partial_cmp(target) + .map(|order| { + if is_min { + Ordering::is_lt(order) + } else { + 
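                    // is_min tells which end of a scope is being tested: a lower
                    // bound requires strictly-less, while for an upper bound a
                    // tie still counts as "under", hence is_le below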
Ordering::is_le(order) + } + }) + .unwrap() + }; + + match target { + Bound::Included(target) => _is_under(value, target, is_min), + Bound::Excluded(target) => _is_under(value, target, !is_min), + Bound::Unbounded => !is_min, + } +} + +fn is_above(value: &ValueRef, target: &Bound, is_min: bool) -> bool { + let _is_above = |value: &ValueRef, target: &ValueRef, is_min: bool| { + value + .partial_cmp(target) + .map(|order| { + if is_min { + Ordering::is_ge(order) + } else { + Ordering::is_gt(order) + } + }) + .unwrap() + }; + + match target { + Bound::Included(target) => _is_above(value, target, is_min), + Bound::Excluded(target) => _is_above(value, target, !is_min), + Bound::Unbounded => is_min, + } +} + +impl Histogram { + pub fn column_id(&self) -> ColumnId { + self.column_id + } + pub fn data_type(&self) -> LogicalType { + self.data_type + } + + pub fn values_len(&self) -> usize { + self.values_len + } + + /// Tips: binaries must be used `ConstantBinary::scope_aggregation` and `ConstantBinary::rearrange` + pub fn collect_count( + &self, + binaries: &[ConstantBinary], + sketch: &CountMinSketch, + ) -> usize { + if self.buckets.is_empty() || binaries.is_empty() { + return 0; + } + + let mut count = 0; + let mut binary_i = 0; + let mut bucket_i = 0; + let mut bucket_idxs = Vec::new(); + + while bucket_i < self.buckets.len() && binary_i < binaries.len() { + self._collect_count( + &binaries, + &mut binary_i, + &mut bucket_i, + &mut bucket_idxs, + &mut count, + sketch, + ); + } + + bucket_idxs + .iter() + .map(|idx| self.buckets[*idx].count as usize) + .sum::() + + count + } + + fn _collect_count( + &self, + binaries: &[ConstantBinary], + binary_i: &mut usize, + bucket_i: &mut usize, + bucket_idxs: &mut Vec, + count: &mut usize, + sketch: &CountMinSketch, + ) { + let float_value = |value: &DataValue, prefix_len: usize| { + match value.logical_type() { + LogicalType::Varchar(_) => match value { + DataValue::Utf8(value) => value.as_ref().and_then(|string| { + if prefix_len > string.len() { + return Some(0.0); + } + + let mut val = 0u64; + for (i, char) in string + .get(prefix_len..prefix_len + 8) + .unwrap() + .chars() + .enumerate() + { + if string.len() - prefix_len > i { + val += (val << 8) + char as u64; + } else { + val += val << 8; + } + } + + Some(val as f64) + }), + _ => unreachable!(), + }, + LogicalType::Date | LogicalType::DateTime => match value { + DataValue::Date32(value) => DataValue::Int32(*value) + .cast(&LogicalType::Double) + .unwrap() + .double(), + DataValue::Date64(value) => DataValue::Int64(*value) + .cast(&LogicalType::Double) + .unwrap() + .double(), + _ => unreachable!(), + }, + + LogicalType::Invalid + | LogicalType::SqlNull + | LogicalType::Boolean + | LogicalType::Tinyint + | LogicalType::UTinyint + | LogicalType::Smallint + | LogicalType::USmallint + | LogicalType::Integer + | LogicalType::UInteger + | LogicalType::Bigint + | LogicalType::UBigint + | LogicalType::Float + | LogicalType::Double + | LogicalType::Decimal(_, _) => { + value.clone().cast(&LogicalType::Double).unwrap().double() + } + } + .unwrap_or(0.0) + }; + let calc_fraction = |start: &DataValue, end: &DataValue, value: &DataValue| { + let prefix_len = start.common_prefix_length(end).unwrap_or(0); + (float_value(value, prefix_len) - float_value(start, prefix_len)) + / (float_value(end, prefix_len) - float_value(start, prefix_len)) + }; + + let distinct_1 = OrderedFloat(1.0 / self.number_of_distinct_value as f64); + + match &binaries[*binary_i] { + ConstantBinary::Scope { min, max } => { + let bucket = 
&self.buckets[*bucket_i]; + let mut temp_count = 0; + + let is_eq = |value: &ValueRef, target: &Bound| match target { + Bound::Included(target) => target.eq(value), + _ => false, + }; + + if (is_above(&bucket.lower, min, true) || is_eq(&bucket.lower, min)) + && (is_under(&bucket.upper, max, false) || is_eq(&bucket.upper, max)) + { + bucket_idxs.push(mem::replace(bucket_i, *bucket_i + 1)); + } else if is_above(&bucket.lower, max, false) { + *binary_i += 1; + } else if is_under(&bucket.upper, min, true) { + *bucket_i += 1; + } else if is_above(&bucket.lower, min, true) { + let (temp_ratio, option) = match max { + Bound::Included(val) => { + (calc_fraction(&bucket.lower, &bucket.upper, val), None) + } + Bound::Excluded(val) => ( + calc_fraction(&bucket.lower, &bucket.upper, val), + Some(sketch.estimate(val)), + ), + Bound::Unbounded => unreachable!(), + }; + let ratio = *distinct_1.max(OrderedFloat(temp_ratio).min(OrderedFloat(1.0))); + temp_count += (bucket.count as f64 * ratio).ceil() as usize; + if let Some(count) = option { + temp_count = temp_count.saturating_sub(count); + } + *bucket_i += 1; + } else if is_under(&bucket.upper, max, false) { + let (temp_ratio, option) = match min { + Bound::Included(val) => { + (calc_fraction(&bucket.lower, &bucket.upper, val), None) + } + Bound::Excluded(val) => ( + calc_fraction(&bucket.lower, &bucket.upper, val), + Some(sketch.estimate(val)), + ), + Bound::Unbounded => unreachable!(), + }; + let ratio = *distinct_1.max(OrderedFloat(temp_ratio).min(OrderedFloat(1.0))); + temp_count += (bucket.count as f64 * (1.0 - ratio)).ceil() as usize; + if let Some(count) = option { + temp_count = temp_count.saturating_sub(count); + } + *bucket_i += 1; + } else { + let (temp_ratio_max, option_max) = match max { + Bound::Included(val) => { + (calc_fraction(&bucket.lower, &bucket.upper, val), None) + } + Bound::Excluded(val) => ( + calc_fraction(&bucket.lower, &bucket.upper, val), + Some(sketch.estimate(val)), + ), + Bound::Unbounded => unreachable!(), + }; + let (temp_ratio_min, option_min) = match min { + Bound::Included(val) => { + (calc_fraction(&bucket.lower, &bucket.upper, val), None) + } + Bound::Excluded(val) => ( + calc_fraction(&bucket.lower, &bucket.upper, val), + Some(sketch.estimate(val)), + ), + Bound::Unbounded => unreachable!(), + }; + let ratio = *distinct_1 + .max(OrderedFloat(temp_ratio_max - temp_ratio_min).min(OrderedFloat(1.0))); + temp_count += (bucket.count as f64 * ratio).ceil() as usize; + if let Some(count) = option_max { + temp_count = temp_count.saturating_sub(count); + } + if let Some(count) = option_min { + temp_count = temp_count.saturating_sub(count); + } + *binary_i += 1; + } + *count += cmp::max(temp_count, 0); + } + ConstantBinary::Eq(value) => { + *count += sketch.estimate(value); + *binary_i += 1 + } + ConstantBinary::NotEq(_) => (), + ConstantBinary::And(inner_binaries) | ConstantBinary::Or(inner_binaries) => self + ._collect_count( + inner_binaries, + binary_i, + bucket_i, + bucket_idxs, + count, + sketch, + ), + } + } +} + +impl Bucket { + fn empty(data_type: &LogicalType) -> Self { + let empty_value = Arc::new(DataValue::none(data_type)); + + Bucket { + lower: empty_value.clone(), + upper: empty_value, + count: 0, + } + } +} + +#[cfg(test)] +mod tests { + use crate::catalog::{ColumnCatalog, ColumnDesc, ColumnSummary}; + use crate::expression::simplify::ConstantBinary; + use crate::optimizer::core::histogram::{Bucket, HistogramBuilder}; + use crate::optimizer::OptimizerError; + use crate::types::value::DataValue; + use 
crate::types::LogicalType; + use std::ops::Bound; + use std::sync::Arc; + + fn int32_column() -> ColumnCatalog { + ColumnCatalog { + summary: ColumnSummary { + id: Some(1), + name: "c1".to_string(), + table_name: None, + }, + nullable: false, + desc: ColumnDesc { + column_datatype: LogicalType::UInteger, + is_primary: false, + is_unique: false, + default: None, + }, + ref_expr: None, + } + } + + #[test] + fn test_sort_tuples_on_histogram() -> Result<(), OptimizerError> { + let column = int32_column(); + + let mut builder = HistogramBuilder::new(&column, Some(15))?; + + builder.append(&Arc::new(DataValue::Int32(Some(0))))?; + builder.append(&Arc::new(DataValue::Int32(Some(1))))?; + builder.append(&Arc::new(DataValue::Int32(Some(2))))?; + builder.append(&Arc::new(DataValue::Int32(Some(3))))?; + builder.append(&Arc::new(DataValue::Int32(Some(4))))?; + + builder.append(&Arc::new(DataValue::Int32(Some(5))))?; + builder.append(&Arc::new(DataValue::Int32(Some(6))))?; + builder.append(&Arc::new(DataValue::Int32(Some(7))))?; + builder.append(&Arc::new(DataValue::Int32(Some(8))))?; + builder.append(&Arc::new(DataValue::Int32(Some(9))))?; + + builder.append(&Arc::new(DataValue::Int32(Some(10))))?; + builder.append(&Arc::new(DataValue::Int32(Some(11))))?; + builder.append(&Arc::new(DataValue::Int32(Some(12))))?; + builder.append(&Arc::new(DataValue::Int32(Some(13))))?; + builder.append(&Arc::new(DataValue::Int32(Some(14))))?; + + builder.append(&Arc::new(DataValue::Null))?; + builder.append(&Arc::new(DataValue::Int32(None)))?; + + // assert!(matches!(builder.build(10), Err(OptimizerError::TooManyBuckets))); + + let (histogram, _) = builder.build(5)?; + + assert_eq!(histogram.correlation, 1.0); + assert_eq!(histogram.null_count, 2); + assert_eq!(histogram.buckets.len(), 5); + assert_eq!( + histogram.buckets, + vec![ + Bucket { + lower: Arc::new(DataValue::Int32(Some(0))), + upper: Arc::new(DataValue::Int32(Some(2))), + count: 3, + }, + Bucket { + lower: Arc::new(DataValue::Int32(Some(3))), + upper: Arc::new(DataValue::Int32(Some(5))), + count: 3, + }, + Bucket { + lower: Arc::new(DataValue::Int32(Some(6))), + upper: Arc::new(DataValue::Int32(Some(8))), + count: 3, + }, + Bucket { + lower: Arc::new(DataValue::Int32(Some(9))), + upper: Arc::new(DataValue::Int32(Some(11))), + count: 3, + }, + Bucket { + lower: Arc::new(DataValue::Int32(Some(12))), + upper: Arc::new(DataValue::Int32(Some(14))), + count: 3, + }, + ] + ); + + Ok(()) + } + + #[test] + fn test_rev_sort_tuples_on_histogram() -> Result<(), OptimizerError> { + let column = int32_column(); + + let mut builder = HistogramBuilder::new(&column, Some(15))?; + + builder.append(&Arc::new(DataValue::Int32(Some(14))))?; + builder.append(&Arc::new(DataValue::Int32(Some(13))))?; + builder.append(&Arc::new(DataValue::Int32(Some(12))))?; + builder.append(&Arc::new(DataValue::Int32(Some(11))))?; + builder.append(&Arc::new(DataValue::Int32(Some(10))))?; + + builder.append(&Arc::new(DataValue::Int32(Some(9))))?; + builder.append(&Arc::new(DataValue::Int32(Some(8))))?; + builder.append(&Arc::new(DataValue::Int32(Some(7))))?; + builder.append(&Arc::new(DataValue::Int32(Some(6))))?; + builder.append(&Arc::new(DataValue::Int32(Some(5))))?; + + builder.append(&Arc::new(DataValue::Int32(Some(4))))?; + builder.append(&Arc::new(DataValue::Int32(Some(3))))?; + builder.append(&Arc::new(DataValue::Int32(Some(2))))?; + builder.append(&Arc::new(DataValue::Int32(Some(1))))?; + builder.append(&Arc::new(DataValue::Int32(Some(0))))?; + + 
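        // both of the following are null-valued: `append` routes them into
        // `null_count` instead of `values`, so they never enter a bucket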
builder.append(&Arc::new(DataValue::Null))?; + builder.append(&Arc::new(DataValue::Int32(None)))?; + + let (histogram, _) = builder.build(5)?; + + assert_eq!(histogram.correlation, -1.0); + assert_eq!(histogram.null_count, 2); + assert_eq!(histogram.buckets.len(), 5); + assert_eq!( + histogram.buckets, + vec![ + Bucket { + lower: Arc::new(DataValue::Int32(Some(0))), + upper: Arc::new(DataValue::Int32(Some(2))), + count: 3, + }, + Bucket { + lower: Arc::new(DataValue::Int32(Some(3))), + upper: Arc::new(DataValue::Int32(Some(5))), + count: 3, + }, + Bucket { + lower: Arc::new(DataValue::Int32(Some(6))), + upper: Arc::new(DataValue::Int32(Some(8))), + count: 3, + }, + Bucket { + lower: Arc::new(DataValue::Int32(Some(9))), + upper: Arc::new(DataValue::Int32(Some(11))), + count: 3, + }, + Bucket { + lower: Arc::new(DataValue::Int32(Some(12))), + upper: Arc::new(DataValue::Int32(Some(14))), + count: 3, + }, + ] + ); + + Ok(()) + } + + #[test] + fn test_non_average_on_histogram() -> Result<(), OptimizerError> { + let column = int32_column(); + + let mut builder = HistogramBuilder::new(&column, Some(15))?; + + builder.append(&Arc::new(DataValue::Int32(Some(14))))?; + builder.append(&Arc::new(DataValue::Int32(Some(13))))?; + builder.append(&Arc::new(DataValue::Int32(Some(12))))?; + builder.append(&Arc::new(DataValue::Int32(Some(11))))?; + builder.append(&Arc::new(DataValue::Int32(Some(10))))?; + + builder.append(&Arc::new(DataValue::Int32(Some(4))))?; + builder.append(&Arc::new(DataValue::Int32(Some(3))))?; + builder.append(&Arc::new(DataValue::Int32(Some(2))))?; + builder.append(&Arc::new(DataValue::Int32(Some(1))))?; + builder.append(&Arc::new(DataValue::Int32(Some(0))))?; + + builder.append(&Arc::new(DataValue::Int32(Some(9))))?; + builder.append(&Arc::new(DataValue::Int32(Some(8))))?; + builder.append(&Arc::new(DataValue::Int32(Some(7))))?; + builder.append(&Arc::new(DataValue::Int32(Some(6))))?; + builder.append(&Arc::new(DataValue::Int32(Some(5))))?; + + builder.append(&Arc::new(DataValue::Null))?; + builder.append(&Arc::new(DataValue::Int32(None)))?; + + let (histogram, _) = builder.build(4)?; + + assert!(histogram.correlation < 0.0); + assert_eq!(histogram.null_count, 2); + assert_eq!(histogram.buckets.len(), 4); + assert_eq!( + histogram.buckets, + vec![ + Bucket { + lower: Arc::new(DataValue::Int32(Some(0))), + upper: Arc::new(DataValue::Int32(Some(3))), + count: 4, + }, + Bucket { + lower: Arc::new(DataValue::Int32(Some(4))), + upper: Arc::new(DataValue::Int32(Some(7))), + count: 4, + }, + Bucket { + lower: Arc::new(DataValue::Int32(Some(8))), + upper: Arc::new(DataValue::Int32(Some(11))), + count: 4, + }, + Bucket { + lower: Arc::new(DataValue::Int32(Some(12))), + upper: Arc::new(DataValue::Int32(Some(14))), + count: 3, + }, + ] + ); + + Ok(()) + } + + #[test] + fn test_collect_count() -> Result<(), OptimizerError> { + let column = int32_column(); + + let mut builder = HistogramBuilder::new(&column, Some(15))?; + + builder.append(&Arc::new(DataValue::Int32(Some(14))))?; + builder.append(&Arc::new(DataValue::Int32(Some(13))))?; + builder.append(&Arc::new(DataValue::Int32(Some(12))))?; + builder.append(&Arc::new(DataValue::Int32(Some(11))))?; + builder.append(&Arc::new(DataValue::Int32(Some(10))))?; + + builder.append(&Arc::new(DataValue::Int32(Some(4))))?; + builder.append(&Arc::new(DataValue::Int32(Some(3))))?; + builder.append(&Arc::new(DataValue::Int32(Some(2))))?; + builder.append(&Arc::new(DataValue::Int32(Some(1))))?; + builder.append(&Arc::new(DataValue::Int32(Some(0))))?; + + 
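        // 15 non-null values over 4 buckets gives bucket_len = (15 + 4) / 4 = 4
        // (integer division), so `build(4)` will produce buckets covering rows
        // [0..=3], [4..=7], [8..=11] and the 3-row tail [12..=14], matching
        // `test_non_average_on_histogram` above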
builder.append(&Arc::new(DataValue::Int32(Some(9))))?; + builder.append(&Arc::new(DataValue::Int32(Some(8))))?; + builder.append(&Arc::new(DataValue::Int32(Some(7))))?; + builder.append(&Arc::new(DataValue::Int32(Some(6))))?; + builder.append(&Arc::new(DataValue::Int32(Some(5))))?; + + builder.append(&Arc::new(DataValue::Null))?; + builder.append(&Arc::new(DataValue::Int32(None)))?; + + let (histogram, sketch) = builder.build(4)?; + + let count_1 = histogram.collect_count( + &vec![ + ConstantBinary::Eq(Arc::new(DataValue::Int32(Some(2)))), + ConstantBinary::Scope { + min: Bound::Included(Arc::new(DataValue::Int32(Some(4)))), + max: Bound::Excluded(Arc::new(DataValue::Int32(Some(12)))), + }, + ], + &sketch, + ); + + assert_eq!(count_1, 9); + + let count_2 = histogram.collect_count( + &vec![ConstantBinary::Scope { + min: Bound::Included(Arc::new(DataValue::Int32(Some(4)))), + max: Bound::Unbounded, + }], + &sketch, + ); + + assert_eq!(count_2, 11); + + let count_3 = histogram.collect_count( + &vec![ConstantBinary::Scope { + min: Bound::Excluded(Arc::new(DataValue::Int32(Some(7)))), + max: Bound::Unbounded, + }], + &sketch, + ); + + assert_eq!(count_3, 7); + + let count_4 = histogram.collect_count( + &vec![ConstantBinary::Scope { + min: Bound::Unbounded, + max: Bound::Included(Arc::new(DataValue::Int32(Some(11)))), + }], + &sketch, + ); + + assert_eq!(count_4, 12); + + let count_5 = histogram.collect_count( + &vec![ConstantBinary::Scope { + min: Bound::Unbounded, + max: Bound::Excluded(Arc::new(DataValue::Int32(Some(8)))), + }], + &sketch, + ); + + assert_eq!(count_5, 8); + + let count_6 = histogram.collect_count( + &vec![ConstantBinary::Scope { + min: Bound::Included(Arc::new(DataValue::Int32(Some(2)))), + max: Bound::Unbounded, + }], + &sketch, + ); + + assert_eq!(count_6, 13); + + let count_7 = histogram.collect_count( + &vec![ConstantBinary::Scope { + min: Bound::Excluded(Arc::new(DataValue::Int32(Some(1)))), + max: Bound::Unbounded, + }], + &sketch, + ); + + assert_eq!(count_7, 13); + + let count_8 = histogram.collect_count( + &vec![ConstantBinary::Scope { + min: Bound::Unbounded, + max: Bound::Included(Arc::new(DataValue::Int32(Some(12)))), + }], + &sketch, + ); + + assert_eq!(count_8, 13); + + let count_9 = histogram.collect_count( + &vec![ConstantBinary::Scope { + min: Bound::Unbounded, + max: Bound::Excluded(Arc::new(DataValue::Int32(Some(13)))), + }], + &sketch, + ); + + assert_eq!(count_9, 13); + + let count_10 = histogram.collect_count( + &vec![ConstantBinary::Scope { + min: Bound::Excluded(Arc::new(DataValue::Int32(Some(0)))), + max: Bound::Excluded(Arc::new(DataValue::Int32(Some(3)))), + }], + &sketch, + ); + + assert_eq!(count_10, 2); + + let count_11 = histogram.collect_count( + &vec![ConstantBinary::Scope { + min: Bound::Included(Arc::new(DataValue::Int32(Some(1)))), + max: Bound::Included(Arc::new(DataValue::Int32(Some(2)))), + }], + &sketch, + ); + + assert_eq!(count_11, 2); + + Ok(()) + } +} diff --git a/src/optimizer/core/memo.rs b/src/optimizer/core/memo.rs new file mode 100644 index 00000000..a8e37593 --- /dev/null +++ b/src/optimizer/core/memo.rs @@ -0,0 +1,163 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::pattern::PatternMatcher; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::heuristic::batch::HepMatchOrder; +use crate::optimizer::heuristic::graph::{HepGraph, HepNodeId}; +use crate::optimizer::heuristic::matcher::HepMatcher; +use 
crate::optimizer::rule::implementation::ImplementationRuleImpl; +use crate::optimizer::OptimizerError; +use crate::planner::operator::PhysicalOption; +use crate::storage::Transaction; +use std::cmp::Ordering; +use std::collections::HashMap; + +#[derive(Debug, Clone)] +pub struct Expression { + pub(crate) op: PhysicalOption, + pub(crate) cost: Option, +} + +#[derive(Debug, Clone)] +pub struct GroupExpression { + exprs: Vec, +} + +impl GroupExpression { + pub(crate) fn append_expr(&mut self, expr: Expression) { + self.exprs.push(expr); + } +} + +#[derive(Debug)] +pub struct Memo { + groups: HashMap, +} + +impl Memo { + pub(crate) fn new( + graph: &HepGraph, + loader: &ColumnMetaLoader<'_, T>, + implementations: &[ImplementationRuleImpl], + ) -> Result { + let node_count = graph.node_count(); + let mut groups = HashMap::new(); + + if node_count == 0 { + return Err(OptimizerError::EmptyPlan); + } + + for node_id in graph.nodes_iter(HepMatchOrder::TopDown, None) { + for rule in implementations { + if HepMatcher::new(rule.pattern(), node_id, graph).match_opt_expr() { + let op = graph.operator(node_id); + let group_expr = groups + .entry(node_id) + .or_insert_with(|| GroupExpression { exprs: vec![] }); + + rule.to_expression(op, loader, group_expr)?; + } + } + } + + Ok(Memo { groups }) + } + + pub(crate) fn cheapest_physical_option(&self, node_id: &HepNodeId) -> Option { + self.groups.get(node_id).and_then(|exprs| { + exprs + .exprs + .iter() + .min_by(|expr_1, expr_2| match (expr_1.cost, expr_2.cost) { + (Some(cost_1), Some(cost_2)) => cost_1.cmp(&cost_2), + (None, Some(_)) => Ordering::Greater, + (Some(_), None) => Ordering::Less, + (None, None) => Ordering::Equal, + }) + .map(|expr| expr.op.clone()) + }) + } +} + +#[cfg(test)] +mod tests { + use crate::binder::{Binder, BinderContext}; + use crate::db::{Database, DatabaseError}; + use crate::optimizer::core::memo::Memo; + use crate::optimizer::heuristic::batch::HepBatchStrategy; + use crate::optimizer::heuristic::graph::HepGraph; + use crate::optimizer::heuristic::optimizer::HepOptimizer; + use crate::optimizer::rule::implementation::ImplementationRuleImpl; + use crate::optimizer::rule::normalization::NormalizationRuleImpl; + use crate::planner::operator::PhysicalOption; + use crate::storage::kip::KipTransaction; + use crate::storage::{Storage, Transaction}; + use petgraph::stable_graph::NodeIndex; + use tempfile::TempDir; + + #[tokio::test] + async fn test_build_memo() -> Result<(), DatabaseError> { + let temp_dir = TempDir::new().expect("unable to create temporary working directory"); + + let database = Database::with_kipdb(temp_dir.path()).await?; + database + .run("create table t1 (c1 int primary key, c2 int)") + .await?; + database + .run("create table t2 (c3 int primary key, c4 int)") + .await?; + + for i in 0..1000 { + let _ = database + .run(format!("insert into t1 values({}, {})", i, i + 1).as_str()) + .await?; + } + database.run("analyze table t1").await?; + + let transaction = database.storage.transaction().await?; + let binder = Binder::new(BinderContext::new(&transaction)); + let stmt = crate::parser::parse_sql( + // FIXME: Only by bracketing (c1 > 40 or c1 = 2) can the filter be pushed down below the join + "select c1, c3 from t1 inner join t2 on c1 = c3 where (c1 > 40 or c1 = 2) and c3 > 22", + )?; + let plan = binder.bind(&stmt[0])?; + let best_plan = HepOptimizer::new(plan) + .batch( + "Simplify Filter".to_string(), + HepBatchStrategy::once_topdown(), + vec![NormalizationRuleImpl::SimplifyFilter], + ) + .batch( + "Predicate 
Pushdown".to_string(), + HepBatchStrategy::fix_point_topdown(10), + vec![ + NormalizationRuleImpl::PushPredicateThroughJoin, + NormalizationRuleImpl::PushPredicateIntoScan, + ], + ) + .find_best::(None)?; + let graph = HepGraph::new(best_plan); + let rules = vec![ + ImplementationRuleImpl::Projection, + ImplementationRuleImpl::Filter, + ImplementationRuleImpl::HashJoin, + ImplementationRuleImpl::SeqScan, + ImplementationRuleImpl::IndexScan, + ]; + + let memo = Memo::new(&graph, &transaction.meta_loader(), &rules)?; + let best_plan = graph.to_plan(Some(&memo)); + let exprs = &memo.groups.get(&NodeIndex::new(3)).unwrap(); + + assert_eq!(exprs.exprs.len(), 2); + assert_eq!(exprs.exprs[0].cost, Some(1000)); + assert_eq!(exprs.exprs[0].op, PhysicalOption::SeqScan); + assert!(exprs.exprs[1].cost.unwrap() >= 1920); + assert!(matches!(exprs.exprs[1].op, PhysicalOption::IndexScan(_))); + assert_eq!( + best_plan.as_ref().unwrap().childrens[0].childrens[0].childrens[0].physical_option, + Some(PhysicalOption::SeqScan) + ); + + Ok(()) + } +} diff --git a/src/optimizer/core/mod.rs b/src/optimizer/core/mod.rs index 8dd57c35..7b734378 100644 --- a/src/optimizer/core/mod.rs +++ b/src/optimizer/core/mod.rs @@ -1,3 +1,7 @@ +pub(crate) mod cm_sketch; +pub(crate) mod column_meta; +pub(crate) mod histogram; +pub(crate) mod memo; pub(crate) mod opt_expr; pub(crate) mod pattern; pub(crate) mod rule; diff --git a/src/optimizer/core/opt_expr.rs b/src/optimizer/core/opt_expr.rs index c29e7b8e..5b7be4ea 100644 --- a/src/optimizer/core/opt_expr.rs +++ b/src/optimizer/core/opt_expr.rs @@ -43,6 +43,7 @@ impl OptExpr { LogicalPlan { operator: self.root.clone(), childrens, + physical_option: None, } } } diff --git a/src/optimizer/core/rule.rs b/src/optimizer/core/rule.rs index 9a790ace..b2ee1a6d 100644 --- a/src/optimizer/core/rule.rs +++ b/src/optimizer/core/rule.rs @@ -1,11 +1,25 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::GroupExpression; use crate::optimizer::core::pattern::Pattern; use crate::optimizer::heuristic::graph::{HepGraph, HepNodeId}; use crate::optimizer::OptimizerError; +use crate::planner::operator::Operator; +use crate::storage::Transaction; -/// A rule is to transform logically equivalent expression -pub trait Rule { - /// The pattern to determine whether the rule can be applied. +// TODO: Use indexing and other methods for matching optimization to avoid traversal +pub trait MatchPattern { fn pattern(&self) -> &Pattern; +} +pub trait NormalizationRule: MatchPattern { fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError>; } + +pub trait ImplementationRule: MatchPattern { + fn to_expression( + &self, + op: &Operator, + loader: &ColumnMetaLoader, + group_expr: &mut GroupExpression, + ) -> Result<(), OptimizerError>; +} diff --git a/src/optimizer/heuristic/batch.rs b/src/optimizer/heuristic/batch.rs index 2161bec7..fb09a87c 100644 --- a/src/optimizer/heuristic/batch.rs +++ b/src/optimizer/heuristic/batch.rs @@ -1,15 +1,19 @@ -use crate::optimizer::rule::RuleImpl; +use crate::optimizer::rule::normalization::NormalizationRuleImpl; /// A batch of rules. 
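/// Rules in a batch are re-applied until the plan graph stops changing or the
/// strategy's iteration budget is exhausted (see `HepOptimizer::apply_batch`).
/// A minimal usage sketch, reusing the names from the memo test in this patch:
///
/// ```ignore
/// let best_plan = HepOptimizer::new(plan)
///     .batch(
///         "Simplify Filter".to_string(),
///         HepBatchStrategy::once_topdown(),
///         vec![NormalizationRuleImpl::SimplifyFilter],
///     )
///     .find_best::<KipTransaction>(None)?;
/// ```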
#[derive(Clone)] pub struct HepBatch { pub name: String, pub strategy: HepBatchStrategy, - pub rules: Vec, + pub rules: Vec, } impl HepBatch { - pub fn new(name: String, strategy: HepBatchStrategy, rules: Vec) -> Self { + pub fn new( + name: String, + strategy: HepBatchStrategy, + rules: Vec, + ) -> Self { Self { name, strategy, diff --git a/src/optimizer/heuristic/graph.rs b/src/optimizer/heuristic/graph.rs index 8b74e0a2..bff31450 100644 --- a/src/optimizer/heuristic/graph.rs +++ b/src/optimizer/heuristic/graph.rs @@ -1,3 +1,4 @@ +use crate::optimizer::core::memo::Memo; use crate::optimizer::core::opt_expr::OptExprNodeId; use crate::optimizer::heuristic::batch::HepMatchOrder; use crate::planner::operator::Operator; @@ -10,7 +11,7 @@ use std::mem; /// HepNodeId is used in optimizer to identify a node. pub type HepNodeId = NodeIndex; -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct HepGraph { graph: StableDiGraph, root_index: HepNodeId, @@ -24,6 +25,7 @@ impl HepGraph { LogicalPlan { operator, childrens, + .. }: LogicalPlan, ) -> HepNodeId { let index = graph.add_node(operator); @@ -47,6 +49,10 @@ impl HepGraph { } } + pub fn node_count(&self) -> usize { + self.graph.node_count() + } + pub fn parent_id(&self, node_id: HepNodeId) -> Option { self.graph .neighbors_directed(node_id, petgraph::Direction::Incoming) @@ -159,40 +165,42 @@ impl HepGraph { &mut self.graph[node_id] } - pub fn to_plan(&self) -> LogicalPlan { - self.to_plan_with_index(self.root_index) + /// If input node is join, we use the edge weight to control the join children order. + pub fn children_at(&self, id: HepNodeId) -> Box + '_> { + Box::new( + self.graph + .edges(id) + .sorted_by_key(|edge| edge.weight()) + .map(|edge| edge.target()), + ) } - /// If input node is join, we use the edge weight to control the join chilren order. 
- pub fn children_at(&self, id: HepNodeId) -> Vec { + pub fn eldest_child_at(&self, id: HepNodeId) -> Option { self.graph .edges(id) - .sorted_by_key(|edge| edge.weight()) + .min_by_key(|edge| edge.weight()) .map(|edge| edge.target()) - .collect_vec() } - pub fn to_plan_with_index(&self, start_index: HepNodeId) -> LogicalPlan { - let mut root_plan = LogicalPlan { - operator: self.operator(start_index).clone(), - childrens: vec![], - }; - - self.build_childrens(&mut root_plan, start_index); - - root_plan + pub fn to_plan(mut self, memo: Option<&Memo>) -> Option { + self.build_childrens(self.root_index, memo) } - fn build_childrens(&self, plan: &mut LogicalPlan, start: HepNodeId) { - for child_id in self.children_at(start) { - let mut child_plan = LogicalPlan { - operator: self.operator(child_id).clone(), - childrens: vec![], - }; + fn build_childrens(&mut self, start: HepNodeId, memo: Option<&Memo>) -> Option { + let mut childrens = Vec::with_capacity(2); + let physical_option = memo.and_then(|memo| memo.cheapest_physical_option(&start)); - self.build_childrens(&mut child_plan, child_id); - plan.childrens.push(child_plan); + for child_id in self.children_at(start).collect_vec() { + if let Some(child_plan) = self.build_childrens(child_id, memo) { + childrens.push(child_plan); + } } + + self.graph.remove_node(start).map(|operator| LogicalPlan { + operator, + childrens, + physical_option, + }) } } @@ -337,14 +345,10 @@ mod tests { let plan = select_sql_run("select * from t1 left join t2 on c1 = c3").await?; let graph = HepGraph::new(plan.clone()); - let plan_for_graph = graph.to_plan(); + let plan_for_graph = graph.to_plan(None).unwrap(); assert_eq!(plan, plan_for_graph); - let plan_by_index = graph.to_plan_with_index(HepNodeId::new(1)); - - assert_eq!(plan.childrens[0], plan_by_index); - Ok(()) } } diff --git a/src/optimizer/heuristic/matcher.rs b/src/optimizer/heuristic/matcher.rs index 56637ec5..f31d1682 100644 --- a/src/optimizer/heuristic/matcher.rs +++ b/src/optimizer/heuristic/matcher.rs @@ -102,13 +102,17 @@ mod tests { childrens: vec![LogicalPlan { operator: Operator::Dummy, childrens: vec![], + physical_option: None, }], + physical_option: None, }, LogicalPlan { operator: Operator::Dummy, childrens: vec![], + physical_option: None, }, ], + physical_option: None, }; let graph = HepGraph::new(all_dummy_plan.clone()); diff --git a/src/optimizer/heuristic/optimizer.rs b/src/optimizer/heuristic/optimizer.rs index ee266b2b..2fa7adb8 100644 --- a/src/optimizer/heuristic/optimizer.rs +++ b/src/optimizer/heuristic/optimizer.rs @@ -1,15 +1,21 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::Memo; use crate::optimizer::core::pattern::PatternMatcher; -use crate::optimizer::core::rule::Rule; +use crate::optimizer::core::rule::{MatchPattern, NormalizationRule}; use crate::optimizer::heuristic::batch::{HepBatch, HepBatchStrategy}; use crate::optimizer::heuristic::graph::{HepGraph, HepNodeId}; use crate::optimizer::heuristic::matcher::HepMatcher; -use crate::optimizer::rule::RuleImpl; +use crate::optimizer::rule::implementation::ImplementationRuleImpl; +use crate::optimizer::rule::normalization::NormalizationRuleImpl; use crate::optimizer::OptimizerError; use crate::planner::LogicalPlan; +use crate::storage::Transaction; +use std::ops::Not; pub struct HepOptimizer { batches: Vec, pub graph: HepGraph, + implementations: Vec, } impl HepOptimizer { @@ -17,59 +23,86 @@ impl HepOptimizer { Self { batches: vec![], graph: HepGraph::new(root), + 
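            // empty by default: `find_best` builds a `Memo` (and thus costs
            // physical options) only when implementation rules are registered
            // and a `ColumnMetaLoader` is supplied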
implementations: vec![], } } - pub fn batch(mut self, name: String, strategy: HepBatchStrategy, rules: Vec) -> Self { + pub fn batch( + mut self, + name: String, + strategy: HepBatchStrategy, + rules: Vec, + ) -> Self { self.batches.push(HepBatch::new(name, strategy, rules)); self } - pub fn find_best(&mut self) -> Result { - let batches = self.batches.clone(); + pub fn implementations(mut self, implementations: Vec) -> Self { + self.implementations = implementations; + self + } - for batch in batches { + pub fn find_best( + mut self, + loader: Option<&ColumnMetaLoader<'_, T>>, + ) -> Result { + for ref batch in self.batches { let mut batch_over = false; let mut iteration = 1usize; while iteration <= batch.strategy.max_iteration && !batch_over { - if self.apply_batch(&batch)? { + if Self::apply_batch(&mut self.graph, batch)? { iteration += 1; } else { batch_over = true } } } + let memo = loader + .and_then(|loader| { + self.implementations + .is_empty() + .not() + .then(|| Memo::new(&self.graph, loader, &self.implementations)) + }) + .transpose()?; - Ok(self.graph.to_plan()) + Ok(self + .graph + .to_plan(memo.as_ref()) + .ok_or(OptimizerError::EmptyPlan)?) } fn apply_batch( - &mut self, + graph: &mut HepGraph, HepBatch { rules, strategy, .. }: &HepBatch, ) -> Result { - let start_ver = self.graph.version; + let before_version = graph.version; for rule in rules { - for node_id in self.graph.nodes_iter(strategy.match_order, None) { - if self.apply_rule(rule, node_id)? { + for node_id in graph.nodes_iter(strategy.match_order, None) { + if Self::apply_rule(graph, rule, node_id)? { break; } } } - Ok(start_ver != self.graph.version) + Ok(before_version != graph.version) } - fn apply_rule(&mut self, rule: &RuleImpl, node_id: HepNodeId) -> Result { - let after_version = self.graph.version; + fn apply_rule( + graph: &mut HepGraph, + rule: &NormalizationRuleImpl, + node_id: HepNodeId, + ) -> Result { + let before_version = graph.version; - if HepMatcher::new(rule.pattern(), node_id, &self.graph).match_opt_expr() { - rule.apply(node_id, &mut self.graph)?; + if HepMatcher::new(rule.pattern(), node_id, graph).match_opt_expr() { + rule.apply(node_id, graph)?; } - Ok(after_version != self.graph.version) + Ok(before_version != graph.version) } } diff --git a/src/optimizer/mod.rs b/src/optimizer/mod.rs index 28dcdaa0..5135e07a 100644 --- a/src/optimizer/mod.rs +++ b/src/optimizer/mod.rs @@ -1,8 +1,10 @@ +use crate::storage::StorageError; use crate::types::errors::TypeError; +use kip_db::KernelError; /// The architecture and some components, /// such as (/core) are referenced from sqlrs -mod core; +pub mod core; pub mod heuristic; pub mod rule; @@ -14,4 +16,31 @@ pub enum OptimizerError { #[from] TypeError, ), + #[error("plan is empty")] + EmptyPlan, + #[error("this column must belong to a table")] + OwnerLessColumn, + #[error("there are more buckets than elements")] + TooManyBuckets, + #[error("io: {0}")] + IO( + #[source] + #[from] + std::io::Error, + ), + #[error("cache error: {0}")] + Cache( + #[source] + #[from] + KernelError, + ), + /// Serialization or deserialization error + #[error(transparent)] + SerdeBinCode(#[from] Box), + #[error("storage error: {0}")] + Storage( + #[source] + #[from] + StorageError, + ), } diff --git a/src/optimizer/rule/implementation/ddl/add_column.rs b/src/optimizer/rule/implementation/ddl/add_column.rs new file mode 100644 index 00000000..47eac517 --- /dev/null +++ b/src/optimizer/rule/implementation/ddl/add_column.rs @@ -0,0 +1,27 @@ +use 
crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! { + static ref ADD_COLUMN_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::AddColumn(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct AddColumnImplementation; + +single_mapping!( + AddColumnImplementation, + ADD_COLUMN_PATTERN, + PhysicalOption::AddColumn +); diff --git a/src/optimizer/rule/implementation/ddl/create_table.rs b/src/optimizer/rule/implementation/ddl/create_table.rs new file mode 100644 index 00000000..fc700be5 --- /dev/null +++ b/src/optimizer/rule/implementation/ddl/create_table.rs @@ -0,0 +1,27 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! { + static ref CREATE_TABLE_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::CreateTable(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct CreateTableImplementation; + +single_mapping!( + CreateTableImplementation, + CREATE_TABLE_PATTERN, + PhysicalOption::CreateTable +); diff --git a/src/optimizer/rule/implementation/ddl/drop_column.rs b/src/optimizer/rule/implementation/ddl/drop_column.rs new file mode 100644 index 00000000..c293f5c2 --- /dev/null +++ b/src/optimizer/rule/implementation/ddl/drop_column.rs @@ -0,0 +1,27 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! 
{
+    static ref DROP_COLUMN_PATTERN: Pattern = {
+        Pattern {
+            predicate: |op| matches!(op, Operator::DropColumn(_)),
+            children: PatternChildrenPredicate::None,
+        }
+    };
+}
+
+#[derive(Clone)]
+pub struct DropColumnImplementation;
+
+single_mapping!(
+    DropColumnImplementation,
+    DROP_COLUMN_PATTERN,
+    PhysicalOption::DropColumn
+);
diff --git a/src/optimizer/rule/implementation/ddl/drop_table.rs b/src/optimizer/rule/implementation/ddl/drop_table.rs
new file mode 100644
index 00000000..5bd0ae18
--- /dev/null
+++ b/src/optimizer/rule/implementation/ddl/drop_table.rs
@@ -0,0 +1,27 @@
+use crate::optimizer::core::column_meta::ColumnMetaLoader;
+use crate::optimizer::core::memo::{Expression, GroupExpression};
+use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate};
+use crate::optimizer::core::rule::{ImplementationRule, MatchPattern};
+use crate::optimizer::OptimizerError;
+use crate::planner::operator::{Operator, PhysicalOption};
+use crate::single_mapping;
+use crate::storage::Transaction;
+use lazy_static::lazy_static;
+
+lazy_static! {
+    static ref DROP_TABLE_PATTERN: Pattern = {
+        Pattern {
+            predicate: |op| matches!(op, Operator::DropTable(_)),
+            children: PatternChildrenPredicate::None,
+        }
+    };
+}
+
+#[derive(Clone)]
+pub struct DropTableImplementation;
+
+single_mapping!(
+    DropTableImplementation,
+    DROP_TABLE_PATTERN,
+    PhysicalOption::DropTable
+);
diff --git a/src/optimizer/rule/implementation/ddl/mod.rs b/src/optimizer/rule/implementation/ddl/mod.rs
new file mode 100644
index 00000000..3bb8295f
--- /dev/null
+++ b/src/optimizer/rule/implementation/ddl/mod.rs
@@ -0,0 +1,5 @@
+pub(crate) mod add_column;
+pub(crate) mod create_table;
+pub(crate) mod drop_column;
+pub(crate) mod drop_table;
+pub(crate) mod truncate;
diff --git a/src/optimizer/rule/implementation/ddl/truncate.rs b/src/optimizer/rule/implementation/ddl/truncate.rs
new file mode 100644
index 00000000..2f77f06d
--- /dev/null
+++ b/src/optimizer/rule/implementation/ddl/truncate.rs
@@ -0,0 +1,27 @@
+use crate::optimizer::core::column_meta::ColumnMetaLoader;
+use crate::optimizer::core::memo::{Expression, GroupExpression};
+use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate};
+use crate::optimizer::core::rule::{ImplementationRule, MatchPattern};
+use crate::optimizer::OptimizerError;
+use crate::planner::operator::{Operator, PhysicalOption};
+use crate::single_mapping;
+use crate::storage::Transaction;
+use lazy_static::lazy_static;
+
+lazy_static! {
+    static ref TRUNCATE_PATTERN: Pattern = {
+        Pattern {
+            predicate: |op| matches!(op, Operator::Truncate(_)),
+            children: PatternChildrenPredicate::None,
+        }
+    };
+}
+
+#[derive(Clone)]
+pub struct TruncateImplementation;
+
+single_mapping!(
+    TruncateImplementation,
+    TRUNCATE_PATTERN,
+    PhysicalOption::Truncate
+);
diff --git a/src/optimizer/rule/implementation/dml/analyze.rs b/src/optimizer/rule/implementation/dml/analyze.rs
new file mode 100644
index 00000000..dd522d53
--- /dev/null
+++ b/src/optimizer/rule/implementation/dml/analyze.rs
@@ -0,0 +1,27 @@
+use crate::optimizer::core::column_meta::ColumnMetaLoader;
+use crate::optimizer::core::memo::{Expression, GroupExpression};
+use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate};
+use crate::optimizer::core::rule::{ImplementationRule, MatchPattern};
+use crate::optimizer::OptimizerError;
+use crate::planner::operator::{Operator, PhysicalOption};
+use crate::single_mapping;
+use crate::storage::Transaction;
+use lazy_static::lazy_static;
+
+lazy_static!
{ + static ref ANALYZE_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::Analyze(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct AnalyzeImplementation; + +single_mapping!( + AnalyzeImplementation, + ANALYZE_PATTERN, + PhysicalOption::Analyze +); diff --git a/src/optimizer/rule/implementation/dml/copy_from_file.rs b/src/optimizer/rule/implementation/dml/copy_from_file.rs new file mode 100644 index 00000000..13fc0c75 --- /dev/null +++ b/src/optimizer/rule/implementation/dml/copy_from_file.rs @@ -0,0 +1,27 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! { + static ref COPY_FROM_FILE_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::CopyFromFile(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct CopyFromFileImplementation; + +single_mapping!( + CopyFromFileImplementation, + COPY_FROM_FILE_PATTERN, + PhysicalOption::CopyFromFile +); diff --git a/src/optimizer/rule/implementation/dml/copy_to_file.rs b/src/optimizer/rule/implementation/dml/copy_to_file.rs new file mode 100644 index 00000000..d894bbb8 --- /dev/null +++ b/src/optimizer/rule/implementation/dml/copy_to_file.rs @@ -0,0 +1,27 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! { + static ref COPY_TO_FILE_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::CopyToFile(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct CopyToFileImplementation; + +single_mapping!( + CopyToFileImplementation, + COPY_TO_FILE_PATTERN, + PhysicalOption::CopyToFile +); diff --git a/src/optimizer/rule/implementation/dml/delete.rs b/src/optimizer/rule/implementation/dml/delete.rs new file mode 100644 index 00000000..9adf3f77 --- /dev/null +++ b/src/optimizer/rule/implementation/dml/delete.rs @@ -0,0 +1,23 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! 
{ + static ref DELETE_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::Delete(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct DeleteImplementation; + +single_mapping!(DeleteImplementation, DELETE_PATTERN, PhysicalOption::Delete); diff --git a/src/optimizer/rule/implementation/dml/insert.rs b/src/optimizer/rule/implementation/dml/insert.rs new file mode 100644 index 00000000..828bf8df --- /dev/null +++ b/src/optimizer/rule/implementation/dml/insert.rs @@ -0,0 +1,23 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! { + static ref INSERT_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::Insert(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct InsertImplementation; + +single_mapping!(InsertImplementation, INSERT_PATTERN, PhysicalOption::Insert); diff --git a/src/optimizer/rule/implementation/dml/mod.rs b/src/optimizer/rule/implementation/dml/mod.rs new file mode 100644 index 00000000..094edda3 --- /dev/null +++ b/src/optimizer/rule/implementation/dml/mod.rs @@ -0,0 +1,6 @@ +pub(crate) mod analyze; +pub(crate) mod copy_from_file; +pub(crate) mod copy_to_file; +pub(crate) mod delete; +pub(crate) mod insert; +pub(crate) mod update; diff --git a/src/optimizer/rule/implementation/dml/update.rs b/src/optimizer/rule/implementation/dml/update.rs new file mode 100644 index 00000000..ab0b56d3 --- /dev/null +++ b/src/optimizer/rule/implementation/dml/update.rs @@ -0,0 +1,23 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! { + static ref UPDATE_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::Update(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct UpdateImplementation; + +single_mapping!(UpdateImplementation, UPDATE_PATTERN, PhysicalOption::Update); diff --git a/src/optimizer/rule/implementation/dql/aggregate.rs b/src/optimizer/rule/implementation/dql/aggregate.rs new file mode 100644 index 00000000..cbb8c3d4 --- /dev/null +++ b/src/optimizer/rule/implementation/dql/aggregate.rs @@ -0,0 +1,51 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! 
{ + static ref GROUP_BY_AGGREGATE_PATTERN: Pattern = { + Pattern { + predicate: |op| { + if let Operator::Aggregate(op) = op { + return !op.groupby_exprs.is_empty(); + } + false + }, + children: PatternChildrenPredicate::None, + } + }; + static ref SIMPLE_AGGREGATE_PATTERN: Pattern = { + Pattern { + predicate: |op| { + if let Operator::Aggregate(op) = op { + return op.groupby_exprs.is_empty(); + } + false + }, + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct GroupByAggregateImplementation; + +single_mapping!( + GroupByAggregateImplementation, + GROUP_BY_AGGREGATE_PATTERN, + PhysicalOption::HashAggregate +); + +pub struct SimpleAggregateImplementation; + +single_mapping!( + SimpleAggregateImplementation, + SIMPLE_AGGREGATE_PATTERN, + PhysicalOption::SimpleAggregate +); diff --git a/src/optimizer/rule/implementation/dql/dummy.rs b/src/optimizer/rule/implementation/dql/dummy.rs new file mode 100644 index 00000000..c616c55c --- /dev/null +++ b/src/optimizer/rule/implementation/dql/dummy.rs @@ -0,0 +1,23 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! { + static ref DUMMY_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::Dummy), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct DummyImplementation; + +single_mapping!(DummyImplementation, DUMMY_PATTERN, PhysicalOption::Dummy); diff --git a/src/optimizer/rule/implementation/dql/filter.rs b/src/optimizer/rule/implementation/dql/filter.rs new file mode 100644 index 00000000..506ac2fd --- /dev/null +++ b/src/optimizer/rule/implementation/dql/filter.rs @@ -0,0 +1,23 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! 
{ + static ref FILTER_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::Filter(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct FilterImplementation; + +single_mapping!(FilterImplementation, FILTER_PATTERN, PhysicalOption::Filter); diff --git a/src/optimizer/rule/implementation/dql/join.rs b/src/optimizer/rule/implementation/dql/join.rs new file mode 100644 index 00000000..68f9f0da --- /dev/null +++ b/src/optimizer/rule/implementation/dql/join.rs @@ -0,0 +1,27 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! { + static ref JOIN_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::Join(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct HashJoinImplementation; + +single_mapping!( + HashJoinImplementation, + JOIN_PATTERN, + PhysicalOption::HashJoin +); diff --git a/src/optimizer/rule/implementation/dql/limit.rs b/src/optimizer/rule/implementation/dql/limit.rs new file mode 100644 index 00000000..4936c6db --- /dev/null +++ b/src/optimizer/rule/implementation/dql/limit.rs @@ -0,0 +1,23 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! { + static ref LIMIT_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::Limit(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct LimitImplementation; + +single_mapping!(LimitImplementation, LIMIT_PATTERN, PhysicalOption::Limit); diff --git a/src/optimizer/rule/implementation/dql/mod.rs b/src/optimizer/rule/implementation/dql/mod.rs new file mode 100644 index 00000000..62e009ee --- /dev/null +++ b/src/optimizer/rule/implementation/dql/mod.rs @@ -0,0 +1,9 @@ +pub(crate) mod aggregate; +pub(crate) mod dummy; +pub(crate) mod filter; +pub(crate) mod join; +pub(crate) mod limit; +pub(crate) mod projection; +pub(crate) mod scan; +pub(crate) mod sort; +pub(crate) mod values; diff --git a/src/optimizer/rule/implementation/dql/projection.rs b/src/optimizer/rule/implementation/dql/projection.rs new file mode 100644 index 00000000..66208208 --- /dev/null +++ b/src/optimizer/rule/implementation/dql/projection.rs @@ -0,0 +1,27 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! 
{
+    static ref PROJECTION_PATTERN: Pattern = {
+        Pattern {
+            predicate: |op| matches!(op, Operator::Project(_)),
+            children: PatternChildrenPredicate::None,
+        }
+    };
+}
+
+#[derive(Clone)]
+pub struct ProjectionImplementation;
+
+single_mapping!(
+    ProjectionImplementation,
+    PROJECTION_PATTERN,
+    PhysicalOption::Project
+);
diff --git a/src/optimizer/rule/implementation/dql/scan.rs b/src/optimizer/rule/implementation/dql/scan.rs
new file mode 100644
index 00000000..00f1631d
--- /dev/null
+++ b/src/optimizer/rule/implementation/dql/scan.rs
@@ -0,0 +1,111 @@
+use crate::optimizer::core::column_meta::{ColumnMeta, ColumnMetaLoader};
+use crate::optimizer::core::memo::{Expression, GroupExpression};
+use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate};
+use crate::optimizer::core::rule::{ImplementationRule, MatchPattern};
+use crate::optimizer::OptimizerError;
+use crate::planner::operator::{Operator, PhysicalOption};
+use crate::storage::Transaction;
+use crate::types::ColumnId;
+use lazy_static::lazy_static;
+
+lazy_static! {
+    static ref SCAN_PATTERN: Pattern = {
+        Pattern {
+            predicate: |op| matches!(op, Operator::Scan(_)),
+            children: PatternChildrenPredicate::None,
+        }
+    };
+}
+
+#[derive(Clone)]
+pub struct SeqScanImplementation;
+
+impl MatchPattern for SeqScanImplementation {
+    fn pattern(&self) -> &Pattern {
+        &SCAN_PATTERN
+    }
+}
+
+impl<T: Transaction> ImplementationRule<T> for SeqScanImplementation {
+    fn to_expression(
+        &self,
+        op: &Operator,
+        loader: &ColumnMetaLoader<'_, T>,
+        group_expr: &mut GroupExpression,
+    ) -> Result<(), OptimizerError> {
+        if let Operator::Scan(scan_op) = op {
+            let column_metas = loader.load(scan_op.table_name.clone())?;
+            let mut cost = None;
+
+            if let Some(column_meta) = find_column_meta(column_metas, &scan_op.primary_key) {
+                cost = Some(column_meta.histogram().values_len());
+            }
+
+            group_expr.append_expr(Expression {
+                op: PhysicalOption::SeqScan,
+                cost,
+            });
+
+            Ok(())
+        } else {
+            unreachable!("invalid operator!")
+        }
+    }
+}
+
+pub struct IndexScanImplementation;
+
+impl MatchPattern for IndexScanImplementation {
+    fn pattern(&self) -> &Pattern {
+        &SCAN_PATTERN
+    }
+}
+
+impl<T: Transaction> ImplementationRule<T> for IndexScanImplementation {
+    fn to_expression(
+        &self,
+        op: &Operator,
+        loader: &ColumnMetaLoader<'_, T>,
+        group_expr: &mut GroupExpression,
+    ) -> Result<(), OptimizerError> {
+        if let Operator::Scan(scan_op) = op {
+            let column_metas = loader.load(scan_op.table_name.clone())?;
+            for index_info in scan_op.index_infos.iter() {
+                if index_info.binaries.is_none() {
+                    continue;
+                }
+                let mut cost = None;
+
+                if let Some(binaries) = &index_info.binaries {
+                    // FIXME: Only UniqueIndex
+                    if let Some(histogram) =
+                        find_column_meta(column_metas, &index_info.meta.column_ids[0])
+                    {
+                        // a non-covering index scan must return to the table afterwards,
+                        // so the estimated row count is doubled
+                        cost = Some(histogram.collect_count(binaries) * 2);
+                    }
+                }
+
+                group_expr.append_expr(Expression {
+                    op: PhysicalOption::IndexScan(index_info.clone()),
+                    cost,
+                })
+            }
+
+            Ok(())
+        } else {
+            unreachable!("invalid operator!")
+        }
+    }
+}
+
+fn find_column_meta<'a>(
+    column_metas: &'a Vec<ColumnMeta>,
+    column_id: &ColumnId,
+) -> Option<&'a ColumnMeta> {
+    assert!(column_metas.is_sorted_by_key(ColumnMeta::column_id));
+    column_metas
+        .binary_search_by(|column_meta| column_meta.column_id().cmp(column_id))
+        .ok()
+        .map(|i| &column_metas[i])
}
diff --git a/src/optimizer/rule/implementation/dql/sort.rs b/src/optimizer/rule/implementation/dql/sort.rs
new file mode 100644
index 00000000..95688f04
--- /dev/null
+++
b/src/optimizer/rule/implementation/dql/sort.rs @@ -0,0 +1,23 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! { + static ref SORT_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::Sort(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct SortImplementation; + +single_mapping!(SortImplementation, SORT_PATTERN, PhysicalOption::RadixSort); diff --git a/src/optimizer/rule/implementation/dql/values.rs b/src/optimizer/rule/implementation/dql/values.rs new file mode 100644 index 00000000..df0f170a --- /dev/null +++ b/src/optimizer/rule/implementation/dql/values.rs @@ -0,0 +1,23 @@ +use crate::optimizer::core::column_meta::ColumnMetaLoader; +use crate::optimizer::core::memo::{Expression, GroupExpression}; +use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; +use crate::optimizer::core::rule::{ImplementationRule, MatchPattern}; +use crate::optimizer::OptimizerError; +use crate::planner::operator::{Operator, PhysicalOption}; +use crate::single_mapping; +use crate::storage::Transaction; +use lazy_static::lazy_static; + +lazy_static! { + static ref VALUES_PATTERN: Pattern = { + Pattern { + predicate: |op| matches!(op, Operator::Values(_)), + children: PatternChildrenPredicate::None, + } + }; +} + +#[derive(Clone)] +pub struct ValuesImplementation; + +single_mapping!(ValuesImplementation, VALUES_PATTERN, PhysicalOption::Values); diff --git a/src/optimizer/rule/implementation/marcos.rs b/src/optimizer/rule/implementation/marcos.rs new file mode 100644 index 00000000..d61326c5 --- /dev/null +++ b/src/optimizer/rule/implementation/marcos.rs @@ -0,0 +1,27 @@ +#[macro_export] +macro_rules! 
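+// `single_mapping!` covers every rule whose operator has exactly one physical
+// implementation: it derives `MatchPattern` for the given pattern and an
+// `ImplementationRule<T>` that appends the given `PhysicalOption` with `cost: None`.
+// e.g. `single_mapping!(FilterImplementation, FILTER_PATTERN, PhysicalOption::Filter)`
+// expands to the two impls used by the memo for `Operator::Filter`.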
single_mapping {
+    ($ty:ty, $pattern:expr, $option:expr) => {
+        impl MatchPattern for $ty {
+            fn pattern(&self) -> &Pattern {
+                &$pattern
+            }
+        }
+
+        impl<T: Transaction> ImplementationRule<T> for $ty {
+            fn to_expression(
+                &self,
+                _: &Operator,
+                _: &ColumnMetaLoader<'_, T>,
+                group_expr: &mut GroupExpression,
+            ) -> Result<(), OptimizerError> {
+                //TODO: CostModel
+                group_expr.append_expr(Expression {
+                    op: $option,
+                    cost: None,
+                });
+
+                Ok(())
+            }
+        }
+    };
+}
diff --git a/src/optimizer/rule/implementation/mod.rs b/src/optimizer/rule/implementation/mod.rs
new file mode 100644
index 00000000..f7318cca
--- /dev/null
+++ b/src/optimizer/rule/implementation/mod.rs
@@ -0,0 +1,174 @@
+pub(crate) mod ddl;
+pub(crate) mod dml;
+pub(crate) mod dql;
+pub(crate) mod marcos;
+
+use crate::optimizer::core::column_meta::ColumnMetaLoader;
+use crate::optimizer::core::memo::GroupExpression;
+use crate::optimizer::core::pattern::Pattern;
+use crate::optimizer::core::rule::{ImplementationRule, MatchPattern};
+use crate::optimizer::rule::implementation::ddl::add_column::AddColumnImplementation;
+use crate::optimizer::rule::implementation::ddl::create_table::CreateTableImplementation;
+use crate::optimizer::rule::implementation::ddl::drop_column::DropColumnImplementation;
+use crate::optimizer::rule::implementation::ddl::drop_table::DropTableImplementation;
+use crate::optimizer::rule::implementation::ddl::truncate::TruncateImplementation;
+use crate::optimizer::rule::implementation::dml::analyze::AnalyzeImplementation;
+use crate::optimizer::rule::implementation::dml::copy_from_file::CopyFromFileImplementation;
+use crate::optimizer::rule::implementation::dml::copy_to_file::CopyToFileImplementation;
+use crate::optimizer::rule::implementation::dml::delete::DeleteImplementation;
+use crate::optimizer::rule::implementation::dml::insert::InsertImplementation;
+use crate::optimizer::rule::implementation::dml::update::UpdateImplementation;
+use crate::optimizer::rule::implementation::dql::aggregate::{
+    GroupByAggregateImplementation, SimpleAggregateImplementation,
+};
+use crate::optimizer::rule::implementation::dql::dummy::DummyImplementation;
+use crate::optimizer::rule::implementation::dql::filter::FilterImplementation;
+use crate::optimizer::rule::implementation::dql::join::HashJoinImplementation;
+use crate::optimizer::rule::implementation::dql::limit::LimitImplementation;
+use crate::optimizer::rule::implementation::dql::projection::ProjectionImplementation;
+use crate::optimizer::rule::implementation::dql::scan::{
+    IndexScanImplementation, SeqScanImplementation,
+};
+use crate::optimizer::rule::implementation::dql::sort::SortImplementation;
+use crate::optimizer::rule::implementation::dql::values::ValuesImplementation;
+use crate::optimizer::OptimizerError;
+use crate::planner::operator::Operator;
+use crate::storage::Transaction;
+
+#[derive(Debug, Copy, Clone)]
+pub enum ImplementationRuleImpl {
+    // DQL
+    GroupByAggregate,
+    SimpleAggregate,
+    Dummy,
+    Filter,
+    HashJoin,
+    Limit,
+    Projection,
+    SeqScan,
+    IndexScan,
+    Sort,
+    Values,
+    // DML
+    Analyze,
+    CopyFromFile,
+    CopyToFile,
+    Delete,
+    Insert,
+    Update,
+    // DDL
+    AddColumn,
+    CreateTable,
+    DropColumn,
+    DropTable,
+    Truncate,
+}
+
+impl MatchPattern for ImplementationRuleImpl {
+    fn pattern(&self) -> &Pattern {
+        match self {
+            ImplementationRuleImpl::GroupByAggregate => GroupByAggregateImplementation.pattern(),
+            ImplementationRuleImpl::SimpleAggregate => SimpleAggregateImplementation.pattern(),
+            ImplementationRuleImpl::Dummy => DummyImplementation.pattern(),
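+            // Scan is the only operator with competing rules (SeqScan / IndexScan):
+            // both match SCAN_PATTERN, so a Scan group collects several candidate
+            // expressions and the memo later keeps the cheapest one.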
+            ImplementationRuleImpl::Filter => FilterImplementation.pattern(),
+            ImplementationRuleImpl::HashJoin => HashJoinImplementation.pattern(),
+            ImplementationRuleImpl::Limit => LimitImplementation.pattern(),
+            ImplementationRuleImpl::Projection => ProjectionImplementation.pattern(),
+            ImplementationRuleImpl::SeqScan => SeqScanImplementation.pattern(),
+            ImplementationRuleImpl::IndexScan => IndexScanImplementation.pattern(),
+            ImplementationRuleImpl::Sort => SortImplementation.pattern(),
+            ImplementationRuleImpl::Values => ValuesImplementation.pattern(),
+            ImplementationRuleImpl::CopyFromFile => CopyFromFileImplementation.pattern(),
+            ImplementationRuleImpl::CopyToFile => CopyToFileImplementation.pattern(),
+            ImplementationRuleImpl::Delete => DeleteImplementation.pattern(),
+            ImplementationRuleImpl::Insert => InsertImplementation.pattern(),
+            ImplementationRuleImpl::Update => UpdateImplementation.pattern(),
+            ImplementationRuleImpl::AddColumn => AddColumnImplementation.pattern(),
+            ImplementationRuleImpl::CreateTable => CreateTableImplementation.pattern(),
+            ImplementationRuleImpl::DropColumn => DropColumnImplementation.pattern(),
+            ImplementationRuleImpl::DropTable => DropTableImplementation.pattern(),
+            ImplementationRuleImpl::Truncate => TruncateImplementation.pattern(),
+            ImplementationRuleImpl::Analyze => AnalyzeImplementation.pattern(),
+        }
+    }
+}
+
+impl<T: Transaction> ImplementationRule<T> for ImplementationRuleImpl {
+    fn to_expression(
+        &self,
+        operator: &Operator,
+        loader: &ColumnMetaLoader<'_, T>,
+        group_expr: &mut GroupExpression,
+    ) -> Result<(), OptimizerError> {
+        match self {
+            ImplementationRuleImpl::GroupByAggregate => {
+                GroupByAggregateImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::SimpleAggregate => {
+                SimpleAggregateImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::Dummy => {
+                DummyImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::Filter => {
+                FilterImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::HashJoin => {
+                HashJoinImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::Limit => {
+                LimitImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::Projection => {
+                ProjectionImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::SeqScan => {
+                SeqScanImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::IndexScan => {
+                IndexScanImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::Sort => {
+                SortImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::Values => {
+                ValuesImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::CopyFromFile => {
+                CopyFromFileImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::CopyToFile => {
+                CopyToFileImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::Delete => {
+                DeleteImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::Insert => {
+                InsertImplementation.to_expression(operator, loader, group_expr)?
+            }
+            ImplementationRuleImpl::Update => {
+                UpdateImplementation.to_expression(operator, loader, group_expr)?
+ } + ImplementationRuleImpl::AddColumn => { + AddColumnImplementation.to_expression(operator, loader, group_expr)? + } + ImplementationRuleImpl::CreateTable => { + CreateTableImplementation.to_expression(operator, loader, group_expr)? + } + ImplementationRuleImpl::DropColumn => { + DropColumnImplementation.to_expression(operator, loader, group_expr)? + } + ImplementationRuleImpl::DropTable => { + DropTableImplementation.to_expression(operator, loader, group_expr)? + } + ImplementationRuleImpl::Truncate => { + TruncateImplementation.to_expression(operator, loader, group_expr)? + } + ImplementationRuleImpl::Analyze => { + AnalyzeImplementation.to_expression(operator, loader, group_expr)? + } + } + + Ok(()) + } +} diff --git a/src/optimizer/rule/mod.rs b/src/optimizer/rule/mod.rs index 1c9bbbed..d825195d 100644 --- a/src/optimizer/rule/mod.rs +++ b/src/optimizer/rule/mod.rs @@ -1,79 +1,2 @@ -use crate::expression::ScalarExpression; -use crate::optimizer::core::pattern::Pattern; -use crate::optimizer::core::rule::Rule; -use crate::optimizer::heuristic::graph::{HepGraph, HepNodeId}; -use crate::optimizer::rule::column_pruning::ColumnPruning; -use crate::optimizer::rule::combine_operators::{CollapseProject, CombineFilter}; -use crate::optimizer::rule::pushdown_limit::{ - EliminateLimits, LimitProjectTranspose, PushLimitIntoScan, PushLimitThroughJoin, -}; -use crate::optimizer::rule::pushdown_predicates::PushPredicateIntoScan; -use crate::optimizer::rule::pushdown_predicates::PushPredicateThroughJoin; -use crate::optimizer::rule::simplification::ConstantCalculation; -use crate::optimizer::rule::simplification::SimplifyFilter; -use crate::optimizer::OptimizerError; - -mod column_pruning; -mod combine_operators; -mod pushdown_limit; -mod pushdown_predicates; -mod simplification; - -#[derive(Debug, Copy, Clone)] -pub enum RuleImpl { - ColumnPruning, - // Combine operators - CollapseProject, - CombineFilter, - // PushDown limit - LimitProjectTranspose, - EliminateLimits, - PushLimitThroughJoin, - PushLimitIntoTableScan, - // PushDown predicates - PushPredicateThroughJoin, - // Tips: need to be used with `SimplifyFilter` - PushPredicateIntoScan, - // Simplification - SimplifyFilter, - ConstantCalculation, -} - -impl Rule for RuleImpl { - fn pattern(&self) -> &Pattern { - match self { - RuleImpl::ColumnPruning => ColumnPruning.pattern(), - RuleImpl::CollapseProject => CollapseProject.pattern(), - RuleImpl::CombineFilter => CombineFilter.pattern(), - RuleImpl::LimitProjectTranspose => LimitProjectTranspose.pattern(), - RuleImpl::EliminateLimits => EliminateLimits.pattern(), - RuleImpl::PushLimitThroughJoin => PushLimitThroughJoin.pattern(), - RuleImpl::PushLimitIntoTableScan => PushLimitIntoScan.pattern(), - RuleImpl::PushPredicateThroughJoin => PushPredicateThroughJoin.pattern(), - RuleImpl::PushPredicateIntoScan => PushPredicateIntoScan.pattern(), - RuleImpl::SimplifyFilter => SimplifyFilter.pattern(), - RuleImpl::ConstantCalculation => ConstantCalculation.pattern(), - } - } - - fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { - match self { - RuleImpl::ColumnPruning => ColumnPruning.apply(node_id, graph), - RuleImpl::CollapseProject => CollapseProject.apply(node_id, graph), - RuleImpl::CombineFilter => CombineFilter.apply(node_id, graph), - RuleImpl::LimitProjectTranspose => LimitProjectTranspose.apply(node_id, graph), - RuleImpl::EliminateLimits => EliminateLimits.apply(node_id, graph), - RuleImpl::PushLimitThroughJoin => PushLimitThroughJoin.apply(node_id, 
graph), - RuleImpl::PushLimitIntoTableScan => PushLimitIntoScan.apply(node_id, graph), - RuleImpl::PushPredicateThroughJoin => PushPredicateThroughJoin.apply(node_id, graph), - RuleImpl::SimplifyFilter => SimplifyFilter.apply(node_id, graph), - RuleImpl::PushPredicateIntoScan => PushPredicateIntoScan.apply(node_id, graph), - RuleImpl::ConstantCalculation => ConstantCalculation.apply(node_id, graph), - } - } -} - -/// Return true when left is subset of right -pub fn is_subset_exprs(left: &[ScalarExpression], right: &[ScalarExpression]) -> bool { - left.iter().all(|l| right.contains(l)) -} +pub(crate) mod implementation; +pub(crate) mod normalization; diff --git a/src/optimizer/rule/column_pruning.rs b/src/optimizer/rule/normalization/column_pruning.rs similarity index 87% rename from src/optimizer/rule/column_pruning.rs rename to src/optimizer/rule/normalization/column_pruning.rs index 2538ff5f..e7e1f551 100644 --- a/src/optimizer/rule/column_pruning.rs +++ b/src/optimizer/rule/normalization/column_pruning.rs @@ -2,12 +2,13 @@ use crate::catalog::{ColumnRef, ColumnSummary}; use crate::expression::agg::AggKind; use crate::expression::ScalarExpression; use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; -use crate::optimizer::core::rule::Rule; +use crate::optimizer::core::rule::{MatchPattern, NormalizationRule}; use crate::optimizer::heuristic::graph::{HepGraph, HepNodeId}; use crate::optimizer::OptimizerError; use crate::planner::operator::Operator; use crate::types::value::DataValue; use crate::types::LogicalType; +use itertools::Itertools; use lazy_static::lazy_static; use std::collections::HashSet; use std::sync::Arc; @@ -84,7 +85,7 @@ impl ColumnPruning { // Todo: Order Project // https://github.com/duckdb/duckdb/blob/main/src/optimizer/remove_unused_columns.cpp#L174 } - for child_id in graph.children_at(node_id) { + for child_id in graph.children_at(node_id).collect_vec() { Self::_apply(column_references, true, child_id, graph); } } @@ -97,17 +98,24 @@ impl ColumnPruning { for column in operator.referenced_columns(false) { column_references.insert(column.summary().clone()); } - for child_id in graph.children_at(node_id) { + for child_id in graph.children_at(node_id).collect_vec() { Self::_apply(column_references, all_referenced, child_id, graph); } } // Last Operator Operator::Dummy | Operator::Values(_) => (), // DDL Based on Other Plan - Operator::Insert(_) | Operator::Update(_) | Operator::Delete(_) => { + Operator::Insert(_) + | Operator::Update(_) + | Operator::Delete(_) + | Operator::Analyze(_) => { let op_ref_columns = operator.referenced_columns(false); - Self::recollect_apply(op_ref_columns, true, graph.children_at(node_id)[0], graph); + if let Some(child_id) = graph.eldest_child_at(node_id) { + Self::recollect_apply(op_ref_columns, true, child_id, graph); + } else { + unreachable!(); + } } // DDL Single Plan Operator::CreateTable(_) @@ -127,7 +135,7 @@ impl ColumnPruning { node_id: HepNodeId, graph: &mut HepGraph, ) { - for child_id in graph.children_at(node_id) { + for child_id in graph.children_at(node_id).collect_vec() { let mut new_references: HashSet = referenced_columns .iter() .map(|column| column.summary()) @@ -139,11 +147,13 @@ impl ColumnPruning { } } -impl Rule for ColumnPruning { +impl MatchPattern for ColumnPruning { fn pattern(&self) -> &Pattern { &COLUMN_PRUNING_RULE } +} +impl NormalizationRule for ColumnPruning { fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { Self::_apply(&mut HashSet::new(), 
true, node_id, graph); // mark changed to skip this rule batch @@ -159,9 +169,10 @@ mod tests { use crate::db::DatabaseError; use crate::optimizer::heuristic::batch::HepBatchStrategy; use crate::optimizer::heuristic::optimizer::HepOptimizer; - use crate::optimizer::rule::RuleImpl; + use crate::optimizer::rule::normalization::NormalizationRuleImpl; use crate::planner::operator::join::JoinCondition; use crate::planner::operator::Operator; + use crate::storage::kip::KipTransaction; #[tokio::test] async fn test_column_pruning() -> Result<(), DatabaseError> { @@ -171,9 +182,9 @@ mod tests { .batch( "test_column_pruning".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::ColumnPruning], + vec![NormalizationRuleImpl::ColumnPruning], ) - .find_best()?; + .find_best::(None)?; assert_eq!(best_plan.childrens.len(), 1); match best_plan.operator { diff --git a/src/optimizer/rule/combine_operators.rs b/src/optimizer/rule/normalization/combine_operators.rs similarity index 76% rename from src/optimizer/rule/combine_operators.rs rename to src/optimizer/rule/normalization/combine_operators.rs index a4daa1f1..df6cb87f 100644 --- a/src/optimizer/rule/combine_operators.rs +++ b/src/optimizer/rule/normalization/combine_operators.rs @@ -1,10 +1,9 @@ use crate::expression::{BinaryOperator, ScalarExpression}; use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; -use crate::optimizer::core::rule::Rule; +use crate::optimizer::core::rule::{MatchPattern, NormalizationRule}; use crate::optimizer::heuristic::graph::{HepGraph, HepNodeId}; -use crate::optimizer::rule::is_subset_exprs; +use crate::optimizer::rule::normalization::is_subset_exprs; use crate::optimizer::OptimizerError; -use crate::planner::operator::filter::FilterOperator; use crate::planner::operator::Operator; use crate::types::LogicalType; use lazy_static::lazy_static; @@ -33,19 +32,22 @@ lazy_static! { /// Combine two adjacent project operators into one. pub struct CollapseProject; -impl Rule for CollapseProject { +impl MatchPattern for CollapseProject { fn pattern(&self) -> &Pattern { &COLLAPSE_PROJECT_RULE } +} +impl NormalizationRule for CollapseProject { fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { if let Operator::Project(op) = graph.operator(node_id) { - let child_id = graph.children_at(node_id)[0]; - if let Operator::Project(child_op) = graph.operator(child_id) { - if is_subset_exprs(&op.exprs, &child_op.exprs) { - graph.remove_node(child_id, false); - } else { - graph.remove_node(node_id, false); + if let Some(child_id) = graph.eldest_child_at(node_id) { + if let Operator::Project(child_op) = graph.operator(child_id) { + if is_subset_exprs(&op.exprs, &child_op.exprs) { + graph.remove_node(child_id, false); + } else { + graph.remove_node(node_id, false); + } } } } @@ -57,26 +59,27 @@ impl Rule for CollapseProject { /// Combine two adjacent filter operators into one. 
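/// e.g. `Filter(c1 > 1)` directly above `Filter(c2 < 2)` is folded into the child
/// as `Filter(c1 > 1 AND c2 < 2)`, OR-ing the two `having` flags.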
pub struct CombineFilter; -impl Rule for CombineFilter { +impl MatchPattern for CombineFilter { fn pattern(&self) -> &Pattern { &COMBINE_FILTERS_RULE } +} +impl NormalizationRule for CombineFilter { fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { - if let Operator::Filter(op) = graph.operator(node_id) { - let child_id = graph.children_at(node_id)[0]; - if let Operator::Filter(child_op) = graph.operator(child_id) { - let new_filter_op = FilterOperator { - predicate: ScalarExpression::Binary { + if let Operator::Filter(op) = graph.operator(node_id).clone() { + if let Some(child_id) = graph.eldest_child_at(node_id) { + if let Operator::Filter(child_op) = graph.operator_mut(child_id) { + child_op.predicate = ScalarExpression::Binary { op: BinaryOperator::And, - left_expr: Box::new(op.predicate.clone()), + left_expr: Box::new(op.predicate), right_expr: Box::new(child_op.predicate.clone()), ty: LogicalType::Boolean, - }, - having: op.having || child_op.having, - }; - graph.replace_node(node_id, Operator::Filter(new_filter_op)); - graph.remove_node(child_id, false); + }; + child_op.having = op.having || child_op.having; + + graph.remove_node(node_id, false); + } } } @@ -93,8 +96,9 @@ mod tests { use crate::optimizer::heuristic::batch::HepBatchStrategy; use crate::optimizer::heuristic::graph::HepNodeId; use crate::optimizer::heuristic::optimizer::HepOptimizer; - use crate::optimizer::rule::RuleImpl; + use crate::optimizer::rule::normalization::NormalizationRuleImpl; use crate::planner::operator::Operator; + use crate::storage::kip::KipTransaction; use crate::types::value::DataValue; use crate::types::LogicalType; use std::sync::Arc; @@ -106,7 +110,7 @@ mod tests { let mut optimizer = HepOptimizer::new(plan.clone()).batch( "test_collapse_project".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::CollapseProject], + vec![NormalizationRuleImpl::CollapseProject], ); let mut new_project_op = optimizer.graph.operator(HepNodeId::new(0)).clone(); @@ -119,7 +123,7 @@ mod tests { optimizer.graph.add_root(new_project_op); - let best_plan = optimizer.find_best()?; + let best_plan = optimizer.find_best::(None)?; if let Operator::Project(op) = &best_plan.operator { assert_eq!(op.exprs.len(), 1); @@ -143,7 +147,7 @@ mod tests { let mut optimizer = HepOptimizer::new(plan.clone()).batch( "test_combine_filter".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::CombineFilter], + vec![NormalizationRuleImpl::CombineFilter], ); let mut new_filter_op = optimizer.graph.operator(HepNodeId::new(1)).clone(); @@ -163,7 +167,7 @@ mod tests { .graph .add_node(HepNodeId::new(0), Some(HepNodeId::new(1)), new_filter_op); - let best_plan = optimizer.find_best()?; + let best_plan = optimizer.find_best::(None)?; if let Operator::Filter(op) = &best_plan.childrens[0].operator { if let ScalarExpression::Binary { op, .. 
} = &op.predicate { diff --git a/src/optimizer/rule/normalization/mod.rs b/src/optimizer/rule/normalization/mod.rs new file mode 100644 index 00000000..27697c0f --- /dev/null +++ b/src/optimizer/rule/normalization/mod.rs @@ -0,0 +1,91 @@ +use crate::expression::ScalarExpression; +use crate::optimizer::core::pattern::Pattern; +use crate::optimizer::core::rule::{MatchPattern, NormalizationRule}; +use crate::optimizer::heuristic::graph::{HepGraph, HepNodeId}; +use crate::optimizer::rule::normalization::column_pruning::ColumnPruning; +use crate::optimizer::rule::normalization::combine_operators::{CollapseProject, CombineFilter}; +use crate::optimizer::rule::normalization::pushdown_limit::{ + EliminateLimits, LimitProjectTranspose, PushLimitIntoScan, PushLimitThroughJoin, +}; +use crate::optimizer::rule::normalization::pushdown_predicates::PushPredicateIntoScan; +use crate::optimizer::rule::normalization::pushdown_predicates::PushPredicateThroughJoin; +use crate::optimizer::rule::normalization::simplification::ConstantCalculation; +use crate::optimizer::rule::normalization::simplification::SimplifyFilter; +use crate::optimizer::OptimizerError; + +mod column_pruning; +mod combine_operators; +mod pushdown_limit; +mod pushdown_predicates; +mod simplification; + +#[derive(Debug, Copy, Clone)] +pub enum NormalizationRuleImpl { + ColumnPruning, + // Combine operators + CollapseProject, + CombineFilter, + // PushDown limit + LimitProjectTranspose, + EliminateLimits, + PushLimitThroughJoin, + PushLimitIntoTableScan, + // PushDown predicates + PushPredicateThroughJoin, + // Tips: need to be used with `SimplifyFilter` + PushPredicateIntoScan, + // Simplification + SimplifyFilter, + ConstantCalculation, +} + +impl MatchPattern for NormalizationRuleImpl { + fn pattern(&self) -> &Pattern { + match self { + NormalizationRuleImpl::ColumnPruning => ColumnPruning.pattern(), + NormalizationRuleImpl::CollapseProject => CollapseProject.pattern(), + NormalizationRuleImpl::CombineFilter => CombineFilter.pattern(), + NormalizationRuleImpl::LimitProjectTranspose => LimitProjectTranspose.pattern(), + NormalizationRuleImpl::EliminateLimits => EliminateLimits.pattern(), + NormalizationRuleImpl::PushLimitThroughJoin => PushLimitThroughJoin.pattern(), + NormalizationRuleImpl::PushLimitIntoTableScan => PushLimitIntoScan.pattern(), + NormalizationRuleImpl::PushPredicateThroughJoin => PushPredicateThroughJoin.pattern(), + NormalizationRuleImpl::PushPredicateIntoScan => PushPredicateIntoScan.pattern(), + NormalizationRuleImpl::SimplifyFilter => SimplifyFilter.pattern(), + NormalizationRuleImpl::ConstantCalculation => ConstantCalculation.pattern(), + } + } +} + +impl NormalizationRule for NormalizationRuleImpl { + fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { + match self { + NormalizationRuleImpl::ColumnPruning => ColumnPruning.apply(node_id, graph), + NormalizationRuleImpl::CollapseProject => CollapseProject.apply(node_id, graph), + NormalizationRuleImpl::CombineFilter => CombineFilter.apply(node_id, graph), + NormalizationRuleImpl::LimitProjectTranspose => { + LimitProjectTranspose.apply(node_id, graph) + } + NormalizationRuleImpl::EliminateLimits => EliminateLimits.apply(node_id, graph), + NormalizationRuleImpl::PushLimitThroughJoin => { + PushLimitThroughJoin.apply(node_id, graph) + } + NormalizationRuleImpl::PushLimitIntoTableScan => { + PushLimitIntoScan.apply(node_id, graph) + } + NormalizationRuleImpl::PushPredicateThroughJoin => { + PushPredicateThroughJoin.apply(node_id, 
graph) + } + NormalizationRuleImpl::SimplifyFilter => SimplifyFilter.apply(node_id, graph), + NormalizationRuleImpl::PushPredicateIntoScan => { + PushPredicateIntoScan.apply(node_id, graph) + } + NormalizationRuleImpl::ConstantCalculation => ConstantCalculation.apply(node_id, graph), + } + } +} + +/// Return true when left is subset of right +pub fn is_subset_exprs(left: &[ScalarExpression], right: &[ScalarExpression]) -> bool { + left.iter().all(|l| right.contains(l)) +} diff --git a/src/optimizer/rule/pushdown_limit.rs b/src/optimizer/rule/normalization/pushdown_limit.rs similarity index 70% rename from src/optimizer/rule/pushdown_limit.rs rename to src/optimizer/rule/normalization/pushdown_limit.rs index 44a29a87..a5fd5b7e 100644 --- a/src/optimizer/rule/pushdown_limit.rs +++ b/src/optimizer/rule/normalization/pushdown_limit.rs @@ -1,11 +1,12 @@ use crate::optimizer::core::pattern::Pattern; use crate::optimizer::core::pattern::PatternChildrenPredicate; -use crate::optimizer::core::rule::Rule; +use crate::optimizer::core::rule::{MatchPattern, NormalizationRule}; use crate::optimizer::heuristic::graph::{HepGraph, HepNodeId}; use crate::optimizer::OptimizerError; use crate::planner::operator::join::JoinType; use crate::planner::operator::limit::LimitOperator; use crate::planner::operator::Operator; +use itertools::Itertools; use lazy_static::lazy_static; use std::cmp; lazy_static! { @@ -49,13 +50,17 @@ lazy_static! { pub struct LimitProjectTranspose; -impl Rule for LimitProjectTranspose { +impl MatchPattern for LimitProjectTranspose { fn pattern(&self) -> &Pattern { &LIMIT_PROJECT_TRANSPOSE_RULE } +} +impl NormalizationRule for LimitProjectTranspose { fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { - graph.swap_node(node_id, graph.children_at(node_id)[0]); + if let Some(child_id) = graph.eldest_child_at(node_id) { + graph.swap_node(node_id, child_id); + } Ok(()) } @@ -65,22 +70,25 @@ impl Rule for LimitProjectTranspose { /// expression. pub struct EliminateLimits; -impl Rule for EliminateLimits { +impl MatchPattern for EliminateLimits { fn pattern(&self) -> &Pattern { &ELIMINATE_LIMITS_RULE } +} +impl NormalizationRule for EliminateLimits { fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { if let Operator::Limit(op) = graph.operator(node_id) { - let child_id = graph.children_at(node_id)[0]; - if let Operator::Limit(child_op) = graph.operator(child_id) { - let offset = Self::binary_options(op.offset, child_op.offset, |a, b| a + b); - let limit = Self::binary_options(op.limit, child_op.limit, cmp::min); + if let Some(child_id) = graph.eldest_child_at(node_id) { + if let Operator::Limit(child_op) = graph.operator(child_id) { + let offset = Self::binary_options(op.offset, child_op.offset, |a, b| a + b); + let limit = Self::binary_options(op.limit, child_op.limit, cmp::min); - let new_limit_op = LimitOperator { offset, limit }; + let new_limit_op = LimitOperator { offset, limit }; - graph.remove_node(child_id, false); - graph.replace_node(node_id, Operator::Limit(new_limit_op)); + graph.remove_node(child_id, false); + graph.replace_node(node_id, Operator::Limit(new_limit_op)); + } } } @@ -111,27 +119,32 @@ impl EliminateLimits { /// TODO: if join condition is empty. 
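/// A copy of the `Limit` is inserted above the preserved input (the left child of
/// a LEFT join, the right child of a RIGHT join); the original `Limit` stays above
/// the join to cap the final result.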
pub struct PushLimitThroughJoin; -impl Rule for PushLimitThroughJoin { +impl MatchPattern for PushLimitThroughJoin { fn pattern(&self) -> &Pattern { &PUSH_LIMIT_THROUGH_JOIN_RULE } +} +impl NormalizationRule for PushLimitThroughJoin { fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { if let Operator::Limit(op) = graph.operator(node_id) { - let child_id = graph.children_at(node_id)[0]; - let join_type = if let Operator::Join(op) = graph.operator(child_id) { - Some(op.join_type) - } else { - None - }; - - if let Some(ty) = join_type { - if let Some(grandson_id) = match ty { - JoinType::Left => Some(graph.children_at(child_id)[0]), - JoinType::Right => Some(graph.children_at(child_id)[1]), - _ => None, - } { - graph.add_node(child_id, Some(grandson_id), Operator::Limit(op.clone())); + if let Some(child_id) = graph.eldest_child_at(node_id) { + let join_type = if let Operator::Join(op) = graph.operator(child_id) { + Some(op.join_type) + } else { + None + }; + + if let Some(ty) = join_type { + let children = graph.children_at(child_id).collect_vec(); + + if let Some(grandson_id) = match ty { + JoinType::Left => children.first(), + JoinType::Right => children.last(), + _ => None, + } { + graph.add_node(child_id, Some(*grandson_id), Operator::Limit(op.clone())); + } } } } @@ -143,21 +156,24 @@ impl Rule for PushLimitThroughJoin { /// Push down `Limit` past a `Scan`. pub struct PushLimitIntoScan; -impl Rule for PushLimitIntoScan { +impl MatchPattern for PushLimitIntoScan { fn pattern(&self) -> &Pattern { &PUSH_LIMIT_INTO_TABLE_SCAN_RULE } +} +impl NormalizationRule for PushLimitIntoScan { fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { if let Operator::Limit(limit_op) = graph.operator(node_id) { - let child_index = graph.children_at(node_id)[0]; - if let Operator::Scan(scan_op) = graph.operator(child_index) { - let mut new_scan_op = scan_op.clone(); + if let Some(child_index) = graph.eldest_child_at(node_id) { + if let Operator::Scan(scan_op) = graph.operator(child_index) { + let mut new_scan_op = scan_op.clone(); - new_scan_op.limit = (limit_op.offset, limit_op.limit); + new_scan_op.limit = (limit_op.offset, limit_op.limit); - graph.remove_node(node_id, false); - graph.replace_node(child_index, Operator::Scan(new_scan_op)); + graph.remove_node(node_id, false); + graph.replace_node(child_index, Operator::Scan(new_scan_op)); + } } } @@ -171,9 +187,10 @@ mod tests { use crate::db::DatabaseError; use crate::optimizer::heuristic::batch::HepBatchStrategy; use crate::optimizer::heuristic::optimizer::HepOptimizer; - use crate::optimizer::rule::RuleImpl; + use crate::optimizer::rule::normalization::NormalizationRuleImpl; use crate::planner::operator::limit::LimitOperator; use crate::planner::operator::Operator; + use crate::storage::kip::KipTransaction; #[tokio::test] async fn test_limit_project_transpose() -> Result<(), DatabaseError> { @@ -183,9 +200,9 @@ mod tests { .batch( "test_limit_project_transpose".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::LimitProjectTranspose], + vec![NormalizationRuleImpl::LimitProjectTranspose], ) - .find_best()?; + .find_best::(None)?; if let Operator::Project(_) = &best_plan.operator { } else { @@ -207,7 +224,7 @@ mod tests { let mut optimizer = HepOptimizer::new(plan.clone()).batch( "test_eliminate_limits".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::EliminateLimits], + vec![NormalizationRuleImpl::EliminateLimits], ); let new_limit_op = LimitOperator { 
@@ -217,7 +234,7 @@ mod tests { optimizer.graph.add_root(Operator::Limit(new_limit_op)); - let best_plan = optimizer.find_best()?; + let best_plan = optimizer.find_best::(None)?; if let Operator::Limit(op) = &best_plan.operator { assert_eq!(op.limit, Some(1)); @@ -242,11 +259,11 @@ mod tests { "test_push_limit_through_join".to_string(), HepBatchStrategy::once_topdown(), vec![ - RuleImpl::LimitProjectTranspose, - RuleImpl::PushLimitThroughJoin, + NormalizationRuleImpl::LimitProjectTranspose, + NormalizationRuleImpl::PushLimitThroughJoin, ], ) - .find_best()?; + .find_best::(None)?; if let Operator::Join(_) = &best_plan.childrens[0].childrens[0].operator { } else { @@ -271,11 +288,11 @@ mod tests { "test_push_limit_into_table_scan".to_string(), HepBatchStrategy::once_topdown(), vec![ - RuleImpl::LimitProjectTranspose, - RuleImpl::PushLimitIntoTableScan, + NormalizationRuleImpl::LimitProjectTranspose, + NormalizationRuleImpl::PushLimitIntoTableScan, ], ) - .find_best()?; + .find_best::(None)?; if let Operator::Scan(op) = &best_plan.childrens[0].operator { assert_eq!(op.limit, (Some(1), Some(1))) diff --git a/src/optimizer/rule/pushdown_predicates.rs b/src/optimizer/rule/normalization/pushdown_predicates.rs similarity index 86% rename from src/optimizer/rule/pushdown_predicates.rs rename to src/optimizer/rule/normalization/pushdown_predicates.rs index b69c23ec..27cd03d1 100644 --- a/src/optimizer/rule/pushdown_predicates.rs +++ b/src/optimizer/rule/normalization/pushdown_predicates.rs @@ -2,12 +2,13 @@ use crate::catalog::ColumnRef; use crate::expression::{BinaryOperator, ScalarExpression}; use crate::optimizer::core::pattern::Pattern; use crate::optimizer::core::pattern::PatternChildrenPredicate; -use crate::optimizer::core::rule::Rule; +use crate::optimizer::core::rule::{MatchPattern, NormalizationRule}; use crate::optimizer::heuristic::graph::{HepGraph, HepNodeId}; use crate::optimizer::OptimizerError; use crate::planner::operator::filter::FilterOperator; use crate::planner::operator::join::JoinType; use crate::planner::operator::Operator; +use crate::types::index::IndexInfo; use crate::types::LogicalType; use itertools::Itertools; use lazy_static::lazy_static; @@ -94,14 +95,19 @@ pub fn is_subset_cols(left: &[ColumnRef], right: &[ColumnRef]) -> bool { /// attributes of the left or right side of sub query when applicable. 
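/// e.g. `t1.c1 > 1 AND t2.c2 < 2` above a join is split by referenced columns and
/// each conjunct is pushed below the side that fully covers it; conjuncts that
/// reference both sides are left where they are.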
pub struct PushPredicateThroughJoin; -impl Rule for PushPredicateThroughJoin { +impl MatchPattern for PushPredicateThroughJoin { fn pattern(&self) -> &Pattern { &PUSH_PREDICATE_THROUGH_JOIN } +} +impl NormalizationRule for PushPredicateThroughJoin { // TODO: pushdown_predicates need to consider output columns fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { - let child_id = graph.children_at(node_id)[0]; + let child_id = match graph.eldest_child_at(node_id) { + Some(child_id) => child_id, + None => return Ok(()), + }; if let Operator::Join(child_op) = graph.operator(child_id) { if !matches!( child_op.join_type, @@ -110,7 +116,7 @@ impl Rule for PushPredicateThroughJoin { return Ok(()); } - let join_childs = graph.children_at(child_id); + let join_childs = graph.children_at(child_id).collect_vec(); let left_columns = graph.operator(join_childs[0]).referenced_columns(true); let right_columns = graph.operator(join_childs[1]).referenced_columns(true); @@ -196,38 +202,32 @@ impl Rule for PushPredicateThroughJoin { pub struct PushPredicateIntoScan; -impl Rule for PushPredicateIntoScan { +impl MatchPattern for PushPredicateIntoScan { fn pattern(&self) -> &Pattern { &PUSH_PREDICATE_INTO_SCAN } +} +impl NormalizationRule for PushPredicateIntoScan { fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { - if let Operator::Filter(op) = graph.operator(node_id) { - let child_id = graph.children_at(node_id)[0]; - if let Operator::Scan(child_op) = graph.operator(child_id) { - if child_op.index_by.is_some() { - return Ok(()); - } - - //FIXME: now only support unique - for meta in &child_op.index_metas { - let mut option = op.predicate.convert_binary(&meta.column_ids[0])?; - - if let Some(mut binary) = option.take() { - binary.scope_aggregation()?; - let rearrange_binaries = binary.rearrange()?; + if let Operator::Filter(op) = graph.operator(node_id).clone() { + if let Some(child_id) = graph.eldest_child_at(node_id) { + if let Operator::Scan(child_op) = graph.operator_mut(child_id) { + //FIXME: now only support unique + for IndexInfo { meta, binaries } in &mut child_op.index_infos { + let mut option = op.predicate.convert_binary(&meta.column_ids[0])?; + + if let Some(mut binary) = option.take() { + binary.scope_aggregation()?; + let rearrange_binaries = binary.rearrange()?; + + if rearrange_binaries.is_empty() { + continue; + } + let _ = binaries.replace(rearrange_binaries); - if rearrange_binaries.is_empty() { - continue; + return Ok(()); } - let mut scan_by_index = child_op.clone(); - scan_by_index.index_by = Some((meta.clone(), rearrange_binaries)); - - // The constant expression extracted in prewhere is used to - // reduce the data scanning range and cannot replace the role of Filter. 
- graph.replace_node(child_id, Operator::Scan(scan_by_index)); - - return Ok(()); } } } @@ -245,8 +245,9 @@ mod tests { use crate::expression::{BinaryOperator, ScalarExpression}; use crate::optimizer::heuristic::batch::HepBatchStrategy; use crate::optimizer::heuristic::optimizer::HepOptimizer; - use crate::optimizer::rule::RuleImpl; + use crate::optimizer::rule::normalization::NormalizationRuleImpl; use crate::planner::operator::Operator; + use crate::storage::kip::KipTransaction; use crate::types::value::DataValue; use crate::types::LogicalType; use std::collections::Bound; @@ -261,14 +262,14 @@ mod tests { .batch( "simplify_filter".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::SimplifyFilter], + vec![NormalizationRuleImpl::SimplifyFilter], ) .batch( "test_push_predicate_into_scan".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::PushPredicateIntoScan], + vec![NormalizationRuleImpl::PushPredicateIntoScan], ) - .find_best()?; + .find_best::(None)?; if let Operator::Scan(op) = &best_plan.childrens[0].childrens[0].operator { let mock_binaries = vec![Scope { @@ -276,7 +277,7 @@ mod tests { max: Bound::Unbounded, }]; - assert_eq!(op.index_by.clone().unwrap().1, mock_binaries); + assert_eq!(op.index_infos[1].binaries, Some(mock_binaries)); } else { unreachable!("Should be a filter operator") } @@ -294,9 +295,9 @@ mod tests { .batch( "test_push_predicate_through_join".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::PushPredicateThroughJoin], + vec![NormalizationRuleImpl::PushPredicateThroughJoin], ) - .find_best()?; + .find_best::(None)?; if let Operator::Filter(op) = &best_plan.childrens[0].operator { match op.predicate { @@ -337,9 +338,9 @@ mod tests { .batch( "test_push_predicate_through_join".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::PushPredicateThroughJoin], + vec![NormalizationRuleImpl::PushPredicateThroughJoin], ) - .find_best()?; + .find_best::(None)?; if let Operator::Filter(op) = &best_plan.childrens[0].operator { match op.predicate { @@ -380,9 +381,9 @@ mod tests { .batch( "test_push_predicate_through_join".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::PushPredicateThroughJoin], + vec![NormalizationRuleImpl::PushPredicateThroughJoin], ) - .find_best()?; + .find_best::(None)?; if let Operator::Join(_) = &best_plan.childrens[0].operator { } else { diff --git a/src/optimizer/rule/simplification.rs b/src/optimizer/rule/normalization/simplification.rs similarity index 81% rename from src/optimizer/rule/simplification.rs rename to src/optimizer/rule/normalization/simplification.rs index 3b5738f4..19e92f3e 100644 --- a/src/optimizer/rule/simplification.rs +++ b/src/optimizer/rule/normalization/simplification.rs @@ -1,9 +1,10 @@ use crate::optimizer::core::pattern::{Pattern, PatternChildrenPredicate}; -use crate::optimizer::core::rule::Rule; +use crate::optimizer::core::rule::{MatchPattern, NormalizationRule}; use crate::optimizer::heuristic::graph::{HepGraph, HepNodeId}; use crate::optimizer::OptimizerError; use crate::planner::operator::join::JoinCondition; use crate::planner::operator::Operator; +use itertools::Itertools; use lazy_static::lazy_static; lazy_static! 
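// Tips: `SimplifyFilter` normalizes predicates so that per-column ranges
// (`ConstantBinary`) can be extracted from them, which is what lets
// `PushPredicateIntoScan` bind index ranges; `ConstantCalculation` folds
// constant sub-expressions across the whole plan.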
{ static ref CONSTANT_CALCULATION_RULE: Pattern = { @@ -67,7 +68,7 @@ impl ConstantCalculation { } _ => (), } - for child_id in graph.children_at(node_id) { + for child_id in graph.children_at(node_id).collect_vec() { Self::_apply(child_id, graph)?; } @@ -75,11 +76,13 @@ impl ConstantCalculation { } } -impl Rule for ConstantCalculation { +impl MatchPattern for ConstantCalculation { fn pattern(&self) -> &Pattern { &CONSTANT_CALCULATION_RULE } +} +impl NormalizationRule for ConstantCalculation { fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { Self::_apply(node_id, graph)?; // mark changed to skip this rule batch @@ -92,11 +95,13 @@ impl Rule for ConstantCalculation { #[derive(Copy, Clone)] pub struct SimplifyFilter; -impl Rule for SimplifyFilter { +impl MatchPattern for SimplifyFilter { fn pattern(&self) -> &Pattern { &SIMPLIFY_FILTER_RULE } +} +impl NormalizationRule for SimplifyFilter { fn apply(&self, node_id: HepNodeId, graph: &mut HepGraph) -> Result<(), OptimizerError> { if let Operator::Filter(mut filter_op) = graph.operator(node_id).clone() { filter_op.predicate.simplify()?; @@ -118,10 +123,11 @@ mod test { use crate::expression::{BinaryOperator, ScalarExpression, UnaryOperator}; use crate::optimizer::heuristic::batch::HepBatchStrategy; use crate::optimizer::heuristic::optimizer::HepOptimizer; - use crate::optimizer::rule::RuleImpl; + use crate::optimizer::rule::normalization::NormalizationRuleImpl; use crate::planner::operator::filter::FilterOperator; use crate::planner::operator::Operator; use crate::planner::LogicalPlan; + use crate::storage::kip::KipTransaction; use crate::types::value::DataValue; use crate::types::LogicalType; use std::collections::Bound; @@ -138,9 +144,12 @@ mod test { .batch( "test_simplification".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::SimplifyFilter, RuleImpl::ConstantCalculation], + vec![ + NormalizationRuleImpl::SimplifyFilter, + NormalizationRuleImpl::ConstantCalculation, + ], ) - .find_best()?; + .find_best::(None)?; if let Operator::Project(project_op) = best_plan.clone().operator { let constant_expr = ScalarExpression::Constant(Arc::new(DataValue::Int32(Some(3)))); if let ScalarExpression::Binary { right_expr, .. 
} = &project_op.exprs[0] { @@ -197,9 +206,9 @@ mod test { .batch( "test_simplify_filter".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::SimplifyFilter], + vec![NormalizationRuleImpl::SimplifyFilter], ) - .find_best()?; + .find_best::(None)?; if let Operator::Filter(filter_op) = best_plan.childrens[0].clone().operator { println!( "{expr}: {:#?}", @@ -241,9 +250,9 @@ mod test { .batch( "test_simplify_filter".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::SimplifyFilter], + vec![NormalizationRuleImpl::SimplifyFilter], ) - .find_best()?; + .find_best::(None)?; if let Operator::Filter(filter_op) = best_plan.childrens[0].clone().operator { let c1_col = ColumnCatalog { summary: ColumnSummary { @@ -322,9 +331,9 @@ mod test { .batch( "test_simplify_filter".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::SimplifyFilter], + vec![NormalizationRuleImpl::SimplifyFilter], ) - .find_best()?; + .find_best::(None)?; if let Operator::Filter(filter_op) = best_plan.childrens[0].clone().operator { println!("{expr}: {:#?}", filter_op); @@ -432,9 +441,9 @@ mod test { .batch( "test_simplify_filter".to_string(), HepBatchStrategy::once_topdown(), - vec![RuleImpl::SimplifyFilter], + vec![NormalizationRuleImpl::SimplifyFilter], ) - .find_best()?; + .find_best::(None)?; if let Operator::Filter(filter_op) = best_plan.childrens[0].clone().operator { println!("{expr}: {:#?}", filter_op); @@ -452,4 +461,73 @@ mod test { Ok(()) } + + #[tokio::test] + async fn test_simplify_filter_multiple_dispersed_same_column_in_or() -> Result<(), DatabaseError> + { + let plan_1 = select_sql_run("select * from t1 where c1 = 4 and c1 > c2 or c1 > 1").await?; + + let op = |plan: LogicalPlan, expr: &str| -> Result, DatabaseError> { + let best_plan = HepOptimizer::new(plan.clone()) + .batch( + "test_simplify_filter".to_string(), + HepBatchStrategy::once_topdown(), + vec![NormalizationRuleImpl::SimplifyFilter], + ) + .find_best::(None)?; + if let Operator::Filter(filter_op) = best_plan.childrens[0].clone().operator { + println!("{expr}: {:#?}", filter_op); + + Ok(Some(filter_op)) + } else { + Ok(None) + } + }; + + let op_1 = op(plan_1, "c1 = 4 and c2 > c1 or c1 > 1")?.unwrap(); + + let cb_1_c1 = op_1.predicate.convert_binary(&0).unwrap(); + println!("op_1 => c1: {:#?}", cb_1_c1); + assert_eq!( + cb_1_c1, + Some(ConstantBinary::Or(vec![ + ConstantBinary::Eq(Arc::new(DataValue::Int32(Some(4)))), + ConstantBinary::Scope { + min: Bound::Excluded(Arc::new(DataValue::Int32(Some(1)))), + max: Bound::Unbounded + } + ])) + ); + + Ok(()) + } + + #[tokio::test] + async fn test_simplify_filter_column_is_null() -> Result<(), DatabaseError> { + let plan_1 = select_sql_run("select * from t1 where c1 is null").await?; + + let op = |plan: LogicalPlan, expr: &str| -> Result, DatabaseError> { + let best_plan = HepOptimizer::new(plan.clone()) + .batch( + "test_simplify_filter".to_string(), + HepBatchStrategy::once_topdown(), + vec![NormalizationRuleImpl::SimplifyFilter], + ) + .find_best::(None)?; + if let Operator::Filter(filter_op) = best_plan.childrens[0].clone().operator { + println!("{expr}: {:#?}", filter_op); + + Ok(Some(filter_op)) + } else { + Ok(None) + } + }; + + let op_1 = op(plan_1, "c1 is null")?.unwrap(); + + let cb_1_c1 = op_1.predicate.convert_binary(&0).unwrap(); + println!("op_1 => c1: {:#?}", cb_1_c1); + + Ok(()) + } } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index deaa0fdf..41b0d4c2 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,6 +1,8 @@ use 
sqlparser::parser::ParserError;
 use sqlparser::{ast::Statement, dialect::PostgreSqlDialect, parser::Parser};
 
+const DIALECT: PostgreSqlDialect = PostgreSqlDialect {};
+
 /// Parse a string to a collection of statements.
 ///
 /// # Example
 ///
@@ -13,7 +15,6 @@ use sqlparser::{ast::Statement, dialect::PostgreSqlDialect, parser::Parser};
 /// let ast = parse_sql(sql).unwrap();
 /// println!("{:?}", ast);
 /// ```
-pub fn parse_sql(sql: &str) -> Result<Vec<Statement>, ParserError> {
-    let dialect = PostgreSqlDialect {};
-    Parser::parse_sql(&dialect, sql)
+pub fn parse_sql<S: AsRef<str>>(sql: S) -> Result<Vec<Statement>, ParserError> {
+    Parser::parse_sql(&DIALECT, sql.as_ref())
 }
diff --git a/src/planner/mod.rs b/src/planner/mod.rs
index 6b0a5243..25106cc0 100644
--- a/src/planner/mod.rs
+++ b/src/planner/mod.rs
@@ -1,12 +1,13 @@
 pub mod operator;
 
 use crate::catalog::TableName;
-use crate::planner::operator::Operator;
+use crate::planner::operator::{Operator, PhysicalOption};
 
 #[derive(Debug, PartialEq, Eq, Clone, Hash)]
 pub struct LogicalPlan {
     pub operator: Operator,
     pub childrens: Vec<LogicalPlan>,
+    pub physical_option: Option<PhysicalOption>,
 }
 
 impl LogicalPlan {
diff --git a/src/planner/operator/aggregate.rs b/src/planner/operator/aggregate.rs
index c6ce8a9d..9017b079 100644
--- a/src/planner/operator/aggregate.rs
+++ b/src/planner/operator/aggregate.rs
@@ -19,6 +19,7 @@ impl AggregateOperator {
                 agg_calls,
             }),
             childrens: vec![children],
+            physical_option: None,
         }
     }
 }
diff --git a/src/planner/operator/analyze.rs b/src/planner/operator/analyze.rs
new file mode 100644
index 00000000..d6ee5c31
--- /dev/null
+++ b/src/planner/operator/analyze.rs
@@ -0,0 +1,7 @@
+use crate::catalog::{ColumnRef, TableName};
+
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub struct AnalyzeOperator {
+    pub table_name: TableName,
+    pub columns: Vec<ColumnRef>,
+}
diff --git a/src/planner/operator/delete.rs b/src/planner/operator/delete.rs
index 04261672..ca8f5b23 100644
--- a/src/planner/operator/delete.rs
+++ b/src/planner/operator/delete.rs
@@ -1,6 +1,8 @@
-use crate::catalog::TableName;
+use crate::catalog::{ColumnRef, TableName};
 
 #[derive(Debug, PartialEq, Eq, Clone, Hash)]
 pub struct DeleteOperator {
     pub table_name: TableName,
+    // for column pruning
+    pub primary_key_column: ColumnRef,
 }
diff --git a/src/planner/operator/filter.rs b/src/planner/operator/filter.rs
index 41fd25aa..372e181a 100644
--- a/src/planner/operator/filter.rs
+++ b/src/planner/operator/filter.rs
@@ -16,6 +16,7 @@ impl FilterOperator {
         LogicalPlan {
             operator: Operator::Filter(FilterOperator { predicate, having }),
             childrens: vec![children],
+            physical_option: None,
         }
     }
 }
diff --git a/src/planner/operator/join.rs b/src/planner/operator/join.rs
index 459c147b..3f2b2122 100644
--- a/src/planner/operator/join.rs
+++ b/src/planner/operator/join.rs
@@ -38,6 +38,7 @@ impl JoinOperator {
         LogicalPlan {
             operator: Operator::Join(JoinOperator { on, join_type }),
             childrens: vec![left, right],
+            physical_option: None,
         }
     }
 }
diff --git a/src/planner/operator/limit.rs b/src/planner/operator/limit.rs
index c72ff1e7..9a2cb8f7 100644
--- a/src/planner/operator/limit.rs
+++ b/src/planner/operator/limit.rs
@@ -17,6 +17,7 @@ impl LimitOperator {
         LogicalPlan {
             operator: Operator::Limit(LimitOperator { offset, limit }),
             childrens: vec![children],
+            physical_option: None,
         }
     }
 }
diff --git a/src/planner/operator/mod.rs b/src/planner/operator/mod.rs
index 5dc28385..5d6a7c98 100644
--- a/src/planner/operator/mod.rs
+++ b/src/planner/operator/mod.rs
@@ -1,5 +1,6 @@
 pub mod aggregate;
 pub mod alter_table;
+pub mod analyze;
 pub mod copy_from_file;
 pub mod
copy_to_file; pub mod create_table; @@ -19,6 +20,7 @@ pub mod values; use crate::catalog::ColumnRef; use crate::planner::operator::alter_table::drop_column::DropColumnOperator; +use crate::planner::operator::analyze::AnalyzeOperator; use crate::planner::operator::copy_from_file::CopyFromFileOperator; use crate::planner::operator::copy_to_file::CopyToFileOperator; use crate::planner::operator::create_table::CreateTableOperator; @@ -30,6 +32,7 @@ use crate::planner::operator::show::ShowTablesOperator; use crate::planner::operator::truncate::TruncateOperator; use crate::planner::operator::update::UpdateOperator; use crate::planner::operator::values::ValuesOperator; +use crate::types::index::IndexInfo; use itertools::Itertools; use self::{ @@ -54,6 +57,7 @@ pub enum Operator { Insert(InsertOperator), Update(UpdateOperator), Delete(DeleteOperator), + Analyze(AnalyzeOperator), // DDL AddColumn(AddColumnOperator), DropColumn(DropColumnOperator), @@ -67,6 +71,34 @@ pub enum Operator { CopyToFile(CopyToFileOperator), } +#[derive(Debug, PartialEq, Eq, Clone, Hash)] +pub enum PhysicalOption { + Dummy, + SimpleAggregate, + HashAggregate, + Filter, + HashJoin, + Project, + SeqScan, + IndexScan(IndexInfo), + RadixSort, + // NormalSort, + Limit, + Values, + Insert, + Update, + Delete, + AddColumn, + DropColumn, + CreateTable, + DropTable, + Truncate, + Show, + CopyFromFile, + CopyToFile, + Analyze, +} + impl Operator { pub fn referenced_columns(&self, only_column_ref: bool) -> Vec { match self { @@ -109,6 +141,8 @@ impl Operator { .flat_map(|expr| expr.referenced_columns(only_column_ref)) .collect_vec(), Operator::Values(op) => op.columns.clone(), + Operator::Analyze(op) => op.columns.clone(), + Operator::Delete(op) => vec![op.primary_key_column.clone()], _ => vec![], } } diff --git a/src/planner/operator/scan.rs b/src/planner/operator/scan.rs index 2782c9d9..9128c59d 100644 --- a/src/planner/operator/scan.rs +++ b/src/planner/operator/scan.rs @@ -1,46 +1,60 @@ use crate::catalog::{TableCatalog, TableName}; -use crate::expression::simplify::ConstantBinary; use crate::expression::ScalarExpression; use crate::planner::LogicalPlan; use crate::storage::Bounds; -use crate::types::index::IndexMetaRef; +use crate::types::index::IndexInfo; +use crate::types::ColumnId; use itertools::Itertools; use super::Operator; #[derive(Debug, PartialEq, Eq, Clone, Hash)] pub struct ScanOperator { - pub index_metas: Vec, - pub table_name: TableName, + pub primary_key: ColumnId, pub columns: Vec, // Support push down limit. pub limit: Bounds, - // IndexScan only // Support push down predicate. // If pre_where is simple predicate, for example: a > 1 then can calculate directly when read data. 
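+    // Every index on the table now carries its own `IndexInfo` slot: `binaries`
+    // is filled by `PushPredicateIntoScan` and costed in the memo, replacing the
+    // single pre-chosen `index_by` below.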
- pub index_by: Option<(IndexMetaRef, Vec)>, + pub index_infos: Vec, } impl ScanOperator { pub fn build(table_name: TableName, table_catalog: &TableCatalog) -> LogicalPlan { + let mut primary_key_option = None; // Fill all Columns in TableCatalog by default let columns = table_catalog .all_columns() .into_iter() - .map(ScalarExpression::ColumnRef) + .map(|column| { + if column.desc.is_primary { + primary_key_option = column.id(); + } + + ScalarExpression::ColumnRef(column) + }) + .collect_vec(); + let index_infos = table_catalog + .indexes + .iter() + .map(|meta| IndexInfo { + meta: meta.clone(), + binaries: None, + }) .collect_vec(); LogicalPlan { operator: Operator::Scan(ScanOperator { - index_metas: table_catalog.indexes.clone(), + index_infos, table_name, + primary_key: primary_key_option.unwrap(), columns, limit: (None, None), - index_by: None, }), childrens: vec![], + physical_option: None, } } } diff --git a/src/storage/kip.rs b/src/storage/kip.rs index 2025cdfe..197248b6 100644 --- a/src/storage/kip.rs +++ b/src/storage/kip.rs @@ -1,5 +1,6 @@ -use crate::catalog::{ColumnCatalog, ColumnRef, TableCatalog, TableName}; +use crate::catalog::{ColumnCatalog, ColumnRef, TableCatalog, TableMeta, TableName}; use crate::expression::simplify::ConstantBinary; +use crate::optimizer::core::column_meta::{ColumnMeta, ColumnMetaLoader}; use crate::storage::table_codec::TableCodec; use crate::storage::{ tuple_projection, Bounds, IndexIter, Iter, Projections, Storage, StorageError, Transaction, @@ -21,6 +22,7 @@ use std::sync::Arc; #[derive(Clone)] pub struct KipStorage { pub inner: Arc, + pub(crate) meta_cache: Arc>>, } impl KipStorage { @@ -28,9 +30,11 @@ impl KipStorage { let storage = storage::KipStorage::open_with_config(Config::new(path).enable_level_0_memorization()) .await?; + let meta_cache = Arc::new(ShardingLruCache::new(128, 16, RandomState::new()).unwrap()); Ok(KipStorage { inner: Arc::new(storage), + meta_cache, }) } } @@ -43,14 +47,16 @@ impl Storage for KipStorage { Ok(KipTransaction { tx, - cache: ShardingLruCache::new(8, 2, RandomState::default())?, + table_cache: ShardingLruCache::new(8, 2, RandomState::default())?, + meta_cache: self.meta_cache.clone(), }) } } pub struct KipTransaction { tx: mvcc::Transaction, - cache: ShardingLruCache, + table_cache: ShardingLruCache, + meta_cache: Arc>>, } impl Transaction for KipTransaction { @@ -176,11 +182,11 @@ impl Transaction for KipTransaction { for col in catalog.all_columns() { if col.name() == column.name() { - if if_not_exists { - return Ok(col.id().unwrap()); + return if if_not_exists { + Ok(col.id().unwrap()) } else { - return Err(StorageError::DuplicateColumn); - } + Err(StorageError::DuplicateColumn) + }; } } @@ -200,7 +206,7 @@ impl Transaction for KipTransaction { let column = catalog.get_column_by_id(&col_id).unwrap(); let (key, value) = TableCodec::encode_column(&table_name, column)?; self.tx.set(key, value); - self.cache.remove(table_name); + self.table_cache.remove(table_name); Ok(col_id) } else { @@ -235,7 +241,7 @@ impl Transaction for KipTransaction { } err => err?, } - self.cache.remove(table_name); + self.table_cache.remove(table_name); Ok(()) } else { @@ -249,7 +255,8 @@ impl Transaction for KipTransaction { columns: Vec, if_not_exists: bool, ) -> Result { - let (table_key, value) = TableCodec::encode_root_table(&table_name)?; + let (table_key, value) = + TableCodec::encode_root_table(&TableMeta::empty(table_name.clone()))?; if self.tx.get(&table_key)?.is_some() { if if_not_exists { return Ok(table_name); @@ -266,7 
diff --git a/src/storage/kip.rs b/src/storage/kip.rs
index 2025cdfe..197248b6 100644
--- a/src/storage/kip.rs
+++ b/src/storage/kip.rs
@@ -1,5 +1,6 @@
-use crate::catalog::{ColumnCatalog, ColumnRef, TableCatalog, TableName};
+use crate::catalog::{ColumnCatalog, ColumnRef, TableCatalog, TableMeta, TableName};
 use crate::expression::simplify::ConstantBinary;
+use crate::optimizer::core::column_meta::{ColumnMeta, ColumnMetaLoader};
 use crate::storage::table_codec::TableCodec;
 use crate::storage::{
     tuple_projection, Bounds, IndexIter, Iter, Projections, Storage, StorageError, Transaction,
@@ -21,6 +22,7 @@ use std::sync::Arc;
 #[derive(Clone)]
 pub struct KipStorage {
     pub inner: Arc<storage::KipStorage>,
+    pub(crate) meta_cache: Arc<ShardingLruCache<TableName, Vec<ColumnMeta>>>,
 }

 impl KipStorage {
@@ -28,9 +30,11 @@ impl KipStorage {
         let storage =
             storage::KipStorage::open_with_config(Config::new(path).enable_level_0_memorization())
                 .await?;
+        let meta_cache = Arc::new(ShardingLruCache::new(128, 16, RandomState::new()).unwrap());

         Ok(KipStorage {
             inner: Arc::new(storage),
+            meta_cache,
         })
     }
 }
@@ -43,14 +47,16 @@ impl Storage for KipStorage {

         Ok(KipTransaction {
             tx,
-            cache: ShardingLruCache::new(8, 2, RandomState::default())?,
+            table_cache: ShardingLruCache::new(8, 2, RandomState::default())?,
+            meta_cache: self.meta_cache.clone(),
         })
     }
 }

 pub struct KipTransaction {
     tx: mvcc::Transaction,
-    cache: ShardingLruCache<String, TableCatalog>,
+    table_cache: ShardingLruCache<String, TableCatalog>,
+    meta_cache: Arc<ShardingLruCache<TableName, Vec<ColumnMeta>>>,
 }

 impl Transaction for KipTransaction {
@@ -176,11 +182,11 @@ impl Transaction for KipTransaction {

         for col in catalog.all_columns() {
             if col.name() == column.name() {
-                if if_not_exists {
-                    return Ok(col.id().unwrap());
+                return if if_not_exists {
+                    Ok(col.id().unwrap())
                 } else {
-                    return Err(StorageError::DuplicateColumn);
-                }
+                    Err(StorageError::DuplicateColumn)
+                };
             }
         }
@@ -200,7 +206,7 @@ impl Transaction for KipTransaction {
             let column = catalog.get_column_by_id(&col_id).unwrap();
             let (key, value) = TableCodec::encode_column(&table_name, column)?;
             self.tx.set(key, value);
-            self.cache.remove(table_name);
+            self.table_cache.remove(table_name);

             Ok(col_id)
         } else {
@@ -235,7 +241,7 @@ impl Transaction for KipTransaction {
                 }
                 err => err?,
             }
-            self.cache.remove(table_name);
+            self.table_cache.remove(table_name);

             Ok(())
         } else {
@@ -249,7 +255,8 @@ impl Transaction for KipTransaction {
         columns: Vec<ColumnCatalog>,
         if_not_exists: bool,
     ) -> Result<TableName, StorageError> {
-        let (table_key, value) = TableCodec::encode_root_table(&table_name)?;
+        let (table_key, value) =
+            TableCodec::encode_root_table(&TableMeta::empty(table_name.clone()))?;
         if self.tx.get(&table_key)?.is_some() {
             if if_not_exists {
                 return Ok(table_name);
@@ -266,7 +273,7 @@ impl Transaction for KipTransaction {
             let (key, value) = TableCodec::encode_column(&table_name, column)?;
             self.tx.set(key, value);
         }
-        self.cache.put(table_name.to_string(), table_catalog);
+        self.table_cache.put(table_name.to_string(), table_catalog);

         Ok(table_name)
     }
@@ -290,7 +297,7 @@ impl Transaction for KipTransaction {
         self.tx
             .remove(&TableCodec::encode_root_table_key(table_name))?;
-        let _ = self.cache.remove(&table_name.to_string());
+        let _ = self.table_cache.remove(&table_name.to_string());

         Ok(())
     }
@@ -306,18 +313,15 @@ impl Transaction for KipTransaction {
     }

     fn table(&self, table_name: TableName) -> Option<&TableCatalog> {
-        let mut option = self.cache.get(&table_name);
+        let mut option = self.table_cache.get(&table_name);

         if option.is_none() {
             // TODO: unify the data into a `Meta` prefix and use one iteration to collect all data
-            let columns = Self::column_collect(table_name.clone(), &self.tx).ok()?;
-            let indexes = Self::index_meta_collect(&table_name, &self.tx)?;
+            let (columns, indexes) = Self::table_collect(table_name.clone(), &self.tx).ok()?;

-            if let Ok(catalog) =
-                TableCatalog::new_with_indexes(table_name.clone(), columns, indexes)
-            {
+            if let Ok(catalog) = TableCatalog::reload(table_name.clone(), columns, indexes) {
                 option = self
-                    .cache
+                    .table_cache
                     .get_or_insert(table_name.to_string(), |_| Ok(catalog))
                     .ok();
             }
@@ -326,20 +330,48 @@ impl Transaction for KipTransaction {
         option
     }

-    fn show_tables(&self) -> Result<Vec<String>, StorageError> {
-        let mut tables = vec![];
+    fn table_metas(&self) -> Result<Vec<TableMeta>, StorageError> {
+        let mut metas = vec![];
         let (min, max) = TableCodec::root_table_bound();
         let mut iter = self.tx.iter(Bound::Included(&min), Bound::Included(&max))?;

         while let Some((_, value_option)) = iter.try_next().ok().flatten() {
             if let Some(value) = value_option {
-                let table_name = TableCodec::decode_root_table(&value)?;
+                let meta = TableCodec::decode_root_table(&value)?;

-                tables.push(table_name);
+                metas.push(meta);
             }
         }

-        Ok(tables)
+        Ok(metas)
+    }
+
+    fn save_table_meta(&mut self, table_meta: &TableMeta) -> Result<(), StorageError> {
+        let _ = self.meta_cache.remove(&table_meta.table_name);
+        let (key, value) = TableCodec::encode_root_table(table_meta)?;
+        self.tx.set(key, value);
+
+        Ok(())
+    }
+
+    fn column_meta_paths(&self, table_name: &str) -> Result<Vec<String>, StorageError> {
+        if let Some(bytes) = self
+            .tx
+            .get(&TableCodec::encode_root_table_key(table_name))?
+        {
+            let meta = TableCodec::decode_root_table(&bytes)?;
+
+            return Ok(meta.colum_meta_paths);
+        }
+
+        Ok(vec![])
+    }
+
+    fn meta_loader(&self) -> ColumnMetaLoader<Self>
+    where
+        Self: Sized,
+    {
+        ColumnMetaLoader::new(self, &self.meta_cache)
+    }

     async fn commit(self) -> Result<(), StorageError> {
@@ -350,41 +382,28 @@ impl Transaction for KipTransaction {
 }

 impl KipTransaction {
-    fn column_collect(
+    fn table_collect(
         table_name: TableName,
         tx: &mvcc::Transaction,
-    ) -> Result<Vec<ColumnCatalog>, StorageError> {
-        let (column_min, column_max) = TableCodec::columns_bound(&table_name);
-        let mut column_iter =
-            tx.iter(Bound::Included(&column_min), Bound::Included(&column_max))?;
+    ) -> Result<(Vec<ColumnCatalog>, Vec<IndexMetaRef>), StorageError> {
+        let (table_min, table_max) = TableCodec::table_bound(&table_name);
+        let mut column_iter = tx.iter(Bound::Included(&table_min), Bound::Included(&table_max))?;

-        let mut columns = vec![];
+        let mut columns = Vec::new();
+        let mut index_metas = Vec::new();

-        while let Some((_, value_option)) = column_iter.try_next().ok().flatten() {
+        // Tips: only `Column`, `IndexMeta` and `TableMeta` entries fall in this range
+        while let Some((key, value_option)) = column_iter.try_next().ok().flatten() {
             if let Some(value) = value_option {
-                columns.push(TableCodec::decode_column(&value)?);
-            }
-        }
-
-        Ok(columns)
-    }
-
-    fn index_meta_collect(name: &str, tx: &mvcc::Transaction) -> Option<Vec<IndexMetaRef>> {
-        let (index_min, index_max) = TableCodec::index_meta_bound(name);
-        let mut index_metas = vec![];
-        let mut index_iter = tx
-            .iter(Bound::Included(&index_min), Bound::Included(&index_max))
-            .ok()?;
-
-        while let Some((_, value_option)) = index_iter.try_next().ok().flatten() {
-            if let Some(value) = value_option {
-                if let Ok(index_meta) = TableCodec::decode_index_meta(&value) {
-                    index_metas.push(Arc::new(index_meta));
+                if key.starts_with(&table_min) {
+                    columns.push(TableCodec::decode_column(&value)?);
+                } else {
+                    index_metas.push(Arc::new(TableCodec::decode_index_meta(&value)?));
                 }
             }
         }

-        Some(index_metas)
+        Ok((columns, index_metas))
     }

     fn _drop_data(tx: &mut mvcc::Transaction, min: &[u8], max: &[u8]) -> Result<(), StorageError> {
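`table_collect` replaces two separate key-range scans (`column_collect` and `index_meta_collect`) with a single scan over `TableCodec::table_bound`, dispatching on the key prefix to decide how to decode each entry. The idea in isolation, with plain byte vectors standing in for the real codec keys:

```rust
// Minimal sketch of the prefix dispatch inside `table_collect`: entries whose
// key starts with the column prefix decode as columns; everything else in the
// range is index metadata.
fn split_by_prefix(
    entries: Vec<(Vec<u8>, Vec<u8>)>,
    column_prefix: &[u8],
) -> (Vec<Vec<u8>>, Vec<Vec<u8>>) {
    let mut columns = Vec::new();
    let mut index_metas = Vec::new();

    for (key, value) in entries {
        if key.starts_with(column_prefix) {
            columns.push(value); // TableCodec::decode_column in the patch
        } else {
            index_metas.push(value); // TableCodec::decode_index_meta in the patch
        }
    }
    (columns, index_metas)
}

fn main() {
    let entries = vec![
        (b"t1_col_a".to_vec(), b"a".to_vec()),
        (b"t1_idx_pk".to_vec(), b"pk".to_vec()),
    ];
    let (columns, index_metas) = split_by_prefix(entries, b"t1_col_");
    assert_eq!((columns.len(), index_metas.len()), (1, 1));
}
```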
diff --git a/src/storage/mod.rs b/src/storage/mod.rs
index 818282a7..72e76ca1 100644
--- a/src/storage/mod.rs
+++ b/src/storage/mod.rs
@@ -1,9 +1,10 @@
 pub mod kip;
 mod table_codec;

-use crate::catalog::{CatalogError, ColumnCatalog, TableCatalog, TableName};
+use crate::catalog::{CatalogError, ColumnCatalog, TableCatalog, TableMeta, TableName};
 use crate::expression::simplify::ConstantBinary;
 use crate::expression::ScalarExpression;
+use crate::optimizer::core::column_meta::ColumnMetaLoader;
 use crate::storage::table_codec::TableCodec;
 use crate::types::errors::TypeError;
 use crate::types::index::{Index, IndexMetaRef};
@@ -93,8 +94,12 @@ pub trait Transaction: Sync + Send + 'static {
     fn drop_table(&mut self, table_name: &str, if_exists: bool) -> Result<(), StorageError>;
     fn drop_data(&mut self, table_name: &str) -> Result<(), StorageError>;
     fn table(&self, table_name: TableName) -> Option<&TableCatalog>;
-
-    fn show_tables(&self) -> Result<Vec<String>, StorageError>;
+    fn table_metas(&self) -> Result<Vec<TableMeta>, StorageError>;
+    fn save_table_meta(&mut self, table_meta: &TableMeta) -> Result<(), StorageError>;
+    fn column_meta_paths(&self, table_name: &str) -> Result<Vec<String>, StorageError>;
+    fn meta_loader(&self) -> ColumnMetaLoader<Self>
+    where
+        Self: Sized;

     #[allow(async_fn_in_trait)]
     async fn commit(self) -> Result<(), StorageError>;
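The `Transaction` trait now carries the statistics plumbing: `table_metas` replaces `show_tables` (so `SHOW TABLES` reads names out of the returned `TableMeta`s), `save_table_meta` lets `ANALYZE` persist the paths of serialized column statistics, and `meta_loader` hands executors a `ColumnMetaLoader` backed by the shared `meta_cache`. A sketch of the load-through caching the loader presumably performs; the cache-then-disk behavior is an assumption, since only the constructor appears in this patch, and the types here are plain stand-ins rather than `ShardingLruCache`/`ColumnMeta`:

```rust
use std::collections::HashMap;

// Assumed behavior of ColumnMetaLoader: consult the shared cache first and
// fall back to deserializing the files recorded in TableMeta.
struct Loader {
    cache: HashMap<String, Vec<String>>, // table name -> loaded "ColumnMeta"s
}

impl Loader {
    fn load(&mut self, table: &str, colum_meta_paths: &[String]) -> &[String] {
        // `colum_meta_paths` is spelled as in the patch's TableMeta field.
        self.cache.entry(table.to_string()).or_insert_with(|| {
            // Stand-in for reading and deserializing each ColumnMeta file.
            colum_meta_paths.iter().map(|path| format!("meta@{path}")).collect()
        })
    }
}
```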
diff --git a/src/storage/table_codec.rs b/src/storage/table_codec.rs
index d9f415e1..c4ce78e7 100644
--- a/src/storage/table_codec.rs
+++ b/src/storage/table_codec.rs
@@ -1,4 +1,4 @@
-use crate::catalog::{ColumnCatalog, ColumnRef};
+use crate::catalog::{ColumnCatalog, ColumnRef, TableMeta};
 use crate::types::errors::TypeError;
 use crate::types::index::{Index, IndexId, IndexMeta};
 use crate::types::tuple::{Tuple, TupleId};
@@ -40,10 +40,10 @@ impl TableCodec {
                 table_bytes.push(b'1');
             }
             CodecType::Index => {
-                table_bytes.push(b'2');
+                table_bytes.push(b'3');
             }
             CodecType::Tuple => {
-                table_bytes.push(b'3');
+                table_bytes.push(b'8');
             }
             CodecType::Root => {
                 let mut bytes = ROOT_BYTES.clone();
@@ -114,6 +114,16 @@ impl TableCodec {
         (op(BOUND_MIN_TAG), op(BOUND_MAX_TAG))
     }

+    pub fn table_bound(table_name: &str) -> (Vec<u8>, Vec<u8>) {
+        let mut column_prefix = Self::key_prefix(CodecType::Column, table_name);
+        column_prefix.push(BOUND_MIN_TAG);
+
+        let mut index_prefix = Self::key_prefix(CodecType::IndexMeta, table_name);
+        index_prefix.push(BOUND_MAX_TAG);
+
+        (column_prefix, index_prefix)
+    }
+
     pub fn columns_bound(table_name: &str) -> (Vec<u8>, Vec<u8>) {
         let op = |bound_id| {
             let mut key_prefix = Self::key_prefix(CodecType::Column, table_name);
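The tag renumbering and the new `table_bound` work together: within a table's keyspace, the `Column` and `IndexMeta` prefixes must sort adjacently so that one range covers both (the range `table_collect` iterates), while `Tuple` data moves to tag `b'8'` so bulk rows sort after all metadata. A toy model of that ordering; the exact prefix layout and the `Column`/`IndexMeta` tag bytes are assumptions, since only the `Index` and `Tuple` tags are visible in this hunk:

```rust
// Toy key layout: {table}{tag}{suffix}. Assumed tags: Column = b'0',
// IndexMeta = b'1'; the patch only shows Index = b'3' and Tuple = b'8'.
fn key(table: &str, tag: u8, suffix: &str) -> Vec<u8> {
    let mut key = table.as_bytes().to_vec();
    key.push(tag);
    key.extend_from_slice(suffix.as_bytes());
    key
}

fn main() {
    let column = key("t1", b'0', "c1");
    let index_meta = key("t1", b'1', "i1");
    let tuple = key("t1", b'8', "000001");

    // A single ordered scan from the Column lower bound to the IndexMeta
    // upper bound sees table metadata but never tuple data.
    assert!(column < index_meta && index_meta < tuple);
}
```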
@@ -245,27 +255,24 @@ impl TableCodec {

     /// Key: Root{BOUND_MIN_TAG}{TableName}
     /// Value: TableName
-    pub fn encode_root_table(table_name: &str) -> Result<(Bytes, Bytes), TypeError> {
-        let key = Self::encode_root_table_key(table_name);
+    pub fn encode_root_table(meta: &TableMeta) -> Result<(Bytes, Bytes), TypeError> {
+        let key = Self::encode_root_table_key(&meta.table_name);

-        Ok((
-            Bytes::from(key),
-            Bytes::from(table_name.to_owned().into_bytes()),
-        ))
+        Ok((Bytes::from(key), Bytes::from(bincode::serialize(meta)?)))
     }

     pub fn encode_root_table_key(table_name: &str) -> Vec<u8> {
         Self::key_prefix(CodecType::Root, table_name)
     }

-    pub fn decode_root_table(bytes: &[u8]) -> Result<String, TypeError> {
-        Ok(String::from_utf8(bytes.to_vec())?)
+    pub fn decode_root_table(bytes: &[u8]) -> Result<TableMeta, TypeError> {
+        Ok(bincode::deserialize(bytes)?)
     }
 }

 #[cfg(test)]
 mod tests {
-    use crate::catalog::{ColumnCatalog, ColumnDesc, TableCatalog};
+    use crate::catalog::{ColumnCatalog, ColumnDesc, TableCatalog, TableMeta};
     use crate::storage::table_codec::TableCodec;
     use crate::types::errors::TypeError;
     use crate::types::index::{Index, IndexMeta};
@@ -322,11 +329,16 @@ mod tests {
     #[test]
     fn test_root_catalog() {
         let table_catalog = build_table_codec();
-        let (_, bytes) = TableCodec::encode_root_table(&table_catalog.name).unwrap();
+        let (_, bytes) = TableCodec::encode_root_table(&TableMeta {
+            colum_meta_paths: vec![],
+            table_name: table_catalog.name.clone(),
+        })
+        .unwrap();

-        let table_name = TableCodec::decode_root_table(&bytes).unwrap();
+        let table_meta = TableCodec::decode_root_table(&bytes).unwrap();

-        assert_eq!(table_name, table_catalog.name.as_str());
+        assert_eq!(table_meta.table_name.as_str(), table_catalog.name.as_str());
+        assert!(table_meta.colum_meta_paths.is_empty());
     }

     #[test]
diff --git a/src/types/errors.rs b/src/types/errors.rs
index 255d93b4..55dbb2b7 100644
--- a/src/types/errors.rs
+++ b/src/types/errors.rs
@@ -7,7 +7,7 @@ use std::string::FromUtf8Error;
 pub enum TypeError {
     #[error("invalid type")]
     InvalidType,
-    #[error("Must contain PrimaryKey!")]
+    #[error("must contain PrimaryKey!")]
     PrimaryKeyNotFound,
     #[error("not implemented sqlparser datatype: {0}")]
     NotImplementedSqlparserDataType(String),
@@ -15,7 +15,7 @@ pub enum TypeError {
     InternalError(String),
     #[error("cast fail")]
     CastFail,
-    #[error("Too long")]
+    #[error("too long")]
     TooLong,
     #[error("cannot be Null")]
     NotNull,
@@ -69,4 +69,10 @@ pub enum TypeError {
     ),
     #[error("{0} and {1} do not match")]
     MisMatch(String, String),
+    #[error("io")]
+    IO(
+        #[source]
+        #[from]
+        std::io::Error,
+    ),
 }
diff --git a/src/types/index.rs b/src/types/index.rs
index f6660129..b5cffd5f 100644
--- a/src/types/index.rs
+++ b/src/types/index.rs
@@ -1,3 +1,4 @@
+use crate::expression::simplify::ConstantBinary;
 use crate::types::value::ValueRef;
 use crate::types::ColumnId;
 use serde::{Deserialize, Serialize};
@@ -6,6 +7,12 @@ use std::sync::Arc;
 pub type IndexId = u32;
 pub type IndexMetaRef = Arc<IndexMeta>;

+#[derive(Debug, Clone, Eq, PartialEq, Hash)]
+pub struct IndexInfo {
+    pub(crate) meta: IndexMetaRef,
+    pub(crate) binaries: Option<Vec<ConstantBinary>>,
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq, Hash)]
 pub struct IndexMeta {
     pub id: IndexId,
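`IndexInfo` is the unit the optimizer appears to reason about per index: the immutable `IndexMeta` plus optional `ConstantBinary` ranges. `None` presumably means no usable predicate reached this index, leaving only `SeqScan`, while `Some(ranges)` makes the index a costed `IndexScan` candidate. A one-liner capturing that contract with a generic stand-in:

```rust
// Stand-in predicate mirroring how `binaries` gates IndexScan candidacy:
// only a non-empty set of simplified ranges makes the index worth costing.
fn index_scan_candidate<T>(binaries: &Option<Vec<T>>) -> bool {
    matches!(binaries, Some(ranges) if !ranges.is_empty())
}

fn main() {
    assert!(!index_scan_candidate::<i64>(&None));
    assert!(!index_scan_candidate(&Some(Vec::<i64>::new())));
    assert!(index_scan_candidate(&Some(vec![1_i64])));
}
```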
diff --git a/src/types/value.rs b/src/types/value.rs
index 97d5e4e9..720bdaf7 100644
--- a/src/types/value.rs
+++ b/src/types/value.rs
@@ -8,11 +8,11 @@ use std::fmt::Formatter;
 use std::hash::Hash;
 use std::str::FromStr;
 use std::sync::Arc;
-use std::{fmt, mem};
+use std::{cmp, fmt, mem};

 use crate::types::errors::TypeError;
 use ordered_float::OrderedFloat;
-use rust_decimal::prelude::FromPrimitive;
+use rust_decimal::prelude::{FromPrimitive, ToPrimitive};
 use serde::{Deserialize, Serialize};

 use super::LogicalType;
@@ -606,6 +606,7 @@ impl DataValue {
             },
             DataValue::Float64(value) => match to {
                 LogicalType::SqlNull => Ok(DataValue::Null),
+                LogicalType::Float => Ok(DataValue::Float32(value.map(|v| v as f32))),
                 LogicalType::Double => Ok(DataValue::Float64(value)),
                 LogicalType::Varchar(len) => varchar_cast!(value, len),
                 LogicalType::Decimal(_, option) => Ok(DataValue::Decimal(
@@ -687,6 +688,7 @@ impl DataValue {
                 }
                 LogicalType::Integer => Ok(DataValue::Int32(value)),
                 LogicalType::Bigint => Ok(DataValue::Int64(value.map(|v| v.into()))),
+                LogicalType::Float => Ok(DataValue::Float32(value.map(|v| v as f32))),
                 LogicalType::Double => Ok(DataValue::Float64(value.map(|v| v.into()))),
                 LogicalType::Varchar(len) => varchar_cast!(value, len),
                 LogicalType::Decimal(_, option) => Ok(DataValue::Decimal(value.map(|v| {
@@ -710,6 +712,8 @@ impl DataValue {
                 Ok(DataValue::UInt64(value.map(u64::try_from).transpose()?))
             }
             LogicalType::Bigint => Ok(DataValue::Int64(value)),
+            LogicalType::Float => Ok(DataValue::Float32(value.map(|v| v as f32))),
+            LogicalType::Double => Ok(DataValue::Float64(value.map(|v| v as f64))),
             LogicalType::Varchar(len) => varchar_cast!(value, len),
             LogicalType::Decimal(_, option) => Ok(DataValue::Decimal(value.map(|v| {
                 let mut decimal = Decimal::from(v);
@@ -762,6 +766,7 @@ impl DataValue {
             LogicalType::UInteger => Ok(DataValue::UInt32(value)),
             LogicalType::Bigint => Ok(DataValue::Int64(value.map(|v| v.into()))),
             LogicalType::UBigint => Ok(DataValue::UInt64(value.map(|v| v.into()))),
+            LogicalType::Float => Ok(DataValue::Float32(value.map(|v| v as f32))),
             LogicalType::Double => Ok(DataValue::Float64(value.map(|v| v.into()))),
             LogicalType::Varchar(len) => varchar_cast!(value, len),
             LogicalType::Decimal(_, option) => Ok(DataValue::Decimal(value.map(|v| {
@@ -775,6 +780,8 @@ impl DataValue {
             DataValue::UInt64(value) => match to {
                 LogicalType::SqlNull => Ok(DataValue::Null),
                 LogicalType::UBigint => Ok(DataValue::UInt64(value)),
+                LogicalType::Float => Ok(DataValue::Float32(value.map(|v| v as f32))),
+                LogicalType::Double => Ok(DataValue::Float64(value.map(|v| v as f64))),
                 LogicalType::Varchar(len) => varchar_cast!(value, len),
                 LogicalType::Decimal(_, option) => Ok(DataValue::Decimal(value.map(|v| {
                     let mut decimal = Decimal::from(v);
@@ -880,6 +887,8 @@ impl DataValue {
             },
             DataValue::Decimal(value) => match to {
                 LogicalType::SqlNull => Ok(DataValue::Null),
+                LogicalType::Float => Ok(DataValue::Float32(value.and_then(|v| v.to_f32()))),
+                LogicalType::Double => Ok(DataValue::Float64(value.and_then(|v| v.to_f64()))),
                 LogicalType::Decimal(_, _) => Ok(DataValue::Decimal(value)),
                 LogicalType::Varchar(len) => varchar_cast!(value, len),
                 _ => Err(TypeError::CastFail),
@@ -887,6 +896,31 @@ impl DataValue {
         }
     }

+    pub fn common_prefix_length(&self, target: &DataValue) -> Option<usize> {
+        if self.is_null() && target.is_null() {
+            return Some(0);
+        }
+        if self.is_null() || target.is_null() {
+            return None;
+        }
+
+        if let (DataValue::Utf8(Some(v1)), DataValue::Utf8(Some(v2))) = (self, target) {
+            let min_len = cmp::min(v1.len(), v2.len());
+
+            let mut v1_iter = v1.get(0..min_len).unwrap().chars();
+            let mut v2_iter = v2.get(0..min_len).unwrap().chars();
+
+            for i in 0..min_len {
+                if v1_iter.next() != v2_iter.next() {
+                    return Some(i);
+                }
+            }
+
+            return Some(min_len);
+        }
+        Some(0)
+    }
+
     fn decimal_round_i(option: &Option<u8>, decimal: &mut Decimal) {
         if let Some(scale) = option {
             let new_decimal = decimal.trunc_with_scale(*scale as u32);
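`common_prefix_length` appears to serve the histogram's string buckets: when a range condition partially overlaps a bucket of `Utf8` values, the shared prefix gives a crude handle on how far into the bucket a bound falls (the commit message's BaikalDB-style range estimation). A self-contained illustration of the underlying computation, using plain `&str` and omitting the patch's NULL handling (which returns `Some(0)` for two NULLs and `None` when only one side is NULL):

```rust
// Byte-wise common prefix, the core of DataValue::common_prefix_length.
fn common_prefix_length(a: &str, b: &str) -> usize {
    a.bytes().zip(b.bytes()).take_while(|(x, y)| x == y).count()
}

fn main() {
    assert_eq!(common_prefix_length("applepie", "applesauce"), 5); // "apple"
    assert_eq!(common_prefix_length("kip", "sql"), 0); // nothing shared
}
```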
diff --git a/tests/slt/analyze.slt b/tests/slt/analyze.slt
new file mode 100644
index 00000000..79610b4b
--- /dev/null
+++ b/tests/slt/analyze.slt
@@ -0,0 +1,48 @@
+statement ok
+create table t(id int primary key, v1 bigint null, v2 varchar null, v3 decimal null)
+
+statement ok
+insert into t values (0,1,10,100)
+
+statement ok
+insert into t values (1,1,10,100), (2,2,20,200), (3,3,30,300), (4,4,40,400)
+
+statement ok
+insert into t(id, v1, v2, v3) values (5,1,10,100)
+
+statement ok
+insert into t(id, v1, v2) values (6,1,10)
+
+statement ok
+insert into t(id, v2, v1) values (7,1,10)
+
+statement error
+insert into t(id, v1, v2, v3) values (0)
+
+statement error
+insert into t(id, v1, v2, v3) values (0, 0)
+
+statement error
+insert into t(id, v1, v2, v3) values (0, 0, 0)
+
+statement ok
+insert into t values (8,NULL,NULL,NULL)
+
+query IIII rowsort
+select * from t
+----
+0 1 10 100
+1 1 10 100
+2 2 20 200
+3 3 30 300
+4 4 40 400
+5 1 10 100
+6 1 10 null
+7 10 1 null
+8 null null null
+
+statement ok
+analyze table t
+
+statement ok
+drop table t
\ No newline at end of file