diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index 489ccead54a8..c251d643ffdd 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -212,7 +212,14 @@ jobs: path: . - name: Unzip binaries run: tar -xvf ./bins.tar.gz - - name: Fuzz Test + - name: Build Fuzz Test + shell: bash + run: | + cd tests-fuzz && + cargo install cargo-gc-bin && + cargo gc && + cd .. + - name: Run Fuzz Test uses: ./.github/actions/fuzz-test env: CUSTOM_LIBFUZZER_PATH: /usr/lib/llvm-14/lib/libFuzzer.a diff --git a/Cargo.lock b/Cargo.lock index bd19aabd63d1..b594cfeff36a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -214,7 +214,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c" [[package]] name = "api" -version = "0.8.0" +version = "0.8.1" dependencies = [ "common-base", "common-decimal", @@ -703,7 +703,7 @@ dependencies = [ [[package]] name = "auth" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "async-trait", @@ -877,7 +877,7 @@ dependencies = [ [[package]] name = "benchmarks" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "arrow", @@ -1218,7 +1218,7 @@ dependencies = [ [[package]] name = "cache" -version = "0.8.0" +version = "0.8.1" dependencies = [ "catalog", "common-error", @@ -1226,6 +1226,7 @@ dependencies = [ "common-meta", "moka", "snafu 0.8.3", + "substrait 0.8.1", ] [[package]] @@ -1252,13 +1253,15 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "catalog" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "arrow", "arrow-schema", "async-stream", "async-trait", + "bytes", + "cache", "catalog", "chrono", "common-catalog", @@ -1532,7 +1535,7 @@ checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "client" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "arc-swap", @@ -1561,7 +1564,7 @@ dependencies = [ "serde_json", "snafu 0.8.3", "substrait 0.17.1", - "substrait 0.8.0", + "substrait 0.8.1", "tokio", "tokio-stream", "tonic 0.11.0", @@ -1591,7 +1594,7 @@ dependencies = [ [[package]] name = "cmd" -version = "0.8.0" +version = "0.8.1" dependencies = [ "async-trait", "auth", @@ -1647,7 +1650,7 @@ dependencies = [ "session", "snafu 0.8.3", "store-api", - "substrait 0.8.0", + "substrait 0.8.1", "table", "temp-env", "tempfile", @@ -1692,7 +1695,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335" [[package]] name = "common-base" -version = "0.8.0" +version = "0.8.1" dependencies = [ "anymap", "bitvec", @@ -1708,7 +1711,7 @@ dependencies = [ [[package]] name = "common-catalog" -version = "0.8.0" +version = "0.8.1" dependencies = [ "chrono", "common-error", @@ -1719,7 +1722,7 @@ dependencies = [ [[package]] name = "common-config" -version = "0.8.0" +version = "0.8.1" dependencies = [ "common-base", "common-error", @@ -1742,7 +1745,7 @@ dependencies = [ [[package]] name = "common-datasource" -version = "0.8.0" +version = "0.8.1" dependencies = [ "arrow", "arrow-schema", @@ -1774,7 +1777,7 @@ dependencies = [ [[package]] name = "common-decimal" -version = "0.8.0" +version = "0.8.1" dependencies = [ "bigdecimal", "common-error", @@ -1787,7 +1790,7 @@ dependencies = [ [[package]] name = "common-error" -version = "0.8.0" +version = "0.8.1" dependencies = [ "snafu 0.8.3", "strum 0.25.0", @@ -1795,7 +1798,7 @@ dependencies = [ [[package]] name = "common-frontend" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "async-trait", @@ -1810,7 +1813,7 @@
dependencies = [ [[package]] name = "common-function" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "arc-swap", @@ -1843,7 +1846,7 @@ dependencies = [ [[package]] name = "common-greptimedb-telemetry" -version = "0.8.0" +version = "0.8.1" dependencies = [ "async-trait", "common-runtime", @@ -1860,7 +1863,7 @@ dependencies = [ [[package]] name = "common-grpc" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "arrow-flight", @@ -1886,7 +1889,7 @@ dependencies = [ [[package]] name = "common-grpc-expr" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "common-base", @@ -1903,7 +1906,7 @@ dependencies = [ [[package]] name = "common-macro" -version = "0.8.0" +version = "0.8.1" dependencies = [ "arc-swap", "common-query", @@ -1918,7 +1921,7 @@ dependencies = [ [[package]] name = "common-mem-prof" -version = "0.8.0" +version = "0.8.1" dependencies = [ "common-error", "common-macro", @@ -1931,7 +1934,7 @@ dependencies = [ [[package]] name = "common-meta" -version = "0.8.0" +version = "0.8.1" dependencies = [ "anymap2", "api", @@ -1984,11 +1987,11 @@ dependencies = [ [[package]] name = "common-plugins" -version = "0.8.0" +version = "0.8.1" [[package]] name = "common-procedure" -version = "0.8.0" +version = "0.8.1" dependencies = [ "async-stream", "async-trait", @@ -2013,7 +2016,7 @@ dependencies = [ [[package]] name = "common-procedure-test" -version = "0.8.0" +version = "0.8.1" dependencies = [ "async-trait", "common-procedure", @@ -2021,10 +2024,11 @@ dependencies = [ [[package]] name = "common-query" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "async-trait", + "bytes", "common-base", "common-error", "common-macro", @@ -2044,7 +2048,7 @@ dependencies = [ [[package]] name = "common-recordbatch" -version = "0.8.0" +version = "0.8.1" dependencies = [ "arc-swap", "common-error", @@ -2063,7 +2067,7 @@ dependencies = [ [[package]] name = "common-runtime" -version = "0.8.0" +version = "0.8.1" dependencies = [ "async-trait", "common-error", @@ -2083,7 +2087,7 @@ dependencies = [ [[package]] name = "common-telemetry" -version = "0.8.0" +version = "0.8.1" dependencies = [ "atty", "backtrace", @@ -2110,7 +2114,7 @@ dependencies = [ [[package]] name = "common-test-util" -version = "0.8.0" +version = "0.8.1" dependencies = [ "client", "common-query", @@ -2122,7 +2126,7 @@ dependencies = [ [[package]] name = "common-time" -version = "0.8.0" +version = "0.8.1" dependencies = [ "arrow", "chrono", @@ -2138,7 +2142,7 @@ dependencies = [ [[package]] name = "common-version" -version = "0.8.0" +version = "0.8.1" dependencies = [ "build-data", "schemars", @@ -2147,7 +2151,7 @@ dependencies = [ [[package]] name = "common-wal" -version = "0.8.0" +version = "0.8.1" dependencies = [ "common-base", "common-error", @@ -3155,7 +3159,7 @@ dependencies = [ [[package]] name = "datanode" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "arrow-flight", @@ -3204,7 +3208,6 @@ dependencies = [ "session", "snafu 0.8.3", "store-api", - "substrait 0.8.0", "table", "tokio", "toml 0.8.13", @@ -3213,7 +3216,7 @@ dependencies = [ [[package]] name = "datatypes" -version = "0.8.0" +version = "0.8.1" dependencies = [ "arrow", "arrow-array", @@ -3696,7 +3699,7 @@ dependencies = [ [[package]] name = "file-engine" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "async-trait", @@ -3792,7 +3795,7 @@ dependencies = [ [[package]] name = "flow" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "async-trait", @@ -3833,7 +3836,7 @@ dependencies = [ "snafu 
0.8.3", "store-api", "strum 0.25.0", - "substrait 0.8.0", + "substrait 0.8.1", "table", "tokio", "tonic 0.11.0", @@ -3871,7 +3874,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa" [[package]] name = "frontend" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "arc-swap", @@ -4187,7 +4190,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "greptime-proto" version = "0.1.0" -source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=902f75fdd170c572e90b1f640161d90995f20218#902f75fdd170c572e90b1f640161d90995f20218" +source = "git+https://github.com/killme2008/greptime-proto.git?rev=a15a54a714fe117d7e9f7635e149c4eecac773fa#a15a54a714fe117d7e9f7635e149c4eecac773fa" dependencies = [ "prost 0.12.6", "serde", @@ -4753,7 +4756,7 @@ dependencies = [ [[package]] name = "index" -version = "0.8.0" +version = "0.8.1" dependencies = [ "async-trait", "asynchronous-codec", @@ -5320,7 +5323,7 @@ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "log-store" -version = "0.8.0" +version = "0.8.1" dependencies = [ "async-stream", "async-trait", @@ -5617,7 +5620,7 @@ dependencies = [ [[package]] name = "meta-client" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "async-trait", @@ -5643,7 +5646,7 @@ dependencies = [ [[package]] name = "meta-srv" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "async-trait", @@ -5719,7 +5722,7 @@ dependencies = [ [[package]] name = "metric-engine" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "aquamarine", @@ -5801,7 +5804,7 @@ dependencies = [ [[package]] name = "mito2" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "aquamarine", @@ -6429,7 +6432,7 @@ dependencies = [ [[package]] name = "object-store" -version = "0.8.0" +version = "0.8.1" dependencies = [ "anyhow", "bytes", @@ -6670,7 +6673,7 @@ dependencies = [ [[package]] name = "operator" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "async-trait", @@ -6716,7 +6719,7 @@ dependencies = [ "sql", "sqlparser 0.45.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=54a267ac89c09b11c0c88934690530807185d3e7)", "store-api", - "substrait 0.8.0", + "substrait 0.8.1", "table", "tokio", "tokio-util", @@ -6961,7 +6964,7 @@ dependencies = [ [[package]] name = "partition" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "async-trait", @@ -7307,7 +7310,7 @@ dependencies = [ [[package]] name = "plugins" -version = "0.8.0" +version = "0.8.1" dependencies = [ "auth", "common-base", @@ -7585,26 +7588,20 @@ dependencies = [ [[package]] name = "promql" -version = "0.8.0" +version = "0.8.1" dependencies = [ "ahash 0.8.11", - "async-recursion", "async-trait", "bytemuck", - "catalog", - "common-catalog", "common-error", "common-macro", - "common-query", "common-recordbatch", "common-telemetry", "datafusion 38.0.0", "datafusion-expr 38.0.0", - "datafusion-functions 38.0.0", "datatypes", "futures", "greptime-proto", - "itertools 0.10.5", "lazy_static", "prometheus", "promql-parser", @@ -7612,7 +7609,6 @@ dependencies = [ "query", "session", "snafu 0.8.3", - "table", "tokio", ] @@ -7798,7 +7794,7 @@ dependencies = [ [[package]] name = "puffin" -version = "0.8.0" +version = "0.8.1" dependencies = [ "async-trait", "bitflags 2.5.0", @@ -7909,7 +7905,7 @@ dependencies = [ [[package]] name = "query" -version = "0.8.0" +version = "0.8.1" dependencies = [ "ahash 0.8.11", "api", @@ -7920,6 +7916,7 @@ dependencies = [ 
"async-recursion", "async-stream", "async-trait", + "bytes", "catalog", "chrono", "common-base", @@ -7932,11 +7929,13 @@ dependencies = [ "common-plugins", "common-query", "common-recordbatch", + "common-runtime", "common-telemetry", "common-time", "datafusion 38.0.0", "datafusion-common 38.0.0", "datafusion-expr 38.0.0", + "datafusion-functions 38.0.0", "datafusion-optimizer 38.0.0", "datafusion-physical-expr 38.0.0", "datafusion-sql 38.0.0", @@ -7946,6 +7945,7 @@ dependencies = [ "futures-util", "greptime-proto", "humantime", + "itertools 0.10.5", "lazy_static", "meter-core", "meter-macros", @@ -7957,6 +7957,7 @@ dependencies = [ "prometheus", "promql", "promql-parser", + "prost 0.12.6", "rand", "regex", "session", @@ -7967,7 +7968,7 @@ dependencies = [ "stats-cli", "store-api", "streaming-stats", - "substrait 0.8.0", + "substrait 0.8.1", "table", "tokio", "tokio-stream", @@ -9277,7 +9278,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "script" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "arc-swap", @@ -9550,7 +9551,7 @@ dependencies = [ [[package]] name = "servers" -version = "0.8.0" +version = "0.8.1" dependencies = [ "aide", "api", @@ -9653,7 +9654,7 @@ dependencies = [ [[package]] name = "session" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "arc-swap", @@ -9931,7 +9932,7 @@ dependencies = [ [[package]] name = "sql" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "chrono", @@ -9987,7 +9988,7 @@ dependencies = [ [[package]] name = "sqlness-runner" -version = "0.8.0" +version = "0.8.1" dependencies = [ "async-trait", "clap 4.5.4", @@ -10214,7 +10215,7 @@ dependencies = [ [[package]] name = "store-api" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "aquamarine", @@ -10382,13 +10383,11 @@ dependencies = [ [[package]] name = "substrait" -version = "0.8.0" +version = "0.8.1" dependencies = [ "async-trait", "bytes", - "catalog", "common-error", - "common-function", "common-macro", "common-telemetry", "datafusion 38.0.0", @@ -10398,7 +10397,6 @@ dependencies = [ "datatypes", "promql", "prost 0.12.6", - "session", "snafu 0.8.3", "substrait 0.17.1", "tokio", @@ -10573,8 +10571,9 @@ dependencies = [ [[package]] name = "table" -version = "0.8.0" +version = "0.8.1" dependencies = [ + "api", "async-trait", "chrono", "common-base", @@ -10683,7 +10682,7 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "tests-fuzz" -version = "0.8.0" +version = "0.8.1" dependencies = [ "arbitrary", "async-trait", @@ -10716,7 +10715,7 @@ dependencies = [ [[package]] name = "tests-integration" -version = "0.8.0" +version = "0.8.1" dependencies = [ "api", "arrow-flight", @@ -10775,7 +10774,7 @@ dependencies = [ "sql", "sqlx", "store-api", - "substrait 0.8.0", + "substrait 0.8.1", "table", "tempfile", "time", diff --git a/Cargo.toml b/Cargo.toml index 6aeb67c8038a..42c584358b20 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,7 +64,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.8.0" +version = "0.8.1" edition = "2021" license = "Apache-2.0" @@ -120,7 +120,7 @@ etcd-client = { git = "https://github.com/MichaelScofield/etcd-client.git", rev fst = "0.4.7" futures = "0.3" futures-util = "0.3" -greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "902f75fdd170c572e90b1f640161d90995f20218" } +greptime-proto = { git = "https://github.com/killme2008/greptime-proto.git", rev = 
"a15a54a714fe117d7e9f7635e149c4eecac773fa" } humantime = "2.1" humantime-serde = "1.1" itertools = "0.10" diff --git a/benchmarks/src/wal_bench.rs b/benchmarks/src/wal_bench.rs index 10e88f99f37c..681dacfbb60e 100644 --- a/benchmarks/src/wal_bench.rs +++ b/benchmarks/src/wal_bench.rs @@ -28,6 +28,7 @@ use rand::distributions::{Alphanumeric, DistString, Uniform}; use rand::rngs::SmallRng; use rand::{Rng, SeedableRng}; use serde::{Deserialize, Serialize}; +use store_api::logstore::provider::Provider; use store_api::logstore::LogStore; use store_api::storage::RegionId; @@ -210,7 +211,7 @@ impl From for Config { pub struct Region { id: RegionId, schema: Vec, - wal_options: WalOptions, + provider: Provider, next_sequence: AtomicU64, next_entry_id: AtomicU64, next_timestamp: AtomicI64, @@ -227,10 +228,14 @@ impl Region { num_rows: u32, rng_seed: u64, ) -> Self { + let provider = match wal_options { + WalOptions::RaftEngine => Provider::raft_engine_provider(id.as_u64()), + WalOptions::Kafka(opts) => Provider::kafka_provider(opts.topic), + }; Self { id, schema, - wal_options, + provider, next_sequence: AtomicU64::new(1), next_entry_id: AtomicU64::new(1), next_timestamp: AtomicI64::new(1655276557000), @@ -258,14 +263,14 @@ impl Region { self.id, self.next_entry_id.fetch_add(1, Ordering::Relaxed), &entry, - &self.wal_options, + &self.provider, ) .unwrap(); } /// Replays the region. pub async fn replay(&self, wal: &Arc>) { - let mut wal_stream = wal.scan(self.id, 0, &self.wal_options).unwrap(); + let mut wal_stream = wal.scan(self.id, 0, &self.provider).unwrap(); while let Some(res) = wal_stream.next().await { let (_, entry) = res.unwrap(); metrics::METRIC_WAL_READ_BYTES_TOTAL.inc_by(Self::entry_estimated_size(&entry) as u64); diff --git a/src/cache/Cargo.toml b/src/cache/Cargo.toml index 07870fa904a5..9a2888e5fc13 100644 --- a/src/cache/Cargo.toml +++ b/src/cache/Cargo.toml @@ -11,3 +11,4 @@ common-macro.workspace = true common-meta.workspace = true moka.workspace = true snafu.workspace = true +substrait.workspace = true diff --git a/src/cache/src/lib.rs b/src/cache/src/lib.rs index 85dc9c05f1f3..4adf0ff1ff33 100644 --- a/src/cache/src/lib.rs +++ b/src/cache/src/lib.rs @@ -20,7 +20,8 @@ use std::time::Duration; use catalog::kvbackend::new_table_cache; use common_meta::cache::{ new_table_flownode_set_cache, new_table_info_cache, new_table_name_cache, - new_table_route_cache, CacheRegistry, CacheRegistryBuilder, LayeredCacheRegistryBuilder, + new_table_route_cache, new_view_info_cache, CacheRegistry, CacheRegistryBuilder, + LayeredCacheRegistryBuilder, }; use common_meta::kv_backend::KvBackendRef; use moka::future::CacheBuilder; @@ -33,6 +34,7 @@ const DEFAULT_CACHE_TTL: Duration = Duration::from_secs(10 * 60); const DEFAULT_CACHE_TTI: Duration = Duration::from_secs(5 * 60); pub const TABLE_INFO_CACHE_NAME: &str = "table_info_cache"; +pub const VIEW_INFO_CACHE_NAME: &str = "view_info_cache"; pub const TABLE_NAME_CACHE_NAME: &str = "table_name_cache"; pub const TABLE_CACHE_NAME: &str = "table_cache"; pub const TABLE_FLOWNODE_SET_CACHE_NAME: &str = "table_flownode_set_cache"; @@ -82,11 +84,22 @@ pub fn build_fundamental_cache_registry(kv_backend: KvBackendRef) -> CacheRegist cache, kv_backend.clone(), )); + // Builds the view info cache + let cache = CacheBuilder::new(DEFAULT_CACHE_MAX_CAPACITY) + .time_to_live(DEFAULT_CACHE_TTL) + .time_to_idle(DEFAULT_CACHE_TTI) + .build(); + let view_info_cache = Arc::new(new_view_info_cache( + VIEW_INFO_CACHE_NAME.to_string(), + cache, + kv_backend.clone(), + )); 
CacheRegistryBuilder::default() .add_cache(table_info_cache) .add_cache(table_name_cache) .add_cache(table_route_cache) + .add_cache(view_info_cache) .add_cache(table_flownode_set_cache) .build() } diff --git a/src/catalog/Cargo.toml b/src/catalog/Cargo.toml index ddda28ba8864..185614e98152 100644 --- a/src/catalog/Cargo.toml +++ b/src/catalog/Cargo.toml @@ -16,6 +16,7 @@ arrow.workspace = true arrow-schema.workspace = true async-stream.workspace = true async-trait = "0.1" +bytes.workspace = true common-catalog.workspace = true common-config.workspace = true common-error.workspace = true @@ -48,8 +49,11 @@ table.workspace = true tokio.workspace = true [dev-dependencies] +cache.workspace = true catalog = { workspace = true, features = ["testing"] } chrono.workspace = true +common-meta = { workspace = true, features = ["testing"] } +common-query = { workspace = true, features = ["testing"] } common-test-util.workspace = true log-store.workspace = true object-store.workspace = true diff --git a/src/catalog/src/error.rs b/src/catalog/src/error.rs index 5834eaed359d..eaad22f4c9f6 100644 --- a/src/catalog/src/error.rs +++ b/src/catalog/src/error.rs @@ -19,10 +19,7 @@ use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; use datafusion::error::DataFusionError; -use datatypes::prelude::ConcreteDataType; use snafu::{Location, Snafu}; -use table::metadata::TableId; -use tokio::task::JoinError; #[derive(Snafu)] #[snafu(visibility(pub))] @@ -65,19 +62,6 @@ pub enum Error { location: Location, source: BoxedError, }, - #[snafu(display("Failed to open system catalog table"))] - OpenSystemCatalog { - #[snafu(implicit)] - location: Location, - source: table::error::Error, - }, - - #[snafu(display("Failed to create system catalog table"))] - CreateSystemCatalog { - #[snafu(implicit)] - location: Location, - source: table::error::Error, - }, #[snafu(display("Failed to create table, table info: {}", table_info))] CreateTable { @@ -94,52 +78,6 @@ pub enum Error { location: Location, }, - #[snafu(display( - "System catalog table type mismatch, expected: binary, found: {:?}", - data_type, - ))] - SystemCatalogTypeMismatch { - data_type: ConcreteDataType, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Invalid system catalog entry type: {:?}", entry_type))] - InvalidEntryType { - entry_type: Option, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Invalid system catalog key: {:?}", key))] - InvalidKey { - key: Option, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Catalog value is not present"))] - EmptyValue { - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Failed to deserialize value"))] - ValueDeserialize { - #[snafu(source)] - error: serde_json::error::Error, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Table engine not found: {}", engine_name))] - TableEngineNotFound { - engine_name: String, - #[snafu(implicit)] - location: Location, - source: table::error::Error, - }, - #[snafu(display("Cannot find catalog by name: {}", catalog_name))] CatalogNotFound { catalog_name: String, @@ -169,44 +107,9 @@ pub enum Error { location: Location, }, - #[snafu(display("Schema {} already exists", schema))] - SchemaExists { - schema: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Operation {} not implemented yet", operation))] - Unimplemented { - operation: String, - #[snafu(implicit)] - location: Location, - }, - - 
#[snafu(display("Operation {} not supported", op))] - NotSupported { - op: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Failed to open table {table_id}"))] - OpenTable { - table_id: TableId, - #[snafu(implicit)] - location: Location, - source: table::error::Error, - }, - - #[snafu(display("Failed to open table in parallel"))] - ParallelOpenTable { - #[snafu(source)] - error: JoinError, - }, - - #[snafu(display("Table not found while opening table, table info: {}", table_info))] - TableNotFound { - table_info: String, + #[snafu(display("View info not found: {}", name))] + ViewInfoNotFound { + name: String, #[snafu(implicit)] location: Location, }, @@ -217,13 +120,6 @@ pub enum Error { #[snafu(display("Failed to find region routes"))] FindRegionRoutes { source: partition::error::Error }, - #[snafu(display("Failed to read system catalog table records"))] - ReadSystemCatalog { - #[snafu(implicit)] - location: Location, - source: common_recordbatch::error::Error, - }, - #[snafu(display("Failed to create recordbatch"))] CreateRecordBatch { #[snafu(implicit)] @@ -231,20 +127,6 @@ pub enum Error { source: common_recordbatch::error::Error, }, - #[snafu(display("Failed to insert table creation record to system catalog"))] - InsertCatalogRecord { - #[snafu(implicit)] - location: Location, - source: table::error::Error, - }, - - #[snafu(display("Failed to scan system catalog table"))] - SystemCatalogTableScan { - #[snafu(implicit)] - location: Location, - source: table::error::Error, - }, - #[snafu(display("Internal error"))] Internal { #[snafu(implicit)] @@ -258,20 +140,14 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to execute system catalog table scan"))] - SystemCatalogTableScanExec { + #[snafu(display("Failed to decode logical plan for view: {}", name))] + DecodePlan { + name: String, #[snafu(implicit)] location: Location, source: common_query::error::Error, }, - #[snafu(display("Cannot parse catalog value"))] - InvalidCatalogValue { - #[snafu(implicit)] - location: Location, - source: common_catalog::error::Error, - }, - #[snafu(display("Failed to perform metasrv operation"))] Metasrv { #[snafu(implicit)] @@ -297,30 +173,36 @@ pub enum Error { location: Location, }, - #[snafu(display("Table schema mismatch"))] - TableSchemaMismatch { + #[snafu(display("Table metadata manager error"))] + TableMetadataManager { + source: common_meta::error::Error, #[snafu(implicit)] location: Location, - source: table::error::Error, }, - #[snafu(display("A generic error has occurred, msg: {}", msg))] - Generic { - msg: String, + #[snafu(display("Failed to get table cache"))] + GetTableCache { + source: common_meta::error::Error, #[snafu(implicit)] location: Location, }, - #[snafu(display("Table metadata manager error"))] - TableMetadataManager { + #[snafu(display("Failed to get view info from cache"))] + GetViewCache { source: common_meta::error::Error, #[snafu(implicit)] location: Location, }, - #[snafu(display("Failed to get table cache"))] - GetTableCache { - source: common_meta::error::Error, + #[snafu(display("Cache not found: {name}"))] + CacheNotFound { + name: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to cast the catalog manager"))] + CastManager { #[snafu(implicit)] location: Location, }, @@ -331,61 +213,43 @@ pub type Result = std::result::Result; impl ErrorExt for Error { fn status_code(&self) -> StatusCode { match self { - Error::InvalidKey { .. } - | Error::SchemaNotFound { .. 
} + Error::SchemaNotFound { .. } | Error::CatalogNotFound { .. } | Error::FindPartitions { .. } | Error::FindRegionRoutes { .. } - | Error::InvalidEntryType { .. } - | Error::ParallelOpenTable { .. } => StatusCode::Unexpected, + | Error::CacheNotFound { .. } + | Error::CastManager { .. } => StatusCode::Unexpected, - Error::TableNotFound { .. } => StatusCode::TableNotFound, + Error::ViewInfoNotFound { .. } => StatusCode::TableNotFound, - Error::SystemCatalog { .. } - | Error::EmptyValue { .. } - | Error::ValueDeserialize { .. } => StatusCode::StorageUnavailable, + Error::SystemCatalog { .. } => StatusCode::StorageUnavailable, - Error::Generic { .. } - | Error::SystemCatalogTypeMismatch { .. } - | Error::UpgradeWeakCatalogManagerRef { .. } => StatusCode::Internal, - - Error::ReadSystemCatalog { source, .. } | Error::CreateRecordBatch { source, .. } => { - source.status_code() - } - Error::InvalidCatalogValue { source, .. } => source.status_code(), + Error::UpgradeWeakCatalogManagerRef { .. } => StatusCode::Internal, + Error::CreateRecordBatch { source, .. } => source.status_code(), Error::TableExists { .. } => StatusCode::TableAlreadyExists, Error::TableNotExist { .. } => StatusCode::TableNotFound, - Error::SchemaExists { .. } | Error::TableEngineNotFound { .. } => { - StatusCode::InvalidArguments - } - Error::ListCatalogs { source, .. } | Error::ListNodes { source, .. } | Error::ListSchemas { source, .. } | Error::ListTables { source, .. } => source.status_code(), - Error::OpenSystemCatalog { source, .. } - | Error::CreateSystemCatalog { source, .. } - | Error::InsertCatalogRecord { source, .. } - | Error::OpenTable { source, .. } - | Error::CreateTable { source, .. } - | Error::TableSchemaMismatch { source, .. } => source.status_code(), + Error::CreateTable { source, .. } => source.status_code(), Error::Metasrv { source, .. } => source.status_code(), - Error::SystemCatalogTableScan { source, .. } => source.status_code(), - Error::SystemCatalogTableScanExec { source, .. } => source.status_code(), + Error::DecodePlan { source, .. } => source.status_code(), Error::InvalidTableInfoInCatalog { source, .. } => source.status_code(), Error::CompileScriptInternal { source, .. } | Error::Internal { source, .. } => { source.status_code() } - Error::Unimplemented { .. } | Error::NotSupported { .. } => StatusCode::Unsupported, Error::QueryAccessDenied { .. } => StatusCode::AccessDenied, Error::Datafusion { .. } => StatusCode::EngineExecuteQuery, Error::TableMetadataManager { source, .. } => source.status_code(), - Error::GetTableCache { .. } => StatusCode::Internal, + Error::GetViewCache { source, .. } | Error::GetTableCache { source, .. 
} => { + source.status_code() + } } } @@ -417,11 +281,6 @@ mod tests { .status_code() ); - assert_eq!( - StatusCode::Unexpected, - InvalidKeySnafu { key: None }.build().status_code() - ); - assert_eq!( StatusCode::StorageUnavailable, Error::SystemCatalog { @@ -430,19 +289,6 @@ mod tests { } .status_code() ); - - assert_eq!( - StatusCode::Internal, - Error::SystemCatalogTypeMismatch { - data_type: ConcreteDataType::binary_datatype(), - location: Location::generate(), - } - .status_code() - ); - assert_eq!( - StatusCode::StorageUnavailable, - EmptyValueSnafu {}.build().status_code() - ); } #[test] diff --git a/src/catalog/src/kvbackend/manager.rs b/src/catalog/src/kvbackend/manager.rs index e7a4ef4be39c..0bf51643b1b1 100644 --- a/src/catalog/src/kvbackend/manager.rs +++ b/src/catalog/src/kvbackend/manager.rs @@ -22,14 +22,13 @@ use common_catalog::consts::{ }; use common_config::Mode; use common_error::ext::BoxedError; -use common_meta::cache::TableRouteCacheRef; +use common_meta::cache::{LayeredCacheRegistryRef, ViewInfoCacheRef}; use common_meta::key::catalog_name::CatalogNameKey; use common_meta::key::schema_name::SchemaNameKey; use common_meta::key::table_info::TableInfoValue; use common_meta::key::table_name::TableNameKey; use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; -use common_meta::table_name::TableName; use futures_util::stream::BoxStream; use futures_util::{StreamExt, TryStreamExt}; use meta_client::client::MetaClient; @@ -38,11 +37,12 @@ use partition::manager::{PartitionRuleManager, PartitionRuleManagerRef}; use snafu::prelude::*; use table::dist_table::DistTable; use table::table::numbers::{NumbersTable, NUMBERS_TABLE_NAME}; +use table::table_name::TableName; use table::TableRef; use crate::error::{ - GetTableCacheSnafu, InvalidTableInfoInCatalogSnafu, ListCatalogsSnafu, ListSchemasSnafu, - ListTablesSnafu, Result, TableMetadataManagerSnafu, + CacheNotFoundSnafu, GetTableCacheSnafu, InvalidTableInfoInCatalogSnafu, ListCatalogsSnafu, + ListSchemasSnafu, ListTablesSnafu, Result, TableMetadataManagerSnafu, }; use crate::information_schema::InformationSchemaProvider; use crate::kvbackend::TableCacheRef; @@ -61,25 +61,26 @@ pub struct KvBackendCatalogManager { table_metadata_manager: TableMetadataManagerRef, /// A sub-CatalogManager that handles system tables system_catalog: SystemCatalog, - table_cache: TableCacheRef, + cache_registry: LayeredCacheRegistryRef, } const CATALOG_CACHE_MAX_CAPACITY: u64 = 128; impl KvBackendCatalogManager { - pub async fn new( + pub fn new( mode: Mode, meta_client: Option>, backend: KvBackendRef, - table_cache: TableCacheRef, - table_route_cache: TableRouteCacheRef, + cache_registry: LayeredCacheRegistryRef, ) -> Arc { Arc::new_cyclic(|me| Self { mode, meta_client, partition_manager: Arc::new(PartitionRuleManager::new( backend.clone(), - table_route_cache, + cache_registry + .get() + .expect("Failed to get table_route_cache"), )), table_metadata_manager: Arc::new(TableMetadataManager::new(backend)), system_catalog: SystemCatalog { @@ -90,7 +91,7 @@ impl KvBackendCatalogManager { me.clone(), )), }, - table_cache, + cache_registry, }) } @@ -99,6 +100,12 @@ impl KvBackendCatalogManager { &self.mode } + pub fn view_info_cache(&self) -> Result { + self.cache_registry.get().context(CacheNotFoundSnafu { + name: "view_info_cache", + }) + } + /// Returns the `[MetaClient]`. 
pub fn meta_client(&self) -> Option> { self.meta_client.clone() @@ -215,7 +222,11 @@ impl CatalogManager for KvBackendCatalogManager { return Ok(Some(table)); } - self.table_cache + let table_cache: TableCacheRef = self.cache_registry.get().context(CacheNotFoundSnafu { + name: "table_cache", + })?; + + table_cache .get_by_ref(&TableName { catalog_name: catalog_name.to_string(), schema_name: schema_name.to_string(), diff --git a/src/catalog/src/kvbackend/table_cache.rs b/src/catalog/src/kvbackend/table_cache.rs index 4ab73bf9d256..93980d1a0612 100644 --- a/src/catalog/src/kvbackend/table_cache.rs +++ b/src/catalog/src/kvbackend/table_cache.rs @@ -17,11 +17,11 @@ use std::sync::Arc; use common_meta::cache::{CacheContainer, Initializer, TableInfoCacheRef, TableNameCacheRef}; use common_meta::error::{Result as MetaResult, ValueNotExistSnafu}; use common_meta::instruction::CacheIdent; -use common_meta::table_name::TableName; use futures::future::BoxFuture; use moka::future::Cache; use snafu::OptionExt; use table::dist_table::DistTable; +use table::table_name::TableName; use table::TableRef; pub type TableCacheRef = Arc; diff --git a/src/catalog/src/table_source.rs b/src/catalog/src/table_source.rs index 58813a460e33..7399dca550da 100644 --- a/src/catalog/src/table_source.rs +++ b/src/catalog/src/table_source.rs @@ -15,15 +15,25 @@ use std::collections::HashMap; use std::sync::Arc; +use bytes::Bytes; use common_catalog::format_full_table_name; +use common_query::logical_plan::SubstraitPlanDecoderRef; use datafusion::common::{ResolvedTableReference, TableReference}; -use datafusion::datasource::provider_as_source; +use datafusion::datasource::view::ViewTable; +use datafusion::datasource::{provider_as_source, TableProvider}; use datafusion::logical_expr::TableSource; use session::context::QueryContext; -use snafu::{ensure, OptionExt}; +use snafu::{ensure, OptionExt, ResultExt}; +use table::metadata::TableType; use table::table::adapter::DfTableProviderAdapter; +mod dummy_catalog; +use dummy_catalog::DummyCatalogList; -use crate::error::{QueryAccessDeniedSnafu, Result, TableNotExistSnafu}; +use crate::error::{ + CastManagerSnafu, DatafusionSnafu, DecodePlanSnafu, GetViewCacheSnafu, QueryAccessDeniedSnafu, + Result, TableNotExistSnafu, ViewInfoNotFoundSnafu, +}; +use crate::kvbackend::KvBackendCatalogManager; use crate::CatalogManagerRef; pub struct DfTableSourceProvider { @@ -32,6 +42,7 @@ pub struct DfTableSourceProvider { disallow_cross_catalog_query: bool, default_catalog: String, default_schema: String, + plan_decoder: SubstraitPlanDecoderRef, } impl DfTableSourceProvider { @@ -39,6 +50,7 @@ impl DfTableSourceProvider { catalog_manager: CatalogManagerRef, disallow_cross_catalog_query: bool, query_ctx: &QueryContext, + plan_decoder: SubstraitPlanDecoderRef, ) -> Self { Self { catalog_manager, @@ -46,6 +58,7 @@ impl DfTableSourceProvider { resolved_tables: HashMap::new(), default_catalog: query_ctx.current_catalog().to_owned(), default_schema: query_ctx.current_schema().to_owned(), + plan_decoder, } } @@ -94,8 +107,39 @@ impl DfTableSourceProvider { table: format_full_table_name(catalog_name, schema_name, table_name), })?; - let provider = DfTableProviderAdapter::new(table); - let source = provider_as_source(Arc::new(provider)); + let provider: Arc = if table.table_info().table_type == TableType::View { + let catalog_manager = self + .catalog_manager + .as_any() + .downcast_ref::() + .context(CastManagerSnafu)?; + + let view_info = catalog_manager + .view_info_cache()? 
+ .get(table.table_info().ident.table_id) + .await + .context(GetViewCacheSnafu)? + .context(ViewInfoNotFoundSnafu { + name: &table.table_info().name, + })?; + + // Build the catalog list provider for deserialization. + let catalog_list = Arc::new(DummyCatalogList::new(self.catalog_manager.clone())); + let logical_plan = self + .plan_decoder + .decode(Bytes::from(view_info.view_info.clone()), catalog_list, true) + .await + .context(DecodePlanSnafu { + name: &table.table_info().name, + })?; + + Arc::new(ViewTable::try_new(logical_plan, None).context(DatafusionSnafu)?) + } else { + Arc::new(DfTableProviderAdapter::new(table)) + }; + + let source = provider_as_source(provider); + let _ = self.resolved_tables.insert(resolved_name, source.clone()); Ok(source) } @@ -103,6 +147,7 @@ impl DfTableSourceProvider { #[cfg(test)] mod tests { + use common_query::test_util::DummyDecoder; use session::context::QueryContext; use super::*; @@ -112,8 +157,12 @@ mod tests { fn test_validate_table_ref() { let query_ctx = &QueryContext::with("greptime", "public"); - let table_provider = - DfTableSourceProvider::new(MemoryCatalogManager::with_default_setup(), true, query_ctx); + let table_provider = DfTableSourceProvider::new( + MemoryCatalogManager::with_default_setup(), + true, + query_ctx, + DummyDecoder::arc(), + ); let table_ref = TableReference::bare("table_name"); let result = table_provider.resolve_table_ref(table_ref); @@ -148,4 +197,99 @@ mod tests { let table_ref = TableReference::full("greptime", "greptime_private", "columns"); assert!(table_provider.resolve_table_ref(table_ref).is_ok()); } + + use std::collections::HashSet; + + use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; + use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry}; + use common_config::Mode; + use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder}; + use common_meta::key::TableMetadataManager; + use common_meta::kv_backend::memory::MemoryKvBackend; + use common_query::error::Result as QueryResult; + use common_query::logical_plan::SubstraitPlanDecoder; + use datafusion::catalog::CatalogProviderList; + use datafusion::logical_expr::builder::LogicalTableSource; + use datafusion::logical_expr::{col, lit, LogicalPlan, LogicalPlanBuilder}; + + struct MockDecoder; + impl MockDecoder { + pub fn arc() -> Arc { + Arc::new(MockDecoder) + } + } + + #[async_trait::async_trait] + impl SubstraitPlanDecoder for MockDecoder { + async fn decode( + &self, + _message: bytes::Bytes, + _catalog_list: Arc, + _optimize: bool, + ) -> QueryResult { + Ok(mock_plan()) + } + } + + fn mock_plan() -> LogicalPlan { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + ]); + let table_source = LogicalTableSource::new(SchemaRef::new(schema)); + + let projection = None; + + let builder = + LogicalPlanBuilder::scan("person", Arc::new(table_source), projection).unwrap(); + + builder + .filter(col("id").gt(lit(500))) + .unwrap() + .build() + .unwrap() + } + + #[tokio::test] + async fn test_resolve_view() { + let query_ctx = &QueryContext::with("greptime", "public"); + let backend = Arc::new(MemoryKvBackend::default()); + let layered_cache_builder = LayeredCacheRegistryBuilder::default() + .add_cache_registry(CacheRegistryBuilder::default().build()); + let fundamental_cache_registry = build_fundamental_cache_registry(backend.clone()); + let layered_cache_registry = Arc::new( + with_default_composite_cache_registry( + 
layered_cache_builder.add_cache_registry(fundamental_cache_registry), + ) + .unwrap() + .build(), + ); + + let catalog_manager = KvBackendCatalogManager::new( + Mode::Standalone, + None, + backend.clone(), + layered_cache_registry, + ); + let table_metadata_manager = TableMetadataManager::new(backend); + let mut view_info = common_meta::key::test_utils::new_test_table_info(1024, vec![]); + view_info.table_type = TableType::View; + let logical_plan = vec![1, 2, 3]; + // Create view metadata + table_metadata_manager + .create_view_metadata(view_info.clone().into(), logical_plan, HashSet::new()) + .await + .unwrap(); + + let mut table_provider = + DfTableSourceProvider::new(catalog_manager, true, query_ctx, MockDecoder::arc()); + + // View not found + let table_ref = TableReference::bare("not_exists_view"); + assert!(table_provider.resolve_table(table_ref).await.is_err()); + + let table_ref = TableReference::bare(view_info.name); + let source = table_provider.resolve_table(table_ref).await.unwrap(); + assert_eq!(*source.get_logical_plan().unwrap(), mock_plan()); + } } diff --git a/src/catalog/src/table_source/dummy_catalog.rs b/src/catalog/src/table_source/dummy_catalog.rs new file mode 100644 index 000000000000..602a5c9cbe0f --- /dev/null +++ b/src/catalog/src/table_source/dummy_catalog.rs @@ -0,0 +1,129 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Dummy catalog for region server. + +use std::any::Any; +use std::sync::Arc; + +use async_trait::async_trait; +use common_catalog::format_full_table_name; +use datafusion::catalog::schema::SchemaProvider; +use datafusion::catalog::{CatalogProvider, CatalogProviderList}; +use datafusion::datasource::TableProvider; +use snafu::OptionExt; +use table::table::adapter::DfTableProviderAdapter; + +use crate::error::TableNotExistSnafu; +use crate::CatalogManagerRef; + +/// Delegate the resolving requests to the `[CatalogManager]` unconditionally. +#[derive(Clone)] +pub struct DummyCatalogList { + catalog_manager: CatalogManagerRef, +} + +impl DummyCatalogList { + /// Creates a new catalog list with the given catalog manager. + pub fn new(catalog_manager: CatalogManagerRef) -> Self { + Self { catalog_manager } + } +} + +impl CatalogProviderList for DummyCatalogList { + fn as_any(&self) -> &dyn Any { + self + } + + fn register_catalog( + &self, + _name: String, + _catalog: Arc, + ) -> Option> { + None + } + + fn catalog_names(&self) -> Vec { + vec![] + } + + fn catalog(&self, catalog_name: &str) -> Option> { + Some(Arc::new(DummyCatalogProvider { + catalog_name: catalog_name.to_string(), + catalog_manager: self.catalog_manager.clone(), + })) + } +} + +/// A dummy catalog provider for [DummyCatalogList]. 
+#[derive(Clone)] +struct DummyCatalogProvider { + catalog_name: String, + catalog_manager: CatalogManagerRef, +} + +impl CatalogProvider for DummyCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + vec![] + } + + fn schema(&self, schema_name: &str) -> Option> { + Some(Arc::new(DummySchemaProvider { + catalog_name: self.catalog_name.clone(), + schema_name: schema_name.to_string(), + catalog_manager: self.catalog_manager.clone(), + })) + } +} + +/// A dummy schema provider for [DummyCatalogList]. +#[derive(Clone)] +struct DummySchemaProvider { + catalog_name: String, + schema_name: String, + catalog_manager: CatalogManagerRef, +} + +#[async_trait] +impl SchemaProvider for DummySchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + vec![] + } + + async fn table(&self, name: &str) -> datafusion::error::Result>> { + let table = self + .catalog_manager + .table(&self.catalog_name, &self.schema_name, name) + .await? + .with_context(|| TableNotExistSnafu { + table: format_full_table_name(&self.catalog_name, &self.schema_name, name), + })?; + + let table_provider: Arc = Arc::new(DfTableProviderAdapter::new(table)); + + Ok(Some(table_provider)) + } + + fn table_exist(&self, _name: &str) -> bool { + true + } +} diff --git a/src/cmd/src/cli.rs b/src/cmd/src/cli.rs index fc443a169d2b..3042b8370f77 100644 --- a/src/cmd/src/cli.rs +++ b/src/cmd/src/cli.rs @@ -23,9 +23,6 @@ mod helper; // Wait for https://github.com/GreptimeTeam/greptimedb/issues/2373 #[allow(unused)] mod repl; -// TODO(tisonkun): migrate deprecated methods -#[allow(deprecated)] -mod upgrade; use async_trait::async_trait; use bench::BenchTableMetadataCommand; @@ -33,7 +30,6 @@ use clap::Parser; use common_telemetry::logging::{LoggingOptions, TracingOptions}; pub use repl::Repl; use tracing_appender::non_blocking::WorkerGuard; -use upgrade::UpgradeCommand; use self::export::ExportCommand; use crate::error::Result; @@ -116,7 +112,6 @@ impl Command { #[derive(Parser)] enum SubCommand { // Attach(AttachCommand), - Upgrade(UpgradeCommand), Bench(BenchTableMetadataCommand), Export(ExportCommand), } @@ -125,7 +120,6 @@ impl SubCommand { async fn build(&self, guard: Vec) -> Result { match self { // SubCommand::Attach(cmd) => cmd.build().await, - SubCommand::Upgrade(cmd) => cmd.build(guard).await, SubCommand::Bench(cmd) => cmd.build(guard).await, SubCommand::Export(cmd) => cmd.build(guard).await, } diff --git a/src/cmd/src/cli/bench.rs b/src/cmd/src/cli/bench.rs index 7f0acfe378bf..bf5a6825f014 100644 --- a/src/cmd/src/cli/bench.rs +++ b/src/cmd/src/cli/bench.rs @@ -23,13 +23,13 @@ use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; use common_meta::kv_backend::etcd::EtcdStore; use common_meta::peer::Peer; use common_meta::rpc::router::{Region, RegionRoute}; -use common_meta::table_name::TableName; use common_telemetry::info; use datatypes::data_type::ConcreteDataType; use datatypes::schema::{ColumnSchema, RawSchema}; use rand::Rng; use store_api::storage::RegionNumber; use table::metadata::{RawTableInfo, RawTableMeta, TableId, TableIdent, TableType}; +use table::table_name::TableName; use tracing_appender::non_blocking::WorkerGuard; use self::metadata::TableMetadataBencher; diff --git a/src/cmd/src/cli/bench/metadata.rs b/src/cmd/src/cli/bench/metadata.rs index a1009cfe8d6c..9229b0342e88 100644 --- a/src/cmd/src/cli/bench/metadata.rs +++ b/src/cmd/src/cli/bench/metadata.rs @@ -16,7 +16,7 @@ use std::time::Instant; use 
common_meta::key::table_route::TableRouteValue; use common_meta::key::TableMetadataManagerRef; -use common_meta::table_name::TableName; +use table::table_name::TableName; use crate::cli::bench::{ bench_self_recorded, create_region_routes, create_region_wal_options, create_table_info, diff --git a/src/cmd/src/cli/repl.rs b/src/cmd/src/cli/repl.rs index a9e2e21967f9..74184d523985 100644 --- a/src/cmd/src/cli/repl.rs +++ b/src/cmd/src/cli/repl.rs @@ -37,7 +37,7 @@ use query::datafusion::DatafusionQueryEngine; use query::logical_optimizer::LogicalOptimizer; use query::parser::QueryLanguageParser; use query::plan::LogicalPlan; -use query::query_engine::QueryEngineState; +use query::query_engine::{DefaultSerializer, QueryEngineState}; use query::QueryEngine; use rustyline::error::ReadlineError; use rustyline::Editor; @@ -185,7 +185,7 @@ impl Repl { .context(PlanStatementSnafu)?; let plan = DFLogicalSubstraitConvertor {} - .encode(&plan) + .encode(&plan, DefaultSerializer) .context(SubstraitEncodeLogicalPlanSnafu)?; self.database.logical_plan(plan.to_vec()).await @@ -277,24 +277,12 @@ async fn create_query_engine(meta_addr: &str) -> Result { .build(), ); - let table_cache = layered_cache_registry - .get() - .context(error::CacheRequiredSnafu { - name: TABLE_CACHE_NAME, - })?; - let table_route_cache = layered_cache_registry - .get() - .context(error::CacheRequiredSnafu { - name: TABLE_ROUTE_CACHE_NAME, - })?; let catalog_manager = KvBackendCatalogManager::new( Mode::Distributed, Some(meta_client.clone()), cached_meta_backend.clone(), - table_cache, - table_route_cache, - ) - .await; + layered_cache_registry, + ); let plugins: Plugins = Default::default(); let state = Arc::new(QueryEngineState::new( catalog_manager, diff --git a/src/cmd/src/cli/upgrade.rs b/src/cmd/src/cli/upgrade.rs deleted file mode 100644 index f6f0d525c152..000000000000 --- a/src/cmd/src/cli/upgrade.rs +++ /dev/null @@ -1,584 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use std::collections::HashMap; -use std::sync::Arc; - -use async_trait::async_trait; -use clap::Parser; -use client::api::v1::meta::TableRouteValue; -use common_meta::ddl::utils::region_storage_path; -use common_meta::error as MetaError; -use common_meta::key::catalog_name::{CatalogNameKey, CatalogNameValue}; -use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue, RegionInfo}; -use common_meta::key::schema_name::{SchemaNameKey, SchemaNameValue}; -use common_meta::key::table_info::{TableInfoKey, TableInfoValue}; -use common_meta::key::table_name::{TableNameKey, TableNameValue}; -use common_meta::key::table_region::{TableRegionKey, TableRegionValue}; -use common_meta::key::table_route::{TableRouteKey, TableRouteValue as NextTableRouteValue}; -use common_meta::key::{MetaKey, RegionDistribution, TableMetaValue}; -use common_meta::kv_backend::etcd::EtcdStore; -use common_meta::kv_backend::KvBackendRef; -use common_meta::range_stream::PaginationStream; -use common_meta::rpc::router::TableRoute; -use common_meta::rpc::store::{BatchDeleteRequest, BatchPutRequest, PutRequest, RangeRequest}; -use common_meta::rpc::KeyValue; -use common_meta::util::get_prefix_end_key; -use common_telemetry::info; -use etcd_client::Client; -use futures::TryStreamExt; -use prost::Message; -use snafu::ResultExt; -use tracing_appender::non_blocking::WorkerGuard; -use v1_helper::{CatalogKey as v1CatalogKey, SchemaKey as v1SchemaKey, TableGlobalValue}; - -use crate::cli::{Instance, Tool}; -use crate::error::{self, ConnectEtcdSnafu, Result}; - -#[derive(Debug, Default, Parser)] -pub struct UpgradeCommand { - #[clap(long)] - etcd_addr: String, - #[clap(long)] - dryrun: bool, - - #[clap(long)] - skip_table_global_keys: bool, - #[clap(long)] - skip_catalog_keys: bool, - #[clap(long)] - skip_schema_keys: bool, - #[clap(long)] - skip_table_route_keys: bool, -} - -impl UpgradeCommand { - pub async fn build(&self, guard: Vec) -> Result { - let client = Client::connect([&self.etcd_addr], None) - .await - .context(ConnectEtcdSnafu { - etcd_addr: &self.etcd_addr, - })?; - let tool = MigrateTableMetadata { - etcd_store: EtcdStore::with_etcd_client(client, 128), - dryrun: self.dryrun, - skip_catalog_keys: self.skip_catalog_keys, - skip_table_global_keys: self.skip_table_global_keys, - skip_schema_keys: self.skip_schema_keys, - skip_table_route_keys: self.skip_table_route_keys, - }; - Ok(Instance::new(Box::new(tool), guard)) - } -} - -struct MigrateTableMetadata { - etcd_store: KvBackendRef, - dryrun: bool, - - skip_table_global_keys: bool, - - skip_catalog_keys: bool, - - skip_schema_keys: bool, - - skip_table_route_keys: bool, -} - -#[async_trait] -impl Tool for MigrateTableMetadata { - // migrates database's metadata from 0.3 to 0.4. 
- async fn do_work(&self) -> Result<()> { - if !self.skip_table_global_keys { - self.migrate_table_global_values().await?; - } - if !self.skip_catalog_keys { - self.migrate_catalog_keys().await?; - } - if !self.skip_schema_keys { - self.migrate_schema_keys().await?; - } - if !self.skip_table_route_keys { - self.migrate_table_route_keys().await?; - } - Ok(()) - } -} - -const PAGE_SIZE: usize = 1000; - -impl MigrateTableMetadata { - async fn migrate_table_route_keys(&self) -> Result<()> { - let key = b"__meta_table_route".to_vec(); - let range_end = get_prefix_end_key(&key); - let mut keys = Vec::new(); - info!("Start scanning key from: {}", String::from_utf8_lossy(&key)); - - let mut stream = PaginationStream::new( - self.etcd_store.clone(), - RangeRequest::new().with_range(key, range_end), - PAGE_SIZE, - Arc::new(|kv: KeyValue| { - let value = - TableRouteValue::decode(&kv.value[..]).context(MetaError::DecodeProtoSnafu)?; - Ok((kv.key, value)) - }), - ); - - while let Some((key, value)) = stream.try_next().await.context(error::IterStreamSnafu)? { - let table_id = self.migrate_table_route_key(value).await?; - keys.push(key); - keys.push(TableRegionKey::new(table_id).to_bytes()) - } - - info!("Total migrated TableRouteKeys: {}", keys.len() / 2); - self.delete_migrated_keys(keys).await; - - Ok(()) - } - - async fn migrate_table_route_key(&self, value: TableRouteValue) -> Result { - let table_route = TableRoute::try_from_raw( - &value.peers, - value.table_route.expect("expected table_route"), - ) - .unwrap(); - - let new_table_value = NextTableRouteValue::physical(table_route.region_routes); - - let table_id = table_route.table.id as u32; - let new_key = TableRouteKey::new(table_id); - info!("Creating '{new_key}'"); - - if self.dryrun { - info!("Dryrun: do nothing"); - } else { - self.etcd_store - .put( - PutRequest::new() - .with_key(new_key.to_bytes()) - .with_value(new_table_value.try_as_raw_value().unwrap()), - ) - .await - .unwrap(); - } - - Ok(table_id) - } - - async fn migrate_schema_keys(&self) -> Result<()> { - // The schema key prefix. - let key = b"__s".to_vec(); - let range_end = get_prefix_end_key(&key); - - let mut keys = Vec::new(); - info!("Start scanning key from: {}", String::from_utf8_lossy(&key)); - let mut stream = PaginationStream::new( - self.etcd_store.clone(), - RangeRequest::new().with_range(key, range_end), - PAGE_SIZE, - Arc::new(|kv: KeyValue| { - let key_str = - std::str::from_utf8(&kv.key).context(MetaError::ConvertRawKeySnafu)?; - let key = v1SchemaKey::parse(key_str) - .unwrap_or_else(|e| panic!("schema key is corrupted: {e}, key: {key_str}")); - - Ok(key) - }), - ); - while let Some(key) = stream.try_next().await.context(error::IterStreamSnafu)? { - let _ = self.migrate_schema_key(&key).await; - keys.push(key.to_string().as_bytes().to_vec()); - } - info!("Total migrated SchemaKeys: {}", keys.len()); - self.delete_migrated_keys(keys).await; - - Ok(()) - } - - async fn migrate_schema_key(&self, key: &v1SchemaKey) -> Result<()> { - let new_key = SchemaNameKey::new(&key.catalog_name, &key.schema_name); - let schema_name_value = SchemaNameValue::default(); - - info!("Creating '{new_key}'"); - - if self.dryrun { - info!("Dryrun: do nothing"); - } else { - self.etcd_store - .put( - PutRequest::new() - .with_key(new_key.to_bytes()) - .with_value(schema_name_value.try_as_raw_value().unwrap()), - ) - .await - .unwrap(); - } - - Ok(()) - } - - async fn migrate_catalog_keys(&self) -> Result<()> { - // The catalog key prefix. 
- let key = b"__c".to_vec(); - let range_end = get_prefix_end_key(&key); - - let mut keys = Vec::new(); - info!("Start scanning key from: {}", String::from_utf8_lossy(&key)); - let mut stream = PaginationStream::new( - self.etcd_store.clone(), - RangeRequest::new().with_range(key, range_end), - PAGE_SIZE, - Arc::new(|kv: KeyValue| { - let key_str = - std::str::from_utf8(&kv.key).context(MetaError::ConvertRawKeySnafu)?; - let key = v1CatalogKey::parse(key_str) - .unwrap_or_else(|e| panic!("catalog key is corrupted: {e}, key: {key_str}")); - - Ok(key) - }), - ); - while let Some(key) = stream.try_next().await.context(error::IterStreamSnafu)? { - let _ = self.migrate_catalog_key(&key).await; - keys.push(key.to_string().as_bytes().to_vec()); - } - info!("Total migrated CatalogKeys: {}", keys.len()); - self.delete_migrated_keys(keys).await; - - Ok(()) - } - - async fn migrate_catalog_key(&self, key: &v1CatalogKey) { - let new_key = CatalogNameKey::new(&key.catalog_name); - let catalog_name_value = CatalogNameValue; - - info!("Creating '{new_key}'"); - - if self.dryrun { - info!("Dryrun: do nothing"); - } else { - self.etcd_store - .put( - PutRequest::new() - .with_key(new_key.to_bytes()) - .with_value(catalog_name_value.try_as_raw_value().unwrap()), - ) - .await - .unwrap(); - } - } - - async fn migrate_table_global_values(&self) -> Result<()> { - let key = b"__tg".to_vec(); - let range_end = get_prefix_end_key(&key); - - let mut keys = Vec::new(); - - info!("Start scanning key from: {}", String::from_utf8_lossy(&key)); - let mut stream = PaginationStream::new( - self.etcd_store.clone(), - RangeRequest::new().with_range(key, range_end.clone()), - PAGE_SIZE, - Arc::new(|kv: KeyValue| { - let key = String::from_utf8_lossy(kv.key()).to_string(); - let value = TableGlobalValue::from_bytes(kv.value()) - .unwrap_or_else(|e| panic!("table global value is corrupted: {e}, key: {key}")); - - Ok((key, value)) - }), - ); - while let Some((key, value)) = stream.try_next().await.context(error::IterStreamSnafu)? 
{ - self.create_table_name_key(&value).await; - - self.create_datanode_table_keys(&value).await; - - self.split_table_global_value(&key, value).await; - - keys.push(key.as_bytes().to_vec()); - } - - info!("Total migrated TableGlobalKeys: {}", keys.len()); - self.delete_migrated_keys(keys).await; - - Ok(()) - } - - async fn delete_migrated_keys(&self, keys: Vec>) { - for keys in keys.chunks(PAGE_SIZE) { - info!("Deleting {} keys", keys.len()); - let req = BatchDeleteRequest { - keys: keys.to_vec(), - prev_kv: false, - }; - if self.dryrun { - info!("Dryrun: do nothing"); - } else { - self.etcd_store.batch_delete(req).await.unwrap(); - } - } - } - - async fn split_table_global_value(&self, key: &str, value: TableGlobalValue) { - let table_id = value.table_id(); - let region_distribution: RegionDistribution = value.regions_id_map.into_iter().collect(); - - let table_info_key = TableInfoKey::new(table_id); - let table_info_value = TableInfoValue::new(value.table_info); - - let table_region_key = TableRegionKey::new(table_id); - let table_region_value = TableRegionValue::new(region_distribution); - - info!("Splitting TableGlobalKey '{key}' into '{table_info_key}' and '{table_region_key}'"); - - if self.dryrun { - info!("Dryrun: do nothing"); - } else { - self.etcd_store - .batch_put( - BatchPutRequest::new() - .add_kv( - table_info_key.to_bytes(), - table_info_value.try_as_raw_value().unwrap(), - ) - .add_kv( - table_region_key.to_bytes(), - table_region_value.try_as_raw_value().unwrap(), - ), - ) - .await - .unwrap(); - } - } - - async fn create_table_name_key(&self, value: &TableGlobalValue) { - let table_info = &value.table_info; - let table_id = value.table_id(); - - let table_name_key = TableNameKey::new( - &table_info.catalog_name, - &table_info.schema_name, - &table_info.name, - ); - let table_name_value = TableNameValue::new(table_id); - - info!("Creating '{table_name_key}' => {table_id}"); - - if self.dryrun { - info!("Dryrun: do nothing"); - } else { - self.etcd_store - .put( - PutRequest::new() - .with_key(table_name_key.to_bytes()) - .with_value(table_name_value.try_as_raw_value().unwrap()), - ) - .await - .unwrap(); - } - } - - async fn create_datanode_table_keys(&self, value: &TableGlobalValue) { - let table_id = value.table_id(); - let engine = value.table_info.meta.engine.as_str(); - let region_storage_path = region_storage_path( - &value.table_info.catalog_name, - &value.table_info.schema_name, - ); - let region_distribution: RegionDistribution = - value.regions_id_map.clone().into_iter().collect(); - - // TODO(niebayes): properly fetch or construct wal options. 
- let region_wal_options = HashMap::default(); - - let datanode_table_kvs = region_distribution - .into_iter() - .map(|(datanode_id, regions)| { - let k = DatanodeTableKey::new(datanode_id, table_id); - info!("Creating DatanodeTableKey '{k}' => {regions:?}"); - ( - k, - DatanodeTableValue::new( - table_id, - regions, - RegionInfo { - engine: engine.to_string(), - region_storage_path: region_storage_path.clone(), - region_options: (&value.table_info.meta.options).into(), - region_wal_options: region_wal_options.clone(), - }, - ), - ) - }) - .collect::>(); - - if self.dryrun { - info!("Dryrun: do nothing"); - } else { - let mut req = BatchPutRequest::new(); - for (key, value) in datanode_table_kvs { - req = req.add_kv(key.to_bytes(), value.try_as_raw_value().unwrap()); - } - self.etcd_store.batch_put(req).await.unwrap(); - } - } -} - -#[deprecated(since = "0.4.0", note = "Used for migrate old version(v0.3) metadata")] -mod v1_helper { - use std::collections::HashMap; - use std::fmt::{Display, Formatter}; - - use err::{DeserializeCatalogEntryValueSnafu, Error, InvalidCatalogSnafu}; - use lazy_static::lazy_static; - use regex::Regex; - use serde::{Deserialize, Serialize}; - use snafu::{ensure, OptionExt, ResultExt}; - use table::metadata::{RawTableInfo, TableId}; - - pub const CATALOG_KEY_PREFIX: &str = "__c"; - pub const SCHEMA_KEY_PREFIX: &str = "__s"; - - /// The pattern of a valid catalog, schema or table name. - const NAME_PATTERN: &str = "[a-zA-Z_:][a-zA-Z0-9_:]*"; - - lazy_static! { - static ref CATALOG_KEY_PATTERN: Regex = - Regex::new(&format!("^{CATALOG_KEY_PREFIX}-({NAME_PATTERN})$")).unwrap(); - } - - lazy_static! { - static ref SCHEMA_KEY_PATTERN: Regex = Regex::new(&format!( - "^{SCHEMA_KEY_PREFIX}-({NAME_PATTERN})-({NAME_PATTERN})$" - )) - .unwrap(); - } - - /// Table global info contains necessary info for a datanode to create table regions, including - /// table id, table meta(schema...), region id allocation across datanodes. - #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] - pub struct TableGlobalValue { - /// Id of datanode that created the global table info kv. only for debugging. - pub node_id: u64, - /// Allocation of region ids across all datanodes. 
- pub regions_id_map: HashMap>, - pub table_info: RawTableInfo, - } - - impl TableGlobalValue { - pub fn table_id(&self) -> TableId { - self.table_info.ident.table_id - } - } - - pub struct CatalogKey { - pub catalog_name: String, - } - - impl Display for CatalogKey { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.write_str(CATALOG_KEY_PREFIX)?; - f.write_str("-")?; - f.write_str(&self.catalog_name) - } - } - - impl CatalogKey { - pub fn parse(s: impl AsRef) -> Result { - let key = s.as_ref(); - let captures = CATALOG_KEY_PATTERN - .captures(key) - .context(InvalidCatalogSnafu { key })?; - ensure!(captures.len() == 2, InvalidCatalogSnafu { key }); - Ok(Self { - catalog_name: captures[1].to_string(), - }) - } - } - - #[derive(Debug, Serialize, Deserialize)] - pub struct CatalogValue; - - pub struct SchemaKey { - pub catalog_name: String, - pub schema_name: String, - } - - impl Display for SchemaKey { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.write_str(SCHEMA_KEY_PREFIX)?; - f.write_str("-")?; - f.write_str(&self.catalog_name)?; - f.write_str("-")?; - f.write_str(&self.schema_name) - } - } - - impl SchemaKey { - pub fn parse(s: impl AsRef) -> Result { - let key = s.as_ref(); - let captures = SCHEMA_KEY_PATTERN - .captures(key) - .context(InvalidCatalogSnafu { key })?; - ensure!(captures.len() == 3, InvalidCatalogSnafu { key }); - Ok(Self { - catalog_name: captures[1].to_string(), - schema_name: captures[2].to_string(), - }) - } - } - - #[derive(Debug, Serialize, Deserialize)] - pub struct SchemaValue; - - macro_rules! define_catalog_value { - ( $($val_ty: ty), *) => { - $( - impl $val_ty { - pub fn parse(s: impl AsRef) -> Result { - serde_json::from_str(s.as_ref()) - .context(DeserializeCatalogEntryValueSnafu { raw: s.as_ref() }) - } - - pub fn from_bytes(bytes: impl AsRef<[u8]>) -> Result { - Self::parse(&String::from_utf8_lossy(bytes.as_ref())) - } - } - )* - } - } - - define_catalog_value!(TableGlobalValue); - - mod err { - use snafu::{Location, Snafu}; - - #[derive(Debug, Snafu)] - #[snafu(visibility(pub))] - pub enum Error { - #[snafu(display("Invalid catalog info: {}", key))] - InvalidCatalog { - key: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Failed to deserialize catalog entry value: {}", raw))] - DeserializeCatalogEntryValue { - raw: String, - #[snafu(implicit)] - location: Location, - source: serde_json::error::Error, - }, - } - } -} diff --git a/src/cmd/src/error.rs b/src/cmd/src/error.rs index a2a880fa6c1d..fa5371545fcb 100644 --- a/src/cmd/src/error.rs +++ b/src/cmd/src/error.rs @@ -375,11 +375,11 @@ impl ErrorExt for Error { Error::SerdeJson { .. } | Error::FileIo { .. } => StatusCode::Unexpected, - Error::CacheRequired { .. } | Error::BuildCacheRegistry { .. } => StatusCode::Internal, - Error::Other { source, .. } => source.status_code(), Error::BuildRuntime { source, .. } => source.status_code(), + + Error::CacheRequired { .. } | Error::BuildCacheRegistry { .. 
} => StatusCode::Internal, } } diff --git a/src/cmd/src/frontend.rs b/src/cmd/src/frontend.rs index a2dc2c6fd9ae..a3e744e9c7ec 100644 --- a/src/cmd/src/frontend.rs +++ b/src/cmd/src/frontend.rs @@ -16,10 +16,7 @@ use std::sync::Arc; use std::time::Duration; use async_trait::async_trait; -use cache::{ - build_fundamental_cache_registry, with_default_composite_cache_registry, TABLE_CACHE_NAME, - TABLE_ROUTE_CACHE_NAME, -}; +use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry}; use catalog::kvbackend::{CachedMetaKvBackendBuilder, KvBackendCatalogManager, MetaKvBackend}; use clap::Parser; use client::client_manager::DatanodeClients; @@ -302,25 +299,12 @@ impl StartCommand { .build(), ); - let table_cache = layered_cache_registry - .get() - .context(error::CacheRequiredSnafu { - name: TABLE_CACHE_NAME, - })?; - let table_route_cache = - layered_cache_registry - .get() - .context(error::CacheRequiredSnafu { - name: TABLE_ROUTE_CACHE_NAME, - })?; let catalog_manager = KvBackendCatalogManager::new( opts.mode, Some(meta_client.clone()), cached_meta_backend.clone(), - table_cache, - table_route_cache, - ) - .await; + layered_cache_registry.clone(), + ); let executor = HandlerGroupExecutor::new(vec![ Arc::new(ParseMailboxMessageHandler), diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs index 01fe22d64ad2..90958baf1048 100644 --- a/src/cmd/src/standalone.rs +++ b/src/cmd/src/standalone.rs @@ -16,10 +16,7 @@ use std::sync::Arc; use std::{fs, path}; use async_trait::async_trait; -use cache::{ - build_fundamental_cache_registry, with_default_composite_cache_registry, TABLE_CACHE_NAME, - TABLE_ROUTE_CACHE_NAME, -}; +use cache::{build_fundamental_cache_registry, with_default_composite_cache_registry}; use catalog::kvbackend::KvBackendCatalogManager; use clap::Parser; use common_catalog::consts::{MIN_USER_FLOW_ID, MIN_USER_TABLE_ID}; @@ -61,14 +58,14 @@ use servers::export_metrics::ExportMetricsOption; use servers::http::HttpOptions; use servers::tls::{TlsMode, TlsOption}; use servers::Mode; -use snafu::{OptionExt, ResultExt}; +use snafu::ResultExt; use tracing_appender::non_blocking::WorkerGuard; use crate::error::{ - BuildCacheRegistrySnafu, CacheRequiredSnafu, CreateDirSnafu, IllegalConfigSnafu, - InitDdlManagerSnafu, InitMetadataSnafu, InitTimezoneSnafu, LoadLayeredConfigSnafu, Result, - ShutdownDatanodeSnafu, ShutdownFrontendSnafu, StartDatanodeSnafu, StartFrontendSnafu, - StartProcedureManagerSnafu, StartWalOptionsAllocatorSnafu, StopProcedureManagerSnafu, + BuildCacheRegistrySnafu, CreateDirSnafu, IllegalConfigSnafu, InitDdlManagerSnafu, + InitMetadataSnafu, InitTimezoneSnafu, LoadLayeredConfigSnafu, Result, ShutdownDatanodeSnafu, + ShutdownFrontendSnafu, StartDatanodeSnafu, StartFrontendSnafu, StartProcedureManagerSnafu, + StartWalOptionsAllocatorSnafu, StopProcedureManagerSnafu, }; use crate::options::GlobalOptions; use crate::{log_versions, App}; @@ -421,20 +418,12 @@ impl StartCommand { .build(), ); - let table_cache = layered_cache_registry.get().context(CacheRequiredSnafu { - name: TABLE_CACHE_NAME, - })?; - let table_route_cache = layered_cache_registry.get().context(CacheRequiredSnafu { - name: TABLE_ROUTE_CACHE_NAME, - })?; let catalog_manager = KvBackendCatalogManager::new( dn_opts.mode, None, kv_backend.clone(), - table_cache, - table_route_cache, - ) - .await; + layered_cache_registry.clone(), + ); let table_metadata_manager = Self::create_table_metadata_manager(kv_backend.clone()).await?; diff --git 
a/src/common/function/src/scalars/math/clamp.rs b/src/common/function/src/scalars/math/clamp.rs index 58a2dcefd403..dc73aed15845 100644 --- a/src/common/function/src/scalars/math/clamp.rs +++ b/src/common/function/src/scalars/math/clamp.rs @@ -143,8 +143,6 @@ fn clamp_impl Result { - common_telemetry::info!("[DEBUG] min {min:?}, max {max:?}"); - let iter = ArrayIter::new(input); let result = iter.map(|x| { x.map(|x| { diff --git a/src/common/meta/src/cache.rs b/src/common/meta/src/cache.rs index b7d13a6f0ec0..52dae1a094af 100644 --- a/src/common/meta/src/cache.rs +++ b/src/common/meta/src/cache.rs @@ -24,7 +24,7 @@ pub use registry::{ LayeredCacheRegistryBuilder, LayeredCacheRegistryRef, }; pub use table::{ - new_table_info_cache, new_table_name_cache, new_table_route_cache, TableInfoCache, - TableInfoCacheRef, TableNameCache, TableNameCacheRef, TableRoute, TableRouteCache, - TableRouteCacheRef, + new_table_info_cache, new_table_name_cache, new_table_route_cache, new_view_info_cache, + TableInfoCache, TableInfoCacheRef, TableNameCache, TableNameCacheRef, TableRoute, + TableRouteCache, TableRouteCacheRef, ViewInfoCache, ViewInfoCacheRef, }; diff --git a/src/common/meta/src/cache/flow/table_flownode.rs b/src/common/meta/src/cache/flow/table_flownode.rs index eeaa88128628..faf62b8c36f6 100644 --- a/src/common/meta/src/cache/flow/table_flownode.rs +++ b/src/common/meta/src/cache/flow/table_flownode.rs @@ -145,13 +145,13 @@ mod tests { use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use moka::future::CacheBuilder; + use table::table_name::TableName; use crate::cache::flow::table_flownode::new_table_flownode_set_cache; use crate::instruction::{CacheIdent, CreateFlow, DropFlow}; use crate::key::flow::flow_info::FlowInfoValue; use crate::key::flow::FlowMetadataManager; use crate::kv_backend::memory::MemoryKvBackend; - use crate::table_name::TableName; #[tokio::test] async fn test_cache_empty_set() { diff --git a/src/common/meta/src/cache/table.rs b/src/common/meta/src/cache/table.rs index fa3bcbd30994..82a3ad98df33 100644 --- a/src/common/meta/src/cache/table.rs +++ b/src/common/meta/src/cache/table.rs @@ -15,6 +15,9 @@ mod table_info; mod table_name; mod table_route; +mod view_info; + pub use table_info::{new_table_info_cache, TableInfoCache, TableInfoCacheRef}; pub use table_name::{new_table_name_cache, TableNameCache, TableNameCacheRef}; pub use table_route::{new_table_route_cache, TableRoute, TableRouteCache, TableRouteCacheRef}; +pub use view_info::{new_view_info_cache, ViewInfoCache, ViewInfoCacheRef}; diff --git a/src/common/meta/src/cache/table/table_name.rs b/src/common/meta/src/cache/table/table_name.rs index 0ec88a2d6e9d..926e4de66f63 100644 --- a/src/common/meta/src/cache/table/table_name.rs +++ b/src/common/meta/src/cache/table/table_name.rs @@ -18,6 +18,7 @@ use futures::future::BoxFuture; use moka::future::Cache; use snafu::OptionExt; use table::metadata::TableId; +use table::table_name::TableName; use crate::cache::{CacheContainer, Initializer}; use crate::error; @@ -25,7 +26,6 @@ use crate::error::Result; use crate::instruction::CacheIdent; use crate::key::table_name::{TableNameKey, TableNameManager, TableNameManagerRef}; use crate::kv_backend::KvBackendRef; -use crate::table_name::TableName; /// [TableNameCache] caches the [TableName] to [TableId] mapping. 
pub type TableNameCache = CacheContainer; diff --git a/src/common/meta/src/cache/table/view_info.rs b/src/common/meta/src/cache/table/view_info.rs new file mode 100644 index 000000000000..cd9d29b2ca4d --- /dev/null +++ b/src/common/meta/src/cache/table/view_info.rs @@ -0,0 +1,143 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use futures::future::BoxFuture; +use moka::future::Cache; +use snafu::OptionExt; +use store_api::storage::TableId; + +use crate::cache::{CacheContainer, Initializer}; +use crate::error; +use crate::error::Result; +use crate::instruction::CacheIdent; +use crate::key::view_info::{ViewInfoManager, ViewInfoManagerRef, ViewInfoValue}; +use crate::kv_backend::KvBackendRef; + +/// [ViewInfoCache] caches the [TableId] to [ViewInfoValue] mapping. +pub type ViewInfoCache = CacheContainer, CacheIdent>; + +pub type ViewInfoCacheRef = Arc; + +/// Constructs a [ViewInfoCache]. +pub fn new_view_info_cache( + name: String, + cache: Cache>, + kv_backend: KvBackendRef, +) -> ViewInfoCache { + let view_info_manager = Arc::new(ViewInfoManager::new(kv_backend)); + let init = init_factory(view_info_manager); + + CacheContainer::new(name, cache, Box::new(invalidator), init, Box::new(filter)) +} + +fn init_factory(view_info_manager: ViewInfoManagerRef) -> Initializer> { + Arc::new(move |view_id| { + let view_info_manager = view_info_manager.clone(); + Box::pin(async move { + let view_info = view_info_manager + .get(*view_id) + .await? + .context(error::ValueNotExistSnafu {})? 
+ .into_inner(); + + Ok(Some(Arc::new(view_info))) + }) + }) +} + +fn invalidator<'a>( + cache: &'a Cache>, + ident: &'a CacheIdent, +) -> BoxFuture<'a, Result<()>> { + Box::pin(async move { + if let CacheIdent::TableId(table_id) = ident { + cache.invalidate(table_id).await + } + Ok(()) + }) +} + +fn filter(ident: &CacheIdent) -> bool { + matches!(ident, CacheIdent::TableId(_)) +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + use std::sync::Arc; + + use moka::future::CacheBuilder; + use table::table_name::TableName; + + use super::*; + use crate::ddl::tests::create_view::test_create_view_task; + use crate::key::TableMetadataManager; + use crate::kv_backend::memory::MemoryKvBackend; + + #[tokio::test] + async fn test_view_info_cache() { + let mem_kv = Arc::new(MemoryKvBackend::default()); + let table_metadata_manager = TableMetadataManager::new(mem_kv.clone()); + let cache = CacheBuilder::new(128).build(); + let cache = new_view_info_cache("test".to_string(), cache, mem_kv.clone()); + + let result = cache.get(1024).await.unwrap(); + assert!(result.is_none()); + let mut task = test_create_view_task("my_view"); + let table_names = { + let mut set = HashSet::new(); + set.insert(TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "a_table".to_string(), + }); + set.insert(TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "b_table".to_string(), + }); + set + }; + + task.view_info.ident.table_id = 1024; + table_metadata_manager + .create_view_metadata( + task.view_info.clone(), + task.create_view.logical_plan.clone(), + table_names, + ) + .await + .unwrap(); + + let view_info = cache.get(1024).await.unwrap().unwrap(); + assert_eq!(view_info.view_info, task.create_view.logical_plan); + assert_eq!( + view_info.table_names, + task.create_view + .table_names + .iter() + .map(|t| t.clone().into()) + .collect::>() + ); + + assert!(cache.contains_key(&1024)); + cache + .invalidate(&[CacheIdent::TableId(1024)]) + .await + .unwrap(); + assert!(!cache.contains_key(&1024)); + } +} diff --git a/src/common/meta/src/ddl.rs b/src/common/meta/src/ddl.rs index 8d8cb8d5a45d..c00b6df08e6b 100644 --- a/src/common/meta/src/ddl.rs +++ b/src/common/meta/src/ddl.rs @@ -48,7 +48,7 @@ pub mod table_meta; #[cfg(any(test, feature = "testing"))] pub mod test_util; #[cfg(test)] -mod tests; +pub(crate) mod tests; pub mod truncate_table; pub mod utils; diff --git a/src/common/meta/src/ddl/alter_logical_tables/table_cache_keys.rs b/src/common/meta/src/ddl/alter_logical_tables/table_cache_keys.rs index 23cf22e2c02c..15f6bfbd6f33 100644 --- a/src/common/meta/src/ddl/alter_logical_tables/table_cache_keys.rs +++ b/src/common/meta/src/ddl/alter_logical_tables/table_cache_keys.rs @@ -13,10 +13,10 @@ // limitations under the License. 
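The new view_info.rs above plugs views into the same CacheContainer pattern used by the table-info and table-route caches: the initializer loads a ViewInfoValue through ViewInfoManager on a miss, and the invalidator reacts to CacheIdent::TableId. A minimal usage sketch, not taken from this diff; the in-memory backend mirrors the unit test above, and the cache name and capacity are illustrative only.

use std::sync::Arc;

use common_meta::cache::{new_view_info_cache, ViewInfoCacheRef};
use common_meta::instruction::CacheIdent;
use common_meta::kv_backend::memory::MemoryKvBackend;
use moka::future::CacheBuilder;

async fn view_info_cache_example() {
    // An in-memory kv backend stands in for the real meta backend here.
    let kv_backend = Arc::new(MemoryKvBackend::default());

    let cache: ViewInfoCacheRef = Arc::new(new_view_info_cache(
        "view_info_cache".to_string(),
        CacheBuilder::new(128).build(),
        kv_backend,
    ));

    // A miss runs the initializer, which reads the view metadata from the kv
    // backend; an unknown view id simply yields None.
    assert!(cache.get(1024).await.unwrap().is_none());

    // DDL procedures publish CacheIdent::TableId(view_id) so that every node
    // holding this cache drops the stale logical plan.
    cache.invalidate(&[CacheIdent::TableId(1024)]).await.unwrap();
}

The create-view and drop-database hunks further down publish this table-id ident (together with the table-name ident), which is what keeps a cached view plan consistent after a CREATE OR REPLACE VIEW or a DROP DATABASE.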
use table::metadata::RawTableInfo; +use table::table_name::TableName; use crate::ddl::alter_logical_tables::AlterLogicalTablesProcedure; use crate::instruction::CacheIdent; -use crate::table_name::TableName; impl AlterLogicalTablesProcedure { pub(crate) fn build_table_cache_keys_to_invalidate(&self) -> Vec { diff --git a/src/common/meta/src/ddl/create_logical_tables/update_metadata.rs b/src/common/meta/src/ddl/create_logical_tables/update_metadata.rs index 61ec611f850f..0309a046138f 100644 --- a/src/common/meta/src/ddl/create_logical_tables/update_metadata.rs +++ b/src/common/meta/src/ddl/create_logical_tables/update_metadata.rs @@ -18,13 +18,13 @@ use common_telemetry::{info, warn}; use itertools::Itertools; use snafu::OptionExt; use table::metadata::TableId; +use table::table_name::TableName; use crate::cache_invalidator::Context; use crate::ddl::create_logical_tables::CreateLogicalTablesProcedure; use crate::ddl::physical_table_metadata; use crate::error::{Result, TableInfoNotFoundSnafu}; use crate::instruction::CacheIdent; -use crate::table_name::TableName; impl CreateLogicalTablesProcedure { pub(crate) async fn update_physical_table_metadata(&mut self) -> Result<()> { diff --git a/src/common/meta/src/ddl/create_view.rs b/src/common/meta/src/ddl/create_view.rs index 5d364ba77417..fa7a115d4d8a 100644 --- a/src/common/meta/src/ddl/create_view.rs +++ b/src/common/meta/src/ddl/create_view.rs @@ -22,9 +22,11 @@ use strum::AsRefStr; use table::metadata::{RawTableInfo, TableId, TableType}; use table::table_reference::TableReference; +use crate::cache_invalidator::Context; use crate::ddl::utils::handle_retry_error; use crate::ddl::{DdlContext, TableMetadata, TableMetadataAllocatorContext}; use crate::error::{self, Result}; +use crate::instruction::CacheIdent; use crate::key::table_name::TableNameKey; use crate::lock_key::{CatalogLock, SchemaLock, TableNameLock}; use crate::rpc::ddl::CreateViewTask; @@ -157,6 +159,25 @@ impl CreateViewProcedure { Ok(Status::executing(true)) } + async fn invalidate_view_cache(&self) -> Result<()> { + let cache_invalidator = &self.context.cache_invalidator; + let ctx = Context { + subject: Some("Invalidate view cache by creating view".to_string()), + }; + + cache_invalidator + .invalidate( + &ctx, + &[ + CacheIdent::TableName(self.data.table_ref().into()), + CacheIdent::TableId(self.view_id()), + ], + ) + .await?; + + Ok(()) + } + /// Creates view metadata /// /// Abort(not-retry): @@ -175,15 +196,21 @@ impl CreateViewProcedure { view_name: self.data.table_ref().to_string(), })?; let new_logical_plan = self.data.task.raw_logical_plan().clone(); + let table_names = self.data.task.table_names(); + manager - .update_view_info(view_id, ¤t_view_info, new_logical_plan) + .update_view_info(view_id, ¤t_view_info, new_logical_plan, table_names) .await?; info!("Updated view metadata for view {view_id}"); } else { let raw_view_info = self.view_info().clone(); manager - .create_view_metadata(raw_view_info, self.data.task.raw_logical_plan()) + .create_view_metadata( + raw_view_info, + self.data.task.raw_logical_plan().clone(), + self.data.task.table_names(), + ) .await?; info!( @@ -191,6 +218,7 @@ impl CreateViewProcedure { ctx.procedure_id ); } + self.invalidate_view_cache().await?; Ok(Status::done_with_output(view_id)) } diff --git a/src/common/meta/src/ddl/drop_database/cursor.rs b/src/common/meta/src/ddl/drop_database/cursor.rs index 7e1cb05bb98d..c3dd8a582684 100644 --- a/src/common/meta/src/ddl/drop_database/cursor.rs +++ 
b/src/common/meta/src/ddl/drop_database/cursor.rs @@ -14,19 +14,23 @@ use std::any::Any; +use common_catalog::format_full_table_name; use common_procedure::Status; use futures::TryStreamExt; use serde::{Deserialize, Serialize}; -use table::metadata::TableId; +use snafu::OptionExt; +use table::metadata::{TableId, TableType}; +use table::table_name::TableName; use super::executor::DropDatabaseExecutor; use super::metadata::DropDatabaseRemoveMetadata; use super::DropTableTarget; +use crate::cache_invalidator::Context; use crate::ddl::drop_database::{DropDatabaseContext, State}; use crate::ddl::DdlContext; -use crate::error::Result; +use crate::error::{Result, TableInfoNotFoundSnafu}; +use crate::instruction::CacheIdent; use crate::key::table_route::TableRouteValue; -use crate::table_name::TableName; #[derive(Debug, Serialize, Deserialize)] pub(crate) struct DropDatabaseCursor { @@ -101,6 +105,40 @@ impl DropDatabaseCursor { )), } } + + async fn handle_view( + &self, + ddl_ctx: &DdlContext, + ctx: &mut DropDatabaseContext, + table_name: String, + table_id: TableId, + ) -> Result<(Box, Status)> { + let view_name = TableName::new(&ctx.catalog, &ctx.schema, &table_name); + ddl_ctx + .table_metadata_manager + .destroy_view_info(table_id, &view_name) + .await?; + + let cache_invalidator = &ddl_ctx.cache_invalidator; + let ctx = Context { + subject: Some("Invalidate table cache by dropping table".to_string()), + }; + + cache_invalidator + .invalidate( + &ctx, + &[ + CacheIdent::TableName(view_name), + CacheIdent::TableId(table_id), + ], + ) + .await?; + + Ok(( + Box::new(DropDatabaseCursor::new(self.target)), + Status::executing(false), + )) + } } #[async_trait::async_trait] @@ -122,6 +160,20 @@ impl State for DropDatabaseCursor { match ctx.tables.as_mut().unwrap().try_next().await? { Some((table_name, table_name_value)) => { let table_id = table_name_value.table_id(); + + let table_info_value = ddl_ctx + .table_metadata_manager + .table_info_manager() + .get(table_id) + .await? 
+ .with_context(|| TableInfoNotFoundSnafu { + table: format_full_table_name(&ctx.catalog, &ctx.schema, &table_name), + })?; + + if table_info_value.table_info.table_type == TableType::View { + return self.handle_view(ddl_ctx, ctx, table_name, table_id).await; + } + match ddl_ctx .table_metadata_manager .table_route_manager() diff --git a/src/common/meta/src/ddl/drop_database/executor.rs b/src/common/meta/src/ddl/drop_database/executor.rs index 48b840e8d9bf..f3a7f9a9fff5 100644 --- a/src/common/meta/src/ddl/drop_database/executor.rs +++ b/src/common/meta/src/ddl/drop_database/executor.rs @@ -19,6 +19,7 @@ use common_telemetry::info; use serde::{Deserialize, Serialize}; use snafu::OptionExt; use table::metadata::TableId; +use table::table_name::TableName; use super::cursor::DropDatabaseCursor; use super::{DropDatabaseContext, DropTableTarget}; @@ -29,7 +30,6 @@ use crate::error::{self, Result}; use crate::key::table_route::TableRouteValue; use crate::region_keeper::OperatingRegionGuard; use crate::rpc::router::{operating_leader_regions, RegionRoute}; -use crate::table_name::TableName; #[derive(Debug, Serialize, Deserialize)] pub(crate) struct DropDatabaseExecutor { @@ -135,6 +135,7 @@ mod tests { use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_error::ext::BoxedError; use common_recordbatch::SendableRecordBatchStream; + use table::table_name::TableName; use crate::ddl::drop_database::cursor::DropDatabaseCursor; use crate::ddl::drop_database::executor::DropDatabaseExecutor; @@ -144,7 +145,6 @@ mod tests { use crate::key::datanode_table::DatanodeTableKey; use crate::peer::Peer; use crate::rpc::router::region_distribution; - use crate::table_name::TableName; use crate::test_util::{new_ddl_context, MockDatanodeHandler, MockDatanodeManager}; #[derive(Clone)] diff --git a/src/common/meta/src/ddl/drop_table/executor.rs b/src/common/meta/src/ddl/drop_table/executor.rs index aa41d03c6597..0783ce86ccaf 100644 --- a/src/common/meta/src/ddl/drop_table/executor.rs +++ b/src/common/meta/src/ddl/drop_table/executor.rs @@ -23,6 +23,7 @@ use futures::future::join_all; use snafu::ensure; use store_api::storage::RegionId; use table::metadata::TableId; +use table::table_name::TableName; use crate::cache_invalidator::Context; use crate::ddl::utils::add_peer_context_if_needed; @@ -32,7 +33,6 @@ use crate::instruction::CacheIdent; use crate::key::table_name::TableNameKey; use crate::key::table_route::TableRouteValue; use crate::rpc::router::{find_leader_regions, find_leaders, RegionRoute}; -use crate::table_name::TableName; /// [Control] indicated to the caller whether to go to the next step. 
#[derive(Debug)] @@ -224,6 +224,7 @@ mod tests { use api::v1::{ColumnDataType, SemanticType}; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use table::metadata::RawTableInfo; + use table::table_name::TableName; use super::*; use crate::ddl::test_util::columns::TestColumnDefBuilder; @@ -231,7 +232,6 @@ mod tests { build_raw_table_info_from_expr, TestCreateTableExprBuilder, }; use crate::key::table_route::TableRouteValue; - use crate::table_name::TableName; use crate::test_util::{new_ddl_context, MockDatanodeManager}; fn test_create_raw_table_info(name: &str) -> RawTableInfo { diff --git a/src/common/meta/src/ddl/tests.rs b/src/common/meta/src/ddl/tests.rs index 3c550883ffc2..9a0db96a37e0 100644 --- a/src/common/meta/src/ddl/tests.rs +++ b/src/common/meta/src/ddl/tests.rs @@ -17,7 +17,7 @@ mod alter_table; mod create_flow; mod create_logical_tables; mod create_table; -mod create_view; +pub(crate) mod create_view; mod drop_database; mod drop_flow; mod drop_table; diff --git a/src/common/meta/src/ddl/tests/create_flow.rs b/src/common/meta/src/ddl/tests/create_flow.rs index e79fe27b848f..a130e0590c47 100644 --- a/src/common/meta/src/ddl/tests/create_flow.rs +++ b/src/common/meta/src/ddl/tests/create_flow.rs @@ -19,6 +19,7 @@ use std::sync::Arc; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_procedure_test::execute_procedure_until_done; use session::context::QueryContext; +use table::table_name::TableName; use crate::ddl::create_flow::CreateFlowProcedure; use crate::ddl::test_util::create_table::test_create_table_task; @@ -27,7 +28,6 @@ use crate::ddl::DdlContext; use crate::key::table_route::TableRouteValue; use crate::key::FlowId; use crate::rpc::ddl::CreateFlowTask; -use crate::table_name::TableName; use crate::test_util::{new_ddl_context, MockFlownodeManager}; use crate::{error, ClusterId}; diff --git a/src/common/meta/src/ddl/tests/create_view.rs b/src/common/meta/src/ddl/tests/create_view.rs index 693faddeb3f3..4dc589dbb883 100644 --- a/src/common/meta/src/ddl/tests/create_view.rs +++ b/src/common/meta/src/ddl/tests/create_view.rs @@ -13,9 +13,10 @@ // limitations under the License. 
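The DropDatabaseCursor change above makes the drop-database procedure treat views specially: a view owns no regions, so instead of walking region routes it only removes the view's metadata keys and invalidates the caches. A condensed sketch of that branch, assuming the paths shown in this diff (destroy_view_info, cache_invalidator) are reachable from outside the crate; it is illustrative rather than the procedure's exact code.

use common_meta::cache_invalidator::Context;
use common_meta::ddl::DdlContext;
use common_meta::error::Result;
use common_meta::instruction::CacheIdent;
use table::metadata::TableId;
use table::table_name::TableName;

async fn drop_view_metadata(
    ddl_ctx: &DdlContext,
    catalog: &str,
    schema: &str,
    table: &str,
    view_id: TableId,
) -> Result<()> {
    let view_name = TableName::new(catalog, schema, table);

    // destroy_view_info (added in the key.rs hunk below) batch-deletes the
    // TableNameKey, TableInfoKey and ViewInfoKey of this view.
    ddl_ctx
        .table_metadata_manager
        .destroy_view_info(view_id, &view_name)
        .await?;

    // Evict both the name -> id mapping and the cached logical plan.
    let ctx = Context {
        subject: Some("Invalidate view cache by dropping view".to_string()),
    };
    ddl_ctx
        .cache_invalidator
        .invalidate(
            &ctx,
            &[
                CacheIdent::TableName(view_name),
                CacheIdent::TableId(view_id),
            ],
        )
        .await?;

    Ok(())
}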
use std::assert_matches::assert_matches; +use std::collections::HashSet; use std::sync::Arc; -use api::v1::CreateViewExpr; +use api::v1::{CreateViewExpr, TableName}; use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_procedure::{Context as ProcedureContext, Procedure, ProcedureId, Status}; @@ -31,7 +32,35 @@ use crate::error::Error; use crate::rpc::ddl::CreateViewTask; use crate::test_util::{new_ddl_context, MockDatanodeManager}; -fn test_create_view_task(name: &str) -> CreateViewTask { +fn test_table_names() -> HashSet { + let mut set = HashSet::new(); + set.insert(table::table_name::TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "a_table".to_string(), + }); + set.insert(table::table_name::TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "b_table".to_string(), + }); + set +} + +pub(crate) fn test_create_view_task(name: &str) -> CreateViewTask { + let table_names = vec![ + TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "a_table".to_string(), + }, + TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "b_table".to_string(), + }, + ]; + let expr = CreateViewExpr { catalog_name: "greptime".to_string(), schema_name: "public".to_string(), @@ -39,6 +68,7 @@ fn test_create_view_task(name: &str) -> CreateViewTask { or_replace: false, create_if_not_exists: false, logical_plan: vec![1, 2, 3], + table_names, }; let view_info = RawTableInfo { @@ -70,7 +100,11 @@ async fn test_on_prepare_view_exists_err() { // Puts a value to table name key. ddl_context .table_metadata_manager - .create_view_metadata(task.view_info.clone(), &task.create_view.logical_plan) + .create_view_metadata( + task.view_info.clone(), + task.create_view.logical_plan.clone(), + test_table_names(), + ) .await .unwrap(); let mut procedure = CreateViewProcedure::new(cluster_id, task, ddl_context); @@ -90,7 +124,11 @@ async fn test_on_prepare_with_create_if_view_exists() { // Puts a value to table name key. 
ddl_context .table_metadata_manager - .create_view_metadata(task.view_info.clone(), &task.create_view.logical_plan) + .create_view_metadata( + task.view_info.clone(), + task.create_view.logical_plan.clone(), + test_table_names(), + ) .await .unwrap(); let mut procedure = CreateViewProcedure::new(cluster_id, task, ddl_context); diff --git a/src/common/meta/src/ddl/tests/drop_flow.rs b/src/common/meta/src/ddl/tests/drop_flow.rs index b8b62b76cc61..97b4632a595a 100644 --- a/src/common/meta/src/ddl/tests/drop_flow.rs +++ b/src/common/meta/src/ddl/tests/drop_flow.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; use common_procedure_test::execute_procedure_until_done; +use table::table_name::TableName; use crate::ddl::drop_flow::DropFlowProcedure; use crate::ddl::test_util::create_table::test_create_table_task; @@ -26,7 +27,6 @@ use crate::ddl::tests::create_flow::create_test_flow; use crate::error; use crate::key::table_route::TableRouteValue; use crate::rpc::ddl::DropFlowTask; -use crate::table_name::TableName; use crate::test_util::{new_ddl_context, MockFlownodeManager}; fn test_drop_flow_task(flow_name: &str, flow_id: u32, drop_if_exists: bool) -> DropFlowTask { diff --git a/src/common/meta/src/ddl/truncate_table.rs b/src/common/meta/src/ddl/truncate_table.rs index ce1341c0add2..edc7321e091c 100644 --- a/src/common/meta/src/ddl/truncate_table.rs +++ b/src/common/meta/src/ddl/truncate_table.rs @@ -28,6 +28,7 @@ use snafu::{ensure, ResultExt}; use store_api::storage::RegionId; use strum::AsRefStr; use table::metadata::{RawTableInfo, TableId}; +use table::table_name::TableName; use table::table_reference::TableReference; use super::utils::handle_retry_error; @@ -40,7 +41,6 @@ use crate::key::DeserializedValueWithBytes; use crate::lock_key::{CatalogLock, SchemaLock, TableLock}; use crate::rpc::ddl::TruncateTableTask; use crate::rpc::router::{find_leader_regions, find_leaders, RegionRoute}; -use crate::table_name::TableName; use crate::{metrics, ClusterId}; pub struct TruncateTableProcedure { diff --git a/src/common/meta/src/instruction.rs b/src/common/meta/src/instruction.rs index 7820985b6571..aee1844b0f80 100644 --- a/src/common/meta/src/instruction.rs +++ b/src/common/meta/src/instruction.rs @@ -20,11 +20,11 @@ use serde::{Deserialize, Serialize}; use store_api::storage::{RegionId, RegionNumber}; use strum::Display; use table::metadata::TableId; +use table::table_name::TableName; use crate::flow_name::FlowName; use crate::key::schema_name::SchemaName; use crate::key::FlowId; -use crate::table_name::TableName; use crate::{ClusterId, DatanodeId, FlownodeId}; #[derive(Eq, Hash, PartialEq, Clone, Debug, Serialize, Deserialize)] diff --git a/src/common/meta/src/key.rs b/src/common/meta/src/key.rs index 9090eb075f3c..22179afbad98 100644 --- a/src/common/meta/src/key.rs +++ b/src/common/meta/src/key.rs @@ -89,9 +89,6 @@ pub mod flow; pub mod schema_name; pub mod table_info; pub mod table_name; -// TODO(weny): removes it. -#[allow(deprecated)] -pub mod table_region; pub mod view_info; // TODO(weny): removes it. 
#[allow(deprecated)] @@ -119,6 +116,7 @@ use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt}; use store_api::storage::RegionNumber; use table::metadata::{RawTableInfo, TableId}; +use table::table_name::TableName; use table_info::{TableInfoKey, TableInfoManager, TableInfoValue}; use table_name::{TableNameKey, TableNameManager, TableNameValue}; use view_info::{ViewInfoKey, ViewInfoManager, ViewInfoValue}; @@ -138,14 +136,12 @@ use crate::kv_backend::txn::{Txn, TxnOp}; use crate::kv_backend::KvBackendRef; use crate::rpc::router::{region_distribution, RegionRoute, RegionStatus}; use crate::rpc::store::BatchDeleteRequest; -use crate::table_name::TableName; use crate::DatanodeId; pub const NAME_PATTERN: &str = r"[a-zA-Z_:-][a-zA-Z0-9_:\-\.]*"; pub const MAINTENANCE_KEY: &str = "maintenance"; const DATANODE_TABLE_KEY_PREFIX: &str = "__dn_table"; -const TABLE_REGION_KEY_PREFIX: &str = "__table_region"; pub const TABLE_INFO_KEY_PREFIX: &str = "__table_info"; pub const VIEW_INFO_KEY_PREFIX: &str = "__view_info"; pub const TABLE_NAME_KEY_PREFIX: &str = "__table_name"; @@ -490,7 +486,8 @@ impl TableMetadataManager { pub async fn create_view_metadata( &self, view_info: RawTableInfo, - raw_logical_plan: &Vec, + raw_logical_plan: Vec, + table_names: HashSet, ) -> Result<()> { let view_id = view_info.ident.table_id; @@ -512,7 +509,7 @@ impl TableMetadataManager { .build_create_txn(view_id, &table_info_value)?; // Creates view info - let view_info_value = ViewInfoValue::new(raw_logical_plan); + let view_info_value = ViewInfoValue::new(raw_logical_plan, table_names); let (create_view_info_txn, on_create_view_info_failure) = self .view_info_manager() .build_create_txn(view_id, &view_info_value)?; @@ -804,6 +801,33 @@ impl TableMetadataManager { Ok(()) } + fn view_info_keys(&self, view_id: TableId, view_name: &TableName) -> Result>> { + let mut keys = Vec::with_capacity(3); + let view_name = TableNameKey::new( + &view_name.catalog_name, + &view_name.schema_name, + &view_name.table_name, + ); + let table_info_key = TableInfoKey::new(view_id); + let view_info_key = ViewInfoKey::new(view_id); + keys.push(view_name.to_bytes()); + keys.push(table_info_key.to_bytes()); + keys.push(view_info_key.to_bytes()); + + Ok(keys) + } + + /// Deletes metadata for view **permanently**. + /// The caller MUST ensure it has the exclusive access to `ViewNameKey`. + pub async fn destroy_view_info(&self, view_id: TableId, view_name: &TableName) -> Result<()> { + let keys = self.view_info_keys(view_id, view_name)?; + let _ = self + .kv_backend + .batch_delete(BatchDeleteRequest::new().with_keys(keys)) + .await?; + Ok(()) + } + /// Renames the table name and returns an error if different metadata exists. /// The caller MUST ensure it has the exclusive access to old and new `TableNameKey`s, /// and the new `TableNameKey` MUST be empty. @@ -903,8 +927,9 @@ impl TableMetadataManager { view_id: TableId, current_view_info_value: &DeserializedValueWithBytes, new_view_info: Vec, + table_names: HashSet, ) -> Result<()> { - let new_view_info_value = current_view_info_value.update(new_view_info); + let new_view_info_value = current_view_info_value.update(new_view_info, table_names); // Updates view info. let (update_view_info_txn, on_update_view_info_failure) = self @@ -1174,7 +1199,7 @@ impl_optional_meta_value! 
{ #[cfg(test)] mod tests { - use std::collections::{BTreeMap, HashMap}; + use std::collections::{BTreeMap, HashMap, HashSet}; use std::sync::Arc; use bytes::Bytes; @@ -1183,6 +1208,7 @@ mod tests { use futures::TryStreamExt; use store_api::storage::RegionId; use table::metadata::{RawTableInfo, TableInfo}; + use table::table_name::TableName; use super::datanode_table::DatanodeTableKey; use super::test_utils; @@ -1197,7 +1223,6 @@ mod tests { use crate::kv_backend::memory::MemoryKvBackend; use crate::peer::Peer; use crate::rpc::router::{region_distribution, Region, RegionRoute, RegionStatus}; - use crate::table_name::TableName; #[test] fn test_deserialized_value_with_bytes() { @@ -1250,6 +1275,21 @@ mod tests { test_utils::new_test_table_info(10, region_numbers) } + fn new_test_table_names() -> HashSet { + let mut set = HashSet::new(); + set.insert(TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "a_table".to_string(), + }); + set.insert(TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "b_table".to_string(), + }); + set + } + async fn create_physical_table_metadata( table_metadata_manager: &TableMetadataManager, table_info: RawTableInfo, @@ -1961,9 +2001,11 @@ mod tests { let logical_plan: Vec = vec![1, 2, 3]; + let table_names = new_test_table_names(); + // Create metadata table_metadata_manager - .create_view_metadata(view_info.clone(), &logical_plan) + .create_view_metadata(view_info.clone(), logical_plan.clone(), table_names.clone()) .await .unwrap(); @@ -1977,6 +2019,7 @@ mod tests { .unwrap() .into_inner(); assert_eq!(current_view_info.view_info, logical_plan); + assert_eq!(current_view_info.table_names, table_names); // assert table info let current_table_info = table_metadata_manager .table_info_manager() @@ -1989,16 +2032,43 @@ mod tests { } let new_logical_plan: Vec = vec![4, 5, 6]; - let current_view_info_value = - DeserializedValueWithBytes::from_inner(ViewInfoValue::new(&logical_plan)); + let new_table_names = { + let mut set = HashSet::new(); + set.insert(TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "b_table".to_string(), + }); + set.insert(TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "c_table".to_string(), + }); + set + }; + + let current_view_info_value = DeserializedValueWithBytes::from_inner(ViewInfoValue::new( + logical_plan.clone(), + table_names, + )); // should be ok. table_metadata_manager - .update_view_info(view_id, ¤t_view_info_value, new_logical_plan.clone()) + .update_view_info( + view_id, + ¤t_view_info_value, + new_logical_plan.clone(), + new_table_names.clone(), + ) .await .unwrap(); // if table info was updated, it should be ok. 
table_metadata_manager - .update_view_info(view_id, ¤t_view_info_value, new_logical_plan.clone()) + .update_view_info( + view_id, + ¤t_view_info_value, + new_logical_plan.clone(), + new_table_names.clone(), + ) .await .unwrap(); @@ -2011,14 +2081,21 @@ mod tests { .unwrap() .into_inner(); assert_eq!(updated_view_info.view_info, new_logical_plan); + assert_eq!(updated_view_info.table_names, new_table_names); let wrong_view_info = logical_plan.clone(); - let wrong_view_info_value = - DeserializedValueWithBytes::from_inner(current_view_info_value.update(wrong_view_info)); + let wrong_view_info_value = DeserializedValueWithBytes::from_inner( + current_view_info_value.update(wrong_view_info, new_table_names.clone()), + ); // if the current_view_info_value is wrong, it should return an error. // The ABA problem. assert!(table_metadata_manager - .update_view_info(view_id, &wrong_view_info_value, new_logical_plan.clone()) + .update_view_info( + view_id, + &wrong_view_info_value, + new_logical_plan.clone(), + new_table_names.clone(), + ) .await .is_err()); @@ -2031,5 +2108,6 @@ mod tests { .unwrap() .into_inner(); assert_eq!(current_view_info.view_info, new_logical_plan); + assert_eq!(current_view_info.table_names, new_table_names); } } diff --git a/src/common/meta/src/key/datanode_table.rs b/src/common/meta/src/key/datanode_table.rs index c20243bfd7d4..57ffa6a00c29 100644 --- a/src/common/meta/src/key/datanode_table.rs +++ b/src/common/meta/src/key/datanode_table.rs @@ -72,12 +72,8 @@ impl DatanodeTableKey { } } - fn prefix(datanode_id: DatanodeId) -> String { - format!("{}/{datanode_id}", DATANODE_TABLE_KEY_PREFIX) - } - - pub fn range_start_key(datanode_id: DatanodeId) -> String { - format!("{}/", Self::prefix(datanode_id)) + pub fn prefix(datanode_id: DatanodeId) -> String { + format!("{}/{datanode_id}/", DATANODE_TABLE_KEY_PREFIX) } } @@ -114,7 +110,7 @@ impl<'a> MetaKey<'a, DatanodeTableKey> for DatanodeTableKey { impl Display for DatanodeTableKey { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}/{}", Self::prefix(self.datanode_id), self.table_id) + write!(f, "{}{}", Self::prefix(self.datanode_id), self.table_id) } } @@ -164,7 +160,7 @@ impl DatanodeTableManager { &self, datanode_id: DatanodeId, ) -> BoxStream<'static, Result> { - let start_key = DatanodeTableKey::range_start_key(datanode_id); + let start_key = DatanodeTableKey::prefix(datanode_id); let req = RangeRequest::new().with_prefix(start_key.as_bytes()); let stream = PaginationStream::new( diff --git a/src/common/meta/src/key/flow.rs b/src/common/meta/src/key/flow.rs index 1f8db5585433..b2ce5d1cb24b 100644 --- a/src/common/meta/src/key/flow.rs +++ b/src/common/meta/src/key/flow.rs @@ -262,12 +262,12 @@ mod tests { use futures::TryStreamExt; use table::metadata::TableId; + use table::table_name::TableName; use super::*; use crate::key::flow::table_flow::TableFlowKey; use crate::key::FlowPartitionId; use crate::kv_backend::memory::MemoryKvBackend; - use crate::table_name::TableName; use crate::FlownodeId; #[derive(Debug)] diff --git a/src/common/meta/src/key/flow/flow_info.rs b/src/common/meta/src/key/flow/flow_info.rs index f08e7c5def56..c1ce1a1c994f 100644 --- a/src/common/meta/src/key/flow/flow_info.rs +++ b/src/common/meta/src/key/flow/flow_info.rs @@ -20,6 +20,7 @@ use regex::Regex; use serde::{Deserialize, Serialize}; use snafu::OptionExt; use table::metadata::TableId; +use table::table_name::TableName; use crate::error::{self, Result}; use crate::key::flow::FlowScoped; @@ -27,7 +28,6 @@ use 
crate::key::txn_helper::TxnOpGetResponseSet; use crate::key::{DeserializedValueWithBytes, FlowId, FlowPartitionId, MetaKey, TableMetaValue}; use crate::kv_backend::txn::Txn; use crate::kv_backend::KvBackendRef; -use crate::table_name::TableName; use crate::FlownodeId; const FLOW_INFO_KEY_PREFIX: &str = "info"; diff --git a/src/common/meta/src/key/flow/flownode_flow.rs b/src/common/meta/src/key/flow/flownode_flow.rs index 8bc33c3ef965..d891fbbb05fa 100644 --- a/src/common/meta/src/key/flow/flownode_flow.rs +++ b/src/common/meta/src/key/flow/flownode_flow.rs @@ -69,8 +69,7 @@ impl FlownodeFlowKey { /// The prefix used to retrieve all [FlownodeFlowKey]s with the specified `flownode_id`. pub fn range_start_key(flownode_id: FlownodeId) -> Vec { - let inner = - BytesAdapter::from(FlownodeFlowKeyInner::range_start_key(flownode_id).into_bytes()); + let inner = BytesAdapter::from(FlownodeFlowKeyInner::prefix(flownode_id).into_bytes()); FlowScoped::new(inner).to_bytes() } @@ -108,13 +107,8 @@ impl FlownodeFlowKeyInner { } } - fn prefix(flownode_id: FlownodeId) -> String { - format!("{}/{flownode_id}", FLOWNODE_FLOW_KEY_PREFIX) - } - - /// The prefix used to retrieve all [FlownodeFlowKey]s with the specified `flownode_id`. - fn range_start_key(flownode_id: FlownodeId) -> String { - format!("{}/", Self::prefix(flownode_id)) + pub fn prefix(flownode_id: FlownodeId) -> String { + format!("{}/{flownode_id}/", FLOWNODE_FLOW_KEY_PREFIX) } } diff --git a/src/common/meta/src/key/flow/table_flow.rs b/src/common/meta/src/key/flow/table_flow.rs index d9aa9cff0b56..63dff27bed86 100644 --- a/src/common/meta/src/key/flow/table_flow.rs +++ b/src/common/meta/src/key/flow/table_flow.rs @@ -80,7 +80,7 @@ impl TableFlowKey { /// The prefix used to retrieve all [TableFlowKey]s with the specified `table_id`. pub fn range_start_key(table_id: TableId) -> Vec { - let inner = BytesAdapter::from(TableFlowKeyInner::range_start_key(table_id).into_bytes()); + let inner = BytesAdapter::from(TableFlowKeyInner::prefix(table_id).into_bytes()); FlowScoped::new(inner).to_bytes() } @@ -123,12 +123,7 @@ impl TableFlowKeyInner { } fn prefix(table_id: TableId) -> String { - format!("{}/{table_id}", TABLE_FLOW_KEY_PREFIX) - } - - /// The prefix used to retrieve all [TableFlowKey]s with the specified `table_id`. - fn range_start_key(table_id: TableId) -> String { - format!("{}/", Self::prefix(table_id)) + format!("{}/{table_id}/", TABLE_FLOW_KEY_PREFIX) } } diff --git a/src/common/meta/src/key/table_info.rs b/src/common/meta/src/key/table_info.rs index b50d7bb6b037..a652b7caf0fd 100644 --- a/src/common/meta/src/key/table_info.rs +++ b/src/common/meta/src/key/table_info.rs @@ -19,6 +19,7 @@ use std::sync::Arc; use serde::{Deserialize, Serialize}; use snafu::OptionExt; use table::metadata::{RawTableInfo, TableId}; +use table::table_name::TableName; use table::table_reference::TableReference; use super::TABLE_INFO_KEY_PATTERN; @@ -28,7 +29,6 @@ use crate::key::{DeserializedValueWithBytes, MetaKey, TableMetaValue, TABLE_INFO use crate::kv_backend::txn::Txn; use crate::kv_backend::KvBackendRef; use crate::rpc::store::BatchGetRequest; -use crate::table_name::TableName; /// The key stores the metadata of the table. 
/// diff --git a/src/common/meta/src/key/table_name.rs b/src/common/meta/src/key/table_name.rs index 6c6c51c37546..8a44de7cc695 100644 --- a/src/common/meta/src/key/table_name.rs +++ b/src/common/meta/src/key/table_name.rs @@ -20,6 +20,7 @@ use futures_util::stream::BoxStream; use serde::{Deserialize, Serialize}; use snafu::OptionExt; use table::metadata::TableId; +use table::table_name::TableName; use super::{MetaKey, TableMetaValue, TABLE_NAME_KEY_PATTERN, TABLE_NAME_KEY_PREFIX}; use crate::error::{Error, InvalidTableMetadataSnafu, Result}; @@ -29,7 +30,6 @@ use crate::kv_backend::KvBackendRef; use crate::range_stream::{PaginationStream, DEFAULT_PAGE_SIZE}; use crate::rpc::store::{BatchGetRequest, RangeRequest}; use crate::rpc::KeyValue; -use crate::table_name::TableName; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)] pub struct TableNameKey<'a> { @@ -48,7 +48,7 @@ impl<'a> TableNameKey<'a> { } pub fn prefix_to_table(catalog: &str, schema: &str) -> String { - format!("{}/{}/{}", TABLE_NAME_KEY_PREFIX, catalog, schema) + format!("{}/{}/{}/", TABLE_NAME_KEY_PREFIX, catalog, schema) } } @@ -56,7 +56,7 @@ impl Display for TableNameKey<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "{}/{}", + "{}{}", Self::prefix_to_table(self.catalog, self.schema), self.table ) @@ -268,7 +268,11 @@ impl TableNameManager { #[cfg(test)] mod tests { + use futures::StreamExt; + use super::*; + use crate::kv_backend::KvBackend; + use crate::rpc::store::PutRequest; #[test] fn test_strip_table_name() { @@ -324,4 +328,39 @@ mod tests { assert_eq!(value.try_as_raw_value().unwrap(), literal); assert_eq!(TableNameValue::try_from_raw_value(literal).unwrap(), value); } + + #[tokio::test] + async fn test_prefix_scan_tables() { + let memory_kv = Arc::new(MemoryKvBackend::::new()); + memory_kv + .put(PutRequest { + key: TableNameKey { + catalog: "greptime", + schema: "👉", + table: "t", + } + .to_bytes(), + value: vec![], + prev_kv: false, + }) + .await + .unwrap(); + memory_kv + .put(PutRequest { + key: TableNameKey { + catalog: "greptime", + schema: "👉👈", + table: "t", + } + .to_bytes(), + value: vec![], + prev_kv: false, + }) + .await + .unwrap(); + + let manager = TableNameManager::new(memory_kv); + let items = manager.tables("greptime", "👉").collect::>().await; + assert_eq!(items.len(), 1); + } } diff --git a/src/common/meta/src/key/table_region.rs b/src/common/meta/src/key/table_region.rs deleted file mode 100644 index 4ccc99ba513d..000000000000 --- a/src/common/meta/src/key/table_region.rs +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
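Every prefix helper touched above (DatanodeTableKey, FlownodeFlowKey, TableFlowKey, TableNameKey) now ends with the / separator, so a range scan can no longer bleed into a sibling that merely shares a textual prefix. The new test_prefix_scan_tables test shows this with the 👉 vs 👉👈 schemas; the same collision exists for numeric ids, as in this standalone sketch (the literal key strings are illustrative):

fn main() {
    // Old form: "__dn_table/{datanode_id}"; new form: "__dn_table/{datanode_id}/".
    let old_prefix = |datanode_id: u64| format!("__dn_table/{datanode_id}");
    let new_prefix = |datanode_id: u64| format!("__dn_table/{datanode_id}/");

    // A key belonging to table 42 on datanode 11.
    let key_of_dn11 = "__dn_table/11/42";

    // The old prefix for datanode 1 wrongly matches datanode 11's key...
    assert!(key_of_dn11.starts_with(&old_prefix(1)));
    // ...while the slash-terminated prefix matches only its own datanode.
    assert!(!key_of_dn11.starts_with(&new_prefix(1)));
    assert!(key_of_dn11.starts_with(&new_prefix(11)));
}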
- -use std::collections::BTreeMap; -use std::fmt::Display; - -use lazy_static::lazy_static; -use regex::Regex; -use serde::{Deserialize, Serialize}; -use snafu::{OptionExt, ResultExt}; -use store_api::storage::RegionNumber; -use table::metadata::TableId; - -use super::{MetaKey, TABLE_REGION_KEY_PREFIX}; -use crate::error::{InvalidTableMetadataSnafu, Result, SerdeJsonSnafu}; -use crate::{impl_table_meta_value, DatanodeId}; - -pub type RegionDistribution = BTreeMap>; - -#[deprecated( - since = "0.4.0", - note = "Please use the TableRouteManager's get_region_distribution method instead" -)] -#[derive(Debug, PartialEq)] -pub struct TableRegionKey { - table_id: TableId, -} - -lazy_static! { - static ref TABLE_REGION_KEY_PATTERN: Regex = - Regex::new(&format!("^{TABLE_REGION_KEY_PREFIX}/([0-9]+)$")).unwrap(); -} - -impl TableRegionKey { - pub fn new(table_id: TableId) -> Self { - Self { table_id } - } -} - -impl Display for TableRegionKey { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}/{}", TABLE_REGION_KEY_PREFIX, self.table_id) - } -} - -impl<'a> MetaKey<'a, TableRegionKey> for TableRegionKey { - fn to_bytes(&self) -> Vec { - self.to_string().into_bytes() - } - - fn from_bytes(bytes: &'a [u8]) -> Result { - let key = std::str::from_utf8(bytes).map_err(|e| { - InvalidTableMetadataSnafu { - err_msg: format!( - "TableRegionKey '{}' is not a valid UTF8 string: {e}", - String::from_utf8_lossy(bytes) - ), - } - .build() - })?; - let captures = - TABLE_REGION_KEY_PATTERN - .captures(key) - .context(InvalidTableMetadataSnafu { - err_msg: format!("Invalid TableRegionKey '{key}'"), - })?; - // Safety: pass the regex check above - let table_id = captures[1].parse::().unwrap(); - Ok(TableRegionKey { table_id }) - } -} - -#[deprecated( - since = "0.4.0", - note = "Please use the TableRouteManager's get_region_distribution method instead" -)] -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct TableRegionValue { - pub region_distribution: RegionDistribution, - version: u64, -} - -impl TableRegionValue { - pub fn new(region_distribution: RegionDistribution) -> Self { - Self { - region_distribution, - version: 0, - } - } -} - -impl_table_meta_value! {TableRegionValue} - -#[cfg(test)] -mod tests { - use super::*; - use crate::key::TableMetaValue; - - #[test] - fn test_serialization() { - let key = TableRegionKey::new(24); - let raw_key = key.to_bytes(); - assert_eq!(raw_key, b"__table_region/24"); - let deserialized = TableRegionKey::from_bytes(b"__table_region/24").unwrap(); - assert_eq!(key, deserialized); - - let value = TableRegionValue { - region_distribution: RegionDistribution::from([(1, vec![1, 2, 3]), (2, vec![4, 5, 6])]), - version: 0, - }; - let literal = br#"{"region_distribution":{"1":[1,2,3],"2":[4,5,6]},"version":0}"#; - - assert_eq!(value.try_as_raw_value().unwrap(), literal); - assert_eq!( - TableRegionValue::try_from_raw_value(literal).unwrap(), - value, - ); - } -} diff --git a/src/common/meta/src/key/view_info.rs b/src/common/meta/src/key/view_info.rs index 98c8a1a73178..762acf9aa3d3 100644 --- a/src/common/meta/src/key/view_info.rs +++ b/src/common/meta/src/key/view_info.rs @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
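With table_region.rs deleted above, the deprecated "__table_region" keys that persisted a materialized datanode-to-regions map are gone for good; the deprecation note already pointed at the table route metadata as the source of truth. The same mapping can be derived on demand from a table's region routes with the helper whose signature appears in the rpc/router.rs hunk below. A small sketch, assuming the routes were already loaded through the TableRouteManager:

use common_meta::key::RegionDistribution;
use common_meta::rpc::router::{region_distribution, RegionRoute};

fn distribution_of(region_routes: &[RegionRoute]) -> RegionDistribution {
    // Builds the datanode id -> region numbers map from the routes; this is the
    // same shape the removed TableRegionValue used to store.
    region_distribution(region_routes)
}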
-use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fmt::Display; +use std::sync::Arc; use serde::{Deserialize, Serialize}; use snafu::OptionExt; use table::metadata::TableId; +use table::table_name::TableName; use super::VIEW_INFO_KEY_PATTERN; use crate::error::{InvalidViewInfoSnafu, Result}; @@ -80,21 +82,30 @@ impl<'a> MetaKey<'a, ViewInfoKey> for ViewInfoKey { /// The VIEW info value that keeps the metadata. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct ViewInfoValue { + /// The encoded logical plan pub view_info: RawViewLogicalPlan, + /// The resolved fully table names in logical plan + pub table_names: HashSet, version: u64, } impl ViewInfoValue { - pub fn new(view_info: &RawViewLogicalPlan) -> Self { + pub fn new(view_info: RawViewLogicalPlan, table_names: HashSet) -> Self { Self { - view_info: view_info.clone(), + view_info, + table_names, version: 0, } } - pub(crate) fn update(&self, new_view_info: RawViewLogicalPlan) -> Self { + pub(crate) fn update( + &self, + new_view_info: RawViewLogicalPlan, + table_names: HashSet, + ) -> Self { Self { view_info: new_view_info, + table_names, version: self.version + 1, } } @@ -105,6 +116,8 @@ pub struct ViewInfoManager { kv_backend: KvBackendRef, } +pub type ViewInfoManagerRef = Arc; + impl ViewInfoManager { pub fn new(kv_backend: KvBackendRef) -> Self { Self { kv_backend } @@ -254,9 +267,25 @@ mod tests { #[test] fn test_value_serialization() { + let table_names = { + let mut set = HashSet::new(); + set.insert(TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "a_table".to_string(), + }); + set.insert(TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "b_table".to_string(), + }); + set + }; + let value = ViewInfoValue { view_info: vec![1, 2, 3], version: 1, + table_names, }; let serialized = value.try_as_raw_value().unwrap(); let deserialized = ViewInfoValue::try_from_raw_value(&serialized).unwrap(); diff --git a/src/common/meta/src/lib.rs b/src/common/meta/src/lib.rs index 5398a62a6752..78d111c479a9 100644 --- a/src/common/meta/src/lib.rs +++ b/src/common/meta/src/lib.rs @@ -40,7 +40,6 @@ pub mod region_keeper; pub mod rpc; pub mod sequence; pub mod state_store; -pub mod table_name; #[cfg(any(test, feature = "testing"))] pub mod test_util; pub mod util; diff --git a/src/common/meta/src/rpc/ddl.rs b/src/common/meta/src/rpc/ddl.rs index 8e977f1ca5bc..e6140cdaeff7 100644 --- a/src/common/meta/src/rpc/ddl.rs +++ b/src/common/meta/src/rpc/ddl.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
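ViewInfoValue now records the fully resolved table names referenced by the view next to the encoded logical plan, and update() bumps the private version so concurrent replaces stay compare-and-swap safe. A sketch of the external call path for replacing a view's plan, assuming the view_info_manager() accessor used throughout key.rs is public; error handling is simplified.

use std::collections::HashSet;

use common_meta::error::Result;
use common_meta::key::TableMetadataManager;
use table::metadata::TableId;
use table::table_name::TableName;

async fn replace_view_plan(
    manager: &TableMetadataManager,
    view_id: TableId,
    new_plan: Vec<u8>,
    new_table_names: HashSet<TableName>,
) -> Result<()> {
    // The current value carries its original bytes; update_view_info() builds a
    // compare-and-put transaction against them, so a stale reader gets an error
    // instead of silently clobbering (the "ABA problem" the key.rs tests exercise).
    let current = manager
        .view_info_manager()
        .get(view_id)
        .await?
        .expect("view metadata must exist before it can be replaced");

    manager
        .update_view_info(view_id, &current, new_plan, new_table_names)
        .await
}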
-use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::result; use api::v1::meta::ddl_task_request::Task; @@ -39,11 +39,11 @@ use serde_with::{serde_as, DefaultOnNull}; use session::context::QueryContextRef; use snafu::{OptionExt, ResultExt}; use table::metadata::{RawTableInfo, TableId}; +use table::table_name::TableName; use table::table_reference::TableReference; use crate::error::{self, Result}; use crate::key::FlowId; -use crate::table_name::TableName; /// DDL tasks #[derive(Debug, Clone)] @@ -332,6 +332,14 @@ impl CreateViewTask { pub fn raw_logical_plan(&self) -> &Vec { &self.create_view.logical_plan } + + pub fn table_names(&self) -> HashSet { + self.create_view + .table_names + .iter() + .map(|t| t.clone().into()) + .collect() + } } impl TryFrom for CreateViewTask { diff --git a/src/common/meta/src/rpc/router.rs b/src/common/meta/src/rpc/router.rs index 31be66f64954..3e609e4af4d8 100644 --- a/src/common/meta/src/rpc/router.rs +++ b/src/common/meta/src/rpc/router.rs @@ -25,11 +25,11 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use snafu::OptionExt; use store_api::storage::{RegionId, RegionNumber}; use strum::AsRefStr; +use table::table_name::TableName; use crate::error::{self, Result}; use crate::key::RegionDistribution; use crate::peer::Peer; -use crate::table_name::TableName; use crate::DatanodeId; pub fn region_distribution(region_routes: &[RegionRoute]) -> RegionDistribution { diff --git a/src/common/query/Cargo.toml b/src/common/query/Cargo.toml index 443640016488..d7a0361965bd 100644 --- a/src/common/query/Cargo.toml +++ b/src/common/query/Cargo.toml @@ -4,12 +4,16 @@ version.workspace = true edition.workspace = true license.workspace = true +[features] +testing = [] + [lints] workspace = true [dependencies] api.workspace = true async-trait.workspace = true +bytes.workspace = true common-error.workspace = true common-macro.workspace = true common-recordbatch.workspace = true diff --git a/src/common/query/src/error.rs b/src/common/query/src/error.rs index 6756c58a8449..d544e6166cdf 100644 --- a/src/common/query/src/error.rs +++ b/src/common/query/src/error.rs @@ -206,6 +206,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Failed to decode logical plan: {source}"))] + DecodePlan { + #[snafu(implicit)] + location: Location, + source: BoxedError, + }, + #[snafu(display("Failed to do table mutation"))] TableMutation { source: BoxedError, @@ -282,11 +289,12 @@ impl ErrorExt for Error { | Error::InvalidFuncArgs { .. } => StatusCode::InvalidArguments, Error::ConvertDfRecordBatchStream { source, .. } => source.status_code(), - Error::ExecutePhysicalPlan { source, .. } => source.status_code(), - Error::Execute { source, .. } => source.status_code(), - Error::ProcedureService { source, .. } | Error::TableMutation { source, .. } => { - source.status_code() - } + + Error::DecodePlan { source, .. } + | Error::Execute { source, .. } + | Error::ExecutePhysicalPlan { source, .. } + | Error::ProcedureService { source, .. } + | Error::TableMutation { source, .. } => source.status_code(), Error::PermissionDenied { .. 
} => StatusCode::PermissionDenied, } diff --git a/src/common/query/src/lib.rs b/src/common/query/src/lib.rs index 49aff8d9a4dd..68c7c2568cbc 100644 --- a/src/common/query/src/lib.rs +++ b/src/common/query/src/lib.rs @@ -18,7 +18,8 @@ mod function; pub mod logical_plan; pub mod prelude; mod signature; - +#[cfg(any(test, feature = "testing"))] +pub mod test_util; use std::fmt::{Debug, Display, Formatter}; use std::sync::Arc; diff --git a/src/common/query/src/logical_plan.rs b/src/common/query/src/logical_plan.rs index 6705a63e4251..3598001d67f7 100644 --- a/src/common/query/src/logical_plan.rs +++ b/src/common/query/src/logical_plan.rs @@ -19,12 +19,15 @@ mod udf; use std::sync::Arc; +use datafusion::catalog::CatalogProviderList; +use datafusion::logical_expr::LogicalPlan; use datatypes::prelude::ConcreteDataType; pub use expr::build_filter_from_timestamp; pub use self::accumulator::{Accumulator, AggregateFunctionCreator, AggregateFunctionCreatorRef}; pub use self::udaf::AggregateFunction; pub use self::udf::ScalarUdf; +use crate::error::Result; use crate::function::{ReturnTypeFunction, ScalarFunctionImplementation}; use crate::logical_plan::accumulator::*; use crate::signature::{Signature, Volatility}; @@ -68,6 +71,25 @@ pub fn create_aggregate_function( ) } +/// The datafusion `[LogicalPlan]` decoder. +#[async_trait::async_trait] +pub trait SubstraitPlanDecoder { + /// Decode the [`LogicalPlan`] from bytes with the [`CatalogProviderList`]. + /// When `optimize` is true, it will do the optimization for decoded plan. + /// + /// TODO(dennis): It's not a good design for an API to do many things. + /// The `optimize` was introduced because of `query` and `catalog` cyclic dependency issue + /// I am happy to refactor it if we have a better solution. + async fn decode( + &self, + message: bytes::Bytes, + catalog_list: Arc, + optimize: bool, + ) -> Result; +} + +pub type SubstraitPlanDecoderRef = Arc; + #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/src/mito2/src/wal/wal_entry_reader.rs b/src/common/query/src/test_util.rs similarity index 50% rename from src/mito2/src/wal/wal_entry_reader.rs rename to src/common/query/src/test_util.rs index 8c3e16122254..141c284a7baf 100644 --- a/src/mito2/src/wal/wal_entry_reader.rs +++ b/src/common/query/src/test_util.rs @@ -12,13 +12,31 @@ // See the License for the specific language governing permissions and // limitations under the License. -use store_api::storage::RegionId; +use std::sync::Arc; + +use datafusion::catalog::CatalogProviderList; +use datafusion::logical_expr::LogicalPlan; use crate::error::Result; -use crate::wal::raw_entry_reader::LogStoreNamespace; -use crate::wal::{EntryId, WalEntryStream}; +use crate::logical_plan::SubstraitPlanDecoder; + +/// Dummy `[SubstraitPlanDecoder]` for test. +pub struct DummyDecoder; + +impl DummyDecoder { + pub fn arc() -> Arc { + Arc::new(DummyDecoder) + } +} -/// [OneshotWalEntryReader] provides the ability to read and decode entries from the underlying store. 
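The `SubstraitPlanDecoder` trait introduced above is the seam that lets the datanode decode substrait bytes without the `query` and `catalog` crates depending on each other. A rough, self-contained sketch of how a decoder reference might be defined and used; string stand-ins replace the real DataFusion `LogicalPlan` and `CatalogProviderList`, and the error type is simplified:

```rust
use std::sync::Arc;

// Stand-ins: the real trait returns a DataFusion `LogicalPlan` and takes an
// `Arc<dyn CatalogProviderList>`; strings keep this sketch self-contained.
type FakeLogicalPlan = String;
type FakeCatalogList = Vec<String>;

#[async_trait::async_trait]
trait SubstraitPlanDecoder: Send + Sync {
    async fn decode(
        &self,
        message: bytes::Bytes,
        catalog_list: Arc<FakeCatalogList>,
        optimize: bool,
    ) -> Result<FakeLogicalPlan, String>;
}

type SubstraitPlanDecoderRef = Arc<dyn SubstraitPlanDecoder>;

struct EchoDecoder;

#[async_trait::async_trait]
impl SubstraitPlanDecoder for EchoDecoder {
    async fn decode(
        &self,
        message: bytes::Bytes,
        _catalog_list: Arc<FakeCatalogList>,
        optimize: bool,
    ) -> Result<FakeLogicalPlan, String> {
        // A real decoder would run the substrait consumer here and, when
        // `optimize` is true, the optimizer passes as well.
        Ok(format!("plan from {} bytes, optimize={optimize}", message.len()))
    }
}

#[tokio::main]
async fn main() {
    let decoder: SubstraitPlanDecoderRef = Arc::new(EchoDecoder);
    let plan = decoder
        .decode(bytes::Bytes::from_static(&[0x08, 0x01]), Arc::new(vec![]), false)
        .await
        .unwrap();
    println!("{plan}");
}
```

The `DummyDecoder` in `test_util.rs` below plays the same role for tests, except that its `decode` is never expected to be called.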
-pub(crate) trait OneshotWalEntryReader: Send + Sync { - fn read(self, ctx: LogStoreNamespace, start_id: EntryId) -> Result; +#[async_trait::async_trait] +impl SubstraitPlanDecoder for DummyDecoder { + async fn decode( + &self, + _message: bytes::Bytes, + _catalog_list: Arc, + _optimize: bool, + ) -> Result { + unreachable!() + } } diff --git a/src/common/substrait/Cargo.toml b/src/common/substrait/Cargo.toml index a2fb0e272594..3da8b6310017 100644 --- a/src/common/substrait/Cargo.toml +++ b/src/common/substrait/Cargo.toml @@ -10,19 +10,15 @@ workspace = true [dependencies] async-trait.workspace = true bytes.workspace = true -catalog.workspace = true common-error.workspace = true -common-function.workspace = true common-macro.workspace = true common-telemetry.workspace = true datafusion.workspace = true datafusion-common.workspace = true datafusion-expr.workspace = true datafusion-substrait.workspace = true -datatypes.workspace = true promql.workspace = true prost.workspace = true -session.workspace = true snafu.workspace = true [dependencies.substrait_proto] diff --git a/src/common/substrait/src/df_substrait.rs b/src/common/substrait/src/df_substrait.rs index 0730f0773b32..9217b60cc5b6 100644 --- a/src/common/substrait/src/df_substrait.rs +++ b/src/common/substrait/src/df_substrait.rs @@ -16,26 +16,19 @@ use std::sync::Arc; use async_trait::async_trait; use bytes::{Buf, Bytes, BytesMut}; -use common_function::function_registry::FUNCTION_REGISTRY; -use common_function::scalars::udf::create_udf; use datafusion::catalog::CatalogProviderList; use datafusion::execution::context::SessionState; use datafusion::execution::runtime_env::RuntimeEnv; -use datafusion::execution::FunctionRegistry; use datafusion::prelude::{SessionConfig, SessionContext}; use datafusion_expr::LogicalPlan; use datafusion_substrait::logical_plan::consumer::from_substrait_plan; use datafusion_substrait::logical_plan::producer::to_substrait_plan; use datafusion_substrait::substrait::proto::Plan; use prost::Message; -use session::context::QueryContextRef; use snafu::ResultExt; -use crate::error::{ - DFInternalSnafu, DecodeDfPlanSnafu, DecodeRelSnafu, EncodeDfPlanSnafu, EncodeRelSnafu, Error, -}; -use crate::extension_serializer::ExtensionSerializer; -use crate::SubstraitPlan; +use crate::error::{DecodeDfPlanSnafu, DecodeRelSnafu, EncodeDfPlanSnafu, EncodeRelSnafu, Error}; +use crate::{SerializerRegistry, SubstraitPlan}; pub struct DFLogicalSubstraitConvertor; @@ -49,15 +42,8 @@ impl SubstraitPlan for DFLogicalSubstraitConvertor { &self, message: B, catalog_list: Arc, - mut state: SessionState, - query_ctx: QueryContextRef, + state: SessionState, ) -> Result { - // substrait decoder will look up the UDFs in SessionState, so we need to register them - for func in FUNCTION_REGISTRY.functions() { - let udf = Arc::new(create_udf(func, query_ctx.clone(), Default::default()).into()); - state.register_udf(udf).context(DFInternalSnafu)?; - } - let mut context = SessionContext::new_with_state(state); context.register_catalog_list(catalog_list); let plan = Plan::decode(message).context(DecodeRelSnafu)?; @@ -67,10 +53,13 @@ impl SubstraitPlan for DFLogicalSubstraitConvertor { Ok(df_plan) } - fn encode(&self, plan: &Self::Plan) -> Result { + fn encode( + &self, + plan: &Self::Plan, + serializer: impl SerializerRegistry + 'static, + ) -> Result { let mut buf = BytesMut::new(); - - let substrait_plan = self.to_sub_plan(plan)?; + let substrait_plan = self.to_sub_plan(plan, serializer)?; substrait_plan.encode(&mut 
buf).context(EncodeRelSnafu)?; Ok(buf.freeze()) @@ -78,10 +67,14 @@ impl SubstraitPlan for DFLogicalSubstraitConvertor { } impl DFLogicalSubstraitConvertor { - pub fn to_sub_plan(&self, plan: &LogicalPlan) -> Result, Error> { + pub fn to_sub_plan( + &self, + plan: &LogicalPlan, + serializer: impl SerializerRegistry + 'static, + ) -> Result, Error> { let session_state = SessionState::new_with_config_rt(SessionConfig::new(), Arc::new(RuntimeEnv::default())) - .with_serializer_registry(Arc::new(ExtensionSerializer)); + .with_serializer_registry(Arc::new(serializer)); let context = SessionContext::new_with_state(session_state); to_substrait_plan(plan, &context).context(EncodeDfPlanSnafu) diff --git a/src/common/substrait/src/error.rs b/src/common/substrait/src/error.rs index 07cc310d3934..5a39a1a6d1d0 100644 --- a/src/common/substrait/src/error.rs +++ b/src/common/substrait/src/error.rs @@ -18,7 +18,6 @@ use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; use datafusion::error::DataFusionError; -use datatypes::prelude::ConcreteDataType; use prost::{DecodeError, EncodeError}; use snafu::{Location, Snafu}; @@ -26,34 +25,6 @@ use snafu::{Location, Snafu}; #[snafu(visibility(pub))] #[stack_trace_debug] pub enum Error { - #[snafu(display("Unsupported physical plan: {}", name))] - UnsupportedPlan { - name: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Unsupported expr: {}", name))] - UnsupportedExpr { - name: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Unsupported concrete type: {:?}", ty))] - UnsupportedConcreteType { - ty: ConcreteDataType, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Unsupported substrait type: {}", ty))] - UnsupportedSubstraitType { - ty: String, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Failed to decode substrait relation"))] DecodeRel { #[snafu(source)] @@ -70,33 +41,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Input plan is empty"))] - EmptyPlan { - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Input expression is empty"))] - EmptyExpr { - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Missing required field in protobuf, field: {}, plan: {}", field, plan))] - MissingField { - field: String, - plan: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Invalid parameters: {}", reason))] - InvalidParameters { - reason: String, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Internal error from DataFusion"))] DFInternal { #[snafu(source)] @@ -118,35 +62,6 @@ pub enum Error { location: Location, }, - #[snafu(display( - "Schema from Substrait proto doesn't match with the schema in storage. 
- Substrait schema: {:?} - Storage schema: {:?}", - substrait_schema, - storage_schema - ))] - SchemaNotMatch { - substrait_schema: datafusion::arrow::datatypes::SchemaRef, - storage_schema: datafusion::arrow::datatypes::SchemaRef, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Failed to convert DataFusion schema"))] - ConvertDfSchema { - #[snafu(implicit)] - location: Location, - source: datatypes::error::Error, - }, - - #[snafu(display("Unable to resolve table: {table_name}, error: "))] - ResolveTable { - table_name: String, - #[snafu(implicit)] - location: Location, - source: catalog::error::Error, - }, - #[snafu(display("Failed to encode DataFusion plan"))] EncodeDfPlan { #[snafu(source)] @@ -169,24 +84,13 @@ pub type Result = std::result::Result; impl ErrorExt for Error { fn status_code(&self) -> StatusCode { match self { - Error::UnsupportedConcreteType { .. } - | Error::UnsupportedPlan { .. } - | Error::UnsupportedExpr { .. } - | Error::UnsupportedSubstraitType { .. } => StatusCode::Unsupported, - Error::UnknownPlan { .. } - | Error::EncodeRel { .. } - | Error::DecodeRel { .. } - | Error::EmptyPlan { .. } - | Error::EmptyExpr { .. } - | Error::MissingField { .. } - | Error::InvalidParameters { .. } - | Error::SchemaNotMatch { .. } => StatusCode::InvalidArguments, + Error::UnknownPlan { .. } | Error::EncodeRel { .. } | Error::DecodeRel { .. } => { + StatusCode::InvalidArguments + } Error::DFInternal { .. } | Error::Internal { .. } | Error::EncodeDfPlan { .. } | Error::DecodeDfPlan { .. } => StatusCode::Internal, - Error::ConvertDfSchema { source, .. } => source.status_code(), - Error::ResolveTable { source, .. } => source.status_code(), } } diff --git a/src/common/substrait/src/extension_serializer.rs b/src/common/substrait/src/extension_serializer.rs index 89944db508f9..a8179437687e 100644 --- a/src/common/substrait/src/extension_serializer.rs +++ b/src/common/substrait/src/extension_serializer.rs @@ -67,7 +67,6 @@ impl SerializerRegistry for ExtensionSerializer { name if name == EmptyMetric::name() => Err(DataFusionError::Substrait( "EmptyMetric should not be serialized".to_string(), )), - "MergeScan" => Ok(vec![]), other => Err(DataFusionError::NotImplemented(format!( "Serizlize logical plan for {}", other diff --git a/src/common/substrait/src/lib.rs b/src/common/substrait/src/lib.rs index 8a03dd7308ed..756e701c489a 100644 --- a/src/common/substrait/src/lib.rs +++ b/src/common/substrait/src/lib.rs @@ -23,11 +23,11 @@ use async_trait::async_trait; use bytes::{Buf, Bytes}; use datafusion::catalog::CatalogProviderList; use datafusion::execution::context::SessionState; +pub use datafusion::execution::registry::SerializerRegistry; /// Re-export the Substrait module of datafusion, /// note this is a different version of the `substrait_proto` crate pub use datafusion_substrait::substrait as substrait_proto_df; pub use datafusion_substrait::{logical_plan as df_logical_plan, variation_const}; -use session::context::QueryContextRef; pub use substrait_proto; pub use crate::df_substrait::DFLogicalSubstraitConvertor; @@ -42,8 +42,11 @@ pub trait SubstraitPlan { message: B, catalog_list: Arc, state: SessionState, - query_ctx: QueryContextRef, ) -> Result; - fn encode(&self, plan: &Self::Plan) -> Result; + fn encode( + &self, + plan: &Self::Plan, + serializer: impl SerializerRegistry + 'static, + ) -> Result; } diff --git a/src/common/telemetry/src/logging.rs b/src/common/telemetry/src/logging.rs index 62fa9a5bf60b..4088c5236ca8 100644 --- 
a/src/common/telemetry/src/logging.rs +++ b/src/common/telemetry/src/logging.rs @@ -94,7 +94,7 @@ pub fn init_default_ut_logging() { env::var("UNITTEST_LOG_DIR").unwrap_or_else(|_| "/tmp/__unittest_logs".to_string()); let level = env::var("UNITTEST_LOG_LEVEL").unwrap_or_else(|_| - "debug,hyper=warn,tower=warn,datafusion=warn,reqwest=warn,sqlparser=warn,h2=info,opendal=info".to_string() + "debug,hyper=warn,tower=warn,datafusion=warn,reqwest=warn,sqlparser=warn,h2=info,opendal=info,rskafka=info".to_string() ); let opts = LoggingOptions { dir: dir.clone(), diff --git a/src/datanode/Cargo.toml b/src/datanode/Cargo.toml index 26a7ccb67563..a5408b0c3246 100644 --- a/src/datanode/Cargo.toml +++ b/src/datanode/Cargo.toml @@ -57,7 +57,6 @@ servers.workspace = true session.workspace = true snafu.workspace = true store-api.workspace = true -substrait.workspace = true table.workspace = true tokio.workspace = true toml.workspace = true diff --git a/src/datanode/src/error.rs b/src/datanode/src/error.rs index 945d03422731..919a921ec349 100644 --- a/src/datanode/src/error.rs +++ b/src/datanode/src/error.rs @@ -64,11 +64,18 @@ pub enum Error { source: query::error::Error, }, + #[snafu(display("Failed to create plan decoder"))] + NewPlanDecoder { + #[snafu(implicit)] + location: Location, + source: query::error::Error, + }, + #[snafu(display("Failed to decode logical plan"))] DecodeLogicalPlan { #[snafu(implicit)] location: Location, - source: substrait::error::Error, + source: common_query::error::Error, }, #[snafu(display("Incorrect internal state: {}", state))] @@ -388,7 +395,9 @@ impl ErrorExt for Error { fn status_code(&self) -> StatusCode { use Error::*; match self { - ExecuteLogicalPlan { source, .. } => source.status_code(), + NewPlanDecoder { source, .. } | ExecuteLogicalPlan { source, .. } => { + source.status_code() + } BuildRegionRequests { source, .. } => source.status_code(), HandleHeartbeatResponse { source, .. } | GetMetadata { source, .. 
} => { diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs index 469ed0a6ccf1..13b10c497cef 100644 --- a/src/datanode/src/region_server.rs +++ b/src/datanode/src/region_server.rs @@ -51,13 +51,13 @@ use store_api::metric_engine_consts::{ use store_api::region_engine::{RegionEngineRef, RegionRole, SetReadonlyResponse}; use store_api::region_request::{AffectedRows, RegionCloseRequest, RegionRequest}; use store_api::storage::RegionId; -use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan}; use tonic::{Request, Response, Result as TonicResult}; use crate::error::{ self, BuildRegionRequestsSnafu, DecodeLogicalPlanSnafu, ExecuteLogicalPlanSnafu, - FindLogicalRegionsSnafu, HandleRegionRequestSnafu, RegionEngineNotFoundSnafu, - RegionNotFoundSnafu, Result, StopRegionEngineSnafu, UnexpectedSnafu, UnsupportedOutputSnafu, + FindLogicalRegionsSnafu, HandleRegionRequestSnafu, NewPlanDecoderSnafu, + RegionEngineNotFoundSnafu, RegionNotFoundSnafu, Result, StopRegionEngineSnafu, UnexpectedSnafu, + UnsupportedOutputSnafu, }; use crate::event_listener::RegionServerEventListenerRef; @@ -189,7 +189,7 @@ impl RegionServer { pub async fn region_disk_usage(&self, region_id: RegionId) -> Option { match self.inner.region_map.get(®ion_id) { - Some(e) => e.region_disk_usage(region_id).await, + Some(e) => e.region_disk_usage(region_id), None => None, } } @@ -653,14 +653,13 @@ impl RegionServerInner { let catalog_list = Arc::new(DummyCatalogList::with_table_provider(table_provider)); let query_engine_ctx = self.query_engine.engine_context(ctx.clone()); + let plan_decoder = query_engine_ctx + .new_plan_decoder() + .context(NewPlanDecoderSnafu)?; + // decode substrait plan to logical plan and execute it - let logical_plan = DFLogicalSubstraitConvertor - .decode( - Bytes::from(plan), - catalog_list, - query_engine_ctx.state().clone(), - ctx.clone(), - ) + let logical_plan = plan_decoder + .decode(Bytes::from(plan), catalog_list, false) .await .context(DecodeLogicalPlanSnafu)?; diff --git a/src/datanode/src/tests.rs b/src/datanode/src/tests.rs index 327e1be46256..b115b366c4af 100644 --- a/src/datanode/src/tests.rs +++ b/src/datanode/src/tests.rs @@ -200,7 +200,7 @@ impl RegionEngine for MockRegionEngine { unimplemented!() } - async fn region_disk_usage(&self, _region_id: RegionId) -> Option { + fn region_disk_usage(&self, _region_id: RegionId) -> Option { unimplemented!() } diff --git a/src/file-engine/src/engine.rs b/src/file-engine/src/engine.rs index e0a3a6ebdc42..f71622178dfe 100644 --- a/src/file-engine/src/engine.rs +++ b/src/file-engine/src/engine.rs @@ -107,7 +107,7 @@ impl RegionEngine for FileRegionEngine { self.inner.stop().await.map_err(BoxedError::new) } - async fn region_disk_usage(&self, _: RegionId) -> Option { + fn region_disk_usage(&self, _: RegionId) -> Option { None } diff --git a/src/flow/src/adapter.rs b/src/flow/src/adapter.rs index 25bb3cb2bf2c..52209a172f63 100644 --- a/src/flow/src/adapter.rs +++ b/src/flow/src/adapter.rs @@ -35,12 +35,12 @@ use itertools::Itertools; use query::{QueryEngine, QueryEngineFactory}; use serde::{Deserialize, Serialize}; use session::context::QueryContext; -use snafu::{OptionExt, ResultExt}; +use snafu::{ensure, OptionExt, ResultExt}; use store_api::storage::{ConcreteDataType, RegionId}; use table::metadata::TableId; use tokio::sync::{oneshot, watch, Mutex, RwLock}; -use crate::adapter::error::{ExternalSnafu, TableNotFoundSnafu, UnexpectedSnafu}; +use crate::adapter::error::{ExternalSnafu, InternalSnafu, TableNotFoundSnafu, 
UnexpectedSnafu}; pub(crate) use crate::adapter::node_context::FlownodeContext; use crate::adapter::table_source::TableSource; use crate::adapter::util::column_schemas_to_proto; @@ -66,6 +66,11 @@ use error::Error; pub const PER_REQ_MAX_ROW_CNT: usize = 8192; +// TODO: replace this with `GREPTIME_TIMESTAMP` before v0.9 +pub const AUTO_CREATED_PLACEHOLDER_TS_COL: &str = "__ts_placeholder"; + +pub const UPDATE_AT_TS_COL: &str = "update_at"; + // TODO: refactor common types for flow to a separate module /// FlowId is a unique identifier for a flow task pub type FlowId = u64; @@ -279,10 +284,16 @@ impl FlownodeManager { .map(|i| meta.schema.column_schemas[i].name.clone()) .collect_vec(); let schema = meta.schema.column_schemas; - let is_auto_create = schema - .last() - .map(|s| s.name == "__ts_placeholder") - .unwrap_or(false); + // check if the last column is the auto created timestamp column, hence the table is auto created from + // flow's plan type + let is_auto_create = { + let correct_name = schema + .last() + .map(|s| s.name == AUTO_CREATED_PLACEHOLDER_TS_COL) + .unwrap_or(false); + let correct_time_index = meta.schema.timestamp_index == Some(schema.len() - 1); + correct_name && correct_time_index + }; (primary_keys, schema, is_auto_create) } else { // TODO(discord9): condiser remove buggy auto create by schema @@ -302,6 +313,7 @@ impl FlownodeManager { .clone(); // TODO(discord9): use default key from schema let primary_keys = schema + .typ() .keys .first() .map(|v| { @@ -312,24 +324,31 @@ impl FlownodeManager { }) .unwrap_or_default(); let update_at = ColumnSchema::new( - "update_at", + UPDATE_AT_TS_COL, ConcreteDataType::timestamp_millisecond_datatype(), true, ); // TODO(discord9): bugged so we can't infer time index from flow plan, so we have to manually set one let ts_col = ColumnSchema::new( - "__ts_placeholder", + AUTO_CREATED_PLACEHOLDER_TS_COL, ConcreteDataType::timestamp_millisecond_datatype(), true, ) .with_time_index(true); let wout_ts = schema + .typ() .column_types + .clone() .into_iter() .enumerate() .map(|(idx, typ)| { - ColumnSchema::new(format!("Col_{idx}"), typ.scalar_type, typ.nullable) + let name = schema + .names + .get(idx) + .cloned() + .unwrap_or(format!("Col_{}", idx)); + ColumnSchema::new(name, typ.scalar_type, typ.nullable) }) .collect_vec(); @@ -339,7 +358,7 @@ impl FlownodeManager { (primary_keys, with_ts, true) }; - + let schema_len = schema.len(); let proto_schema = column_schemas_to_proto(schema, &primary_keys)?; debug!( @@ -348,16 +367,7 @@ impl FlownodeManager { table_name.join("."), reqs ); - let now = SystemTime::now(); - let now = now - .duration_since(SystemTime::UNIX_EPOCH) - .map(|s| s.as_millis() as repr::Timestamp) - .unwrap_or_else(|_| { - -(SystemTime::UNIX_EPOCH - .duration_since(now) - .unwrap() - .as_millis() as repr::Timestamp) - }); + let now = self.tick_manager.tick(); for req in reqs { match req { DiffRequest::Insert(insert) => { @@ -370,13 +380,23 @@ impl FlownodeManager { ))]); // ts col, if auto create if is_auto_create { + ensure!( + row.len() == schema_len - 1, + InternalSnafu { + reason: format!( + "Row len mismatch, expect {} got {}", + schema_len - 1, + row.len() + ) + } + ); row.extend([Value::from( common_time::Timestamp::new_millisecond(0), )]); } - row.into() + Ok(row.into()) }) - .collect::>(); + .collect::, Error>>()?; let table_name = table_name.last().unwrap().clone(); let req = RowInsertRequest { table_name, @@ -490,9 +510,12 @@ impl FlownodeManager { debug!("Starting to run"); loop { // TODO(discord9): only run when 
new inputs arrive or scheduled to - self.run_available().await.unwrap(); + debug!("call run_available in run every second"); + self.run_available(true).await.unwrap(); + debug!("call send_writeback_requests in run every second"); // TODO(discord9): error handling self.send_writeback_requests().await.unwrap(); + debug!("call log_all_errors in run every second"); self.log_all_errors().await; tokio::time::sleep(std::time::Duration::from_secs(1)).await; } @@ -501,29 +524,44 @@ impl FlownodeManager { /// Run all available subgraph in the flow node /// This will try to run all dataflow in this node /// - /// However this is not blocking and can sometimes return while actual computation is still running in worker thread + /// set `blocking` to true to wait until lock is acquired + /// and false to return immediately if lock is not acquired /// TODO(discord9): add flag for subgraph that have input since last run - pub async fn run_available(&self) -> Result<(), Error> { - let now = self.tick_manager.tick(); - + pub async fn run_available(&self, blocking: bool) -> Result<(), Error> { loop { + let now = self.tick_manager.tick(); for worker in self.worker_handles.iter() { // TODO(discord9): consider how to handle error in individual worker - worker.lock().await.run_available(now).await.unwrap(); + if blocking { + worker.lock().await.run_available(now).await?; + } else if let Ok(worker) = worker.try_lock() { + worker.run_available(now).await?; + } else { + return Ok(()); + } } // first check how many inputs were sent - let send_cnt = match self.node_context.lock().await.flush_all_sender() { - Ok(cnt) => cnt, + let (flush_res, buf_len) = if blocking { + let mut ctx = self.node_context.lock().await; + (ctx.flush_all_sender(), ctx.get_send_buf_size()) + } else { + match self.node_context.try_lock() { + Ok(mut ctx) => (ctx.flush_all_sender(), ctx.get_send_buf_size()), + Err(_) => return Ok(()), + } + }; + match flush_res { + Ok(_) => (), Err(err) => { common_telemetry::error!("Flush send buf errors: {:?}", err); break; } }; - // if no inputs - if send_cnt == 0 { + // if no thing in send buf then break + if buf_len == 0 { break; } else { - debug!("FlownodeManager::run_available: send_cnt={}", send_cnt); + debug!("Send buf len = {}", buf_len); } } @@ -543,6 +581,8 @@ impl FlownodeManager { ); let table_id = region_id.table_id(); self.node_context.lock().await.send(table_id, rows)?; + // TODO(discord9): put it in a background task? 
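The `blocking` flag added to `run_available` above boils down to "wait for the worker lock" versus "skip this tick if the lock is contended". A small tokio sketch of that pattern, using a hypothetical `Worker` type rather than the flow crate's API:

```rust
use std::sync::Arc;
use tokio::sync::Mutex;

struct Worker {
    processed: u64,
}

impl Worker {
    async fn run_available(&mut self, now: i64) {
        // Placeholder for running all ready dataflow subgraphs at `now`.
        self.processed += 1;
        let _ = now;
    }
}

// Mirrors the shape of `FlownodeManager::run_available`: blocking callers wait
// on the lock, non-blocking callers bail out if another task holds it.
async fn run_available(worker: &Arc<Mutex<Worker>>, now: i64, blocking: bool) {
    if blocking {
        worker.lock().await.run_available(now).await;
    } else if let Ok(mut guard) = worker.try_lock() {
        guard.run_available(now).await;
    } else {
        // Lock is contended; return immediately and let the next tick retry.
    }
}

#[tokio::main]
async fn main() {
    let worker = Arc::new(Mutex::new(Worker { processed: 0 }));
    run_available(&worker, 1, true).await;
    run_available(&worker, 2, false).await;
    println!("processed = {}", worker.lock().await.processed);
}
```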
+ // self.run_available(false).await?; Ok(()) } } @@ -653,21 +693,22 @@ impl FlownodeManager { /// /// TODO(discord9): better way to do it, and not expose flow tick even to other flow to avoid /// TSO coord mess -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct FlowTickManager { + /// The starting instant of the flow, used with `start_timestamp` to calculate the current timestamp start: Instant, -} - -impl std::fmt::Debug for FlowTickManager { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("FlowTickManager").finish() - } + /// The timestamp when the flow started + start_timestamp: repr::Timestamp, } impl FlowTickManager { pub fn new() -> Self { FlowTickManager { start: Instant::now(), + start_timestamp: SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_millis() as repr::Timestamp, } } @@ -677,6 +718,6 @@ impl FlowTickManager { pub fn tick(&self) -> repr::Timestamp { let current = Instant::now(); let since_the_epoch = current - self.start; - since_the_epoch.as_millis() as repr::Timestamp + since_the_epoch.as_millis() as repr::Timestamp + self.start_timestamp } } diff --git a/src/flow/src/adapter/flownode_impl.rs b/src/flow/src/adapter/flownode_impl.rs index e770bb5e4cf1..152251975ab8 100644 --- a/src/flow/src/adapter/flownode_impl.rs +++ b/src/flow/src/adapter/flownode_impl.rs @@ -14,13 +14,17 @@ //! impl `FlowNode` trait for FlowNodeManager so standalone can call them +use std::collections::HashMap; + use api::v1::flow::{flow_request, CreateRequest, DropRequest, FlowRequest, FlowResponse}; use api::v1::region::InsertRequests; use common_error::ext::BoxedError; use common_meta::error::{ExternalSnafu, Result, UnexpectedSnafu}; use common_meta::node_manager::Flownode; +use common_telemetry::debug; use itertools::Itertools; -use snafu::ResultExt; +use snafu::{OptionExt, ResultExt}; +use store_api::storage::RegionId; use crate::adapter::FlownodeManager; use crate::repr::{self, DiffRow}; @@ -101,12 +105,57 @@ impl Flownode for FlownodeManager { async fn handle_inserts(&self, request: InsertRequests) -> Result { for write_request in request.requests { let region_id = write_request.region_id; - let rows_proto = write_request.rows.map(|r| r.rows).unwrap_or(vec![]); + let table_id = RegionId::from(region_id).table_id(); + + let (insert_schema, rows_proto) = write_request + .rows + .map(|r| (r.schema, r.rows)) + .unwrap_or_default(); + // TODO(discord9): reconsider time assignment mechanism let now = self.tick_manager.tick(); + + let fetch_order = { + let ctx = self.node_context.lock().await; + let table_col_names = ctx + .table_repr + .get_by_table_id(&table_id) + .map(|r| r.1) + .and_then(|id| ctx.schema.get(&id)) + .map(|desc| &desc.names) + .context(UnexpectedSnafu { + err_msg: format!("Table not found: {}", table_id), + })?; + let name_to_col = HashMap::<_, _>::from_iter( + insert_schema + .iter() + .enumerate() + .map(|(i, name)| (&name.column_name, i)), + ); + let fetch_order: Vec = table_col_names + .iter() + .map(|names| { + name_to_col.get(names).copied().context(UnexpectedSnafu { + err_msg: format!("Column not found: {}", names), + }) + }) + .try_collect()?; + if !fetch_order.iter().enumerate().all(|(i, &v)| i == v) { + debug!("Reordering columns: {:?}", fetch_order) + } + fetch_order + }; + let rows: Vec = rows_proto .into_iter() - .map(repr::Row::from) + .map(|r| { + let r = repr::Row::from(r); + let reordered = fetch_order + .iter() + .map(|&i| r.inner[i].clone()) + .collect_vec(); + repr::Row::new(reordered) + }) 
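The reworked `FlowTickManager` above anchors ticks to the wall clock captured at startup instead of counting from zero. A std-only sketch of that arithmetic, with a simplified `Timestamp` alias standing in for `repr::Timestamp`:

```rust
use std::time::{Instant, SystemTime, UNIX_EPOCH};

type Timestamp = i64;

struct FlowTickManager {
    start: Instant,
    start_timestamp: Timestamp, // unix millis captured when the flow started
}

impl FlowTickManager {
    fn new() -> Self {
        Self {
            start: Instant::now(),
            start_timestamp: SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .unwrap()
                .as_millis() as Timestamp,
        }
    }

    // Monotonic elapsed time plus the startup wall-clock offset: ticks line up
    // with real timestamps but can never run backwards.
    fn tick(&self) -> Timestamp {
        (Instant::now() - self.start).as_millis() as Timestamp + self.start_timestamp
    }
}

fn main() {
    let tm = FlowTickManager::new();
    let (a, b) = (tm.tick(), tm.tick());
    assert!(b >= a);
    println!("tick = {a}");
}
```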
.map(|r| (r, now, 1)) .collect_vec(); self.handle_write_request(region_id.into(), rows) diff --git a/src/flow/src/adapter/node_context.rs b/src/flow/src/adapter/node_context.rs index b1d01373fb8a..ffaa3cc70252 100644 --- a/src/flow/src/adapter/node_context.rs +++ b/src/flow/src/adapter/node_context.rs @@ -27,7 +27,7 @@ use crate::adapter::error::{Error, EvalSnafu, TableNotFoundSnafu}; use crate::adapter::{FlowId, TableName, TableSource}; use crate::expr::error::InternalSnafu; use crate::expr::GlobalId; -use crate::repr::{DiffRow, RelationType, BROADCAST_CAP}; +use crate::repr::{DiffRow, RelationDesc, RelationType, BROADCAST_CAP}; /// A context that holds the information of the dataflow #[derive(Default, Debug)] @@ -51,10 +51,8 @@ pub struct FlownodeContext { mpsc::UnboundedReceiver, ), >, - /// store source in buffer for each source table, in case broadcast channel is full - pub send_buffer: BTreeMap>, /// the schema of the table, query from metasrv or inferred from TypedPlan - pub schema: HashMap, + pub schema: HashMap, /// All the tables that have been registered in the worker pub table_repr: IdToNameMap, pub query_context: Option>, @@ -73,7 +71,8 @@ pub struct SourceSender { impl Default for SourceSender { fn default() -> Self { Self { - sender: broadcast::Sender::new(BROADCAST_CAP), + // TODO(discord9): found a better way then increase this to prevent lagging and hence missing input data + sender: broadcast::Sender::new(BROADCAST_CAP * 2), send_buf: Default::default(), } } @@ -109,6 +108,7 @@ impl SourceSender { } if row_cnt > 0 { debug!("Send {} rows", row_cnt); + debug!("Remaining Send buf.len() = {}", self.send_buf.len()); } Ok(row_cnt) @@ -140,12 +140,19 @@ impl FlownodeContext { } /// flush all sender's buf + /// + /// return numbers being sent pub fn flush_all_sender(&mut self) -> Result { self.source_sender .iter_mut() .map(|(_table_id, src_sender)| src_sender.try_send_all()) .try_fold(0, |acc, x| x.map(|x| x + acc)) } + + /// Return the sum number of rows in all send buf + pub fn get_send_buf_size(&self) -> usize { + self.source_sender.values().map(|v| v.send_buf.len()).sum() + } } impl FlownodeContext { @@ -226,7 +233,7 @@ impl FlownodeContext { /// Retrieves a GlobalId and table schema representing a table previously registered by calling the [register_table] function. 
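The `fetch_order` logic in `handle_inserts` above maps the insert request's column order onto the order the flow's schema expects before rows are handed to the dataflow. A simplified sketch of that reordering, using plain strings and `Vec<i64>` rows instead of the proto and `repr::Row` types:

```rust
use std::collections::HashMap;

// For each column the flow schema expects, find its index in the incoming
// insert schema; `None` means a required column is missing.
fn fetch_order(table_cols: &[&str], insert_cols: &[&str]) -> Option<Vec<usize>> {
    let name_to_col: HashMap<_, _> =
        insert_cols.iter().enumerate().map(|(i, n)| (*n, i)).collect();
    table_cols.iter().map(|n| name_to_col.get(n).copied()).collect()
}

fn reorder_row(row: &[i64], order: &[usize]) -> Vec<i64> {
    order.iter().map(|&i| row[i]).collect()
}

fn main() {
    // The flow expects (number, ts) but the insert arrives as (ts, number).
    let order = fetch_order(&["number", "ts"], &["ts", "number"]).expect("column not found");
    assert_eq!(order, vec![1, 0]);
    assert_eq!(reorder_row(&[1_717_000_000, 42], &order), vec![42, 1_717_000_000]);
}
```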
/// /// Returns an error if no table has been registered with the provided names - pub fn table(&self, name: &TableName) -> Result<(GlobalId, RelationType), Error> { + pub fn table(&self, name: &TableName) -> Result<(GlobalId, RelationDesc), Error> { let id = self .table_repr .get_by_name(name) @@ -297,7 +304,7 @@ impl FlownodeContext { .get_by_name(table_name) .map(|(_, gid)| gid) .unwrap(); - self.schema.insert(gid, schema); + self.schema.insert(gid, schema.into_unnamed()); Ok(()) } diff --git a/src/flow/src/adapter/table_source.rs b/src/flow/src/adapter/table_source.rs index cfa41f785ac8..53932cd692c2 100644 --- a/src/flow/src/adapter/table_source.rs +++ b/src/flow/src/adapter/table_source.rs @@ -17,7 +17,6 @@ use common_error::ext::BoxedError; use common_meta::key::table_info::{TableInfoManager, TableInfoValue}; use common_meta::key::table_name::{TableNameKey, TableNameManager}; -use itertools::Itertools; use snafu::{OptionExt, ResultExt}; use table::metadata::TableId; @@ -25,7 +24,7 @@ use crate::adapter::error::{ Error, ExternalSnafu, TableNotFoundMetaSnafu, TableNotFoundSnafu, UnexpectedSnafu, }; use crate::adapter::TableName; -use crate::repr::{self, ColumnType, RelationType}; +use crate::repr::{self, ColumnType, RelationDesc, RelationType}; /// mapping of table name <-> table id should be query from tableinfo manager pub struct TableSource { @@ -107,7 +106,7 @@ impl TableSource { pub async fn get_table_name_schema( &self, table_id: &TableId, - ) -> Result<(TableName, RelationType), Error> { + ) -> Result<(TableName, RelationDesc), Error> { let table_info_value = self .get_table_info_value(table_id) .await? @@ -123,14 +122,20 @@ impl TableSource { ]; let raw_schema = table_info_value.table_info.meta.schema; - let column_types = raw_schema + let (column_types, col_names): (Vec<_>, Vec<_>) = raw_schema .column_schemas + .clone() .into_iter() - .map(|col| ColumnType { - nullable: col.is_nullable(), - scalar_type: col.data_type, + .map(|col| { + ( + ColumnType { + nullable: col.is_nullable(), + scalar_type: col.data_type, + }, + col.name, + ) }) - .collect_vec(); + .unzip(); let key = table_info_value.table_info.meta.primary_key_indices; let keys = vec![repr::Key::from(key)]; @@ -138,10 +143,13 @@ impl TableSource { let time_index = raw_schema.timestamp_index; Ok(( table_name, - RelationType { - column_types, - keys, - time_index, + RelationDesc { + typ: RelationType { + column_types, + keys, + time_index, + }, + names: col_names, }, )) } diff --git a/src/flow/src/compute/render/map.rs b/src/flow/src/compute/render/map.rs index 2261f4de14f7..50bd48f5fb70 100644 --- a/src/flow/src/compute/render/map.rs +++ b/src/flow/src/compute/render/map.rs @@ -124,9 +124,13 @@ fn mfp_subgraph( // 1. Read all updates that were emitted between the last time this arrangement had updates and the current time. // 2. Output the updates. // 3. Truncate all updates within that range. 
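`get_table_name_schema` above now returns a `RelationDesc` (column types plus column names) by unzipping the table's column schemas in a single pass. A compact stand-alone sketch of that unzip step, with a simplified column type rather than the repr crate's real definitions:

```rust
#[derive(Debug, Clone, PartialEq)]
struct ColumnType {
    nullable: bool,
    scalar_type: &'static str, // stand-in for ConcreteDataType
}

#[derive(Debug, PartialEq)]
struct RelationDesc {
    column_types: Vec<ColumnType>,
    names: Vec<String>,
}

fn to_desc(column_schemas: &[(&str, &'static str, bool)]) -> RelationDesc {
    // One pass over the schema yields the types and the names side by side,
    // matching the `unzip()` in the hunk above.
    let (column_types, names): (Vec<_>, Vec<_>) = column_schemas
        .iter()
        .map(|&(name, ty, nullable)| {
            (ColumnType { nullable, scalar_type: ty }, name.to_string())
        })
        .unzip();
    RelationDesc { column_types, names }
}

fn main() {
    let desc = to_desc(&[("number", "uint32", false), ("ts", "datetime", false)]);
    assert_eq!(desc.names, vec!["number".to_string(), "ts".to_string()]);
    assert_eq!(desc.column_types.len(), 2);
}
```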
- let from = arrange.read().last_compaction_time().map(|n| n + 1); + let from = arrange.read().last_compaction_time(); let from = from.unwrap_or(repr::Timestamp::MIN); - let output_kv = arrange.read().get_updates_in_range(from..=now); + let range = ( + std::ops::Bound::Excluded(from), + std::ops::Bound::Included(now), + ); + let output_kv = arrange.read().get_updates_in_range(range); // the output is expected to be key -> empty val let output = output_kv .into_iter() diff --git a/src/flow/src/compute/render/reduce.rs b/src/flow/src/compute/render/reduce.rs index e46f8c2bedc3..fa29a6324215 100644 --- a/src/flow/src/compute/render/reduce.rs +++ b/src/flow/src/compute/render/reduce.rs @@ -26,7 +26,7 @@ use crate::adapter::error::{Error, PlanSnafu}; use crate::compute::render::{Context, SubgraphArg}; use crate::compute::state::Scheduler; use crate::compute::types::{Arranged, Collection, CollectionBundle, ErrCollector, Toff}; -use crate::expr::error::{DataTypeSnafu, InternalSnafu}; +use crate::expr::error::{DataAlreadyExpiredSnafu, DataTypeSnafu, InternalSnafu}; use crate::expr::{AggregateExpr, EvalError, ScalarExpr}; use crate::plan::{AccumulablePlan, AggrWithIndex, KeyValPlan, Plan, ReducePlan, TypedPlan}; use crate::repr::{self, DiffRow, KeyValDiffRow, RelationType, Row}; @@ -301,9 +301,13 @@ fn update_reduce_distinct_arrange( // Deal with output: // 1. Read all updates that were emitted between the last time this arrangement had updates and the current time. - let from = arrange.read().last_compaction_time().map(|n| n + 1); + let from = arrange.read().last_compaction_time(); let from = from.unwrap_or(repr::Timestamp::MIN); - let output_kv = arrange.read().get_updates_in_range(from..=now); + let range = ( + std::ops::Bound::Excluded(from), + std::ops::Bound::Included(now), + ); + let output_kv = arrange.read().get_updates_in_range(range); // 2. Truncate all updates stored in arrangement within that range. let run_compaction = || { @@ -397,6 +401,29 @@ fn reduce_accum_subgraph( // TODO(discord9): consider key-based lock let mut arrange = arrange.write(); for (key, value_diffs) in key_to_vals { + if let Some(expire_man) = &arrange.get_expire_state() { + let mut is_expired = false; + err_collector.run(|| { + if let Some(expired) = expire_man.get_expire_duration(now, &key)? 
{ + is_expired = true; + // expired data is ignored in computation, and a simple warning is logged + common_telemetry::warn!( + "Data already expired: {}", + DataAlreadyExpiredSnafu { + expired_by: expired, + } + .build() + ); + Ok(()) + } else { + Ok(()) + } + }); + if is_expired { + // errors already collected, we can just continue to next key + continue; + } + } let col_diffs = { let row_len = value_diffs[0].0.len(); let res = err_collector.run(|| get_col_diffs(value_diffs, row_len)); diff --git a/src/flow/src/compute/render/src_sink.rs b/src/flow/src/compute/render/src_sink.rs index 33ecb9670caa..96411b6d04b0 100644 --- a/src/flow/src/compute/render/src_sink.rs +++ b/src/flow/src/compute/render/src_sink.rs @@ -20,12 +20,14 @@ use common_telemetry::{debug, info}; use hydroflow::scheduled::graph_ext::GraphExt; use itertools::Itertools; use snafu::OptionExt; +use tokio::sync::broadcast::error::TryRecvError; use tokio::sync::{broadcast, mpsc}; use crate::adapter::error::{Error, PlanSnafu}; use crate::compute::render::Context; use crate::compute::types::{Arranged, Collection, CollectionBundle, Toff}; -use crate::expr::GlobalId; +use crate::expr::error::InternalSnafu; +use crate::expr::{EvalError, GlobalId}; use crate::repr::{DiffRow, Row, BROADCAST_CAP}; #[allow(clippy::mutable_key_type)] @@ -55,18 +57,43 @@ impl<'referred, 'df> Context<'referred, 'df> { .df .add_subgraph_source("source", send_port, move |_ctx, send| { let now = *now.borrow(); - let arr = arrange_handler_inner.write().get_updates_in_range(..=now); - err_collector.run(|| arrange_handler_inner.write().compact_to(now)); + // write lock to prevent unexpected mutation + let mut arranged = arrange_handler_inner.write(); + let arr = arranged.get_updates_in_range(..=now); + err_collector.run(|| arranged.compact_to(now)); + debug!("Call source"); let prev_avail = arr.into_iter().map(|((k, _), t, d)| (k, t, d)); let mut to_send = Vec::new(); let mut to_arrange = Vec::new(); // TODO(discord9): handling tokio broadcast error - while let Ok((r, t, d)) = src_recv.try_recv() { - if t <= now { - to_send.push((r, t, d)); - } else { - to_arrange.push(((r, Row::empty()), t, d)); + loop { + match src_recv.try_recv() { + Ok((r, t, d)) => { + if t <= now { + to_send.push((r, t, d)); + } else { + to_arrange.push(((r, Row::empty()), t, d)); + } + } + Err(TryRecvError::Empty) => { + break; + } + Err(TryRecvError::Lagged(lag_offset)) => { + common_telemetry::error!("Flow missing {} rows behind", lag_offset); + break; + } + Err(err) => { + err_collector.run(|| -> Result<(), EvalError> { + InternalSnafu { + reason: format!( + "Error receiving from broadcast channel: {}", + err + ), + } + .fail() + }); + } } } let all = prev_avail.chain(to_send).collect_vec(); @@ -77,10 +104,10 @@ impl<'referred, 'df> Context<'referred, 'df> { to_arrange.len() ); } - err_collector.run(|| arrange_handler_inner.write().apply_updates(now, to_arrange)); + err_collector.run(|| arranged.apply_updates(now, to_arrange)); send.give(all); - // always schedule source to run at next tick - inner_schd.schedule_at(now + 1); + // always schedule source to run at now so we can repeatedly run source if needed + inner_schd.schedule_at(now); }); schd.set_cur_subgraph(sub); let arranged = Arranged::new(arrange_handler); diff --git a/src/flow/src/expr/error.rs b/src/flow/src/expr/error.rs index 5a2823423974..09ad758056ba 100644 --- a/src/flow/src/expr/error.rs +++ b/src/flow/src/expr/error.rs @@ -100,4 +100,11 @@ pub enum EvalError { #[snafu(implicit)] location: Location, }, + + 
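The `mfp_subgraph` and `update_reduce_distinct_arrange` hunks above replace `last_compaction_time().map(|n| n + 1)` with an explicitly excluded lower bound, so updates at exactly the last compaction time are skipped without the `+ 1` arithmetic. A std-only sketch of that half-open-from-below range over a `BTreeMap`, standing in for the arrangement's update log:

```rust
use std::collections::BTreeMap;
use std::ops::Bound;

fn main() {
    // Keyed by timestamp, as an arrangement's update log conceptually is.
    let updates = BTreeMap::from([(10, "a"), (11, "b"), (12, "c"), (13, "d")]);

    let last_compaction_time = 11;
    let now = 13;

    // Everything strictly after the last compaction, up to and including now,
    // mirroring `get_updates_in_range((Excluded(from), Included(now)))`.
    let range = (Bound::Excluded(last_compaction_time), Bound::Included(now));
    let picked: Vec<_> = updates.range(range).map(|(ts, v)| (*ts, *v)).collect();

    assert_eq!(picked, vec![(12, "c"), (13, "d")]);
}
```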
#[snafu(display("Incoming data already expired by {} ms", expired_by))] + DataAlreadyExpired { + expired_by: i64, + #[snafu(implicit)] + location: Location, + }, } diff --git a/src/flow/src/expr/func.rs b/src/flow/src/expr/func.rs index 7957f70cb6c4..31131a2758eb 100644 --- a/src/flow/src/expr/func.rs +++ b/src/flow/src/expr/func.rs @@ -76,6 +76,13 @@ impl UnmaterializableFunc { } } + pub fn is_valid_func_name(name: &str) -> bool { + matches!( + name.to_lowercase().as_str(), + "now" | "current_schema" | "tumble" + ) + } + /// Create a UnmaterializableFunc from a string of the function name pub fn from_str_args(name: &str, args: Vec) -> Result { match name.to_lowercase().as_str() { @@ -183,6 +190,13 @@ impl UnaryFunc { } } + pub fn is_valid_func_name(name: &str) -> bool { + matches!( + name.to_lowercase().as_str(), + "not" | "is_null" | "is_true" | "is_false" | "step_timestamp" | "cast" + ) + } + /// Create a UnaryFunc from a string of the function name and given argument type(optional) pub fn from_str_and_type( name: &str, @@ -278,9 +292,9 @@ impl UnaryFunc { start_time, } => { let ts = get_ts_as_millisecond(arg)?; - let start_time = start_time.map(|t| t.val()).unwrap_or(0); + let start_time = start_time.map(|t| t.val()); let window_size = (window_size.to_nanosecond() / 1_000_000) as repr::Duration; // nanosecond to millisecond - let window_start = start_time + (ts - start_time) / window_size * window_size; + let window_start = get_window_start(ts, window_size, start_time); let ret = Timestamp::new_millisecond(window_start); Ok(Value::from(ret)) @@ -290,9 +304,9 @@ impl UnaryFunc { start_time, } => { let ts = get_ts_as_millisecond(arg)?; - let start_time = start_time.map(|t| t.val()).unwrap_or(0); + let start_time = start_time.map(|t| t.val()); let window_size = (window_size.to_nanosecond() / 1_000_000) as repr::Duration; // nanosecond to millisecond - let window_start = start_time + (ts - start_time) / window_size * window_size; + let window_start = get_window_start(ts, window_size, start_time); let window_end = window_start + window_size; let ret = Timestamp::new_millisecond(window_end); @@ -302,6 +316,35 @@ impl UnaryFunc { } } +fn get_window_start( + ts: repr::Timestamp, + window_size: repr::Duration, + start_time: Option, +) -> repr::Timestamp { + let start_time = start_time.unwrap_or(0); + // left close right open + if ts >= start_time { + start_time + (ts - start_time) / window_size * window_size + } else { + start_time + (ts - start_time) / window_size * window_size + - if ((start_time - ts) % window_size) != 0 { + window_size + } else { + 0 + } + } +} + +#[test] +fn test_get_window_start() { + assert_eq!(get_window_start(1, 3, None), 0); + assert_eq!(get_window_start(3, 3, None), 3); + assert_eq!(get_window_start(0, 3, None), 0); + + assert_eq!(get_window_start(-1, 3, None), -3); + assert_eq!(get_window_start(-3, 3, None), -3); +} + fn get_ts_as_millisecond(arg: Value) -> Result { let ts = if let Some(ts) = arg.as_timestamp() { ts.convert_to(TimeUnit::Millisecond) @@ -550,6 +593,27 @@ impl BinaryFunc { Ok(ret) } + pub fn is_valid_func_name(name: &str) -> bool { + matches!( + name.to_lowercase().as_str(), + "eq" | "equal" + | "not_eq" + | "not_equal" + | "lt" + | "lte" + | "gt" + | "gte" + | "add" + | "sub" + | "subtract" + | "mul" + | "multiply" + | "div" + | "divide" + | "mod" + ) + } + /// choose the appropriate specialization based on the input types /// return a specialization of the binary function and it's actual input and output type(so no null type present) /// @@ -741,6 
+805,10 @@ impl VariadicFunc { } } + pub fn is_valid_func_name(name: &str) -> bool { + matches!(name.to_lowercase().as_str(), "and" | "or") + } + /// Create a VariadicFunc from a string of the function name and given argument types(optional) pub fn from_str_and_types( name: &str, diff --git a/src/flow/src/expr/scalar.rs b/src/flow/src/expr/scalar.rs index dfd5fcd0f214..984d6f1a44a6 100644 --- a/src/flow/src/expr/scalar.rs +++ b/src/flow/src/expr/scalar.rs @@ -45,6 +45,8 @@ impl TypedExpr { impl TypedExpr { /// expand multi-value expression to multiple expressions with new indices + /// + /// Currently it just mean expand `TumbleWindow` to `TumbleWindowFloor` and `TumbleWindowCeiling` pub fn expand_multi_value( input_typ: &RelationType, exprs: &[TypedExpr], diff --git a/src/flow/src/repr/relation.rs b/src/flow/src/repr/relation.rs index 59edb31616fa..09e0b88344b7 100644 --- a/src/flow/src/repr/relation.rs +++ b/src/flow/src/repr/relation.rs @@ -262,6 +262,19 @@ impl RelationType { true } + + /// Return relation describe with column names + pub fn into_named(self, names: Vec) -> RelationDesc { + RelationDesc { typ: self, names } + } + + /// Return relation describe without column names + pub fn into_unnamed(self) -> RelationDesc { + RelationDesc { + typ: self, + names: vec![], + } + } } /// The type of a `Value` @@ -325,8 +338,8 @@ fn return_true() -> bool { /// Individual column names are optional. #[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize, Hash)] pub struct RelationDesc { - typ: RelationType, - names: Vec, + pub typ: RelationType, + pub names: Vec, } impl RelationDesc { diff --git a/src/flow/src/transform.rs b/src/flow/src/transform.rs index 9fe0b73d3642..bb28c8630b33 100644 --- a/src/flow/src/transform.rs +++ b/src/flow/src/transform.rs @@ -23,6 +23,7 @@ use literal::{from_substrait_literal, from_substrait_type}; use prost::Message; use query::parser::QueryLanguageParser; use query::plan::LogicalPlan; +use query::query_engine::DefaultSerializer; use query::QueryEngine; use session::context::QueryContext; use snafu::{OptionExt, ResultExt}; @@ -121,7 +122,7 @@ pub async fn sql_to_flow_plan( .context(ExternalSnafu)?; let LogicalPlan::DfPlan(plan) = plan; let sub_plan = DFLogicalSubstraitConvertor {} - .to_sub_plan(&plan) + .to_sub_plan(&plan, DefaultSerializer) .map_err(BoxedError::new) .context(ExternalSnafu)?; @@ -211,7 +212,7 @@ mod test { let schema = RelationType::new(vec![ColumnType::new(CDT::uint32_datatype(), false)]); tri_map.insert(Some(name.clone()), Some(1024), gid); - schemas.insert(gid, schema); + schemas.insert(gid, schema.into_unnamed()); } { @@ -225,7 +226,7 @@ mod test { ColumnType::new(CDT::uint32_datatype(), false), ColumnType::new(CDT::datetime_datatype(), false), ]); - schemas.insert(gid, schema); + schemas.insert(gid, schema.into_unnamed()); tri_map.insert(Some(name.clone()), Some(1025), gid); } @@ -294,7 +295,9 @@ mod test { let LogicalPlan::DfPlan(plan) = plan; // encode then decode so to rely on the impl of conversion from logical plan to substrait plan - let bytes = DFLogicalSubstraitConvertor {}.encode(&plan).unwrap(); + let bytes = DFLogicalSubstraitConvertor {} + .encode(&plan, DefaultSerializer) + .unwrap(); proto::Plan::decode(bytes).unwrap() } diff --git a/src/flow/src/transform/aggr.rs b/src/flow/src/transform/aggr.rs index d21df2cf6907..8b69146c153a 100644 --- a/src/flow/src/transform/aggr.rs +++ b/src/flow/src/transform/aggr.rs @@ -435,6 +435,236 @@ mod test { use crate::repr::{self, ColumnType, RelationType}; use 
crate::transform::test::{create_test_ctx, create_test_query_engine, sql_to_substrait}; + /// TODO(discord9): add more illegal sql tests + #[tokio::test] + async fn test_tumble_composite() { + let engine = create_test_query_engine(); + let sql = + "SELECT number, avg(number) FROM numbers_with_ts GROUP BY tumble(ts, '1 hour'), number"; + let plan = sql_to_substrait(engine.clone(), sql).await; + + let mut ctx = create_test_ctx(); + let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan).unwrap(); + + let aggr_exprs = vec![ + AggregateExpr { + func: AggregateFunc::SumUInt32, + expr: ScalarExpr::Column(0), + distinct: false, + }, + AggregateExpr { + func: AggregateFunc::Count, + expr: ScalarExpr::Column(0), + distinct: false, + }, + ]; + let avg_expr = ScalarExpr::If { + cond: Box::new(ScalarExpr::Column(4).call_binary( + ScalarExpr::Literal(Value::from(0i64), CDT::int64_datatype()), + BinaryFunc::NotEq, + )), + then: Box::new(ScalarExpr::Column(3).call_binary( + ScalarExpr::Column(4).call_unary(UnaryFunc::Cast(CDT::uint64_datatype())), + BinaryFunc::DivUInt64, + )), + els: Box::new(ScalarExpr::Literal(Value::Null, CDT::uint64_datatype())), + }; + let expected = TypedPlan { + // TODO(discord9): mfp indirectly ref to key columns + /* + .with_key(vec![1]) + .with_time_index(Some(0)),*/ + plan: Plan::Mfp { + input: Box::new( + Plan::Reduce { + input: Box::new( + Plan::Get { + id: crate::expr::Id::Global(GlobalId::User(1)), + } + .with_types(RelationType::new(vec![ + ColumnType::new(ConcreteDataType::uint32_datatype(), false), + ColumnType::new(ConcreteDataType::datetime_datatype(), false), + ])), + ), + key_val_plan: KeyValPlan { + key_plan: MapFilterProject::new(2) + .map(vec![ + ScalarExpr::Column(1).call_unary( + UnaryFunc::TumbleWindowFloor { + window_size: Interval::from_month_day_nano( + 0, + 0, + 3_600_000_000_000, + ), + start_time: None, + }, + ), + ScalarExpr::Column(1).call_unary( + UnaryFunc::TumbleWindowCeiling { + window_size: Interval::from_month_day_nano( + 0, + 0, + 3_600_000_000_000, + ), + start_time: None, + }, + ), + ScalarExpr::Column(0), + ]) + .unwrap() + .project(vec![2, 3, 4]) + .unwrap() + .into_safe(), + val_plan: MapFilterProject::new(2) + .project(vec![0, 1]) + .unwrap() + .into_safe(), + }, + reduce_plan: ReducePlan::Accumulable(AccumulablePlan { + full_aggrs: aggr_exprs.clone(), + simple_aggrs: vec![ + AggrWithIndex::new(aggr_exprs[0].clone(), 0, 0), + AggrWithIndex::new(aggr_exprs[1].clone(), 0, 1), + ], + distinct_aggrs: vec![], + }), + } + .with_types( + RelationType::new(vec![ + // keys + ColumnType::new(CDT::datetime_datatype(), false), // window start(time index) + ColumnType::new(CDT::datetime_datatype(), false), // window end(pk) + ColumnType::new(CDT::uint32_datatype(), false), // number(pk) + // values + ColumnType::new(CDT::uint64_datatype(), true), // avg.sum(number) + ColumnType::new(CDT::int64_datatype(), true), // avg.count(number) + ]) + .with_key(vec![1, 2]) + .with_time_index(Some(0)), + ), + ), + mfp: MapFilterProject::new(5) + .map(vec![ + avg_expr, + ScalarExpr::Column(2), // number(pk) + ScalarExpr::Column(5), // avg.sum(number) + ScalarExpr::Column(0), // window start + ScalarExpr::Column(1), // window end + ]) + .unwrap() + .project(vec![6, 7, 8, 9]) + .unwrap(), + }, + typ: RelationType::new(vec![ + ColumnType::new(CDT::uint32_datatype(), false), // number + ColumnType::new(CDT::uint64_datatype(), true), // avg(number) + ColumnType::new(CDT::datetime_datatype(), false), // window start + ColumnType::new(CDT::datetime_datatype(), 
false), // window end + ]), + }; + assert_eq!(flow_plan, expected); + } + + #[tokio::test] + async fn test_tumble_parse_optional() { + let engine = create_test_query_engine(); + let sql = "SELECT sum(number) FROM numbers_with_ts GROUP BY tumble(ts, '1 hour')"; + let plan = sql_to_substrait(engine.clone(), sql).await; + + let mut ctx = create_test_ctx(); + let flow_plan = TypedPlan::from_substrait_plan(&mut ctx, &plan).unwrap(); + + let aggr_expr = AggregateExpr { + func: AggregateFunc::SumUInt32, + expr: ScalarExpr::Column(0), + distinct: false, + }; + let expected = TypedPlan { + typ: RelationType::new(vec![ + ColumnType::new(CDT::uint64_datatype(), true), // sum(number) + ColumnType::new(CDT::datetime_datatype(), false), // window start + ColumnType::new(CDT::datetime_datatype(), false), // window end + ]), + // TODO(discord9): mfp indirectly ref to key columns + /* + .with_key(vec![1]) + .with_time_index(Some(0)),*/ + plan: Plan::Mfp { + input: Box::new( + Plan::Reduce { + input: Box::new( + Plan::Get { + id: crate::expr::Id::Global(GlobalId::User(1)), + } + .with_types(RelationType::new(vec![ + ColumnType::new(ConcreteDataType::uint32_datatype(), false), + ColumnType::new(ConcreteDataType::datetime_datatype(), false), + ])), + ), + key_val_plan: KeyValPlan { + key_plan: MapFilterProject::new(2) + .map(vec![ + ScalarExpr::Column(1).call_unary( + UnaryFunc::TumbleWindowFloor { + window_size: Interval::from_month_day_nano( + 0, + 0, + 3_600_000_000_000, + ), + start_time: None, + }, + ), + ScalarExpr::Column(1).call_unary( + UnaryFunc::TumbleWindowCeiling { + window_size: Interval::from_month_day_nano( + 0, + 0, + 3_600_000_000_000, + ), + start_time: None, + }, + ), + ]) + .unwrap() + .project(vec![2, 3]) + .unwrap() + .into_safe(), + val_plan: MapFilterProject::new(2) + .project(vec![0, 1]) + .unwrap() + .into_safe(), + }, + reduce_plan: ReducePlan::Accumulable(AccumulablePlan { + full_aggrs: vec![aggr_expr.clone()], + simple_aggrs: vec![AggrWithIndex::new(aggr_expr.clone(), 0, 0)], + distinct_aggrs: vec![], + }), + } + .with_types( + RelationType::new(vec![ + ColumnType::new(CDT::datetime_datatype(), false), // window start + ColumnType::new(CDT::datetime_datatype(), false), // window end + ColumnType::new(CDT::uint64_datatype(), true), //sum(number) + ]) + .with_key(vec![1]) + .with_time_index(Some(0)), + ), + ), + mfp: MapFilterProject::new(3) + .map(vec![ + ScalarExpr::Column(2), + ScalarExpr::Column(3), + ScalarExpr::Column(0), + ScalarExpr::Column(1), + ]) + .unwrap() + .project(vec![4, 5, 6]) + .unwrap(), + }, + }; + assert_eq!(flow_plan, expected); + } + #[tokio::test] async fn test_tumble_parse() { let engine = create_test_query_engine(); diff --git a/src/flow/src/transform/expr.rs b/src/flow/src/transform/expr.rs index 7e0dc2df3b62..74fc7ef61753 100644 --- a/src/flow/src/transform/expr.rs +++ b/src/flow/src/transform/expr.rs @@ -101,8 +101,7 @@ impl TypedExpr { .unzip(); match arg_len { - // because variadic function can also have 1 arguments, we need to check if it's a variadic function first - 1 if VariadicFunc::from_str_and_types(fn_name, &arg_types).is_err() => { + 1 if UnaryFunc::is_valid_func_name(fn_name) => { let func = UnaryFunc::from_str_and_type(fn_name, None)?; let arg = arg_exprs[0].clone(); let ret_type = ColumnType::new_nullable(func.signature().output.clone()); @@ -124,8 +123,7 @@ impl TypedExpr { Ok(TypedExpr::new(arg.call_unary(func), ret_type)) } - // because variadic function can also have 2 arguments, we need to check if it's a variadic function first - 2 
if VariadicFunc::from_str_and_types(fn_name, &arg_types).is_err() => { + 2 if BinaryFunc::is_valid_func_name(fn_name) => { let (func, signature) = BinaryFunc::from_str_expr_and_type(fn_name, &arg_exprs, &arg_types[0..2])?; @@ -167,7 +165,8 @@ impl TypedExpr { Ok(TypedExpr::new(ret_expr, ret_type)) } _var => { - if let Ok(func) = VariadicFunc::from_str_and_types(fn_name, &arg_types) { + if VariadicFunc::is_valid_func_name(fn_name) { + let func = VariadicFunc::from_str_and_types(fn_name, &arg_types)?; let ret_type = ColumnType::new_nullable(func.signature().output.clone()); let mut expr = ScalarExpr::CallVariadic { func, @@ -175,9 +174,8 @@ impl TypedExpr { }; expr.optimize(); Ok(TypedExpr::new(expr, ret_type)) - } else if let Ok(func) = - UnmaterializableFunc::from_str_args(fn_name, arg_typed_exprs) - { + } else if UnmaterializableFunc::is_valid_func_name(fn_name) { + let func = UnmaterializableFunc::from_str_args(fn_name, arg_typed_exprs)?; let ret_type = ColumnType::new_nullable(func.signature().output.clone()); Ok(TypedExpr::new( ScalarExpr::CallUnmaterializable(func), @@ -324,8 +322,12 @@ impl TypedExpr { #[cfg(test)] mod test { + use std::collections::HashMap; + + use common_time::{DateTime, Interval}; use datatypes::prelude::ConcreteDataType; use datatypes::value::Value; + use pretty_assertions::assert_eq; use super::*; use crate::expr::{GlobalId, MapFilterProject}; @@ -510,4 +512,162 @@ mod test { assert_eq!(flow_plan.unwrap(), expected); } + + #[test] + fn test_func_sig() { + fn lit(v: impl ToString) -> substrait_proto::proto::FunctionArgument { + use substrait_proto::proto::expression; + let expr = Expression { + rex_type: Some(expression::RexType::Literal(expression::Literal { + nullable: false, + type_variation_reference: 0, + literal_type: Some(expression::literal::LiteralType::String(v.to_string())), + })), + }; + substrait_proto::proto::FunctionArgument { + arg_type: Some(substrait_proto::proto::function_argument::ArgType::Value( + expr, + )), + } + } + fn col(i: usize) -> substrait_proto::proto::FunctionArgument { + use substrait_proto::proto::expression; + let expr = Expression { + rex_type: Some(expression::RexType::Selection(Box::new( + expression::FieldReference { + reference_type: Some( + expression::field_reference::ReferenceType::DirectReference( + expression::ReferenceSegment { + reference_type: Some( + expression::reference_segment::ReferenceType::StructField( + Box::new(expression::reference_segment::StructField { + field: i as i32, + child: None, + }), + ), + ), + }, + ), + ), + root_type: None, + }, + ))), + }; + substrait_proto::proto::FunctionArgument { + arg_type: Some(substrait_proto::proto::function_argument::ArgType::Value( + expr, + )), + } + } + + let f = substrait_proto::proto::expression::ScalarFunction { + function_reference: 0, + arguments: vec![col(0)], + options: vec![], + output_type: None, + ..Default::default() + }; + let input_schema = RelationType::new(vec![ColumnType::new(CDT::uint32_datatype(), false)]); + let extensions = FunctionExtensions { + anchor_to_name: HashMap::from([(0, "is_null".to_string())]), + }; + let res = TypedExpr::from_substrait_scalar_func(&f, &input_schema, &extensions).unwrap(); + + assert_eq!( + res, + TypedExpr { + expr: ScalarExpr::Column(0).call_unary(UnaryFunc::IsNull), + typ: ColumnType { + scalar_type: CDT::boolean_datatype(), + nullable: true, + }, + } + ); + + let f = substrait_proto::proto::expression::ScalarFunction { + function_reference: 0, + arguments: vec![col(0), col(1)], + options: vec![], + output_type: 
None, + ..Default::default() + }; + let input_schema = RelationType::new(vec![ + ColumnType::new(CDT::uint32_datatype(), false), + ColumnType::new(CDT::uint32_datatype(), false), + ]); + let extensions = FunctionExtensions { + anchor_to_name: HashMap::from([(0, "add".to_string())]), + }; + let res = TypedExpr::from_substrait_scalar_func(&f, &input_schema, &extensions).unwrap(); + + assert_eq!( + res, + TypedExpr { + expr: ScalarExpr::Column(0) + .call_binary(ScalarExpr::Column(1), BinaryFunc::AddUInt32,), + typ: ColumnType { + scalar_type: CDT::uint32_datatype(), + nullable: true, + }, + } + ); + + let f = substrait_proto::proto::expression::ScalarFunction { + function_reference: 0, + arguments: vec![col(0), lit("1 second"), lit("2021-07-01 00:00:00")], + options: vec![], + output_type: None, + ..Default::default() + }; + let input_schema = RelationType::new(vec![ + ColumnType::new(CDT::timestamp_nanosecond_datatype(), false), + ColumnType::new(CDT::string_datatype(), false), + ]); + let extensions = FunctionExtensions { + anchor_to_name: HashMap::from([(0, "tumble".to_string())]), + }; + let res = TypedExpr::from_substrait_scalar_func(&f, &input_schema, &extensions).unwrap(); + + assert_eq!( + res, + ScalarExpr::CallUnmaterializable(UnmaterializableFunc::TumbleWindow { + ts: Box::new( + ScalarExpr::Column(0) + .with_type(ColumnType::new(CDT::timestamp_nanosecond_datatype(), false)) + ), + window_size: Interval::from_month_day_nano(0, 0, 1_000_000_000), + start_time: Some(DateTime::new(1625097600000)) + }) + .with_type(ColumnType::new(CDT::timestamp_millisecond_datatype(), true)), + ); + + let f = substrait_proto::proto::expression::ScalarFunction { + function_reference: 0, + arguments: vec![col(0), lit("1 second")], + options: vec![], + output_type: None, + ..Default::default() + }; + let input_schema = RelationType::new(vec![ + ColumnType::new(CDT::timestamp_nanosecond_datatype(), false), + ColumnType::new(CDT::string_datatype(), false), + ]); + let extensions = FunctionExtensions { + anchor_to_name: HashMap::from([(0, "tumble".to_string())]), + }; + let res = TypedExpr::from_substrait_scalar_func(&f, &input_schema, &extensions).unwrap(); + + assert_eq!( + res, + ScalarExpr::CallUnmaterializable(UnmaterializableFunc::TumbleWindow { + ts: Box::new( + ScalarExpr::Column(0) + .with_type(ColumnType::new(CDT::timestamp_nanosecond_datatype(), false)) + ), + window_size: Interval::from_month_day_nano(0, 0, 1_000_000_000), + start_time: None + }) + .with_type(ColumnType::new(CDT::timestamp_millisecond_datatype(), true)), + ) + } } diff --git a/src/flow/src/transform/plan.rs b/src/flow/src/transform/plan.rs index 0dedc9e5356b..337eba7eef45 100644 --- a/src/flow/src/transform/plan.rs +++ b/src/flow/src/transform/plan.rs @@ -269,7 +269,7 @@ impl TypedPlan { id: crate::expr::Id::Global(table.0), }; let get_table = TypedPlan { - typ: table.1, + typ: table.1.typ().clone(), plan: get_table, }; diff --git a/src/flow/src/utils.rs b/src/flow/src/utils.rs index 93edf176e77a..30d48f0319d4 100644 --- a/src/flow/src/utils.rs +++ b/src/flow/src/utils.rs @@ -18,6 +18,7 @@ use std::collections::{BTreeMap, BTreeSet}; use std::ops::Bound; use std::sync::Arc; +use common_telemetry::debug; use itertools::Itertools; use serde::{Deserialize, Serialize}; use smallvec::{smallvec, SmallVec}; @@ -86,7 +87,7 @@ impl KeyExpiryManager { /// /// - If given key is expired by now (that is less than `now - expiry_duration`), return the amount of time it's expired. 
/// - If it's not expired, return None - pub fn update_event_ts( + pub fn get_expire_duration_and_update_event_ts( &mut self, now: Timestamp, row: &Row, @@ -95,6 +96,33 @@ impl KeyExpiryManager { return Ok(None); }; + self.event_ts_to_key + .entry(event_ts) + .or_default() + .insert(row.clone()); + + if let Some(expire_time) = self.compute_expiration_timestamp(now) { + if expire_time > event_ts { + // return how much time it's expired + return Ok(Some(expire_time - event_ts)); + } + } + + Ok(None) + } + + /// Get the expire duration of a key, if it's expired by now. + /// + /// Return None if the key is not expired + pub fn get_expire_duration( + &self, + now: Timestamp, + row: &Row, + ) -> Result, EvalError> { + let Some(event_ts) = self.extract_event_ts(row)? else { + return Ok(None); + }; + if let Some(expire_time) = self.compute_expiration_timestamp(now) { if expire_time > event_ts { // return how much time it's expired @@ -102,10 +130,6 @@ impl KeyExpiryManager { } } - self.event_ts_to_key - .entry(event_ts) - .or_default() - .insert(row.clone()); Ok(None) } @@ -189,6 +213,10 @@ impl Arrangement { } } + pub fn get_expire_state(&self) -> Option<&KeyExpiryManager> { + self.expire_state.as_ref() + } + pub fn set_expire_state(&mut self, expire_state: KeyExpiryManager) { self.expire_state = Some(expire_state); } @@ -208,8 +236,12 @@ impl Arrangement { for ((key, val), update_ts, diff) in updates { // check if the key is expired if let Some(s) = &mut self.expire_state { - if let Some(expired_by) = s.update_event_ts(now, &key)? { + if let Some(expired_by) = s.get_expire_duration_and_update_event_ts(now, &key)? { max_expired_by = max_expired_by.max(Some(expired_by)); + debug!( + "Expired key: {:?}, expired by: {:?} with time being now={}", + key, expired_by, now + ); continue; } } @@ -335,7 +367,9 @@ impl Arrangement { for (key, updates) in batch { // check if the key is expired if let Some(s) = &mut self.expire_state { - if let Some(expired_by) = s.update_event_ts(now, &key)? { + if let Some(expired_by) = + s.get_expire_duration_and_update_event_ts(now, &key)? 
+ { max_expired_by = max_expired_by.max(Some(expired_by)); continue; } @@ -540,6 +574,10 @@ impl ArrangeHandler { pub fn set_full_arrangement(&self, full: bool) { self.write().full_arrangement = full; } + + pub fn is_full_arrangement(&self) -> bool { + self.read().full_arrangement + } } #[cfg(test)] diff --git a/src/frontend/src/instance/grpc.rs b/src/frontend/src/instance/grpc.rs index 7be2c09ec1d4..6597e049aa3b 100644 --- a/src/frontend/src/instance/grpc.rs +++ b/src/frontend/src/instance/grpc.rs @@ -18,7 +18,6 @@ use api::v1::query_request::Query; use api::v1::{DeleteRequests, DropFlowExpr, InsertRequests, RowDeleteRequests, RowInsertRequests}; use async_trait::async_trait; use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq}; -use common_meta::table_name::TableName; use common_query::Output; use common_telemetry::tracing; use query::parser::PromQuery; @@ -27,6 +26,7 @@ use servers::query_handler::grpc::GrpcQueryHandler; use servers::query_handler::sql::SqlQueryHandler; use session::context::QueryContextRef; use snafu::{ensure, OptionExt, ResultExt}; +use table::table_name::TableName; use crate::error::{ Error, IncompleteGrpcRequestSnafu, NotSupportedSnafu, PermissionSnafu, Result, diff --git a/src/frontend/src/script.rs b/src/frontend/src/script.rs index 68a7d780ae9e..5cda392d112f 100644 --- a/src/frontend/src/script.rs +++ b/src/frontend/src/script.rs @@ -72,12 +72,12 @@ mod python { use arc_swap::ArcSwap; use catalog::RegisterSystemTableRequest; use common_error::ext::BoxedError; - use common_meta::table_name::TableName; use common_telemetry::{error, info}; use script::manager::ScriptManager; use servers::query_handler::grpc::GrpcQueryHandler; use session::context::QueryContext; use snafu::{OptionExt, ResultExt}; + use table::table_name::TableName; use super::*; use crate::error::{CatalogSnafu, TableNotFoundSnafu}; diff --git a/src/log-store/src/error.rs b/src/log-store/src/error.rs index 45449c9d65e8..280ce6410609 100644 --- a/src/log-store/src/error.rs +++ b/src/log-store/src/error.rs @@ -21,12 +21,18 @@ use serde_json::error::Error as JsonError; use snafu::{Location, Snafu}; use store_api::storage::RegionId; -use crate::kafka::NamespaceImpl as KafkaNamespace; - #[derive(Snafu)] #[snafu(visibility(pub))] #[stack_trace_debug] pub enum Error { + #[snafu(display("Invalid provider type, expected: {}, actual: {}", expected, actual))] + InvalidProvider { + #[snafu(implicit)] + location: Location, + expected: String, + actual: String, + }, + #[snafu(display("Failed to start log store gc task"))] StartGcTask { #[snafu(implicit)] @@ -170,34 +176,28 @@ pub enum Error { location: Location, }, - #[snafu(display( - "Failed to produce records to Kafka, topic: {}, size: {}, limit: {}", - topic, - size, - limit, - ))] + #[snafu(display("Failed to produce records to Kafka, topic: {}, size: {}", topic, size))] ProduceRecord { topic: String, size: usize, - limit: usize, #[snafu(implicit)] location: Location, #[snafu(source)] error: rskafka::client::producer::Error, }, - #[snafu(display("Failed to read a record from Kafka, ns: {}", ns))] + #[snafu(display("Failed to read a record from Kafka, topic: {}", topic))] ConsumeRecord { - ns: KafkaNamespace, + topic: String, #[snafu(implicit)] location: Location, #[snafu(source)] error: rskafka::client::error::Error, }, - #[snafu(display("Failed to get the latest offset, ns: {}", ns))] + #[snafu(display("Failed to get the latest offset, topic: {}", topic))] GetOffset { - ns: KafkaNamespace, + topic: String, #[snafu(implicit)] location: Location, 
#[snafu(source)] diff --git a/src/log-store/src/kafka.rs b/src/log-store/src/kafka.rs index dc068f3b4b52..415cc53ddbce 100644 --- a/src/log-store/src/kafka.rs +++ b/src/log-store/src/kafka.rs @@ -12,17 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::mem::size_of; pub(crate) mod client_manager; pub mod log_store; pub(crate) mod util; -use std::fmt::Display; - use serde::{Deserialize, Serialize}; -use store_api::logstore::entry::{Entry, Id as EntryId, RawEntry}; -use store_api::logstore::namespace::Namespace; -use store_api::storage::RegionId; +use store_api::logstore::entry::Id as EntryId; /// Kafka Namespace implementation. #[derive(Debug, PartialEq, Eq, Hash, Clone, Serialize, Deserialize)] @@ -31,18 +26,6 @@ pub struct NamespaceImpl { pub topic: String, } -impl Namespace for NamespaceImpl { - fn id(&self) -> u64 { - self.region_id - } -} - -impl Display for NamespaceImpl { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "[topic: {}, region: {}]", self.topic, self.region_id) - } -} - /// Kafka Entry implementation. #[derive(Debug, PartialEq, Clone)] pub struct EntryImpl { @@ -53,65 +36,3 @@ pub struct EntryImpl { /// The namespace used to identify and isolate log entries from different regions. pub ns: NamespaceImpl, } - -impl Entry for EntryImpl { - fn into_raw_entry(self) -> RawEntry { - RawEntry { - region_id: self.region_id(), - entry_id: self.id(), - data: self.data, - } - } - - fn data(&self) -> &[u8] { - &self.data - } - - fn id(&self) -> EntryId { - self.id - } - - fn region_id(&self) -> RegionId { - RegionId::from_u64(self.ns.region_id) - } - - fn estimated_size(&self) -> usize { - size_of::() + self.data.capacity() * size_of::() + self.ns.topic.capacity() - } -} - -impl Display for EntryImpl { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Entry [ns: {}, id: {}, data_len: {}]", - self.ns, - self.id, - self.data.len() - ) - } -} - -#[cfg(test)] -mod tests { - use std::mem::size_of; - - use store_api::logstore::entry::Entry; - - use crate::kafka::{EntryImpl, NamespaceImpl}; - - #[test] - fn test_estimated_size() { - let entry = EntryImpl { - data: Vec::with_capacity(100), - id: 0, - ns: NamespaceImpl { - region_id: 0, - topic: String::with_capacity(10), - }, - }; - let expected = size_of::() + 100 * size_of::() + 10; - let got = entry.estimated_size(); - assert_eq!(expected, got); - } -} diff --git a/src/log-store/src/kafka/client_manager.rs b/src/log-store/src/kafka/client_manager.rs index 1708efed1d09..81feaddb6627 100644 --- a/src/log-store/src/kafka/client_manager.rs +++ b/src/log-store/src/kafka/client_manager.rs @@ -27,6 +27,7 @@ use tokio::sync::RwLock; use crate::error::{ BuildClientSnafu, BuildPartitionClientSnafu, ResolveKafkaEndpointSnafu, Result, }; +use crate::kafka::util::record::MIN_BATCH_SIZE; // Each topic only has one partition for now. // The `DEFAULT_PARTITION` refers to the index of the partition. @@ -48,7 +49,8 @@ pub(crate) struct Client { impl Client { /// Creates a Client from the raw client. 
pub(crate) fn new(raw_client: Arc, config: &DatanodeKafkaConfig) -> Self { - let record_aggregator = RecordAggregator::new(config.max_batch_size.as_bytes() as usize); + let record_aggregator = + RecordAggregator::new((config.max_batch_size.as_bytes() as usize).max(MIN_BATCH_SIZE)); let batch_producer = BatchProducerBuilder::new(raw_client.clone()) .with_compression(config.compression) .with_linger(config.linger) diff --git a/src/log-store/src/kafka/log_store.rs b/src/log-store/src/kafka/log_store.rs index 1a0f96b6587b..ceca6fc30bd7 100644 --- a/src/log-store/src/kafka/log_store.rs +++ b/src/log-store/src/kafka/log_store.rs @@ -17,21 +17,23 @@ use std::sync::Arc; use common_telemetry::{debug, warn}; use common_wal::config::kafka::DatanodeKafkaConfig; -use common_wal::options::WalOptions; use futures_util::StreamExt; use rskafka::client::consumer::{StartOffset, StreamConsumerBuilder}; use rskafka::client::partition::OffsetAt; -use snafu::ResultExt; -use store_api::logstore::entry::{Entry as EntryTrait, Id as EntryId}; -use store_api::logstore::entry_stream::SendableEntryStream; -use store_api::logstore::namespace::Id as NamespaceId; -use store_api::logstore::{AppendBatchResponse, AppendResponse, LogStore}; - -use crate::error::{ConsumeRecordSnafu, Error, GetOffsetSnafu, IllegalSequenceSnafu, Result}; +use snafu::{OptionExt, ResultExt}; +use store_api::logstore::entry::{ + Entry, Id as EntryId, MultiplePartEntry, MultiplePartHeader, NaiveEntry, +}; +use store_api::logstore::provider::{KafkaProvider, Provider}; +use store_api::logstore::{AppendBatchResponse, LogStore, SendableEntryStream}; +use store_api::storage::RegionId; + +use crate::error::{self, ConsumeRecordSnafu, Error, GetOffsetSnafu, InvalidProviderSnafu, Result}; use crate::kafka::client_manager::{ClientManager, ClientManagerRef}; use crate::kafka::util::offset::Offset; -use crate::kafka::util::record::{maybe_emit_entry, Record, RecordProducer}; -use crate::kafka::{EntryImpl, NamespaceImpl}; +use crate::kafka::util::record::{ + maybe_emit_entry, remaining_entries, Record, RecordProducer, ESTIMATED_META_SIZE, +}; use crate::metrics; /// A log store backed by Kafka. @@ -52,41 +54,81 @@ impl KafkaLogStore { } } +fn build_entry( + data: &mut Vec, + entry_id: EntryId, + region_id: RegionId, + provider: &Provider, + max_data_size: usize, +) -> Entry { + if data.len() <= max_data_size { + Entry::Naive(NaiveEntry { + provider: provider.clone(), + region_id, + entry_id, + data: std::mem::take(data), + }) + } else { + let parts = std::mem::take(data) + .chunks(max_data_size) + .map(|s| s.into()) + .collect::>(); + let num_parts = parts.len(); + + let mut headers = Vec::with_capacity(num_parts); + headers.push(MultiplePartHeader::First); + headers.extend((1..num_parts - 1).map(MultiplePartHeader::Middle)); + headers.push(MultiplePartHeader::Last); + + Entry::MultiplePart(MultiplePartEntry { + provider: provider.clone(), + region_id, + entry_id, + headers, + parts, + }) + } +} + #[async_trait::async_trait] impl LogStore for KafkaLogStore { type Error = Error; - type Entry = EntryImpl; - type Namespace = NamespaceImpl; - - /// Creates an entry of the associated Entry type. - fn entry(&self, data: &mut Vec, entry_id: EntryId, ns: Self::Namespace) -> Self::Entry { - EntryImpl { - data: std::mem::take(data), - id: entry_id, - ns, - } - } - /// Appends an entry to the log store and returns a response containing the entry id of the appended entry. 
- async fn append(&self, entry: Self::Entry) -> Result { - let entry_id = RecordProducer::new(entry.ns.clone()) - .with_entries(vec![entry]) - .produce(&self.client_manager) - .await - .map(TryInto::try_into)??; - Ok(AppendResponse { - last_entry_id: entry_id, - }) + /// Creates an [Entry]. + fn entry( + &self, + data: &mut Vec, + entry_id: EntryId, + region_id: RegionId, + provider: &Provider, + ) -> Result { + provider + .as_kafka_provider() + .with_context(|| InvalidProviderSnafu { + expected: KafkaProvider::type_name(), + actual: provider.type_name(), + })?; + + let max_data_size = + self.client_manager.config.max_batch_size.as_bytes() as usize - ESTIMATED_META_SIZE; + Ok(build_entry( + data, + entry_id, + region_id, + provider, + max_data_size, + )) } + // TODO(weny): refactor the writing. /// Appends a batch of entries and returns a response containing a map where the key is a region id /// while the value is the id of the last successfully written entry of the region. - async fn append_batch(&self, entries: Vec) -> Result { + async fn append_batch(&self, entries: Vec) -> Result { metrics::METRIC_KAFKA_APPEND_BATCH_CALLS_TOTAL.inc(); metrics::METRIC_KAFKA_APPEND_BATCH_BYTES_TOTAL.inc_by( entries .iter() - .map(EntryTrait::estimated_size) + .map(|entry| entry.estimated_size()) .sum::() as u64, ); let _timer = metrics::METRIC_KAFKA_APPEND_BATCH_ELAPSED.start_timer(); @@ -98,9 +140,17 @@ impl LogStore for KafkaLogStore { // Groups entries by region id and pushes them to an associated record producer. let mut producers = HashMap::with_capacity(entries.len()); for entry in entries { + let provider = entry + .provider() + .as_kafka_provider() + .context(error::InvalidProviderSnafu { + expected: KafkaProvider::type_name(), + actual: entry.provider().type_name(), + })? + .clone(); producers - .entry(entry.ns.region_id) - .or_insert_with(|| RecordProducer::new(entry.ns.clone())) + .entry(entry.region_id()) + .or_insert_with(|| RecordProducer::new(provider)) .push(entry); } @@ -122,20 +172,27 @@ impl LogStore for KafkaLogStore { Ok(AppendBatchResponse { last_entry_ids }) } - /// Creates a new `EntryStream` to asynchronously generates `Entry` with entry ids - /// starting from `entry_id`. The generated entries will be filtered by the namespace. + /// Creates a new `EntryStream` to asynchronously generates `Entry` with entry ids. + /// Returns entries belonging to `provider`, starting from `entry_id`. async fn read( &self, - ns: &Self::Namespace, + provider: &Provider, entry_id: EntryId, - ) -> Result> { + ) -> Result> { + let provider = provider + .as_kafka_provider() + .with_context(|| InvalidProviderSnafu { + expected: KafkaProvider::type_name(), + actual: provider.type_name(), + })?; + metrics::METRIC_KAFKA_READ_CALLS_TOTAL.inc(); let _timer = metrics::METRIC_KAFKA_READ_ELAPSED.start_timer(); // Gets the client associated with the topic. let client = self .client_manager - .get_or_insert(&ns.topic) + .get_or_insert(&provider.topic) .await? .raw_client .clone(); @@ -147,14 +204,16 @@ impl LogStore for KafkaLogStore { let end_offset = client .get_offset(OffsetAt::Latest) .await - .context(GetOffsetSnafu { ns: ns.clone() })? + .context(GetOffsetSnafu { + topic: &provider.topic, + })? - 1; // Reads entries with offsets in the range [start_offset, end_offset]. let start_offset = Offset::try_from(entry_id)?.0; debug!( "Start reading entries in range [{}, {}] for ns {}", - start_offset, end_offset, ns + start_offset, end_offset, provider ); // Abort if there're no new entries. 
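(Editor's aside, not part of the patch.) A minimal sketch of how the size threshold used by `entry()` above plays out, mirroring the chunking arithmetic of `build_entry` in this hunk. The 32 KiB batch size and the 100 KiB payload are hypothetical values chosen for illustration.

// Sketch only: reproduces the chunk math of `build_entry`; it does not call into the module.
fn sketch_chunk_layout() {
    const ESTIMATED_META_SIZE: usize = 256; // same constant as in kafka::util::record
    let max_batch_size: usize = 32 * 1024;  // hypothetical DatanodeKafkaConfig::max_batch_size
    let max_data_size = max_batch_size - ESTIMATED_META_SIZE; // 32_512 payload bytes per record

    let payload = vec![0u8; 100 * 1024]; // hypothetical 100 KiB entry
    let parts: Vec<&[u8]> = payload.chunks(max_data_size).collect();
    // 102_400 bytes split into 32_512 + 32_512 + 32_512 + 4_864,
    // which `build_entry` labels First, Middle(1), Middle(2), Last.
    assert_eq!(parts.len(), 4);
    assert_eq!(parts.last().unwrap().len(), 4_864);
}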
@@ -162,7 +221,7 @@ impl LogStore for KafkaLogStore { if start_offset > end_offset { warn!( "No new entries for ns {} in range [{}, {}]", - ns, start_offset, end_offset + provider, start_offset, end_offset ); return Ok(futures_util::stream::empty().boxed()); } @@ -174,20 +233,20 @@ impl LogStore for KafkaLogStore { debug!( "Built a stream consumer for ns {} to consume entries in range [{}, {}]", - ns, start_offset, end_offset + provider, start_offset, end_offset ); - // Key: entry id, Value: the records associated with the entry. - let mut entry_records: HashMap<_, Vec<_>> = HashMap::new(); - let ns_clone = ns.clone(); + // A buffer is used to collect records to construct a complete entry. + let mut entry_records: HashMap> = HashMap::new(); + let provider = provider.clone(); let stream = async_stream::stream!({ while let Some(consume_result) = stream_consumer.next().await { // Each next on the stream consumer produces a `RecordAndOffset` and a high watermark offset. // The `RecordAndOffset` contains the record data and its start offset. // The high watermark offset is the offset of the last record plus one. let (record_and_offset, high_watermark) = - consume_result.with_context(|_| ConsumeRecordSnafu { - ns: ns_clone.clone(), + consume_result.context(ConsumeRecordSnafu { + topic: &provider.topic, })?; let (kafka_record, offset) = (record_and_offset.record, record_and_offset.offset); @@ -195,37 +254,35 @@ impl LogStore for KafkaLogStore { .inc_by(kafka_record.approximate_size() as u64); debug!( - "Read a record at offset {} for ns {}, high watermark: {}", - offset, ns_clone, high_watermark + "Read a record at offset {} for topic {}, high watermark: {}", + offset, provider.topic, high_watermark ); // Ignores no-op records. if kafka_record.value.is_none() { - if check_termination(offset, end_offset, &entry_records)? { + if check_termination(offset, end_offset) { + if let Some(entries) = remaining_entries(&provider, &mut entry_records) { + yield Ok(entries); + } break; } continue; } - // Filters records by namespace. let record = Record::try_from(kafka_record)?; - if record.meta.ns != ns_clone { - if check_termination(offset, end_offset, &entry_records)? { - break; - } - continue; - } - // Tries to construct an entry from records consumed so far. - if let Some(mut entry) = maybe_emit_entry(record, &mut entry_records)? { + if let Some(mut entry) = maybe_emit_entry(&provider, record, &mut entry_records)? { // We don't rely on the EntryId generated by mito2. // Instead, we use the offset return from Kafka as EntryId. // Therefore, we MUST overwrite the EntryId with RecordOffset. - entry.id = offset as u64; + entry.set_entry_id(offset as u64); yield Ok(vec![entry]); } - if check_termination(offset, end_offset, &entry_records)? { + if check_termination(offset, end_offset) { + if let Some(entries) = remaining_entries(&provider, &mut entry_records) { + yield Ok(entries); + } break; } } @@ -233,39 +290,25 @@ impl LogStore for KafkaLogStore { Ok(Box::pin(stream)) } - /// Creates a namespace of the associated Namespace type. - fn namespace(&self, ns_id: NamespaceId, wal_options: &WalOptions) -> Self::Namespace { - // Safety: upon start, the datanode checks the consistency of the wal providers in the wal config of the - // datanode and that of the metasrv. Therefore, the wal options passed into the kafka log store - // must be of type WalOptions::Kafka. 
- let WalOptions::Kafka(kafka_options) = wal_options else { - unreachable!() - }; - NamespaceImpl { - region_id: ns_id, - topic: kafka_options.topic.clone(), - } - } - /// Creates a new `Namespace` from the given ref. - async fn create_namespace(&self, _ns: &Self::Namespace) -> Result<()> { + async fn create_namespace(&self, _provider: &Provider) -> Result<()> { Ok(()) } /// Deletes an existing `Namespace` specified by the given ref. - async fn delete_namespace(&self, _ns: &Self::Namespace) -> Result<()> { + async fn delete_namespace(&self, _provider: &Provider) -> Result<()> { Ok(()) } /// Lists all existing namespaces. - async fn list_namespaces(&self) -> Result> { + async fn list_namespaces(&self) -> Result> { Ok(vec![]) } /// Marks all entries with ids `<=entry_id` of the given `namespace` as obsolete, /// so that the log store can safely delete those entries. This method does not guarantee /// that the obsolete entries are deleted immediately. - async fn obsolete(&self, _ns: Self::Namespace, _entry_id: EntryId) -> Result<()> { + async fn obsolete(&self, _provider: &Provider, _entry_id: EntryId) -> Result<()> { Ok(()) } @@ -275,227 +318,249 @@ impl LogStore for KafkaLogStore { } } -fn check_termination( - offset: i64, - end_offset: i64, - entry_records: &HashMap>, -) -> Result { +fn check_termination(offset: i64, end_offset: i64) -> bool { // Terminates the stream if the entry with the end offset was read. if offset >= end_offset { debug!("Stream consumer terminates at offset {}", offset); // There must have no records when the stream terminates. - if !entry_records.is_empty() { - return IllegalSequenceSnafu { - error: "Found records leftover", - } - .fail(); - } - Ok(true) + true } else { - Ok(false) + false } } #[cfg(test)] mod tests { + + use std::assert_matches::assert_matches; + use std::collections::HashMap; + use common_base::readable_size::ReadableSize; - use rand::seq::IteratorRandom; - - use super::*; - use crate::test_util::kafka::{ - create_topics, entries_with_random_data, new_namespace, EntryBuilder, - }; - - // Stores test context for a region. - struct RegionContext { - ns: NamespaceImpl, - entry_builder: EntryBuilder, - expected: Vec, - flushed_entry_id: EntryId, + use common_telemetry::info; + use common_telemetry::tracing::warn; + use common_wal::config::kafka::DatanodeKafkaConfig; + use futures::TryStreamExt; + use rand::prelude::SliceRandom; + use rand::Rng; + use store_api::logstore::entry::{Entry, MultiplePartEntry, MultiplePartHeader, NaiveEntry}; + use store_api::logstore::provider::Provider; + use store_api::logstore::LogStore; + use store_api::storage::RegionId; + + use super::build_entry; + use crate::kafka::log_store::KafkaLogStore; + + #[test] + fn test_build_naive_entry() { + let provider = Provider::kafka_provider("my_topic".to_string()); + let region_id = RegionId::new(1, 1); + let entry = build_entry(&mut vec![1; 100], 1, region_id, &provider, 120); + + assert_eq!( + entry.into_naive_entry().unwrap(), + NaiveEntry { + provider, + region_id, + entry_id: 1, + data: vec![1; 100] + } + ) } - /// Prepares for a test in that a log store is constructed and a collection of topics is created. 
- async fn prepare( - test_name: &str, - num_topics: usize, - broker_endpoints: Vec, - ) -> (KafkaLogStore, Vec) { - let topics = create_topics( - num_topics, - |i| format!("{test_name}_{}_{}", i, uuid::Uuid::new_v4()), - &broker_endpoints, + #[test] + fn test_build_into_multiple_part_entry() { + let provider = Provider::kafka_provider("my_topic".to_string()); + let region_id = RegionId::new(1, 1); + let entry = build_entry(&mut vec![1; 100], 1, region_id, &provider, 50); + + assert_eq!( + entry.into_multiple_part_entry().unwrap(), + MultiplePartEntry { + provider: provider.clone(), + region_id, + entry_id: 1, + headers: vec![MultiplePartHeader::First, MultiplePartHeader::Last], + parts: vec![vec![1; 50], vec![1; 50]], + } + ); + + let region_id = RegionId::new(1, 1); + let entry = build_entry(&mut vec![1; 100], 1, region_id, &provider, 21); + + assert_eq!( + entry.into_multiple_part_entry().unwrap(), + MultiplePartEntry { + provider, + region_id, + entry_id: 1, + headers: vec![ + MultiplePartHeader::First, + MultiplePartHeader::Middle(1), + MultiplePartHeader::Middle(2), + MultiplePartHeader::Middle(3), + MultiplePartHeader::Last + ], + parts: vec![ + vec![1; 21], + vec![1; 21], + vec![1; 21], + vec![1; 21], + vec![1; 16] + ], + } ) - .await; + } + + fn generate_entries( + logstore: &KafkaLogStore, + provider: &Provider, + num_entries: usize, + region_id: RegionId, + data_len: usize, + ) -> Vec { + (0..num_entries) + .map(|_| { + let mut data: Vec = (0..data_len).map(|_| rand::random::()).collect(); + // Always set `entry_id` to 0, the real entry_id will be set during the read. + logstore.entry(&mut data, 0, region_id, provider).unwrap() + }) + .collect() + } + #[tokio::test] + async fn test_append_batch_basic() { + common_telemetry::init_default_ut_logging(); + let Ok(broker_endpoints) = std::env::var("GT_KAFKA_ENDPOINTS") else { + warn!("The endpoints is empty, skipping the test 'test_append_batch_basic'"); + return; + }; + let broker_endpoints = broker_endpoints + .split(',') + .map(|s| s.trim().to_string()) + .collect::>(); let config = DatanodeKafkaConfig { broker_endpoints, max_batch_size: ReadableSize::kb(32), ..Default::default() }; let logstore = KafkaLogStore::try_new(&config).await.unwrap(); + let topic_name = uuid::Uuid::new_v4().to_string(); + let provider = Provider::kafka_provider(topic_name); + let region_entries = (0..5) + .map(|i| { + let region_id = RegionId::new(1, i); + ( + region_id, + generate_entries(&logstore, &provider, 20, region_id, 1024), + ) + }) + .collect::>>(); - // Appends a no-op record to each topic. - for topic in topics.iter() { - let last_entry_id = logstore - .append(EntryImpl { - data: vec![], - id: 0, - ns: new_namespace(topic, 0), - }) - .await - .unwrap() - .last_entry_id; - assert_eq!(last_entry_id, 0); - } - - (logstore, topics) - } - - /// Creates a vector containing indexes of all regions if the `all` is true. - /// Otherwise, creates a subset of the indexes. The cardinality of the subset - /// is nearly a quarter of that of the universe set. - fn all_or_subset(all: bool, num_regions: usize) -> Vec { - assert!(num_regions > 0); - let amount = if all { - num_regions - } else { - (num_regions / 4).max(1) - }; - (0..num_regions as u64).choose_multiple(&mut rand::thread_rng(), amount) - } + let mut all_entries = region_entries + .values() + .flatten() + .cloned() + .collect::>(); + all_entries.shuffle(&mut rand::thread_rng()); - /// Builds entries for regions specified by `which`. Builds large entries if `large` is true. 
- /// Returns the aggregated entries. - fn build_entries( - region_contexts: &mut HashMap, - which: &[u64], - large: bool, - ) -> Vec { - let mut aggregated = Vec::with_capacity(which.len()); - for region_id in which { - let ctx = region_contexts.get_mut(region_id).unwrap(); - // Builds entries for the region. - ctx.expected = if !large { - entries_with_random_data(3, &ctx.entry_builder) - } else { - // Builds a large entry of size 256KB which is way greater than the configured `max_batch_size` which is 32KB. - let large_entry = ctx.entry_builder.with_data([b'1'; 256 * 1024]); - vec![large_entry] - }; - // Aggregates entries of all regions. - aggregated.push(ctx.expected.clone()); + let response = logstore.append_batch(all_entries.clone()).await.unwrap(); + // 5 region + assert_eq!(response.last_entry_ids.len(), 5); + let got_entries = logstore + .read(&provider, 0) + .await + .unwrap() + .try_collect::>() + .await + .unwrap() + .into_iter() + .flatten() + .collect::>(); + for (region_id, _) in region_entries { + let expected_entries = all_entries + .iter() + .filter(|entry| entry.region_id() == region_id) + .cloned() + .collect::>(); + let mut actual_entries = got_entries + .iter() + .filter(|entry| entry.region_id() == region_id) + .cloned() + .collect::>(); + actual_entries + .iter_mut() + .for_each(|entry| entry.set_entry_id(0)); + assert_eq!(expected_entries, actual_entries); } - aggregated.into_iter().flatten().collect() } - /// Starts a test with: - /// * `test_name` - The name of the test. - /// * `num_topics` - Number of topics to be created in the preparation phase. - /// * `num_regions` - Number of regions involved in the test. - /// * `num_appends` - Number of append operations to be performed. - /// * `all` - All regions will be involved in an append operation if `all` is true. Otherwise, - /// an append operation will only randomly choose a subset of regions. - /// * `large` - Builds large entries for each region is `large` is true. 
- async fn test_with( - test_name: &str, - num_topics: usize, - num_regions: usize, - num_appends: usize, - all: bool, - large: bool, - ) { + #[tokio::test] + async fn test_append_batch_basic_large() { + common_telemetry::init_default_ut_logging(); let Ok(broker_endpoints) = std::env::var("GT_KAFKA_ENDPOINTS") else { - warn!("The endpoints is empty, skipping the test {test_name}"); + warn!("The endpoints is empty, skipping the test 'test_append_batch_basic_large'"); return; }; + let data_size_kb = rand::thread_rng().gen_range(9..31usize); + info!("Entry size: {}Ki", data_size_kb); let broker_endpoints = broker_endpoints .split(',') .map(|s| s.trim().to_string()) .collect::>(); - - let (logstore, topics) = prepare(test_name, num_topics, broker_endpoints).await; - let mut region_contexts = (0..num_regions) + let config = DatanodeKafkaConfig { + broker_endpoints, + max_batch_size: ReadableSize::kb(8), + ..Default::default() + }; + let logstore = KafkaLogStore::try_new(&config).await.unwrap(); + let topic_name = uuid::Uuid::new_v4().to_string(); + let provider = Provider::kafka_provider(topic_name); + let region_entries = (0..5) .map(|i| { - let topic = &topics[i % topics.len()]; - let ns = new_namespace(topic, i as u64); - let entry_builder = EntryBuilder::new(ns.clone()); + let region_id = RegionId::new(1, i); ( - i as u64, - RegionContext { - ns, - entry_builder, - expected: Vec::new(), - flushed_entry_id: 0, - }, + region_id, + generate_entries(&logstore, &provider, 20, region_id, data_size_kb * 1024), ) }) - .collect(); - - for _ in 0..num_appends { - // Appends entries for a subset of regions. - let which = all_or_subset(all, num_regions); - let entries = build_entries(&mut region_contexts, &which, large); - let last_entry_ids = logstore.append_batch(entries).await.unwrap().last_entry_ids; - - // Reads entries for regions and checks for each region that the gotten entries are identical with the expected ones. - for region_id in which { - let ctx = region_contexts.get_mut(®ion_id).unwrap(); - let stream = logstore - .read(&ctx.ns, ctx.flushed_entry_id + 1) - .await - .unwrap(); - let mut got = stream - .collect::>() - .await - .into_iter() - .flat_map(|x| x.unwrap()) - .collect::>(); - //FIXME(weny): https://github.com/GreptimeTeam/greptimedb/issues/3152 - ctx.expected.iter_mut().for_each(|entry| entry.id = 0); - got.iter_mut().for_each(|entry| entry.id = 0); - assert_eq!(ctx.expected, got); - } + .collect::>>(); - // Simulates a flush for regions. 
- for (region_id, last_entry_id) in last_entry_ids { - let ctx = region_contexts.get_mut(®ion_id).unwrap(); - ctx.flushed_entry_id = last_entry_id; - } + let mut all_entries = region_entries + .values() + .flatten() + .cloned() + .collect::>(); + assert_matches!(all_entries[0], Entry::MultiplePart(_)); + all_entries.shuffle(&mut rand::thread_rng()); + + let response = logstore.append_batch(all_entries.clone()).await.unwrap(); + // 5 region + assert_eq!(response.last_entry_ids.len(), 5); + let got_entries = logstore + .read(&provider, 0) + .await + .unwrap() + .try_collect::>() + .await + .unwrap() + .into_iter() + .flatten() + .collect::>(); + for (region_id, _) in region_entries { + let expected_entries = all_entries + .iter() + .filter(|entry| entry.region_id() == region_id) + .cloned() + .collect::>(); + let mut actual_entries = got_entries + .iter() + .filter(|entry| entry.region_id() == region_id) + .cloned() + .collect::>(); + actual_entries + .iter_mut() + .for_each(|entry| entry.set_entry_id(0)); + assert_eq!(expected_entries, actual_entries); } } - - /// Appends entries for one region and checks all entries can be read successfully. - #[tokio::test] - async fn test_one_region() { - test_with("test_one_region", 1, 1, 1, true, false).await; - } - - /// Appends entries for multiple regions and checks entries for each region can be read successfully. - /// A topic is assigned only a single region. - #[tokio::test] - async fn test_multi_regions_disjoint() { - test_with("test_multi_regions_disjoint", 5, 5, 1, true, false).await; - } - - /// Appends entries for multiple regions and checks entries for each region can be read successfully. - /// A topic is assigned multiple regions. - #[tokio::test] - async fn test_multi_regions_overlapped() { - test_with("test_multi_regions_overlapped", 5, 20, 1, true, false).await; - } - - /// Appends entries for multiple regions and checks entries for each region can be read successfully. - /// A topic may be assigned multiple regions. The append operation repeats for a several iterations. - /// Each append operation will only append entries for a subset of randomly chosen regions. - #[tokio::test] - async fn test_multi_appends() { - test_with("test_multi_appends", 5, 20, 3, false, false).await; - } - - /// Appends large entries for multiple regions and checks entries for each region can be read successfully. - /// A topic may be assigned multiple regions. - #[tokio::test] - async fn test_append_large_entries() { - test_with("test_append_large_entries", 5, 20, 3, true, true).await; - } } diff --git a/src/log-store/src/kafka/util/record.rs b/src/log-store/src/kafka/util/record.rs index e2035318c4c7..fa6f77171645 100644 --- a/src/log-store/src/kafka/util/record.rs +++ b/src/log-store/src/kafka/util/record.rs @@ -13,10 +13,14 @@ // limitations under the License. 
use std::collections::HashMap; +use std::sync::Arc; use rskafka::record::Record as KafkaRecord; use serde::{Deserialize, Serialize}; use snafu::{ensure, OptionExt, ResultExt}; +use store_api::logstore::entry::{Entry, MultiplePartEntry, MultiplePartHeader, NaiveEntry}; +use store_api::logstore::provider::{KafkaProvider, Provider}; +use store_api::storage::RegionId; use crate::error::{ DecodeJsonSnafu, EmptyEntriesSnafu, EncodeJsonSnafu, GetClientSnafu, IllegalSequenceSnafu, @@ -24,7 +28,7 @@ use crate::error::{ }; use crate::kafka::client_manager::ClientManagerRef; use crate::kafka::util::offset::Offset; -use crate::kafka::{EntryId, EntryImpl, NamespaceImpl}; +use crate::kafka::{EntryId, NamespaceImpl}; use crate::metrics; /// The current version of Record. @@ -32,7 +36,10 @@ pub(crate) const VERSION: u32 = 0; /// The estimated size in bytes of a serialized RecordMeta. /// A record is guaranteed to have sizeof(meta) + sizeof(data) <= max_batch_size - ESTIMATED_META_SIZE. -const ESTIMATED_META_SIZE: usize = 256; +pub(crate) const ESTIMATED_META_SIZE: usize = 256; + +/// The minimum batch size +pub(crate) const MIN_BATCH_SIZE: usize = 4 * 1024; /// The type of a record. /// @@ -110,43 +117,25 @@ impl TryFrom for Record { } } -impl From> for EntryImpl { - fn from(records: Vec) -> Self { - let entry_id = records[0].meta.entry_id; - let ns = records[0].meta.ns.clone(); - let data = records.into_iter().flat_map(|record| record.data).collect(); - EntryImpl { - data, - id: entry_id, - ns, - } - } -} - /// Produces a record to a kafka topic. pub(crate) struct RecordProducer { - /// The namespace of the entries. - ns: NamespaceImpl, + /// The provide of the entries. + provider: Arc, /// Entries are buffered before being built into a record. - entries: Vec, + entries: Vec, } impl RecordProducer { /// Creates a new producer for producing entries with the given namespace. - pub(crate) fn new(ns: NamespaceImpl) -> Self { + pub(crate) fn new(provider: Arc) -> Self { Self { - ns, + provider, entries: Vec::new(), } } - /// Populates the entry buffer with the given entries. - pub(crate) fn with_entries(self, entries: Vec) -> Self { - Self { entries, ..self } - } - /// Pushes an entry into the entry buffer. - pub(crate) fn push(&mut self, entry: EntryImpl) { + pub(crate) fn push(&mut self, entry: Entry) { self.entries.push(entry); } @@ -158,11 +147,11 @@ impl RecordProducer { // Gets the producer in which a record buffer is maintained. let producer = client_manager - .get_or_insert(&self.ns.topic) + .get_or_insert(&self.provider.topic) .await .map_err(|e| { GetClientSnafu { - topic: &self.ns.topic, + topic: &self.provider.topic, error: e.to_string(), } .build() @@ -171,10 +160,8 @@ impl RecordProducer { // Stores the offset of the last successfully produced record. 
let mut last_offset = None; - let max_record_size = - client_manager.config.max_batch_size.as_bytes() as usize - ESTIMATED_META_SIZE; for entry in self.entries { - for record in build_records(entry, max_record_size) { + for record in convert_to_records(entry) { let kafka_record = KafkaRecord::try_from(record)?; metrics::METRIC_KAFKA_PRODUCE_RECORD_COUNTS.inc(); @@ -187,9 +174,8 @@ impl RecordProducer { .await .map(Offset) .with_context(|_| ProduceRecordSnafu { - topic: &self.ns.topic, + topic: &self.provider.topic, size: kafka_record.approximate_size(), - limit: max_record_size, })?; last_offset = Some(offset); } @@ -199,100 +185,188 @@ impl RecordProducer { } } -fn record_type(seq: usize, num_records: usize) -> RecordType { - if seq == 0 { - RecordType::First - } else if seq == num_records - 1 { - RecordType::Last - } else { - RecordType::Middle(seq) - } -} - -fn build_records(entry: EntryImpl, max_record_size: usize) -> Vec { - if entry.data.len() <= max_record_size { - let record = Record { +fn convert_to_records(entry: Entry) -> Vec { + match entry { + Entry::Naive(entry) => vec![Record { meta: RecordMeta { version: VERSION, tp: RecordType::Full, - entry_id: entry.id, - ns: entry.ns, + // TODO(weny): refactor the record meta. + entry_id: 0, + ns: NamespaceImpl { + region_id: entry.region_id.as_u64(), + // TODO(weny): refactor the record meta. + topic: String::new(), + }, }, data: entry.data, + }], + Entry::MultiplePart(entry) => { + let mut entries = Vec::with_capacity(entry.parts.len()); + + for (idx, part) in entry.parts.into_iter().enumerate() { + let tp = match entry.headers[idx] { + MultiplePartHeader::First => RecordType::First, + MultiplePartHeader::Middle(i) => RecordType::Middle(i), + MultiplePartHeader::Last => RecordType::Last, + }; + entries.push(Record { + meta: RecordMeta { + version: VERSION, + tp, + // TODO(weny): refactor the record meta. 
+ entry_id: 0, + ns: NamespaceImpl { + region_id: entry.region_id.as_u64(), + topic: String::new(), + }, + }, + data: part, + }) + } + entries + } + } +} + +fn convert_to_naive_entry(provider: Arc, record: Record) -> Entry { + let region_id = RegionId::from_u64(record.meta.ns.region_id); + + Entry::Naive(NaiveEntry { + provider: Provider::Kafka(provider), + region_id, + // TODO(weny): should be the offset in the topic + entry_id: record.meta.entry_id, + data: record.data, + }) +} + +fn convert_to_multiple_entry( + provider: Arc, + region_id: RegionId, + records: Vec, +) -> Entry { + let mut headers = Vec::with_capacity(records.len()); + let mut parts = Vec::with_capacity(records.len()); + + for record in records { + let header = match record.meta.tp { + RecordType::Full => unreachable!(), + RecordType::First => MultiplePartHeader::First, + RecordType::Middle(i) => MultiplePartHeader::Middle(i), + RecordType::Last => MultiplePartHeader::Last, }; - return vec![record]; + headers.push(header); + parts.push(record.data); } - let chunks = entry.data.chunks(max_record_size); - let num_chunks = chunks.len(); - chunks - .enumerate() - .map(|(i, chunk)| Record { - meta: RecordMeta { - version: VERSION, - tp: record_type(i, num_chunks), - entry_id: entry.id, - ns: entry.ns.clone(), - }, - data: chunk.to_vec(), - }) - .collect() + Entry::MultiplePart(MultiplePartEntry { + provider: Provider::Kafka(provider), + region_id, + // TODO(weny): should be the offset in the topic + entry_id: 0, + headers, + parts, + }) } -pub fn maybe_emit_entry( +/// Constructs entries from `buffered_records` +pub fn remaining_entries( + provider: &Arc, + buffered_records: &mut HashMap>, +) -> Option> { + if buffered_records.is_empty() { + None + } else { + let mut entries = Vec::with_capacity(buffered_records.len()); + for (region_id, records) in buffered_records.drain() { + entries.push(convert_to_multiple_entry( + provider.clone(), + region_id, + records, + )); + } + Some(entries) + } +} + +/// For type of [Entry::Naive] Entry: +/// - Emits a [RecordType::Full] type record immediately. +/// +/// For type of [Entry::MultiplePart] Entry: +/// - Emits a complete or incomplete [Entry] while the next same [RegionId] record arrives. +/// +/// **Incomplete Entry:** +/// If the records arrive in the following order, it emits **the incomplete [Entry]** when the next record arrives. 
+/// - **[RecordType::First], [RecordType::Middle]**, [RecordType::First] +/// - **[RecordType::Middle]**, [RecordType::First] +/// - **[RecordType::Last]** +pub(crate) fn maybe_emit_entry( + provider: &Arc, record: Record, - entry_records: &mut HashMap>, -) -> Result> { + buffered_records: &mut HashMap>, +) -> Result> { let mut entry = None; match record.meta.tp { - RecordType::Full => { - entry = Some(EntryImpl::from(vec![record])); - } + RecordType::Full => entry = Some(convert_to_naive_entry(provider.clone(), record)), RecordType::First => { - ensure!( - !entry_records.contains_key(&record.meta.entry_id), - IllegalSequenceSnafu { - error: "First record must be the first" - } - ); - entry_records.insert(record.meta.entry_id, vec![record]); + let region_id = record.meta.ns.region_id.into(); + if let Some(records) = buffered_records.insert(region_id, vec![record]) { + // Incomplete entry + entry = Some(convert_to_multiple_entry( + provider.clone(), + region_id, + records, + )) + } } RecordType::Middle(seq) => { - let prefix = - entry_records - .get_mut(&record.meta.entry_id) - .context(IllegalSequenceSnafu { - error: "Middle record must not be the first", - })?; - // Safety: the records are guaranteed not empty if the key exists. - let last_record = prefix.last().unwrap(); - let legal = match last_record.meta.tp { - // Legal if this record follows a First record. - RecordType::First => seq == 1, - // Legal if this record follows a Middle record just prior to this record. - RecordType::Middle(last_seq) => last_seq + 1 == seq, - // Illegal sequence. - _ => false, - }; - ensure!( - legal, - IllegalSequenceSnafu { - error: "Illegal prefix for a Middle record" - } - ); + let region_id = record.meta.ns.region_id.into(); + let records = buffered_records.entry(region_id).or_default(); + + // Only validate complete entries. + if !records.is_empty() { + // Safety: the records are guaranteed not empty if the key exists. + let last_record = records.last().unwrap(); + let legal = match last_record.meta.tp { + // Legal if this record follows a First record. + RecordType::First => seq == 1, + // Legal if this record follows a Middle record just prior to this record. + RecordType::Middle(last_seq) => last_seq + 1 == seq, + // Illegal sequence. + _ => false, + }; + ensure!( + legal, + IllegalSequenceSnafu { + error: format!( + "Illegal sequence of a middle record, last record: {:?}, incoming record: {:?}", + last_record.meta.tp, + record.meta.tp + ) + } + ); + } - prefix.push(record); + records.push(record); } RecordType::Last => { - // There must have a sequence prefix before a Last record is read. 
- let mut records = - entry_records - .remove(&record.meta.entry_id) - .context(IllegalSequenceSnafu { - error: "Missing prefix for a Last record", - })?; - records.push(record); - entry = Some(EntryImpl::from(records)); + let region_id = record.meta.ns.region_id.into(); + if let Some(mut records) = buffered_records.remove(®ion_id) { + records.push(record); + entry = Some(convert_to_multiple_entry( + provider.clone(), + region_id, + records, + )) + } else { + // Incomplete entry + entry = Some(convert_to_multiple_entry( + provider.clone(), + region_id, + vec![record], + )) + } } } Ok(entry) @@ -300,278 +374,141 @@ pub fn maybe_emit_entry( #[cfg(test)] mod tests { + use std::assert_matches::assert_matches; use std::sync::Arc; - use common_base::readable_size::ReadableSize; - use common_wal::config::kafka::DatanodeKafkaConfig; - use common_wal::test_util::run_test_with_kafka_wal; - use uuid::Uuid; - use super::*; - use crate::kafka::client_manager::ClientManager; - - // Implements some utility methods for testing. - impl Default for Record { - fn default() -> Self { - Self { - meta: RecordMeta { - version: VERSION, - tp: RecordType::Full, - ns: NamespaceImpl { - region_id: 0, - topic: "greptimedb_wal_topic".to_string(), - }, - entry_id: 0, - }, - data: Vec::new(), - } - } - } - - impl Record { - /// Overrides tp. - fn with_tp(&self, tp: RecordType) -> Self { - Self { - meta: RecordMeta { - tp, - ..self.meta.clone() - }, - ..self.clone() - } - } - - /// Overrides data with the given data. - fn with_data(&self, data: &[u8]) -> Self { - Self { - data: data.to_vec(), - ..self.clone() - } - } - - /// Overrides entry id. - fn with_entry_id(&self, entry_id: EntryId) -> Self { - Self { - meta: RecordMeta { - entry_id, - ..self.meta.clone() - }, - ..self.clone() - } - } - - /// Overrides namespace. - fn with_ns(&self, ns: NamespaceImpl) -> Self { - Self { - meta: RecordMeta { ns, ..self.meta }, - ..self.clone() - } - } - } - - fn new_test_entry>(data: D, entry_id: EntryId, ns: NamespaceImpl) -> EntryImpl { - EntryImpl { - data: data.as_ref().to_vec(), - id: entry_id, - ns, - } - } - - /// Tests that the `build_records` works as expected. - #[test] - fn test_build_records() { - let max_record_size = 128; - - // On a small entry. - let ns = NamespaceImpl { - region_id: 1, - topic: "greptimedb_wal_topic".to_string(), - }; - let entry = new_test_entry([b'1'; 100], 0, ns.clone()); - let records = build_records(entry.clone(), max_record_size); - assert!(records.len() == 1); - assert_eq!(entry.data, records[0].data); - - // On a large entry. - let entry = new_test_entry([b'1'; 150], 0, ns.clone()); - let records = build_records(entry.clone(), max_record_size); - assert!(records.len() == 2); - assert_eq!(&records[0].data, &[b'1'; 128]); - assert_eq!(&records[1].data, &[b'1'; 22]); - - // On a way-too large entry. - let entry = new_test_entry([b'1'; 5000], 0, ns.clone()); - let records = build_records(entry.clone(), max_record_size); - let matched = entry - .data - .chunks(max_record_size) - .enumerate() - .all(|(i, chunk)| records[i].data == chunk); - assert!(matched); - } + use crate::error; - /// Tests that Record and KafkaRecord are able to be converted back and forth. 
- #[test] - fn test_record_conversion() { - let record = Record { + fn new_test_record(tp: RecordType, entry_id: EntryId, region_id: u64, data: Vec) -> Record { + Record { meta: RecordMeta { version: VERSION, - tp: RecordType::Full, - entry_id: 1, + tp, ns: NamespaceImpl { - region_id: 1, + region_id, topic: "greptimedb_wal_topic".to_string(), }, + entry_id, }, - data: b"12345".to_vec(), - }; - let kafka_record: KafkaRecord = record.clone().try_into().unwrap(); - let got = Record::try_from(kafka_record).unwrap(); - assert_eq!(record, got); + data, + } } - /// Tests that the reconstruction of an entry works as expected. #[test] - fn test_reconstruct_entry() { - let template = Record::default(); - let records = vec![ - template.with_data(b"111").with_tp(RecordType::First), - template.with_data(b"222").with_tp(RecordType::Middle(1)), - template.with_data(b"333").with_tp(RecordType::Last), - ]; - let entry = EntryImpl::from(records.clone()); - assert_eq!(records[0].meta.entry_id, entry.id); - assert_eq!(records[0].meta.ns, entry.ns); + fn test_maybe_emit_entry_emit_naive_entry() { + let provider = Arc::new(KafkaProvider::new("my_topic".to_string())); + let region_id = RegionId::new(1, 1); + let mut buffer = HashMap::new(); + let record = new_test_record(RecordType::Full, 1, region_id.as_u64(), vec![1; 100]); + let entry = maybe_emit_entry(&provider, record, &mut buffer) + .unwrap() + .unwrap(); assert_eq!( - entry.data, - records - .into_iter() - .flat_map(|record| record.data) - .collect::>() + entry, + Entry::Naive(NaiveEntry { + provider: Provider::Kafka(provider), + region_id, + entry_id: 1, + data: vec![1; 100] + }) ); } - /// Tests that `maybe_emit_entry` works as expected. - /// This test does not check for illegal record sequences since they're already tested in the `test_check_records` test. #[test] - fn test_maybe_emit_entry() { - let ns = NamespaceImpl { - region_id: 1, - topic: "greptimedb_wal_topic".to_string(), - }; - let template = Record::default().with_ns(ns); - let mut entry_records = HashMap::from([ - ( - 1, - vec![template.with_entry_id(1).with_tp(RecordType::First)], - ), - ( - 2, - vec![template.with_entry_id(2).with_tp(RecordType::First)], - ), - ( - 3, - vec![ - template.with_entry_id(3).with_tp(RecordType::First), - template.with_entry_id(3).with_tp(RecordType::Middle(1)), - ], - ), - ]); - - // A Full record arrives. - let got = maybe_emit_entry( - template.with_entry_id(0).with_tp(RecordType::Full), - &mut entry_records, - ) - .unwrap(); - assert!(got.is_some()); - - // A First record arrives with no prefix. - let got = maybe_emit_entry( - template.with_entry_id(0).with_tp(RecordType::First), - &mut entry_records, - ) - .unwrap(); - assert!(got.is_none()); - - // A First record arrives with some prefix. - let got = maybe_emit_entry( - template.with_entry_id(1).with_tp(RecordType::First), - &mut entry_records, - ); - assert!(got.is_err()); - - // A Middle record arrives with legal prefix (First). - let got = maybe_emit_entry( - template.with_entry_id(2).with_tp(RecordType::Middle(1)), - &mut entry_records, - ) - .unwrap(); - assert!(got.is_none()); - - // A Middle record arrives with legal prefix (Middle). - let got = maybe_emit_entry( - template.with_entry_id(2).with_tp(RecordType::Middle(2)), - &mut entry_records, - ) - .unwrap(); - assert!(got.is_none()); - - // A Middle record arrives with illegal prefix. 
- let got = maybe_emit_entry( - template.with_entry_id(2).with_tp(RecordType::Middle(1)), - &mut entry_records, + fn test_maybe_emit_entry_emit_incomplete_entry() { + let provider = Arc::new(KafkaProvider::new("my_topic".to_string())); + let region_id = RegionId::new(1, 1); + // `First` overwrite `First` + let mut buffer = HashMap::new(); + let record = new_test_record(RecordType::First, 1, region_id.as_u64(), vec![1; 100]); + assert!(maybe_emit_entry(&provider, record, &mut buffer) + .unwrap() + .is_none()); + let record = new_test_record(RecordType::First, 2, region_id.as_u64(), vec![2; 100]); + let incomplete_entry = maybe_emit_entry(&provider, record, &mut buffer) + .unwrap() + .unwrap(); + + assert_eq!( + incomplete_entry, + Entry::MultiplePart(MultiplePartEntry { + provider: Provider::Kafka(provider.clone()), + region_id, + // TODO(weny): always be 0. + entry_id: 0, + headers: vec![MultiplePartHeader::First], + parts: vec![vec![1; 100]], + }) ); - assert!(got.is_err()); - // A Middle record arrives with no prefix. - let got = maybe_emit_entry( - template.with_entry_id(22).with_tp(RecordType::Middle(1)), - &mut entry_records, + // `Last` overwrite `None` + let mut buffer = HashMap::new(); + let record = new_test_record(RecordType::Last, 1, region_id.as_u64(), vec![1; 100]); + let incomplete_entry = maybe_emit_entry(&provider, record, &mut buffer) + .unwrap() + .unwrap(); + + assert_eq!( + incomplete_entry, + Entry::MultiplePart(MultiplePartEntry { + provider: Provider::Kafka(provider.clone()), + region_id, + // TODO(weny): always be 0. + entry_id: 0, + headers: vec![MultiplePartHeader::Last], + parts: vec![vec![1; 100]], + }) ); - assert!(got.is_err()); - // A Last record arrives with no prefix. - let got = maybe_emit_entry( - template.with_entry_id(33).with_tp(RecordType::Last), - &mut entry_records, + // `First` overwrite `Middle(0)` + let mut buffer = HashMap::new(); + let record = new_test_record(RecordType::Middle(0), 1, region_id.as_u64(), vec![1; 100]); + assert!(maybe_emit_entry(&provider, record, &mut buffer) + .unwrap() + .is_none()); + let record = new_test_record(RecordType::First, 2, region_id.as_u64(), vec![2; 100]); + let incomplete_entry = maybe_emit_entry(&provider, record, &mut buffer) + .unwrap() + .unwrap(); + + assert_eq!( + incomplete_entry, + Entry::MultiplePart(MultiplePartEntry { + provider: Provider::Kafka(provider), + region_id, + // TODO(weny): always be 0. + entry_id: 0, + headers: vec![MultiplePartHeader::Middle(0)], + parts: vec![vec![1; 100]], + }) ); - assert!(got.is_err()); - - // A Last record arrives with legal prefix. - let got = maybe_emit_entry( - template.with_entry_id(3).with_tp(RecordType::Last), - &mut entry_records, - ) - .unwrap(); - assert!(got.is_some()); - - // Check state. 
- assert_eq!(entry_records.len(), 3); - assert_eq!(entry_records[&0].len(), 1); - assert_eq!(entry_records[&1].len(), 1); - assert_eq!(entry_records[&2].len(), 3); } - #[tokio::test] - async fn test_produce_large_entry() { - run_test_with_kafka_wal(|broker_endpoints| { - Box::pin(async { - let topic = format!("greptimedb_wal_topic_{}", Uuid::new_v4()); - let ns = NamespaceImpl { - region_id: 1, - topic, - }; - let entry = new_test_entry([b'1'; 2000000], 0, ns.clone()); - let producer = RecordProducer::new(ns.clone()).with_entries(vec![entry]); - let config = DatanodeKafkaConfig { - broker_endpoints, - max_batch_size: ReadableSize::mb(1), - ..Default::default() - }; - let manager = Arc::new(ClientManager::try_new(&config).await.unwrap()); - producer.produce(&manager).await.unwrap(); - }) - }) - .await + #[test] + fn test_maybe_emit_entry_illegal_seq() { + let provider = Arc::new(KafkaProvider::new("my_topic".to_string())); + let region_id = RegionId::new(1, 1); + let mut buffer = HashMap::new(); + let record = new_test_record(RecordType::First, 1, region_id.as_u64(), vec![1; 100]); + assert!(maybe_emit_entry(&provider, record, &mut buffer) + .unwrap() + .is_none()); + let record = new_test_record(RecordType::Middle(2), 1, region_id.as_u64(), vec![2; 100]); + let err = maybe_emit_entry(&provider, record, &mut buffer).unwrap_err(); + assert_matches!(err, error::Error::IllegalSequence { .. }); + + let mut buffer = HashMap::new(); + let record = new_test_record(RecordType::First, 1, region_id.as_u64(), vec![1; 100]); + assert!(maybe_emit_entry(&provider, record, &mut buffer) + .unwrap() + .is_none()); + let record = new_test_record(RecordType::Middle(1), 1, region_id.as_u64(), vec![2; 100]); + assert!(maybe_emit_entry(&provider, record, &mut buffer) + .unwrap() + .is_none()); + let record = new_test_record(RecordType::Middle(3), 1, region_id.as_u64(), vec![2; 100]); + let err = maybe_emit_entry(&provider, record, &mut buffer).unwrap_err(); + assert_matches!(err, error::Error::IllegalSequence { .. }); } } diff --git a/src/log-store/src/lib.rs b/src/log-store/src/lib.rs index c035e5fcff80..a119aac390c2 100644 --- a/src/log-store/src/lib.rs +++ b/src/log-store/src/lib.rs @@ -14,12 +14,10 @@ #![feature(let_chains)] #![feature(io_error_more)] +#![feature(assert_matches)] pub mod error; pub mod kafka; pub mod metrics; -mod noop; pub mod raft_engine; pub mod test_util; - -pub use noop::NoopLogStore; diff --git a/src/log-store/src/noop.rs b/src/log-store/src/noop.rs deleted file mode 100644 index e5ed7fd66bd2..000000000000 --- a/src/log-store/src/noop.rs +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -use common_wal::options::WalOptions; -use store_api::logstore::entry::{Entry, Id as EntryId, RawEntry}; -use store_api::logstore::namespace::{Id as NamespaceId, Namespace}; -use store_api::logstore::{AppendBatchResponse, AppendResponse, LogStore}; -use store_api::storage::RegionId; - -use crate::error::{Error, Result}; - -/// A noop log store which only for test -#[derive(Debug, Default)] -pub struct NoopLogStore; - -#[derive(Debug, Default, Clone, PartialEq)] -pub struct EntryImpl; - -#[derive(Debug, Clone, Default, Eq, PartialEq, Hash)] -pub struct NamespaceImpl; - -impl Namespace for NamespaceImpl { - fn id(&self) -> NamespaceId { - 0 - } -} - -impl Entry for EntryImpl { - fn into_raw_entry(self) -> RawEntry { - RawEntry { - region_id: self.region_id(), - entry_id: self.id(), - data: vec![], - } - } - - fn data(&self) -> &[u8] { - &[] - } - - fn id(&self) -> EntryId { - 0 - } - - fn region_id(&self) -> RegionId { - RegionId::from_u64(0) - } - - fn estimated_size(&self) -> usize { - 0 - } -} - -#[async_trait::async_trait] -impl LogStore for NoopLogStore { - type Error = Error; - type Namespace = NamespaceImpl; - type Entry = EntryImpl; - - async fn stop(&self) -> Result<()> { - Ok(()) - } - - async fn append(&self, mut _e: Self::Entry) -> Result { - Ok(AppendResponse::default()) - } - - async fn append_batch(&self, _e: Vec) -> Result { - Ok(AppendBatchResponse::default()) - } - - async fn read( - &self, - _ns: &Self::Namespace, - _entry_id: EntryId, - ) -> Result> - { - Ok(Box::pin(futures::stream::once(futures::future::ready(Ok( - vec![], - ))))) - } - - async fn create_namespace(&self, _ns: &Self::Namespace) -> Result<()> { - Ok(()) - } - - async fn delete_namespace(&self, _ns: &Self::Namespace) -> Result<()> { - Ok(()) - } - - async fn list_namespaces(&self) -> Result> { - Ok(vec![]) - } - - fn entry(&self, data: &mut Vec, entry_id: EntryId, ns: Self::Namespace) -> Self::Entry { - let _ = data; - let _ = entry_id; - let _ = ns; - EntryImpl - } - - fn namespace(&self, ns_id: NamespaceId, wal_options: &WalOptions) -> Self::Namespace { - let _ = ns_id; - let _ = wal_options; - NamespaceImpl - } - - async fn obsolete( - &self, - ns: Self::Namespace, - entry_id: EntryId, - ) -> std::result::Result<(), Self::Error> { - let _ = ns; - let _ = entry_id; - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_mock_entry() { - let e = EntryImpl; - assert_eq!(0, e.data().len()); - assert_eq!(0, e.id()); - } - - #[tokio::test] - async fn test_noop_logstore() { - let store = NoopLogStore; - let e = store.entry(&mut vec![], 1, NamespaceImpl); - let _ = store.append(e.clone()).await.unwrap(); - assert!(store.append_batch(vec![e]).await.is_ok()); - store.create_namespace(&NamespaceImpl).await.unwrap(); - assert_eq!(0, store.list_namespaces().await.unwrap().len()); - store.delete_namespace(&NamespaceImpl).await.unwrap(); - assert_eq!(NamespaceImpl, store.namespace(0, &WalOptions::default())); - store.obsolete(NamespaceImpl, 1).await.unwrap(); - } -} diff --git a/src/log-store/src/raft_engine.rs b/src/log-store/src/raft_engine.rs index cdb600249caa..86a46bb1a02f 100644 --- a/src/log-store/src/raft_engine.rs +++ b/src/log-store/src/raft_engine.rs @@ -12,20 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::hash::{Hash, Hasher}; -use std::mem::size_of; - -use store_api::logstore::entry::{Entry, Id as EntryId, RawEntry}; -use store_api::logstore::namespace::{Id as NamespaceId, Namespace}; -use store_api::storage::RegionId; - -use crate::raft_engine::protos::logstore::{EntryImpl, NamespaceImpl}; +use crate::raft_engine::protos::logstore::EntryImpl; mod backend; pub mod log_store; pub use backend::RaftEngineBackend; pub use raft_engine::Config; +use store_api::logstore::entry::{Entry, NaiveEntry}; +use store_api::logstore::provider::Provider; +use store_api::storage::RegionId; pub mod protos { include!(concat!(env!("OUT_DIR"), concat!("/", "protos/", "mod.rs"))); @@ -42,65 +38,20 @@ impl EntryImpl { } } -impl NamespaceImpl { - pub fn with_id(id: NamespaceId) -> Self { - Self { +impl From for Entry { + fn from( + EntryImpl { id, - ..Default::default() - } - } -} - -#[allow(clippy::derived_hash_with_manual_eq)] -impl Hash for NamespaceImpl { - fn hash(&self, state: &mut H) { - self.id.hash(state); - } -} - -impl Eq for NamespaceImpl {} - -impl Namespace for NamespaceImpl { - fn id(&self) -> NamespaceId { - self.id - } -} - -impl Entry for EntryImpl { - fn into_raw_entry(self) -> RawEntry { - RawEntry { - region_id: self.region_id(), - entry_id: self.id(), - data: self.data, - } - } - - fn data(&self) -> &[u8] { - self.data.as_slice() - } - - fn id(&self) -> EntryId { - self.id - } - - fn region_id(&self) -> RegionId { - RegionId::from_u64(self.id) - } - - fn estimated_size(&self) -> usize { - self.data.len() + size_of::() + size_of::() - } -} - -#[cfg(test)] -mod tests { - use store_api::logstore::entry::Entry; - - use crate::raft_engine::protos::logstore::EntryImpl; - - #[test] - fn test_estimated_size() { - let entry = EntryImpl::create(1, 1, b"hello, world".to_vec()); - assert_eq!(28, entry.estimated_size()); + namespace_id, + data, + .. + }: EntryImpl, + ) -> Self { + Entry::Naive(NaiveEntry { + provider: Provider::raft_engine_provider(namespace_id), + region_id: RegionId::from_u64(namespace_id), + entry_id: id, + data, + }) } } diff --git a/src/log-store/src/raft_engine/log_store.rs b/src/log-store/src/raft_engine/log_store.rs index b2070abbf3ec..c9632e6ea341 100644 --- a/src/log-store/src/raft_engine/log_store.rs +++ b/src/log-store/src/raft_engine/log_store.rs @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
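// Usage sketch for the `From<EntryImpl>` conversion added in raft_engine.rs above
// (the values are illustrative): the raft-engine namespace id doubles as the
// region id, and the converted entry is a single-part `Entry::Naive` carrying a
// raft-engine `Provider`.
use store_api::logstore::entry::Entry;

use crate::raft_engine::protos::logstore::EntryImpl;

fn example_entry() -> Entry {
    // `EntryImpl::create(entry_id, namespace_id, data)` is the same helper the
    // updated log-store tests use; `.into()` goes through the new `From` impl.
    EntryImpl::create(7, 42, b"hello".to_vec()).into()
}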
-use std::collections::hash_map::Entry; -use std::collections::HashMap; +use std::collections::{hash_map, HashMap}; use std::fmt::{Debug, Formatter}; use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::Arc; @@ -22,22 +21,21 @@ use async_stream::stream; use common_runtime::{RepeatedTask, TaskFunction}; use common_telemetry::{error, info}; use common_wal::config::raft_engine::RaftEngineConfig; -use common_wal::options::WalOptions; use raft_engine::{Config, Engine, LogBatch, MessageExt, ReadableSize, RecoveryMode}; -use snafu::{ensure, ResultExt}; -use store_api::logstore::entry::{Entry as EntryTrait, Id as EntryId}; -use store_api::logstore::entry_stream::SendableEntryStream; -use store_api::logstore::namespace::{Id as NamespaceId, Namespace as NamespaceTrait}; -use store_api::logstore::{AppendBatchResponse, AppendResponse, LogStore}; +use snafu::{ensure, OptionExt, ResultExt}; +use store_api::logstore::entry::{Entry, Id as EntryId, NaiveEntry}; +use store_api::logstore::provider::{Provider, RaftEngineProvider}; +use store_api::logstore::{AppendBatchResponse, LogStore, SendableEntryStream}; +use store_api::storage::RegionId; use crate::error::{ AddEntryLogBatchSnafu, DiscontinuousLogIndexSnafu, Error, FetchEntrySnafu, - IllegalNamespaceSnafu, IllegalStateSnafu, OverrideCompactedEntrySnafu, RaftEngineSnafu, Result, - StartGcTaskSnafu, StopGcTaskSnafu, + IllegalNamespaceSnafu, IllegalStateSnafu, InvalidProviderSnafu, OverrideCompactedEntrySnafu, + RaftEngineSnafu, Result, StartGcTaskSnafu, StopGcTaskSnafu, }; use crate::metrics; use crate::raft_engine::backend::SYSTEM_NAMESPACE; -use crate::raft_engine::protos::logstore::{EntryImpl, NamespaceImpl as Namespace}; +use crate::raft_engine::protos::logstore::{EntryImpl, NamespaceImpl}; const NAMESPACE_PREFIX: &str = "$sys/"; @@ -117,10 +115,10 @@ impl RaftEngineLogStore { .context(StartGcTaskSnafu) } - fn span(&self, namespace: &::Namespace) -> (Option, Option) { + fn span(&self, provider: &RaftEngineProvider) -> (Option, Option) { ( - self.engine.first_index(namespace.id()), - self.engine.last_index(namespace.id()), + self.engine.first_index(provider.id), + self.engine.last_index(provider.id), ) } @@ -129,56 +127,65 @@ impl RaftEngineLogStore { /// to append in each namespace(region). fn entries_to_batch( &self, - entries: Vec, - ) -> Result<(LogBatch, HashMap)> { + entries: Vec, + ) -> Result<(LogBatch, HashMap)> { // Records the last entry id for each region's entries. - let mut entry_ids: HashMap = HashMap::with_capacity(entries.len()); + let mut entry_ids: HashMap = HashMap::with_capacity(entries.len()); let mut batch = LogBatch::with_capacity(entries.len()); for e in entries { - let ns_id = e.namespace_id; - match entry_ids.entry(ns_id) { - Entry::Occupied(mut o) => { + let region_id = e.region_id(); + let entry_id = e.entry_id(); + match entry_ids.entry(region_id) { + hash_map::Entry::Occupied(mut o) => { let prev = *o.get(); ensure!( - e.id == prev + 1, + entry_id == prev + 1, DiscontinuousLogIndexSnafu { - region_id: ns_id, + region_id, last_index: prev, - attempt_index: e.id + attempt_index: entry_id } ); - o.insert(e.id); + o.insert(entry_id); } - Entry::Vacant(v) => { + hash_map::Entry::Vacant(v) => { // this entry is the first in batch of given region. - if let Some(first_index) = self.engine.first_index(ns_id) { + if let Some(first_index) = self.engine.first_index(region_id.as_u64()) { // ensure the first in batch does not override compacted entry. 
ensure!( - e.id > first_index, + entry_id > first_index, OverrideCompactedEntrySnafu { - namespace: ns_id, + namespace: region_id, first_index, - attempt_index: e.id, + attempt_index: entry_id, } ); } // ensure the first in batch does not form a hole in raft-engine. - if let Some(last_index) = self.engine.last_index(ns_id) { + if let Some(last_index) = self.engine.last_index(region_id.as_u64()) { ensure!( - e.id == last_index + 1, + entry_id == last_index + 1, DiscontinuousLogIndexSnafu { - region_id: ns_id, + region_id, last_index, - attempt_index: e.id + attempt_index: entry_id } ); } - v.insert(e.id); + v.insert(entry_id); } } batch - .add_entries::(ns_id, &[e]) + .add_entries::( + region_id.as_u64(), + &[EntryImpl { + id: entry_id, + namespace_id: region_id.as_u64(), + data: e.into_bytes(), + ..Default::default() + }], + ) .context(AddEntryLogBatchSnafu)?; } @@ -198,62 +205,19 @@ impl Debug for RaftEngineLogStore { #[async_trait::async_trait] impl LogStore for RaftEngineLogStore { type Error = Error; - type Namespace = Namespace; - type Entry = EntryImpl; async fn stop(&self) -> Result<()> { self.gc_task.stop().await.context(StopGcTaskSnafu) } - /// Appends an entry to logstore. Currently the existence of the entry's namespace is not checked. - async fn append(&self, e: Self::Entry) -> Result { - ensure!(self.started(), IllegalStateSnafu); - let entry_id = e.id; - let namespace_id = e.namespace_id; - let mut batch = LogBatch::with_capacity(1); - batch - .add_entries::(namespace_id, &[e]) - .context(AddEntryLogBatchSnafu)?; - - if let Some(first_index) = self.engine.first_index(namespace_id) { - ensure!( - entry_id > first_index, - OverrideCompactedEntrySnafu { - namespace: namespace_id, - first_index, - attempt_index: entry_id, - } - ); - } - - if let Some(last_index) = self.engine.last_index(namespace_id) { - ensure!( - entry_id == last_index + 1, - DiscontinuousLogIndexSnafu { - region_id: namespace_id, - last_index, - attempt_index: entry_id - } - ); - } - - let _ = self - .engine - .write(&mut batch, self.config.sync_write) - .context(RaftEngineSnafu)?; - Ok(AppendResponse { - last_entry_id: entry_id, - }) - } - /// Appends a batch of entries to logstore. `RaftEngineLogStore` assures the atomicity of /// batch append. - async fn append_batch(&self, entries: Vec) -> Result { + async fn append_batch(&self, entries: Vec) -> Result { metrics::METRIC_RAFT_ENGINE_APPEND_BATCH_CALLS_TOTAL.inc(); metrics::METRIC_RAFT_ENGINE_APPEND_BATCH_BYTES_TOTAL.inc_by( entries .iter() - .map(EntryTrait::estimated_size) + .map(|entry| entry.estimated_size()) .sum::() as u64, ); let _timer = metrics::METRIC_RAFT_ENGINE_APPEND_BATCH_ELAPSED.start_timer(); @@ -287,40 +251,47 @@ impl LogStore for RaftEngineLogStore { /// determined by the current "last index" of the namespace. 
async fn read( &self, - ns: &Self::Namespace, + provider: &Provider, entry_id: EntryId, - ) -> Result> { + ) -> Result> { + let ns = provider + .as_raft_engine_provider() + .with_context(|| InvalidProviderSnafu { + expected: RaftEngineProvider::type_name(), + actual: provider.type_name(), + })?; + let namespace_id = ns.id; metrics::METRIC_RAFT_ENGINE_READ_CALLS_TOTAL.inc(); let _timer = metrics::METRIC_RAFT_ENGINE_READ_ELAPSED.start_timer(); ensure!(self.started(), IllegalStateSnafu); let engine = self.engine.clone(); - let last_index = engine.last_index(ns.id()).unwrap_or(0); - let mut start_index = entry_id.max(engine.first_index(ns.id()).unwrap_or(last_index + 1)); + let last_index = engine.last_index(namespace_id).unwrap_or(0); + let mut start_index = + entry_id.max(engine.first_index(namespace_id).unwrap_or(last_index + 1)); info!( "Read logstore, namespace: {}, start: {}, span: {:?}", - ns.id(), + namespace_id, entry_id, self.span(ns) ); let max_batch_size = self.config.read_batch_size; let (tx, mut rx) = tokio::sync::mpsc::channel(max_batch_size); - let ns = ns.clone(); let _handle = common_runtime::spawn_read(async move { while start_index <= last_index { let mut vec = Vec::with_capacity(max_batch_size); match engine .fetch_entries_to::( - ns.id, + namespace_id, start_index, last_index + 1, Some(max_batch_size), &mut vec, ) .context(FetchEntrySnafu { - ns: ns.id, + ns: namespace_id, start: start_index, end: last_index, max_size: max_batch_size, @@ -344,22 +315,40 @@ impl LogStore for RaftEngineLogStore { let s = stream!({ while let Some(res) = rx.recv().await { - yield res; + let res = res?; + + yield Ok(res.into_iter().map(Entry::from).collect::>()); } }); Ok(Box::pin(s)) } - async fn create_namespace(&self, ns: &Self::Namespace) -> Result<()> { + async fn create_namespace(&self, ns: &Provider) -> Result<()> { + let ns = ns + .as_raft_engine_provider() + .with_context(|| InvalidProviderSnafu { + expected: RaftEngineProvider::type_name(), + actual: ns.type_name(), + })?; + let namespace_id = ns.id; ensure!( - ns.id != SYSTEM_NAMESPACE, - IllegalNamespaceSnafu { ns: ns.id } + namespace_id != SYSTEM_NAMESPACE, + IllegalNamespaceSnafu { ns: namespace_id } ); ensure!(self.started(), IllegalStateSnafu); - let key = format!("{}{}", NAMESPACE_PREFIX, ns.id).as_bytes().to_vec(); + let key = format!("{}{}", NAMESPACE_PREFIX, namespace_id) + .as_bytes() + .to_vec(); let mut batch = LogBatch::with_capacity(1); batch - .put_message::(SYSTEM_NAMESPACE, key, ns) + .put_message::( + SYSTEM_NAMESPACE, + key, + &NamespaceImpl { + id: namespace_id, + ..Default::default() + }, + ) .context(RaftEngineSnafu)?; let _ = self .engine @@ -368,13 +357,22 @@ impl LogStore for RaftEngineLogStore { Ok(()) } - async fn delete_namespace(&self, ns: &Self::Namespace) -> Result<()> { + async fn delete_namespace(&self, ns: &Provider) -> Result<()> { + let ns = ns + .as_raft_engine_provider() + .with_context(|| InvalidProviderSnafu { + expected: RaftEngineProvider::type_name(), + actual: ns.type_name(), + })?; + let namespace_id = ns.id; ensure!( - ns.id != SYSTEM_NAMESPACE, - IllegalNamespaceSnafu { ns: ns.id } + namespace_id != SYSTEM_NAMESPACE, + IllegalNamespaceSnafu { ns: namespace_id } ); ensure!(self.started(), IllegalStateSnafu); - let key = format!("{}{}", NAMESPACE_PREFIX, ns.id).as_bytes().to_vec(); + let key = format!("{}{}", NAMESPACE_PREFIX, namespace_id) + .as_bytes() + .to_vec(); let mut batch = LogBatch::with_capacity(1); batch.delete(SYSTEM_NAMESPACE, key); let _ = self @@ -384,17 +382,17 @@ impl 
LogStore for RaftEngineLogStore { Ok(()) } - async fn list_namespaces(&self) -> Result> { + async fn list_namespaces(&self) -> Result> { ensure!(self.started(), IllegalStateSnafu); - let mut namespaces: Vec = vec![]; + let mut namespaces: Vec = vec![]; self.engine - .scan_messages::( + .scan_messages::( SYSTEM_NAMESPACE, Some(NAMESPACE_PREFIX.as_bytes()), None, false, |_, v| { - namespaces.push(v); + namespaces.push(Provider::RaftEngine(RaftEngineProvider { id: v.id })); true }, ) @@ -402,32 +400,41 @@ impl LogStore for RaftEngineLogStore { Ok(namespaces) } - fn entry(&self, data: &mut Vec, entry_id: EntryId, ns: Self::Namespace) -> Self::Entry { - EntryImpl { - id: entry_id, + fn entry( + &self, + data: &mut Vec, + entry_id: EntryId, + region_id: RegionId, + provider: &Provider, + ) -> Result { + debug_assert_eq!( + provider.as_raft_engine_provider().unwrap().id, + region_id.as_u64() + ); + Ok(Entry::Naive(NaiveEntry { + provider: provider.clone(), + region_id, + entry_id, data: std::mem::take(data), - namespace_id: ns.id(), - ..Default::default() - } + })) } - fn namespace(&self, ns_id: NamespaceId, wal_options: &WalOptions) -> Self::Namespace { - let _ = wal_options; - Namespace { - id: ns_id, - ..Default::default() - } - } - - async fn obsolete(&self, ns: Self::Namespace, entry_id: EntryId) -> Result<()> { + async fn obsolete(&self, provider: &Provider, entry_id: EntryId) -> Result<()> { + let ns = provider + .as_raft_engine_provider() + .with_context(|| InvalidProviderSnafu { + expected: RaftEngineProvider::type_name(), + actual: provider.type_name(), + })?; + let namespace_id = ns.id; ensure!(self.started(), IllegalStateSnafu); - let obsoleted = self.engine.compact_to(ns.id(), entry_id + 1); + let obsoleted = self.engine.compact_to(namespace_id, entry_id + 1); info!( "Namespace {} obsoleted {} entries, compacted index: {}, span: {:?}", - ns.id(), + namespace_id, obsoleted, entry_id, - self.span(&ns) + self.span(ns) ); Ok(()) } @@ -444,6 +451,19 @@ impl MessageExt for MessageType { } } +#[cfg(test)] +impl RaftEngineLogStore { + /// Appends a batch of entries and returns a response containing a map where the key is a region id + /// while the value is the id of the last successfully written entry of the region. 
+ async fn append(&self, entry: Entry) -> Result { + let response = self.append_batch(vec![entry]).await?; + if let Some((_, last_entry_id)) = response.last_entry_ids.into_iter().next() { + return Ok(store_api::logstore::AppendResponse { last_entry_id }); + } + unreachable!() + } +} + #[cfg(test)] mod tests { use std::collections::HashSet; @@ -453,14 +473,12 @@ mod tests { use common_telemetry::debug; use common_test_util::temp_dir::{create_temp_dir, TempDir}; use futures_util::StreamExt; - use store_api::logstore::entry_stream::SendableEntryStream; - use store_api::logstore::namespace::Namespace as NamespaceTrait; - use store_api::logstore::LogStore; + use store_api::logstore::{LogStore, SendableEntryStream}; use super::*; use crate::error::Error; use crate::raft_engine::log_store::RaftEngineLogStore; - use crate::raft_engine::protos::logstore::{EntryImpl as Entry, NamespaceImpl as Namespace}; + use crate::raft_engine::protos::logstore::EntryImpl; #[tokio::test] async fn test_open_logstore() { @@ -487,15 +505,15 @@ mod tests { assert!(logstore.list_namespaces().await.unwrap().is_empty()); logstore - .create_namespace(&Namespace::with_id(42)) + .create_namespace(&Provider::raft_engine_provider(42)) .await .unwrap(); let namespaces = logstore.list_namespaces().await.unwrap(); assert_eq!(1, namespaces.len()); - assert_eq!(Namespace::with_id(42), namespaces[0]); + assert_eq!(Provider::raft_engine_provider(42), namespaces[0]); logstore - .delete_namespace(&Namespace::with_id(42)) + .delete_namespace(&Provider::raft_engine_provider(42)) .await .unwrap(); assert!(logstore.list_namespaces().await.unwrap().is_empty()); @@ -511,24 +529,25 @@ mod tests { .await .unwrap(); - let namespace = Namespace::with_id(1); + let namespace_id = 1; let cnt = 1024; for i in 0..cnt { let response = logstore - .append(Entry::create( - i, - namespace.id, - i.to_string().as_bytes().to_vec(), - )) + .append( + EntryImpl::create(i, namespace_id, i.to_string().as_bytes().to_vec()).into(), + ) .await .unwrap(); assert_eq!(i, response.last_entry_id); } let mut entries = HashSet::with_capacity(1024); - let mut s = logstore.read(&Namespace::with_id(1), 0).await.unwrap(); + let mut s = logstore + .read(&Provider::raft_engine_provider(1), 0) + .await + .unwrap(); while let Some(r) = s.next().await { let vec = r.unwrap(); - entries.extend(vec.into_iter().map(|e| e.id)); + entries.extend(vec.into_iter().map(|e| e.entry_id())); } assert_eq!((0..cnt).collect::>(), entries); } @@ -552,11 +571,11 @@ mod tests { .await .unwrap(); assert!(logstore - .append(Entry::create(1, 1, "1".as_bytes().to_vec())) + .append(EntryImpl::create(1, 1, "1".as_bytes().to_vec()).into()) .await .is_ok()); let entries = logstore - .read(&Namespace::with_id(1), 1) + .read(&Provider::raft_engine_provider(1), 1) .await .unwrap() .collect::>() @@ -572,11 +591,16 @@ mod tests { .await .unwrap(); - let entries = - collect_entries(logstore.read(&Namespace::with_id(1), 1).await.unwrap()).await; + let entries = collect_entries( + logstore + .read(&Provider::raft_engine_provider(1), 1) + .await + .unwrap(), + ) + .await; assert_eq!(1, entries.len()); - assert_eq!(1, entries[0].id); - assert_eq!(1, entries[0].namespace_id); + assert_eq!(1, entries[0].entry_id()); + assert_eq!(1, entries[0].region_id().as_u64()); } async fn wal_dir_usage(path: impl AsRef) -> usize { @@ -615,14 +639,15 @@ mod tests { let dir = create_temp_dir("raft-engine-logstore-test"); let logstore = new_test_log_store(&dir).await; - let namespace = Namespace::with_id(42); + let namespace_id = 42; 
+ let namespace = Provider::raft_engine_provider(namespace_id); for id in 0..4096 { - let entry = Entry::create(id, namespace.id(), [b'x'; 4096].to_vec()); + let entry = EntryImpl::create(id, namespace_id, [b'x'; 4096].to_vec()).into(); let _ = logstore.append(entry).await.unwrap(); } let before_purge = wal_dir_usage(dir.path().to_str().unwrap()).await; - logstore.obsolete(namespace, 4000).await.unwrap(); + logstore.obsolete(&namespace, 4000).await.unwrap(); tokio::time::sleep(Duration::from_secs(6)).await; let after_purge = wal_dir_usage(dir.path().to_str().unwrap()).await; @@ -639,19 +664,20 @@ mod tests { let dir = create_temp_dir("raft-engine-logstore-test"); let logstore = new_test_log_store(&dir).await; - let namespace = Namespace::with_id(42); + let namespace_id = 42; + let namespace = Provider::raft_engine_provider(namespace_id); for id in 0..1024 { - let entry = Entry::create(id, namespace.id(), [b'x'; 4096].to_vec()); + let entry = EntryImpl::create(id, namespace_id, [b'x'; 4096].to_vec()).into(); let _ = logstore.append(entry).await.unwrap(); } - logstore.obsolete(namespace.clone(), 100).await.unwrap(); - assert_eq!(101, logstore.engine.first_index(namespace.id).unwrap()); + logstore.obsolete(&namespace, 100).await.unwrap(); + assert_eq!(101, logstore.engine.first_index(namespace_id).unwrap()); let res = logstore.read(&namespace, 100).await.unwrap(); let mut vec = collect_entries(res).await; - vec.sort_by(|a, b| a.id.partial_cmp(&b.id).unwrap()); - assert_eq!(101, vec.first().unwrap().id); + vec.sort_by(|a, b| a.entry_id().partial_cmp(&b.entry_id()).unwrap()); + assert_eq!(101, vec.first().unwrap().entry_id()); } #[tokio::test] @@ -663,14 +689,14 @@ mod tests { let entries = (0..8) .flat_map(|ns_id| { let data = [ns_id as u8].repeat(4096); - (0..16).map(move |idx| Entry::create(idx, ns_id, data.clone())) + (0..16).map(move |idx| EntryImpl::create(idx, ns_id, data.clone()).into()) }) .collect(); logstore.append_batch(entries).await.unwrap(); for ns_id in 0..8 { - let namespace = Namespace::with_id(ns_id); - let (first, last) = logstore.span(&namespace); + let namespace = &RaftEngineProvider::new(ns_id); + let (first, last) = logstore.span(namespace); assert_eq!(0, first.unwrap()); assert_eq!(15, last.unwrap()); } @@ -681,19 +707,24 @@ mod tests { common_telemetry::init_default_ut_logging(); let dir = create_temp_dir("logstore-append-batch-test"); let logstore = new_test_log_store(&dir).await; - let entries = vec![ - Entry::create(0, 0, [b'0'; 4096].to_vec()), - Entry::create(1, 0, [b'0'; 4096].to_vec()), - Entry::create(0, 1, [b'1'; 4096].to_vec()), - Entry::create(2, 0, [b'0'; 4096].to_vec()), - Entry::create(1, 1, [b'1'; 4096].to_vec()), + EntryImpl::create(0, 0, [b'0'; 4096].to_vec()).into(), + EntryImpl::create(1, 0, [b'0'; 4096].to_vec()).into(), + EntryImpl::create(0, 1, [b'1'; 4096].to_vec()).into(), + EntryImpl::create(2, 0, [b'0'; 4096].to_vec()).into(), + EntryImpl::create(1, 1, [b'1'; 4096].to_vec()).into(), ]; logstore.append_batch(entries).await.unwrap(); - assert_eq!((Some(0), Some(2)), logstore.span(&Namespace::with_id(0))); - assert_eq!((Some(0), Some(1)), logstore.span(&Namespace::with_id(1))); + assert_eq!( + (Some(0), Some(2)), + logstore.span(&RaftEngineProvider::new(0)) + ); + assert_eq!( + (Some(0), Some(1)), + logstore.span(&RaftEngineProvider::new(1)) + ); } #[tokio::test] @@ -704,21 +735,21 @@ mod tests { let entries = vec![ // Entry[0] from region 0. 
- Entry::create(0, 0, [b'0'; 4096].to_vec()), + EntryImpl::create(0, 0, [b'0'; 4096].to_vec()).into(), // Entry[0] from region 1. - Entry::create(0, 1, [b'1'; 4096].to_vec()), + EntryImpl::create(0, 1, [b'1'; 4096].to_vec()).into(), // Entry[1] from region 1. - Entry::create(1, 0, [b'1'; 4096].to_vec()), + EntryImpl::create(1, 0, [b'1'; 4096].to_vec()).into(), // Entry[1] from region 0. - Entry::create(1, 1, [b'0'; 4096].to_vec()), + EntryImpl::create(1, 1, [b'0'; 4096].to_vec()).into(), // Entry[2] from region 2. - Entry::create(2, 2, [b'2'; 4096].to_vec()), + EntryImpl::create(2, 2, [b'2'; 4096].to_vec()).into(), ]; // Ensure the last entry id returned for each region is the expected one. let last_entry_ids = logstore.append_batch(entries).await.unwrap().last_entry_ids; - assert_eq!(last_entry_ids[&0], 1); - assert_eq!(last_entry_ids[&1], 1); - assert_eq!(last_entry_ids[&2], 2); + assert_eq!(last_entry_ids[&(0.into())], 1); + assert_eq!(last_entry_ids[&(1.into())], 1); + assert_eq!(last_entry_ids[&(2.into())], 2); } } diff --git a/src/meta-srv/src/procedure/region_failover.rs b/src/meta-srv/src/procedure/region_failover.rs index 7d82ad36d520..9ee017ad15a6 100644 --- a/src/meta-srv/src/procedure/region_failover.rs +++ b/src/meta-srv/src/procedure/region_failover.rs @@ -29,7 +29,6 @@ use common_meta::key::datanode_table::DatanodeTableKey; use common_meta::key::{TableMetadataManagerRef, MAINTENANCE_KEY}; use common_meta::kv_backend::{KvBackendRef, ResettableKvBackendRef}; use common_meta::lock_key::{CatalogLock, RegionLock, SchemaLock, TableLock}; -use common_meta::table_name::TableName; use common_meta::{ClusterId, RegionIdent}; use common_procedure::error::{ Error as ProcedureError, FromJsonSnafu, Result as ProcedureResult, ToJsonSnafu, @@ -44,6 +43,7 @@ use serde::{Deserialize, Serialize}; use snafu::ResultExt; use store_api::storage::{RegionId, RegionNumber}; use table::metadata::TableId; +use table::table_name::TableName; use crate::error::{ self, KvBackendSnafu, RegisterProcedureLoaderSnafu, Result, TableMetadataManagerSnafu, diff --git a/src/meta-srv/src/procedure/region_migration/manager.rs b/src/meta-srv/src/procedure/region_migration/manager.rs index 7dde629cbdb9..871342fd4fef 100644 --- a/src/meta-srv/src/procedure/region_migration/manager.rs +++ b/src/meta-srv/src/procedure/region_migration/manager.rs @@ -22,12 +22,12 @@ use common_meta::key::table_info::TableInfoValue; use common_meta::key::table_route::TableRouteValue; use common_meta::peer::Peer; use common_meta::rpc::router::RegionRoute; -use common_meta::table_name::TableName; use common_meta::ClusterId; use common_procedure::{watcher, ProcedureId, ProcedureManagerRef, ProcedureWithId}; use common_telemetry::{error, info}; use snafu::{ensure, OptionExt, ResultExt}; use store_api::storage::RegionId; +use table::table_name::TableName; use crate::error::{self, Result}; use crate::procedure::region_migration::{ diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs index e5b4bf2faca3..3d3ab8c77dbf 100644 --- a/src/metric-engine/src/engine.rs +++ b/src/metric-engine/src/engine.rs @@ -202,9 +202,9 @@ impl RegionEngine for MetricEngine { /// Retrieves region's disk usage. /// /// Note: Returns `None` if it's a logical region. 
- async fn region_disk_usage(&self, region_id: RegionId) -> Option { + fn region_disk_usage(&self, region_id: RegionId) -> Option { if self.inner.is_physical_region(region_id) { - self.inner.mito.region_disk_usage(region_id).await + self.inner.mito.region_disk_usage(region_id) } else { None } @@ -383,15 +383,7 @@ mod test { let logical_region_id = env.default_logical_region_id(); let physical_region_id = env.default_physical_region_id(); - assert!(env - .metric() - .region_disk_usage(logical_region_id) - .await - .is_none()); - assert!(env - .metric() - .region_disk_usage(physical_region_id) - .await - .is_some()); + assert!(env.metric().region_disk_usage(logical_region_id).is_none()); + assert!(env.metric().region_disk_usage(physical_region_id).is_some()); } } diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs index bd8d70a6acd7..5c04d75b2292 100644 --- a/src/mito2/src/engine.rs +++ b/src/mito2/src/engine.rs @@ -110,14 +110,14 @@ impl MitoEngine { } /// Returns the region disk/memory usage information. - pub async fn get_region_usage(&self, region_id: RegionId) -> Result { + pub fn get_region_usage(&self, region_id: RegionId) -> Result { let region = self .inner .workers .get_region(region_id) .context(RegionNotFoundSnafu { region_id })?; - Ok(region.region_usage().await) + Ok(region.region_usage()) } /// Handle substrait query and return a stream of record batches @@ -368,10 +368,9 @@ impl RegionEngine for MitoEngine { self.inner.stop().await.map_err(BoxedError::new) } - async fn region_disk_usage(&self, region_id: RegionId) -> Option { + fn region_disk_usage(&self, region_id: RegionId) -> Option { let size = self .get_region_usage(region_id) - .await .map(|usage| usage.disk_usage()) .ok()?; size.try_into().ok() diff --git a/src/mito2/src/engine/basic_test.rs b/src/mito2/src/engine/basic_test.rs index 6d3fac897eda..9a5cca209b7a 100644 --- a/src/mito2/src/engine/basic_test.rs +++ b/src/mito2/src/engine/basic_test.rs @@ -524,7 +524,7 @@ async fn test_region_usage() { .unwrap(); // region is empty now, check manifest size let region = engine.get_region(region_id).unwrap(); - let region_stat = region.region_usage().await; + let region_stat = region.region_usage(); assert_eq!(region_stat.manifest_usage, 686); // put some rows @@ -535,7 +535,7 @@ async fn test_region_usage() { put_rows(&engine, region_id, rows).await; - let region_stat = region.region_usage().await; + let region_stat = region.region_usage(); assert!(region_stat.wal_usage > 0); // delete some rows @@ -545,13 +545,13 @@ async fn test_region_usage() { }; delete_rows(&engine, region_id, rows).await; - let region_stat = region.region_usage().await; + let region_stat = region.region_usage(); assert!(region_stat.wal_usage > 0); // flush region flush_region(&engine, region_id, None).await; - let region_stat = region.region_usage().await; + let region_stat = region.region_usage(); assert_eq!(region_stat.sst_usage, 3010); // region total usage diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index 400284fdf124..df5062aa7dfb 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -27,6 +27,7 @@ use datatypes::prelude::ConcreteDataType; use object_store::ErrorKind; use prost::{DecodeError, EncodeError}; use snafu::{Location, Snafu}; +use store_api::logstore::provider::Provider; use store_api::manifest::ManifestVersion; use store_api::storage::RegionId; @@ -226,6 +227,14 @@ pub enum Error { source: datatypes::Error, }, + #[snafu(display("Failed to build entry, region_id: {}", region_id))] + BuildEntry { 
+ region_id: RegionId, + #[snafu(implicit)] + location: Location, + source: BoxedError, + }, + #[snafu(display("Failed to encode WAL entry, region_id: {}", region_id))] EncodeWal { region_id: RegionId, @@ -242,17 +251,9 @@ pub enum Error { source: BoxedError, }, - #[snafu(display("Failed to read WAL, region_id: {}", region_id))] + #[snafu(display("Failed to read WAL, provider: {}", provider))] ReadWal { - region_id: RegionId, - #[snafu(implicit)] - location: Location, - source: BoxedError, - }, - - #[snafu(display("Failed to read WAL, topic: {}", topic))] - ReadKafkaWal { - topic: String, + provider: Provider, #[snafu(implicit)] location: Location, source: BoxedError, @@ -330,6 +331,13 @@ pub enum Error { location: Location, }, + #[snafu(display("Invalid wal read request, {}", reason))] + InvalidWalReadRequest { + reason: String, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Failed to convert array to vector"))] ConvertVector { #[snafu(implicit)] @@ -636,6 +644,13 @@ pub enum Error { unexpected_entry_id: u64, }, + #[snafu(display("Read the corrupted log entry, region_id: {}", region_id))] + CorruptedEntry { + region_id: RegionId, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display( "Failed to upload file, region_id: {}, file_id: {}, file_type: {:?}", region_id, @@ -757,7 +772,6 @@ impl ErrorExt for Error { | ReadParquet { .. } | WriteWal { .. } | ReadWal { .. } - | ReadKafkaWal { .. } | DeleteWal { .. } => StatusCode::StorageUnavailable, CompressObject { .. } | DecompressObject { .. } @@ -780,7 +794,8 @@ impl ErrorExt for Error { | ConvertColumnDataType { .. } | ColumnNotFound { .. } | InvalidMetadata { .. } - | InvalidRegionOptions { .. } => StatusCode::InvalidArguments, + | InvalidRegionOptions { .. } + | InvalidWalReadRequest { .. } => StatusCode::InvalidArguments, InvalidRegionRequestSchemaVersion { .. } => StatusCode::RequestOutdated, @@ -789,8 +804,10 @@ impl ErrorExt for Error { | WorkerStopped { .. } | Recv { .. } | EncodeWal { .. } - | DecodeWal { .. } => StatusCode::Internal, + | DecodeWal { .. } + | BuildEntry { .. } => StatusCode::Internal, OpenRegion { source, .. } => source.status_code(), + WriteBuffer { source, .. } => source.status_code(), WriteGroup { source, .. } => source.status_code(), FieldTypeMismatch { source, .. } => source.status_code(), @@ -837,7 +854,9 @@ impl ErrorExt for Error { Upload { .. } => StatusCode::StorageUnavailable, BiError { .. } => StatusCode::Internal, - EncodeMemtable { .. } | ReadDataPart { .. } => StatusCode::Internal, + EncodeMemtable { .. } | ReadDataPart { .. } | CorruptedEntry { .. } => { + StatusCode::Internal + } ChecksumMismatch { .. } => StatusCode::Unexpected, RegionStopped { .. } => StatusCode::RegionNotReady, TimeRangePredicateOverflow { .. } => StatusCode::InvalidArguments, diff --git a/src/mito2/src/manifest/manager.rs b/src/mito2/src/manifest/manager.rs index b121db9c48e2..777f9a47e49a 100644 --- a/src/mito2/src/manifest/manager.rs +++ b/src/mito2/src/manifest/manager.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::sync::atomic::AtomicU64; use std::sync::Arc; use common_datasource::compression::CompressionType; @@ -121,12 +122,17 @@ pub struct RegionManifestManager { impl RegionManifestManager { /// Constructs a region's manifest and persist it. 
- pub async fn new(metadata: RegionMetadataRef, options: RegionManifestOptions) -> Result { + pub async fn new( + metadata: RegionMetadataRef, + options: RegionManifestOptions, + total_manifest_size: Arc, + ) -> Result { // construct storage let mut store = ManifestObjectStore::new( &options.manifest_dir, options.object_store.clone(), options.compress_type, + total_manifest_size, ); info!( @@ -168,7 +174,10 @@ impl RegionManifestManager { /// Opens an existing manifest. /// /// Returns `Ok(None)` if no such manifest. - pub async fn open(options: RegionManifestOptions) -> Result> { + pub async fn open( + options: RegionManifestOptions, + total_manifest_size: Arc, + ) -> Result> { let _t = MANIFEST_OP_ELAPSED .with_label_values(&["open"]) .start_timer(); @@ -178,6 +187,7 @@ impl RegionManifestManager { &options.manifest_dir, options.object_store.clone(), options.compress_type, + total_manifest_size, ); // recover from storage diff --git a/src/mito2/src/manifest/storage.rs b/src/mito2/src/manifest/storage.rs index 815afd9f4c6c..88450a21bdbb 100644 --- a/src/mito2/src/manifest/storage.rs +++ b/src/mito2/src/manifest/storage.rs @@ -15,6 +15,8 @@ use std::collections::HashMap; use std::iter::Iterator; use std::str::FromStr; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; use common_datasource::compression::CompressionType; use common_telemetry::debug; @@ -133,15 +135,22 @@ pub struct ManifestObjectStore { path: String, /// Stores the size of each manifest file. manifest_size_map: HashMap, + total_manifest_size: Arc, } impl ManifestObjectStore { - pub fn new(path: &str, object_store: ObjectStore, compress_type: CompressionType) -> Self { + pub fn new( + path: &str, + object_store: ObjectStore, + compress_type: CompressionType, + total_manifest_size: Arc, + ) -> Self { Self { object_store, compress_type, path: util::normalize_dir(path), manifest_size_map: HashMap::new(), + total_manifest_size, } } @@ -338,10 +347,9 @@ impl ManifestObjectStore { // delete manifest sizes for (_, is_checkpoint, version) in &del_entries { if *is_checkpoint { - self.manifest_size_map - .remove(&FileKey::Checkpoint(*version)); + self.unset_file_size(&FileKey::Checkpoint(*version)); } else { - self.manifest_size_map.remove(&FileKey::Delta(*version)); + self.unset_file_size(&FileKey::Delta(*version)); } } @@ -564,12 +572,28 @@ impl ManifestObjectStore { /// Set the size of the delta file by delta version. pub(crate) fn set_delta_file_size(&mut self, version: ManifestVersion, size: u64) { self.manifest_size_map.insert(FileKey::Delta(version), size); + self.inc_total_manifest_size(size); } /// Set the size of the checkpoint file by checkpoint version. 
pub(crate) fn set_checkpoint_file_size(&mut self, version: ManifestVersion, size: u64) { self.manifest_size_map .insert(FileKey::Checkpoint(version), size); + self.inc_total_manifest_size(size); + } + + fn unset_file_size(&mut self, key: &FileKey) { + if let Some(val) = self.manifest_size_map.remove(key) { + self.dec_total_manifest_size(val); + } + } + + fn inc_total_manifest_size(&self, val: u64) { + self.total_manifest_size.fetch_add(val, Ordering::Relaxed); + } + + fn dec_total_manifest_size(&self, val: u64) { + self.total_manifest_size.fetch_sub(val, Ordering::Relaxed); } } @@ -610,7 +634,12 @@ mod tests { let mut builder = Fs::default(); let _ = builder.root(&tmp_dir.path().to_string_lossy()); let object_store = ObjectStore::new(builder).unwrap().finish(); - ManifestObjectStore::new("/", object_store, CompressionType::Uncompressed) + ManifestObjectStore::new( + "/", + object_store, + CompressionType::Uncompressed, + Default::default(), + ) } fn new_checkpoint_metadata_with_version(version: ManifestVersion) -> CheckpointMetadata { diff --git a/src/mito2/src/region.rs b/src/mito2/src/region.rs index c9930d2d04a7..fc000f3e8197 100644 --- a/src/mito2/src/region.rs +++ b/src/mito2/src/region.rs @@ -20,13 +20,13 @@ pub(crate) mod version; use std::collections::hash_map::Entry; use std::collections::HashMap; -use std::sync::atomic::{AtomicI64, Ordering}; +use std::sync::atomic::{AtomicI64, AtomicU64, Ordering}; use std::sync::{Arc, RwLock}; use common_telemetry::{error, info, warn}; -use common_wal::options::WalOptions; use crossbeam_utils::atomic::AtomicCell; use snafu::{ensure, OptionExt}; +use store_api::logstore::provider::Provider; use store_api::metadata::RegionMetadataRef; use store_api::storage::RegionId; @@ -98,14 +98,16 @@ pub(crate) struct MitoRegion { pub(crate) manifest_ctx: ManifestContextRef, /// SST file purger. pub(crate) file_purger: FilePurgerRef, - /// Wal options of this region. - pub(crate) wal_options: WalOptions, + /// The provider of log store. + pub(crate) provider: Provider, /// Last flush time in millis. last_flush_millis: AtomicI64, /// Provider to get current time. time_provider: TimeProviderRef, /// Memtable builder for the region. pub(crate) memtable_builder: MemtableBuilderRef, + /// manifest stats + stats: ManifestStats, } pub(crate) type MitoRegionRef = Arc; @@ -233,7 +235,7 @@ impl MitoRegion { } /// Returns the region usage in bytes. - pub(crate) async fn region_usage(&self) -> RegionUsage { + pub(crate) fn region_usage(&self) -> RegionUsage { let region_id = self.region_id; let version = self.version(); @@ -243,13 +245,7 @@ impl MitoRegion { let sst_usage = version.ssts.sst_usage(); let wal_usage = self.estimated_wal_usage(memtable_usage); - - let manifest_usage = self - .manifest_ctx - .manifest_manager - .read() - .await - .manifest_usage(); + let manifest_usage = self.stats.total_manifest_size(); RegionUsage { region_id, @@ -526,6 +522,18 @@ impl OpeningRegions { pub(crate) type OpeningRegionsRef = Arc; +/// Manifest stats. 
+#[derive(Default, Debug, Clone)] +pub(crate) struct ManifestStats { + total_manifest_size: Arc, +} + +impl ManifestStats { + fn total_manifest_size(&self) -> u64 { + self.total_manifest_size.load(Ordering::Relaxed) + } +} + #[cfg(test)] mod tests { use crossbeam_utils::atomic::AtomicCell; diff --git a/src/mito2/src/region/opener.rs b/src/mito2/src/region/opener.rs index 64e333c9c476..ed9cbf037b30 100644 --- a/src/mito2/src/region/opener.rs +++ b/src/mito2/src/region/opener.rs @@ -24,6 +24,7 @@ use futures::StreamExt; use object_store::manager::ObjectStoreManagerRef; use object_store::util::{join_dir, normalize_dir}; use snafu::{ensure, OptionExt}; +use store_api::logstore::provider::Provider; use store_api::logstore::LogStore; use store_api::metadata::{ColumnMetadata, RegionMetadata}; use store_api::storage::{ColumnId, RegionId}; @@ -40,7 +41,7 @@ use crate::memtable::time_partition::TimePartitions; use crate::memtable::MemtableBuilderProvider; use crate::region::options::RegionOptions; use crate::region::version::{VersionBuilder, VersionControl, VersionControlRef}; -use crate::region::{ManifestContext, MitoRegion, RegionState}; +use crate::region::{ManifestContext, ManifestStats, MitoRegion, RegionState}; use crate::region_write_ctx::RegionWriteCtx; use crate::request::OptionOutputTx; use crate::schedule::scheduler::SchedulerRef; @@ -62,6 +63,7 @@ pub(crate) struct RegionOpener { skip_wal_replay: bool, intermediate_manager: IntermediateManager, time_provider: Option, + stats: ManifestStats, } impl RegionOpener { @@ -86,6 +88,7 @@ impl RegionOpener { skip_wal_replay: false, intermediate_manager, time_provider: None, + stats: Default::default(), } } @@ -162,14 +165,18 @@ impl RegionOpener { } } let options = self.options.take().unwrap(); - let wal_options = options.wal_options.clone(); + let provider = self.provider(&options.wal_options); let object_store = self.object_store(&options.storage)?.clone(); // Create a manifest manager for this region and writes regions to the manifest file. let region_manifest_options = self.manifest_options(config, &options)?; let metadata = Arc::new(self.metadata.unwrap()); - let manifest_manager = - RegionManifestManager::new(metadata.clone(), region_manifest_options).await?; + let manifest_manager = RegionManifestManager::new( + metadata.clone(), + region_manifest_options, + self.stats.total_manifest_size.clone(), + ) + .await?; let memtable_builder = self .memtable_builder_provider @@ -212,10 +219,11 @@ impl RegionOpener { access_layer, self.cache_manager, )), - wal_options, + provider, last_flush_millis: AtomicI64::new(time_provider.current_time_millis()), time_provider, memtable_builder, + stats: self.stats, }) } @@ -250,6 +258,13 @@ impl RegionOpener { Ok(region) } + fn provider(&self, wal_options: &WalOptions) -> Provider { + match wal_options { + WalOptions::RaftEngine => Provider::raft_engine_provider(self.region_id.as_u64()), + WalOptions::Kafka(options) => Provider::kafka_provider(options.topic.to_string()), + } + } + /// Tries to open the region and returns `None` if the region directory is empty. async fn maybe_open( &self, @@ -257,10 +272,13 @@ impl RegionOpener { wal: &Wal, ) -> Result> { let region_options = self.options.as_ref().unwrap().clone(); - let wal_options = region_options.wal_options.clone(); let region_manifest_options = self.manifest_options(config, ®ion_options)?; - let Some(manifest_manager) = RegionManifestManager::open(region_manifest_options).await? 
+ let Some(manifest_manager) = RegionManifestManager::open( + region_manifest_options, + self.stats.total_manifest_size.clone(), + ) + .await? else { return Ok(None); }; @@ -269,6 +287,7 @@ impl RegionOpener { let metadata = manifest.metadata.clone(); let region_id = self.region_id; + let provider = self.provider(®ion_options.wal_options); let object_store = self.object_store(®ion_options.storage)?.clone(); debug!("Open region {} with options: {:?}", region_id, self.options); @@ -313,7 +332,7 @@ impl RegionOpener { ); replay_memtable( wal, - &wal_options, + &provider, region_id, flushed_entry_id, &version_control, @@ -338,10 +357,11 @@ impl RegionOpener { RegionState::ReadOnly, )), file_purger, - wal_options, + provider, last_flush_millis: AtomicI64::new(time_provider.current_time_millis()), time_provider, memtable_builder, + stats: self.stats.clone(), }; Ok(Some(region)) } @@ -430,7 +450,7 @@ pub(crate) fn check_recovered_region( /// Replays the mutations from WAL and inserts mutations to memtable of given region. pub(crate) async fn replay_memtable( wal: &Wal, - wal_options: &WalOptions, + provider: &Provider, region_id: RegionId, flushed_entry_id: EntryId, version_control: &VersionControlRef, @@ -442,7 +462,7 @@ pub(crate) async fn replay_memtable( let mut last_entry_id = flushed_entry_id; let replay_from_entry_id = flushed_entry_id + 1; - let mut wal_stream = wal.scan(region_id, replay_from_entry_id, wal_options)?; + let mut wal_stream = wal.scan(region_id, replay_from_entry_id, provider)?; while let Some(res) = wal_stream.next().await { let (entry_id, entry) = res?; if entry_id <= flushed_entry_id { @@ -459,7 +479,7 @@ pub(crate) async fn replay_memtable( last_entry_id = last_entry_id.max(entry_id); let mut region_write_ctx = - RegionWriteCtx::new(region_id, version_control, wal_options.clone()); + RegionWriteCtx::new(region_id, version_control, provider.clone()); for mutation in entry.mutations { rows_replayed += mutation .rows @@ -474,8 +494,9 @@ pub(crate) async fn replay_memtable( region_write_ctx.write_memtable(); } - wal.obsolete(region_id, flushed_entry_id, wal_options) - .await?; + // TODO(weny): We need to update `flushed_entry_id` in the region manifest + // to avoid reading potentially incomplete entries in the future. + wal.obsolete(region_id, flushed_entry_id, provider).await?; info!( "Replay WAL for region: {}, rows recovered: {}, last entry id: {}", diff --git a/src/mito2/src/region_write_ctx.rs b/src/mito2/src/region_write_ctx.rs index 36b1a0fac67b..e86ff77ca2f1 100644 --- a/src/mito2/src/region_write_ctx.rs +++ b/src/mito2/src/region_write_ctx.rs @@ -16,8 +16,8 @@ use std::mem; use std::sync::Arc; use api::v1::{Mutation, OpType, Rows, WalEntry}; -use common_wal::options::WalOptions; use snafu::ResultExt; +use store_api::logstore::provider::Provider; use store_api::logstore::LogStore; use store_api::storage::{RegionId, SequenceNumber}; @@ -86,7 +86,7 @@ pub(crate) struct RegionWriteCtx { /// out of the context to construct the wal entry when we write to the wal. wal_entry: WalEntry, /// Wal options of the region being written to. - wal_options: WalOptions, + provider: Provider, /// Notifiers to send write results to waiters. /// /// The i-th notify is for i-th mutation. 
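// A condensed sketch of the replay path reshaped above (not the real
// `replay_memtable`; the function name is hypothetical and the mutation handling
// is elided): the opener derives a `Provider` from the region's `WalOptions`
// once and reuses it for both `Wal::scan` and `Wal::obsolete`.
use common_wal::options::WalOptions;
use futures::StreamExt;
use store_api::logstore::provider::Provider;
use store_api::logstore::LogStore;
use store_api::storage::RegionId;

use crate::error::Result;
use crate::wal::{EntryId, Wal};

async fn replay_sketch<S: LogStore>(
    wal: &Wal<S>,
    region_id: RegionId,
    flushed_entry_id: EntryId,
    wal_options: &WalOptions,
) -> Result<()> {
    // Mirrors `RegionOpener::provider` above.
    let provider = match wal_options {
        WalOptions::RaftEngine => Provider::raft_engine_provider(region_id.as_u64()),
        WalOptions::Kafka(options) => Provider::kafka_provider(options.topic.to_string()),
    };
    let mut stream = wal.scan(region_id, flushed_entry_id + 1, &provider)?;
    while let Some(item) = stream.next().await {
        let (_entry_id, _wal_entry) = item?;
        // ...apply the decoded mutations to the memtable, as `replay_memtable` does...
    }
    // Entries up to the flushed id are no longer needed after a successful replay.
    wal.obsolete(region_id, flushed_entry_id, &provider).await
}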
@@ -106,7 +106,7 @@ impl RegionWriteCtx { pub(crate) fn new( region_id: RegionId, version_control: &VersionControlRef, - wal_options: WalOptions, + provider: Provider, ) -> RegionWriteCtx { let VersionControlData { version, @@ -122,7 +122,7 @@ impl RegionWriteCtx { next_sequence: committed_sequence + 1, next_entry_id: last_entry_id + 1, wal_entry: WalEntry::default(), - wal_options, + provider, notifiers: Vec::new(), failed: false, put_num: 0, @@ -163,7 +163,7 @@ impl RegionWriteCtx { self.region_id, self.next_entry_id, &self.wal_entry, - &self.wal_options, + &self.provider, )?; self.next_entry_id += 1; Ok(()) diff --git a/src/mito2/src/test_util.rs b/src/mito2/src/test_util.rs index 78dbd1c3362b..d7c671962c03 100644 --- a/src/mito2/src/test_util.rs +++ b/src/mito2/src/test_util.rs @@ -20,6 +20,7 @@ pub mod meta_util; pub mod scheduler_util; pub mod sst_util; pub mod version_util; +pub mod wal_util; use std::collections::HashMap; use std::path::Path; @@ -356,11 +357,11 @@ impl TestEnv { }; if let Some(metadata) = initial_metadata { - RegionManifestManager::new(metadata, manifest_opts) + RegionManifestManager::new(metadata, manifest_opts, Default::default()) .await .map(Some) } else { - RegionManifestManager::open(manifest_opts).await + RegionManifestManager::open(manifest_opts, Default::default()).await } } diff --git a/src/mito2/src/test_util/scheduler_util.rs b/src/mito2/src/test_util/scheduler_util.rs index bfaf569123ed..a47d9d4e7c63 100644 --- a/src/mito2/src/test_util/scheduler_util.rs +++ b/src/mito2/src/test_util/scheduler_util.rs @@ -109,6 +109,7 @@ impl SchedulerEnv { compress_type: CompressionType::Uncompressed, checkpoint_distance: 10, }, + Default::default(), ) .await .unwrap(), diff --git a/src/mito2/src/test_util/wal_util.rs b/src/mito2/src/test_util/wal_util.rs new file mode 100644 index 000000000000..823242faae23 --- /dev/null +++ b/src/mito2/src/test_util/wal_util.rs @@ -0,0 +1,68 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::WalEntry; +use futures::stream; +use prost::Message; +use store_api::logstore::entry::{Entry, MultiplePartEntry, MultiplePartHeader}; +use store_api::logstore::provider::Provider; +use store_api::logstore::EntryId; +use store_api::storage::RegionId; + +use crate::error::Result; +use crate::wal::raw_entry_reader::{EntryStream, RawEntryReader}; + +pub(crate) struct MockRawEntryStream { + pub(crate) entries: Vec, +} + +impl RawEntryReader for MockRawEntryStream { + fn read(&self, _ns: &Provider, _start_id: EntryId) -> Result> { + Ok(Box::pin(stream::iter( + self.entries.clone().into_iter().map(Ok), + ))) + } +} + +/// Puts an incomplete [`Entry`] at the end of `input`. 
+pub(crate) fn generate_tail_corrupted_stream( + provider: Provider, + region_id: RegionId, + input: &WalEntry, + num_parts: usize, +) -> Vec { + let encoded_entry = input.encode_to_vec(); + let parts = encoded_entry + .chunks(encoded_entry.len() / num_parts) + .map(Into::into) + .collect::>(); + + vec![ + Entry::MultiplePart(MultiplePartEntry { + provider: provider.clone(), + region_id, + entry_id: 0, + headers: vec![MultiplePartHeader::First, MultiplePartHeader::Last], + parts, + }), + // The tail corrupted data. + Entry::MultiplePart(MultiplePartEntry { + provider: provider.clone(), + region_id, + entry_id: 0, + headers: vec![MultiplePartHeader::First], + parts: vec![vec![1; 100]], + }), + ] +} diff --git a/src/mito2/src/wal.rs b/src/mito2/src/wal.rs index 0b3b8282833c..c0cca52dcbbd 100644 --- a/src/mito2/src/wal.rs +++ b/src/mito2/src/wal.rs @@ -16,30 +16,31 @@ /// TODO(weny): remove it #[allow(unused)] -pub(crate) mod raw_entry_reader; +pub(crate) mod entry_distributor; +/// TODO(weny): remove it +#[allow(unused)] +pub(crate) mod entry_reader; /// TODO(weny): remove it #[allow(unused)] -pub(crate) mod wal_entry_reader; +pub(crate) mod raw_entry_reader; use std::collections::HashMap; use std::mem; use std::sync::Arc; use api::v1::WalEntry; -use async_stream::try_stream; use common_error::ext::BoxedError; -use common_wal::options::WalOptions; use futures::stream::BoxStream; -use futures::StreamExt; use prost::Message; use snafu::ResultExt; use store_api::logstore::entry::Entry; +use store_api::logstore::provider::Provider; use store_api::logstore::{AppendBatchResponse, LogStore}; use store_api::storage::RegionId; -use crate::error::{ - DecodeWalSnafu, DeleteWalSnafu, EncodeWalSnafu, ReadWalSnafu, Result, WriteWalSnafu, -}; +use crate::error::{BuildEntrySnafu, DeleteWalSnafu, EncodeWalSnafu, Result, WriteWalSnafu}; +use crate::wal::entry_reader::{LogStoreEntryReader, WalEntryReader}; +use crate::wal::raw_entry_reader::{LogStoreRawEntryReader, RegionRawEntryReader}; /// WAL entry id. pub type EntryId = store_api::logstore::entry::Id; @@ -60,6 +61,10 @@ impl Wal { pub fn new(store: Arc) -> Self { Self { store } } + + pub fn store(&self) -> &Arc { + &self.store + } } impl Clone for Wal { @@ -77,7 +82,7 @@ impl Wal { store: self.store.clone(), entries: Vec::new(), entry_encode_buf: Vec::new(), - namespaces: HashMap::new(), + providers: HashMap::new(), } } @@ -86,29 +91,19 @@ impl Wal { &'a self, region_id: RegionId, start_id: EntryId, - wal_options: &'a WalOptions, - ) -> Result { - let stream = try_stream!({ - let namespace = self.store.namespace(region_id.into(), wal_options); - let mut stream = self - .store - .read(&namespace, start_id) - .await - .map_err(BoxedError::new) - .context(ReadWalSnafu { region_id })?; - - while let Some(entries) = stream.next().await { - let entries = entries - .map_err(BoxedError::new) - .context(ReadWalSnafu { region_id })?; - - for entry in entries { - yield decode_entry(region_id, entry)?; - } + namespace: &'a Provider, + ) -> Result> { + match namespace { + Provider::RaftEngine(_) => { + LogStoreEntryReader::new(LogStoreRawEntryReader::new(self.store.clone())) + .read(namespace, start_id) } - }); - - Ok(Box::pin(stream)) + Provider::Kafka(_) => LogStoreEntryReader::new(RegionRawEntryReader::new( + LogStoreRawEntryReader::new(self.store.clone()), + region_id, + )) + .read(namespace, start_id), + } } /// Mark entries whose ids `<= last_id` as deleted. 
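// Conceptual sketch (the helper below is hypothetical, not the crate's
// `RegionRawEntryReader`): Kafka-backed regions share one topic, so the
// `Provider::Kafka` branch of `scan` above narrows the raw entry stream to a
// single region before decoding, conceptually equivalent to this filter.
use store_api::logstore::entry::Entry;
use store_api::storage::RegionId;

fn keep_region_entries(entries: Vec<Entry>, region_id: RegionId) -> Vec<Entry> {
    entries
        .into_iter()
        .filter(|entry| entry.region_id() == region_id)
        .collect()
}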
@@ -116,37 +111,26 @@ impl Wal { &self, region_id: RegionId, last_id: EntryId, - wal_options: &WalOptions, + provider: &Provider, ) -> Result<()> { - let namespace = self.store.namespace(region_id.into(), wal_options); self.store - .obsolete(namespace, last_id) + .obsolete(provider, last_id) .await .map_err(BoxedError::new) .context(DeleteWalSnafu { region_id }) } } -/// Decode Wal entry from log store. -fn decode_entry(region_id: RegionId, entry: E) -> Result<(EntryId, WalEntry)> { - let entry_id = entry.id(); - let data = entry.data(); - - let wal_entry = WalEntry::decode(data).context(DecodeWalSnafu { region_id })?; - - Ok((entry_id, wal_entry)) -} - /// WAL batch writer. pub struct WalWriter { /// Log store of the WAL. store: Arc, /// Entries to write. - entries: Vec, + entries: Vec, /// Buffer to encode WAL entry. entry_encode_buf: Vec, - /// Namespaces of regions being written into. - namespaces: HashMap, + /// Providers of regions being written into. + providers: HashMap, } impl WalWriter { @@ -156,14 +140,13 @@ impl WalWriter { region_id: RegionId, entry_id: EntryId, wal_entry: &WalEntry, - wal_options: &WalOptions, + provider: &Provider, ) -> Result<()> { - // Gets or inserts with a newly built namespace. - let namespace = self - .namespaces + // Gets or inserts with a newly built provider. + let provider = self + .providers .entry(region_id) - .or_insert_with(|| self.store.namespace(region_id.into(), wal_options)) - .clone(); + .or_insert_with(|| provider.clone()); // Encode wal entry to log store entry. self.entry_encode_buf.clear(); @@ -172,7 +155,9 @@ impl WalWriter { .context(EncodeWalSnafu { region_id })?; let entry = self .store - .entry(&mut self.entry_encode_buf, entry_id, namespace); + .entry(&mut self.entry_encode_buf, entry_id, region_id, provider) + .map_err(BoxedError::new) + .context(BuildEntrySnafu { region_id })?; self.entries.push(entry); @@ -272,7 +257,6 @@ mod tests { async fn test_write_wal() { let env = WalEnv::new().await; let wal = env.new_wal(); - let wal_options = WalOptions::default(); let entry = WalEntry { mutations: vec![ @@ -282,16 +266,34 @@ mod tests { }; let mut writer = wal.writer(); // Region 1 entry 1. + let region_id = RegionId::new(1, 1); writer - .add_entry(RegionId::new(1, 1), 1, &entry, &wal_options) + .add_entry( + region_id, + 1, + &entry, + &Provider::raft_engine_provider(region_id.as_u64()), + ) .unwrap(); // Region 2 entry 1. + let region_id = RegionId::new(1, 2); writer - .add_entry(RegionId::new(1, 2), 1, &entry, &wal_options) + .add_entry( + region_id, + 1, + &entry, + &Provider::raft_engine_provider(region_id.as_u64()), + ) .unwrap(); // Region 1 entry 2. + let region_id = RegionId::new(1, 2); writer - .add_entry(RegionId::new(1, 1), 2, &entry, &wal_options) + .add_entry( + region_id, + 2, + &entry, + &Provider::raft_engine_provider(region_id.as_u64()), + ) .unwrap(); // Test writing multiple region to wal. @@ -339,32 +341,33 @@ mod tests { async fn test_scan_wal() { let env = WalEnv::new().await; let wal = env.new_wal(); - let wal_options = WalOptions::default(); let entries = sample_entries(); let (id1, id2) = (RegionId::new(1, 1), RegionId::new(1, 2)); + let ns1 = Provider::raft_engine_provider(id1.as_u64()); + let ns2 = Provider::raft_engine_provider(id2.as_u64()); let mut writer = wal.writer(); - writer.add_entry(id1, 1, &entries[0], &wal_options).unwrap(); + writer.add_entry(id1, 1, &entries[0], &ns1).unwrap(); // Insert one entry into region2. Scan should not return this entry. 
- writer.add_entry(id2, 1, &entries[0], &wal_options).unwrap(); - writer.add_entry(id1, 2, &entries[1], &wal_options).unwrap(); - writer.add_entry(id1, 3, &entries[2], &wal_options).unwrap(); - writer.add_entry(id1, 4, &entries[3], &wal_options).unwrap(); + writer.add_entry(id2, 1, &entries[0], &ns2).unwrap(); + writer.add_entry(id1, 2, &entries[1], &ns1).unwrap(); + writer.add_entry(id1, 3, &entries[2], &ns1).unwrap(); + writer.add_entry(id1, 4, &entries[3], &ns1).unwrap(); writer.write_to_wal().await.unwrap(); // Scan all contents region1 - let stream = wal.scan(id1, 1, &wal_options).unwrap(); + let stream = wal.scan(id1, 1, &ns1).unwrap(); let actual: Vec<_> = stream.try_collect().await.unwrap(); check_entries(&entries, 1, &actual); // Scan parts of contents - let stream = wal.scan(id1, 2, &wal_options).unwrap(); + let stream = wal.scan(id1, 2, &ns1).unwrap(); let actual: Vec<_> = stream.try_collect().await.unwrap(); check_entries(&entries[1..], 2, &actual); // Scan out of range - let stream = wal.scan(id1, 5, &wal_options).unwrap(); + let stream = wal.scan(id1, 5, &ns1).unwrap(); let actual: Vec<_> = stream.try_collect().await.unwrap(); assert!(actual.is_empty()); } @@ -373,35 +376,27 @@ mod tests { async fn test_obsolete_wal() { let env = WalEnv::new().await; let wal = env.new_wal(); - let wal_options = WalOptions::default(); let entries = sample_entries(); let mut writer = wal.writer(); let region_id = RegionId::new(1, 1); - writer - .add_entry(region_id, 1, &entries[0], &wal_options) - .unwrap(); - writer - .add_entry(region_id, 2, &entries[1], &wal_options) - .unwrap(); - writer - .add_entry(region_id, 3, &entries[2], &wal_options) - .unwrap(); + let ns = Provider::raft_engine_provider(region_id.as_u64()); + writer.add_entry(region_id, 1, &entries[0], &ns).unwrap(); + writer.add_entry(region_id, 2, &entries[1], &ns).unwrap(); + writer.add_entry(region_id, 3, &entries[2], &ns).unwrap(); writer.write_to_wal().await.unwrap(); // Delete 1, 2. - wal.obsolete(region_id, 2, &wal_options).await.unwrap(); + wal.obsolete(region_id, 2, &ns).await.unwrap(); // Put 4. let mut writer = wal.writer(); - writer - .add_entry(region_id, 4, &entries[3], &wal_options) - .unwrap(); + writer.add_entry(region_id, 4, &entries[3], &ns).unwrap(); writer.write_to_wal().await.unwrap(); // Scan all - let stream = wal.scan(region_id, 1, &wal_options).unwrap(); + let stream = wal.scan(region_id, 1, &ns).unwrap(); let actual: Vec<_> = stream.try_collect().await.unwrap(); check_entries(&entries[2..], 3, &actual); } diff --git a/src/mito2/src/wal/entry_distributor.rs b/src/mito2/src/wal/entry_distributor.rs new file mode 100644 index 000000000000..dacb1d3ae9dc --- /dev/null +++ b/src/mito2/src/wal/entry_distributor.rs @@ -0,0 +1,634 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
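// Usage sketch for the distributor/receiver pair introduced in this new file.
// It is illustrative only: the function name, topic, region id, the empty
// `MockRawEntryStream` from `test_util::wal_util`, and driving both sides with
// `futures::join!` are assumptions, not the crate's actual call site. The point
// is the ordering contract documented further below: every receiver registers
// via `read` before the distributor is driven.
use std::sync::Arc;

use futures::TryStreamExt;
use store_api::logstore::provider::Provider;
use store_api::storage::RegionId;

use crate::error::Result;
use crate::test_util::wal_util::MockRawEntryStream;
use crate::wal::entry_distributor::build_wal_entry_distributor_and_receivers;
use crate::wal::entry_reader::WalEntryReader;

async fn fan_out_example() -> Result<()> {
    let provider = Provider::kafka_provider("my_topic".to_string());
    let region_id = RegionId::new(1024, 1);
    // An empty mock reader keeps the sketch self-contained; a real caller reads
    // the shared Kafka topic through the log store.
    let reader = Arc::new(MockRawEntryStream { entries: vec![] });
    let (distributor, mut receivers) = build_wal_entry_distributor_and_receivers(
        provider.clone(),
        reader,
        vec![region_id],
        128,
    );
    // Register the receiver first: `read` hands its start id to the distributor.
    let stream = receivers.pop().unwrap().read(&provider, 0)?;
    // Drive the distributor and the consumer together; awaiting `distribute`
    // alone could stall once a bounded channel fills up.
    let (distributed, entries) =
        futures::join!(distributor.distribute(), stream.try_collect::<Vec<_>>());
    distributed?;
    let _per_region_entries = entries?;
    Ok(())
}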
+ +use std::cmp::min; +use std::collections::HashMap; +use std::sync::Arc; + +use api::v1::WalEntry; +use async_stream::stream; +use common_telemetry::{debug, error}; +use futures::future::join_all; +use snafu::ensure; +use store_api::logstore::entry::Entry; +use store_api::logstore::provider::Provider; +use store_api::storage::RegionId; +use tokio::sync::mpsc::{self, Receiver, Sender, UnboundedReceiver, UnboundedSender}; +use tokio::sync::oneshot; +use tokio_stream::StreamExt; + +use crate::error::{self, Result}; +use crate::wal::entry_reader::{decode_raw_entry, WalEntryReader}; +use crate::wal::raw_entry_reader::RawEntryReader; +use crate::wal::{EntryId, WalEntryStream}; + +/// [WalEntryDistributor] distributes Wal entries to specific [WalEntryReceiver]s based on [RegionId]. +pub(crate) struct WalEntryDistributor { + raw_wal_reader: Arc, + provider: Provider, + /// Sends [Entry] to receivers based on [RegionId] + senders: HashMap>, + /// Waits for the arg from the [WalEntryReader]. + arg_receivers: Vec<(RegionId, oneshot::Receiver)>, +} + +impl WalEntryDistributor { + /// Distributes entries to specific [WalEntryReceiver]s based on [RegionId]. + pub async fn distribute(mut self) -> Result<()> { + let arg_futures = self + .arg_receivers + .iter_mut() + .map(|(region_id, receiver)| async { (*region_id, receiver.await.ok()) }); + let args = join_all(arg_futures) + .await + .into_iter() + .filter_map(|(region_id, start_id)| start_id.map(|start_id| (region_id, start_id))) + .collect::>(); + + // No subscribers + if args.is_empty() { + return Ok(()); + } + // Safety: must exist + let min_start_id = args.iter().map(|(_, start_id)| *start_id).min().unwrap(); + let receivers: HashMap<_, _> = args + .into_iter() + .map(|(region_id, start_id)| { + ( + region_id, + EntryReceiver { + start_id, + sender: self.senders[®ion_id].clone(), + }, + ) + }) + .collect(); + + let mut stream = self.raw_wal_reader.read(&self.provider, min_start_id)?; + while let Some(entry) = stream.next().await { + let entry = entry?; + let entry_id = entry.entry_id(); + let region_id = entry.region_id(); + + if let Some(EntryReceiver { sender, start_id }) = receivers.get(®ion_id) { + if entry_id >= *start_id { + if let Err(err) = sender.send(entry).await { + error!(err; "Failed to distribute raw entry, entry_id:{}, region_id: {}", entry_id, region_id); + } + } + } else { + debug!("Subscriber not found, region_id: {}", region_id); + } + } + + Ok(()) + } +} + +/// Receives the Wal entries from [WalEntryDistributor]. +#[derive(Debug)] +pub(crate) struct WalEntryReceiver { + region_id: RegionId, + /// Receives the [Entry] from the [WalEntryDistributor]. + entry_receiver: Receiver, + /// Sends the `start_id` to the [WalEntryDistributor]. + arg_sender: oneshot::Sender, +} + +impl WalEntryReceiver { + pub fn new( + region_id: RegionId, + entry_receiver: Receiver, + arg_sender: oneshot::Sender, + ) -> Self { + Self { + region_id, + entry_receiver, + arg_sender, + } + } +} + +impl WalEntryReader for WalEntryReceiver { + fn read(self, provider: &Provider, start_id: EntryId) -> Result> { + let WalEntryReceiver { + region_id: expected_region_id, + mut entry_receiver, + arg_sender, + } = self; + + if arg_sender.send(start_id).is_err() { + return error::InvalidWalReadRequestSnafu { + reason: format!( + "WalEntryDistributor is dropped, failed to send arg, start_id: {start_id}" + ), + } + .fail(); + } + + let stream = stream! 
{ + let mut buffered_entry = None; + while let Some(next_entry) = entry_receiver.recv().await { + match buffered_entry.take() { + Some(entry) => { + yield decode_raw_entry(entry); + buffered_entry = Some(next_entry); + }, + None => { + buffered_entry = Some(next_entry); + } + }; + } + if let Some(entry) = buffered_entry { + // Ignores tail corrupted data. + if entry.is_complete() { + yield decode_raw_entry(entry); + } + } + }; + + Ok(Box::pin(stream)) + } +} + +struct EntryReceiver { + start_id: EntryId, + sender: Sender, +} + +/// Returns [WalEntryDistributor] and batch [WalEntryReceiver]s. +/// +/// ### Note: +/// Ensures `receiver.read` is called before the `distributor.distribute` in the same thread. +/// +/// ```text +/// let (distributor, receivers) = build_wal_entry_distributor_and_receivers(..); +/// Thread 1 | +/// | +/// // may deadlock | +/// distributor.distribute().await; | +/// | +/// | +/// receivers[0].read().await | +/// ``` +/// +pub fn build_wal_entry_distributor_and_receivers( + provider: Provider, + raw_wal_reader: Arc, + region_ids: Vec, + buffer_size: usize, +) -> (WalEntryDistributor, Vec) { + let mut senders = HashMap::with_capacity(region_ids.len()); + let mut readers = Vec::with_capacity(region_ids.len()); + let mut arg_receivers = Vec::with_capacity(region_ids.len()); + + for region_id in region_ids { + let (entry_sender, entry_receiver) = mpsc::channel(buffer_size); + let (arg_sender, arg_receiver) = oneshot::channel(); + + senders.insert(region_id, entry_sender); + arg_receivers.push((region_id, arg_receiver)); + readers.push(WalEntryReceiver::new(region_id, entry_receiver, arg_sender)); + } + + ( + WalEntryDistributor { + provider, + raw_wal_reader, + senders, + arg_receivers, + }, + readers, + ) +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + + use api::v1::{Mutation, OpType}; + use futures::{stream, TryStreamExt}; + use prost::Message; + use store_api::logstore::entry::{Entry, MultiplePartEntry, MultiplePartHeader, NaiveEntry}; + + use super::*; + use crate::test_util::wal_util::generate_tail_corrupted_stream; + use crate::wal::raw_entry_reader::{EntryStream, RawEntryReader}; + use crate::wal::EntryId; + + struct MockRawEntryReader { + entries: Vec, + } + + impl MockRawEntryReader { + pub fn new(entries: Vec) -> MockRawEntryReader { + Self { entries } + } + } + + impl RawEntryReader for MockRawEntryReader { + fn read(&self, provider: &Provider, _start_id: EntryId) -> Result> { + let stream = stream::iter(self.entries.clone().into_iter().map(Ok)); + Ok(Box::pin(stream)) + } + } + + #[tokio::test] + async fn test_wal_entry_distributor_without_receivers() { + let provider = Provider::kafka_provider("my_topic".to_string()); + let reader = Arc::new(MockRawEntryReader::new(vec![Entry::Naive(NaiveEntry { + region_id: RegionId::new(1024, 1), + provider: provider.clone(), + entry_id: 1, + data: vec![1], + })])); + + let (distributor, receivers) = build_wal_entry_distributor_and_receivers( + provider, + reader, + vec![RegionId::new(1024, 1), RegionId::new(1025, 1)], + 128, + ); + + // Drops all receivers + drop(receivers); + // Returns immediately + distributor.distribute().await.unwrap(); + } + + #[tokio::test] + async fn test_wal_entry_distributor() { + common_telemetry::init_default_ut_logging(); + let provider = Provider::kafka_provider("my_topic".to_string()); + let reader = Arc::new(MockRawEntryReader::new(vec![ + Entry::Naive(NaiveEntry { + provider: provider.clone(), + region_id: RegionId::new(1024, 1), + entry_id: 1, + data: 
WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 1u64, + rows: None, + }], + } + .encode_to_vec(), + }), + Entry::Naive(NaiveEntry { + provider: provider.clone(), + region_id: RegionId::new(1024, 2), + entry_id: 2, + data: WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 2u64, + rows: None, + }], + } + .encode_to_vec(), + }), + Entry::Naive(NaiveEntry { + provider: provider.clone(), + region_id: RegionId::new(1024, 3), + entry_id: 3, + data: WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 3u64, + rows: None, + }], + } + .encode_to_vec(), + }), + ])); + + // Builds distributor and receivers + let (distributor, mut receivers) = build_wal_entry_distributor_and_receivers( + provider.clone(), + reader, + vec![ + RegionId::new(1024, 1), + RegionId::new(1024, 2), + RegionId::new(1024, 3), + ], + 128, + ); + assert_eq!(receivers.len(), 3); + + // Should be okay if one of receiver is dropped. + let last = receivers.pop().unwrap(); + drop(last); + + let mut streams = receivers + .into_iter() + .map(|receiver| receiver.read(&provider, 0).unwrap()) + .collect::>(); + distributor.distribute().await.unwrap(); + let entries = streams + .get_mut(0) + .unwrap() + .try_collect::>() + .await + .unwrap(); + assert_eq!( + entries, + vec![( + 1, + WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 1u64, + rows: None, + }], + } + )] + ); + let entries = streams + .get_mut(1) + .unwrap() + .try_collect::>() + .await + .unwrap(); + assert_eq!( + entries, + vec![( + 2, + WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 2u64, + rows: None, + }], + } + )] + ); + } + + #[tokio::test] + async fn test_tail_corrupted_stream() { + let mut entries = vec![]; + let region1 = RegionId::new(1, 1); + let region1_expected_wal_entry = WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 1u64, + rows: None, + }], + }; + let region2 = RegionId::new(1, 2); + let region2_expected_wal_entry = WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 3u64, + rows: None, + }], + }; + let region3 = RegionId::new(1, 3); + let region3_expected_wal_entry = WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 3u64, + rows: None, + }], + }; + let provider = Provider::kafka_provider("my_topic".to_string()); + entries.extend(generate_tail_corrupted_stream( + provider.clone(), + region1, + ®ion1_expected_wal_entry, + 3, + )); + entries.extend(generate_tail_corrupted_stream( + provider.clone(), + region2, + ®ion2_expected_wal_entry, + 2, + )); + entries.extend(generate_tail_corrupted_stream( + provider.clone(), + region3, + ®ion3_expected_wal_entry, + 4, + )); + + let corrupted_stream = MockRawEntryReader { entries }; + // Builds distributor and receivers + let (distributor, mut receivers) = build_wal_entry_distributor_and_receivers( + provider.clone(), + Arc::new(corrupted_stream), + vec![region1, region2, region3], + 128, + ); + assert_eq!(receivers.len(), 3); + let mut streams = receivers + .into_iter() + .map(|receiver| receiver.read(&provider, 0).unwrap()) + .collect::>(); + distributor.distribute().await.unwrap(); + + assert_eq!( + streams + .get_mut(0) + .unwrap() + .try_collect::>() + .await + .unwrap(), + vec![(0, region1_expected_wal_entry)] + ); + + assert_eq!( + streams + .get_mut(1) + .unwrap() + .try_collect::>() + .await + .unwrap(), + vec![(0, region2_expected_wal_entry)] + ); + + 
assert_eq!( + streams + .get_mut(2) + .unwrap() + .try_collect::>() + .await + .unwrap(), + vec![(0, region3_expected_wal_entry)] + ); + } + + #[tokio::test] + async fn test_part_corrupted_stream() { + let mut entries = vec![]; + let region1 = RegionId::new(1, 1); + let region1_expected_wal_entry = WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 1u64, + rows: None, + }], + }; + let region2 = RegionId::new(1, 2); + let provider = Provider::kafka_provider("my_topic".to_string()); + entries.extend(generate_tail_corrupted_stream( + provider.clone(), + region1, + ®ion1_expected_wal_entry, + 3, + )); + entries.extend(vec![ + // The corrupted data. + Entry::MultiplePart(MultiplePartEntry { + provider: provider.clone(), + region_id: region2, + entry_id: 0, + headers: vec![MultiplePartHeader::First], + parts: vec![vec![1; 100]], + }), + Entry::MultiplePart(MultiplePartEntry { + provider: provider.clone(), + region_id: region2, + entry_id: 0, + headers: vec![MultiplePartHeader::First], + parts: vec![vec![1; 100]], + }), + ]); + + let corrupted_stream = MockRawEntryReader { entries }; + // Builds distributor and receivers + let (distributor, mut receivers) = build_wal_entry_distributor_and_receivers( + provider.clone(), + Arc::new(corrupted_stream), + vec![region1, region2], + 128, + ); + assert_eq!(receivers.len(), 2); + let mut streams = receivers + .into_iter() + .map(|receiver| receiver.read(&provider, 0).unwrap()) + .collect::>(); + distributor.distribute().await.unwrap(); + assert_eq!( + streams + .get_mut(0) + .unwrap() + .try_collect::>() + .await + .unwrap(), + vec![(0, region1_expected_wal_entry)] + ); + + assert_matches!( + streams + .get_mut(1) + .unwrap() + .try_collect::>() + .await + .unwrap_err(), + error::Error::CorruptedEntry { .. 
} + ); + } + + #[tokio::test] + async fn test_wal_entry_receiver_start_id() { + let provider = Provider::kafka_provider("my_topic".to_string()); + let reader = Arc::new(MockRawEntryReader::new(vec![ + Entry::Naive(NaiveEntry { + provider: provider.clone(), + region_id: RegionId::new(1024, 1), + entry_id: 1, + data: WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 1u64, + rows: None, + }], + } + .encode_to_vec(), + }), + Entry::Naive(NaiveEntry { + provider: provider.clone(), + region_id: RegionId::new(1024, 2), + entry_id: 2, + data: WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 2u64, + rows: None, + }], + } + .encode_to_vec(), + }), + Entry::Naive(NaiveEntry { + provider: provider.clone(), + region_id: RegionId::new(1024, 1), + entry_id: 3, + data: WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 3u64, + rows: None, + }], + } + .encode_to_vec(), + }), + Entry::Naive(NaiveEntry { + provider: provider.clone(), + region_id: RegionId::new(1024, 2), + entry_id: 4, + data: WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 4u64, + rows: None, + }], + } + .encode_to_vec(), + }), + ])); + + // Builds distributor and receivers + let (distributor, mut receivers) = build_wal_entry_distributor_and_receivers( + provider.clone(), + reader, + vec![RegionId::new(1024, 1), RegionId::new(1024, 2)], + 128, + ); + assert_eq!(receivers.len(), 2); + let mut streams = receivers + .into_iter() + .map(|receiver| receiver.read(&provider, 4).unwrap()) + .collect::>(); + distributor.distribute().await.unwrap(); + + assert_eq!( + streams + .get_mut(1) + .unwrap() + .try_collect::>() + .await + .unwrap(), + vec![( + 4, + WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 4u64, + rows: None, + }], + } + )] + ); + } +} diff --git a/src/mito2/src/wal/entry_reader.rs b/src/mito2/src/wal/entry_reader.rs new file mode 100644 index 000000000000..c29a5e629d5c --- /dev/null +++ b/src/mito2/src/wal/entry_reader.rs @@ -0,0 +1,184 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use api::v1::WalEntry; +use async_stream::stream; +use common_telemetry::info; +use futures::StreamExt; +use prost::Message; +use snafu::{ensure, ResultExt}; +use store_api::logstore::entry::Entry; +use store_api::logstore::provider::Provider; +use store_api::storage::RegionId; + +use crate::error::{CorruptedEntrySnafu, DecodeWalSnafu, Result}; +use crate::wal::raw_entry_reader::RawEntryReader; +use crate::wal::{EntryId, WalEntryStream}; + +pub(crate) fn decode_raw_entry(raw_entry: Entry) -> Result<(EntryId, WalEntry)> { + let entry_id = raw_entry.entry_id(); + let region_id = raw_entry.region_id(); + ensure!(raw_entry.is_complete(), CorruptedEntrySnafu { region_id }); + // TODO(weny): implement the [Buf] for return value, avoid extra memory allocation. 
+ let bytes = raw_entry.into_bytes(); + let wal_entry = WalEntry::decode(bytes.as_slice()).context(DecodeWalSnafu { region_id })?; + + Ok((entry_id, wal_entry)) +} + +/// [WalEntryReader] provides the ability to read and decode entries from the underlying store. +pub(crate) trait WalEntryReader: Send + Sync { + fn read(self, ns: &'_ Provider, start_id: EntryId) -> Result>; +} + +/// A Reader reads the [RawEntry] from [RawEntryReader] and decodes [RawEntry] into [WalEntry]. +pub struct LogStoreEntryReader { + reader: R, +} + +impl LogStoreEntryReader { + pub fn new(reader: R) -> Self { + Self { reader } + } +} + +impl WalEntryReader for LogStoreEntryReader { + fn read(self, ns: &'_ Provider, start_id: EntryId) -> Result> { + let LogStoreEntryReader { reader } = self; + let mut stream = reader.read(ns, start_id)?; + + let stream = stream! { + let mut buffered_entry = None; + while let Some(next_entry) = stream.next().await { + match buffered_entry.take() { + Some(entry) => { + yield decode_raw_entry(entry); + buffered_entry = Some(next_entry?); + }, + None => { + buffered_entry = Some(next_entry?); + } + }; + } + if let Some(entry) = buffered_entry { + // Ignores tail corrupted data. + if entry.is_complete() { + yield decode_raw_entry(entry); + } + } + }; + + Ok(Box::pin(stream)) + } +} + +#[cfg(test)] +mod tests { + use std::assert_matches::assert_matches; + + use api::v1::{Mutation, OpType, WalEntry}; + use futures::{stream, TryStreamExt}; + use prost::Message; + use store_api::logstore::entry::{Entry, MultiplePartEntry, MultiplePartHeader}; + use store_api::logstore::provider::Provider; + use store_api::storage::RegionId; + + use crate::error::{self, Result}; + use crate::test_util::wal_util::MockRawEntryStream; + use crate::wal::entry_reader::{LogStoreEntryReader, WalEntryReader}; + use crate::wal::raw_entry_reader::{EntryStream, RawEntryReader}; + use crate::wal::EntryId; + + #[tokio::test] + async fn test_tail_corrupted_stream() { + common_telemetry::init_default_ut_logging(); + let provider = Provider::kafka_provider("my_topic".to_string()); + let wal_entry = WalEntry { + mutations: vec![Mutation { + op_type: OpType::Put as i32, + sequence: 1u64, + rows: None, + }], + }; + let encoded_entry = wal_entry.encode_to_vec(); + let parts = encoded_entry + .chunks(encoded_entry.len() / 2) + .map(Into::into) + .collect::>(); + let raw_entry_stream = MockRawEntryStream { + entries: vec![ + Entry::MultiplePart(MultiplePartEntry { + provider: provider.clone(), + region_id: RegionId::new(1, 1), + entry_id: 2, + headers: vec![MultiplePartHeader::First, MultiplePartHeader::Last], + parts, + }), + // The tail corrupted data. 
+ Entry::MultiplePart(MultiplePartEntry { + provider: provider.clone(), + region_id: RegionId::new(1, 1), + entry_id: 1, + headers: vec![MultiplePartHeader::Last], + parts: vec![vec![1; 100]], + }), + ], + }; + + let reader = LogStoreEntryReader::new(raw_entry_stream); + let entries = reader + .read(&provider, 0) + .unwrap() + .try_collect::>() + .await + .unwrap() + .into_iter() + .map(|(_, entry)| entry) + .collect::>(); + + assert_eq!(entries, vec![wal_entry]); + } + + #[tokio::test] + async fn test_corrupted_stream() { + let provider = Provider::kafka_provider("my_topic".to_string()); + let raw_entry_stream = MockRawEntryStream { + entries: vec![ + Entry::MultiplePart(MultiplePartEntry { + provider: provider.clone(), + region_id: RegionId::new(1, 1), + entry_id: 1, + headers: vec![MultiplePartHeader::Last], + parts: vec![vec![1; 100]], + }), + Entry::MultiplePart(MultiplePartEntry { + provider: provider.clone(), + region_id: RegionId::new(1, 1), + entry_id: 2, + headers: vec![MultiplePartHeader::First], + parts: vec![vec![1; 100]], + }), + ], + }; + + let reader = LogStoreEntryReader::new(raw_entry_stream); + let err = reader + .read(&provider, 0) + .unwrap() + .try_collect::>() + .await + .unwrap_err(); + assert_matches!(err, error::Error::CorruptedEntry { .. }); + } +} diff --git a/src/mito2/src/wal/raw_entry_reader.rs b/src/mito2/src/wal/raw_entry_reader.rs index 57cee5845e50..d8afc7915119 100644 --- a/src/mito2/src/wal/raw_entry_reader.rs +++ b/src/mito2/src/wal/raw_entry_reader.rs @@ -20,7 +20,8 @@ use common_wal::options::{KafkaWalOptions, WalOptions}; use futures::stream::BoxStream; use futures::TryStreamExt; use snafu::ResultExt; -use store_api::logstore::entry::{Entry, RawEntry}; +use store_api::logstore::entry::Entry; +use store_api::logstore::provider::{KafkaProvider, Provider, RaftEngineProvider}; use store_api::logstore::LogStore; use store_api::storage::RegionId; use tokio_stream::StreamExt; @@ -28,38 +29,12 @@ use tokio_stream::StreamExt; use crate::error::{self, Result}; use crate::wal::EntryId; -/// A stream that yields [RawEntry]. -pub type RawEntryStream<'a> = BoxStream<'a, Result>; +/// A stream that yields [Entry]. +pub type EntryStream<'a> = BoxStream<'a, Result>; -// The namespace of kafka log store -pub struct KafkaNamespace<'a> { - topic: &'a str, -} - -// The namespace of raft engine log store -pub struct RaftEngineNamespace { - region_id: RegionId, -} - -impl RaftEngineNamespace { - pub fn new(region_id: RegionId) -> Self { - Self { region_id } - } -} - -/// The namespace of [RawEntryReader]. -pub(crate) enum LogStoreNamespace<'a> { - RaftEngine(RaftEngineNamespace), - Kafka(KafkaNamespace<'a>), -} - -/// [RawEntryReader] provides the ability to read [RawEntry] from the underlying [LogStore]. +/// [RawEntryReader] provides the ability to read [Entry] from the underlying [LogStore]. pub(crate) trait RawEntryReader: Send + Sync { - fn read<'a>( - &'a self, - ctx: LogStoreNamespace<'a>, - start_id: EntryId, - ) -> Result>; + fn read(&self, provider: &Provider, start_id: EntryId) -> Result>; } /// Implement the [RawEntryReader] for the [LogStore]. @@ -67,66 +42,35 @@ pub struct LogStoreRawEntryReader { store: Arc, } -impl LogStoreRawEntryReader { +impl LogStoreRawEntryReader { pub fn new(store: Arc) -> Self { Self { store } } +} - fn read_region(&self, ns: RaftEngineNamespace, start_id: EntryId) -> Result { - let region_id = ns.region_id; - let stream = try_stream!({ - // TODO(weny): refactor the `namespace` method. 
- let namespace = self.store.namespace(region_id.into(), &Default::default()); - let mut stream = self - .store - .read(&namespace, start_id) - .await - .map_err(BoxedError::new) - .context(error::ReadWalSnafu { region_id })?; - - while let Some(entries) = stream.next().await { - let entries = entries - .map_err(BoxedError::new) - .context(error::ReadWalSnafu { region_id })?; - - for entry in entries { - yield entry.into_raw_entry() - } - } - }); - - Ok(Box::pin(stream)) - } - - fn read_topic<'a>( - &'a self, - ns: KafkaNamespace<'a>, - start_id: EntryId, - ) -> Result { - let topic = ns.topic; +impl RawEntryReader for LogStoreRawEntryReader { + fn read(&self, provider: &Provider, start_id: EntryId) -> Result> { + let store = self.store.clone(); + let provider = provider.clone(); let stream = try_stream!({ - // TODO(weny): refactor the `namespace` method. - let namespace = self.store.namespace( - RegionId::from_u64(0).into(), - &WalOptions::Kafka(KafkaWalOptions { - topic: topic.to_string(), - }), - ); - - let mut stream = self - .store - .read(&namespace, start_id) + let mut stream = store + .read(&provider, start_id) .await .map_err(BoxedError::new) - .context(error::ReadKafkaWalSnafu { topic })?; + .with_context(|_| error::ReadWalSnafu { + provider: provider.clone(), + })?; while let Some(entries) = stream.next().await { - let entries = entries - .map_err(BoxedError::new) - .context(error::ReadKafkaWalSnafu { topic })?; + let entries = + entries + .map_err(BoxedError::new) + .with_context(|_| error::ReadWalSnafu { + provider: provider.clone(), + })?; for entry in entries { - yield entry.into_raw_entry() + yield entry } } }); @@ -135,53 +79,33 @@ impl LogStoreRawEntryReader { } } -impl RawEntryReader for LogStoreRawEntryReader { - fn read<'a>( - &'a self, - ctx: LogStoreNamespace<'a>, - start_id: EntryId, - ) -> Result> { - let stream = match ctx { - LogStoreNamespace::RaftEngine(ns) => self.read_region(ns, start_id)?, - LogStoreNamespace::Kafka(ns) => self.read_topic(ns, start_id)?, - }; - - Ok(Box::pin(stream)) - } -} - -/// A filter implement the [RawEntryReader] -pub struct RawEntryReaderFilter { +/// A [RawEntryReader] reads [RawEntry] belongs to a specific region. 
+pub struct RegionRawEntryReader { reader: R, - filter: F, + region_id: RegionId, } -impl RawEntryReaderFilter +impl RegionRawEntryReader where R: RawEntryReader, - F: Fn(&RawEntry) -> bool + Sync + Send, { - pub fn new(reader: R, filter: F) -> Self { - Self { reader, filter } + pub fn new(reader: R, region_id: RegionId) -> Self { + Self { reader, region_id } } } -impl RawEntryReader for RawEntryReaderFilter +impl RawEntryReader for RegionRawEntryReader where R: RawEntryReader, - F: Fn(&RawEntry) -> bool + Sync + Send, { - fn read<'a>( - &'a self, - ctx: LogStoreNamespace<'a>, - start_id: EntryId, - ) -> Result> { + fn read(&self, ctx: &Provider, start_id: EntryId) -> Result> { let mut stream = self.reader.read(ctx, start_id)?; - let filter = &(self.filter); + let region_id = self.region_id; + let stream = try_stream!({ while let Some(entry) = stream.next().await { let entry = entry?; - if filter(&entry) { + if entry.region_id() == region_id { yield entry } } @@ -197,11 +121,9 @@ mod tests { use common_wal::options::WalOptions; use futures::stream; - use store_api::logstore::entry::{Entry, RawEntry}; - use store_api::logstore::entry_stream::SendableEntryStream; - use store_api::logstore::namespace::Namespace; + use store_api::logstore::entry::{Entry, NaiveEntry}; use store_api::logstore::{ - AppendBatchResponse, AppendResponse, EntryId, LogStore, NamespaceId, + AppendBatchResponse, AppendResponse, EntryId, LogStore, SendableEntryStream, }; use store_api::storage::RegionId; @@ -210,93 +132,79 @@ mod tests { #[derive(Debug)] struct MockLogStore { - entries: Vec, - } - - #[derive(Debug, Eq, PartialEq, Clone, Copy, Default, Hash)] - struct MockNamespace; - - impl Namespace for MockNamespace { - fn id(&self) -> NamespaceId { - 0 - } + entries: Vec, } #[async_trait::async_trait] impl LogStore for MockLogStore { - type Entry = RawEntry; type Error = error::Error; - type Namespace = MockNamespace; async fn stop(&self) -> Result<(), Self::Error> { unreachable!() } - async fn append(&self, entry: Self::Entry) -> Result { - unreachable!() - } - async fn append_batch( &self, - entries: Vec, + entries: Vec, ) -> Result { unreachable!() } async fn read( &self, - ns: &Self::Namespace, + provider: &Provider, id: EntryId, - ) -> Result, Self::Error> { + ) -> Result, Self::Error> { Ok(Box::pin(stream::iter(vec![Ok(self.entries.clone())]))) } - async fn create_namespace(&self, ns: &Self::Namespace) -> Result<(), Self::Error> { + async fn create_namespace(&self, ns: &Provider) -> Result<(), Self::Error> { unreachable!() } - async fn delete_namespace(&self, ns: &Self::Namespace) -> Result<(), Self::Error> { + async fn delete_namespace(&self, ns: &Provider) -> Result<(), Self::Error> { unreachable!() } - async fn list_namespaces(&self) -> Result, Self::Error> { + async fn list_namespaces(&self) -> Result, Self::Error> { unreachable!() } async fn obsolete( &self, - ns: Self::Namespace, + provider: &Provider, entry_id: EntryId, ) -> Result<(), Self::Error> { unreachable!() } - fn entry(&self, data: &mut Vec, entry_id: EntryId, ns: Self::Namespace) -> Self::Entry { + fn entry( + &self, + data: &mut Vec, + entry_id: EntryId, + region_id: RegionId, + provider: &Provider, + ) -> Result { unreachable!() } - - fn namespace(&self, _ns_id: NamespaceId, _wal_options: &WalOptions) -> Self::Namespace { - MockNamespace - } } #[tokio::test] async fn test_raw_entry_reader() { - let expected_entries = vec![RawEntry { + let provider = Provider::raft_engine_provider(RegionId::new(1024, 1).as_u64()); + let expected_entries = 
vec![Entry::Naive(NaiveEntry { + provider: provider.clone(), region_id: RegionId::new(1024, 1), entry_id: 1, - data: vec![], - }]; + data: vec![1], + })]; let store = MockLogStore { entries: expected_entries.clone(), }; let reader = LogStoreRawEntryReader::new(Arc::new(store)); let entries = reader - .read( - LogStoreNamespace::RaftEngine(RaftEngineNamespace::new(RegionId::new(1024, 1))), - 0, - ) + .read(&provider, 0) .unwrap() .try_collect::>() .await @@ -306,37 +214,38 @@ mod tests { #[tokio::test] async fn test_raw_entry_reader_filter() { + let provider = Provider::raft_engine_provider(RegionId::new(1024, 1).as_u64()); let all_entries = vec![ - RawEntry { + Entry::Naive(NaiveEntry { + provider: provider.clone(), region_id: RegionId::new(1024, 1), entry_id: 1, data: vec![1], - }, - RawEntry { + }), + Entry::Naive(NaiveEntry { + provider: provider.clone(), region_id: RegionId::new(1024, 2), entry_id: 2, data: vec![2], - }, - RawEntry { + }), + Entry::Naive(NaiveEntry { + provider: provider.clone(), region_id: RegionId::new(1024, 3), entry_id: 3, data: vec![3], - }, + }), ]; let store = MockLogStore { entries: all_entries.clone(), }; let expected_region_id = RegionId::new(1024, 3); - let reader = - RawEntryReaderFilter::new(LogStoreRawEntryReader::new(Arc::new(store)), |entry| { - entry.region_id == expected_region_id - }); + let reader = RegionRawEntryReader::new( + LogStoreRawEntryReader::new(Arc::new(store)), + expected_region_id, + ); let entries = reader - .read( - LogStoreNamespace::RaftEngine(RaftEngineNamespace::new(RegionId::new(1024, 1))), - 0, - ) + .read(&provider, 0) .unwrap() .try_collect::>() .await @@ -344,7 +253,7 @@ mod tests { assert_eq!( all_entries .into_iter() - .filter(|entry| entry.region_id == expected_region_id) + .filter(|entry| entry.region_id() == expected_region_id) .collect::>(), entries ); diff --git a/src/mito2/src/worker/handle_catchup.rs b/src/mito2/src/worker/handle_catchup.rs index f6d890dc8ff8..595b6ee56635 100644 --- a/src/mito2/src/worker/handle_catchup.rs +++ b/src/mito2/src/worker/handle_catchup.rs @@ -75,7 +75,7 @@ impl RegionWorkerLoop { let timer = Instant::now(); let last_entry_id = replay_memtable( &self.wal, - ®ion.wal_options, + ®ion.provider, region_id, flushed_entry_id, ®ion.version_control, diff --git a/src/mito2/src/worker/handle_flush.rs b/src/mito2/src/worker/handle_flush.rs index b776f98aaa56..2d1c4b96ca39 100644 --- a/src/mito2/src/worker/handle_flush.rs +++ b/src/mito2/src/worker/handle_flush.rs @@ -212,7 +212,7 @@ impl RegionWorkerLoop { ); if let Err(e) = self .wal - .obsolete(region_id, request.flushed_entry_id, ®ion.wal_options) + .obsolete(region_id, request.flushed_entry_id, ®ion.provider) .await { error!(e; "Failed to write wal, region: {}", region_id); diff --git a/src/mito2/src/worker/handle_truncate.rs b/src/mito2/src/worker/handle_truncate.rs index f5598286a563..70aca9f6ace4 100644 --- a/src/mito2/src/worker/handle_truncate.rs +++ b/src/mito2/src/worker/handle_truncate.rs @@ -82,7 +82,7 @@ impl RegionWorkerLoop { .obsolete( region_id, truncate_result.truncated_entry_id, - ®ion.wal_options, + ®ion.provider, ) .await { diff --git a/src/mito2/src/worker/handle_write.rs b/src/mito2/src/worker/handle_write.rs index 3614d1be5de2..85ce49f3150f 100644 --- a/src/mito2/src/worker/handle_write.rs +++ b/src/mito2/src/worker/handle_write.rs @@ -84,8 +84,7 @@ impl RegionWorkerLoop { for (region_id, region_ctx) in region_ctxs.iter_mut() { // Safety: the log store implementation ensures that either the `write_to_wal` fails and no // 
response is returned or the last entry ids for each region do exist. - let last_entry_id = - response.last_entry_ids.get(®ion_id.as_u64()).unwrap(); + let last_entry_id = response.last_entry_ids.get(region_id).unwrap(); region_ctx.set_next_entry_id(last_entry_id + 1); } } @@ -162,7 +161,7 @@ impl RegionWorkerLoop { let region_ctx = RegionWriteCtx::new( region.region_id, ®ion.version_control, - region.wal_options.clone(), + region.provider.clone(), ); e.insert(region_ctx); diff --git a/src/operator/src/error.rs b/src/operator/src/error.rs index 6e77b53d3eba..d7dcdb9d7057 100644 --- a/src/operator/src/error.rs +++ b/src/operator/src/error.rs @@ -22,7 +22,6 @@ use datafusion::parquet; use datatypes::arrow::error::ArrowError; use servers::define_into_tonic_status; use snafu::{Location, Snafu}; -use sql::ast::Value; #[derive(Snafu)] #[snafu(visibility(pub))] @@ -113,12 +112,11 @@ pub enum Error { error: datafusion::error::DataFusionError, }, - #[snafu(display("Failed to convert value to sql value: {}", value))] - ConvertSqlValue { - value: Value, + #[snafu(display("Failed to extract table names"))] + ExtractTableNames { #[snafu(implicit)] location: Location, - source: sql::error::Error, + source: query::error::Error, }, #[snafu(display("Column datatype error"))] @@ -542,13 +540,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to prepare immutable table"))] - PrepareImmutableTable { - #[snafu(implicit)] - location: Location, - source: query::error::Error, - }, - #[snafu(display("Invalid COPY parameter, key: {}, value: {}", key, value))] InvalidCopyParameter { key: String, @@ -571,20 +562,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to read record batch"))] - ReadRecordBatch { - source: common_recordbatch::error::Error, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Failed to build column vectors"))] - BuildColumnVectors { - source: common_recordbatch::error::Error, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Missing insert body"))] MissingInsertBody { source: sql::error::Error, @@ -738,7 +715,6 @@ impl ErrorExt for Error { | Error::ColumnNotFound { .. } | Error::BuildRegex { .. } | Error::InvalidSchema { .. } - | Error::PrepareImmutableTable { .. } | Error::BuildCsvConfig { .. } | Error::ProjectSchema { .. } | Error::UnsupportedFormat { .. } @@ -762,9 +738,7 @@ impl ErrorExt for Error { Error::TableMetadataManager { source, .. } => source.status_code(), - Error::ConvertSqlValue { source, .. } | Error::ParseSql { source, .. } => { - source.status_code() - } + Error::ParseSql { source, .. } => source.status_code(), Error::InvalidateTableCache { source, .. } => source.status_code(), @@ -814,6 +788,7 @@ impl ErrorExt for Error { | Error::FindNewColumnsOnInsertion { source, .. } => source.status_code(), Error::ExecuteStatement { source, .. } + | Error::ExtractTableNames { source, .. } | Error::PlanStatement { source, .. } | Error::ParseQuery { source, .. } | Error::ExecLogicalPlan { source, .. } @@ -843,10 +818,6 @@ impl ErrorExt for Error { StatusCode::InvalidArguments } - Error::ReadRecordBatch { source, .. } | Error::BuildColumnVectors { source, .. } => { - source.status_code() - } - Error::ColumnDefaultValue { source, .. } => source.status_code(), Error::DdlWithMultiCatalogs { .. 
} diff --git a/src/operator/src/expr_factory.rs b/src/operator/src/expr_factory.rs index 25bc07cb1e3a..9f3d2b1c4077 100644 --- a/src/operator/src/expr_factory.rs +++ b/src/operator/src/expr_factory.rs @@ -515,6 +515,7 @@ pub(crate) fn to_alter_expr( pub fn to_create_view_expr( stmt: CreateView, logical_plan: Vec, + table_names: Vec, query_ctx: QueryContextRef, ) -> Result { let (catalog_name, schema_name, view_name) = table_idents_to_full_name(&stmt.name, &query_ctx) @@ -528,6 +529,7 @@ pub fn to_create_view_expr( logical_plan, create_if_not_exists: stmt.if_not_exists, or_replace: stmt.or_replace, + table_names, }; Ok(expr) @@ -785,6 +787,21 @@ mod tests { assert!(change_column_type.target_type_extension.is_none()); } + fn new_test_table_names() -> Vec { + vec![ + TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "a_table".to_string(), + }, + TableName { + catalog_name: "greptime".to_string(), + schema_name: "public".to_string(), + table_name: "b_table".to_string(), + }, + ] + } + #[test] fn test_to_create_view_expr() { let sql = "CREATE VIEW test AS SELECT * FROM NUMBERS"; @@ -799,8 +816,15 @@ mod tests { }; let logical_plan = vec![1, 2, 3]; + let table_names = new_test_table_names(); - let expr = to_create_view_expr(stmt, logical_plan.clone(), QueryContext::arc()).unwrap(); + let expr = to_create_view_expr( + stmt, + logical_plan.clone(), + table_names.clone(), + QueryContext::arc(), + ) + .unwrap(); assert_eq!("greptime", expr.catalog_name); assert_eq!("public", expr.schema_name); @@ -808,6 +832,7 @@ mod tests { assert!(!expr.create_if_not_exists); assert!(!expr.or_replace); assert_eq!(logical_plan, expr.logical_plan); + assert_eq!(table_names, expr.table_names); } #[test] @@ -824,8 +849,15 @@ mod tests { }; let logical_plan = vec![1, 2, 3]; + let table_names = new_test_table_names(); - let expr = to_create_view_expr(stmt, logical_plan.clone(), QueryContext::arc()).unwrap(); + let expr = to_create_view_expr( + stmt, + logical_plan.clone(), + table_names.clone(), + QueryContext::arc(), + ) + .unwrap(); assert_eq!("greptime", expr.catalog_name); assert_eq!("test", expr.schema_name); @@ -833,5 +865,6 @@ mod tests { assert!(expr.create_if_not_exists); assert!(expr.or_replace); assert_eq!(logical_plan, expr.logical_plan); + assert_eq!(table_names, expr.table_names); } } diff --git a/src/operator/src/insert.rs b/src/operator/src/insert.rs index a68ed9b6be18..b54efc06dae9 100644 --- a/src/operator/src/insert.rs +++ b/src/operator/src/insert.rs @@ -15,7 +15,6 @@ use std::collections::HashMap; use std::sync::Arc; -use api::region::RegionResponse; use api::v1::alter_expr::Kind; use api::v1::region::{InsertRequests as RegionInsertRequests, RegionRequestHeader}; use api::v1::{ @@ -191,41 +190,6 @@ impl Inserter { } impl Inserter { - fn post_request(&self, requests: RegionInsertRequests) { - let node_manager = self.node_manager.clone(); - let table_flownode_set_cache = self.table_flownode_set_cache.clone(); - // Spawn all tasks that do job for mirror insert requests for flownode - common_runtime::spawn_bg(async move { - match Self::mirror_flow_node_requests(table_flownode_set_cache, requests).await { - Ok(flow_tasks) => { - let flow_tasks = flow_tasks.into_iter().map(|(peer, inserts)| { - let node_manager = node_manager.clone(); - common_runtime::spawn_write(async move { - node_manager - .flownode(&peer) - .await - .handle_inserts(inserts) - .await - .map(|flow_response| RegionResponse { - affected_rows: flow_response.affected_rows as 
AffectedRows, - extension: flow_response.extension, - }) - .context(RequestInsertsSnafu) - }) - }); - - if let Err(err) = future::try_join_all(flow_tasks) - .await - .context(JoinTaskSnafu) - { - warn!(err; "Failed to insert data into flownode"); - } - } - Err(err) => warn!(err; "Failed to mirror request to flownode"), - } - }); - } - async fn do_request( &self, requests: RegionInsertRequests, @@ -238,8 +202,44 @@ impl Inserter { ..Default::default() }); - let tasks = self - .group_requests_by_peer(requests.clone()) + // Mirror requests for source table to flownode + match self.mirror_flow_node_requests(&requests).await { + Ok(flow_requests) => { + let node_manager = self.node_manager.clone(); + let flow_tasks = flow_requests.into_iter().map(|(peer, inserts)| { + let node_manager = node_manager.clone(); + common_runtime::spawn_bg(async move { + node_manager + .flownode(&peer) + .await + .handle_inserts(inserts) + .await + .context(RequestInsertsSnafu) + }) + }); + + match future::try_join_all(flow_tasks) + .await + .context(JoinTaskSnafu) + { + Ok(ret) => { + let affected_rows = ret + .into_iter() + .map(|resp| resp.map(|r| r.affected_rows)) + .sum::>() + .unwrap_or(0); + crate::metrics::DIST_MIRROR_ROW_COUNT.inc_by(affected_rows); + } + Err(err) => { + warn!(err; "Failed to insert data into flownode"); + } + } + } + Err(err) => warn!(err; "Failed to mirror request to flownode"), + } + + let write_tasks = self + .group_requests_by_peer(requests) .await? .into_iter() .map(|(peer, inserts)| { @@ -254,8 +254,9 @@ impl Inserter { .context(RequestInsertsSnafu) }) }); - let results = future::try_join_all(tasks).await.context(JoinTaskSnafu)?; - self.post_request(requests); + let results = future::try_join_all(write_tasks) + .await + .context(JoinTaskSnafu)?; let affected_rows = results .into_iter() .map(|resp| resp.map(|r| r.affected_rows)) @@ -269,21 +270,22 @@ impl Inserter { /// Mirror requests for source table to flownode async fn mirror_flow_node_requests( - table_flownode_set_cache: TableFlownodeSetCacheRef, - requests: RegionInsertRequests, + &self, + requests: &RegionInsertRequests, ) -> Result> { // store partial source table requests used by flow node(only store what's used) let mut src_table_reqs: HashMap, RegionInsertRequests)>> = HashMap::new(); - for req in requests.requests { - match src_table_reqs.get_mut(&RegionId::from_u64(req.region_id).table_id()) { - Some(Some((_peers, reqs))) => reqs.requests.push(req), + for req in &requests.requests { + let table_id = RegionId::from_u64(req.region_id).table_id(); + match src_table_reqs.get_mut(&table_id) { + Some(Some((_peers, reqs))) => reqs.requests.push(req.clone()), // already know this is not source table Some(None) => continue, _ => { - let table_id = RegionId::from_u64(req.region_id).table_id(); // TODO(discord9): determine where to store the flow node address in distributed mode - let peers = table_flownode_set_cache + let peers = self + .table_flownode_set_cache .get(table_id) .await .context(RequestInsertsSnafu)? 
@@ -294,7 +296,7 @@ impl Inserter { if !peers.is_empty() { let mut reqs = RegionInsertRequests::default(); - reqs.requests.push(req); + reqs.requests.push(req.clone()); src_table_reqs.insert(table_id, Some((peers, reqs))); } else { // insert a empty entry to avoid repeat query @@ -310,14 +312,26 @@ impl Inserter { .into_iter() .filter_map(|(k, v)| v.map(|v| (k, v))) { - for flownode in peers { + if peers.len() == 1 { + // fast path, zero copy inserts - .entry(flownode.clone()) + .entry(peers[0].clone()) .or_default() .requests - .extend(reqs.requests.clone()); + .extend(reqs.requests); + continue; + } else { + // TODO(discord9): need to split requests to multiple flownodes + for flownode in peers { + inserts + .entry(flownode.clone()) + .or_default() + .requests + .extend(reqs.requests.clone()); + } } } + Ok(inserts) } diff --git a/src/operator/src/metrics.rs b/src/operator/src/metrics.rs index 932aca168003..97c5e0015a55 100644 --- a/src/operator/src/metrics.rs +++ b/src/operator/src/metrics.rs @@ -36,6 +36,11 @@ lazy_static! { "table operator ingest rows" ) .unwrap(); + pub static ref DIST_MIRROR_ROW_COUNT: IntCounter = register_int_counter!( + "greptime_table_operator_mirror_rows", + "table operator mirror rows" + ) + .unwrap(); pub static ref DIST_DELETE_ROW_COUNT: IntCounter = register_int_counter!( "greptime_table_operator_delete_rows", "table operator delete rows" diff --git a/src/operator/src/statement.rs b/src/operator/src/statement.rs index e9b6f4b282c0..649af286a4bb 100644 --- a/src/operator/src/statement.rs +++ b/src/operator/src/statement.rs @@ -32,7 +32,6 @@ use common_meta::ddl::ProcedureExecutorRef; use common_meta::key::flow::{FlowMetadataManager, FlowMetadataManagerRef}; use common_meta::key::{TableMetadataManager, TableMetadataManagerRef}; use common_meta::kv_backend::KvBackendRef; -use common_meta::table_name::TableName; use common_query::Output; use common_telemetry::tracing; use common_time::range::TimestampRange; @@ -50,6 +49,7 @@ use sql::statements::OptionMap; use sql::util::format_raw_object_name; use sqlparser::ast::ObjectName; use table::requests::{CopyDatabaseRequest, CopyDirection, CopyTableRequest}; +use table::table_name::TableName; use table::table_reference::TableReference; use table::TableRef; diff --git a/src/operator/src/statement/ddl.rs b/src/operator/src/statement/ddl.rs index 67c4a4251bf0..2cfe71fd0d24 100644 --- a/src/operator/src/statement/ddl.rs +++ b/src/operator/src/statement/ddl.rs @@ -32,7 +32,6 @@ use common_meta::rpc::ddl::{ CreateFlowTask, DdlTask, DropFlowTask, SubmitDdlTaskRequest, SubmitDdlTaskResponse, }; use common_meta::rpc::router::{Partition, Partition as MetaPartition}; -use common_meta::table_name::TableName; use common_query::Output; use common_telemetry::{debug, info, tracing}; use common_time::Timezone; @@ -43,6 +42,8 @@ use lazy_static::lazy_static; use partition::expr::{Operand, PartitionExpr, RestrictedOp}; use partition::partition::{PartitionBound, PartitionDef}; use query::parser::QueryStatement; +use query::plan::extract_and_rewrite_full_table_names; +use query::query_engine::DefaultSerializer; use query::sql::create_table_stmt; use regex::Regex; use session::context::QueryContextRef; @@ -60,17 +61,19 @@ use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan}; use table::dist_table::DistTable; use table::metadata::{self, RawTableInfo, RawTableMeta, TableId, TableInfo, TableType}; use table::requests::{AlterKind, AlterTableRequest, TableOptions, COMMENT_KEY}; +use table::table_name::TableName; use table::TableRef; use 
super::StatementExecutor; use crate::error::{ self, AlterExprToRequestSnafu, CatalogSnafu, ColumnDataTypeSnafu, ColumnNotFoundSnafu, CreateLogicalTablesSnafu, CreateTableInfoSnafu, DdlWithMultiCatalogsSnafu, - DdlWithMultiSchemasSnafu, DeserializePartitionSnafu, EmptyDdlExprSnafu, FlowNotFoundSnafu, - InvalidPartitionColumnsSnafu, InvalidPartitionRuleSnafu, InvalidTableNameSnafu, - InvalidViewNameSnafu, InvalidViewStmtSnafu, ParseSqlValueSnafu, Result, SchemaInUseSnafu, - SchemaNotFoundSnafu, SubstraitCodecSnafu, TableAlreadyExistsSnafu, TableMetadataManagerSnafu, - TableNotFoundSnafu, UnrecognizedTableOptionSnafu, ViewAlreadyExistsSnafu, + DdlWithMultiSchemasSnafu, DeserializePartitionSnafu, EmptyDdlExprSnafu, ExtractTableNamesSnafu, + FlowNotFoundSnafu, InvalidPartitionColumnsSnafu, InvalidPartitionRuleSnafu, + InvalidTableNameSnafu, InvalidViewNameSnafu, InvalidViewStmtSnafu, ParseSqlValueSnafu, Result, + SchemaInUseSnafu, SchemaNotFoundSnafu, SubstraitCodecSnafu, TableAlreadyExistsSnafu, + TableMetadataManagerSnafu, TableNotFoundSnafu, UnrecognizedTableOptionSnafu, + ViewAlreadyExistsSnafu, }; use crate::expr_factory; use crate::statement::show::create_partitions_stmt; @@ -398,16 +401,33 @@ impl StatementExecutor { return InvalidViewStmtSnafu {}.fail(); } }; - let optimized_plan = self.optimize_logical_plan(logical_plan)?; + + // Extract the table names from the origin plan + // and rewrite them as fully qualified names. + let (table_names, plan) = + extract_and_rewrite_full_table_names(logical_plan.unwrap_df_plan(), ctx.clone()) + .context(ExtractTableNamesSnafu)?; + + let table_names = table_names.into_iter().map(|t| t.into()).collect(); + + // TODO(dennis): we don't save the optimized plan yet, + // because there are some serialization issue with our own defined plan node (such as `MergeScanLogicalPlan`). + // When the issues are fixed, we can use the `optimized_plan` instead. + // let optimized_plan = self.optimize_logical_plan(logical_plan)?.unwrap_df_plan(); // encode logical plan let encoded_plan = DFLogicalSubstraitConvertor - .encode(&optimized_plan.unwrap_df_plan()) + .encode(&plan, DefaultSerializer) .context(SubstraitCodecSnafu)?; - let expr = - expr_factory::to_create_view_expr(create_view, encoded_plan.to_vec(), ctx.clone())?; + let expr = expr_factory::to_create_view_expr( + create_view, + encoded_plan.to_vec(), + table_names, + ctx.clone(), + )?; + //TODO(dennis): validate the logical plan self.create_view_by_expr(expr, ctx).await } diff --git a/src/operator/src/statement/show.rs b/src/operator/src/statement/show.rs index ca1a500c2a38..a89df5985206 100644 --- a/src/operator/src/statement/show.rs +++ b/src/operator/src/statement/show.rs @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use common_meta::table_name::TableName; use common_query::Output; use common_telemetry::tracing; use partition::manager::PartitionInfo; @@ -24,6 +23,7 @@ use sql::statements::create::Partitions; use sql::statements::show::{ ShowColumns, ShowDatabases, ShowIndex, ShowKind, ShowTables, ShowVariables, }; +use table::table_name::TableName; use table::TableRef; use crate::error::{self, ExecuteStatementSnafu, Result}; diff --git a/src/promql/Cargo.toml b/src/promql/Cargo.toml index 0bc9d6187485..4039328528c3 100644 --- a/src/promql/Cargo.toml +++ b/src/promql/Cargo.toml @@ -9,30 +9,22 @@ workspace = true [dependencies] ahash.workspace = true -async-recursion = "1.0" async-trait.workspace = true bytemuck.workspace = true -catalog.workspace = true -common-catalog.workspace = true common-error.workspace = true common-macro.workspace = true -common-query.workspace = true common-recordbatch.workspace = true common-telemetry.workspace = true datafusion.workspace = true datafusion-expr.workspace = true -datafusion-functions.workspace = true datatypes.workspace = true futures = "0.3" greptime-proto.workspace = true -itertools.workspace = true lazy_static.workspace = true prometheus.workspace = true promql-parser.workspace = true prost.workspace = true -session.workspace = true snafu.workspace = true -table.workspace = true [dev-dependencies] query.workspace = true diff --git a/src/promql/src/error.rs b/src/promql/src/error.rs index a9598904f036..3f3c216acd54 100644 --- a/src/promql/src/error.rs +++ b/src/promql/src/error.rs @@ -18,35 +18,12 @@ use common_error::ext::ErrorExt; use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; use datafusion::error::DataFusionError; -use promql_parser::parser::token::TokenType; -use promql_parser::parser::{Expr as PromExpr, VectorMatchCardinality}; use snafu::{Location, Snafu}; #[derive(Snafu)] #[snafu(visibility(pub))] #[stack_trace_debug] pub enum Error { - #[snafu(display("Unsupported expr type: {}", name))] - UnsupportedExpr { - name: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Unsupported vector matches: {:?}", name))] - UnsupportedVectorMatch { - name: VectorMatchCardinality, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Unexpected token: {:?}", token))] - UnexpectedToken { - token: TokenType, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Internal error during building DataFusion plan"))] DataFusionPlanning { #[snafu(source)] @@ -55,49 +32,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Unexpected plan or expression: {}", desc))] - UnexpectedPlanExpr { - desc: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Unknown table type, downcast failed"))] - UnknownTable { - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Cannot find time index column in table {}", table))] - TimeIndexNotFound { - table: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Cannot find value columns in table {}", table))] - ValueNotFound { - table: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display( - "Cannot accept multiple vector as function input, PromQL expr: {:?}", - expr, - ))] - MultipleVector { - expr: PromExpr, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Expect a PromQL expr but not found, input expr: {:?}", expr))] - ExpectExpr { - expr: PromExpr, - #[snafu(implicit)] - location: Location, - }, #[snafu(display( "Illegal range: offset {}, 
length {}, array len {}", offset, @@ -126,117 +60,24 @@ pub enum Error { location: Location, }, - #[snafu(display( - "Table (metric) name not found, this indicates a procedure error in PromQL planner" - ))] - TableNameNotFound { - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("General catalog error: "))] - Catalog { - #[snafu(implicit)] - location: Location, - source: catalog::error::Error, - }, - - #[snafu(display("Expect a range selector, but not found"))] - ExpectRangeSelector { - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Zero range in range selector"))] - ZeroRangeSelector { - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Cannot find column {col}"))] ColumnNotFound { col: String, #[snafu(implicit)] location: Location, }, - - #[snafu(display("Found multiple metric matchers in selector"))] - MultipleMetricMatchers { - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Expect a metric matcher, but not found"))] - NoMetricMatcher { - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Invalid function argument for {}", fn_name))] - FunctionInvalidArgument { - fn_name: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display( - "Attempt to combine two tables with different column sets, left: {:?}, right: {:?}", - left, - right - ))] - CombineTableColumnMismatch { - left: Vec, - right: Vec, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Multi fields calculation is not supported in {}", operator))] - MultiFieldsNotSupported { - operator: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Matcher operator {matcher_op} is not supported for {matcher}"))] - UnsupportedMatcherOp { - matcher_op: String, - matcher: String, - #[snafu(implicit)] - location: Location, - }, } impl ErrorExt for Error { fn status_code(&self) -> StatusCode { use Error::*; match self { - TimeIndexNotFound { .. } - | ValueNotFound { .. } - | UnsupportedExpr { .. } - | UnexpectedToken { .. } - | MultipleVector { .. } - | ExpectExpr { .. } - | ExpectRangeSelector { .. } - | ZeroRangeSelector { .. } - | ColumnNotFound { .. } - | Deserialize { .. } - | FunctionInvalidArgument { .. } - | UnsupportedVectorMatch { .. } - | CombineTableColumnMismatch { .. } - | DataFusionPlanning { .. } - | MultiFieldsNotSupported { .. } - | UnexpectedPlanExpr { .. } - | UnsupportedMatcherOp { .. } - | IllegalRange { .. } => StatusCode::InvalidArguments, - - UnknownTable { .. } | EmptyRange { .. } => StatusCode::Internal, - - TableNameNotFound { .. } => StatusCode::TableNotFound, - - MultipleMetricMatchers { .. } | NoMetricMatcher { .. } => StatusCode::InvalidSyntax, + Deserialize { .. } => StatusCode::Unexpected, + IllegalRange { .. } | ColumnNotFound { .. } | EmptyRange { .. } => { + StatusCode::InvalidArguments + } - Catalog { source, .. } => source.status_code(), + DataFusionPlanning { .. 
} => StatusCode::PlanQuery, } } diff --git a/src/promql/src/extension_plan.rs b/src/promql/src/extension_plan.rs index f8e32fc4dcdf..eba327c1bf64 100644 --- a/src/promql/src/extension_plan.rs +++ b/src/promql/src/extension_plan.rs @@ -35,4 +35,4 @@ pub use scalar_calculate::ScalarCalculate; pub use series_divide::{SeriesDivide, SeriesDivideExec, SeriesDivideStream}; pub use union_distinct_on::{UnionDistinctOn, UnionDistinctOnExec, UnionDistinctOnStream}; -pub(crate) type Millisecond = ::Native; +pub type Millisecond = ::Native; diff --git a/src/promql/src/lib.rs b/src/promql/src/lib.rs index 127bf45d5f1a..a29fc032e957 100644 --- a/src/promql/src/lib.rs +++ b/src/promql/src/lib.rs @@ -20,5 +20,4 @@ pub mod error; pub mod extension_plan; pub mod functions; mod metrics; -pub mod planner; pub mod range_array; diff --git a/src/query/Cargo.toml b/src/query/Cargo.toml index 5aec3dffe49b..eed0d5a3398e 100644 --- a/src/query/Cargo.toml +++ b/src/query/Cargo.toml @@ -16,6 +16,7 @@ arrow-schema.workspace = true async-recursion = "1.0" async-stream.workspace = true async-trait = "0.1" +bytes.workspace = true catalog.workspace = true chrono.workspace = true common-base.workspace = true @@ -28,11 +29,13 @@ common-meta.workspace = true common-plugins.workspace = true common-query.workspace = true common-recordbatch.workspace = true +common-runtime.workspace = true common-telemetry.workspace = true common-time.workspace = true datafusion.workspace = true datafusion-common.workspace = true datafusion-expr.workspace = true +datafusion-functions.workspace = true datafusion-optimizer.workspace = true datafusion-physical-expr.workspace = true datafusion-sql.workspace = true @@ -41,6 +44,7 @@ futures = "0.3" futures-util.workspace = true greptime-proto.workspace = true humantime.workspace = true +itertools.workspace = true lazy_static.workspace = true meter-core.workspace = true meter-macros.workspace = true @@ -49,6 +53,7 @@ once_cell.workspace = true prometheus.workspace = true promql.workspace = true promql-parser.workspace = true +prost.workspace = true regex.workspace = true session.workspace = true snafu.workspace = true @@ -64,6 +69,7 @@ approx_eq = "0.1" arrow.workspace = true catalog = { workspace = true, features = ["testing"] } common-macro.workspace = true +common-query = { workspace = true, features = ["testing"] } format_num = "0.1" num = "0.4" num-traits = "0.2" diff --git a/src/query/src/datafusion/planner.rs b/src/query/src/datafusion/planner.rs index ab37f406ac2a..65e78f130160 100644 --- a/src/query/src/datafusion/planner.rs +++ b/src/query/src/datafusion/planner.rs @@ -34,7 +34,7 @@ use session::context::QueryContextRef; use snafu::ResultExt; use crate::error::{CatalogSnafu, DataFusionSnafu, Result}; -use crate::query_engine::QueryEngineState; +use crate::query_engine::{DefaultPlanDecoder, QueryEngineState}; pub struct DfContextProviderAdapter { engine_state: Arc, @@ -63,6 +63,7 @@ impl DfContextProviderAdapter { engine_state.catalog_manager().clone(), engine_state.disallow_cross_catalog_query(), query_ctx.as_ref(), + Arc::new(DefaultPlanDecoder::new(session_state.clone(), &query_ctx)?), ); let tables = resolve_tables(table_names, &mut table_provider).await?; diff --git a/src/query/src/dist_plan/analyzer.rs b/src/query/src/dist_plan/analyzer.rs index 870b92633981..bbb3e5ddd9cc 100644 --- a/src/query/src/dist_plan/analyzer.rs +++ b/src/query/src/dist_plan/analyzer.rs @@ -31,6 +31,7 @@ use crate::dist_plan::commutativity::{ partial_commutative_transformer, Categorizer, Commutativity, }; use 
crate::dist_plan::merge_scan::MergeScanLogicalPlan; +use crate::query_engine::DefaultSerializer; pub struct DistPlannerAnalyzer; @@ -150,7 +151,10 @@ impl PlanRewriter { /// Return true if should stop and expand. The input plan is the parent node of current node fn should_expand(&mut self, plan: &LogicalPlan) -> bool { - if DFLogicalSubstraitConvertor.encode(plan).is_err() { + if DFLogicalSubstraitConvertor + .encode(plan, DefaultSerializer) + .is_err() + { return true; } diff --git a/src/query/src/dist_plan/merge_scan.rs b/src/query/src/dist_plan/merge_scan.rs index a2cd734da004..23d7fbb832cd 100644 --- a/src/query/src/dist_plan/merge_scan.rs +++ b/src/query/src/dist_plan/merge_scan.rs @@ -21,7 +21,6 @@ use async_stream::stream; use common_base::bytes::Bytes; use common_catalog::parse_catalog_and_schema_from_db_string; use common_error::ext::BoxedError; -use common_meta::table_name::TableName; use common_plugins::GREPTIME_EXEC_READ_COST; use common_recordbatch::adapter::{DfRecordBatchStreamAdapter, RecordBatchMetrics}; use common_recordbatch::error::ExternalSnafu; @@ -48,6 +47,7 @@ use meter_macros::read_meter; use session::context::QueryContextRef; use snafu::ResultExt; use store_api::storage::RegionId; +use table::table_name::TableName; use tokio::time::Instant; use crate::error::ConvertSchemaSnafu; @@ -122,7 +122,6 @@ impl MergeScanLogicalPlan { &self.input } } - pub struct MergeScanExec { table: TableName, regions: Vec, diff --git a/src/query/src/dist_plan/planner.rs b/src/query/src/dist_plan/planner.rs index c9f9f8e5b311..c3d8b00eaf2d 100644 --- a/src/query/src/dist_plan/planner.rs +++ b/src/query/src/dist_plan/planner.rs @@ -19,7 +19,6 @@ use std::sync::Arc; use async_trait::async_trait; use catalog::CatalogManagerRef; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; -use common_meta::table_name::TableName; use datafusion::common::Result; use datafusion::datasource::DefaultTableSource; use datafusion::execution::context::SessionState; @@ -35,10 +34,12 @@ use store_api::storage::RegionId; use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan}; pub use table::metadata::TableType; use table::table::adapter::DfTableProviderAdapter; +use table::table_name::TableName; use crate::dist_plan::merge_scan::{MergeScanExec, MergeScanLogicalPlan}; use crate::error; use crate::error::{CatalogSnafu, TableNotFoundSnafu}; +use crate::query_engine::DefaultSerializer; use crate::region_query::RegionQueryHandlerRef; pub struct DistExtensionPlanner { @@ -101,7 +102,7 @@ impl ExtensionPlanner for DistExtensionPlanner { // Pass down the original plan, allow execution nodes to do their optimization let amended_plan = Self::plan_with_full_table_name(input_plan.clone(), &table_name)?; let substrait_plan = DFLogicalSubstraitConvertor - .encode(&amended_plan) + .encode(&amended_plan, DefaultSerializer) .context(error::EncodeSubstraitLogicalPlanSnafu)? 
.into(); diff --git a/src/query/src/error.rs b/src/query/src/error.rs index 7c0160d96042..35d3fbdb17b9 100644 --- a/src/query/src/error.rs +++ b/src/query/src/error.rs @@ -56,20 +56,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Catalog not found: {}", catalog))] - CatalogNotFound { - catalog: String, - #[snafu(implicit)] - location: Location, - }, - - #[snafu(display("Schema not found: {}", schema))] - SchemaNotFound { - schema: String, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Table not found: {}", table))] TableNotFound { table: String, @@ -137,13 +123,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Invalid timestamp `{}`", raw))] - InvalidTimestamp { - raw: String, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Failed to parse float number `{}`", raw))] ParseFloat { raw: String, @@ -347,13 +326,10 @@ impl ErrorExt for Error { } UnsupportedExpr { .. } | Unimplemented { .. } - | CatalogNotFound { .. } - | SchemaNotFound { .. } | TableNotFound { .. } | UnknownTable { .. } | TimeIndexNotFound { .. } | ParseTimestamp { .. } - | InvalidTimestamp { .. } | ParseFloat { .. } | MissingRequiredField { .. } | BuildRegex { .. } diff --git a/src/query/src/lib.rs b/src/query/src/lib.rs index 9b6413e4ed92..4ac5a7c10aa9 100644 --- a/src/query/src/lib.rs +++ b/src/query/src/lib.rs @@ -14,6 +14,7 @@ #![feature(let_chains)] #![feature(int_roundings)] +#![feature(option_get_or_insert_default)] mod analyze; pub mod dataframe; @@ -31,6 +32,7 @@ pub mod physical_planner; pub mod physical_wrapper; pub mod plan; pub mod planner; +pub mod promql; pub mod query_engine; mod range_select; pub mod region_query; diff --git a/src/query/src/optimizer.rs b/src/query/src/optimizer.rs index e6a971417c23..1cb54c7126c3 100644 --- a/src/query/src/optimizer.rs +++ b/src/query/src/optimizer.rs @@ -17,7 +17,7 @@ pub mod order_hint; pub mod remove_duplicate; pub mod string_normalization; #[cfg(test)] -mod test_util; +pub(crate) mod test_util; pub mod type_conversion; use datafusion_common::config::ConfigOptions; diff --git a/src/query/src/optimizer/test_util.rs b/src/query/src/optimizer/test_util.rs index 4ffc3e28e08f..773270351fdf 100644 --- a/src/query/src/optimizer/test_util.rs +++ b/src/query/src/optimizer/test_util.rs @@ -79,7 +79,7 @@ impl RegionEngine for MetaRegionEngine { }) } - async fn region_disk_usage(&self, _region_id: RegionId) -> Option { + fn region_disk_usage(&self, _region_id: RegionId) -> Option { None } diff --git a/src/query/src/plan.rs b/src/query/src/plan.rs index 34495dee989a..ea9dae3770da 100644 --- a/src/query/src/plan.rs +++ b/src/query/src/plan.rs @@ -12,15 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. 
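The plan.rs hunk that follows adds a `TableNamesExtractAndRewriter` which walks the plan and fully qualifies every table scan from the query context. The qualification rule itself is simple; here is a std-only sketch with hypothetical types (not the DataFusion `TableReference`), just to illustrate the bare/partial/full cases:

```rust
// Hypothetical, std-only illustration of the qualification rule applied by the
// rewriter in the hunk below: bare and partial references are filled in from the
// current catalog/schema so every scan carries catalog.schema.table.
#[derive(Debug, PartialEq)]
enum TableRef {
    Bare { table: String },
    Partial { schema: String, table: String },
    Full { catalog: String, schema: String, table: String },
}

fn qualify(r: TableRef, current_catalog: &str, current_schema: &str) -> TableRef {
    match r {
        TableRef::Bare { table } => TableRef::Full {
            catalog: current_catalog.to_string(),
            schema: current_schema.to_string(),
            table,
        },
        TableRef::Partial { schema, table } => TableRef::Full {
            catalog: current_catalog.to_string(),
            schema,
            table,
        },
        full @ TableRef::Full { .. } => full,
    }
}

fn main() {
    let qualified = qualify(TableRef::Bare { table: "devices".into() }, "greptime", "test");
    assert_eq!(
        qualified,
        TableRef::Full {
            catalog: "greptime".into(),
            schema: "test".into(),
            table: "devices".into(),
        }
    );
}
```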
-use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fmt::{Debug, Display}; use common_query::prelude::ScalarValue; -use datafusion_common::ParamValues; +use datafusion::datasource::DefaultTableSource; +use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; +use datafusion_common::{ParamValues, TableReference}; use datafusion_expr::LogicalPlan as DfLogicalPlan; use datatypes::data_type::ConcreteDataType; use datatypes::schema::Schema; +use session::context::QueryContextRef; use snafu::ResultExt; +pub use table::metadata::TableType; +use table::table::adapter::DfTableProviderAdapter; +use table::table_name::TableName; use crate::error::{ConvertDatafusionSchemaSnafu, DataFusionSnafu, Result}; @@ -94,6 +100,13 @@ impl LogicalPlan { LogicalPlan::DfPlan(plan) => plan, } } + + /// Returns the DataFusion logical plan reference + pub fn df_plan(&self) -> &DfLogicalPlan { + match self { + LogicalPlan::DfPlan(plan) => plan, + } + } } impl From for LogicalPlan { @@ -101,3 +114,156 @@ impl From for LogicalPlan { Self::DfPlan(plan) } } + +struct TableNamesExtractAndRewriter { + pub(crate) table_names: HashSet, + query_ctx: QueryContextRef, +} + +impl TreeNodeRewriter for TableNamesExtractAndRewriter { + type Node = DfLogicalPlan; + + /// descend + fn f_down<'a>( + &mut self, + node: Self::Node, + ) -> datafusion::error::Result> { + match node { + DfLogicalPlan::TableScan(mut scan) => { + if let Some(source) = scan.source.as_any().downcast_ref::() { + if let Some(provider) = source + .table_provider + .as_any() + .downcast_ref::() + { + if provider.table().table_type() == TableType::Base { + let info = provider.table().table_info(); + self.table_names.insert(TableName::new( + info.catalog_name.clone(), + info.schema_name.clone(), + info.name.clone(), + )); + } + } + } + match &scan.table_name { + TableReference::Full { + catalog, + schema, + table, + } => { + self.table_names.insert(TableName::new( + catalog.to_string(), + schema.to_string(), + table.to_string(), + )); + } + TableReference::Partial { schema, table } => { + self.table_names.insert(TableName::new( + self.query_ctx.current_catalog(), + schema.to_string(), + table.to_string(), + )); + + scan.table_name = TableReference::Full { + catalog: self.query_ctx.current_catalog().into(), + schema: schema.clone(), + table: table.clone(), + }; + } + TableReference::Bare { table } => { + self.table_names.insert(TableName::new( + self.query_ctx.current_catalog(), + self.query_ctx.current_schema(), + table.to_string(), + )); + + scan.table_name = TableReference::Full { + catalog: self.query_ctx.current_catalog().into(), + schema: self.query_ctx.current_schema().into(), + table: table.clone(), + }; + } + } + Ok(Transformed::yes(DfLogicalPlan::TableScan(scan))) + } + node => Ok(Transformed::no(node)), + } + } +} + +impl TableNamesExtractAndRewriter { + fn new(query_ctx: QueryContextRef) -> Self { + Self { + query_ctx, + table_names: HashSet::new(), + } + } +} + +/// Extracts and rewrites the table names in the plan in the fully qualified style, +/// return the table names and new plan. 
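A hypothetical call site for the helper described above and defined just below; crate paths are assumed from the imports in this hunk and are illustrative only:

```rust
// Sketch only: collect every base table the plan scans and rewrite the scans to
// catalog.schema.table form in one pass. Paths assumed from this diff.
use std::sync::Arc;

use datafusion_expr::LogicalPlan as DfLogicalPlan;
use query::plan::extract_and_rewrite_full_table_names;
use session::context::QueryContextBuilder;

fn qualify_plan(plan: DfLogicalPlan) -> query::error::Result<DfLogicalPlan> {
    let ctx = Arc::new(
        QueryContextBuilder::default()
            .current_schema("test".to_string())
            .build(),
    );
    let (table_names, rewritten) = extract_and_rewrite_full_table_names(plan, ctx)?;
    for name in &table_names {
        // TableName implements Display as catalog.schema.table
        println!("plan depends on {}", name);
    }
    Ok(rewritten)
}
```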
+pub fn extract_and_rewrite_full_table_names( + plan: DfLogicalPlan, + query_ctx: QueryContextRef, +) -> Result<(HashSet, DfLogicalPlan)> { + let mut extractor = TableNamesExtractAndRewriter::new(query_ctx); + let plan = plan.rewrite(&mut extractor).context(DataFusionSnafu)?; + Ok((extractor.table_names, plan.data)) +} + +#[cfg(test)] +pub(crate) mod tests { + + use std::sync::Arc; + + use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; + use common_catalog::consts::DEFAULT_CATALOG_NAME; + use datafusion::logical_expr::builder::LogicalTableSource; + use datafusion::logical_expr::{col, lit, LogicalPlan, LogicalPlanBuilder}; + use session::context::QueryContextBuilder; + + use super::*; + + pub(crate) fn mock_plan() -> LogicalPlan { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + Field::new("ts", DataType::Timestamp(TimeUnit::Millisecond, None), true), + ]); + let table_source = LogicalTableSource::new(SchemaRef::new(schema)); + + let projection = None; + + let builder = + LogicalPlanBuilder::scan("devices", Arc::new(table_source), projection).unwrap(); + + builder + .filter(col("id").gt(lit(500))) + .unwrap() + .build() + .unwrap() + } + + #[test] + fn test_extract_full_table_names() { + let ctx = QueryContextBuilder::default() + .current_schema("test".to_string()) + .build(); + + let (table_names, plan) = + extract_and_rewrite_full_table_names(mock_plan(), Arc::new(ctx)).unwrap(); + + assert_eq!(1, table_names.len()); + assert!(table_names.contains(&TableName::new( + DEFAULT_CATALOG_NAME.to_string(), + "test".to_string(), + "devices".to_string() + ))); + + assert_eq!( + "Filter: devices.id > Int32(500)\n TableScan: greptime.test.devices", + format!("{:?}", plan) + ); + } +} diff --git a/src/query/src/planner.rs b/src/query/src/planner.rs index 0a5c1a36de7b..7155f22510ab 100644 --- a/src/query/src/planner.rs +++ b/src/query/src/planner.rs @@ -24,7 +24,6 @@ use datafusion::execution::context::SessionState; use datafusion::sql::planner::PlannerContext; use datafusion_expr::Expr as DfExpr; use datafusion_sql::planner::{ParserOptions, SqlToRel}; -use promql::planner::PromPlanner; use promql_parser::parser::EvalStmt; use session::context::QueryContextRef; use snafu::ResultExt; @@ -34,7 +33,8 @@ use sql::statements::statement::Statement; use crate::error::{DataFusionSnafu, PlanSqlSnafu, QueryPlanSnafu, Result, SqlSnafu}; use crate::parser::QueryStatement; use crate::plan::LogicalPlan; -use crate::query_engine::QueryEngineState; +use crate::promql::planner::PromPlanner; +use crate::query_engine::{DefaultPlanDecoder, QueryEngineState}; use crate::range_select::plan_rewrite::RangePlanRewriter; use crate::{DfContextProviderAdapter, QueryEngineContext}; @@ -69,6 +69,10 @@ impl DfLogicalPlanner { self.engine_state.catalog_manager().clone(), self.engine_state.disallow_cross_catalog_query(), query_ctx.as_ref(), + Arc::new(DefaultPlanDecoder::new( + self.session_state.clone(), + &query_ctx, + )?), ); let context_provider = DfContextProviderAdapter::try_new( @@ -140,6 +144,10 @@ impl DfLogicalPlanner { self.engine_state.catalog_manager().clone(), self.engine_state.disallow_cross_catalog_query(), query_ctx.as_ref(), + Arc::new(DefaultPlanDecoder::new( + self.session_state.clone(), + &query_ctx, + )?), ); PromPlanner::stmt_to_plan(table_provider, stmt, &self.session_state) .await diff --git a/src/store-api/src/logstore/namespace.rs b/src/query/src/promql.rs similarity index 68% rename from 
src/store-api/src/logstore/namespace.rs rename to src/query/src/promql.rs index ac1b62e31bd4..06d2bbd21ae0 100644 --- a/src/store-api/src/logstore/namespace.rs +++ b/src/query/src/promql.rs @@ -12,13 +12,5 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::hash::Hash; - -/// The namespace id. -/// Usually the namespace id is identical with the region id. -pub type Id = u64; - -pub trait Namespace: Send + Sync + Clone + std::fmt::Debug + Hash + PartialEq + Eq { - /// Returns the namespace id. - fn id(&self) -> Id; -} +pub(crate) mod error; +pub mod planner; diff --git a/src/query/src/promql/error.rs b/src/query/src/promql/error.rs new file mode 100644 index 000000000000..f204cdbd7b76 --- /dev/null +++ b/src/query/src/promql/error.rs @@ -0,0 +1,229 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::any::Any; + +use common_error::ext::ErrorExt; +use common_error::status_code::StatusCode; +use common_macro::stack_trace_debug; +use datafusion::error::DataFusionError; +use promql::error::Error as PromqlError; +use promql_parser::parser::token::TokenType; +use promql_parser::parser::{Expr as PromExpr, VectorMatchCardinality}; +use snafu::{Location, Snafu}; + +#[derive(Snafu)] +#[snafu(visibility(pub))] +#[stack_trace_debug] +pub enum Error { + #[snafu(display("Unsupported expr type: {}", name))] + UnsupportedExpr { + name: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Unsupported vector matches: {:?}", name))] + UnsupportedVectorMatch { + name: VectorMatchCardinality, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Unexpected token: {:?}", token))] + UnexpectedToken { + token: TokenType, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Internal error during building DataFusion plan"))] + DataFusionPlanning { + #[snafu(source)] + error: datafusion::error::DataFusionError, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Unexpected plan or expression: {}", desc))] + UnexpectedPlanExpr { + desc: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Unknown table type, downcast failed"))] + UnknownTable { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Cannot find time index column in table {}", table))] + TimeIndexNotFound { + table: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Cannot find value columns in table {}", table))] + ValueNotFound { + table: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Failed to create PromQL plan node"))] + PromqlPlanNode { + #[snafu(source)] + source: PromqlError, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Cannot accept multiple vector as function input, PromQL expr: {:?}", + expr, + ))] + MultipleVector { + expr: PromExpr, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Table (metric) name not found, 
this indicates a procedure error in PromQL planner" + ))] + TableNameNotFound { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("General catalog error: "))] + Catalog { + #[snafu(implicit)] + location: Location, + source: catalog::error::Error, + }, + + #[snafu(display("Expect a range selector, but not found"))] + ExpectRangeSelector { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Zero range in range selector"))] + ZeroRangeSelector { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Cannot find column {col}"))] + ColumnNotFound { + col: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Found multiple metric matchers in selector"))] + MultipleMetricMatchers { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Expect a metric matcher, but not found"))] + NoMetricMatcher { + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Invalid function argument for {}", fn_name))] + FunctionInvalidArgument { + fn_name: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display( + "Attempt to combine two tables with different column sets, left: {:?}, right: {:?}", + left, + right + ))] + CombineTableColumnMismatch { + left: Vec, + right: Vec, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Multi fields calculation is not supported in {}", operator))] + MultiFieldsNotSupported { + operator: String, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Matcher operator {matcher_op} is not supported for {matcher}"))] + UnsupportedMatcherOp { + matcher_op: String, + matcher: String, + #[snafu(implicit)] + location: Location, + }, +} + +impl ErrorExt for Error { + fn status_code(&self) -> StatusCode { + use Error::*; + match self { + TimeIndexNotFound { .. } + | ValueNotFound { .. } + | UnsupportedExpr { .. } + | UnexpectedToken { .. } + | MultipleVector { .. } + | ExpectRangeSelector { .. } + | ZeroRangeSelector { .. } + | ColumnNotFound { .. } + | FunctionInvalidArgument { .. } + | UnsupportedVectorMatch { .. } + | CombineTableColumnMismatch { .. } + | UnexpectedPlanExpr { .. } + | UnsupportedMatcherOp { .. } => StatusCode::InvalidArguments, + + UnknownTable { .. } => StatusCode::Internal, + + PromqlPlanNode { source, .. } => source.status_code(), + + DataFusionPlanning { .. } => StatusCode::PlanQuery, + + TableNameNotFound { .. } => StatusCode::TableNotFound, + + MultipleMetricMatchers { .. } | NoMetricMatcher { .. } => StatusCode::InvalidSyntax, + + MultiFieldsNotSupported { .. } => StatusCode::Unsupported, + Catalog { source, .. 
} => source.status_code(), + } + } + + fn as_any(&self) -> &dyn Any { + self + } +} + +pub type Result = std::result::Result; + +impl From for DataFusionError { + fn from(err: Error) -> Self { + DataFusionError::External(Box::new(err)) + } +} diff --git a/src/promql/src/planner.rs b/src/query/src/promql/planner.rs similarity index 99% rename from src/promql/src/planner.rs rename to src/query/src/promql/planner.rs index d9a501f755bc..2a0a16a2000c 100644 --- a/src/promql/src/planner.rs +++ b/src/query/src/promql/planner.rs @@ -38,6 +38,15 @@ use datafusion_expr::utils::conjunction; use datatypes::arrow::datatypes::{DataType as ArrowDataType, TimeUnit as ArrowTimeUnit}; use datatypes::data_type::ConcreteDataType; use itertools::Itertools; +use promql::extension_plan::{ + build_special_time_expr, EmptyMetric, HistogramFold, InstantManipulate, Millisecond, + RangeManipulate, ScalarCalculate, SeriesDivide, SeriesNormalize, UnionDistinctOn, +}; +use promql::functions::{ + AbsentOverTime, AvgOverTime, Changes, CountOverTime, Delta, Deriv, HoltWinters, IDelta, + Increase, LastOverTime, MaxOverTime, MinOverTime, PredictLinear, PresentOverTime, + QuantileOverTime, Rate, Resets, StddevOverTime, StdvarOverTime, SumOverTime, +}; use promql_parser::label::{MatchOp, Matcher, Matchers, METRIC_NAME}; use promql_parser::parser::token::TokenType; use promql_parser::parser::{ @@ -49,23 +58,14 @@ use promql_parser::parser::{ use snafu::{ensure, OptionExt, ResultExt}; use table::table::adapter::DfTableProviderAdapter; -use crate::error::{ +use crate::promql::error::{ CatalogSnafu, ColumnNotFoundSnafu, CombineTableColumnMismatchSnafu, DataFusionPlanningSnafu, ExpectRangeSelectorSnafu, FunctionInvalidArgumentSnafu, MultiFieldsNotSupportedSnafu, - MultipleMetricMatchersSnafu, MultipleVectorSnafu, NoMetricMatcherSnafu, Result, - TableNameNotFoundSnafu, TimeIndexNotFoundSnafu, UnexpectedPlanExprSnafu, UnexpectedTokenSnafu, - UnknownTableSnafu, UnsupportedExprSnafu, UnsupportedMatcherOpSnafu, + MultipleMetricMatchersSnafu, MultipleVectorSnafu, NoMetricMatcherSnafu, PromqlPlanNodeSnafu, + Result, TableNameNotFoundSnafu, TimeIndexNotFoundSnafu, UnexpectedPlanExprSnafu, + UnexpectedTokenSnafu, UnknownTableSnafu, UnsupportedExprSnafu, UnsupportedMatcherOpSnafu, UnsupportedVectorMatchSnafu, ValueNotFoundSnafu, ZeroRangeSelectorSnafu, }; -use crate::extension_plan::{ - build_special_time_expr, EmptyMetric, HistogramFold, InstantManipulate, Millisecond, - RangeManipulate, ScalarCalculate, SeriesDivide, SeriesNormalize, UnionDistinctOn, -}; -use crate::functions::{ - AbsentOverTime, AvgOverTime, Changes, CountOverTime, Delta, Deriv, HoltWinters, IDelta, - Increase, LastOverTime, MaxOverTime, MinOverTime, PredictLinear, PresentOverTime, - QuantileOverTime, Rate, Resets, StddevOverTime, StdvarOverTime, SumOverTime, -}; /// `time()` function in PromQL. 
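The `impl From<Error> for DataFusionError` at the end of this error module boxes the planner error into DataFusion's `External` variant so it can travel through DataFusion APIs. A std-only stand-in of that pattern (the types here are simplified placeholders, not the real `DataFusionError`):

```rust
// Minimal sketch: a crate error boxed into an opaque "external" variant.
use std::error::Error as StdError;
use std::fmt;

#[derive(Debug)]
struct PlannerError(String);

impl fmt::Display for PlannerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "PromQL planner error: {}", self.0)
    }
}

impl StdError for PlannerError {}

// Stand-in for DataFusionError::External(Box<dyn Error + Send + Sync>).
#[derive(Debug)]
enum EngineError {
    External(Box<dyn StdError + Send + Sync>),
}

impl From<PlannerError> for EngineError {
    fn from(err: PlannerError) -> Self {
        EngineError::External(Box::new(err))
    }
}

fn main() {
    let err: EngineError = PlannerError("zero range in range selector".to_string()).into();
    println!("{err:?}");
}
```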
const SPECIAL_TIME_FUNCTION: &str = "time"; @@ -1538,16 +1538,19 @@ impl PromPlanner { }, ); let scalar_plan = LogicalPlan::Extension(Extension { - node: Arc::new(ScalarCalculate::new( - self.ctx.start, - self.ctx.end, - self.ctx.interval, - input, - self.ctx.time_index_column.as_ref().unwrap(), - &self.ctx.tag_columns, - &self.ctx.field_columns[0], - self.ctx.table_name.as_deref(), - )?), + node: Arc::new( + ScalarCalculate::new( + self.ctx.start, + self.ctx.end, + self.ctx.interval, + input, + self.ctx.time_index_column.as_ref().unwrap(), + &self.ctx.tag_columns, + &self.ctx.field_columns[0], + self.ctx.table_name.as_deref(), + ) + .context(PromqlPlanNodeSnafu)?, + ), }); // scalar plan have no tag columns self.ctx.tag_columns.clear(); @@ -2196,6 +2199,7 @@ mod test { use catalog::memory::MemoryCatalogManager; use catalog::RegisterTableRequest; use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME}; + use common_query::test_util::DummyDecoder; use datafusion::execution::runtime_env::RuntimeEnv; use datatypes::prelude::ConcreteDataType; use datatypes::schema::{ColumnSchema, Schema}; @@ -2268,7 +2272,12 @@ mod test { .is_ok()); } - DfTableSourceProvider::new(catalog_list, false, QueryContext::arc().as_ref()) + DfTableSourceProvider::new( + catalog_list, + false, + QueryContext::arc().as_ref(), + DummyDecoder::arc(), + ) } // { @@ -3102,7 +3111,12 @@ mod test { .is_ok()); let plan = PromPlanner::stmt_to_plan( - DfTableSourceProvider::new(catalog_list.clone(), false, QueryContext::arc().as_ref()), + DfTableSourceProvider::new( + catalog_list.clone(), + false, + QueryContext::arc().as_ref(), + DummyDecoder::arc(), + ), EvalStmt { expr: parser::parse("metrics{tag = \"1\"}").unwrap(), start: UNIX_EPOCH, @@ -3126,7 +3140,12 @@ mod test { \n TableScan: metrics [tag:Utf8, timestamp:Timestamp(Nanosecond, None), field:Float64;N]" ); let plan = PromPlanner::stmt_to_plan( - DfTableSourceProvider::new(catalog_list.clone(), false, QueryContext::arc().as_ref()), + DfTableSourceProvider::new( + catalog_list.clone(), + false, + QueryContext::arc().as_ref(), + DummyDecoder::arc(), + ), EvalStmt { expr: parser::parse("avg_over_time(metrics{tag = \"1\"}[5s])").unwrap(), start: UNIX_EPOCH, diff --git a/src/query/src/query_engine.rs b/src/query/src/query_engine.rs index 18923f3b96ad..1beea2a1c2d2 100644 --- a/src/query/src/query_engine.rs +++ b/src/query/src/query_engine.rs @@ -13,9 +13,9 @@ // limitations under the License. 
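`ScalarCalculate::new` is now fallible, and the planner attaches the `PromqlPlanNode` variant through snafu's context selector. A minimal, self-contained sketch of that pattern (the fallible call and its source error are stand-ins, not the real plan node):

```rust
use snafu::{ResultExt, Snafu};

#[derive(Debug, Snafu)]
enum Error {
    #[snafu(display("Failed to create PromQL plan node"))]
    PromqlPlanNode { source: std::num::ParseIntError },
}

// Stand-in for the fallible ScalarCalculate::new call.
fn scalar_calculate_new(input: &str) -> Result<i64, std::num::ParseIntError> {
    input.parse::<i64>()
}

fn plan_scalar(input: &str) -> Result<i64, Error> {
    // Same pattern as the hunk above: wrap the source error with the
    // PromqlPlanNode context selector generated by snafu.
    scalar_calculate_new(input).context(PromqlPlanNodeSnafu)
}

fn main() {
    assert!(plan_scalar("42").is_ok());
    assert!(plan_scalar("not-a-number").is_err());
}
```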
mod context; +mod default_serializer; pub mod options; mod state; - use std::any::Any; use std::sync::Arc; @@ -29,6 +29,7 @@ use common_function::scalars::aggregate::AggregateFunctionMetaRef; use common_query::prelude::ScalarUdf; use common_query::Output; use datatypes::schema::Schema; +pub use default_serializer::{DefaultPlanDecoder, DefaultSerializer}; use session::context::QueryContextRef; use table::TableRef; diff --git a/src/query/src/query_engine/context.rs b/src/query/src/query_engine/context.rs index f76332cde2b4..c527e9d40557 100644 --- a/src/query/src/query_engine/context.rs +++ b/src/query/src/query_engine/context.rs @@ -14,10 +14,13 @@ use std::sync::Arc; +use common_query::logical_plan::SubstraitPlanDecoderRef; use common_telemetry::tracing_context::TracingContext; use datafusion::execution::context::{SessionState, TaskContext}; use session::context::QueryContextRef; +use crate::query_engine::default_serializer::DefaultPlanDecoder; + #[derive(Debug)] pub struct QueryEngineContext { state: SessionState, @@ -58,6 +61,14 @@ impl QueryEngineContext { )) } + /// Creates a [`LogicalPlan`] decoder + pub fn new_plan_decoder(&self) -> crate::error::Result { + Ok(Arc::new(DefaultPlanDecoder::new( + self.state.clone(), + &self.query_ctx, + )?)) + } + /// Mock an engine context for unit tests. #[cfg(any(test, feature = "test"))] pub fn mock() -> Self { diff --git a/src/query/src/query_engine/default_serializer.rs b/src/query/src/query_engine/default_serializer.rs new file mode 100644 index 000000000000..ff341a26ed82 --- /dev/null +++ b/src/query/src/query_engine/default_serializer.rs @@ -0,0 +1,171 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::sync::Arc; + +use common_error::ext::BoxedError; +use common_function::function_registry::FUNCTION_REGISTRY; +use common_function::scalars::udf::create_udf; +use common_query::logical_plan::SubstraitPlanDecoder; +use datafusion::catalog::CatalogProviderList; +use datafusion::common::DataFusionError; +use datafusion::error::Result; +use datafusion::execution::context::SessionState; +use datafusion::execution::registry::SerializerRegistry; +use datafusion::execution::FunctionRegistry; +use datafusion::logical_expr::LogicalPlan; +use datafusion_expr::UserDefinedLogicalNode; +use greptime_proto::substrait_extension::MergeScan as PbMergeScan; +use prost::Message; +use session::context::QueryContextRef; +use snafu::ResultExt; +use substrait::extension_serializer::ExtensionSerializer; +use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan}; + +use crate::dist_plan::MergeScanLogicalPlan; +use crate::error::DataFusionSnafu; + +/// Extended [`substrait::extension_serializer::ExtensionSerializer`] but supports [`MergeScanLogicalPlan`] serialization. 
+pub struct DefaultSerializer; + +impl SerializerRegistry for DefaultSerializer { + fn serialize_logical_plan(&self, node: &dyn UserDefinedLogicalNode) -> Result> { + if node.name() == MergeScanLogicalPlan::name() { + let merge_scan = node + .as_any() + .downcast_ref::() + .expect("Failed to downcast to MergeScanLogicalPlan"); + + let input = merge_scan.input(); + let is_placeholder = merge_scan.is_placeholder(); + let input = DFLogicalSubstraitConvertor + .encode(input, DefaultSerializer) + .map_err(|e| DataFusionError::External(Box::new(e)))? + .to_vec(); + + Ok(PbMergeScan { + is_placeholder, + input, + } + .encode_to_vec()) + } else { + ExtensionSerializer.serialize_logical_plan(node) + } + } + + fn deserialize_logical_plan( + &self, + name: &str, + bytes: &[u8], + ) -> Result> { + if name == MergeScanLogicalPlan::name() { + // TODO(dennis): missing `session_state` to decode the logical plan in `MergeScanLogicalPlan`, + // so we only save the unoptimized logical plan for view currently. + Err(DataFusionError::Substrait(format!( + "Unsupported plan node: {name}" + ))) + } else { + ExtensionSerializer.deserialize_logical_plan(name, bytes) + } + } +} + +/// The datafusion `[LogicalPlan]` decoder. +pub struct DefaultPlanDecoder { + session_state: SessionState, +} + +impl DefaultPlanDecoder { + pub fn new( + mut session_state: SessionState, + query_ctx: &QueryContextRef, + ) -> crate::error::Result { + // Substrait decoder will look up the UDFs in SessionState, so we need to register them + // Note: the query context must be passed to set the timezone + for func in FUNCTION_REGISTRY.functions() { + let udf = Arc::new(create_udf(func, query_ctx.clone(), Default::default()).into()); + session_state.register_udf(udf).context(DataFusionSnafu)?; + } + + Ok(Self { session_state }) + } +} + +#[async_trait::async_trait] +impl SubstraitPlanDecoder for DefaultPlanDecoder { + async fn decode( + &self, + message: bytes::Bytes, + catalog_list: Arc, + optimize: bool, + ) -> common_query::error::Result { + // The session_state already has the `DefaultSerialzier` as `SerializerRegistry`. 
+ let logical_plan = DFLogicalSubstraitConvertor + .decode(message, catalog_list.clone(), self.session_state.clone()) + .await + .map_err(BoxedError::new) + .context(common_query::error::DecodePlanSnafu)?; + + if optimize { + self.session_state + .optimize(&logical_plan) + .context(common_query::error::GeneralDataFusionSnafu) + } else { + Ok(logical_plan) + } + } +} + +#[cfg(test)] +mod tests { + use session::context::QueryContext; + + use super::*; + use crate::dummy_catalog::DummyCatalogList; + use crate::optimizer::test_util::mock_table_provider; + use crate::plan::tests::mock_plan; + use crate::QueryEngineFactory; + + #[tokio::test] + async fn test_serializer_decode_plan() { + let catalog_list = catalog::memory::new_memory_catalog_manager().unwrap(); + let factory = QueryEngineFactory::new(catalog_list, None, None, None, false); + + let engine = factory.query_engine(); + + let plan = mock_plan(); + + let bytes = DFLogicalSubstraitConvertor + .encode(&plan, DefaultSerializer) + .unwrap(); + + let plan_decoder = engine + .engine_context(QueryContext::arc()) + .new_plan_decoder() + .unwrap(); + let table_provider = Arc::new(mock_table_provider(1.into())); + let catalog_list = Arc::new(DummyCatalogList::with_table_provider(table_provider)); + + let decode_plan = plan_decoder + .decode(bytes, catalog_list, false) + .await + .unwrap(); + + assert_eq!( + "Filter: devices.k0 > Int32(500) + TableScan: devices projection=[k0, ts, v0]", + format!("{:?}", decode_plan), + ); + } +} diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs index 9fdee8fc0e36..51b3f82ef228 100644 --- a/src/query/src/query_engine/state.rs +++ b/src/query/src/query_engine/state.rs @@ -37,7 +37,6 @@ use datafusion_optimizer::analyzer::count_wildcard_rule::CountWildcardRule; use datafusion_optimizer::analyzer::{Analyzer, AnalyzerRule}; use datafusion_optimizer::optimizer::Optimizer; use promql::extension_plan::PromExtensionPlanner; -use substrait::extension_serializer::ExtensionSerializer; use table::table::adapter::DfTableProviderAdapter; use table::TableRef; @@ -49,6 +48,7 @@ use crate::optimizer::string_normalization::StringNormalizationRule; use crate::optimizer::type_conversion::TypeConversionRule; use crate::optimizer::ExtensionAnalyzerRule; use crate::query_engine::options::QueryOptions; +use crate::query_engine::DefaultSerializer; use crate::range_select::planner::RangeSelectPlanner; use crate::region_query::RegionQueryHandlerRef; use crate::QueryEngineContext; @@ -115,8 +115,8 @@ impl QueryEngineState { physical_optimizer.rules.push(Arc::new(RemoveDuplicate)); let session_state = SessionState::new_with_config_rt(session_config, runtime_env) - .with_serializer_registry(Arc::new(ExtensionSerializer)) .with_analyzer_rules(analyzer.rules) + .with_serializer_registry(Arc::new(DefaultSerializer)) .with_query_planner(Arc::new(DfQueryPlanner::new( catalog_list.clone(), region_query_handler, diff --git a/src/store-api/src/logstore.rs b/src/store-api/src/logstore.rs index 33739ac85fb4..347643982716 100644 --- a/src/store-api/src/logstore.rs +++ b/src/store-api/src/logstore.rs @@ -14,68 +14,64 @@ //! LogStore APIs. 
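The `query_engine/state.rs` hunk above swaps `ExtensionSerializer` for `DefaultSerializer` when the session state is built. Roughly, the hookup looks like this (a sketch using the DataFusion builder calls seen elsewhere in this diff; the config values and function name are illustrative):

```rust
use std::sync::Arc;

use datafusion::execution::context::SessionState;
use datafusion::execution::runtime_env::RuntimeEnv;
use datafusion::prelude::SessionConfig;
use query::query_engine::DefaultSerializer;

fn session_state_with_serializer() -> SessionState {
    let session_config = SessionConfig::new();
    let runtime_env = Arc::new(RuntimeEnv::default());
    // Registering DefaultSerializer lets substrait encoding handle MergeScan
    // extension nodes in addition to the plain ExtensionSerializer behaviour.
    SessionState::new_with_config_rt(session_config, runtime_env)
        .with_serializer_registry(Arc::new(DefaultSerializer))
}
```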
+pub mod entry; +pub mod provider; + use std::collections::HashMap; +use std::pin::Pin; use common_error::ext::ErrorExt; -use common_wal::options::WalOptions; +use entry::Entry; +use futures::Stream; -use crate::logstore::entry::Entry; -pub use crate::logstore::entry::Id as EntryId; -use crate::logstore::entry_stream::SendableEntryStream; -pub use crate::logstore::namespace::Id as NamespaceId; -use crate::logstore::namespace::Namespace; +pub type SendableEntryStream<'a, I, E> = Pin, E>> + Send + 'a>>; -pub mod entry; -pub mod entry_stream; -pub mod namespace; +pub use crate::logstore::entry::Id as EntryId; +use crate::logstore::provider::Provider; +use crate::storage::RegionId; /// `LogStore` serves as a Write-Ahead-Log for storage engine. #[async_trait::async_trait] pub trait LogStore: Send + Sync + 'static + std::fmt::Debug { type Error: ErrorExt + Send + Sync + 'static; - type Namespace: Namespace; - type Entry: Entry; /// Stops components of the logstore. async fn stop(&self) -> Result<(), Self::Error>; - /// Appends an entry to the log store and returns a response containing the id of the append entry. - async fn append(&self, entry: Self::Entry) -> Result; - /// Appends a batch of entries and returns a response containing a map where the key is a region id /// while the value is the id of the last successfully written entry of the region. - async fn append_batch( - &self, - entries: Vec, - ) -> Result; + async fn append_batch(&self, entries: Vec) -> Result; /// Creates a new `EntryStream` to asynchronously generates `Entry` with ids /// starting from `id`. async fn read( &self, - ns: &Self::Namespace, + provider: &Provider, id: EntryId, - ) -> Result, Self::Error>; + ) -> Result, Self::Error>; /// Creates a new `Namespace` from the given ref. - async fn create_namespace(&self, ns: &Self::Namespace) -> Result<(), Self::Error>; + async fn create_namespace(&self, ns: &Provider) -> Result<(), Self::Error>; /// Deletes an existing `Namespace` specified by the given ref. - async fn delete_namespace(&self, ns: &Self::Namespace) -> Result<(), Self::Error>; + async fn delete_namespace(&self, ns: &Provider) -> Result<(), Self::Error>; /// Lists all existing namespaces. - async fn list_namespaces(&self) -> Result, Self::Error>; + async fn list_namespaces(&self) -> Result, Self::Error>; /// Marks all entries with ids `<=entry_id` of the given `namespace` as obsolete, /// so that the log store can safely delete those entries. This method does not guarantee /// that the obsolete entries are deleted immediately. - async fn obsolete(&self, ns: Self::Namespace, entry_id: EntryId) -> Result<(), Self::Error>; + async fn obsolete(&self, provider: &Provider, entry_id: EntryId) -> Result<(), Self::Error>; /// Makes an entry instance of the associated Entry type - fn entry(&self, data: &mut Vec, entry_id: EntryId, ns: Self::Namespace) -> Self::Entry; - - /// Makes a namespace instance of the associated Namespace type - fn namespace(&self, ns_id: NamespaceId, wal_options: &WalOptions) -> Self::Namespace; + fn entry( + &self, + data: &mut Vec, + entry_id: EntryId, + region_id: RegionId, + provider: &Provider, + ) -> Result; } /// The response of an `append` operation. @@ -89,5 +85,5 @@ pub struct AppendResponse { #[derive(Debug, Default)] pub struct AppendBatchResponse { /// Key: region id (as u64). Value: the id of the last successfully written entry of the region. 
- pub last_entry_ids: HashMap, + pub last_entry_ids: HashMap, } diff --git a/src/store-api/src/logstore/entry.rs b/src/store-api/src/logstore/entry.rs index 09daa2e1abb9..8b7f838be17a 100644 --- a/src/store-api/src/logstore/entry.rs +++ b/src/store-api/src/logstore/entry.rs @@ -12,58 +12,141 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::mem::size_of; + +use crate::logstore::provider::Provider; use crate::storage::RegionId; /// An entry's id. /// Different log store implementations may interpret the id to different meanings. pub type Id = u64; -/// The raw Wal entry. +/// The [Entry::Naive] is used in RaftEngineLogStore and KafkaLogStore. +/// +/// The [Entry::MultiplePart] contains multiple parts of data that split from a large entry, is used in KafkaLogStore, #[derive(Debug, Clone, PartialEq, Eq)] -pub struct RawEntry { +pub enum Entry { + Naive(NaiveEntry), + MultiplePart(MultiplePartEntry), +} + +impl Entry { + /// Into [NaiveEntry] if it's type of [Entry::Naive]. + pub fn into_naive_entry(self) -> Option { + match self { + Entry::Naive(entry) => Some(entry), + Entry::MultiplePart(_) => None, + } + } + + /// Into [MultiplePartEntry] if it's type of [Entry::MultiplePart]. + pub fn into_multiple_part_entry(self) -> Option { + match self { + Entry::Naive(_) => None, + Entry::MultiplePart(entry) => Some(entry), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct NaiveEntry { + pub provider: Provider, pub region_id: RegionId, pub entry_id: Id, pub data: Vec, } -impl Entry for RawEntry { - fn into_raw_entry(self) -> RawEntry { - self +impl NaiveEntry { + fn estimated_size(&self) -> usize { + size_of::() + self.data.capacity() * size_of::() } +} - fn data(&self) -> &[u8] { - &self.data - } +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum MultiplePartHeader { + First, + Middle(usize), + Last, +} - fn id(&self) -> Id { - self.entry_id - } +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct MultiplePartEntry { + pub provider: Provider, + pub region_id: RegionId, + pub entry_id: Id, + pub headers: Vec, + pub parts: Vec>, +} - fn region_id(&self) -> RegionId { - self.region_id +impl MultiplePartEntry { + fn is_complete(&self) -> bool { + self.headers.contains(&MultiplePartHeader::First) + && self.headers.contains(&MultiplePartHeader::Last) } fn estimated_size(&self) -> usize { - std::mem::size_of_val(self) + size_of::() + + self + .parts + .iter() + .map(|data| data.capacity() * size_of::()) + .sum::() + + self.headers.capacity() * size_of::() } } -/// Entry is the minimal data storage unit through which users interact with the log store. -/// The log store implementation may have larger or smaller data storage unit than an entry. -pub trait Entry: Send + Sync { - /// Consumes [Entry] and converts to [RawEntry]. - fn into_raw_entry(self) -> RawEntry; +impl Entry { + /// Returns the [Provider] + pub fn provider(&self) -> &Provider { + match self { + Entry::Naive(entry) => &entry.provider, + Entry::MultiplePart(entry) => &entry.provider, + } + } - /// Returns the contained data of the entry. - fn data(&self) -> &[u8]; + /// Returns the [RegionId] + pub fn region_id(&self) -> RegionId { + match self { + Entry::Naive(entry) => entry.region_id, + Entry::MultiplePart(entry) => entry.region_id, + } + } - /// Returns the id of the entry. - /// Usually the namespace id is identical with the region id. 
- fn id(&self) -> Id; + /// Returns the [Id] + pub fn entry_id(&self) -> Id { + match self { + Entry::Naive(entry) => entry.entry_id, + Entry::MultiplePart(entry) => entry.entry_id, + } + } - /// Returns the [RegionId] - fn region_id(&self) -> RegionId; + /// Returns the [Id] + pub fn set_entry_id(&mut self, id: Id) { + match self { + Entry::Naive(entry) => entry.entry_id = id, + Entry::MultiplePart(entry) => entry.entry_id = id, + } + } + + /// Returns true if it's a complete entry. + pub fn is_complete(&self) -> bool { + match self { + Entry::Naive(_) => true, + Entry::MultiplePart(entry) => entry.is_complete(), + } + } - /// Computes the estimated encoded size. - fn estimated_size(&self) -> usize; + pub fn into_bytes(self) -> Vec { + match self { + Entry::Naive(entry) => entry.data, + Entry::MultiplePart(entry) => entry.parts.concat(), + } + } + + pub fn estimated_size(&self) -> usize { + match self { + Entry::Naive(entry) => entry.estimated_size(), + Entry::MultiplePart(entry) => entry.estimated_size(), + } + } } diff --git a/src/store-api/src/logstore/entry_stream.rs b/src/store-api/src/logstore/entry_stream.rs deleted file mode 100644 index 6a5886b0b53f..000000000000 --- a/src/store-api/src/logstore/entry_stream.rs +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
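With the new `Entry` enum above, a multi-part entry is complete once both `First` and `Last` headers are present, and `into_bytes` concatenates its parts back into one payload. A small sketch using the types from this hunk (ids, provider and payloads are made up):

```rust
use store_api::logstore::entry::{Entry, MultiplePartEntry, MultiplePartHeader};
use store_api::logstore::provider::Provider;
use store_api::storage::RegionId;

fn reassemble() -> Vec<u8> {
    let region_id = RegionId::new(1024, 1);
    let entry = Entry::MultiplePart(MultiplePartEntry {
        provider: Provider::raft_engine_provider(region_id.as_u64()),
        region_id,
        entry_id: 7,
        headers: vec![MultiplePartHeader::First, MultiplePartHeader::Last],
        parts: vec![b"hello ".to_vec(), b"wal".to_vec()],
    });
    // Both First and Last parts are present, so the entry is complete and its
    // parts can be concatenated into the original payload.
    assert!(entry.is_complete());
    entry.into_bytes()
}

fn main() {
    assert_eq!(reassemble(), b"hello wal".to_vec());
}
```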
- -use std::pin::Pin; - -use common_error::ext::ErrorExt; -use futures::Stream; - -use crate::logstore::entry::Entry; - -pub trait EntryStream: Stream, Self::Error>> { - type Error: ErrorExt; - type Entry: Entry; - - fn start_id(&self) -> u64; -} - -pub type SendableEntryStream<'a, I, E> = Pin, E>> + Send + 'a>>; - -#[cfg(test)] -mod tests { - use std::any::Any; - use std::task::{Context, Poll}; - - use common_error::ext::StackError; - use futures::StreamExt; - use snafu::Snafu; - - use super::*; - pub use crate::logstore::entry::Id; - use crate::logstore::entry::RawEntry; - use crate::storage::RegionId; - - pub struct SimpleEntry { - /// Binary data of current entry - data: Vec, - } - - #[derive(Debug, Snafu)] - #[snafu(visibility(pub))] - pub struct Error {} - - impl ErrorExt for Error { - fn as_any(&self) -> &dyn Any { - self - } - } - - impl StackError for Error { - fn debug_fmt(&self, _: usize, _: &mut Vec) {} - - fn next(&self) -> Option<&dyn StackError> { - None - } - } - - impl Entry for SimpleEntry { - fn into_raw_entry(self) -> RawEntry { - RawEntry { - region_id: RegionId::from_u64(0), - entry_id: 0, - data: vec![], - } - } - - fn data(&self) -> &[u8] { - &self.data - } - - fn id(&self) -> Id { - 0u64 - } - - fn region_id(&self) -> RegionId { - RegionId::from_u64(0) - } - - fn estimated_size(&self) -> usize { - self.data.len() - } - } - - impl SimpleEntry { - pub fn new(data: impl AsRef<[u8]>) -> Self { - let data = data.as_ref().to_vec(); - Self { data } - } - } - - pub struct EntryStreamImpl<'a> { - inner: SendableEntryStream<'a, SimpleEntry, Error>, - start_id: u64, - } - - impl<'a> EntryStream for EntryStreamImpl<'a> { - type Error = Error; - type Entry = SimpleEntry; - - fn start_id(&self) -> u64 { - self.start_id - } - } - - impl Stream for EntryStreamImpl<'_> { - type Item = Result, Error>; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match Pin::new(&mut self.inner).poll_next(cx) { - Poll::Ready(Some(v)) => Poll::Ready(Some(v)), - Poll::Ready(None) => Poll::Ready(None), - Poll::Pending => Poll::Pending, - } - } - } - - #[tokio::test] - pub async fn test_entry_stream() { - let stream = - async_stream::stream!(yield Ok(vec![SimpleEntry::new("test_entry".as_bytes())])); - - let mut stream_impl = EntryStreamImpl { - inner: Box::pin(stream), - start_id: 1234, - }; - - if let Some(v) = stream_impl.next().await { - let vec = v.unwrap(); - assert_eq!(1, vec.len()); - assert_eq!(b"test_entry", vec[0].data()); - } - } -} diff --git a/src/store-api/src/logstore/provider.rs b/src/store-api/src/logstore/provider.rs new file mode 100644 index 000000000000..f893a47df54f --- /dev/null +++ b/src/store-api/src/logstore/provider.rs @@ -0,0 +1,110 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
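The `SendableEntryStream` alias that used to live in the deleted `entry_stream.rs` now sits directly in `logstore.rs`; written out in full it is a pinned, boxed stream of `Result<Vec<I>, E>`. A minimal consumer sketch (assumes `tokio` and `futures`; the payloads are illustrative):

```rust
use std::pin::Pin;

use futures::{stream, Stream, StreamExt};

pub type SendableEntryStream<'a, I, E> =
    Pin<Box<dyn Stream<Item = Result<Vec<I>, E>> + Send + 'a>>;

#[tokio::main]
async fn main() {
    let batches: Vec<Result<Vec<Vec<u8>>, std::io::Error>> =
        vec![Ok(vec![b"entry-1".to_vec(), b"entry-2".to_vec()])];
    let mut entries: SendableEntryStream<'static, Vec<u8>, std::io::Error> =
        Box::pin(stream::iter(batches));

    // Each stream item is a batch of entries read from the WAL.
    while let Some(batch) = entries.next().await {
        println!("read {} entries", batch.unwrap().len());
    }
}
```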
+ +use std::fmt::Display; +use std::sync::Arc; + +use crate::storage::RegionId; + +// The Provider of kafka log store +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct KafkaProvider { + pub topic: String, +} + +impl KafkaProvider { + pub fn new(topic: String) -> Self { + Self { topic } + } + + /// Returns the type name. + pub fn type_name() -> &'static str { + "KafkaProvider" + } +} + +impl Display for KafkaProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.topic) + } +} + +// The Provider of raft engine log store +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct RaftEngineProvider { + pub id: u64, +} + +impl RaftEngineProvider { + pub fn new(id: u64) -> Self { + Self { id } + } + + /// Returns the type name. + pub fn type_name() -> &'static str { + "RaftEngineProvider" + } +} + +/// The Provider of LogStore +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Provider { + RaftEngine(RaftEngineProvider), + Kafka(Arc), +} + +impl Display for Provider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self { + Provider::RaftEngine(provider) => { + write!(f, "region: {}", RegionId::from_u64(provider.id)) + } + Provider::Kafka(provider) => write!(f, "topic: {}", provider.topic), + } + } +} + +impl Provider { + pub fn raft_engine_provider(id: u64) -> Provider { + Provider::RaftEngine(RaftEngineProvider { id }) + } + + pub fn kafka_provider(topic: String) -> Provider { + Provider::Kafka(Arc::new(KafkaProvider { topic })) + } + + /// Returns the type name. + pub fn type_name(&self) -> &'static str { + match self { + Provider::RaftEngine(_) => RaftEngineProvider::type_name(), + Provider::Kafka(_) => KafkaProvider::type_name(), + } + } + + /// Returns the reference of [`RaftEngineProvider`] if it's the type of [`LogStoreProvider::RaftEngine`]. + pub fn as_raft_engine_provider(&self) -> Option<&RaftEngineProvider> { + if let Provider::RaftEngine(ns) = self { + return Some(ns); + } + None + } + + /// Returns the reference of [`KafkaProvider`] if it's the type of [`LogStoreProvider::Kafka`]. + pub fn as_kafka_provider(&self) -> Option<&Arc> { + if let Provider::Kafka(ns) = self { + return Some(ns); + } + None + } +} diff --git a/src/store-api/src/region_engine.rs b/src/store-api/src/region_engine.rs index 91813c91295b..3f9d58a95588 100644 --- a/src/store-api/src/region_engine.rs +++ b/src/store-api/src/region_engine.rs @@ -200,7 +200,7 @@ pub trait RegionEngine: Send + Sync { async fn get_metadata(&self, region_id: RegionId) -> Result; /// Retrieves region's disk usage. 
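A small usage sketch for the `Provider` enum above (paths taken from this hunk; the topic name is made up for illustration):

```rust
use store_api::logstore::provider::Provider;
use store_api::storage::RegionId;

fn describe(provider: &Provider) -> String {
    match provider {
        Provider::RaftEngine(p) => format!("raft-engine wal, region {}", RegionId::from_u64(p.id)),
        Provider::Kafka(p) => format!("kafka wal, topic {}", p.topic),
    }
}

fn main() {
    let raft = Provider::raft_engine_provider(RegionId::new(1024, 0).as_u64());
    let kafka = Provider::kafka_provider("wal_topic_0".to_string());
    assert!(raft.as_raft_engine_provider().is_some());
    assert!(kafka.as_kafka_provider().is_some());
    println!("{}", describe(&raft));
    println!("{}", describe(&kafka));
}
```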
- async fn region_disk_usage(&self, region_id: RegionId) -> Option; + fn region_disk_usage(&self, region_id: RegionId) -> Option; /// Stops the engine async fn stop(&self) -> Result<(), BoxedError>; diff --git a/src/table/Cargo.toml b/src/table/Cargo.toml index 9463b1809fe7..b33f4757c66e 100644 --- a/src/table/Cargo.toml +++ b/src/table/Cargo.toml @@ -11,6 +11,7 @@ testing = [] workspace = true [dependencies] +api.workspace = true async-trait = "0.1" chrono.workspace = true common-base.workspace = true diff --git a/src/table/src/lib.rs b/src/table/src/lib.rs index 857d529e8add..f4eb68cc85c7 100644 --- a/src/table/src/lib.rs +++ b/src/table/src/lib.rs @@ -21,6 +21,7 @@ pub mod predicate; pub mod requests; pub mod stats; pub mod table; +pub mod table_name; pub mod table_reference; pub mod test_util; diff --git a/src/common/meta/src/table_name.rs b/src/table/src/table_name.rs similarity index 98% rename from src/common/meta/src/table_name.rs rename to src/table/src/table_name.rs index 645e6386df02..f999e013f243 100644 --- a/src/common/meta/src/table_name.rs +++ b/src/table/src/table_name.rs @@ -16,7 +16,8 @@ use std::fmt::{Display, Formatter}; use api::v1::TableName as PbTableName; use serde::{Deserialize, Serialize}; -use table::table_reference::TableReference; + +use crate::table_reference::TableReference; #[derive(Debug, Clone, Hash, Eq, PartialEq, Deserialize, Serialize)] pub struct TableName { diff --git a/tests-integration/src/cluster.rs b/tests-integration/src/cluster.rs index bfa59966ae8e..7c0bb2f1d0ba 100644 --- a/tests-integration/src/cluster.rs +++ b/tests-integration/src/cluster.rs @@ -364,16 +364,12 @@ impl GreptimeDbClusterBuilder { .build(), ); - let table_cache = cache_registry.get().unwrap(); - let table_route_cache = cache_registry.get().unwrap(); let catalog_manager = KvBackendCatalogManager::new( Mode::Distributed, Some(meta_client.clone()), cached_meta_backend.clone(), - table_cache, - table_route_cache, - ) - .await; + cache_registry.clone(), + ); let handlers_executor = HandlerGroupExecutor::new(vec![ Arc::new(ParseMailboxMessageHandler), diff --git a/tests-integration/src/grpc.rs b/tests-integration/src/grpc.rs index 6d7179b18da1..ed2d6425a439 100644 --- a/tests-integration/src/grpc.rs +++ b/tests-integration/src/grpc.rs @@ -34,6 +34,7 @@ mod test { use frontend::instance::Instance; use query::parser::QueryLanguageParser; use query::plan::LogicalPlan; + use query::query_engine::DefaultSerializer; use servers::query_handler::grpc::GrpcQueryHandler; use session::context::QueryContext; use store_api::storage::RegionId; @@ -544,7 +545,9 @@ CREATE TABLE {table_name} ( .plan(stmt, QueryContext::arc()) .await .unwrap(); - let plan = DFLogicalSubstraitConvertor.encode(&plan).unwrap(); + let plan = DFLogicalSubstraitConvertor + .encode(&plan, DefaultSerializer) + .unwrap(); for (region, dn) in region_to_dn_map.iter() { let region_server = instance.datanodes().get(dn).unwrap().region_server(); diff --git a/tests-integration/src/instance.rs b/tests-integration/src/instance.rs index 1e52162ef3ca..feff39e136c0 100644 --- a/tests-integration/src/instance.rs +++ b/tests-integration/src/instance.rs @@ -32,6 +32,7 @@ mod tests { use frontend::instance::Instance; use query::parser::QueryLanguageParser; use query::plan::LogicalPlan; + use query::query_engine::DefaultSerializer; use servers::interceptor::{SqlQueryInterceptor, SqlQueryInterceptorRef}; use servers::query_handler::sql::SqlQueryHandler; use session::context::{QueryContext, QueryContextRef}; @@ -238,7 +239,9 @@ mod tests { 
.plan(stmt, QueryContext::arc()) .await .unwrap(); - let plan = DFLogicalSubstraitConvertor.encode(&plan).unwrap(); + let plan = DFLogicalSubstraitConvertor + .encode(&plan, DefaultSerializer) + .unwrap(); for (region, dn) in region_to_dn_map.iter() { let region_server = instance.datanodes().get(dn).unwrap().region_server(); diff --git a/tests-integration/src/standalone.rs b/tests-integration/src/standalone.rs index 5cbc46c69305..35a14e261260 100644 --- a/tests-integration/src/standalone.rs +++ b/tests-integration/src/standalone.rs @@ -146,10 +146,8 @@ impl GreptimeDbStandaloneBuilder { Mode::Standalone, None, kv_backend.clone(), - cache_registry.get().unwrap(), - cache_registry.get().unwrap(), - ) - .await; + cache_registry.clone(), + ); let flow_builder = FlownodeBuilder::new( 1, // for standalone mode this value is default to one diff --git a/tests/cases/standalone/common/information_schema/tables.result b/tests/cases/standalone/common/information_schema/tables.result new file mode 100644 index 000000000000..d5fd7021e817 --- /dev/null +++ b/tests/cases/standalone/common/information_schema/tables.result @@ -0,0 +1,46 @@ +create schema abc; + +Affected Rows: 1 + +use abc; + +Affected Rows: 0 + +create table t (ts timestamp time index); + +Affected Rows: 0 + +create schema abcde; + +Affected Rows: 1 + +use abcde; + +Affected Rows: 0 + +create table t (ts timestamp time index); + +Affected Rows: 0 + +select table_catalog, table_schema, table_name from information_schema.tables where table_schema != 'information_schema'; + ++---------------+--------------+------------+ +| table_catalog | table_schema | table_name | ++---------------+--------------+------------+ +| greptime | abc | t | +| greptime | abcde | t | +| greptime | public | numbers | ++---------------+--------------+------------+ + +use public; + +Affected Rows: 0 + +drop schema abc; + +Affected Rows: 0 + +drop schema abcde; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/information_schema/tables.sql b/tests/cases/standalone/common/information_schema/tables.sql new file mode 100644 index 000000000000..4a03a4a3b63a --- /dev/null +++ b/tests/cases/standalone/common/information_schema/tables.sql @@ -0,0 +1,19 @@ +create schema abc; + +use abc; + +create table t (ts timestamp time index); + +create schema abcde; + +use abcde; + +create table t (ts timestamp time index); + +select table_catalog, table_schema, table_name from information_schema.tables where table_schema != 'information_schema'; + +use public; + +drop schema abc; + +drop schema abcde; diff --git a/tests/cases/standalone/common/view/create.result b/tests/cases/standalone/common/view/create.result index dbcd435a7424..855eb08bb7ed 100644 --- a/tests/cases/standalone/common/view/create.result +++ b/tests/cases/standalone/common/view/create.result @@ -1,9 +1,9 @@ --- test CREATE VIEW --- -CREATE DATABASE for_test_view; +CREATE DATABASE schema_for_view_test; Affected Rows: 1 -USE for_test_view; +USE schema_for_view_test; Affected Rows: 0 @@ -22,17 +22,17 @@ Error: 2000(InvalidSyntax), sql parser error: Expected SELECT, VALUES, or a subq --- Table already exists --- CREATE VIEW test_table as SELECT * FROM public.numbers; -Error: 4000(TableAlreadyExists), Table already exists: `greptime.for_test_view.test_table` +Error: 4000(TableAlreadyExists), Table already exists: `greptime.schema_for_view_test.test_table` --- Table already exists even when create_if_not_exists --- CREATE VIEW IF NOT EXISTS test_table as SELECT * FROM public.numbers; -Error: 
4000(TableAlreadyExists), Table already exists: `greptime.for_test_view.test_table`
+Error: 4000(TableAlreadyExists), Table already exists: `greptime.schema_for_view_test.test_table`

 --- Table already exists even when or_replace ---
 CREATE OR REPLACE VIEW test_table as SELECT * FROM public.numbers;

-Error: 4000(TableAlreadyExists), Table already exists: `greptime.for_test_view.test_table`
+Error: 4000(TableAlreadyExists), Table already exists: `greptime.schema_for_view_test.test_table`

 CREATE VIEW test_view as SELECT * FROM public.numbers;

@@ -41,7 +41,7 @@ Affected Rows: 0
 --- View already exists ----
 CREATE VIEW test_view as SELECT * FROM public.numbers;

-Error: 4000(TableAlreadyExists), View already exists: `greptime.for_test_view.test_view`
+Error: 4000(TableAlreadyExists), View already exists: `greptime.schema_for_view_test.test_view`

 CREATE VIEW IF NOT EXISTS test_view as SELECT * FROM public.numbers;

@@ -72,51 +72,51 @@ SHOW FULL TABLES;
 -- SQLNESS REPLACE (\s\d+\s) ID
 SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE;

-+---------------+--------------------+---------------------------------------+-----------------+----------+-------------+
-| table_catalog | table_schema       | table_name                            | table_type      | table_id | engine      |
-+---------------+--------------------+---------------------------------------+-----------------+----------+-------------+
-| greptime      | information_schema | build_info                            | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | character_sets                        | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | check_constraints                     | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | cluster_info                          | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | collation_character_set_applicability | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | collations                            | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | column_privileges                     | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | column_statistics                     | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | columns                               | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | engines                               | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | events                                | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | files                                 | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | global_status                         | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | key_column_usage                      | LOCAL TEMPORARY |ID        |             |
-| greptime      | public             | numbers                               | LOCAL TEMPORARY |ID        | test_engine |
-| greptime      | information_schema | optimizer_trace                       | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | parameters                            | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | partitions                            | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | profiling                             | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | referential_constraints               | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | region_peers                          | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | routines                              | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | runtime_metrics                       | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | schema_privileges                     | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | schemata                              | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | session_status                        | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | table_constraints                     | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | table_privileges                      | LOCAL TEMPORARY |ID        |             |
-| greptime      | information_schema | tables                                | LOCAL TEMPORARY |ID        |             |
-| greptime      | for_test_view      | test_table                            | BASE TABLE      |ID        | mito        |
-| greptime      | for_test_view      | test_view                             | VIEW            |ID        |             |
-| greptime      | information_schema | triggers                              | LOCAL TEMPORARY |ID        |             |
-+---------------+--------------------+---------------------------------------+-----------------+----------+-------------+
++---------------+----------------------+---------------------------------------+-----------------+----------+-------------+
+| table_catalog | table_schema         | table_name                            | table_type      | table_id | engine      |
++---------------+----------------------+---------------------------------------+-----------------+----------+-------------+
+| greptime      | information_schema   | build_info                            | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | character_sets                        | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | check_constraints                     | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | cluster_info                          | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | collation_character_set_applicability | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | collations                            | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | column_privileges                     | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | column_statistics                     | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | columns                               | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | engines                               | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | events                                | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | files                                 | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | global_status                         | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | key_column_usage                      | LOCAL TEMPORARY |ID        |             |
+| greptime      | public               | numbers                               | LOCAL TEMPORARY |ID        | test_engine |
+| greptime      | information_schema   | optimizer_trace                       | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | parameters                            | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | partitions                            | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | profiling                             | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | referential_constraints               | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | region_peers                          | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | routines                              | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | runtime_metrics                       | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | schema_privileges                     | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | schemata                              | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | session_status                        | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | table_constraints                     | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | table_privileges                      | LOCAL TEMPORARY |ID        |             |
+| greptime      | information_schema   | tables                                | LOCAL TEMPORARY |ID        |             |
+| greptime      | schema_for_view_test | test_table                            | BASE TABLE      |ID        | mito        |
+| greptime      | schema_for_view_test | test_view                             | VIEW            |ID        |             |
+| greptime      | information_schema   | triggers                              | LOCAL TEMPORARY |ID        |             |
++---------------+----------------------+---------------------------------------+-----------------+----------+-------------+

 -- SQLNESS REPLACE (\s\d+\s) ID
 SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'VIEW';

-+---------------+---------------+------------+------------+----------+--------+
-| table_catalog | table_schema  | table_name | table_type | table_id | engine |
-+---------------+---------------+------------+------------+----------+--------+
-| greptime      | for_test_view | test_view  | VIEW       |ID        |        |
-+---------------+---------------+------------+------------+----------+--------+
++---------------+----------------------+------------+------------+----------+--------+
+| table_catalog | table_schema         | table_name | table_type | table_id | engine |
++---------------+----------------------+------------+------------+----------+--------+
+| greptime      | schema_for_view_test | test_view  | VIEW       |ID        |        |
++---------------+----------------------+------------+------------+----------+--------+

 SHOW COLUMNS FROM test_view;

@@ -133,16 +133,28 @@ SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'test_view';
 ++
 ++

---- FIXED in the following PR ---
-SELECT * FROM test_view;
-
-Error: 3001(EngineExecuteQuery), DataFusion error: Unsupported operation: get stream from a distributed table
+SELECT * FROM test_view LIMIT 10;
+
++--------+
+| number |
++--------+
+| 0      |
+| 1      |
+| 2      |
+| 3      |
+| 4      |
+| 5      |
+| 6      |
+| 7      |
+| 8      |
+| 9      |
++--------+

 USE public;

 Affected Rows: 0

-DROP DATABASE for_test_view;
+DROP DATABASE schema_for_view_test;

 Affected Rows: 0

diff --git a/tests/cases/standalone/common/view/create.sql b/tests/cases/standalone/common/view/create.sql
index a01741f9166f..a778180939a8 100644
--- a/tests/cases/standalone/common/view/create.sql
+++ b/tests/cases/standalone/common/view/create.sql
@@ -1,8 +1,8 @@
 --- test CREATE VIEW ---

-CREATE DATABASE for_test_view;
+CREATE DATABASE schema_for_view_test;

-USE for_test_view;
+USE schema_for_view_test;

 CREATE TABLE test_table(a STRING, ts TIMESTAMP TIME INDEX);

@@ -44,9 +44,8 @@ SHOW FULL COLUMNS FROM test_view;

 SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME = 'test_view';

---- FIXED in the following PR ---
-SELECT * FROM test_view;
+SELECT * FROM test_view LIMIT 10;

 USE public;

-DROP DATABASE for_test_view;
+DROP DATABASE schema_for_view_test;
diff --git a/tests/cases/standalone/common/view/view.result b/tests/cases/standalone/common/view/view.result
new file mode 100644
index 000000000000..132ec48033b1
--- /dev/null
+++ b/tests/cases/standalone/common/view/view.result
@@ -0,0 +1,62 @@
+-- From: https://github.com/duckdb/duckdb/blob/main/test/sql/catalog/view/test_view.test --
+CREATE DATABASE schema_for_view_test;
+
+Affected Rows: 1
+
+USE schema_for_view_test;
+
+Affected Rows: 0
+
+CREATE TABLE t1(i TIMESTAMP TIME INDEX);
+
+Affected Rows: 0
+
+INSERT INTO t1 VALUES (41), (42), (43);
+
+Affected Rows: 3
+
+CREATE VIEW v1 AS SELECT
+ i AS j
+FROM t1 WHERE i < 43;
+
+Affected Rows: 0
+
+SELECT * FROM v1;
+
++-------------------------+
+| i                       |
++-------------------------+
+| 1970-01-01T00:00:00.041 |
+| 1970-01-01T00:00:00.042 |
++-------------------------+
+
+-- CREATE VIEW v1 AS SELECT 'whatever'; --
+SELECT j FROM v1 WHERE j > 41;
+
+Error: 3000(PlanQuery), Failed to plan SQL: No field named j. Valid fields are v1.i.
+
+-- FIXME(dennis):: name alias in view, not supported yet --
+--SELECT x FROM v1 t1(x) WHERE x > 41 --
+-- FIXME(dennis): DROP VIEW not supported yet--
+-- DROP VIEW v1 --
+-- SELECT j FROM v1 WHERE j > 41 --
+-- CREATE VIEW v1 AS SELECT 'whatever'; --
+-- SELECT * FROM v1; --
+-- CREATE OR REPLACE VIEW v1 AS SELECT 42; --
+-- SELECT * FROM v1; --
+INSERT INTO v1 VALUES (1);
+
+Error: 1004(InvalidArguments), Invalid SQL, error: column count mismatch, columns: 0, values: 1
+
+CREATE VIEW v1 AS SELECT * FROM dontexist;
+
+Error: 3000(PlanQuery), Failed to plan SQL: Error during planning: Table not found: greptime.schema_for_view_test.dontexist
+
+USE public;
+
+Affected Rows: 0
+
+DROP DATABASE schema_for_view_test;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/view/view.sql b/tests/cases/standalone/common/view/view.sql
new file mode 100644
index 000000000000..3ca4cd7a7426
--- /dev/null
+++ b/tests/cases/standalone/common/view/view.sql
@@ -0,0 +1,45 @@
+-- From: https://github.com/duckdb/duckdb/blob/main/test/sql/catalog/view/test_view.test --
+
+CREATE DATABASE schema_for_view_test;
+
+USE schema_for_view_test;
+
+CREATE TABLE t1(i TIMESTAMP TIME INDEX);
+
+INSERT INTO t1 VALUES (41), (42), (43);
+
+CREATE VIEW v1 AS SELECT
+ i AS j
+FROM t1 WHERE i < 43;
+
+SELECT * FROM v1;
+
+-- CREATE VIEW v1 AS SELECT 'whatever'; --
+
+SELECT j FROM v1 WHERE j > 41;
+
+
+-- FIXME(dennis):: name alias in view, not supported yet --
+--SELECT x FROM v1 t1(x) WHERE x > 41 --
+
+-- FIXME(dennis): DROP VIEW not supported yet--
+-- DROP VIEW v1 --
+
+-- SELECT j FROM v1 WHERE j > 41 --
+
+-- CREATE VIEW v1 AS SELECT 'whatever'; --
+
+-- SELECT * FROM v1; --
+
+
+-- CREATE OR REPLACE VIEW v1 AS SELECT 42; --
+
+-- SELECT * FROM v1; --
+
+INSERT INTO v1 VALUES (1);
+
+CREATE VIEW v1 AS SELECT * FROM dontexist;
+
+USE public;
+
+DROP DATABASE schema_for_view_test;