diff --git a/Cargo.lock b/Cargo.lock index 69040d3995f9..1c315ade8577 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -641,12 +641,6 @@ version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba" -[[package]] -name = "atomic_float" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62af46d040ba9df09edc6528dae9d8e49f5f3e82f55b7d2ec31a733c38dbc49d" - [[package]] name = "atty" version = "0.2.14" @@ -1205,7 +1199,6 @@ dependencies = [ "serde_json", "session", "snafu", - "storage", "store-api", "table", "tokio", @@ -1628,11 +1621,13 @@ dependencies = [ "common-runtime", "common-test-util", "datafusion", + "datatypes", "derive_builder 0.12.0", "futures", "lazy_static", "object-store", "orc-rust", + "parquet", "paste", "regex", "serde", @@ -1722,7 +1717,7 @@ dependencies = [ "common-runtime", "common-telemetry", "common-time", - "criterion 0.4.0", + "criterion", "dashmap", "datafusion", "datatypes", @@ -2142,32 +2137,6 @@ dependencies = [ "cfg-if 1.0.0", ] -[[package]] -name = "criterion" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b01d6de93b2b6c65e17c634a26653a29d107b3c98c607c765bf38d041531cd8f" -dependencies = [ - "atty", - "cast", - "clap 2.34.0", - "criterion-plot 0.4.5", - "csv", - "itertools 0.10.5", - "lazy_static", - "num-traits", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_cbor", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", -] - [[package]] name = "criterion" version = "0.4.0" @@ -2179,7 +2148,7 @@ dependencies = [ "cast", "ciborium", "clap 3.2.25", - "criterion-plot 0.5.0", + "criterion-plot", "futures", "itertools 0.10.5", "lazy_static", @@ -2196,16 +2165,6 @@ dependencies = [ "walkdir", ] -[[package]] -name = "criterion-plot" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2673cc8207403546f45f5fd319a974b1e6983ad1a3ee7e6041650013be041876" -dependencies = [ - "cast", - "itertools 0.10.5", -] - [[package]] name = "criterion-plot" version = "0.5.0" @@ -2681,7 +2640,6 @@ dependencies = [ "session", "snafu", "sql", - "storage", "store-api", "substrait 0.4.3", "table", @@ -3313,7 +3271,6 @@ dependencies = [ "snafu", "sql", "sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0fbae07d0c46dc18e3381c406d8b9b8abef6b1fd)", - "storage", "store-api", "strfmt", "substrait 0.4.3", @@ -5569,7 +5526,6 @@ dependencies = [ "snafu", "sql", "sqlparser 0.38.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=0fbae07d0c46dc18e3381c406d8b9b8abef6b1fd)", - "storage", "store-api", "substrait 0.4.3", "table", @@ -7966,7 +7922,7 @@ dependencies = [ "common-test-util", "common-time", "console", - "criterion 0.4.0", + "criterion", "crossbeam-utils", "datafusion", "datafusion-common", @@ -7998,7 +7954,6 @@ dependencies = [ "session", "snafu", "sql", - "storage", "store-api", "table", "tokio", @@ -8078,16 +8033,6 @@ dependencies = [ "serde_derive", ] -[[package]] -name = "serde_cbor" -version = "0.11.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" -dependencies = [ - "half 1.8.2", - "serde", -] - [[package]] name = "serde_derive" version = "1.0.190" @@ -8829,60 +8774,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "storage" -version = "0.4.3" -dependencies = [ - "api", - "arc-swap", - "arrow", - "arrow-array", - "async-compat", - "async-stream", - "async-trait", - "atomic_float", - "bytes", - "common-base", - "common-config", - "common-datasource", - "common-error", - "common-macro", - "common-query", - "common-recordbatch", - "common-runtime", - "common-telemetry", - "common-test-util", - "common-time", - "criterion 0.3.6", - "datafusion", - "datafusion-common", - "datafusion-expr", - "datafusion-physical-expr", - "datatypes", - "futures", - "futures-util", - "itertools 0.10.5", - "lazy_static", - "log-store", - "object-store", - "parquet", - "paste", - "prometheus", - "prost 0.12.1", - "rand", - "regex", - "serde", - "serde_json", - "snafu", - "store-api", - "table", - "tokio", - "tokio-util", - "tonic 0.10.2", - "tonic-build 0.9.2", - "uuid", -] - [[package]] name = "store-api" version = "0.4.3" diff --git a/Cargo.toml b/Cargo.toml index dbd5d578692a..ba46247cf922 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,7 +49,6 @@ members = [ "src/servers", "src/session", "src/sql", - "src/storage", "src/store-api", "src/table", "tests-integration", @@ -176,7 +175,6 @@ script = { path = "src/script" } servers = { path = "src/servers" } session = { path = "src/session" } sql = { path = "src/sql" } -storage = { path = "src/storage" } store-api = { path = "src/store-api" } substrait = { path = "src/common/substrait" } table = { path = "src/table" } diff --git a/config/datanode.example.toml b/config/datanode.example.toml index 3d4d3c81e845..256970863184 100644 --- a/config/datanode.example.toml +++ b/config/datanode.example.toml @@ -53,33 +53,6 @@ type = "File" # The local file cache capacity in bytes. # cache_capacity = "256MB" -# Compaction options, see `standalone.example.toml`. -[storage.compaction] -max_inflight_tasks = 4 -max_files_in_level0 = 8 -max_purge_tasks = 32 - -# Storage manifest options -[storage.manifest] -# Region checkpoint actions margin. -# Create a checkpoint every actions. -checkpoint_margin = 10 -# Region manifest logs and checkpoints gc execution duration -gc_duration = '10m' - -# Storage flush options -[storage.flush] -# Max inflight flush tasks. -max_flush_tasks = 8 -# Default write buffer size for a region. -region_write_buffer_size = "32MB" -# Interval to check whether a region needs flush. -picker_schedule_interval = "5m" -# Interval to auto flush a region if it has not flushed yet. -auto_flush_interval = "1h" -# Global write buffer size for all regions. -global_write_buffer_size = "1GB" - # Mito engine options [[region_engine]] [region_engine.mito] diff --git a/config/standalone.example.toml b/config/standalone.example.toml index 254b89d02f29..629c9bdb9a73 100644 --- a/config/standalone.example.toml +++ b/config/standalone.example.toml @@ -122,36 +122,6 @@ type = "File" # The local file cache capacity in bytes. # cache_capacity = "256MB" -# Compaction options. -[storage.compaction] -# Max task number that can concurrently run. -max_inflight_tasks = 4 -# Max files in level 0 to trigger compaction. -max_files_in_level0 = 8 -# Max task number for SST purge task after compaction. -max_purge_tasks = 32 - -# Storage manifest options -[storage.manifest] -# Region checkpoint actions margin. -# Create a checkpoint every actions. -checkpoint_margin = 10 -# Region manifest logs and checkpoints gc execution duration -gc_duration = '10m' - -# Storage flush options -[storage.flush] -# Max inflight flush tasks. -max_flush_tasks = 8 -# Default write buffer size for a region. -region_write_buffer_size = "32MB" -# Interval to check whether a region needs flush. -picker_schedule_interval = "5m" -# Interval to auto flush a region if it has not flushed yet. -auto_flush_interval = "1h" -# Global write buffer size for all regions. -global_write_buffer_size = "1GB" - # Mito engine options [[region_engine]] [region_engine.mito] diff --git a/src/catalog/Cargo.toml b/src/catalog/Cargo.toml index f395be039353..fb41bf15d94d 100644 --- a/src/catalog/Cargo.toml +++ b/src/catalog/Cargo.toml @@ -49,5 +49,4 @@ chrono.workspace = true common-test-util.workspace = true log-store.workspace = true object-store.workspace = true -storage.workspace = true tokio.workspace = true diff --git a/src/cmd/src/datanode.rs b/src/cmd/src/datanode.rs index b58f25260660..9babe11aaf37 100644 --- a/src/cmd/src/datanode.rs +++ b/src/cmd/src/datanode.rs @@ -192,7 +192,7 @@ mod tests { use std::time::Duration; use common_test_util::temp_dir::create_named_temp_file; - use datanode::config::{CompactionConfig, FileConfig, ObjectStoreConfig, RegionManifestConfig}; + use datanode::config::{FileConfig, ObjectStoreConfig}; use servers::heartbeat_options::HeartbeatOptions; use servers::Mode; @@ -232,16 +232,6 @@ mod tests { type = "File" data_home = "/tmp/greptimedb/" - [storage.compaction] - max_inflight_tasks = 3 - max_files_in_level0 = 7 - max_purge_tasks = 32 - - [storage.manifest] - checkpoint_margin = 9 - gc_duration = '7s' - compress = true - [logging] level = "debug" dir = "/tmp/greptimedb/test/logs" @@ -294,23 +284,6 @@ mod tests { ObjectStoreConfig::File(FileConfig { .. }) )); - assert_eq!( - CompactionConfig { - max_inflight_tasks: 3, - max_files_in_level0: 7, - max_purge_tasks: 32, - }, - options.storage.compaction, - ); - assert_eq!( - RegionManifestConfig { - checkpoint_margin: Some(9), - gc_duration: Some(Duration::from_secs(7)), - compress: true - }, - options.storage.manifest, - ); - assert_eq!("debug", options.logging.level.unwrap()); assert_eq!("/tmp/greptimedb/test/logs".to_string(), options.logging.dir); } @@ -387,18 +360,12 @@ mod tests { file_size = "1GB" purge_threshold = "50GB" purge_interval = "10m" - read_batch_size = 128 sync_write = false [storage] type = "File" data_home = "/tmp/greptimedb/" - [storage.compaction] - max_inflight_tasks = 3 - max_files_in_level0 = 7 - max_purge_tasks = 32 - [logging] level = "debug" dir = "/tmp/greptimedb/test/logs" @@ -409,26 +376,24 @@ mod tests { temp_env::with_vars( [ ( - // storage.manifest.gc_duration = 9s + // wal.purge_interval = 1m [ env_prefix.to_string(), - "storage".to_uppercase(), - "manifest".to_uppercase(), - "gc_duration".to_uppercase(), + "wal".to_uppercase(), + "purge_interval".to_uppercase(), ] .join(ENV_VAR_SEP), - Some("9s"), + Some("1m"), ), ( - // storage.compaction.max_purge_tasks = 99 + // wal.read_batch_size = 100 [ env_prefix.to_string(), - "storage".to_uppercase(), - "compaction".to_uppercase(), - "max_purge_tasks".to_uppercase(), + "wal".to_uppercase(), + "read_batch_size".to_uppercase(), ] .join(ENV_VAR_SEP), - Some("99"), + Some("100"), ), ( // meta_client.metasrv_addrs = 127.0.0.1:3001,127.0.0.1:3002,127.0.0.1:3003 @@ -456,10 +421,7 @@ mod tests { }; // Should be read from env, env > default values. - assert_eq!( - opts.storage.manifest.gc_duration, - Some(Duration::from_secs(9)) - ); + assert_eq!(opts.wal.read_batch_size, 100,); assert_eq!( opts.meta_client.unwrap().metasrv_addrs, vec![ @@ -470,19 +432,13 @@ mod tests { ); // Should be read from config file, config file > env > default values. - assert_eq!(opts.storage.compaction.max_purge_tasks, 32); + assert_eq!(opts.wal.purge_interval, Duration::from_secs(60 * 10)); // Should be read from cli, cli > config file > env > default values. assert_eq!(opts.wal.dir.unwrap(), "/other/wal/dir"); // Should be default value. - assert_eq!( - opts.storage.manifest.checkpoint_margin, - DatanodeOptions::default() - .storage - .manifest - .checkpoint_margin - ); + assert_eq!(opts.http.addr, DatanodeOptions::default().http.addr); }, ); } diff --git a/src/cmd/src/options.rs b/src/cmd/src/options.rs index ace707a7c57b..9127f4502698 100644 --- a/src/cmd/src/options.rs +++ b/src/cmd/src/options.rs @@ -147,7 +147,6 @@ impl Options { #[cfg(test)] mod tests { use std::io::Write; - use std::time::Duration; use common_test_util::temp_dir::create_named_temp_file; use datanode::config::{DatanodeOptions, ObjectStoreConfig}; @@ -179,11 +178,6 @@ mod tests { read_batch_size = 128 sync_write = false - [storage.compaction] - max_inflight_tasks = 3 - max_files_in_level0 = 7 - max_purge_tasks = 32 - [logging] level = "debug" dir = "/tmp/greptimedb/test/logs" @@ -194,17 +188,6 @@ mod tests { temp_env::with_vars( // The following environment variables will be used to override the values in the config file. [ - ( - // storage.manifest.checkpoint_margin = 99 - [ - env_prefix.to_string(), - "storage".to_uppercase(), - "manifest".to_uppercase(), - "checkpoint_margin".to_uppercase(), - ] - .join(ENV_VAR_SEP), - Some("99"), - ), ( // storage.type = S3 [ @@ -225,17 +208,6 @@ mod tests { .join(ENV_VAR_SEP), Some("mybucket"), ), - ( - // storage.manifest.gc_duration = 42s - [ - env_prefix.to_string(), - "storage".to_uppercase(), - "manifest".to_uppercase(), - "gc_duration".to_uppercase(), - ] - .join(ENV_VAR_SEP), - Some("42s"), - ), ( // wal.dir = /other/wal/dir [ @@ -266,17 +238,12 @@ mod tests { .unwrap(); // Check the configs from environment variables. - assert_eq!(opts.storage.manifest.checkpoint_margin, Some(99)); match opts.storage.store { ObjectStoreConfig::S3(s3_config) => { assert_eq!(s3_config.bucket, "mybucket".to_string()); } _ => panic!("unexpected store type"), } - assert_eq!( - opts.storage.manifest.gc_duration, - Some(Duration::from_secs(42)) - ); assert_eq!( opts.meta_client.unwrap().metasrv_addrs, vec![ diff --git a/src/common/datasource/Cargo.toml b/src/common/datasource/Cargo.toml index 0f24dc3107c4..3de0d2fbaeb4 100644 --- a/src/common/datasource/Cargo.toml +++ b/src/common/datasource/Cargo.toml @@ -21,11 +21,13 @@ common-error.workspace = true common-macro.workspace = true common-runtime.workspace = true datafusion.workspace = true +datatypes.workspace = true derive_builder.workspace = true futures.workspace = true lazy_static.workspace = true object-store.workspace = true orc-rust = "0.2" +parquet.workspace = true paste = "1.0" regex = "1.7" serde.workspace = true diff --git a/src/common/datasource/src/error.rs b/src/common/datasource/src/error.rs index f8ab4a30c634..77618345a3eb 100644 --- a/src/common/datasource/src/error.rs +++ b/src/common/datasource/src/error.rs @@ -166,6 +166,14 @@ pub enum Error { #[snafu(display("Buffered writer closed"))] BufferedWriterClosed { location: Location }, + + #[snafu(display("Failed to write parquet file, path: {}", path))] + WriteParquet { + path: String, + location: Location, + #[snafu(source)] + error: parquet::errors::ParquetError, + }, } pub type Result = std::result::Result; @@ -178,7 +186,8 @@ impl ErrorExt for Error { | ListObjects { .. } | ReadObject { .. } | WriteObject { .. } - | AsyncWrite { .. } => StatusCode::StorageUnavailable, + | AsyncWrite { .. } + | WriteParquet { .. } => StatusCode::StorageUnavailable, UnsupportedBackendProtocol { .. } | UnsupportedCompressionType { .. } @@ -231,6 +240,7 @@ impl ErrorExt for Error { InvalidConnection { location, .. } => Some(*location), UnsupportedCompressionType { location, .. } => Some(*location), UnsupportedFormat { location, .. } => Some(*location), + WriteParquet { location, .. } => Some(*location), } } } diff --git a/src/common/datasource/src/file_format/parquet.rs b/src/common/datasource/src/file_format/parquet.rs index 5e76255cafe7..220e8b5d68b1 100644 --- a/src/common/datasource/src/file_format/parquet.rs +++ b/src/common/datasource/src/file_format/parquet.rs @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::future::Future; +use std::pin::Pin; use std::result; use std::sync::Arc; use arrow::record_batch::RecordBatch; -use arrow_schema::Schema; +use arrow_schema::{Schema, SchemaRef}; use async_trait::async_trait; use datafusion::datasource::physical_plan::{FileMeta, ParquetFileReaderFactory}; use datafusion::error::Result as DatafusionResult; @@ -26,11 +28,15 @@ use datafusion::parquet::errors::{ParquetError, Result as ParquetResult}; use datafusion::parquet::file::metadata::ParquetMetaData; use datafusion::parquet::format::FileMetaData; use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; +use datafusion::physical_plan::SendableRecordBatchStream; use futures::future::BoxFuture; +use futures::StreamExt; use object_store::{ObjectStore, Reader}; +use parquet::basic::{Compression, ZstdLevel}; +use parquet::file::properties::WriterProperties; use snafu::ResultExt; -use crate::buffered_writer::{ArrowWriterCloser, DfRecordBatchEncoder}; +use crate::buffered_writer::{ArrowWriterCloser, DfRecordBatchEncoder, LazyBufferedWriter}; use crate::error::{self, Result}; use crate::file_format::FileFormat; use crate::share_buffer::SharedBuffer; @@ -156,6 +162,103 @@ impl ArrowWriterCloser for ArrowWriter { } } +/// Parquet writer that buffers row groups in memory and writes buffered data to an underlying +/// storage by chunks to reduce memory consumption. +pub struct BufferedWriter { + inner: InnerBufferedWriter, +} + +type InnerBufferedWriter = LazyBufferedWriter< + object_store::Writer, + ArrowWriter, + Box< + dyn FnMut( + String, + ) + -> Pin> + Send>> + + Send, + >, +>; + +impl BufferedWriter { + pub async fn try_new( + path: String, + store: ObjectStore, + arrow_schema: SchemaRef, + props: Option, + buffer_threshold: usize, + ) -> error::Result { + let buffer = SharedBuffer::with_capacity(buffer_threshold); + + let arrow_writer = ArrowWriter::try_new(buffer.clone(), arrow_schema.clone(), props) + .context(error::WriteParquetSnafu { path: &path })?; + + Ok(Self { + inner: LazyBufferedWriter::new( + buffer_threshold, + buffer, + arrow_writer, + &path, + Box::new(move |path| { + let store = store.clone(); + Box::pin(async move { + store + .writer(&path) + .await + .context(error::WriteObjectSnafu { path }) + }) + }), + ), + }) + } + + /// Write a record batch to stream writer. + pub async fn write(&mut self, arrow_batch: &RecordBatch) -> error::Result<()> { + self.inner.write(arrow_batch).await?; + self.inner.try_flush(false).await?; + + Ok(()) + } + + /// Close parquet writer. + /// + /// Return file metadata and bytes written. + pub async fn close(self) -> error::Result<(FileMetaData, u64)> { + self.inner.close_with_arrow_writer().await + } +} + +/// Output the stream to a parquet file. +/// +/// Returns number of rows written. +pub async fn stream_to_parquet( + mut stream: SendableRecordBatchStream, + store: ObjectStore, + path: &str, + threshold: usize, +) -> Result { + let write_props = WriterProperties::builder() + .set_compression(Compression::ZSTD(ZstdLevel::default())) + .build(); + let schema = stream.schema(); + let mut buffered_writer = BufferedWriter::try_new( + path.to_string(), + store, + schema, + Some(write_props), + threshold, + ) + .await?; + let mut rows_written = 0; + while let Some(batch) = stream.next().await { + let batch = batch.context(error::ReadRecordBatchSnafu)?; + buffered_writer.write(&batch).await?; + rows_written += batch.num_rows(); + } + buffered_writer.close().await?; + Ok(rows_written) +} + #[cfg(test)] mod tests { use common_test_util::find_workspace_path; diff --git a/src/datanode/Cargo.toml b/src/datanode/Cargo.toml index 3a3e5067f1e0..08773eaf98d5 100644 --- a/src/datanode/Cargo.toml +++ b/src/datanode/Cargo.toml @@ -61,7 +61,6 @@ servers.workspace = true session.workspace = true snafu.workspace = true sql.workspace = true -storage.workspace = true store-api.workspace = true substrait.workspace = true table.workspace = true diff --git a/src/datanode/src/config.rs b/src/datanode/src/config.rs index 739d451d95e8..3c2e9ff88774 100644 --- a/src/datanode/src/config.rs +++ b/src/datanode/src/config.rs @@ -31,11 +31,6 @@ use serde::{Deserialize, Serialize}; use servers::heartbeat_options::HeartbeatOptions; use servers::http::HttpOptions; use servers::Mode; -use storage::config::{ - EngineConfig as StorageEngineConfig, DEFAULT_AUTO_FLUSH_INTERVAL, DEFAULT_MAX_FLUSH_TASKS, - DEFAULT_PICKER_SCHEDULE_INTERVAL, DEFAULT_REGION_WRITE_BUFFER_SIZE, -}; -use storage::scheduler::SchedulerConfig; pub const DEFAULT_OBJECT_STORE_CACHE_SIZE: ReadableSize = ReadableSize::mb(256); @@ -68,9 +63,6 @@ pub struct StorageConfig { pub data_home: String, #[serde(flatten)] pub store: ObjectStoreConfig, - pub compaction: CompactionConfig, - pub manifest: RegionManifestConfig, - pub flush: FlushConfig, } impl Default for StorageConfig { @@ -79,9 +71,6 @@ impl Default for StorageConfig { global_ttl: None, data_home: DEFAULT_DATA_HOME.to_string(), store: ObjectStoreConfig::default(), - compaction: CompactionConfig::default(), - manifest: RegionManifestConfig::default(), - flush: FlushConfig::default(), } } } @@ -216,109 +205,6 @@ impl Default for ObjectStoreConfig { } } -/// Options for region manifest -#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] -#[serde(default)] -pub struct RegionManifestConfig { - /// Region manifest checkpoint actions margin. - /// Manifest service create a checkpoint every `checkpoint_margin` actions. - pub checkpoint_margin: Option, - /// Region manifest logs and checkpoints gc task execution duration. - #[serde(with = "humantime_serde")] - pub gc_duration: Option, - /// Whether to compress manifest and checkpoint file by gzip - pub compress: bool, -} - -impl Default for RegionManifestConfig { - fn default() -> Self { - Self { - checkpoint_margin: Some(10u16), - gc_duration: Some(Duration::from_secs(600)), - compress: false, - } - } -} - -/// Options for table compaction -#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] -#[serde(default)] -pub struct CompactionConfig { - /// Max task number that can concurrently run. - pub max_inflight_tasks: usize, - /// Max files in level 0 to trigger compaction. - pub max_files_in_level0: usize, - /// Max task number for SST purge task after compaction. - pub max_purge_tasks: usize, -} - -impl Default for CompactionConfig { - fn default() -> Self { - Self { - max_inflight_tasks: 4, - max_files_in_level0: 8, - max_purge_tasks: 32, - } - } -} - -#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] -#[serde(default)] -pub struct FlushConfig { - /// Max inflight flush tasks. - pub max_flush_tasks: usize, - /// Default write buffer size for a region. - pub region_write_buffer_size: ReadableSize, - /// Interval to schedule auto flush picker to find region to flush. - #[serde(with = "humantime_serde")] - pub picker_schedule_interval: Duration, - /// Interval to auto flush a region if it has not flushed yet. - #[serde(with = "humantime_serde")] - pub auto_flush_interval: Duration, - /// Global write buffer size for all regions. - pub global_write_buffer_size: Option, -} - -impl Default for FlushConfig { - fn default() -> Self { - Self { - max_flush_tasks: DEFAULT_MAX_FLUSH_TASKS, - region_write_buffer_size: DEFAULT_REGION_WRITE_BUFFER_SIZE, - picker_schedule_interval: Duration::from_millis( - DEFAULT_PICKER_SCHEDULE_INTERVAL.into(), - ), - auto_flush_interval: Duration::from_millis(DEFAULT_AUTO_FLUSH_INTERVAL.into()), - global_write_buffer_size: None, - } - } -} - -impl From<&DatanodeOptions> for SchedulerConfig { - fn from(value: &DatanodeOptions) -> Self { - Self { - max_inflight_tasks: value.storage.compaction.max_inflight_tasks, - } - } -} - -impl From<&DatanodeOptions> for StorageEngineConfig { - fn from(value: &DatanodeOptions) -> Self { - Self { - compress_manifest: value.storage.manifest.compress, - manifest_checkpoint_margin: value.storage.manifest.checkpoint_margin, - manifest_gc_duration: value.storage.manifest.gc_duration, - max_files_in_l0: value.storage.compaction.max_files_in_level0, - max_purge_tasks: value.storage.compaction.max_purge_tasks, - max_flush_tasks: value.storage.flush.max_flush_tasks, - region_write_buffer_size: value.storage.flush.region_write_buffer_size, - picker_schedule_interval: value.storage.flush.picker_schedule_interval, - auto_flush_interval: value.storage.flush.auto_flush_interval, - global_write_buffer_size: value.storage.flush.global_write_buffer_size, - global_ttl: value.storage.global_ttl, - } - } -} - #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(default)] pub struct DatanodeOptions { diff --git a/src/frontend/Cargo.toml b/src/frontend/Cargo.toml index 91b62cf0f23d..a95b557961ce 100644 --- a/src/frontend/Cargo.toml +++ b/src/frontend/Cargo.toml @@ -68,7 +68,6 @@ session.workspace = true snafu.workspace = true sql.workspace = true sqlparser.workspace = true -storage.workspace = true store-api.workspace = true substrait.workspace = true table.workspace = true diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index 8ce6e8719efe..e9aba0bf6f8f 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -121,14 +121,6 @@ pub enum Error { source: common_datasource::error::Error, }, - #[snafu(display("Failed to write parquet file, path: {}", path))] - WriteParquet { - path: String, - location: Location, - #[snafu(source)] - error: parquet::errors::ParquetError, - }, - #[snafu(display("Failed to read parquet file, path: {}", path))] ReadParquet { path: String, @@ -428,7 +420,6 @@ impl ErrorExt for Error { match self { OpenDal { .. } - | WriteParquet { .. } | ReadParquet { .. } | WriteWal { .. } | ReadWal { .. } diff --git a/src/mito2/src/sst.rs b/src/mito2/src/sst.rs index b12fa962da9d..32c7b4951a55 100644 --- a/src/mito2/src/sst.rs +++ b/src/mito2/src/sst.rs @@ -17,5 +17,4 @@ pub mod file; pub mod file_purger; pub mod parquet; -mod stream_writer; pub(crate) mod version; diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs index 17d50dc3a3f1..d776b3ac627d 100644 --- a/src/mito2/src/sst/parquet/writer.rs +++ b/src/mito2/src/sst/parquet/writer.rs @@ -14,6 +14,7 @@ //! Parquet writer. +use common_datasource::file_format::parquet::BufferedWriter; use common_telemetry::debug; use common_time::Timestamp; use object_store::ObjectStore; @@ -25,11 +26,10 @@ use snafu::ResultExt; use store_api::metadata::RegionMetadataRef; use store_api::storage::consts::SEQUENCE_COLUMN_NAME; -use crate::error::{InvalidMetadataSnafu, Result}; +use crate::error::{InvalidMetadataSnafu, Result, WriteBufferSnafu}; use crate::read::{Batch, Source}; use crate::sst::parquet::format::WriteFormat; use crate::sst::parquet::{SstInfo, WriteOptions, PARQUET_METADATA_KEY}; -use crate::sst::stream_writer::BufferedWriter; /// Parquet SST writer. pub struct ParquetWriter { @@ -83,14 +83,18 @@ impl ParquetWriter { Some(writer_props), opts.write_buffer_size.as_bytes() as usize, ) - .await?; + .await + .context(WriteBufferSnafu)?; let mut stats = SourceStats::default(); while let Some(batch) = self.source.next_batch().await? { stats.update(&batch); let arrow_batch = write_format.convert_batch(&batch)?; - buffered_writer.write(&arrow_batch).await?; + buffered_writer + .write(&arrow_batch) + .await + .context(WriteBufferSnafu)?; } if stats.num_rows == 0 { @@ -99,11 +103,11 @@ impl ParquetWriter { self.file_path ); - buffered_writer.close().await?; + buffered_writer.close().await.context(WriteBufferSnafu)?; return Ok(None); } - let (_file_meta, file_size) = buffered_writer.close().await?; + let (_file_meta, file_size) = buffered_writer.close().await.context(WriteBufferSnafu)?; // Safety: num rows > 0 so we must have min/max. let time_range = stats.time_range.unwrap(); diff --git a/src/mito2/src/sst/stream_writer.rs b/src/mito2/src/sst/stream_writer.rs deleted file mode 100644 index 28d044298ca3..000000000000 --- a/src/mito2/src/sst/stream_writer.rs +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::future::Future; -use std::pin::Pin; - -use common_datasource::buffered_writer::LazyBufferedWriter; -use common_datasource::share_buffer::SharedBuffer; -use datatypes::arrow::datatypes::SchemaRef; -use datatypes::arrow::record_batch::RecordBatch; -use object_store::ObjectStore; -use parquet::arrow::ArrowWriter; -use parquet::file::properties::WriterProperties; -use parquet::format::FileMetaData; -use snafu::ResultExt; - -use crate::error; -use crate::error::WriteParquetSnafu; - -/// Parquet writer that buffers row groups in memory and writes buffered data to an underlying -/// storage by chunks to reduce memory consumption. -pub struct BufferedWriter { - inner: InnerBufferedWriter, -} - -type InnerBufferedWriter = LazyBufferedWriter< - object_store::Writer, - ArrowWriter, - Box< - dyn FnMut( - String, - ) -> Pin< - Box< - dyn Future> - + Send, - >, - > + Send, - >, ->; - -impl BufferedWriter { - pub async fn try_new( - path: String, - store: ObjectStore, - arrow_schema: SchemaRef, - props: Option, - buffer_threshold: usize, - ) -> error::Result { - let buffer = SharedBuffer::with_capacity(buffer_threshold); - - let arrow_writer = ArrowWriter::try_new(buffer.clone(), arrow_schema.clone(), props) - .context(WriteParquetSnafu { path: &path })?; - - Ok(Self { - inner: LazyBufferedWriter::new( - buffer_threshold, - buffer, - arrow_writer, - &path, - Box::new(move |path| { - let store = store.clone(); - Box::pin(async move { - store - .writer(&path) - .await - .context(common_datasource::error::WriteObjectSnafu { path }) - }) - }), - ), - }) - } - - /// Write a record batch to stream writer. - pub async fn write(&mut self, arrow_batch: &RecordBatch) -> error::Result<()> { - self.inner - .write(arrow_batch) - .await - .context(error::WriteBufferSnafu)?; - self.inner - .try_flush(false) - .await - .context(error::WriteBufferSnafu)?; - - Ok(()) - } - - /// Close parquet writer. - pub async fn close(self) -> error::Result<(FileMetaData, u64)> { - self.inner - .close_with_arrow_writer() - .await - .context(error::WriteBufferSnafu) - } -} diff --git a/src/operator/Cargo.toml b/src/operator/Cargo.toml index 9182861ac9f2..bd49fd9a3ece 100644 --- a/src/operator/Cargo.toml +++ b/src/operator/Cargo.toml @@ -50,7 +50,6 @@ session.workspace = true snafu.workspace = true sql.workspace = true sqlparser.workspace = true -storage.workspace = true store-api.workspace = true substrait.workspace = true table.workspace = true diff --git a/src/operator/src/error.rs b/src/operator/src/error.rs index 07c730887bf9..e96f1aaa21fe 100644 --- a/src/operator/src/error.rs +++ b/src/operator/src/error.rs @@ -378,12 +378,6 @@ pub enum Error { error: datafusion::error::DataFusionError, }, - #[snafu(display("Failed to write parquet file"))] - WriteParquet { - location: Location, - source: storage::error::Error, - }, - #[snafu(display( "Schema datatypes not match at index {}, expected table schema: {}, actual file schema: {}", index, @@ -594,7 +588,6 @@ impl ErrorExt for Error { | Error::ParseUrl { source, .. } | Error::BuildBackend { source, .. } => source.status_code(), - Error::WriteParquet { source, .. } => source.status_code(), Error::ExecuteDdl { source, .. } => source.status_code(), Error::InvalidCopyParameter { .. } => StatusCode::InvalidArguments, diff --git a/src/operator/src/statement/copy_table_to.rs b/src/operator/src/statement/copy_table_to.rs index f70dada86c20..78c192ad01af 100644 --- a/src/operator/src/statement/copy_table_to.rs +++ b/src/operator/src/statement/copy_table_to.rs @@ -17,6 +17,7 @@ use std::sync::Arc; use common_base::readable_size::ReadableSize; use common_datasource::file_format::csv::stream_to_csv; use common_datasource::file_format::json::stream_to_json; +use common_datasource::file_format::parquet::stream_to_parquet; use common_datasource::file_format::Format; use common_datasource::object_store::{build_backend, parse_url}; use common_datasource::util::find_dir_and_filename; @@ -31,17 +32,17 @@ use object_store::ObjectStore; use query::plan::LogicalPlan; use session::context::QueryContextRef; use snafu::{OptionExt, ResultExt}; -use storage::sst::SstInfo; -use storage::{ParquetWriter, Source}; use table::engine::TableReference; use table::requests::CopyTableRequest; use table::table::adapter::DfTableProviderAdapter; -use crate::error::{ - self, BuildDfLogicalPlanSnafu, ExecLogicalPlanSnafu, Result, WriteParquetSnafu, -}; +use crate::error::{self, BuildDfLogicalPlanSnafu, ExecLogicalPlanSnafu, Result}; use crate::statement::StatementExecutor; +// The buffer size should be greater than 5MB (minimum multipart upload size). +/// Buffer size to flush data to object stores. +const WRITE_BUFFER_THRESHOLD: ReadableSize = ReadableSize::mb(8); + impl StatementExecutor { async fn stream_to_file( &self, @@ -50,7 +51,7 @@ impl StatementExecutor { object_store: ObjectStore, path: &str, ) -> Result { - let threshold = ReadableSize::mb(4).as_bytes() as usize; + let threshold = WRITE_BUFFER_THRESHOLD.as_bytes() as usize; match format { Format::Csv(_) => stream_to_csv( @@ -69,17 +70,14 @@ impl StatementExecutor { ) .await .context(error::WriteStreamToFileSnafu { path }), - Format::Parquet(_) => { - let writer = ParquetWriter::new(path, Source::Stream(stream), object_store); - let rows_copied = writer - .write_sst(&storage::sst::WriteOptions::default()) - .await - .context(WriteParquetSnafu)? - .map(|SstInfo { num_rows, .. }| num_rows) - .unwrap_or(0); - - Ok(rows_copied) - } + Format::Parquet(_) => stream_to_parquet( + Box::pin(DfRecordBatchStreamAdapter::new(stream)), + object_store, + path, + threshold, + ) + .await + .context(error::WriteStreamToFileSnafu { path }), _ => error::UnsupportedFormatSnafu { format: *format }.fail(), } } diff --git a/src/script/Cargo.toml b/src/script/Cargo.toml index 9a545912eeca..a443faf219ba 100644 --- a/src/script/Cargo.toml +++ b/src/script/Cargo.toml @@ -85,7 +85,6 @@ rayon = "1.0" ron = "0.7" serde = { version = "1.0", features = ["derive"] } session = { workspace = true, features = ["testing"] } -storage.workspace = true tokio-test = "0.4" [[bench]] diff --git a/src/storage/Cargo.toml b/src/storage/Cargo.toml deleted file mode 100644 index 58bb63e3a3eb..000000000000 --- a/src/storage/Cargo.toml +++ /dev/null @@ -1,64 +0,0 @@ -[package] -name = "storage" -version.workspace = true -edition.workspace = true -license.workspace = true - -[dependencies] -api.workspace = true -arc-swap = "1.0" -arrow-array.workspace = true -arrow.workspace = true -async-compat = "0.2" -async-stream.workspace = true -async-trait = "0.1" -bytes = "1.1" -common-base.workspace = true -common-datasource.workspace = true -common-error.workspace = true -common-macro.workspace = true -common-query.workspace = true -common-recordbatch.workspace = true -common-runtime.workspace = true -common-telemetry.workspace = true -common-time.workspace = true -datafusion-common.workspace = true -datafusion-expr.workspace = true -datafusion-physical-expr.workspace = true -datafusion.workspace = true -datatypes.workspace = true -futures-util.workspace = true -futures.workspace = true -itertools.workspace = true -lazy_static.workspace = true -object-store.workspace = true -parquet = { workspace = true, features = ["async"] } -paste.workspace = true -prometheus.workspace = true -prost.workspace = true -regex = "1.5" -serde.workspace = true -serde_json = "1.0" -snafu.workspace = true -store-api.workspace = true -table.workspace = true -tokio-util.workspace = true -tokio.workspace = true -tonic.workspace = true -uuid.workspace = true - -[dev-dependencies] -atomic_float = "0.1" -common-config.workspace = true -common-test-util.workspace = true -criterion = "0.3" -datatypes = { workspace = true, features = ["test"] } -log-store.workspace = true -rand.workspace = true - -[build-dependencies] -tonic-build = "0.9" - -[[bench]] -name = "bench_main" -harness = false diff --git a/src/storage/benches/bench_main.rs b/src/storage/benches/bench_main.rs deleted file mode 100644 index 6d89eea3a6e6..000000000000 --- a/src/storage/benches/bench_main.rs +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use criterion::criterion_main; - -mod memtable; -mod wal; - -criterion_main! { - memtable::bench_memtable_read::benches, - memtable::bench_memtable_write::benches, - memtable::bench_memtable_read_write_ratio::benches, - wal::bench_wal::benches, - wal::bench_decode::benches, - wal::bench_encode::benches, -} diff --git a/src/storage/benches/memtable/bench_memtable_read.rs b/src/storage/benches/memtable/bench_memtable_read.rs deleted file mode 100644 index b3d3e24f9433..000000000000 --- a/src/storage/benches/memtable/bench_memtable_read.rs +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use criterion::{criterion_group, criterion_main, Criterion, Throughput}; - -use crate::memtable::generate_kvs; -use crate::memtable::util::bench_context::BenchContext; - -fn bench_memtable_read(c: &mut Criterion) { - // the length of string in value is 20 - let kvs = generate_kvs(10, 10000, 20); - let ctx = BenchContext::new(); - kvs.iter().for_each(|kv| ctx.write(kv)); - let mut group = c.benchmark_group("memtable_read"); - let _ = group - .throughput(Throughput::Elements(10 * 10000)) - .bench_function("read", |b| b.iter(|| ctx.read(100))); - group.finish(); -} - -criterion_group!(benches, bench_memtable_read); -criterion_main!(benches); diff --git a/src/storage/benches/memtable/bench_memtable_read_write_ratio.rs b/src/storage/benches/memtable/bench_memtable_read_write_ratio.rs deleted file mode 100644 index a8b7b55c8fc1..000000000000 --- a/src/storage/benches/memtable/bench_memtable_read_write_ratio.rs +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; -use std::sync::Arc; -use std::thread; -use std::time::Instant; - -use atomic_float::AtomicF64; -use criterion::{ - criterion_group, criterion_main, BatchSize, Bencher, BenchmarkId, Criterion, Throughput, -}; -use rand::Rng; - -use crate::memtable::generate_kvs; -use crate::memtable::util::bench_context::BenchContext; - -static READ_NUM: AtomicUsize = AtomicUsize::new(0); -static WRITE_NUM: AtomicUsize = AtomicUsize::new(0); -static READ_SECS: AtomicF64 = AtomicF64::new(0.0); -static WRITE_SECS: AtomicF64 = AtomicF64::new(0.0); - -struct Input { - ratio: bool, - kv_size: usize, - batch_size: usize, -} - -fn memtable_round(ctx: &BenchContext, input: &Input) { - if input.ratio { - let now = Instant::now(); - let read_count = ctx.read(input.batch_size); - let d = now.elapsed(); - let _ = READ_SECS.fetch_add( - d.as_secs() as f64 + d.subsec_nanos() as f64 * 1e-9, - Ordering::Relaxed, - ); - let _ = READ_NUM.fetch_add(read_count, Ordering::Relaxed); - } else { - generate_kvs(input.kv_size, input.batch_size, 20) - .iter() - .for_each(|kv| { - let now = Instant::now(); - ctx.write(kv); - let d = now.elapsed(); - let _ = WRITE_SECS.fetch_add( - d.as_secs() as f64 + d.subsec_nanos() as f64 * 1e-9, - Ordering::Relaxed, - ); - let _ = WRITE_NUM.fetch_add(kv.len(), Ordering::Relaxed); - }); - } -} - -fn bench_read_write_ctx_frac(b: &mut Bencher<'_>, frac: &usize) { - let frac = *frac; - let ctx = Arc::new(BenchContext::default()); - let thread_ctx = ctx.clone(); - let stop = Arc::new(AtomicBool::new(false)); - let thread_stop = stop.clone(); - - let handle = thread::spawn(move || { - let mut rng = rand::thread_rng(); - while !thread_stop.load(Ordering::Relaxed) { - let f = rng.gen_range(0..=10); - let input = Input { - ratio: f < frac, - kv_size: 100, - batch_size: 1000, - }; - memtable_round(&thread_ctx, &input); - } - }); - - let mut rng = rand::thread_rng(); - b.iter_batched_ref( - || { - let f = rng.gen_range(0..=10); - Input { - ratio: f < frac, - kv_size: 100, - batch_size: 1000, - } - }, - |input| { - memtable_round(&ctx, input); - }, - BatchSize::SmallInput, - ); - stop.store(true, Ordering::Relaxed); - handle.join().unwrap(); -} - -#[allow(clippy::print_stdout)] -fn bench_memtable_read_write_ratio(c: &mut Criterion) { - let mut group = c.benchmark_group("memtable_read_write_ratio"); - for i in 0..=10 { - READ_NUM.store(0, Ordering::Relaxed); - WRITE_NUM.store(0, Ordering::Relaxed); - READ_SECS.store(0.0, Ordering::Relaxed); - WRITE_SECS.store(0.0, Ordering::Relaxed); - - let _ = group - .bench_with_input( - BenchmarkId::from_parameter(format!( - "read ratio: {:.2}% , write ratio: {:.2}%", - i as f64 / 10_f64 * 100.0, - (10 - i) as f64 / 10_f64 * 100.0, - )), - &i, - bench_read_write_ctx_frac, - ) - .throughput(Throughput::Elements(100 * 1000)); - - // the time is a little different the real time - let read_num = READ_NUM.load(Ordering::Relaxed); - let read_time = READ_SECS.load(Ordering::Relaxed); - let read_tps = if read_time != 0.0 { - read_num as f64 / read_time - } else { - 0.0 - }; - let write_num = WRITE_NUM.load(Ordering::Relaxed); - let write_time = WRITE_SECS.load(Ordering::Relaxed); - let write_tps = if write_time != 0.0 { - write_num as f64 / write_time - } else { - 0.0 - }; - if read_num != 0 || write_num != 0 { - println!( - "\nread numbers: {read_num}, read thrpt: {read_tps}\nwrite numbers: {write_num}, write thrpt {write_tps}\n", - ); - } - } - group.finish(); -} - -criterion_group!(benches, bench_memtable_read_write_ratio); -criterion_main!(benches); diff --git a/src/storage/benches/memtable/bench_memtable_write.rs b/src/storage/benches/memtable/bench_memtable_write.rs deleted file mode 100644 index ba2a747f069e..000000000000 --- a/src/storage/benches/memtable/bench_memtable_write.rs +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use criterion::{criterion_group, criterion_main, Criterion, Throughput}; - -use crate::memtable::generate_kvs; -use crate::memtable::util::bench_context::BenchContext; - -pub fn bench_memtable_write(c: &mut Criterion) { - // the length of string in value is 20 - let kvs = generate_kvs(10, 1000, 20); - let mut group = c.benchmark_group("memtable_write"); - let _ = group - .throughput(Throughput::Elements(10 * 1000)) - .bench_function("write", |b| { - let ctx = BenchContext::new(); - b.iter(|| kvs.iter().for_each(|kv| ctx.write(kv))) - }); - group.finish(); -} - -criterion_group!(benches, bench_memtable_write); -criterion_main!(benches); diff --git a/src/storage/benches/memtable/mod.rs b/src/storage/benches/memtable/mod.rs deleted file mode 100644 index c0b6cd0ce2ca..000000000000 --- a/src/storage/benches/memtable/mod.rs +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub mod bench_memtable_read; -pub mod bench_memtable_read_write_ratio; -pub mod bench_memtable_write; -pub mod util; - -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::Arc; - -use api::v1::OpType; -use datatypes::prelude::ScalarVectorBuilder; -use datatypes::timestamp::TimestampMillisecond; -use datatypes::vectors::{ - StringVectorBuilder, TimestampMillisecondVectorBuilder, UInt64VectorBuilder, -}; -use rand::distributions::Alphanumeric; -use rand::prelude::ThreadRng; -use rand::Rng; -use storage::memtable::KeyValues; -use store_api::storage::SequenceNumber; - -static NEXT_SEQUENCE: AtomicU64 = AtomicU64::new(0); - -fn get_sequence() -> SequenceNumber { - NEXT_SEQUENCE.fetch_add(1, Ordering::Relaxed) -} - -fn random_kv(rng: &mut ThreadRng, value_size: usize) -> ((i64, u64), (Option, String)) { - let key0 = rng.gen_range(0..10000); - let key1 = rng.gen::(); - let value1 = Some(rng.gen::()); - let value2 = rand::thread_rng() - .sample_iter(&Alphanumeric) - .take(value_size) - .map(char::from) - .collect(); - ((key0, key1), (value1, value2)) -} -type KeyTuple = (i64, u64); -type ValueTuple = (Option, String); - -fn random_kvs(len: usize, value_size: usize) -> (Vec, Vec) { - let mut keys = Vec::with_capacity(len); - let mut values = Vec::with_capacity(len); - for _ in 0..len { - let mut rng = rand::thread_rng(); - let (key, value) = random_kv(&mut rng, value_size); - keys.push(key); - values.push(value); - } - (keys, values) -} - -fn kvs_with_index( - sequence: SequenceNumber, - op_type: OpType, - start_index_in_batch: usize, - keys: &[(i64, u64)], - values: &[(Option, String)], -) -> KeyValues { - let mut key_builders = ( - TimestampMillisecondVectorBuilder::with_capacity(keys.len()), - UInt64VectorBuilder::with_capacity(keys.len()), - ); - for key in keys { - key_builders.0.push(Some(TimestampMillisecond::from(key.0))); - key_builders.1.push(Some(key.1)); - } - let row_keys = vec![Arc::new(key_builders.1.finish()) as _]; - - let mut value_builders = ( - UInt64VectorBuilder::with_capacity(values.len()), - StringVectorBuilder::with_capacity(values.len()), - ); - for value in values { - value_builders.0.push(value.0); - value_builders.1.push(Some(&value.1)); - } - let row_values = vec![ - Arc::new(value_builders.0.finish()) as _, - Arc::new(value_builders.1.finish()) as _, - ]; - KeyValues { - sequence, - op_type, - start_index_in_batch, - keys: row_keys, - values: row_values, - timestamp: Some(Arc::new(key_builders.0.finish()) as _), - } -} - -fn generate_kv(kv_size: usize, start_index_in_batch: usize, value_size: usize) -> KeyValues { - let (keys, values) = random_kvs(kv_size, value_size); - kvs_with_index( - get_sequence(), - OpType::Put, - start_index_in_batch, - &keys, - &values, - ) -} - -fn generate_kvs(kv_size: usize, size: usize, value_size: usize) -> Vec { - (0..size) - .map(|i| generate_kv(kv_size, i, value_size)) - .collect() -} diff --git a/src/storage/benches/memtable/util/bench_context.rs b/src/storage/benches/memtable/util/bench_context.rs deleted file mode 100644 index 0345ab35088b..000000000000 --- a/src/storage/benches/memtable/util/bench_context.rs +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use storage::memtable::{IterContext, KeyValues, MemtableRef}; - -use crate::memtable::util::new_memtable; - -pub struct BenchContext { - memtable: MemtableRef, -} -impl Default for BenchContext { - fn default() -> Self { - BenchContext::new() - } -} -impl BenchContext { - pub fn new() -> BenchContext { - BenchContext { - memtable: new_memtable(), - } - } - - pub fn write(&self, kvs: &KeyValues) { - self.memtable.write(kvs).unwrap(); - } - - pub fn read(&self, batch_size: usize) -> usize { - let mut read_count = 0; - let iter_ctx = IterContext { - batch_size, - ..Default::default() - }; - let iter = self.memtable.iter(iter_ctx).unwrap(); - for batch in iter { - let _ = batch.unwrap(); - read_count += batch_size; - } - read_count - } -} diff --git a/src/storage/benches/memtable/util/mod.rs b/src/storage/benches/memtable/util/mod.rs deleted file mode 100644 index 78787e04f323..000000000000 --- a/src/storage/benches/memtable/util/mod.rs +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub mod bench_context; -pub mod regiondesc_util; -pub mod schema_util; - -use datatypes::type_id::LogicalTypeId; -use storage::memtable::{DefaultMemtableBuilder, MemtableBuilder, MemtableRef}; -use storage::metadata::RegionMetadata; -use storage::schema::RegionSchemaRef; - -use crate::memtable::util::regiondesc_util::RegionDescBuilder; - -pub const TIMESTAMP_NAME: &str = "timestamp"; - -pub fn schema_for_test() -> RegionSchemaRef { - let desc = RegionDescBuilder::new("bench") - .push_field_column(("v1", LogicalTypeId::UInt64, true)) - .push_field_column(("v2", LogicalTypeId::String, true)) - .build(); - let metadata: RegionMetadata = desc.try_into().unwrap(); - - metadata.schema().clone() -} - -pub fn new_memtable() -> MemtableRef { - DefaultMemtableBuilder::default().build(schema_for_test()) -} diff --git a/src/storage/benches/memtable/util/regiondesc_util.rs b/src/storage/benches/memtable/util/regiondesc_util.rs deleted file mode 100644 index 937cccd9e56c..000000000000 --- a/src/storage/benches/memtable/util/regiondesc_util.rs +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use datatypes::prelude::ConcreteDataType; -use store_api::storage::{ - ColumnDescriptor, ColumnDescriptorBuilder, ColumnFamilyDescriptorBuilder, ColumnId, - RegionDescriptor, RowKeyDescriptorBuilder, -}; - -use super::schema_util::ColumnDef; -use super::TIMESTAMP_NAME; - -pub struct RegionDescBuilder { - name: String, - last_column_id: ColumnId, - key_builder: RowKeyDescriptorBuilder, - default_cf_builder: ColumnFamilyDescriptorBuilder, -} - -impl RegionDescBuilder { - pub fn new>(name: T) -> Self { - let key_builder = RowKeyDescriptorBuilder::new( - ColumnDescriptorBuilder::new( - 1, - TIMESTAMP_NAME, - ConcreteDataType::timestamp_millisecond_datatype(), - ) - .is_nullable(false) - .build() - .unwrap(), - ); - - Self { - name: name.into(), - last_column_id: 1, - key_builder, - default_cf_builder: ColumnFamilyDescriptorBuilder::default(), - } - } - - pub fn push_field_column(mut self, column_def: ColumnDef) -> Self { - let column = self.new_column(column_def); - self.default_cf_builder = self.default_cf_builder.push_column(column); - self - } - - pub fn build(self) -> RegionDescriptor { - RegionDescriptor { - id: 0.into(), - name: self.name, - row_key: self.key_builder.build().unwrap(), - default_cf: self.default_cf_builder.build().unwrap(), - extra_cfs: Vec::new(), - } - } - - fn alloc_column_id(&mut self) -> ColumnId { - self.last_column_id += 1; - self.last_column_id - } - - fn new_column(&mut self, column_def: ColumnDef) -> ColumnDescriptor { - let datatype = column_def.1.data_type(); - ColumnDescriptorBuilder::new(self.alloc_column_id(), column_def.0, datatype) - .is_nullable(column_def.2) - .build() - .unwrap() - } -} diff --git a/src/storage/benches/memtable/util/schema_util.rs b/src/storage/benches/memtable/util/schema_util.rs deleted file mode 100644 index f20e74d966b3..000000000000 --- a/src/storage/benches/memtable/util/schema_util.rs +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use datatypes::prelude::*; -use datatypes::schema::{ColumnSchema, Schema, SchemaBuilder, SchemaRef}; - -/// Column definition: (name, datatype, is_nullable) -pub type ColumnDef<'a> = (&'a str, LogicalTypeId, bool); - -pub fn new_schema(column_defs: &[ColumnDef], timestamp_index: Option) -> Schema { - let column_schemas: Vec<_> = column_defs - .iter() - .enumerate() - .map(|(index, column_def)| { - let datatype = column_def.1.data_type(); - if let Some(timestamp_index) = timestamp_index { - ColumnSchema::new(column_def.0, datatype, column_def.2) - .with_time_index(index == timestamp_index) - } else { - ColumnSchema::new(column_def.0, datatype, column_def.2) - } - }) - .collect(); - - SchemaBuilder::try_from(column_schemas) - .unwrap() - .build() - .unwrap() -} - -pub fn new_schema_ref(column_defs: &[ColumnDef], timestamp_index: Option) -> SchemaRef { - Arc::new(new_schema(column_defs, timestamp_index)) -} diff --git a/src/storage/benches/wal/bench_decode.rs b/src/storage/benches/wal/bench_decode.rs deleted file mode 100644 index 617f2517d095..000000000000 --- a/src/storage/benches/wal/bench_decode.rs +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use criterion::{criterion_group, criterion_main, Criterion}; -use storage::codec::{Decoder, Encoder}; -use storage::write_batch::{codec, WriteBatch}; - -use crate::wal::util::gen_new_batch_and_types; - -/* -------------------------------------- - decode | -------------------------------------- -rows | protobuf | arrow | ------------------------------------- -10 | 8.6485 us | 8.8028 us | ------------------------------------- -100 | 63.850 us | 46.174 us | ------------------------------------- -10000| 654.46 us | 433.58 us | ------------------------------------- -*/ - -fn encode_arrow(batch: &WriteBatch, dst: &mut Vec) { - let encoder = codec::PayloadEncoder::new(); - encoder.encode(batch.payload(), dst).unwrap(); -} - -fn decode_arrow(dst: &[u8], mutation_types: &[i32]) { - let decoder = codec::PayloadDecoder::new(mutation_types); - let _ = decoder.decode(dst).unwrap(); -} - -fn bench_wal_decode(c: &mut Criterion) { - let (batch_10, types_10) = gen_new_batch_and_types(1); - let (batch_100, types_100) = gen_new_batch_and_types(10); - let (batch_10000, types_10000) = gen_new_batch_and_types(100); - - let mut dst_arrow_10 = vec![]; - let mut dst_arrow_100 = vec![]; - let mut dst_arrow_10000 = vec![]; - - encode_arrow(&batch_10, &mut dst_arrow_10); - encode_arrow(&batch_100, &mut dst_arrow_100); - encode_arrow(&batch_10000, &mut dst_arrow_10000); - - let mut group = c.benchmark_group("wal_decode"); - let _ = group - .bench_function("arrow_decode_with_10_num_rows", |b| { - b.iter(|| decode_arrow(&dst_arrow_10, &types_10)) - }) - .bench_function("arrow_decode_with_100_num_rows", |b| { - b.iter(|| decode_arrow(&dst_arrow_100, &types_100)) - }) - .bench_function("arrow_decode_with_10000_num_rows", |b| { - b.iter(|| decode_arrow(&dst_arrow_10000, &types_10000)) - }); - group.finish(); -} - -criterion_group!(benches, bench_wal_decode); -criterion_main!(benches); diff --git a/src/storage/benches/wal/bench_encode.rs b/src/storage/benches/wal/bench_encode.rs deleted file mode 100644 index 14d7dd80b9cc..000000000000 --- a/src/storage/benches/wal/bench_encode.rs +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use criterion::{criterion_group, criterion_main, Criterion}; -use storage::codec::Encoder; -use storage::write_batch::{codec, WriteBatch}; - -use crate::wal::util::gen_new_batch_and_types; - -/* -------------------------------------- - encode | -------------------------------------- -rows | protobuf | arrow | ------------------------------------- -10 | 4.8732 us | 5.7388 us | ------------------------------------- -100 | 40.928 us | 24.988 us | ------------------------------------- -10000| 425.69 us | 229.74 us | ------------------------------------- -*/ - -fn encode_arrow(batch: &WriteBatch) { - let encoder = codec::PayloadEncoder::new(); - let mut dst = vec![]; - encoder.encode(batch.payload(), &mut dst).unwrap(); -} - -fn bench_wal_encode(c: &mut Criterion) { - let (batch_10, _) = gen_new_batch_and_types(1); - let (batch_100, _) = gen_new_batch_and_types(10); - let (batch_10000, _) = gen_new_batch_and_types(100); - - let mut group = c.benchmark_group("wal_encode"); - let _ = group - .bench_function("arrow_encode_with_10_num_rows", |b| { - b.iter(|| encode_arrow(&batch_10)) - }) - .bench_function("arrow_encode_with_100_num_rows", |b| { - b.iter(|| encode_arrow(&batch_100)) - }) - .bench_function("arrow_encode_with_10000_num_rows", |b| { - b.iter(|| encode_arrow(&batch_10000)) - }); - group.finish(); -} - -criterion_group!(benches, bench_wal_encode); -criterion_main!(benches); diff --git a/src/storage/benches/wal/bench_wal.rs b/src/storage/benches/wal/bench_wal.rs deleted file mode 100644 index 20a8db3c3181..000000000000 --- a/src/storage/benches/wal/bench_wal.rs +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use criterion::{criterion_group, criterion_main, Criterion}; -use storage::codec::{Decoder, Encoder}; -use storage::write_batch::{codec, WriteBatch}; - -use crate::wal::util::gen_new_batch_and_types; - -/* -------------------------------------- - encode & decode | -------------------------------------- -rows | protobuf | arrow | ------------------------------------- -10 | 13.845 us | 15.093 us | ------------------------------------- -100 | 106.70 us | 73.895 us | ------------------------------------- -10000| 1.0860 ms | 680.12 us | ------------------------------------- -*/ - -fn codec_arrow(batch: &WriteBatch, mutation_types: &[i32]) { - let encoder = codec::PayloadEncoder::new(); - let mut dst = vec![]; - encoder.encode(batch.payload(), &mut dst).unwrap(); - - let decoder = codec::PayloadDecoder::new(mutation_types); - let _ = decoder.decode(&dst).unwrap(); -} - -fn bench_wal_encode_decode(c: &mut Criterion) { - let (batch_10, types_10) = gen_new_batch_and_types(1); - let (batch_100, types_100) = gen_new_batch_and_types(10); - let (batch_10000, types_10000) = gen_new_batch_and_types(100); - - let mut group = c.benchmark_group("wal_encode_decode"); - let _ = group - .bench_function("arrow_encode_decode_with_10_num_rows", |b| { - b.iter(|| codec_arrow(&batch_10, &types_10)) - }) - .bench_function("arrow_encode_decode_with_100_num_rows", |b| { - b.iter(|| codec_arrow(&batch_100, &types_100)) - }) - .bench_function("arrow_encode_decode_with_10000_num_rows", |b| { - b.iter(|| codec_arrow(&batch_10000, &types_10000)) - }); - group.finish(); -} - -criterion_group!(benches, bench_wal_encode_decode); -criterion_main!(benches); diff --git a/src/storage/benches/wal/mod.rs b/src/storage/benches/wal/mod.rs deleted file mode 100644 index 55f04ce477f6..000000000000 --- a/src/storage/benches/wal/mod.rs +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub mod bench_decode; -pub mod bench_encode; -pub mod bench_wal; -pub mod util; diff --git a/src/storage/benches/wal/util/mod.rs b/src/storage/benches/wal/util/mod.rs deleted file mode 100644 index 9328172729c8..000000000000 --- a/src/storage/benches/wal/util/mod.rs +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub mod write_batch_util; - -use std::collections::HashMap; -use std::sync::Arc; - -use datatypes::prelude::ScalarVector; -use datatypes::type_id::LogicalTypeId; -use datatypes::vectors::{ - BooleanVector, Float64Vector, StringVector, TimestampMillisecondVector, UInt64Vector, VectorRef, -}; -use rand::Rng; -use storage::proto; -use storage::write_batch::WriteBatch; -use store_api::storage::WriteRequest; - -pub fn new_test_batch() -> WriteBatch { - write_batch_util::new_write_batch( - &[ - ("k1", LogicalTypeId::UInt64, false), - ("ts", LogicalTypeId::TimestampMillisecond, false), - ("v1", LogicalTypeId::Boolean, true), - ("4", LogicalTypeId::Float64, false), - ("5", LogicalTypeId::Float64, false), - ("6", LogicalTypeId::Float64, false), - ("7", LogicalTypeId::Float64, false), - ("8", LogicalTypeId::Float64, false), - ("9", LogicalTypeId::Float64, false), - ("10", LogicalTypeId::String, false), - ], - Some(2), - 3, - ) -} - -pub fn gen_new_batch_and_types(putdate_nums: usize) -> (WriteBatch, Vec) { - let mut batch = new_test_batch(); - let mut rng = rand::thread_rng(); - for _ in 0..putdate_nums { - let mut intvs = [0u64; 10]; - let mut boolvs = [true; 10]; - let mut tsvs = [0i64; 10]; - let mut fvs = [0.0_f64; 10]; - let svs = [ - "value1_string", - "value2_string", - "value3_string", - "value4_string", - "value5_string", - "value6_string", - "value7_string", - "value8_string", - "value9_string", - "value10_string", - ]; - rng.fill(&mut intvs[..]); - rng.fill(&mut boolvs[..]); - rng.fill(&mut tsvs[..]); - rng.fill(&mut fvs[..]); - let intv = Arc::new(UInt64Vector::from_slice(intvs)) as VectorRef; - let boolv = Arc::new(BooleanVector::from(boolvs.to_vec())) as VectorRef; - let tsv = Arc::new(TimestampMillisecondVector::from_values(tsvs)) as VectorRef; - let fvs = Arc::new(Float64Vector::from_slice(fvs)) as VectorRef; - let svs = Arc::new(StringVector::from_slice(&svs)) as VectorRef; - let put_data = HashMap::from([ - ("k1".to_string(), intv.clone()), - ("v1".to_string(), boolv), - ("ts".to_string(), tsv.clone()), - ("4".to_string(), fvs.clone()), - ("5".to_string(), fvs.clone()), - ("6".to_string(), fvs.clone()), - ("7".to_string(), fvs.clone()), - ("8".to_string(), fvs.clone()), - ("9".to_string(), fvs), - ("10".to_string(), svs), - ]); - batch.put(put_data).unwrap(); - } - let types = proto::wal::gen_mutation_types(batch.payload()); - (batch, types) -} diff --git a/src/storage/benches/wal/util/write_batch_util.rs b/src/storage/benches/wal/util/write_batch_util.rs deleted file mode 100644 index 640138c3e51a..000000000000 --- a/src/storage/benches/wal/util/write_batch_util.rs +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use storage::write_batch::WriteBatch; - -use crate::memtable::util::schema_util::{self, ColumnDef}; - -pub fn new_write_batch( - column_defs: &[ColumnDef], - timestamp_index: Option, - row_key_end: usize, -) -> WriteBatch { - let schema = schema_util::new_schema_ref(column_defs, timestamp_index); - - WriteBatch::new(schema, row_key_end) -} diff --git a/src/storage/build.rs b/src/storage/build.rs deleted file mode 100644 index 1e5230ecc4be..000000000000 --- a/src/storage/build.rs +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -fn main() { - tonic_build::configure() - .compile(&["proto/wal.proto"], &["."]) - .expect("compile proto"); -} diff --git a/src/storage/proto/wal.proto b/src/storage/proto/wal.proto deleted file mode 100644 index 94bb14cda1b2..000000000000 --- a/src/storage/proto/wal.proto +++ /dev/null @@ -1,14 +0,0 @@ -syntax = "proto3"; - -package greptime.storage.wal.v1; - -message WalHeader { - uint64 last_manifest_version = 1; - // Type of each mutation in payload, now only arrow payload uses this field. - repeated MutationType mutation_types = 2; -} - -enum MutationType { - DELETE = 0; - PUT = 1; -} diff --git a/src/storage/src/chunk.rs b/src/storage/src/chunk.rs deleted file mode 100644 index e8fc4a555c83..000000000000 --- a/src/storage/src/chunk.rs +++ /dev/null @@ -1,451 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use async_trait::async_trait; -use common_query::logical_plan::Expr; -use common_recordbatch::OrderOption; -use common_telemetry::logging; -use common_time::range::TimestampRange; -use snafu::ResultExt; -use store_api::storage::{Chunk, ChunkReader, RegionId, SchemaRef, SequenceNumber}; -use table::predicate::{Predicate, TimeRangePredicateBuilder}; - -use crate::error::{self, Error, Result}; -use crate::memtable::{IterContext, MemtableRef}; -use crate::read::{ - Batch, BoxedBatchReader, ChainReader, DedupReader, MergeReaderBuilder, WindowedReader, -}; -use crate::schema::{ProjectedSchema, ProjectedSchemaRef, RegionSchemaRef}; -use crate::sst::{AccessLayerRef, FileHandle, LevelMetas, ReadOptions}; -use crate::window_infer::{PlainWindowInference, WindowInfer}; - -/// Chunk reader implementation. -// Now we use async-trait to implement the chunk reader, which is easier to implement than -// using `Stream`, maybe change to `Stream` if we find out it is more efficient and have -// necessary to do so. -pub struct ChunkReaderImpl { - schema: ProjectedSchemaRef, - batch_reader: BoxedBatchReader, - output_ordering: Option>, -} - -#[async_trait] -impl ChunkReader for ChunkReaderImpl { - type Error = Error; - - fn user_schema(&self) -> &SchemaRef { - self.schema.projected_user_schema() - } - - async fn next_chunk(&mut self) -> Result> { - let batch = match self.batch_reader.next_batch().await? { - Some(b) => b, - None => return Ok(None), - }; - Ok(Some(Chunk::new(batch.columns))) - } - - fn project_chunk(&self, chunk: Chunk) -> Chunk { - let batch = Batch { - columns: chunk.columns, - }; - self.schema.batch_to_chunk(&batch) - } - - fn output_ordering(&self) -> Option> { - self.output_ordering.clone() - } -} - -impl ChunkReaderImpl { - pub fn new( - schema: ProjectedSchemaRef, - batch_reader: BoxedBatchReader, - output_ordering: Option>, - ) -> ChunkReaderImpl { - ChunkReaderImpl { - schema, - batch_reader, - output_ordering, - } - } - - #[inline] - pub fn projected_schema(&self) -> &ProjectedSchemaRef { - &self.schema - } -} - -/// Builder to create a new [ChunkReaderImpl] from scan request. -pub struct ChunkReaderBuilder { - region_id: RegionId, - schema: RegionSchemaRef, - projection: Option>, - filters: Vec, - sst_layer: AccessLayerRef, - iter_ctx: IterContext, - memtables: Vec, - files_to_read: Vec, - output_ordering: Option>, - use_chain_reader: bool, -} - -impl ChunkReaderBuilder { - pub fn new(region_id: RegionId, schema: RegionSchemaRef, sst_layer: AccessLayerRef) -> Self { - ChunkReaderBuilder { - region_id, - schema, - projection: None, - filters: vec![], - sst_layer, - iter_ctx: IterContext::default(), - memtables: Vec::new(), - files_to_read: Vec::new(), - output_ordering: None, - use_chain_reader: false, - } - } - - /// Reserve space for iterating `num` memtables. - pub fn reserve_num_memtables(mut self, num: usize) -> Self { - self.memtables.reserve(num); - self - } - - pub fn projection(mut self, projection: Option>) -> Self { - self.projection = projection; - self - } - - pub fn filters(mut self, filters: Vec) -> Self { - self.filters = filters; - self - } - - pub fn output_ordering(mut self, ordering: Option>) -> Self { - self.output_ordering = ordering; - self - } - - pub fn batch_size(mut self, batch_size: usize) -> Self { - self.iter_ctx.batch_size = batch_size; - self - } - - pub fn visible_sequence(mut self, sequence: SequenceNumber) -> Self { - self.iter_ctx.visible_sequence = sequence; - self - } - - pub fn pick_memtables(mut self, memtables: MemtableRef) -> Self { - self.memtables.push(memtables); - self - } - - /// Partition files and memtables according to their time windows and scan time windows - /// one by one. - /// - /// Note that compaction should not enable this. - pub fn use_chain_reader(mut self, use_chain_reader: bool) -> Self { - self.use_chain_reader = use_chain_reader; - self - } - - /// Picks all SSTs in all levels - pub fn pick_all_ssts(mut self, ssts: &LevelMetas) -> Result { - let files = ssts.levels().iter().flat_map(|level| level.files()); - // Now we read all files, so just reserve enough space to hold all files. - self.files_to_read.reserve(files.size_hint().0); - for file in files { - // We can't invoke async functions here, so we collects all files first, and - // create the batch reader later in `ChunkReaderBuilder`. - self.files_to_read.push(file.clone()); - } - Ok(self) - } - - /// Picks given SSTs to read. - pub fn pick_ssts(mut self, ssts: &[FileHandle]) -> Self { - for file in ssts { - self.files_to_read.push(file.clone()); - } - self - } - - /// Try to infer time window from output ordering. If the result - /// is `None` means the output ordering is not obeyed, otherwise - /// means the output ordering is obeyed and is same with request. - fn infer_time_windows(&self, output_ordering: &[OrderOption]) -> Option> { - if output_ordering.is_empty() { - return None; - } - let OrderOption { name, options } = &output_ordering[0]; - - if name != self.schema.timestamp_column_name() { - return None; - } - let memtable_stats = self - .memtables - .iter() - .filter(|m| m.num_rows() > 0) // Skip empty memtables. - .map(|m| m.stats()) - .collect::>(); - let files = self - .files_to_read - .iter() - .map(FileHandle::meta) - .collect::>(); - - Some(PlainWindowInference {}.infer_window(&files, &memtable_stats, options.descending)) - } - - async fn build_windowed( - self, - schema: &ProjectedSchemaRef, - time_range_predicate: &TimestampRange, - windows: Vec, - order_options: Vec, - ) -> Result { - let mut readers = Vec::with_capacity(windows.len()); - for window in windows { - let time_range_predicate = time_range_predicate.and(&window); - let reader = self.build_reader(schema, &time_range_predicate).await?; - readers.push(reader); - } - let windowed_reader = WindowedReader::new(schema.clone(), readers, order_options); - Ok(Box::new(windowed_reader) as Box<_>) - } - - async fn build_reader( - &self, - schema: &ProjectedSchemaRef, - time_range: &TimestampRange, - ) -> Result { - let num_sources = self.memtables.len() + self.files_to_read.len(); - let mut reader_builder = MergeReaderBuilder::with_capacity(schema.clone(), num_sources) - .batch_size(self.iter_ctx.batch_size); - - for mem in &self.memtables { - let mut iter_ctx = self.iter_ctx.clone(); - iter_ctx.time_range = Some(*time_range); - let iter = mem.iter(iter_ctx)?; - reader_builder = reader_builder.push_batch_iter(iter); - } - - let predicate = Predicate::new(self.filters.clone()); - - let read_opts = ReadOptions { - batch_size: self.iter_ctx.batch_size, - projected_schema: schema.clone(), - predicate, - time_range: *time_range, - }; - - let mut num_read_files = 0; - for file in &self.files_to_read { - if !Self::file_in_range(file, time_range) { - logging::debug!( - "Skip region {} file {:?}, predicate: {:?}", - self.region_id, - file, - time_range - ); - continue; - } - - let reader = self.sst_layer.read_sst(file.clone(), &read_opts).await?; - reader_builder = reader_builder.push_batch_reader(reader); - num_read_files += 1; - } - - logging::debug!( - "build reader done, region_id: {}, time_range: {:?}, total_files: {}, num_read_files: {}", - self.region_id, - time_range, - self.files_to_read.len(), - num_read_files, - ); - - let reader = reader_builder.build(); - let reader = DedupReader::new(schema.clone(), reader); - Ok(Box::new(reader) as Box<_>) - } - - pub async fn build(mut self) -> Result { - let time_range_predicate = self.build_time_range_predicate(); - let schema = Arc::new( - ProjectedSchema::new(self.schema.clone(), self.projection.clone()) - .context(error::InvalidProjectionSnafu)?, - ); - self.iter_ctx.projected_schema = Some(schema.clone()); - - let mut output_ordering = None; - let reader = if let Some(ordering) = self.output_ordering.take() && - let Some(windows) = self.infer_time_windows(&ordering) { - output_ordering = Some(ordering.clone()); - self.build_windowed(&schema, &time_range_predicate, windows, ordering) - .await? - } else if self.use_chain_reader { - self.build_chained(&schema, &time_range_predicate).await? - } else { - self.build_reader(&schema, &time_range_predicate).await? - }; - - Ok(ChunkReaderImpl::new(schema, reader, output_ordering)) - } - - async fn build_chained( - &self, - schema: &ProjectedSchemaRef, - time_range: &TimestampRange, - ) -> Result { - let windows = self.infer_window_for_chain_reader(time_range); - - logging::debug!( - "Infer window for chain reader, region_id: {}, memtables: {}, files: {}, num_windows: {}", - self.region_id, - self.memtables.len(), - self.files_to_read.len(), - windows.len(), - ); - - let mut readers = Vec::with_capacity(windows.len()); - for window in &windows { - let time_range = time_range.and(window); - let reader = self.build_reader(schema, &time_range).await?; - readers.push(reader); - } - - logging::debug!( - "Build chain reader, region_id: {}, time_range: {:?}, num_readers: {}", - self.region_id, - time_range, - readers.len(), - ); - - let chain_reader = ChainReader::new(schema.clone(), readers); - Ok(Box::new(chain_reader) as Box<_>) - } - - /// Build time range predicate from schema and filters. - fn build_time_range_predicate(&self) -> TimestampRange { - let Some(ts_col) = self.schema.user_schema().timestamp_column() else { - return TimestampRange::min_to_max(); - }; - let unit = ts_col - .data_type - .as_timestamp() - .expect("Timestamp column must have timestamp-compatible type") - .unit(); - TimeRangePredicateBuilder::new(&ts_col.name, unit, &self.filters).build() - } - - /// Check if SST file's time range matches predicate. - fn file_in_range(file: &FileHandle, predicate: &TimestampRange) -> bool { - if predicate == &TimestampRange::min_to_max() { - return true; - } - // end_timestamp of sst file is inclusive. - let Some((start, end)) = *file.time_range() else { - return true; - }; - let file_ts_range = TimestampRange::new_inclusive(Some(start), Some(end)); - file_ts_range.intersects(predicate) - } - - /// Returns the time range of memtables to read. - fn compute_memtable_range(&self) -> Option { - let (min_timestamp, max_timestamp) = self - .memtables - .iter() - .filter(|m| m.num_rows() > 0) // Skip empty memtables. - .map(|m| { - let stats = m.stats(); - (stats.min_timestamp, stats.max_timestamp) - }) - .reduce(|acc, e| (acc.0.min(e.0), acc.1.max(e.1)))?; - - logging::debug!( - "Compute memtable range, region_id: {}, min: {:?}, max: {:?}", - self.region_id, - min_timestamp, - max_timestamp, - ); - - Some(TimestampRange::new_inclusive( - Some(min_timestamp), - Some(max_timestamp), - )) - } - - /// Infer time window for chain reader according to the time range of memtables and files. - fn infer_window_for_chain_reader(&self, time_range: &TimestampRange) -> Vec { - let mut memtable_range = self.compute_memtable_range(); - // file ranges: (start, end) - let mut file_ranges = Vec::with_capacity(self.files_to_read.len()); - for file in &self.files_to_read { - if !Self::file_in_range(file, time_range) || file.time_range().is_none() { - continue; - } - // Safety: we have skip files whose range is `None`. - let range = file.time_range().unwrap(); - - // Filter by memtable's time range. - if let Some(mem_range) = &mut memtable_range { - let file_range = TimestampRange::new_inclusive(Some(range.0), Some(range.1)); - if mem_range.intersects(&file_range) { - // If the range of the SST intersects with the range of the - // memtable, we merge it into the memtable's range. - *mem_range = mem_range.or(&file_range); - continue; - } - } - - file_ranges.push((range.0, range.1)); - } - - if file_ranges.is_empty() { - return memtable_range.map(|range| vec![range]).unwrap_or_default(); - } - - // Sort by start times. - file_ranges.sort_unstable_by(|left, right| left.0.cmp(&right.0)); - - // Compute ranges for all SSTs. - let mut time_ranges = Vec::with_capacity(file_ranges.len() + 1); - // Safety: file_ranges is not empty. - let mut prev = - TimestampRange::new_inclusive(Some(file_ranges[0].0), Some(file_ranges[0].1)); - for file_range in &file_ranges[1..] { - let current = TimestampRange::new_inclusive(Some(file_range.0), Some(file_range.1)); - if prev.intersects(¤t) { - prev = prev.or(¤t); - } else { - time_ranges.push(prev); - prev = current; - } - } - time_ranges.push(prev); - - if let Some(mem_range) = memtable_range { - time_ranges.push(mem_range); - // We have pushed the memtable range, resort the array. - time_ranges.sort_unstable_by(|left, right| left.start().cmp(right.start())); - } - - time_ranges - } -} diff --git a/src/storage/src/codec.rs b/src/storage/src/codec.rs deleted file mode 100644 index 380aa1c83ddf..000000000000 --- a/src/storage/src/codec.rs +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_error::ext::ErrorExt; - -pub trait Encoder { - /// The type that is decoded. - type Item; - type Error: ErrorExt; - - /// Encodes a message into the bytes buffer. - fn encode(&self, item: &Self::Item, dst: &mut Vec) -> Result<(), Self::Error>; -} - -pub trait Decoder { - /// The type that is decoded. - type Item; - type Error: ErrorExt; - - /// Decodes a message from the bytes buffer. - fn decode(&self, src: &[u8]) -> Result; -} diff --git a/src/storage/src/compaction.rs b/src/storage/src/compaction.rs deleted file mode 100644 index 38fe1d986991..000000000000 --- a/src/storage/src/compaction.rs +++ /dev/null @@ -1,193 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub mod noop; -mod picker; -mod scheduler; -mod task; -mod twcs; -mod writer; - -use std::sync::Arc; - -use common_telemetry::warn; -use common_time::timestamp::TimeUnit; -use common_time::Timestamp; -pub use picker::{LeveledTimeWindowPicker, Picker, PickerContext}; -pub use scheduler::{CompactionHandler, CompactionRequestImpl}; -use store_api::logstore::LogStore; -use store_api::storage::CompactionStrategy; -pub use task::{CompactionTask, CompactionTaskImpl}; -pub use twcs::TwcsPicker; - -use crate::scheduler::Scheduler; -use crate::sst::FileHandle; - -pub type CompactionPickerRef = - Arc, Task = CompactionTaskImpl> + Send + Sync>; - -pub type CompactionSchedulerRef = - Arc> + Send + Sync>; - -/// Infers the suitable time bucket duration. -/// Now it simply find the max and min timestamp across all SSTs in level and fit the time span -/// into time bucket. -pub(crate) fn infer_time_bucket<'a>(files: impl Iterator) -> i64 { - let mut max_ts = Timestamp::new(i64::MIN, TimeUnit::Second); - let mut min_ts = Timestamp::new(i64::MAX, TimeUnit::Second); - - for f in files { - if let Some((start, end)) = f.time_range() { - min_ts = min_ts.min(*start); - max_ts = max_ts.max(*end); - } else { - // we don't expect an SST file without time range, - // it's either a bug or data corruption. - warn!("Found SST file without time range metadata: {f:?}"); - } - } - - // safety: Convert whatever timestamp into seconds will not cause overflow. - let min_sec = min_ts.convert_to(TimeUnit::Second).unwrap().value(); - let max_sec = max_ts.convert_to(TimeUnit::Second).unwrap().value(); - - max_sec - .checked_sub(min_sec) - .map(|span| TIME_BUCKETS.fit_time_bucket(span)) // return the max bucket on subtraction overflow. - .unwrap_or_else(|| TIME_BUCKETS.max()) // safety: TIME_BUCKETS cannot be empty. -} - -pub(crate) struct TimeBuckets([i64; 7]); - -impl TimeBuckets { - /// Fits a given time span into time bucket by find the minimum bucket that can cover the span. - /// Returns the max bucket if no such bucket can be found. - fn fit_time_bucket(&self, span_sec: i64) -> i64 { - assert!(span_sec >= 0); - match self.0.binary_search(&span_sec) { - Ok(idx) => self.0[idx], - Err(idx) => { - if idx < self.0.len() { - self.0[idx] - } else { - self.0.last().copied().unwrap() - } - } - } - } - - #[cfg(test)] - fn get(&self, idx: usize) -> i64 { - self.0[idx] - } - - fn max(&self) -> i64 { - self.0.last().copied().unwrap() - } -} - -/// A set of predefined time buckets. -pub(crate) const TIME_BUCKETS: TimeBuckets = TimeBuckets([ - 60 * 60, // one hour - 2 * 60 * 60, // two hours - 12 * 60 * 60, // twelve hours - 24 * 60 * 60, // one day - 7 * 24 * 60 * 60, // one week - 365 * 24 * 60 * 60, // one year - 10 * 365 * 24 * 60 * 60, // ten years -]); - -pub fn compaction_strategy_to_picker( - strategy: &CompactionStrategy, -) -> CompactionPickerRef { - match strategy { - CompactionStrategy::Twcs(twcs_opts) => Arc::new(TwcsPicker::new( - twcs_opts.max_active_window_files, - twcs_opts.max_inactive_window_files, - twcs_opts.time_window_seconds, - )) as Arc<_>, - } -} - -#[cfg(test)] -mod tests { - use common_time::Timestamp; - - use super::*; - use crate::file_purger::noop::new_noop_file_purger; - use crate::sst::{FileHandle, FileId, FileMeta, Level}; - - /// Test util to create file handles. - pub fn new_file_handle( - file_id: FileId, - start_ts_millis: i64, - end_ts_millis: i64, - level: Level, - ) -> FileHandle { - let file_purger = new_noop_file_purger(); - let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {}); - FileHandle::new( - FileMeta { - region_id: 0.into(), - file_id, - time_range: Some(( - Timestamp::new_millisecond(start_ts_millis), - Timestamp::new_millisecond(end_ts_millis), - )), - level, - file_size: 0, - }, - layer, - file_purger, - ) - } - - #[test] - fn test_time_bucket() { - assert_eq!(TIME_BUCKETS.get(0), TIME_BUCKETS.fit_time_bucket(1)); - assert_eq!(TIME_BUCKETS.get(0), TIME_BUCKETS.fit_time_bucket(60 * 60)); - assert_eq!( - TIME_BUCKETS.get(1), - TIME_BUCKETS.fit_time_bucket(60 * 60 + 1) - ); - - assert_eq!( - TIME_BUCKETS.get(2), - TIME_BUCKETS.fit_time_bucket(TIME_BUCKETS.get(2) - 1) - ); - assert_eq!( - TIME_BUCKETS.get(2), - TIME_BUCKETS.fit_time_bucket(TIME_BUCKETS.get(2)) - ); - assert_eq!( - TIME_BUCKETS.get(3), - TIME_BUCKETS.fit_time_bucket(TIME_BUCKETS.get(3) - 1) - ); - assert_eq!(TIME_BUCKETS.get(6), TIME_BUCKETS.fit_time_bucket(i64::MAX)); - } - - #[test] - fn test_infer_time_buckets() { - assert_eq!( - TIME_BUCKETS.get(0), - infer_time_bucket( - [ - new_file_handle(FileId::random(), 0, TIME_BUCKETS.get(0) * 1000 - 1, 0), - new_file_handle(FileId::random(), 1, 10_000, 0) - ] - .iter() - ) - ); - } -} diff --git a/src/storage/src/compaction/noop.rs b/src/storage/src/compaction/noop.rs deleted file mode 100644 index eae5f49576b5..000000000000 --- a/src/storage/src/compaction/noop.rs +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::fmt::{Debug, Formatter}; -use std::marker::PhantomData; - -use store_api::storage::RegionId; - -use crate::compaction::{CompactionTask, Picker}; -use crate::error::Result; -use crate::scheduler::{Request, Scheduler}; - -pub struct NoopCompactionScheduler { - _phantom_data: PhantomData, -} - -impl Default for NoopCompactionScheduler { - fn default() -> Self { - Self { - _phantom_data: Default::default(), - } - } -} - -impl Debug for NoopCompactionScheduler { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("NoopCompactionScheduler<...>").finish() - } -} - -#[derive(Default, Debug)] -pub struct NoopCompactionRequest; - -#[derive(Default, Debug)] -pub struct NoopCompactionPicker; - -impl Picker for NoopCompactionPicker { - type Request = NoopCompactionRequest; - type Task = NoopCompactionTask; - - fn pick(&self, _req: &Self::Request) -> Result> { - Ok(None) - } -} - -#[derive(Debug)] -pub struct NoopCompactionTask; - -#[async_trait::async_trait] -impl CompactionTask for NoopCompactionTask { - async fn run(self) -> Result<()> { - Ok(()) - } -} - -impl Request for NoopCompactionRequest { - type Key = RegionId; - - fn key(&self) -> Self::Key { - RegionId::from(0) - } - - fn complete(self, _result: Result<()>) {} -} - -#[async_trait::async_trait] -impl Scheduler for NoopCompactionScheduler -where - R: Request, -{ - type Request = R; - - fn schedule(&self, _request: Self::Request) -> Result { - Ok(true) - } - - async fn stop(&self, _await_termination: bool) -> Result<()> { - Ok(()) - } -} diff --git a/src/storage/src/compaction/picker.rs b/src/storage/src/compaction/picker.rs deleted file mode 100644 index c810cc99c846..000000000000 --- a/src/storage/src/compaction/picker.rs +++ /dev/null @@ -1,432 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; -use std::fmt::{Debug, Formatter}; -use std::marker::PhantomData; -use std::time::Duration; - -use common_telemetry::{debug, error, info, warn}; -use common_time::timestamp::TimeUnit; -use common_time::timestamp_millis::BucketAligned; -use common_time::Timestamp; -use snafu::ResultExt; -use store_api::logstore::LogStore; - -use crate::compaction::infer_time_bucket; -use crate::compaction::scheduler::CompactionRequestImpl; -use crate::compaction::task::{CompactionOutput, CompactionTask, CompactionTaskImpl}; -use crate::error::{Result, TtlCalculationSnafu}; -use crate::scheduler::Request; -use crate::sst::{FileHandle, FileId, LevelMeta}; - -/// Picker picks input SST files and builds the compaction task. -/// Different compaction strategy may implement different pickers. -pub trait Picker: Debug + Send + 'static { - type Request: Request; - type Task: CompactionTask; - - fn pick(&self, req: &Self::Request) -> Result>; -} - -pub(crate) fn get_expired_ssts( - levels: &[LevelMeta], - ttl: Option, - now: Timestamp, -) -> Result> { - let Some(ttl) = ttl else { - return Ok(vec![]); - }; - - let expire_time = now.sub_duration(ttl).context(TtlCalculationSnafu)?; - - let expired_ssts = levels - .iter() - .flat_map(|l| l.get_expired_files(&expire_time).into_iter()) - .collect(); - Ok(expired_ssts) -} - -pub struct PickerContext { - compaction_time_window: Option, -} - -impl PickerContext { - pub fn with(compaction_time_window: Option) -> Self { - Self { - compaction_time_window, - } - } - - pub fn compaction_time_window(&self) -> Option { - self.compaction_time_window - } -} - -/// `LeveledTimeWindowPicker` only handles level 0 to level 1 compaction in a time-window tiered -/// manner. It picks all SSTs in level 0 and writes rows in these SSTs to a new file partitioned -/// by a inferred time bucket in level 1. -pub struct LeveledTimeWindowPicker { - _phantom_data: PhantomData, -} - -impl Debug for LeveledTimeWindowPicker { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - write!(f, "LeveledTimeWindowPicker{{..}}") - } -} - -impl Default for LeveledTimeWindowPicker { - fn default() -> Self { - Self::new() - } -} - -impl LeveledTimeWindowPicker { - pub fn new() -> Self { - Self { - _phantom_data: Default::default(), - } - } -} - -impl Picker for LeveledTimeWindowPicker { - type Request = CompactionRequestImpl; - type Task = CompactionTaskImpl; - - fn pick(&self, req: &CompactionRequestImpl) -> Result>> { - let levels = &req.levels(); - let expired_ssts = get_expired_ssts(levels.levels(), req.ttl, Timestamp::current_millis()) - .map_err(|e| { - error!(e;"Failed to get region expired SST files, region: {}, ttl: {:?}", req.region_id, req.ttl); - e - }) - .unwrap_or_default(); - - if !expired_ssts.is_empty() { - info!( - "Expired SSTs in region {}: {:?}", - req.region_id, expired_ssts - ); - // here we mark expired SSTs as compacting to avoid them being picked. - expired_ssts.iter().for_each(|f| f.mark_compacting(true)); - } - - let ctx = &PickerContext::with(req.compaction_time_window); - - let mut outputs = vec![]; - for level_num in 0..levels.level_num() { - let level = levels.level(level_num as u8); - let compaction_time_window = Self::pick_level(ctx, level, &mut outputs); - - if outputs.is_empty() { - debug!( - "No SST file can be compacted at level {}, path: {:?}", - level_num, req.sst_layer - ); - continue; - } - - debug!( - "Found SST files to compact {:?} on level: {}, compaction window: {:?}", - outputs, level_num, compaction_time_window, - ); - return Ok(Some(CompactionTaskImpl { - schema: req.schema(), - sst_layer: req.sst_layer.clone(), - outputs, - writer: req.writer.clone(), - shared_data: req.shared.clone(), - wal: req.wal.clone(), - manifest: req.manifest.clone(), - expired_ssts, - sst_write_buffer_size: req.sst_write_buffer_size, - compaction_time_window, - reschedule_on_finish: req.reschedule_on_finish, - })); - } - - Ok(None) - } -} - -impl LeveledTimeWindowPicker { - fn pick_level( - ctx: &PickerContext, - level: &LevelMeta, - results: &mut Vec, - ) -> Option { - // SimpleTimeWindowStrategy only handles level 0 to level 1 compaction. - if level.level() != 0 { - return None; - } - let files = find_compactable_files(level); - debug!("Compactable files found: {:?}", files); - if files.is_empty() { - return None; - } - let time_window = ctx.compaction_time_window().unwrap_or_else(|| { - let inferred = infer_time_bucket(files.iter()); - debug!( - "Compaction window is not present, inferring from files: {:?}", - inferred - ); - inferred - }); - let buckets = calculate_time_buckets(time_window, &files); - debug!("File bucket:{}, file groups: {:?}", time_window, buckets); - - results.extend(buckets.into_iter().map(|(bound, files)| CompactionOutput { - output_file_id: FileId::random(), - output_level: 1, - time_window_bound: bound, - time_window_sec: time_window, - inputs: files, - // strict window is used in simple time window strategy in that rows in one file - // may get compacted to multiple destinations. - strict_window: true, - })); - Some(time_window) - } -} - -/// Finds files that can be compacted in given level. -/// Currently they're files that is not currently under compaction. -#[inline] -fn find_compactable_files(level: &LevelMeta) -> Vec { - level.files().filter(|f| !f.compacting()).cloned().collect() -} - -/// Calculates buckets for files. If file does not contain a time range in metadata, it will be -/// assigned to a special bucket `i64::MAX` (normally no timestamp can be aligned to this bucket) -/// so that all files without timestamp can be compacted together. -fn calculate_time_buckets(bucket_sec: i64, files: &[FileHandle]) -> HashMap> { - let mut buckets = HashMap::new(); - - for file in files { - if let Some((start, end)) = file.time_range() { - let bounds = file_time_bucket_span( - start.convert_to(TimeUnit::Second).unwrap().value(), - end.convert_to(TimeUnit::Second).unwrap().value(), - bucket_sec, - ); - for bound in bounds { - buckets - .entry(bound) - .or_insert_with(Vec::new) - .push(file.clone()); - } - } else { - warn!("Found corrupted SST without timestamp bounds: {:?}", file); - } - } - buckets -} - -/// Calculates timestamp span between start and end timestamp. -fn file_time_bucket_span(start_sec: i64, end_sec: i64, bucket_sec: i64) -> Vec { - assert!(start_sec <= end_sec); - - // if timestamp is between `[i64::MIN, i64::MIN.align_by_bucket(bucket)]`, which cannot - // be aligned to a valid i64 bound, simply return `i64::MIN` rather than just underflow. - let mut start_aligned = start_sec.align_by_bucket(bucket_sec).unwrap_or(i64::MIN); - let end_aligned = end_sec.align_by_bucket(bucket_sec).unwrap_or(i64::MIN); - - let mut res = Vec::with_capacity(((end_aligned - start_aligned) / bucket_sec + 1) as usize); - while start_aligned < end_aligned { - res.push(start_aligned); - start_aligned += bucket_sec; - } - res.push(end_aligned); - res -} - -#[cfg(test)] -mod tests { - use std::collections::{HashMap, HashSet}; - use std::sync::Arc; - - use super::*; - use crate::compaction::tests::new_file_handle; - use crate::compaction::TIME_BUCKETS; - use crate::file_purger::noop::new_noop_file_purger; - use crate::sst::{FileId, Level, LevelMetas}; - - #[test] - fn test_time_bucket_span() { - assert_eq!(vec![0], file_time_bucket_span(1, 9, 10)); - - assert_eq!(vec![0, 10], file_time_bucket_span(1, 10, 10)); - - assert_eq!(vec![-10], file_time_bucket_span(-10, -1, 10)); - - assert_eq!(vec![-10, 0], file_time_bucket_span(-10, 0, 10)); - } - - #[test] - fn test_time_bucket_span_large() { - assert_eq!( - vec![ - (i64::MAX - 10).align_by_bucket(10).unwrap(), - i64::MAX.align_by_bucket(10).unwrap(), - ], - file_time_bucket_span(i64::MAX - 10, i64::MAX, 10) - ); - - // magic hmmm? - for bucket in 1..100 { - assert_eq!( - vec![ - i64::MIN, - (i64::MIN + bucket).align_by_bucket(bucket).unwrap() - ], - file_time_bucket_span(i64::MIN, i64::MIN + bucket, bucket) - ); - } - } - - fn new_file_handles(input: &[(FileId, i64, i64)]) -> Vec { - input - .iter() - .map(|(file_id, start, end)| new_file_handle(*file_id, *start, *end, 0)) - .collect() - } - - fn check_bucket_calculation( - bucket_sec: i64, - files: Vec, - expected: &[(i64, &[FileId])], - ) { - let res = calculate_time_buckets(bucket_sec, &files); - - let expected = expected - .iter() - .map(|(bucket, file_ids)| (*bucket, file_ids.iter().copied().collect::>())) - .collect::>(); - - for (bucket, file_ids) in expected { - let actual = res - .get(&bucket) - .unwrap() - .iter() - .map(|f| f.file_id()) - .collect(); - assert_eq!( - file_ids, actual, - "bucket: {bucket}, expected: {file_ids:?}, actual: {actual:?}", - ); - } - } - - #[test] - fn test_calculate_time_buckets() { - let file_id_a = FileId::random(); - let file_id_b = FileId::random(); - // simple case, files with disjoint - check_bucket_calculation( - 10, - new_file_handles(&[(file_id_a, 0, 9000), (file_id_b, 10000, 19000)]), - &[(0, &[file_id_a]), (10, &[file_id_b])], - ); - - // files across buckets - check_bucket_calculation( - 10, - new_file_handles(&[(file_id_a, 0, 10001), (file_id_b, 10000, 19000)]), - &[(0, &[file_id_a]), (10, &[file_id_a, file_id_b])], - ); - check_bucket_calculation( - 10, - new_file_handles(&[(file_id_a, 0, 10000)]), - &[(0, &[file_id_a]), (10, &[file_id_a])], - ); - - // file with an large time range - let file_id_array = &[file_id_a]; - let expected = (0..(TIME_BUCKETS.get(4) / TIME_BUCKETS.get(0))) - .map(|b| (b * TIME_BUCKETS.get(0), file_id_array as _)) - .collect::>(); - check_bucket_calculation( - TIME_BUCKETS.get(0), - new_file_handles(&[(file_id_a, 0, TIME_BUCKETS.get(4) * 1000)]), - &expected, - ); - } - - struct TtlTester { - files: Vec<(FileId, i64, i64, Level)>, - ttl: Option, - expired: Vec, - now: Timestamp, - } - - impl TtlTester { - fn check(&self) { - let expected_expired = self - .expired - .iter() - .map(|idx| self.files[*idx].0) - .collect::>(); - let file_purger = new_noop_file_purger(); - let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {}); - let file_handles = self - .files - .iter() - .map(|(file_id, start_ts, end_ts, level)| { - new_file_handle(*file_id, *start_ts, *end_ts, *level).meta() - }) - .collect::>(); - let levels = LevelMetas::new(layer, file_purger).merge( - file_handles.into_iter(), - vec![].into_iter(), - None, - ); - let expired = get_expired_ssts(levels.levels(), self.ttl, self.now) - .unwrap() - .into_iter() - .map(|f| f.file_id()) - .collect::>(); - assert_eq!(expected_expired, expired); - } - } - - #[test] - fn test_find_expired_ssts() { - TtlTester { - files: vec![ - (FileId::random(), 8000, 9000, 0), - (FileId::random(), 10000, 11000, 0), - (FileId::random(), 8000, 11000, 1), - (FileId::random(), 2000, 3000, 1), - ], - ttl: Some(Duration::from_secs(1)), - expired: vec![3], - now: Timestamp::new_second(10), - } - .check(); - - TtlTester { - files: vec![ - (FileId::random(), 8000, 8999, 0), - (FileId::random(), 10000, 11000, 0), - (FileId::random(), 8000, 11000, 1), - (FileId::random(), 2000, 3000, 1), - ], - ttl: Some(Duration::from_secs(1)), - expired: vec![0, 3], - now: Timestamp::new_second(10), - } - .check(); - } -} diff --git a/src/storage/src/compaction/scheduler.rs b/src/storage/src/compaction/scheduler.rs deleted file mode 100644 index 170b9c2f5365..000000000000 --- a/src/storage/src/compaction/scheduler.rs +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::marker::PhantomData; -use std::sync::Arc; -use std::time::Duration; - -use common_base::readable_size::ReadableSize; -use common_telemetry::{debug, error, info}; -use store_api::logstore::LogStore; -use store_api::storage::RegionId; -use tokio::sync::oneshot::Sender; -use tokio::sync::Notify; - -use crate::compaction::task::CompactionTask; -use crate::compaction::CompactionPickerRef; -use crate::error::Result; -use crate::manifest::region::RegionManifest; -use crate::region::{RegionWriterRef, SharedDataRef}; -use crate::scheduler::rate_limit::BoxedRateLimitToken; -use crate::scheduler::{Handler, Request}; -use crate::schema::RegionSchemaRef; -use crate::sst::AccessLayerRef; -use crate::version::LevelMetasRef; -use crate::wal::Wal; - -impl Request for CompactionRequestImpl { - type Key = RegionId; - - #[inline] - fn key(&self) -> RegionId { - self.region_id - } - - fn complete(self, result: Result<()>) { - if let Some(sender) = self.sender { - // We don't care the send result as callers might not - // wait the result. - let _ = sender.send(result); - } - } -} - -/// Region compaction request. -pub struct CompactionRequestImpl { - pub region_id: RegionId, - pub sst_layer: AccessLayerRef, - pub writer: RegionWriterRef, - pub shared: SharedDataRef, - pub manifest: RegionManifest, - pub wal: Wal, - pub ttl: Option, - pub compaction_time_window: Option, - /// Compaction result sender. - pub sender: Option>>, - pub picker: CompactionPickerRef, - pub sst_write_buffer_size: ReadableSize, - /// Whether to immediately reschedule another compaction when finished. - pub reschedule_on_finish: bool, -} - -impl CompactionRequestImpl { - #[inline] - pub(crate) fn schema(&self) -> RegionSchemaRef { - self.shared.version_control.current().schema().clone() - } - - #[inline] - pub(crate) fn levels(&self) -> LevelMetasRef { - self.shared.version_control.current().ssts().clone() - } -} - -pub struct CompactionHandler { - _phantom_data: PhantomData, - #[cfg(test)] - pub pending_tasks: Arc>>>, -} - -impl Default for CompactionHandler { - fn default() -> Self { - Self { - _phantom_data: Default::default(), - #[cfg(test)] - pending_tasks: Arc::new(Default::default()), - } - } -} - -impl CompactionHandler { - #[cfg(test)] - pub fn new_with_pending_tasks( - tasks: Arc>>>, - ) -> Self { - Self { - _phantom_data: Default::default(), - pending_tasks: tasks, - } - } -} - -#[async_trait::async_trait] -impl Handler for CompactionHandler -where - S: LogStore, -{ - type Request = CompactionRequestImpl; - - async fn handle_request( - &self, - req: Self::Request, - token: BoxedRateLimitToken, - finish_notifier: Arc, - ) -> Result<()> { - let region_id = req.key(); - let Some(task) = req.picker.pick(&req)? else { - info!("No file needs compaction in region: {:?}", region_id); - req.complete(Ok(())); - return Ok(()); - }; - - debug!("Compaction task, region: {:?}, task: {:?}", region_id, task); - // TODO(hl): we need to keep a track of task handle here to allow task cancellation. - let _handle = common_runtime::spawn_bg(async move { - if let Err(e) = task.run().await { - // TODO(hl): maybe resubmit compaction task on failure? - error!(e; "Failed to compact region: {:?}", region_id); - - req.complete(Err(e)); - } else { - info!("Successfully compacted region: {:?}", region_id); - - req.complete(Ok(())); - } - // releases rate limit token - token.try_release(); - // notify scheduler to schedule next task when current task finishes. - finish_notifier.notify_one(); - }); - - #[cfg(test)] - self.pending_tasks.write().await.push(_handle); - - Ok(()) - } -} diff --git a/src/storage/src/compaction/task.rs b/src/storage/src/compaction/task.rs deleted file mode 100644 index 49188d8a64fb..000000000000 --- a/src/storage/src/compaction/task.rs +++ /dev/null @@ -1,309 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashSet; -use std::fmt::{Debug, Formatter}; - -use common_base::readable_size::ReadableSize; -use common_telemetry::{debug, error, info}; -use itertools::Itertools; -use snafu::ResultExt; -use store_api::logstore::LogStore; -use store_api::storage::{CompactContext, RegionId}; - -use crate::compaction::writer::build_sst_reader; -use crate::error; -use crate::error::Result; -use crate::manifest::action::RegionEdit; -use crate::manifest::region::RegionManifest; -use crate::region::{RegionWriterRef, SharedDataRef, WriterCompactRequest}; -use crate::schema::RegionSchemaRef; -use crate::sst::{ - AccessLayerRef, FileHandle, FileId, FileMeta, Level, Source, SstInfo, WriteOptions, -}; -use crate::wal::Wal; - -const MAX_PARALLEL_COMPACTION: usize = 8; - -#[async_trait::async_trait] -pub trait CompactionTask: Debug + Send + Sync + 'static { - async fn run(self) -> Result<()>; -} - -pub struct CompactionTaskImpl { - pub schema: RegionSchemaRef, - pub sst_layer: AccessLayerRef, - pub outputs: Vec, - pub writer: RegionWriterRef, - pub shared_data: SharedDataRef, - pub wal: Wal, - pub manifest: RegionManifest, - pub expired_ssts: Vec, - pub sst_write_buffer_size: ReadableSize, - pub compaction_time_window: Option, - pub reschedule_on_finish: bool, -} - -impl Debug for CompactionTaskImpl { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("CompactionTaskImpl") - .field("region_name", &self.shared_data.name()) - .finish() - } -} - -impl Drop for CompactionTaskImpl { - fn drop(&mut self) { - self.mark_files_compacting(false); - } -} - -impl CompactionTaskImpl { - /// Compacts inputs SSTs, returns `(output file, compacted input file)`. - async fn merge_ssts(&mut self) -> Result<(HashSet, HashSet)> { - let mut futs = Vec::with_capacity(self.outputs.len()); - let mut compacted_inputs = HashSet::new(); - let region_id = self.shared_data.id(); - for output in self.outputs.drain(..) { - let schema = self.schema.clone(); - let sst_layer = self.sst_layer.clone(); - let sst_write_buffer_size = self.sst_write_buffer_size; - compacted_inputs.extend(output.inputs.iter().map(FileHandle::meta)); - - info!( - "Compaction output [{}]-> {}", - output - .inputs - .iter() - .map(|f| f.file_id().to_string()) - .join(","), - output.output_file_id - ); - - // TODO(hl): Maybe spawn to runtime to exploit in-job parallelism. - futs.push(async move { - output - .build(region_id, schema, sst_layer, sst_write_buffer_size) - .await - }); - } - - let mut outputs = HashSet::with_capacity(futs.len()); - while !futs.is_empty() { - let mut task_chunk = Vec::with_capacity(MAX_PARALLEL_COMPACTION); - for _ in 0..MAX_PARALLEL_COMPACTION { - if let Some(task) = futs.pop() { - task_chunk.push(common_runtime::spawn_bg(task)); - } - } - let metas = futures::future::try_join_all(task_chunk) - .await - .context(error::JoinSnafu)? - .into_iter() - .collect::>>()?; - outputs.extend(metas.into_iter().flatten()); - } - - let inputs = compacted_inputs.into_iter().collect(); - Ok((outputs, inputs)) - } - - /// Writes updated SST info into manifest. - async fn write_manifest_and_apply( - &self, - output: HashSet, - input: HashSet, - ) -> Result<()> { - let version = &self.shared_data.version_control; - let region_version = version.metadata().version(); - - let edit = RegionEdit { - region_version, - flushed_sequence: None, - files_to_add: Vec::from_iter(output), - files_to_remove: Vec::from_iter(input), - compaction_time_window: self.compaction_time_window, - }; - debug!( - "Compacted region: {}, region edit: {:?}", - version.metadata().name(), - edit - ); - self.writer - .write_edit_and_apply(&self.wal, &self.shared_data, &self.manifest, edit, None) - .await - } - - /// Mark files are under compaction. - fn mark_files_compacting(&self, compacting: bool) { - for o in &self.outputs { - for input in &o.inputs { - input.mark_compacting(compacting); - } - } - } -} - -#[async_trait::async_trait] -impl CompactionTask for CompactionTaskImpl { - async fn run(mut self) -> Result<()> { - let _timer = crate::metrics::COMPACT_ELAPSED.start_timer(); - self.mark_files_compacting(true); - - let (output, mut compacted) = self.merge_ssts().await.map_err(|e| { - error!(e; "Failed to compact region: {}", self.shared_data.name()); - e - })?; - compacted.extend(self.expired_ssts.iter().map(FileHandle::meta)); - - let input_ids = compacted.iter().map(|f| f.file_id).collect::>(); - let output_ids = output.iter().map(|f| f.file_id).collect::>(); - info!( - "Compacting SST files, input: {:?}, output: {:?}, window: {:?}", - input_ids, output_ids, self.compaction_time_window - ); - - let no_output = output.is_empty(); - let write_result = self - .write_manifest_and_apply(output, compacted) - .await - .map_err(|e| { - error!(e; "Failed to update region manifest: {}", self.shared_data.name()); - e - }); - - if !no_output && self.reschedule_on_finish { - // only reschedule another compaction if current compaction has output and it's - // triggered by flush. - if let Err(e) = self - .writer - .compact(WriterCompactRequest { - shared_data: self.shared_data.clone(), - sst_layer: self.sst_layer.clone(), - manifest: self.manifest.clone(), - wal: self.wal.clone(), - region_writer: self.writer.clone(), - compact_ctx: CompactContext { wait: false }, - }) - .await - { - error!(e; "Failed to schedule a compaction after compaction, region id: {}", self.shared_data.id()); - } else { - info!( - "Immediately schedule another compaction for region: {}", - self.shared_data.id() - ); - } - } - write_result - } -} - -/// Many-to-many compaction can be decomposed to a many-to-one compaction from level n to level n+1 -/// and a many-to-one compaction from level n+1 to level n+1. -#[derive(Debug)] -pub struct CompactionOutput { - pub output_file_id: FileId, - /// Compaction output file level. - pub output_level: Level, - /// The left bound of time window. - pub time_window_bound: i64, - /// Time window size in seconds. - pub time_window_sec: i64, - /// Compaction input files. - pub inputs: Vec, - /// If the compaction output is strictly windowed. - pub strict_window: bool, -} - -impl CompactionOutput { - async fn build( - &self, - region_id: RegionId, - schema: RegionSchemaRef, - sst_layer: AccessLayerRef, - sst_write_buffer_size: ReadableSize, - ) -> Result> { - let time_range = if self.strict_window { - ( - Some(self.time_window_bound), - Some(self.time_window_bound + self.time_window_sec), - ) - } else { - (None, None) - }; - - let reader = build_sst_reader( - region_id, - schema, - sst_layer.clone(), - &self.inputs, - time_range, - ) - .await?; - - let opts = WriteOptions { - sst_write_buffer_size, - }; - let _timer = crate::metrics::MERGE_ELAPSED.start_timer(); - let meta = sst_layer - .write_sst(self.output_file_id, Source::Reader(reader), &opts) - .await? - .map( - |SstInfo { - time_range, - file_size, - .. - }| FileMeta { - region_id, - file_id: self.output_file_id, - time_range, - level: self.output_level, - file_size, - }, - ); - Ok(meta) - } -} - -#[cfg(test)] -pub mod tests { - use std::sync::Arc; - - use super::*; - use crate::compaction::task::CompactionTask; - - pub type CallbackRef = Arc; - - pub struct NoopCompactionTask { - pub cbs: Vec, - } - - impl Debug for NoopCompactionTask { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("storage::compaction::task::tests::NoopCompactionTask") - .finish() - } - } - - #[async_trait::async_trait] - impl CompactionTask for NoopCompactionTask { - async fn run(self) -> Result<()> { - for cb in &self.cbs { - cb() - } - Ok(()) - } - } -} diff --git a/src/storage/src/compaction/twcs.rs b/src/storage/src/compaction/twcs.rs deleted file mode 100644 index f263cdecaf87..000000000000 --- a/src/storage/src/compaction/twcs.rs +++ /dev/null @@ -1,406 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Time-window compaction strategy - -use std::collections::BTreeMap; -use std::fmt::{Debug, Formatter}; -use std::marker::PhantomData; - -use common_telemetry::{debug, info, warn}; -use common_time::timestamp::TimeUnit; -use common_time::timestamp_millis::BucketAligned; -use common_time::Timestamp; -use store_api::logstore::LogStore; - -use crate::compaction::picker::get_expired_ssts; -use crate::compaction::task::CompactionOutput; -use crate::compaction::{infer_time_bucket, CompactionRequestImpl, CompactionTaskImpl, Picker}; -use crate::sst::{FileHandle, FileId, LevelMeta}; - -/// `TwcsPicker` picks files of which the max timestamp are in the same time window as compaction -/// candidates. -pub struct TwcsPicker { - max_active_window_files: usize, - max_inactive_window_files: usize, - time_window_seconds: Option, - _phantom_data: PhantomData, -} - -impl Debug for TwcsPicker { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("TwcsPicker") - .field("max_active_window_files", &self.max_active_window_files) - .field("max_inactive_window_files", &self.max_inactive_window_files) - .finish() - } -} - -impl TwcsPicker { - pub fn new( - max_active_window_files: usize, - max_inactive_window_files: usize, - time_window_seconds: Option, - ) -> Self { - Self { - max_inactive_window_files, - max_active_window_files, - _phantom_data: Default::default(), - time_window_seconds, - } - } - - /// Builds compaction output from files. - /// For active writing window, we allow for at most `max_active_window_files` files to alleviate - /// fragmentation. For other windows, we allow at most 1 file at each window. - fn build_output( - &self, - time_windows: &BTreeMap>, - active_window: Option, - window_size: i64, - ) -> Vec { - let mut output = vec![]; - for (window, files) in time_windows { - if let Some(active_window) = active_window && *window == active_window { - if files.len() > self.max_active_window_files { - output.push(CompactionOutput { - output_file_id: FileId::random(), - output_level: 1, // we only have two levels and always compact to l1 - time_window_bound: *window, - time_window_sec: window_size, - inputs: files.clone(), - // Strict window is not needed since we always compact many files to one - // single file in TWCS. - strict_window: false, - }); - } else { - debug!("Active window not present or no enough files in active window {:?}, window: {}", active_window, *window); - } - } else { - // not active writing window - if files.len() > self.max_inactive_window_files { - output.push(CompactionOutput { - output_file_id: FileId::random(), - output_level: 1, - time_window_bound: *window, - time_window_sec: window_size, - inputs: files.clone(), - strict_window: false, - }); - } else { - debug!("No enough files, current: {}, max_inactive_window_files: {}", files.len(), self.max_inactive_window_files) - } - } - } - output - } -} - -impl Picker for TwcsPicker { - type Request = CompactionRequestImpl; - type Task = CompactionTaskImpl; - - fn pick(&self, req: &Self::Request) -> crate::error::Result> { - let levels = req.levels(); - let expired_ssts = get_expired_ssts(levels.levels(), req.ttl, Timestamp::current_millis())?; - if !expired_ssts.is_empty() { - info!( - "Expired SSTs in region {}: {:?}", - req.region_id, expired_ssts - ); - // here we mark expired SSTs as compacting to avoid them being picked. - expired_ssts.iter().for_each(|f| f.mark_compacting(true)); - } - - let time_window_size = req - .compaction_time_window - .or(self.time_window_seconds) - .unwrap_or_else(|| { - let inferred = infer_time_bucket(req.levels().level(0).files()); - info!( - "Compaction window for region {} is not present, inferring from files: {:?}", - req.region_id, inferred - ); - inferred - }); - - // Find active window from files in level 0. - let active_window = - find_latest_window_in_seconds(levels.level(0).files(), time_window_size); - - let windows = assign_to_windows( - levels.levels().iter().flat_map(LevelMeta::files), - time_window_size, - ); - - let outputs = self.build_output(&windows, active_window, time_window_size); - - if outputs.is_empty() && expired_ssts.is_empty() { - return Ok(None); - } - let task = CompactionTaskImpl { - schema: req.schema(), - sst_layer: req.sst_layer.clone(), - outputs, - writer: req.writer.clone(), - shared_data: req.shared.clone(), - wal: req.wal.clone(), - manifest: req.manifest.clone(), - expired_ssts, - sst_write_buffer_size: req.sst_write_buffer_size, - compaction_time_window: Some(time_window_size), - reschedule_on_finish: req.reschedule_on_finish, - }; - Ok(Some(task)) - } -} - -/// Assigns files to windows with predefined window size (in seconds) by their max timestamps. -fn assign_to_windows<'a>( - files: impl Iterator, - time_window_size: i64, -) -> BTreeMap> { - let mut windows: BTreeMap> = BTreeMap::new(); - // Iterates all files and assign to time windows according to max timestamp - for file in files { - if let Some((_, end)) = file.time_range() { - let time_window = end - .convert_to(TimeUnit::Second) - .unwrap() - .value() - .align_to_ceil_by_bucket(time_window_size) - .unwrap_or(i64::MIN); - windows.entry(time_window).or_default().push(file.clone()); - } else { - warn!("Unexpected file w/o timestamp: {:?}", file.file_id()); - } - } - windows -} - -/// Finds the latest active writing window among all files. -/// Returns `None` when there are no files or all files are corrupted. -fn find_latest_window_in_seconds<'a>( - files: impl Iterator, - time_window_size: i64, -) -> Option { - let mut latest_timestamp = None; - for f in files { - if let Some((_, end)) = f.time_range() { - if let Some(latest) = latest_timestamp && end > latest { - latest_timestamp = Some(end); - } else { - latest_timestamp = Some(end); - } - } else { - warn!("Cannot find timestamp range of file: {}", f.file_id()); - } - } - latest_timestamp - .and_then(|ts| ts.convert_to_ceil(TimeUnit::Second)) - .and_then(|ts| ts.value().align_to_ceil_by_bucket(time_window_size)) -} - -#[cfg(test)] -mod tests { - use std::collections::HashSet; - - use log_store::NoopLogStore; - - use super::*; - use crate::compaction::tests::new_file_handle; - use crate::sst::{FileId, Level}; - - #[test] - fn test_get_latest_window_in_seconds() { - assert_eq!( - Some(1), - find_latest_window_in_seconds([new_file_handle(FileId::random(), 0, 999, 0)].iter(), 1) - ); - assert_eq!( - Some(1), - find_latest_window_in_seconds( - [new_file_handle(FileId::random(), 0, 1000, 0)].iter(), - 1 - ) - ); - - assert_eq!( - Some(-9223372036854000), - find_latest_window_in_seconds( - [new_file_handle(FileId::random(), i64::MIN, i64::MIN + 1, 0)].iter(), - 3600, - ) - ); - - assert_eq!( - (i64::MAX / 10000000 + 1) * 10000, - find_latest_window_in_seconds( - [new_file_handle(FileId::random(), i64::MIN, i64::MAX, 0)].iter(), - 10000, - ) - .unwrap() - ); - } - - #[test] - fn test_assign_to_windows() { - let windows = assign_to_windows( - [ - new_file_handle(FileId::random(), 0, 999, 0), - new_file_handle(FileId::random(), 0, 999, 0), - new_file_handle(FileId::random(), 0, 999, 0), - new_file_handle(FileId::random(), 0, 999, 0), - new_file_handle(FileId::random(), 0, 999, 0), - ] - .iter(), - 3, - ); - assert_eq!(5, windows.get(&0).unwrap().len()); - - let files = [FileId::random(); 3]; - let windows = assign_to_windows( - [ - new_file_handle(files[0], -2000, -3, 0), - new_file_handle(files[1], 0, 2999, 0), - new_file_handle(files[2], 50, 10001, 0), - ] - .iter(), - 3, - ); - assert_eq!(files[0], windows.get(&0).unwrap().get(0).unwrap().file_id()); - assert_eq!(files[1], windows.get(&3).unwrap().get(0).unwrap().file_id()); - assert_eq!( - files[2], - windows.get(&12).unwrap().get(0).unwrap().file_id() - ); - } - - struct CompactionPickerTestCase { - window_size: i64, - input_files: Vec, - expected_outputs: Vec, - } - - impl CompactionPickerTestCase { - fn check(&self) { - let windows = assign_to_windows(self.input_files.iter(), self.window_size); - let active_window = - find_latest_window_in_seconds(self.input_files.iter(), self.window_size); - let output = TwcsPicker::::new(4, 1, None).build_output( - &windows, - active_window, - self.window_size, - ); - - let output = output - .iter() - .map(|o| { - let input_file_ids = - o.inputs.iter().map(|f| f.file_id()).collect::>(); - ( - input_file_ids, - o.output_level, - o.time_window_sec, - o.time_window_bound, - o.strict_window, - ) - }) - .collect::>(); - - let expected = self - .expected_outputs - .iter() - .map(|o| { - let input_file_ids = o - .input_files - .iter() - .map(|idx| self.input_files[*idx].file_id()) - .collect::>(); - ( - input_file_ids, - o.output_level, - o.time_window_sec, - o.time_window_bound, - o.strict_window, - ) - }) - .collect::>(); - assert_eq!(expected, output); - } - } - - struct ExpectedOutput { - input_files: Vec, - output_level: Level, - time_window_sec: i64, - time_window_bound: i64, - strict_window: bool, - } - - #[test] - fn test_build_twcs_output() { - let file_ids = (0..4).map(|_| FileId::random()).collect::>(); - - CompactionPickerTestCase { - window_size: 3, - input_files: [ - new_file_handle(file_ids[0], -2000, -3, 0), - new_file_handle(file_ids[1], -3000, -100, 0), - new_file_handle(file_ids[2], 0, 2999, 0), //active windows - new_file_handle(file_ids[3], 50, 2998, 0), //active windows - ] - .to_vec(), - expected_outputs: vec![ExpectedOutput { - input_files: vec![0, 1], - output_level: 1, - time_window_sec: 3, - time_window_bound: 0, - strict_window: false, - }], - } - .check(); - - let file_ids = (0..6).map(|_| FileId::random()).collect::>(); - CompactionPickerTestCase { - window_size: 3, - input_files: [ - new_file_handle(file_ids[0], -2000, -3, 0), - new_file_handle(file_ids[1], -3000, -100, 0), - new_file_handle(file_ids[2], 0, 2999, 0), - new_file_handle(file_ids[3], 50, 2998, 0), - new_file_handle(file_ids[4], 11, 2990, 0), - new_file_handle(file_ids[5], 50, 4998, 0), - ] - .to_vec(), - expected_outputs: vec![ - ExpectedOutput { - input_files: vec![0, 1], - output_level: 1, - time_window_sec: 3, - time_window_bound: 0, - strict_window: false, - }, - ExpectedOutput { - input_files: vec![2, 3, 4], - output_level: 1, - time_window_sec: 3, - time_window_bound: 3, - strict_window: false, - }, - ], - } - .check(); - } -} diff --git a/src/storage/src/compaction/writer.rs b/src/storage/src/compaction/writer.rs deleted file mode 100644 index 734d229b4ad4..000000000000 --- a/src/storage/src/compaction/writer.rs +++ /dev/null @@ -1,588 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_query::logical_plan::{DfExpr, Expr}; -use common_time::timestamp::TimeUnit; -use datafusion_expr::Operator; -use datatypes::value::timestamp_to_scalar_value; -use store_api::storage::RegionId; - -use crate::chunk::{ChunkReaderBuilder, ChunkReaderImpl}; -use crate::error; -use crate::schema::RegionSchemaRef; -use crate::sst::{AccessLayerRef, FileHandle}; - -/// Builds an SST reader that only reads rows within given time range. -pub(crate) async fn build_sst_reader( - region_id: RegionId, - schema: RegionSchemaRef, - sst_layer: AccessLayerRef, - files: &[FileHandle], - time_range: (Option, Option), -) -> error::Result { - // TODO(hl): Schemas in different SSTs may differ, thus we should infer - // timestamp column name from Parquet metadata. - - // safety: Region schema's timestamp column must present - let ts_col = schema.user_schema().timestamp_column().unwrap(); - let ts_col_unit = ts_col.data_type.as_timestamp().unwrap().unit(); - let ts_col_name = ts_col.name.clone(); - - ChunkReaderBuilder::new(region_id, schema, sst_layer) - .pick_ssts(files) - .filters( - build_time_range_filter(time_range, &ts_col_name, ts_col_unit) - .into_iter() - .collect(), - ) - .build() - .await -} - -/// Build time range filter expr from lower (inclusive) and upper bound(exclusive). -/// Returns `None` if time range overflows. -fn build_time_range_filter( - time_range: (Option, Option), - ts_col_name: &str, - ts_col_unit: TimeUnit, -) -> Option { - let (low_ts_inclusive, high_ts_exclusive) = time_range; - let ts_col = DfExpr::Column(datafusion_common::Column::from_name(ts_col_name)); - - // Converting seconds to whatever unit won't lose precision. - // Here only handles overflow. - let low_ts = low_ts_inclusive - .map(common_time::Timestamp::new_second) - .and_then(|ts| ts.convert_to(ts_col_unit)) - .map(|ts| ts.value()); - let high_ts = high_ts_exclusive - .map(common_time::Timestamp::new_second) - .and_then(|ts| ts.convert_to(ts_col_unit)) - .map(|ts| ts.value()); - - let expr = match (low_ts, high_ts) { - (Some(low), Some(high)) => { - let lower_bound_expr = - DfExpr::Literal(timestamp_to_scalar_value(ts_col_unit, Some(low))); - let upper_bound_expr = - DfExpr::Literal(timestamp_to_scalar_value(ts_col_unit, Some(high))); - Some(datafusion_expr::and( - datafusion_expr::binary_expr(ts_col.clone(), Operator::GtEq, lower_bound_expr), - datafusion_expr::binary_expr(ts_col, Operator::Lt, upper_bound_expr), - )) - } - - (Some(low), None) => { - let lower_bound_expr = - datafusion_expr::lit(timestamp_to_scalar_value(ts_col_unit, Some(low))); - Some(datafusion_expr::binary_expr( - ts_col, - Operator::GtEq, - lower_bound_expr, - )) - } - - (None, Some(high)) => { - let upper_bound_expr = - datafusion_expr::lit(timestamp_to_scalar_value(ts_col_unit, Some(high))); - Some(datafusion_expr::binary_expr( - ts_col, - Operator::Lt, - upper_bound_expr, - )) - } - - (None, None) => None, - }; - - expr.map(Expr::from) -} - -#[cfg(test)] -mod tests { - use std::sync::atomic::{AtomicU64, Ordering}; - use std::sync::Arc; - - use api::v1::OpType; - use common_base::readable_size::ReadableSize; - use common_test_util::temp_dir::create_temp_dir; - use common_time::Timestamp; - use datatypes::prelude::{LogicalTypeId, ScalarVector, ScalarVectorBuilder}; - use datatypes::timestamp::TimestampMillisecond; - use datatypes::vectors::{ - TimestampMillisecondVector, TimestampMillisecondVectorBuilder, UInt64VectorBuilder, - }; - use object_store::services::Fs; - use object_store::ObjectStore; - use store_api::storage::{ChunkReader, SequenceNumber}; - - use super::*; - use crate::file_purger::noop::new_noop_file_purger; - use crate::memtable::{ - DefaultMemtableBuilder, IterContext, KeyValues, Memtable, MemtableBuilder, - }; - use crate::metadata::RegionMetadata; - use crate::sst::parquet::ParquetWriter; - use crate::sst::{self, FileId, FileMeta, FsAccessLayer, Source, SstInfo, WriteOptions}; - use crate::test_util::descriptor_util::RegionDescBuilder; - - const REGION_ID: RegionId = RegionId::from_u64(1); - - fn schema_for_test() -> RegionSchemaRef { - // Just build a region desc and use its columns metadata. - let desc = RegionDescBuilder::new("test") - .push_field_column(("v", LogicalTypeId::UInt64, true)) - .build(); - let metadata: RegionMetadata = desc.try_into().unwrap(); - metadata.schema().clone() - } - - pub fn write_kvs( - memtable: &dyn Memtable, - sequence: SequenceNumber, - op_type: OpType, - ts: &[i64], // timestamp - values: &[Option], - ) { - let keys: Vec = ts.iter().map(|ts| (*ts).into()).collect(); - let kvs = kvs_for_test(sequence, op_type, &keys, values); - memtable.write(&kvs).unwrap(); - } - - fn kvs_for_test( - sequence: SequenceNumber, - op_type: OpType, - ts: &[TimestampMillisecond], - values: &[Option], - ) -> KeyValues { - let start_index_in_batch = 0; - assert_eq!(ts.len(), values.len()); - let mut key_builders = TimestampMillisecondVectorBuilder::with_capacity(ts.len()); - for key in ts { - key_builders.push(Some(*key)); - } - let ts_col = Arc::new(key_builders.finish()) as _; - let mut value_builders = UInt64VectorBuilder::with_capacity(values.len()); - - for value in values { - value_builders.push(*value); - } - let row_values = vec![Arc::new(value_builders.finish()) as _]; - - let kvs = KeyValues { - sequence, - op_type, - start_index_in_batch, - keys: vec![], - values: row_values, - timestamp: Some(ts_col), - }; - - assert_eq!(ts.len(), kvs.len()); - assert_eq!(ts.is_empty(), kvs.is_empty()); - - kvs - } - - async fn write_sst( - sst_file_id: FileId, - schema: RegionSchemaRef, - seq: &AtomicU64, - object_store: ObjectStore, - ts: &[i64], - ops: &[OpType], - ) -> FileHandle { - let memtable = DefaultMemtableBuilder::default().build(schema.clone()); - let mut breaks = ops - .iter() - .zip(ops.iter().skip(1)) - .enumerate() - .filter_map( - |(idx, (prev, next))| { - if prev != next { - Some(idx + 1) - } else { - None - } - }, - ) - .collect::>(); - - breaks.insert(0, 0); - breaks.push(ts.len()); - - for i in 0..breaks.len() - 1 { - let op = ops[i]; - let seg_len = breaks[i + 1] - breaks[i]; - let ts_seg = ts - .iter() - .skip(breaks[i]) - .take(seg_len) - .copied() - .collect::>(); - let value_seg = ts - .iter() - .skip(breaks[i]) - .take(seg_len) - .map(|i| (*i) as u64) - .map(Some) - .collect::>(); - - write_kvs( - &*memtable, - seq.load(Ordering::Relaxed), // sequence - op, - &ts_seg, // keys - &value_seg, // values - ); - let _ = seq.fetch_add(1, Ordering::Relaxed); - } - - let iter = memtable.iter(IterContext::default()).unwrap(); - let file_path = sst_file_id.as_parquet(); - let writer = ParquetWriter::new(&file_path, Source::Iter(iter), object_store.clone()); - - let SstInfo { - time_range, - file_size, - .. - } = writer - .write_sst(&sst::WriteOptions::default()) - .await - .unwrap() - .unwrap(); - let handle = FileHandle::new( - FileMeta { - region_id: 0.into(), - file_id: sst_file_id, - time_range, - level: 0, - file_size, - }, - Arc::new(crate::test_util::access_layer_util::MockAccessLayer {}), - new_noop_file_purger(), - ); - let _ = seq.fetch_add(1, Ordering::Relaxed); - handle - } - - // The region id is only used to build the reader, we don't check its content. - async fn check_reads( - region_id: RegionId, - schema: RegionSchemaRef, - sst_layer: AccessLayerRef, - files: &[FileHandle], - lower_sec_inclusive: i64, - upper_sec_exclusive: i64, - expect: &[i64], - ) { - let mut reader = build_sst_reader( - region_id, - schema, - sst_layer, - files, - (Some(lower_sec_inclusive), Some(upper_sec_exclusive)), - ) - .await - .unwrap(); - - let mut res = vec![]; - while let Some(f) = reader.next_chunk().await.unwrap() { - let ts_col = f.columns[0] - .as_any() - .downcast_ref::() - .unwrap(); - res.extend(ts_col.iter_data().map(|t| t.unwrap().0.value())); - } - assert_eq!(expect, &res); - } - - #[tokio::test] - async fn test_sst_reader() { - let dir = create_temp_dir("write_parquet"); - let path = dir.path().to_str().unwrap(); - let mut builder = Fs::default(); - let _ = builder.root(path); - - let object_store = ObjectStore::new(builder).unwrap().finish(); - - let seq = AtomicU64::new(0); - let schema = schema_for_test(); - let file1 = write_sst( - FileId::random(), - schema.clone(), - &seq, - object_store.clone(), - &[1000, 2000, 3000, 4001, 5001], - &[ - OpType::Put, - OpType::Put, - OpType::Put, - OpType::Put, - OpType::Put, - ], - ) - .await; - let file2 = write_sst( - FileId::random(), - schema.clone(), - &seq, - object_store.clone(), - &[4002, 5002, 6000, 7000, 8000], - &[ - OpType::Put, - OpType::Put, - OpType::Put, - OpType::Put, - OpType::Put, - ], - ) - .await; - let sst_layer = Arc::new(FsAccessLayer::new("./", object_store)); - - let files = vec![file1, file2]; - // read from two sst files with time range filter, - check_reads( - REGION_ID, - schema.clone(), - sst_layer.clone(), - &files, - 3, - 6, - &[3000, 4001, 4002, 5001, 5002], - ) - .await; - - check_reads(REGION_ID, schema, sst_layer, &files, 1, 2, &[1000]).await; - } - - async fn read_file( - files: &[FileHandle], - schema: RegionSchemaRef, - sst_layer: AccessLayerRef, - ) -> Vec { - let mut timestamps = vec![]; - let mut reader = build_sst_reader( - REGION_ID, - schema, - sst_layer, - files, - (Some(i64::MIN), Some(i64::MAX)), - ) - .await - .unwrap(); - while let Some(chunk) = reader.next_chunk().await.unwrap() { - let ts = chunk.columns[0] - .as_any() - .downcast_ref::() - .unwrap(); - timestamps.extend(ts.iter_data().map(|t| t.unwrap().0.value())); - } - timestamps - } - - /// Writes rows into file i1/i2 and splits these rows into sst file o1/o2/o3, - /// and check the output contains the same data as input files. - #[tokio::test] - async fn test_sst_split() { - let dir = create_temp_dir("write_parquet"); - let path = dir.path().to_str().unwrap(); - let mut builder = Fs::default(); - let _ = builder.root(path); - let object_store = ObjectStore::new(builder).unwrap().finish(); - - let schema = schema_for_test(); - let seq = AtomicU64::new(0); - - let input_file_ids = [FileId::random(), FileId::random()]; - let output_file_ids = [FileId::random(), FileId::random(), FileId::random()]; - - let file1 = write_sst( - input_file_ids[0], - schema.clone(), - &seq, - object_store.clone(), - &[1000, 2000, 3000, 4001, 5001], - &[ - OpType::Put, - OpType::Put, - OpType::Put, - OpType::Put, - OpType::Put, - ], - ) - .await; - - // in file2 we delete the row with timestamp 1000. - let file2 = write_sst( - input_file_ids[1], - schema.clone(), - &seq, - object_store.clone(), - &[1000, 5002, 6000, 7000, 8000], - &[ - OpType::Delete, // a deletion - OpType::Put, - OpType::Put, - OpType::Put, - OpType::Put, - ], - ) - .await; - let sst_layer = Arc::new(FsAccessLayer::new("./", object_store.clone())); - let input_files = vec![file2, file1]; - - let reader1 = build_sst_reader( - REGION_ID, - schema.clone(), - sst_layer.clone(), - &input_files, - (Some(0), Some(3)), - ) - .await - .unwrap(); - let reader2 = build_sst_reader( - REGION_ID, - schema.clone(), - sst_layer.clone(), - &input_files, - (Some(3), Some(6)), - ) - .await - .unwrap(); - let reader3 = build_sst_reader( - REGION_ID, - schema.clone(), - sst_layer.clone(), - &input_files, - (Some(6), Some(10)), - ) - .await - .unwrap(); - - let opts = WriteOptions { - sst_write_buffer_size: ReadableSize::mb(8), - }; - let s1 = ParquetWriter::new( - &output_file_ids[0].as_parquet(), - Source::Reader(reader1), - object_store.clone(), - ) - .write_sst(&opts) - .await - .unwrap() - .unwrap(); - assert_eq!( - Some(( - Timestamp::new_millisecond(2000), - Timestamp::new_millisecond(2000) - )), - s1.time_range, - ); - - let s2 = ParquetWriter::new( - &output_file_ids[1].as_parquet(), - Source::Reader(reader2), - object_store.clone(), - ) - .write_sst(&opts) - .await - .unwrap() - .unwrap(); - assert_eq!( - Some(( - Timestamp::new_millisecond(3000), - Timestamp::new_millisecond(5002) - )), - s2.time_range, - ); - - let s3 = ParquetWriter::new( - &output_file_ids[2].as_parquet(), - Source::Reader(reader3), - object_store.clone(), - ) - .write_sst(&opts) - .await - .unwrap() - .unwrap(); - - assert_eq!( - Some(( - Timestamp::new_millisecond(6000), - Timestamp::new_millisecond(8000) - )), - s3.time_range - ); - - let output_files = output_file_ids - .into_iter() - .map(|f| { - FileHandle::new( - FileMeta { - region_id: 0.into(), - file_id: f, - level: 1, - time_range: None, - file_size: 0, - }, - Arc::new(crate::test_util::access_layer_util::MockAccessLayer {}), - new_noop_file_purger(), - ) - }) - .collect::>(); - - let timestamps_in_inputs = read_file(&input_files, schema.clone(), sst_layer.clone()).await; - let timestamps_in_outputs = - read_file(&output_files, schema.clone(), sst_layer.clone()).await; - - assert_eq!(timestamps_in_outputs, timestamps_in_inputs); - } - - #[test] - fn test_build_time_range_filter() { - assert!(build_time_range_filter( - (Some(i64::MIN), Some(i64::MAX)), - "ts", - TimeUnit::Nanosecond - ) - .is_none()); - - assert_eq!( - Expr::from(datafusion_expr::binary_expr( - datafusion_expr::col("ts"), - Operator::Lt, - datafusion_expr::lit(timestamp_to_scalar_value( - TimeUnit::Nanosecond, - Some(TimeUnit::Second.factor() as i64 / TimeUnit::Nanosecond.factor() as i64), - )), - )), - build_time_range_filter((Some(i64::MIN), Some(1)), "ts", TimeUnit::Nanosecond).unwrap() - ); - - assert_eq!( - Expr::from(datafusion_expr::binary_expr( - datafusion_expr::col("ts"), - Operator::GtEq, - datafusion_expr::lit(timestamp_to_scalar_value( - TimeUnit::Nanosecond, - Some( - 2 * TimeUnit::Second.factor() as i64 / TimeUnit::Nanosecond.factor() as i64 - ), - )), - )), - build_time_range_filter((Some(2), Some(i64::MAX)), "ts", TimeUnit::Nanosecond).unwrap() - ); - } -} diff --git a/src/storage/src/config.rs b/src/storage/src/config.rs deleted file mode 100644 index 73d75dd24594..000000000000 --- a/src/storage/src/config.rs +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! storage engine config - -use std::time::Duration; - -use common_base::readable_size::ReadableSize; - -/// Default max flush tasks. -pub const DEFAULT_MAX_FLUSH_TASKS: usize = 8; -/// Default region write buffer size. -pub const DEFAULT_REGION_WRITE_BUFFER_SIZE: ReadableSize = ReadableSize::mb(32); -/// Default interval to trigger auto flush in millis. -pub const DEFAULT_AUTO_FLUSH_INTERVAL: u32 = 60 * 60 * 1000; -/// Default interval to schedule the picker to flush automatically in millis. -pub const DEFAULT_PICKER_SCHEDULE_INTERVAL: u32 = 5 * 60 * 1000; - -#[derive(Debug, Clone)] -pub struct EngineConfig { - pub compress_manifest: bool, - pub manifest_checkpoint_margin: Option, - pub manifest_gc_duration: Option, - pub max_files_in_l0: usize, - pub max_purge_tasks: usize, - /// Max inflight flush tasks. - pub max_flush_tasks: usize, - /// Default write buffer size for a region. - pub region_write_buffer_size: ReadableSize, - /// Interval to schedule the auto flush picker. - pub picker_schedule_interval: Duration, - /// Interval to auto flush a region if it has not flushed yet. - pub auto_flush_interval: Duration, - /// Limit for global write buffer size. Disabled by default. - pub global_write_buffer_size: Option, - /// Global retention period for all regions. - /// - /// The precedence order is: region ttl > global ttl. - pub global_ttl: Option, -} - -impl Default for EngineConfig { - fn default() -> Self { - Self { - compress_manifest: false, - manifest_checkpoint_margin: Some(10), - manifest_gc_duration: Some(Duration::from_secs(30)), - max_files_in_l0: 8, - max_purge_tasks: 32, - max_flush_tasks: DEFAULT_MAX_FLUSH_TASKS, - region_write_buffer_size: DEFAULT_REGION_WRITE_BUFFER_SIZE, - picker_schedule_interval: Duration::from_millis( - DEFAULT_PICKER_SCHEDULE_INTERVAL.into(), - ), - auto_flush_interval: Duration::from_millis(DEFAULT_AUTO_FLUSH_INTERVAL.into()), - global_write_buffer_size: None, - global_ttl: None, - } - } -} diff --git a/src/storage/src/engine.rs b/src/storage/src/engine.rs deleted file mode 100644 index d6079ecc8eca..000000000000 --- a/src/storage/src/engine.rs +++ /dev/null @@ -1,750 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; -use std::sync::{Arc, RwLock}; -use std::time::Duration; - -use async_trait::async_trait; -use common_telemetry::logging::{self, debug}; -use object_store::{util, ObjectStore}; -use snafu::ResultExt; -use store_api::logstore::LogStore; -use store_api::manifest::Manifest; -use store_api::storage::{ - CloseContext, CloseOptions, CompactionStrategy, CreateOptions, EngineContext, OpenOptions, - Region, RegionDescriptor, StorageEngine, -}; - -use crate::compaction::CompactionSchedulerRef; -use crate::config::EngineConfig; -use crate::error::{self, Error, Result}; -use crate::file_purger::{FilePurgeHandler, FilePurgerRef}; -use crate::flush::{ - FlushScheduler, FlushSchedulerRef, FlushStrategyRef, PickerConfig, SizeBasedStrategy, -}; -use crate::manifest::region::RegionManifest; -use crate::manifest::storage::manifest_compress_type; -use crate::memtable::{DefaultMemtableBuilder, MemtableBuilderRef}; -use crate::metadata::RegionMetadata; -use crate::region::{RegionImpl, StoreConfig}; -use crate::scheduler::{LocalScheduler, Scheduler, SchedulerConfig}; -use crate::sst::FsAccessLayer; - -/// [StorageEngine] implementation. -pub struct EngineImpl { - inner: Arc>, -} - -impl Clone for EngineImpl { - fn clone(&self) -> Self { - Self { - inner: self.inner.clone(), - } - } -} - -#[async_trait] -impl StorageEngine for EngineImpl { - type Error = Error; - type Region = RegionImpl; - - async fn open_region( - &self, - _ctx: &EngineContext, - name: &str, - opts: &OpenOptions, - ) -> Result> { - self.inner.open_region(name, opts).await - } - - async fn close_region( - &self, - _ctx: &EngineContext, - name: &str, - opts: &CloseOptions, - ) -> Result<()> { - self.inner.close_region(name, opts).await - } - - async fn create_region( - &self, - _ctx: &EngineContext, - descriptor: RegionDescriptor, - opts: &CreateOptions, - ) -> Result { - self.inner.create_region(descriptor, opts).await - } - - async fn drop_region(&self, _ctx: &EngineContext, region: Self::Region) -> Result<()> { - region.drop_region().await?; - self.inner.remove_region(region.name()); - Ok(()) - } - - fn get_region(&self, _ctx: &EngineContext, name: &str) -> Result> { - Ok(self.inner.get_region(name)) - } - - async fn close(&self, _ctx: &EngineContext) -> Result<()> { - logging::info!("Stopping storage engine"); - - self.inner.close().await?; - - logging::info!("Storage engine stopped"); - - Ok(()) - } -} - -impl EngineImpl { - pub fn new( - config: EngineConfig, - log_store: Arc, - object_store: ObjectStore, - compaction_scheduler: CompactionSchedulerRef, - ) -> Result { - Ok(Self { - inner: Arc::new(EngineInner::new( - config, - log_store, - object_store, - compaction_scheduler, - )?), - }) - } -} - -/// Generate region sst path, -/// parent_dir is resolved in function `region_store_config` to ensure it's ended with '/'. -#[inline] -pub fn region_sst_dir(parent_dir: &str, region_name: &str) -> String { - format!("{parent_dir}{region_name}/") -} - -/// Generate region manifest path, -/// parent_dir is resolved in function `region_store_config` to ensure it's ended with '/'. -#[inline] -pub fn region_manifest_dir(parent_dir: &str, region_name: &str) -> String { - format!("{parent_dir}{region_name}/manifest/") -} - -/// A slot for region in the engine. -/// -/// Also used as a placeholder in the region map when the region isn't ready, e.g. during -/// creating/opening. -#[derive(Debug)] -pub(crate) enum RegionSlot { - /// The region is during creation. - Creating, - /// The region is during opening. - Opening, - /// The region is ready for access. - Ready(RegionImpl), -} - -impl RegionSlot { - /// Try to get a ready region. - fn try_get_ready_region(&self) -> Result> { - if let RegionSlot::Ready(region) = self { - Ok(region.clone()) - } else { - error::InvalidRegionStateSnafu { - state: self.state_name(), - } - .fail() - } - } - - /// Returns the ready region or `None`. - fn get_ready_region(&self) -> Option> { - if let RegionSlot::Ready(region) = self { - Some(region.clone()) - } else { - None - } - } - - fn state_name(&self) -> &'static str { - match self { - RegionSlot::Creating => "creating", - RegionSlot::Opening => "opening", - RegionSlot::Ready(_) => "ready", - } - } -} - -impl Clone for RegionSlot { - // Manually implement Clone due to [rust#26925](https://github.com/rust-lang/rust/issues/26925). - // Maybe we should require `LogStore` to be clonable to work around this. - fn clone(&self) -> RegionSlot { - match self { - RegionSlot::Creating => RegionSlot::Creating, - RegionSlot::Opening => RegionSlot::Opening, - RegionSlot::Ready(region) => RegionSlot::Ready(region.clone()), - } - } -} - -/// Used to update slot or clean the slot on failure. -struct SlotGuard<'a, S: LogStore> { - name: &'a str, - regions: &'a RegionMap, - skip_clean: bool, -} - -impl<'a, S: LogStore> SlotGuard<'a, S> { - fn new(name: &'a str, regions: &'a RegionMap) -> SlotGuard<'a, S> { - SlotGuard { - name, - regions, - skip_clean: false, - } - } - - /// Update the slot and skip cleaning on drop. - fn update(&mut self, slot: RegionSlot) { - self.regions.update(self.name, slot); - self.skip_clean = true; - } -} - -impl<'a, S: LogStore> Drop for SlotGuard<'a, S> { - fn drop(&mut self) { - if !self.skip_clean { - self.regions.remove(self.name) - } - } -} - -/// Region slot map. -pub struct RegionMap(RwLock>>); - -impl RegionMap { - /// Returns a new region map. - pub fn new() -> RegionMap { - RegionMap(RwLock::new(HashMap::new())) - } - - /// Returns the `Some(slot)` if there is existing slot with given `name`, or insert - /// given `slot` and returns `None`. - pub(crate) fn get_or_occupy_slot( - &self, - name: &str, - slot: RegionSlot, - ) -> Option> { - { - // Try to get the region under read lock. - let regions = self.0.read().unwrap(); - if let Some(slot) = regions.get(name) { - return Some(slot.clone()); - } - } - - // Get the region under write lock. - let mut regions = self.0.write().unwrap(); - if let Some(slot) = regions.get(name) { - return Some(slot.clone()); - } - - // No slot in map, we can insert the slot now. - let _ = regions.insert(name.to_string(), slot); - - None - } - - /// Gets the region by the specific name. - fn get_region(&self, name: &str) -> Option> { - let slot = self.0.read().unwrap().get(name).cloned()?; - slot.get_ready_region() - } - - /// Update the slot by name. - fn update(&self, name: &str, slot: RegionSlot) { - let mut regions = self.0.write().unwrap(); - if let Some(old) = regions.get_mut(name) { - *old = slot; - } - } - - /// Remove region by name. - fn remove(&self, name: &str) { - let mut regions = self.0.write().unwrap(); - let _ = regions.remove(name); - } - - /// Collects regions. - pub(crate) fn list_regions(&self) -> Vec> { - let regions = self.0.read().unwrap(); - regions - .values() - .filter_map(|slot| slot.get_ready_region()) - .collect() - } - - /// Clear the region map. - pub(crate) fn clear(&self) { - self.0.write().unwrap().clear(); - } -} - -impl Default for RegionMap { - fn default() -> Self { - Self::new() - } -} - -struct EngineInner { - object_store: ObjectStore, - log_store: Arc, - regions: Arc>, - memtable_builder: MemtableBuilderRef, - flush_scheduler: FlushSchedulerRef, - flush_strategy: FlushStrategyRef, - compaction_scheduler: CompactionSchedulerRef, - file_purger: FilePurgerRef, - config: Arc, -} - -impl EngineInner { - pub fn new( - config: EngineConfig, - log_store: Arc, - object_store: ObjectStore, - compaction_scheduler: CompactionSchedulerRef, - ) -> Result { - let regions = Arc::new(RegionMap::new()); - let flush_scheduler = Arc::new(FlushScheduler::new( - SchedulerConfig { - max_inflight_tasks: config.max_flush_tasks, - }, - compaction_scheduler.clone(), - regions.clone(), - PickerConfig { - schedule_interval: config.picker_schedule_interval, - auto_flush_interval: config.auto_flush_interval, - }, - )?); - - let file_purger = Arc::new(LocalScheduler::new( - SchedulerConfig { - max_inflight_tasks: config.max_purge_tasks, - }, - FilePurgeHandler, - )); - let flush_strategy = Arc::new(SizeBasedStrategy::new( - config - .global_write_buffer_size - .map(|size| size.as_bytes() as usize), - )); - let memtable_builder = if config.global_write_buffer_size.is_some() { - // If global write buffer size is provided, we set the flush strategy - // to the memtable to track global memtable usage. - DefaultMemtableBuilder::with_flush_strategy(Some(flush_strategy.clone())) - } else { - DefaultMemtableBuilder::default() - }; - Ok(Self { - object_store, - log_store, - regions, - memtable_builder: Arc::new(memtable_builder), - flush_scheduler, - flush_strategy, - compaction_scheduler, - file_purger, - config: Arc::new(config), - }) - } - - async fn close_region(&self, name: &str, opts: &CloseOptions) -> Result<()> { - if let Some(region) = self.get_region(name) { - let ctx = CloseContext { flush: opts.flush }; - region.close(&ctx).await?; - } - - self.regions.remove(name); - - Ok(()) - } - - async fn open_region(&self, name: &str, opts: &OpenOptions) -> Result>> { - // We can wait until the state of the slot has been changed to ready, but this will - // make the code more complicate, so we just return the error here. - if let Some(slot) = self.regions.get_or_occupy_slot(name, RegionSlot::Opening) { - return slot.try_get_ready_region().map(Some); - } - - let mut guard = SlotGuard::new(name, &self.regions); - - let store_config = self - .region_store_config( - &opts.parent_dir, - opts.write_buffer_size, - name, - &self.config, - opts.ttl, - opts.compaction_strategy.clone(), - ) - .await?; - - let region = match RegionImpl::open(name.to_string(), store_config, opts).await? { - None => return Ok(None), - Some(v) => v, - }; - guard.update(RegionSlot::Ready(region.clone())); - debug!( - "Storage engine open region {}, id: {}", - region.name(), - region.id() - ); - Ok(Some(region)) - } - - async fn create_region( - &self, - descriptor: RegionDescriptor, - opts: &CreateOptions, - ) -> Result> { - if let Some(slot) = self - .regions - .get_or_occupy_slot(&descriptor.name, RegionSlot::Creating) - { - return slot.try_get_ready_region(); - } - - // Now the region in under `Creating` state. - let region_name = descriptor.name.clone(); - let mut guard = SlotGuard::new(®ion_name, &self.regions); - - let metadata: RegionMetadata = - descriptor - .try_into() - .context(error::InvalidRegionDescSnafu { - region: ®ion_name, - })?; - let store_config = self - .region_store_config( - &opts.parent_dir, - opts.write_buffer_size, - ®ion_name, - &self.config, - opts.ttl, - opts.compaction_strategy.clone(), - ) - .await?; - - let region = RegionImpl::create(metadata, store_config).await?; - - guard.update(RegionSlot::Ready(region.clone())); - - debug!( - "Storage engine create region {}, id: {}", - region.name(), - region.id() - ); - - Ok(region) - } - - fn get_region(&self, name: &str) -> Option> { - self.regions.get_region(name) - } - - fn remove_region(&self, name: &str) { - self.regions.remove(name) - } - - async fn region_store_config( - &self, - parent_dir: &str, - write_buffer_size: Option, - region_name: &str, - config: &EngineConfig, - region_ttl: Option, - compaction_strategy: CompactionStrategy, - ) -> Result> { - let parent_dir = util::normalize_dir(parent_dir); - - let sst_dir = ®ion_sst_dir(&parent_dir, region_name); - let sst_layer = Arc::new(FsAccessLayer::new(sst_dir, self.object_store.clone())); - let manifest_dir = region_manifest_dir(&parent_dir, region_name); - let manifest = RegionManifest::with_checkpointer( - &manifest_dir, - self.object_store.clone(), - manifest_compress_type(config.compress_manifest), - config.manifest_checkpoint_margin, - config.manifest_gc_duration, - ); - manifest.start().await?; - let flush_strategy = self.flush_strategy.clone(); - - // If region_ttl is `None`, the global ttl takes effect. - let ttl = region_ttl.or(self.config.global_ttl); - - Ok(StoreConfig { - log_store: self.log_store.clone(), - sst_layer, - manifest, - memtable_builder: self.memtable_builder.clone(), - flush_scheduler: self.flush_scheduler.clone(), - flush_strategy, - compaction_scheduler: self.compaction_scheduler.clone(), - engine_config: self.config.clone(), - file_purger: self.file_purger.clone(), - ttl, - write_buffer_size: write_buffer_size - .unwrap_or(self.config.region_write_buffer_size.as_bytes() as usize), - compaction_strategy, - }) - } - - async fn close(&self) -> Result<()> { - let regions = self.regions.list_regions(); - let ctx = CloseContext::default(); - for region in regions { - // Tolerate failure during closing regions. - if let Err(e) = region.close(&ctx).await { - logging::error!(e; "Failed to close region {}", region.id()); - } - } - // Clear regions to release references to regions in the region map. - self.regions.clear(); - - self.compaction_scheduler.stop(true).await?; - self.flush_scheduler.stop().await?; - self.file_purger.stop(true).await - } -} - -#[cfg(test)] -mod tests { - use std::ffi::OsStr; - use std::path::Path; - - use common_test_util::temp_dir::{create_temp_dir, TempDir}; - use datatypes::type_id::LogicalTypeId; - use datatypes::vectors::{Float32Vector, Int32Vector, TimestampMillisecondVector, VectorRef}; - use log_store::raft_engine::log_store::RaftEngineLogStore; - use log_store::test_util::log_store_util; - use object_store::services::Fs; - use store_api::storage::{ - ChunkReader, FlushContext, ReadContext, Region, ScanRequest, Snapshot, WriteContext, - WriteRequest, - }; - - use super::*; - use crate::compaction::noop::NoopCompactionScheduler; - use crate::test_util::descriptor_util::RegionDescBuilder; - - type TestEngine = EngineImpl; - type TestRegion = RegionImpl; - - async fn create_engine_and_region( - tmp_dir: &TempDir, - log_file_dir: &TempDir, - region_name: &str, - region_id: u64, - config: EngineConfig, - ) -> (TestEngine, TestRegion) { - let log_file_dir_path = log_file_dir.path().to_str().unwrap(); - let log_store = log_store_util::create_tmp_local_file_log_store(log_file_dir_path).await; - - let store_dir = tmp_dir.path().to_string_lossy(); - - let mut builder = Fs::default(); - let _ = builder.root(&store_dir); - let object_store = ObjectStore::new(builder).unwrap().finish(); - - let compaction_scheduler = Arc::new(NoopCompactionScheduler::default()); - - let engine = EngineImpl::new( - config, - Arc::new(log_store), - object_store, - compaction_scheduler, - ) - .unwrap(); - - let desc = RegionDescBuilder::new(region_name) - .id(region_id) - .push_key_column(("k1", LogicalTypeId::Int32, false)) - .push_field_column(("v1", LogicalTypeId::Float32, true)) - .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false)) - .build(); - - let region = engine - .create_region(&EngineContext::default(), desc, &CreateOptions::default()) - .await - .unwrap(); - - (engine, region) - } - - fn parquet_file_num(path: &Path) -> usize { - path.read_dir() - .unwrap() - .filter_map(|entry| entry.ok()) - .filter(|entry| entry.path().extension() == Some(OsStr::new("parquet"))) - .count() - } - - #[tokio::test] - async fn test_create_new_region() { - let dir = create_temp_dir("test_create_region"); - let log_file_dir = create_temp_dir("test_engine_wal"); - - let region_name = "region-0"; - let region_id = 123456; - let config = EngineConfig::default(); - - let (engine, region) = - create_engine_and_region(&dir, &log_file_dir, region_name, region_id, config).await; - assert_eq!(region_name, region.name()); - - let ctx = EngineContext::default(); - let region2 = engine.get_region(&ctx, region_name).unwrap().unwrap(); - assert_eq!(region_name, region2.name()); - - assert!(engine.get_region(&ctx, "no such region").unwrap().is_none()); - } - - #[tokio::test] - async fn test_create_region_with_buffer_size() { - let dir = create_temp_dir("test_buffer_size"); - let log_file_dir = create_temp_dir("test_buffer_wal"); - - let region_name = "region-0"; - let region_id = 123456; - let mut config = EngineConfig::default(); - let expect_buffer_size = config.region_write_buffer_size / 2; - config.region_write_buffer_size = expect_buffer_size; - - let (_engine, region) = - create_engine_and_region(&dir, &log_file_dir, region_name, region_id, config).await; - assert_eq!( - expect_buffer_size.as_bytes() as usize, - region.write_buffer_size().await - ); - } - - #[tokio::test] - async fn test_drop_region() { - common_telemetry::init_default_ut_logging(); - let dir = create_temp_dir("test_drop_region"); - let log_file_dir = create_temp_dir("test_engine_wal"); - - let region_name = "test_region"; - let region_id = 123456; - let config = EngineConfig::default(); - - let (engine, region) = - create_engine_and_region(&dir, &log_file_dir, region_name, region_id, config).await; - - assert_eq!(region_name, region.name()); - - let mut wb = region.write_request(); - let k1 = Arc::new(Int32Vector::from_slice([1, 2, 3])) as VectorRef; - let v1 = Arc::new(Float32Vector::from_slice([0.1, 0.2, 0.3])) as VectorRef; - let tsv = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef; - - let put_data = HashMap::from([ - ("k1".to_string(), k1), - ("v1".to_string(), v1), - ("ts".to_string(), tsv), - ]); - wb.put(put_data).unwrap(); - let _ = region.write(&WriteContext::default(), wb).await.unwrap(); - - // Flush memtable to sst. - region.flush(&FlushContext::default()).await.unwrap(); - let ctx = EngineContext::default(); - engine - .close_region(&ctx, region.name(), &CloseOptions::default()) - .await - .unwrap(); - - let dir_path = dir.path().join(region_name); - - assert_eq!(1, parquet_file_num(&dir_path)); - - { - let region = engine - .open_region(&ctx, region_name, &OpenOptions::default()) - .await - .unwrap() - .unwrap(); - - engine.drop_region(&ctx, region).await.unwrap(); - - assert!(engine.get_region(&ctx, region_name).unwrap().is_none()); - assert!(!engine - .inner - .object_store - .is_exist(dir_path.join("manifest").to_str().unwrap()) - .await - .unwrap()); - } - - // Wait for gc - tokio::time::sleep(Duration::from_millis(60)).await; - assert_eq!(0, parquet_file_num(&dir_path)); - } - - #[tokio::test] - async fn test_truncate_region() { - common_telemetry::init_default_ut_logging(); - let dir = create_temp_dir("test_truncate_region"); - let log_file_dir = create_temp_dir("test_engine_wal"); - - let region_name = "test_region"; - let region_id = 123456; - let config = EngineConfig::default(); - - let (engine, region) = - create_engine_and_region(&dir, &log_file_dir, region_name, region_id, config).await; - - assert_eq!(region_name, region.name()); - - let mut wb = region.write_request(); - let k1 = Arc::new(Int32Vector::from_slice([1, 2, 3])) as VectorRef; - let v1 = Arc::new(Float32Vector::from_slice([0.1, 0.2, 0.3])) as VectorRef; - let tsv = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef; - - let put_data = HashMap::from([ - ("k1".to_string(), k1), - ("v1".to_string(), v1), - ("ts".to_string(), tsv), - ]); - wb.put(put_data).unwrap(); - - // Insert data. - region.write(&WriteContext::default(), wb).await.unwrap(); - let ctx = EngineContext::default(); - - // Truncate region. - region.truncate().await.unwrap(); - assert!(engine.get_region(&ctx, region.name()).unwrap().is_some()); - - // Scan to verify the region is empty. - let read_ctx = ReadContext::default(); - let snapshot = region.snapshot(&read_ctx).unwrap(); - let resp = snapshot - .scan(&read_ctx, ScanRequest::default()) - .await - .unwrap(); - let mut reader = resp.reader; - assert!(reader.next_chunk().await.unwrap().is_none()); - } -} diff --git a/src/storage/src/error.rs b/src/storage/src/error.rs deleted file mode 100644 index c2a043e6d07a..000000000000 --- a/src/storage/src/error.rs +++ /dev/null @@ -1,635 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; -use std::io::Error as IoError; -use std::str::Utf8Error; - -use common_datasource::compression::CompressionType; -use common_error::ext::{BoxedError, ErrorExt}; -use common_error::status_code::StatusCode; -use common_macro::stack_trace_debug; -use common_runtime::error::Error as RuntimeError; -use datatypes::arrow::error::ArrowError; -use datatypes::prelude::ConcreteDataType; -use object_store::ErrorKind; -use serde_json::error::Error as JsonError; -use snafu::{Location, Snafu}; -use store_api::manifest::action::ProtocolVersion; -use store_api::manifest::ManifestVersion; -use store_api::storage::{RegionId, SequenceNumber}; -use tokio::task::JoinError; - -use crate::metadata::Error as MetadataError; -use crate::write_batch; - -#[derive(Snafu)] -#[snafu(visibility(pub))] -#[stack_trace_debug] -pub enum Error { - #[snafu(display("Invalid region descriptor, region: {}", region))] - InvalidRegionDesc { - region: String, - location: Location, - source: MetadataError, - }, - - #[snafu(display("Missing column {} in write batch", column))] - BatchMissingColumn { column: String, location: Location }, - - #[snafu(display("Failed to write parquet file"))] - WriteParquet { - #[snafu(source)] - error: parquet::errors::ParquetError, - location: Location, - }, - - #[snafu(display("Failed to write to buffer"))] - WriteBuffer { - location: Location, - source: common_datasource::error::Error, - }, - - #[snafu(display("Failed to create RecordBatch from vectors"))] - NewRecordBatch { - location: Location, - #[snafu(source)] - error: ArrowError, - }, - - #[snafu(display("Fail to read object from path: {}", path))] - ReadObject { - path: String, - location: Location, - #[snafu(source)] - error: object_store::Error, - }, - - #[snafu(display("Fail to write object into path: {}", path))] - WriteObject { - path: String, - location: Location, - #[snafu(source)] - error: object_store::Error, - }, - - #[snafu(display("Fail to delete object from path: {}", path))] - DeleteObject { - path: String, - location: Location, - #[snafu(source)] - error: object_store::Error, - }, - - #[snafu(display("Fail to compress object by {}, path: {}", compress_type, path))] - CompressObject { - compress_type: CompressionType, - path: String, - #[snafu(source)] - error: std::io::Error, - }, - - #[snafu(display("Fail to decompress object by {}, path: {}", compress_type, path))] - DecompressObject { - compress_type: CompressionType, - path: String, - #[snafu(source)] - error: std::io::Error, - }, - - #[snafu(display("Fail to list objects in path: {}", path))] - ListObjects { - path: String, - location: Location, - #[snafu(source)] - error: object_store::Error, - }, - - #[snafu(display("Fail to create str from bytes"))] - Utf8 { - location: Location, - #[snafu(source)] - error: Utf8Error, - }, - - #[snafu(display("Fail to encode object into json "))] - EncodeJson { - location: Location, - #[snafu(source)] - error: JsonError, - }, - - #[snafu(display("Fail to decode object from json "))] - DecodeJson { - location: Location, - #[snafu(source)] - error: JsonError, - }, - - #[snafu(display("Invalid scan index, start: {}, end: {}", start, end))] - InvalidScanIndex { - start: ManifestVersion, - end: ManifestVersion, - location: Location, - }, - - #[snafu(display("Failed to write WAL, WAL region_id: {}", region_id))] - WriteWal { - region_id: RegionId, - location: Location, - source: BoxedError, - }, - - #[snafu(display("Failed to encode WAL header"))] - EncodeWalHeader { - location: Location, - #[snafu(source)] - error: std::io::Error, - }, - - #[snafu(display("Failed to decode WAL header"))] - DecodeWalHeader { - location: Location, - #[snafu(source)] - error: std::io::Error, - }, - - #[snafu(display("Failed to wait flushing, region_id: {}", region_id))] - WaitFlush { - region_id: RegionId, - #[snafu(source)] - error: tokio::sync::oneshot::error::RecvError, - location: Location, - }, - - #[snafu(display( - "Manifest protocol forbid to read, min_version: {}, supported_version: {}", - min_version, - supported_version - ))] - ManifestProtocolForbidRead { - min_version: ProtocolVersion, - supported_version: ProtocolVersion, - location: Location, - }, - - #[snafu(display( - "Manifest protocol forbid to write, min_version: {}, supported_version: {}", - min_version, - supported_version - ))] - ManifestProtocolForbidWrite { - min_version: ProtocolVersion, - supported_version: ProtocolVersion, - location: Location, - }, - - #[snafu(display("Failed to decode action list, {}", msg))] - DecodeMetaActionList { msg: String, location: Location }, - - #[snafu(display("Failed to read line, err"))] - Readline { - #[snafu(source)] - error: IoError, - }, - - #[snafu(display("Failed to read Parquet file: {}", file))] - ReadParquet { - file: String, - #[snafu(source)] - error: parquet::errors::ParquetError, - location: Location, - }, - - #[snafu(display("Region is under {} state, cannot proceed operation", state))] - InvalidRegionState { - state: &'static str, - location: Location, - }, - - #[snafu(display("Failed to read WAL, region_id: {}", region_id))] - ReadWal { - region_id: RegionId, - location: Location, - source: BoxedError, - }, - - #[snafu(display("Failed to mark WAL as obsolete, region id: {}", region_id))] - MarkWalObsolete { - region_id: u64, - location: Location, - source: BoxedError, - }, - - #[snafu(display("WAL data corrupted, region_id: {}, message: {}", region_id, message))] - WalDataCorrupted { - region_id: RegionId, - message: String, - location: Location, - }, - - #[snafu(display("Failed to delete WAL namespace, region id: {}", region_id))] - DeleteWalNamespace { - region_id: RegionId, - location: Location, - source: BoxedError, - }, - - #[snafu(display( - "Sequence of region should increase monotonically (should be {} < {})", - prev, - given - ))] - SequenceNotMonotonic { - prev: SequenceNumber, - given: SequenceNumber, - location: Location, - }, - - #[snafu(display("Failed to convert store schema, file: {}", file))] - ConvertStoreSchema { - file: String, - location: Location, - source: MetadataError, - }, - - #[snafu(display("Invalid raw region metadata, region: {}", region))] - InvalidRawRegion { - region: String, - location: Location, - source: MetadataError, - }, - - #[snafu(display("Try to write the closed region"))] - ClosedRegion { location: Location }, - - #[snafu(display("Invalid projection"))] - InvalidProjection { - location: Location, - source: MetadataError, - }, - - #[snafu(display("Failed to push data to batch builder"))] - PushBatch { - location: Location, - source: datatypes::error::Error, - }, - - #[snafu(display("Failed to build batch, {}", msg))] - BuildBatch { msg: String, location: Location }, - - #[snafu(display("Failed to filter column {}", name))] - FilterColumn { - name: String, - location: Location, - source: datatypes::error::Error, - }, - - #[snafu(display("Invalid alter request"))] - InvalidAlterRequest { - location: Location, - source: MetadataError, - }, - - #[snafu(display("Failed to alter metadata"))] - AlterMetadata { - location: Location, - source: MetadataError, - }, - - #[snafu(display("Failed to create default value for column {}", name))] - CreateDefault { - name: String, - location: Location, - source: datatypes::error::Error, - }, - - #[snafu(display( - "Not allowed to write data with version {} to schema with version {}", - data_version, - schema_version - ))] - WriteToOldVersion { - /// Schema version of data to write. - data_version: u32, - schema_version: u32, - location: Location, - }, - - #[snafu(display("Column {} not in schema with version {}", column, version))] - NotInSchemaToCompat { - column: String, - version: u32, - location: Location, - }, - - #[snafu(display("Incompatible schema to read, reason: {}", reason))] - CompatRead { reason: String, location: Location }, - - #[snafu(display("Failed to read column {}, could not create default value", column))] - CreateDefaultToRead { - column: String, - location: Location, - source: datatypes::error::Error, - }, - - #[snafu(display("Failed to read column {}, no proper default value for it", column))] - NoDefaultToRead { column: String, location: Location }, - - #[snafu(display("Failed to convert arrow chunk to batch, name: {}", name))] - ConvertChunk { - name: String, - location: Location, - source: datatypes::error::Error, - }, - - #[snafu(display("Unknown column {}", name))] - UnknownColumn { name: String, location: Location }, - - #[snafu(display("Failed to create record batch for write batch"))] - CreateRecordBatch { - location: Location, - source: common_recordbatch::error::Error, - }, - - #[snafu(display( - "Request is too large, max is {}, current is {}", - write_batch::MAX_BATCH_SIZE, - num_rows - ))] - RequestTooLarge { num_rows: usize, location: Location }, - - #[snafu(display( - "Type of column {} does not match type in schema, expect {:?}, given {:?}", - name, - expect, - given - ))] - TypeMismatch { - name: String, - expect: ConcreteDataType, - given: ConcreteDataType, - location: Location, - }, - - #[snafu(display("Column {} is not null but input has null", name))] - HasNull { name: String, location: Location }, - - #[snafu(display( - "Length of column {} not equals to other columns, expect {}, given {}", - name, - expect, - given - ))] - UnequalLengths { - name: String, - expect: usize, - given: usize, - location: Location, - }, - - #[snafu(display("Failed to decode write batch, corrupted data {}", message))] - BatchCorrupted { message: String, location: Location }, - - #[snafu(display("Failed to decode arrow data"))] - DecodeArrow { - location: Location, - #[snafu(source)] - error: ArrowError, - }, - - #[snafu(display("Failed to encode arrow data"))] - EncodeArrow { - location: Location, - #[snafu(source)] - error: ArrowError, - }, - - #[snafu(display("Failed to parse schema"))] - ParseSchema { - location: Location, - source: datatypes::error::Error, - }, - - #[snafu(display("More columns than expected in the request"))] - MoreColumnThanExpected { location: Location }, - - #[snafu(display("Failed to decode parquet file time range, msg: {}", msg))] - DecodeParquetTimeRange { msg: String, location: Location }, - - #[snafu(display("Scheduler rate limited, msg: {}", msg))] - RateLimited { msg: String }, - - #[snafu(display("Cannot schedule request, scheduler's already stopped"))] - IllegalSchedulerState { location: Location }, - - #[snafu(display("Failed to start manifest gc task"))] - StartManifestGcTask { - location: Location, - source: RuntimeError, - }, - - #[snafu(display("Failed to stop manifest gc task"))] - StopManifestGcTask { - location: Location, - source: RuntimeError, - }, - - #[snafu(display("Failed to stop scheduler"))] - StopScheduler { - #[snafu(source)] - error: JoinError, - location: Location, - }, - - #[snafu(display("Failed to delete SST file"))] - DeleteSst { - #[snafu(source)] - error: object_store::Error, - location: Location, - }, - - #[snafu(display("Failed to calculate SST expire time"))] - TtlCalculation { - location: Location, - source: common_time::error::Error, - }, - - #[snafu(display("Failed to create a checkpoint: {}", msg))] - ManifestCheckpoint { msg: String, location: Location }, - - #[snafu(display("The compaction task is cancelled, region_id: {}", region_id))] - CompactTaskCancel { - region_id: RegionId, - #[snafu(source)] - error: tokio::sync::oneshot::error::RecvError, - }, - - #[snafu(display( - "The flush request is duplicate, region_id: {}, sequence: {}", - region_id, - sequence - ))] - DuplicateFlush { - region_id: RegionId, - sequence: SequenceNumber, - location: Location, - }, - - #[snafu(display("Failed to start picking task for flush"))] - StartPickTask { - location: Location, - source: RuntimeError, - }, - - #[snafu(display("Failed to stop picking task for flush"))] - StopPickTask { - location: Location, - source: RuntimeError, - }, - - #[snafu(display("Failed to convert columns to rows"))] - ConvertColumnsToRows { - #[snafu(source)] - error: ArrowError, - location: Location, - }, - - #[snafu(display("Failed to sort arrays"))] - SortArrays { - #[snafu(source)] - error: ArrowError, - location: Location, - }, - - #[snafu(display("Failed to build scan predicate"))] - BuildPredicate { - source: table::error::Error, - location: Location, - }, - - #[snafu(display("Failed to join spawned tasks"))] - JoinError { - #[snafu(source)] - error: JoinError, - location: Location, - }, -} - -pub type Result = std::result::Result; - -impl Error { - /// Returns true if the error is the object path to delete - /// doesn't exist. - pub(crate) fn is_object_to_delete_not_found(&self) -> bool { - if let Error::DeleteObject { error, .. } = self { - error.kind() == ErrorKind::NotFound - } else { - false - } - } -} - -impl ErrorExt for Error { - fn status_code(&self) -> StatusCode { - use Error::*; - - match self { - InvalidScanIndex { .. } - | BatchMissingColumn { .. } - | InvalidProjection { .. } - | BuildBatch { .. } - | NotInSchemaToCompat { .. } - | WriteToOldVersion { .. } - | CreateRecordBatch { .. } - | RequestTooLarge { .. } - | TypeMismatch { .. } - | HasNull { .. } - | UnequalLengths { .. } - | MoreColumnThanExpected { .. } => StatusCode::InvalidArguments, - - Utf8 { .. } - | EncodeJson { .. } - | DecodeJson { .. } - | WaitFlush { .. } - | DecodeMetaActionList { .. } - | Readline { .. } - | WalDataCorrupted { .. } - | SequenceNotMonotonic { .. } - | ConvertStoreSchema { .. } - | InvalidRawRegion { .. } - | ClosedRegion { .. } - | FilterColumn { .. } - | AlterMetadata { .. } - | CompatRead { .. } - | CreateDefaultToRead { .. } - | NoDefaultToRead { .. } - | NewRecordBatch { .. } - | BatchCorrupted { .. } - | DecodeArrow { .. } - | EncodeArrow { .. } - | ManifestCheckpoint { .. } - | CompressObject { .. } - | DecompressObject { .. } - | ParseSchema { .. } => StatusCode::Unexpected, - - WriteParquet { .. } - | ReadObject { .. } - | WriteObject { .. } - | ListObjects { .. } - | DeleteObject { .. } - | WriteWal { .. } - | DecodeWalHeader { .. } - | EncodeWalHeader { .. } - | ManifestProtocolForbidRead { .. } - | ManifestProtocolForbidWrite { .. } - | ReadParquet { .. } - | InvalidRegionState { .. } - | ReadWal { .. } => StatusCode::StorageUnavailable, - - UnknownColumn { .. } => StatusCode::TableColumnNotFound, - - InvalidAlterRequest { source, .. } | InvalidRegionDesc { source, .. } => { - source.status_code() - } - WriteBuffer { source, .. } => source.status_code(), - PushBatch { source, .. } => source.status_code(), - CreateDefault { source, .. } => source.status_code(), - ConvertChunk { source, .. } => source.status_code(), - MarkWalObsolete { source, .. } => source.status_code(), - DeleteWalNamespace { source, .. } => source.status_code(), - DecodeParquetTimeRange { .. } => StatusCode::Unexpected, - RateLimited { .. } | StopScheduler { .. } | CompactTaskCancel { .. } => { - StatusCode::Internal - } - DeleteSst { .. } => StatusCode::StorageUnavailable, - - StartManifestGcTask { .. } - | StopManifestGcTask { .. } - | IllegalSchedulerState { .. } - | DuplicateFlush { .. } - | StartPickTask { .. } - | StopPickTask { .. } => StatusCode::Unexpected, - - TtlCalculation { source, .. } => source.status_code(), - ConvertColumnsToRows { .. } | SortArrays { .. } => StatusCode::Unexpected, - BuildPredicate { source, .. } => source.status_code(), - JoinError { .. } => StatusCode::Unexpected, - } - } - - fn as_any(&self) -> &dyn Any { - self - } -} diff --git a/src/storage/src/file_purger.rs b/src/storage/src/file_purger.rs deleted file mode 100644 index da7899e6002b..000000000000 --- a/src/storage/src/file_purger.rs +++ /dev/null @@ -1,235 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use common_telemetry::{debug, error}; -use store_api::storage::RegionId; -use tokio::sync::Notify; - -use crate::error::Result; -use crate::scheduler::rate_limit::{BoxedRateLimitToken, RateLimitToken}; -use crate::scheduler::{Handler, LocalScheduler, Request}; -use crate::sst::{AccessLayerRef, FileId}; - -pub struct FilePurgeRequest { - pub region_id: RegionId, - pub file_id: FileId, - pub sst_layer: AccessLayerRef, -} - -impl Request for FilePurgeRequest { - type Key = String; - - fn key(&self) -> Self::Key { - format!("{}/{}", self.region_id, self.file_id) - } - - fn complete(self, _result: Result<()>) {} -} - -pub struct FilePurgeHandler; - -#[async_trait::async_trait] -impl Handler for FilePurgeHandler { - type Request = FilePurgeRequest; - - async fn handle_request( - &self, - req: Self::Request, - token: BoxedRateLimitToken, - finish_notifier: Arc, - ) -> Result<()> { - req.sst_layer.delete_sst(req.file_id).await.map_err(|e| { - error!(e; "Failed to delete SST file, file: {}, region: {}", - req.file_id.as_parquet(), req.region_id); - e - })?; - debug!( - "Successfully deleted SST file: {}, region: {}", - req.file_id.as_parquet(), - req.region_id - ); - token.try_release(); - finish_notifier.notify_one(); - Ok(()) - } -} - -pub type FilePurgerRef = Arc>; - -#[cfg(test)] -pub mod noop { - use std::sync::Arc; - - use tokio::sync::Notify; - - use crate::error::Result; - use crate::file_purger::{FilePurgeRequest, FilePurgerRef}; - use crate::scheduler::rate_limit::{BoxedRateLimitToken, RateLimitToken}; - use crate::scheduler::{Handler, LocalScheduler, SchedulerConfig}; - - pub fn new_noop_file_purger() -> FilePurgerRef { - Arc::new(LocalScheduler::new( - SchedulerConfig::default(), - NoopFilePurgeHandler, - )) - } - - #[derive(Debug)] - pub struct NoopFilePurgeHandler; - - #[async_trait::async_trait] - impl Handler for NoopFilePurgeHandler { - type Request = FilePurgeRequest; - - async fn handle_request( - &self, - _req: Self::Request, - token: BoxedRateLimitToken, - finish_notifier: Arc, - ) -> Result<()> { - token.try_release(); - finish_notifier.notify_one(); - Ok(()) - } - } -} - -#[cfg(test)] -mod tests { - use api::v1::OpType; - use common_test_util::temp_dir::create_temp_dir; - use object_store::services::Fs; - use object_store::ObjectStore; - - use super::*; - use crate::file_purger::noop::NoopFilePurgeHandler; - use crate::memtable::tests::{schema_for_test, write_kvs}; - use crate::memtable::{DefaultMemtableBuilder, IterContext, MemtableBuilder}; - use crate::scheduler::{Scheduler, SchedulerConfig}; - use crate::sst::{AccessLayer, FileHandle, FileMeta, FsAccessLayer, Source, WriteOptions}; - - struct MockRateLimitToken; - - impl RateLimitToken for MockRateLimitToken { - fn try_release(&self) {} - } - - async fn create_sst_file( - os: ObjectStore, - sst_file_id: FileId, - file_purger: FilePurgerRef, - ) -> (FileHandle, String, AccessLayerRef) { - let schema = schema_for_test(); - let memtable = DefaultMemtableBuilder::default().build(schema.clone()); - - write_kvs( - &*memtable, - 10, - OpType::Put, - &[1, 2], - &[(Some(1), Some(1)), (Some(2), Some(2))], - ); - - let iter = memtable.iter(IterContext::default()).unwrap(); - let sst_path = "table1"; - let layer = Arc::new(FsAccessLayer::new(sst_path, os.clone())); - let sst_info = layer - .write_sst(sst_file_id, Source::Iter(iter), &WriteOptions::default()) - .await - .unwrap() - .unwrap(); - - ( - FileHandle::new( - FileMeta { - region_id: 0.into(), - file_id: sst_file_id, - time_range: None, - level: 0, - file_size: sst_info.file_size, - }, - layer.clone(), - file_purger, - ), - sst_path.to_string(), - layer as _, - ) - } - - #[tokio::test] - async fn test_file_purger_handler() { - let dir = create_temp_dir("file-purge"); - let mut builder = Fs::default(); - let _ = builder.root(dir.path().to_str().unwrap()); - let object_store = ObjectStore::new(builder).unwrap().finish(); - - let sst_file_id = FileId::random(); - - let noop_file_purger = Arc::new(LocalScheduler::new( - SchedulerConfig::default(), - NoopFilePurgeHandler, - )); - let (_file, path, layer) = - create_sst_file(object_store.clone(), sst_file_id, noop_file_purger).await; - let request = FilePurgeRequest { - region_id: 0.into(), - file_id: sst_file_id, - sst_layer: layer, - }; - - let handler = FilePurgeHandler; - let notify = Arc::new(Notify::new()); - handler - .handle_request(request, Box::new(MockRateLimitToken {}), notify.clone()) - .await - .unwrap(); - - notify.notified().await; - let exists = object_store - .is_exist(&format!("{}/{}", path, sst_file_id.as_parquet())) - .await - .unwrap(); - assert!(!exists); - } - - #[tokio::test] - async fn test_file_purge_loop() { - common_telemetry::init_default_ut_logging(); - let dir = create_temp_dir("file-purge"); - let mut builder = Fs::default(); - let _ = builder.root(dir.path().to_str().unwrap()); - let object_store = ObjectStore::new(builder).unwrap().finish(); - let sst_file_id = FileId::random(); - let scheduler = Arc::new(LocalScheduler::new( - SchedulerConfig::default(), - FilePurgeHandler, - )); - let (handle, path, _layer) = - create_sst_file(object_store.clone(), sst_file_id, scheduler.clone()).await; - - { - // mark file as deleted and drop the handle, we expect the file is deleted. - handle.mark_deleted(); - drop(handle); - } - scheduler.stop(true).await.unwrap(); - - assert!(!object_store - .is_exist(&format!("{}/{}", path, sst_file_id.as_parquet())) - .await - .unwrap()); - } -} diff --git a/src/storage/src/flush.rs b/src/storage/src/flush.rs deleted file mode 100644 index 9aa82c4154e0..000000000000 --- a/src/storage/src/flush.rs +++ /dev/null @@ -1,495 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -mod picker; -mod scheduler; - -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; - -use common_base::readable_size::ReadableSize; -use common_telemetry::logging; -pub use picker::{FlushPicker, PickerConfig}; -pub use scheduler::{ - FlushHandle, FlushRegionRequest, FlushRequest, FlushScheduler, FlushSchedulerRef, -}; -use store_api::logstore::LogStore; -use store_api::storage::consts::WRITE_ROW_GROUP_SIZE; -use store_api::storage::{RegionId, SequenceNumber}; - -use crate::config::EngineConfig; -use crate::error::Result; -use crate::manifest::action::*; -use crate::manifest::region::RegionManifest; -use crate::memtable::{IterContext, MemtableId, MemtableRef}; -use crate::metrics::{FLUSH_BYTES_TOTAL, FLUSH_ELAPSED}; -use crate::region::{RegionWriterRef, SharedDataRef}; -use crate::sst::{AccessLayerRef, FileId, FileMeta, Source, SstInfo, WriteOptions}; -use crate::wal::Wal; - -/// Current flush-related status of a region. -#[derive(Debug, Clone, Copy)] -pub struct RegionStatus { - /// Id of the region this status belongs to. - pub region_id: RegionId, - /// Size of the mutable memtable. - pub bytes_mutable: usize, - /// Write buffer size of the region. - pub write_buffer_size: usize, -} - -/// Type of flush request to send. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum FlushType { - /// Flush current region. - Region, - /// Engine level flush. Find regions to flush globally. - Engine, -} - -/// Strategy to control whether to flush a region before writing to the region. -pub trait FlushStrategy: Send + Sync + std::fmt::Debug { - /// Returns whether to trigger a flush operation. - fn should_flush(&self, status: RegionStatus) -> Option; - - /// Reserves `mem` bytes. - fn reserve_mem(&self, mem: usize); - - /// Tells the strategy we are freeing `mem` bytes. - /// - /// We are in the process of freeing `mem` bytes, so it is not considered - /// when checking the soft limit. - fn schedule_free_mem(&self, mem: usize); - - /// We have freed `mem` bytes. - fn free_mem(&self, mem: usize); -} - -pub type FlushStrategyRef = Arc; - -/// Flush strategy based on memory usage. -#[derive(Debug)] -pub struct SizeBasedStrategy { - /// Write buffer size for all memtables. - global_write_buffer_size: Option, - /// Mutable memtable memory size limitation, only valid when `global_write_buffer_size` - /// is `Some`. - mutable_limitation: usize, - /// Memory in used (e.g. used by mutable and immutable memtables). - memory_used: AtomicUsize, - /// Memory that hasn't been scheduled to free (e.g. used by mutable memtables). - memory_active: AtomicUsize, -} - -impl SizeBasedStrategy { - /// Returns a new [SizeBasedStrategy] with specific `global_write_buffer_size`. - pub fn new(global_write_buffer_size: Option) -> Self { - Self { - global_write_buffer_size, - mutable_limitation: get_mutable_limitation(global_write_buffer_size), - memory_used: AtomicUsize::new(0), - memory_active: AtomicUsize::new(0), - } - } - - /// Returns whether to trigger an engine level flush. - /// - /// Inspired by RocksDB's WriteBufferManager. - /// - fn should_flush_engine(&self) -> bool { - // We only check global limit when it is Some. - let Some(global_write_buffer_size) = self.global_write_buffer_size else { - return false; - }; - - let mutable_memtable_memory_usage = self.memory_active.load(Ordering::Relaxed); - if mutable_memtable_memory_usage > self.mutable_limitation { - logging::info!( - "Engine should flush (over mutable limit), mutable_usage: {}, mutable_limitation: {}.", - mutable_memtable_memory_usage, - self.mutable_limitation, - ); - return true; - } - - let memory_usage = self.memory_used.load(Ordering::Relaxed); - // If the memory exceeds the buffer size, we trigger more aggressive - // flush. But if already more than half memory is being flushed, - // triggering more flush may not help. We will hold it instead. - if memory_usage >= global_write_buffer_size - && mutable_memtable_memory_usage >= global_write_buffer_size / 2 - { - logging::info!( - "Engine should flush (over total limit), memory_usage: {}, global_write_buffer_size: {}, \ - mutable_usage: {}.", - memory_usage, - global_write_buffer_size, - mutable_memtable_memory_usage, - ); - return true; - } - - false - } - - /// Returns true if the global memory limitation is enabled. - #[inline] - fn is_global_limit_enabled(&self) -> bool { - self.global_write_buffer_size.is_some() - } -} - -#[inline] -fn get_mutable_limitation(global_write_buffer_size: Option) -> usize { - // Inspired by RocksDB. - // https://github.com/facebook/rocksdb/blob/main/include/rocksdb/write_buffer_manager.h#L86 - global_write_buffer_size - .map(|size| size * 7 / 8) - .unwrap_or(0) -} - -impl Default for SizeBasedStrategy { - fn default() -> Self { - Self { - global_write_buffer_size: None, - mutable_limitation: 0, - memory_used: AtomicUsize::new(0), - memory_active: AtomicUsize::new(0), - } - } -} - -impl FlushStrategy for SizeBasedStrategy { - fn should_flush(&self, status: RegionStatus) -> Option { - if status.bytes_mutable >= status.write_buffer_size { - // If the mutable memtable is full, we should freeze it and flush it. - logging::debug!( - "Region should flush as mutable memtable is full, region: {}, bytes_mutable: {}, \ - write_buffer_size: {}.", - status.region_id, - status.bytes_mutable, - status.write_buffer_size, - ); - - return Some(FlushType::Region); - } - - if self.should_flush_engine() { - return Some(FlushType::Engine); - } - - None - } - - fn reserve_mem(&self, mem: usize) { - if self.is_global_limit_enabled() { - let _ = self.memory_used.fetch_add(mem, Ordering::Relaxed); - let _ = self.memory_active.fetch_add(mem, Ordering::Relaxed); - } - } - - fn schedule_free_mem(&self, mem: usize) { - if self.is_global_limit_enabled() { - let _ = self.memory_active.fetch_sub(mem, Ordering::Relaxed); - } - } - - fn free_mem(&self, mem: usize) { - if self.is_global_limit_enabled() { - let _ = self.memory_used.fetch_sub(mem, Ordering::Relaxed); - } - } -} - -pub struct FlushJob { - /// Max memtable id in these memtables, - /// used to remove immutable memtables in current version. - pub max_memtable_id: MemtableId, - /// Memtables to be flushed. - pub memtables: Vec, - /// Last sequence of data to be flushed. - pub flush_sequence: SequenceNumber, - /// Shared data of region to be flushed. - pub shared: SharedDataRef, - /// Sst access layer of the region. - pub sst_layer: AccessLayerRef, - /// Region writer, used to persist log entry that points to the latest manifest file. - pub writer: RegionWriterRef, - /// Region write-ahead logging, used to write data/meta to the log file. - pub wal: Wal, - /// Region manifest service, used to persist metadata. - pub manifest: RegionManifest, - /// Storage engine config - pub engine_config: Arc, -} - -impl FlushJob { - /// Execute the flush job. - async fn run(&mut self) -> Result<()> { - let _timer = FLUSH_ELAPSED.start_timer(); - - let file_metas = self.write_memtables_to_layer().await?; - if file_metas.is_empty() { - // skip writing manifest and wal if no files are flushed. - return Ok(()); - } - self.write_manifest_and_apply(&file_metas).await?; - - Ok(()) - } - - async fn write_memtables_to_layer(&mut self) -> Result> { - let region_id = self.shared.id(); - let mut futures = Vec::with_capacity(self.memtables.len()); - let iter_ctx = IterContext { - // TODO(ruihang): dynamic row group size based on content (#412) - batch_size: WRITE_ROW_GROUP_SIZE, - // All sequences are visible by default. - ..Default::default() - }; - - for m in &self.memtables { - // skip empty memtable - if m.num_rows() == 0 { - continue; - } - - let file_id = FileId::random(); - // TODO(hl): Check if random file name already exists in meta. - let iter = m.iter(iter_ctx.clone())?; - let sst_layer = self.sst_layer.clone(); - let write_options = WriteOptions { - sst_write_buffer_size: ReadableSize::mb(8), // deprecated usage - }; - futures.push(async move { - Ok(sst_layer - .write_sst(file_id, Source::Iter(iter), &write_options) - .await? - .map( - |SstInfo { - time_range, - file_size, - .. - }| FileMeta { - region_id, - file_id, - time_range, - level: 0, - file_size, - }, - )) - }); - } - - let metas: Vec<_> = futures_util::future::try_join_all(futures) - .await? - .into_iter() - .flatten() - .collect(); - - let flush_bytes = metas.iter().map(|f| f.file_size).sum(); - - FLUSH_BYTES_TOTAL.inc_by(flush_bytes); - - let file_ids = metas.iter().map(|f| f.file_id).collect::>(); - logging::info!("Successfully flush memtables, region:{region_id}, files: {file_ids:?}"); - Ok(metas) - } - - async fn write_manifest_and_apply(&mut self, file_metas: &[FileMeta]) -> Result<()> { - let edit = RegionEdit { - region_version: self.shared.version_control.metadata().version(), - flushed_sequence: Some(self.flush_sequence), - files_to_add: file_metas.to_vec(), - files_to_remove: Vec::default(), - compaction_time_window: None, - }; - - self.writer - .write_edit_and_apply( - &self.wal, - &self.shared, - &self.manifest, - edit, - Some(self.max_memtable_id), - ) - .await?; - self.wal.obsolete(self.flush_sequence).await - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::memtable::AllocTracker; - - #[test] - fn test_get_mutable_limitation() { - assert_eq!(7, get_mutable_limitation(Some(8))); - assert_eq!(8, get_mutable_limitation(Some(10))); - assert_eq!(56, get_mutable_limitation(Some(64))); - assert_eq!(0, get_mutable_limitation(None)); - } - - #[test] - fn test_strategy_global_disabled() { - let strategy = SizeBasedStrategy::new(None); - strategy.reserve_mem(1000); - assert_eq!(0, strategy.memory_used.load(Ordering::Relaxed)); - assert_eq!(0, strategy.memory_active.load(Ordering::Relaxed)); - strategy.schedule_free_mem(1000); - assert_eq!(0, strategy.memory_used.load(Ordering::Relaxed)); - assert_eq!(0, strategy.memory_active.load(Ordering::Relaxed)); - strategy.free_mem(1000); - assert_eq!(0, strategy.memory_used.load(Ordering::Relaxed)); - assert_eq!(0, strategy.memory_active.load(Ordering::Relaxed)); - - let status = RegionStatus { - region_id: 1.into(), - bytes_mutable: 400, - write_buffer_size: 300, - }; - assert_eq!(Some(FlushType::Region), strategy.should_flush(status)); - let status = RegionStatus { - region_id: 1.into(), - bytes_mutable: 100, - write_buffer_size: 300, - }; - assert_eq!(None, strategy.should_flush(status)); - } - - #[test] - fn test_strategy_over_mutable_limit() { - let strategy = SizeBasedStrategy::new(Some(1000)); - strategy.reserve_mem(500); - let status = RegionStatus { - region_id: 1.into(), - bytes_mutable: 300, - write_buffer_size: 500, - }; - assert_eq!(None, strategy.should_flush(status)); - strategy.reserve_mem(400); - - // Flush region. - let status = RegionStatus { - region_id: 1.into(), - bytes_mutable: 400, - write_buffer_size: 300, - }; - assert_eq!(Some(FlushType::Region), strategy.should_flush(status)); - - // More than mutable limitation, Flush global. - let status = RegionStatus { - region_id: 1.into(), - bytes_mutable: 100, - write_buffer_size: 300, - }; - assert_eq!(Some(FlushType::Engine), strategy.should_flush(status)); - - strategy.schedule_free_mem(500); - assert_eq!(None, strategy.should_flush(status)); - assert_eq!(900, strategy.memory_used.load(Ordering::Relaxed)); - assert_eq!(400, strategy.memory_active.load(Ordering::Relaxed)); - - strategy.free_mem(500); - assert_eq!(400, strategy.memory_used.load(Ordering::Relaxed)); - assert_eq!(400, strategy.memory_active.load(Ordering::Relaxed)); - } - - #[test] - fn test_strategy_over_global() { - common_telemetry::init_default_ut_logging(); - - let strategy = SizeBasedStrategy::new(Some(1000)); - strategy.reserve_mem(1100); - strategy.schedule_free_mem(200); - // More than global limit. - let status = RegionStatus { - region_id: 1.into(), - bytes_mutable: 100, - write_buffer_size: 300, - }; - assert_eq!(Some(FlushType::Engine), strategy.should_flush(status)); - - // More than global limit, but mutable not enough (< 500). - strategy.schedule_free_mem(450); - let status = RegionStatus { - region_id: 1.into(), - bytes_mutable: 100, - write_buffer_size: 300, - }; - assert_eq!(None, strategy.should_flush(status)); - strategy.schedule_free_mem(100); - assert_eq!(None, strategy.should_flush(status)); - - // Now mutable is enough. - strategy.reserve_mem(150); - // We can flush again. - assert_eq!(Some(FlushType::Engine), strategy.should_flush(status)); - strategy.reserve_mem(100); - assert_eq!(Some(FlushType::Engine), strategy.should_flush(status)); - } - - #[test] - fn test_alloc_tracker_without_strategy() { - let tracker = AllocTracker::new(None); - assert_eq!(0, tracker.bytes_allocated()); - tracker.on_allocate(100); - assert_eq!(100, tracker.bytes_allocated()); - tracker.on_allocate(200); - assert_eq!(300, tracker.bytes_allocated()); - - tracker.done_allocating(); - assert_eq!(300, tracker.bytes_allocated()); - } - - #[test] - fn test_alloc_tracker_with_strategy() { - let strategy = Arc::new(SizeBasedStrategy::new(Some(1000))); - { - let tracker = AllocTracker::new(Some(strategy.clone() as FlushStrategyRef)); - - tracker.on_allocate(100); - assert_eq!(100, tracker.bytes_allocated()); - assert_eq!(100, strategy.memory_used.load(Ordering::Relaxed)); - assert_eq!(100, strategy.memory_active.load(Ordering::Relaxed)); - - for _ in 0..2 { - // Done allocating won't free the same memory multiple times. - tracker.done_allocating(); - assert_eq!(100, strategy.memory_used.load(Ordering::Relaxed)); - assert_eq!(0, strategy.memory_active.load(Ordering::Relaxed)); - } - } - - assert_eq!(0, strategy.memory_used.load(Ordering::Relaxed)); - assert_eq!(0, strategy.memory_active.load(Ordering::Relaxed)); - } - - #[test] - fn test_alloc_tracker_without_done_allocating() { - let strategy = Arc::new(SizeBasedStrategy::new(Some(1000))); - { - let tracker = AllocTracker::new(Some(strategy.clone() as FlushStrategyRef)); - - tracker.on_allocate(100); - assert_eq!(100, tracker.bytes_allocated()); - assert_eq!(100, strategy.memory_used.load(Ordering::Relaxed)); - assert_eq!(100, strategy.memory_active.load(Ordering::Relaxed)); - } - - assert_eq!(0, strategy.memory_used.load(Ordering::Relaxed)); - assert_eq!(0, strategy.memory_active.load(Ordering::Relaxed)); - } -} diff --git a/src/storage/src/flush/picker.rs b/src/storage/src/flush/picker.rs deleted file mode 100644 index ce3fd424c96e..000000000000 --- a/src/storage/src/flush/picker.rs +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::time::Duration; - -use async_trait::async_trait; -use common_telemetry::logging; -use common_time::util; -use store_api::logstore::LogStore; -use store_api::storage::{FlushContext, FlushReason, Region}; - -use crate::config::{DEFAULT_AUTO_FLUSH_INTERVAL, DEFAULT_PICKER_SCHEDULE_INTERVAL}; -use crate::region::RegionImpl; - -/// Config for [FlushPicker]. -pub struct PickerConfig { - /// Interval to schedule the picker. - pub schedule_interval: Duration, - /// Interval to auto flush a region if it has not flushed yet. - pub auto_flush_interval: Duration, -} - -impl PickerConfig { - /// Returns the auto flush interval in millis or a default value - /// if overflow occurs. - fn auto_flush_interval_millis(&self) -> i64 { - self.auto_flush_interval - .as_millis() - .try_into() - .unwrap_or(DEFAULT_AUTO_FLUSH_INTERVAL.into()) - } -} - -impl Default for PickerConfig { - fn default() -> Self { - PickerConfig { - schedule_interval: Duration::from_millis(DEFAULT_PICKER_SCHEDULE_INTERVAL.into()), - auto_flush_interval: Duration::from_millis(DEFAULT_AUTO_FLUSH_INTERVAL.into()), - } - } -} - -/// Flush task picker. -#[derive(Debug, Clone)] -pub struct FlushPicker { - /// Interval to flush a region automatically. - auto_flush_interval_millis: i64, -} - -impl FlushPicker { - /// Returns a new FlushPicker. - pub fn new(config: PickerConfig) -> FlushPicker { - FlushPicker { - auto_flush_interval_millis: config.auto_flush_interval_millis(), - } - } - - /// Picks regions and flushes them by interval. - /// - /// Returns the number of flushed regions. - pub async fn pick_by_interval(&self, regions: &[T]) -> usize { - let now = util::current_time_millis(); - // Flush regions by interval. - if let Some(earliest_flush_millis) = now.checked_sub(self.auto_flush_interval_millis) { - flush_regions_by_interval(regions, earliest_flush_millis).await - } else { - 0 - } - } - - /// Picks and flushes regions when the write buffer is full. - pub async fn pick_by_write_buffer_full(&self, regions: &[T]) { - // In such case, we pick the oldest region to flush. If this is not enough, - // the next time the region writer will trigger the picker again. Then we - // can pick another region to flush. The total memory will go down eventually. - let target = regions - .iter() - .filter(|region| region.mutable_memtable_usage() > 0) - .min_by_key(|region| region.last_flush_time()); - if let Some(region) = target { - logging::debug!( - "Request flush for region {} due to global buffer is full", - region.item_id() - ); - - region.request_flush(FlushReason::GlobalBufferFull).await; - } - } -} - -/// Item for picker to flush. -#[async_trait] -pub trait FlushItem { - /// Id of the item. - fn item_id(&self) -> u64; - - /// Last flush time in millis. - fn last_flush_time(&self) -> i64; - - /// Mutable memtable usage. - fn mutable_memtable_usage(&self) -> usize; - - /// Requests the item to schedule a flush for specific `reason`. - /// - /// The flush job itself should run in background. - async fn request_flush(&self, reason: FlushReason); -} - -#[async_trait] -impl FlushItem for RegionImpl { - fn item_id(&self) -> u64 { - self.id().into() - } - - fn last_flush_time(&self) -> i64 { - self.last_flush_millis() - } - - fn mutable_memtable_usage(&self) -> usize { - let current = self.version_control().current(); - let memtables = current.memtables(); - memtables.mutable_bytes_allocated() - } - - async fn request_flush(&self, reason: FlushReason) { - let ctx = FlushContext { - wait: false, - reason, - ..Default::default() - }; - - if let Err(e) = self.flush(&ctx).await { - logging::error!(e; "Failed to flush region {}", self.id()); - } - } -} - -/// Auto flush regions based on last flush time. -/// -/// Returns the number of flushed regions. -async fn flush_regions_by_interval( - regions: &[T], - earliest_flush_millis: i64, -) -> usize { - let mut flushed = 0; - for region in regions { - if region.last_flush_time() < earliest_flush_millis { - logging::debug!( - "Auto flush region {} due to last flush time ({} < {})", - region.item_id(), - region.last_flush_time(), - earliest_flush_millis, - ); - - flushed += 1; - region.request_flush(FlushReason::Periodically).await; - } - } - - flushed -} - -#[cfg(test)] -mod tests { - use std::sync::Mutex; - - use super::*; - - struct MockItem { - id: u64, - last_flush_time: i64, - usage: usize, - flush_reason: Mutex>, - } - - impl MockItem { - fn new(id: u64, last_flush_time: i64, usage: usize) -> MockItem { - MockItem { - id, - last_flush_time, - usage, - flush_reason: Mutex::new(None), - } - } - - fn flush_reason(&self) -> Option { - *self.flush_reason.lock().unwrap() - } - } - - #[async_trait] - impl FlushItem for MockItem { - fn item_id(&self) -> u64 { - self.id - } - - fn last_flush_time(&self) -> i64 { - self.last_flush_time - } - - fn mutable_memtable_usage(&self) -> usize { - self.usage - } - - async fn request_flush(&self, reason: FlushReason) { - let mut flush_reason = self.flush_reason.lock().unwrap(); - *flush_reason = Some(reason); - } - } - - #[tokio::test] - async fn test_pick_by_interval() { - let regions = [ - MockItem::new(0, util::current_time_millis(), 1), - MockItem::new(1, util::current_time_millis() - 60 * 1000, 1), - ]; - let picker = FlushPicker::new(PickerConfig { - // schedule_interval is unused in this test. - schedule_interval: Duration::from_millis(10), - auto_flush_interval: Duration::from_millis(30 * 1000), - }); - let flushed = picker.pick_by_interval(®ions).await; - assert_eq!(1, flushed); - assert!(regions[0].flush_reason().is_none()); - assert_eq!(Some(FlushReason::Periodically), regions[1].flush_reason()); - } - - #[tokio::test] - async fn test_pick_by_buffer_full() { - let regions = [ - MockItem::new(0, util::current_time_millis(), 10), - MockItem::new(1, util::current_time_millis() - 60 * 1000, 0), - MockItem::new(1, util::current_time_millis() - 60 * 1000, 10), - ]; - let picker = FlushPicker::new(PickerConfig { - schedule_interval: Duration::from_millis(10), - auto_flush_interval: Duration::from_millis(30 * 1000), - }); - picker.pick_by_write_buffer_full(®ions).await; - assert!(regions[0].flush_reason().is_none()); - assert!(regions[1].flush_reason().is_none()); - assert_eq!( - Some(FlushReason::GlobalBufferFull), - regions[2].flush_reason() - ); - - // No target. - let regions = [MockItem::new(1, util::current_time_millis(), 0)]; - picker.pick_by_write_buffer_full(®ions).await; - assert!(regions[0].flush_reason().is_none()); - } -} diff --git a/src/storage/src/flush/scheduler.rs b/src/storage/src/flush/scheduler.rs deleted file mode 100644 index 8d03ed6af68e..000000000000 --- a/src/storage/src/flush/scheduler.rs +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; -use std::time::Duration; - -use async_trait::async_trait; -use common_base::readable_size::ReadableSize; -use common_runtime::{RepeatedTask, TaskFunction}; -use common_telemetry::logging; -use snafu::{ensure, ResultExt}; -use store_api::logstore::LogStore; -use store_api::storage::{RegionId, SequenceNumber}; -use tokio::sync::oneshot::{Receiver, Sender}; -use tokio::sync::{oneshot, Notify}; - -use crate::compaction::{CompactionPickerRef, CompactionRequestImpl, CompactionSchedulerRef}; -use crate::config::EngineConfig; -use crate::engine::RegionMap; -use crate::error::{ - DuplicateFlushSnafu, Error, Result, StartPickTaskSnafu, StopPickTaskSnafu, WaitFlushSnafu, -}; -use crate::flush::{FlushJob, FlushPicker, PickerConfig}; -use crate::manifest::region::RegionManifest; -use crate::memtable::{MemtableId, MemtableRef}; -use crate::metrics::FLUSH_ERRORS_TOTAL; -use crate::region; -use crate::region::{RegionWriterRef, SharedDataRef}; -use crate::scheduler::rate_limit::BoxedRateLimitToken; -use crate::scheduler::{Handler, LocalScheduler, Request, Scheduler, SchedulerConfig}; -use crate::sst::AccessLayerRef; -use crate::wal::Wal; - -/// Key for [FlushRequest]. -#[derive(Debug, Clone, Hash, PartialEq, Eq)] -pub enum FlushKey { - Engine, - Region(RegionId, SequenceNumber), -} - -/// Flush request. -pub enum FlushRequest { - /// Flush the engine. - Engine, - /// Flush a region. - Region { - /// Region flush request. - req: FlushRegionRequest, - /// Flush result sender. - sender: Sender>, - }, -} - -impl Request for FlushRequest { - type Key = FlushKey; - - #[inline] - fn key(&self) -> FlushKey { - match &self { - FlushRequest::Engine => FlushKey::Engine, - FlushRequest::Region { req, .. } => { - FlushKey::Region(req.shared.id(), req.flush_sequence) - } - } - } - - fn complete(self, result: Result<()>) { - if let FlushRequest::Region { sender, .. } = self { - let _ = sender.send(result); - } - } -} - -/// Region flush request. -pub struct FlushRegionRequest { - /// Max memtable id in these memtables, - /// used to remove immutable memtables in current version. - pub max_memtable_id: MemtableId, - /// Memtables to be flushed. - pub memtables: Vec, - /// Last sequence of data to be flushed. - pub flush_sequence: SequenceNumber, - /// Shared data of region to be flushed. - pub shared: SharedDataRef, - /// Sst access layer of the region. - pub sst_layer: AccessLayerRef, - /// Region writer, used to persist log entry that points to the latest manifest file. - pub writer: RegionWriterRef, - /// Region write-ahead logging, used to write data/meta to the log file. - pub wal: Wal, - /// Region manifest service, used to persist metadata. - pub manifest: RegionManifest, - /// Storage engine config - pub engine_config: Arc, - - // Compaction related options: - /// TTL of the region. - pub ttl: Option, - /// Time window for compaction. - pub compaction_time_window: Option, - pub compaction_picker: CompactionPickerRef, -} - -impl FlushRegionRequest { - #[inline] - fn region_id(&self) -> RegionId { - self.shared.id() - } -} - -impl From<&FlushRegionRequest> for FlushJob { - fn from(req: &FlushRegionRequest) -> FlushJob { - FlushJob { - max_memtable_id: req.max_memtable_id, - memtables: req.memtables.clone(), - flush_sequence: req.flush_sequence, - shared: req.shared.clone(), - sst_layer: req.sst_layer.clone(), - writer: req.writer.clone(), - wal: req.wal.clone(), - manifest: req.manifest.clone(), - engine_config: req.engine_config.clone(), - } - } -} - -impl From<&FlushRegionRequest> for CompactionRequestImpl { - fn from(req: &FlushRegionRequest) -> CompactionRequestImpl { - CompactionRequestImpl { - region_id: req.region_id(), - sst_layer: req.sst_layer.clone(), - writer: req.writer.clone(), - shared: req.shared.clone(), - manifest: req.manifest.clone(), - wal: req.wal.clone(), - ttl: req.ttl, - compaction_time_window: req.compaction_time_window, - sender: None, - picker: req.compaction_picker.clone(), - sst_write_buffer_size: ReadableSize::mb(8), // deprecated usage - // compaction triggered by flush always reschedules - reschedule_on_finish: true, - } - } -} - -/// A handle to get the flush result. -#[derive(Debug)] -pub struct FlushHandle { - region_id: RegionId, - receiver: Receiver>, -} - -impl FlushHandle { - /// Waits until the flush job is finished. - pub async fn wait(self) -> Result<()> { - self.receiver.await.context(WaitFlushSnafu { - region_id: self.region_id, - })? - } -} - -/// Flush scheduler. -pub struct FlushScheduler { - /// Flush task scheduler. - scheduler: LocalScheduler>, - /// Auto flush task. - auto_flush_task: RepeatedTask, - #[cfg(test)] - pending_tasks: Arc>>>, -} - -pub type FlushSchedulerRef = Arc>; - -impl FlushScheduler { - /// Returns a new [FlushScheduler]. - pub fn new( - config: SchedulerConfig, - compaction_scheduler: CompactionSchedulerRef, - regions: Arc>, - picker_config: PickerConfig, - ) -> Result { - let task_interval = picker_config.schedule_interval; - let picker = FlushPicker::new(picker_config); - // Now we just clone the picker since we don't need to share states and - // the clone of picker is cheap. - let task_fn = AutoFlushFunction { - regions: regions.clone(), - picker: picker.clone(), - }; - let auto_flush_task = RepeatedTask::new(task_interval, Box::new(task_fn)); - auto_flush_task - .start(common_runtime::bg_runtime()) - .context(StartPickTaskSnafu)?; - #[cfg(test)] - let pending_tasks = Arc::new(tokio::sync::RwLock::new(vec![])); - let handler = FlushHandler { - compaction_scheduler, - regions, - picker, - #[cfg(test)] - pending_tasks: pending_tasks.clone(), - }; - - Ok(Self { - scheduler: LocalScheduler::new(config, handler), - auto_flush_task, - #[cfg(test)] - pending_tasks, - }) - } - - /// Schedules a region flush request and return the handle to the flush task. - pub fn schedule_region_flush(&self, req: FlushRegionRequest) -> Result { - let region_id = req.region_id(); - let sequence = req.flush_sequence; - let (sender, receiver) = oneshot::channel(); - - let scheduled = self - .scheduler - .schedule(FlushRequest::Region { req, sender })?; - // Normally we should not have duplicate flush request. - ensure!( - scheduled, - DuplicateFlushSnafu { - region_id, - sequence, - } - ); - - Ok(FlushHandle { - region_id, - receiver, - }) - } - - /// Schedules a engine flush request. - pub fn schedule_engine_flush(&self) -> Result<()> { - let _ = self.scheduler.schedule(FlushRequest::Engine)?; - Ok(()) - } - - /// Stop the scheduler. - pub async fn stop(&self) -> Result<()> { - self.auto_flush_task - .stop() - .await - .context(StopPickTaskSnafu)?; - self.scheduler.stop(true).await?; - - #[cfg(test)] - let _ = futures::future::join_all(self.pending_tasks.write().await.drain(..)).await; - - Ok(()) - } -} - -struct FlushHandler { - compaction_scheduler: CompactionSchedulerRef, - regions: Arc>, - picker: FlushPicker, - #[cfg(test)] - pending_tasks: Arc>>>, -} - -#[async_trait::async_trait] -impl Handler for FlushHandler { - type Request = FlushRequest; - - async fn handle_request( - &self, - req: FlushRequest, - token: BoxedRateLimitToken, - finish_notifier: Arc, - ) -> Result<()> { - let compaction_scheduler = self.compaction_scheduler.clone(); - let region_map = self.regions.clone(); - let picker = self.picker.clone(); - let _handle = common_runtime::spawn_bg(async move { - match req { - FlushRequest::Engine => { - let regions = region_map.list_regions(); - picker.pick_by_write_buffer_full(®ions).await; - } - FlushRequest::Region { req, sender } => { - execute_flush_region(req, sender, compaction_scheduler).await; - } - } - - // releases rate limit token - token.try_release(); - // notify scheduler to schedule next task when current task finishes. - finish_notifier.notify_one(); - }); - - #[cfg(test)] - self.pending_tasks.write().await.push(_handle); - Ok(()) - } -} - -async fn execute_flush_region( - req: FlushRegionRequest, - sender: Sender>, - compaction_scheduler: CompactionSchedulerRef, -) { - let mut flush_job = FlushJob::from(&req); - - if let Err(e) = flush_job.run().await { - logging::error!(e; "Failed to flush region {}", req.region_id()); - - FLUSH_ERRORS_TOTAL.inc(); - - FlushRequest::Region { req, sender }.complete(Err(e)); - } else { - logging::debug!("Successfully flush region: {}", req.region_id()); - - // Update last flush time. - req.shared.update_flush_millis(); - - let compaction_request = CompactionRequestImpl::from(&req); - let max_files_in_l0 = req.engine_config.max_files_in_l0; - let shared_data = req.shared.clone(); - - let level0_file_num = shared_data - .version_control - .current() - .ssts() - .level(0) - .file_num(); - if level0_file_num <= max_files_in_l0 { - logging::debug!( - "No enough SST files in level 0 (threshold: {}), skip compaction", - max_files_in_l0 - ); - } else { - // If flush is success, schedule a compaction request for this region. - let _ = - region::schedule_compaction(shared_data, compaction_scheduler, compaction_request); - } - - // Complete the request. - FlushRequest::Region { req, sender }.complete(Ok(())); - } -} - -/// Task function to pick regions to flush. -struct AutoFlushFunction { - /// Regions of the engine. - regions: Arc>, - picker: FlushPicker, -} - -#[async_trait] -impl TaskFunction for AutoFlushFunction { - async fn call(&mut self) -> Result<()> { - // Get all regions. - let regions = self.regions.list_regions(); - let _ = self.picker.pick_by_interval(®ions).await; - - Ok(()) - } - - fn name(&self) -> &str { - "FlushPicker-pick-task" - } -} diff --git a/src/storage/src/lib.rs b/src/storage/src/lib.rs deleted file mode 100644 index 743ff026228d..000000000000 --- a/src/storage/src/lib.rs +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Storage engine implementation. - -#![feature(let_chains)] - -mod chunk; -pub mod codec; -pub mod compaction; -pub mod config; -mod engine; -pub mod error; -mod flush; -pub mod manifest; -pub mod memtable; -pub mod metadata; -pub mod proto; -pub mod read; -pub mod region; -pub mod scheduler; -pub mod schema; -mod snapshot; -pub mod sst; -mod sync; -#[cfg(test)] -mod test_util; -mod version; -mod wal; -pub mod write_batch; - -pub use engine::EngineImpl; -mod file_purger; -mod metrics; -mod window_infer; - -pub use sst::parquet::ParquetWriter; -pub use sst::Source; diff --git a/src/storage/src/manifest.rs b/src/storage/src/manifest.rs deleted file mode 100644 index 30c49dc69b6d..000000000000 --- a/src/storage/src/manifest.rs +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! manifest storage -pub(crate) mod action; -pub mod checkpoint; -pub mod helper; -mod impl_; -pub mod region; -pub(crate) mod storage; -#[cfg(test)] -pub mod test_utils; - -pub use self::impl_::*; -pub use self::storage::manifest_compress_type; diff --git a/src/storage/src/manifest/action.rs b/src/storage/src/manifest/action.rs deleted file mode 100644 index 3158ee94aea9..000000000000 --- a/src/storage/src/manifest/action.rs +++ /dev/null @@ -1,443 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; -use std::io::{BufRead, BufReader}; - -use serde::{Deserialize, Serialize}; -use serde_json as json; -use snafu::{ensure, OptionExt, ResultExt}; -use store_api::manifest::action::{ProtocolAction, ProtocolVersion, VersionHeader}; -use store_api::manifest::{Checkpoint, ManifestVersion, MetaAction}; -use store_api::storage::{RegionId, SequenceNumber}; - -use crate::error::{ - self, DecodeJsonSnafu, DecodeMetaActionListSnafu, ManifestProtocolForbidReadSnafu, - ReadlineSnafu, Result, -}; -use crate::manifest::helper; -use crate::metadata::{ColumnFamilyMetadata, ColumnMetadata, VersionNumber}; -use crate::sst::{FileId, FileMeta}; - -/// Minimal data that could be used to persist and recover [RegionMetadata](crate::metadata::RegionMetadata). -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)] -pub struct RawRegionMetadata { - pub id: RegionId, - pub name: String, - pub columns: RawColumnsMetadata, - pub column_families: RawColumnFamiliesMetadata, - pub version: VersionNumber, -} - -/// Minimal data that could be used to persist and recover [ColumnsMetadata](crate::metadata::ColumnsMetadata). -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] -pub struct RawColumnsMetadata { - pub columns: Vec, - pub row_key_end: usize, - pub timestamp_key_index: usize, - pub user_column_end: usize, -} - -/// Minimal data that could be used to persist and recover [ColumnFamiliesMetadata](crate::metadata::ColumnFamiliesMetadata). -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] -pub struct RawColumnFamiliesMetadata { - pub column_families: Vec, -} - -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] -pub struct RegionChange { - /// The committed sequence of the region when this change happens. So the - /// data with sequence **greater than** this sequence would use the new - /// metadata. - pub committed_sequence: SequenceNumber, - /// The metadata after changed. - pub metadata: RawRegionMetadata, -} - -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] -pub struct RegionRemove { - pub region_id: RegionId, -} - -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] -pub struct RegionEdit { - pub region_version: VersionNumber, - pub flushed_sequence: Option, - pub files_to_add: Vec, - pub files_to_remove: Vec, - pub compaction_time_window: Option, -} - -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] -pub struct RegionTruncate { - pub region_id: RegionId, - pub committed_sequence: SequenceNumber, -} - -/// The region version checkpoint -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] -pub struct RegionVersion { - pub manifest_version: ManifestVersion, - pub flushed_sequence: Option, - pub files: HashMap, -} - -/// The region manifest data checkpoint -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq, Default)] -pub struct RegionManifestData { - pub committed_sequence: SequenceNumber, - pub metadata: RawRegionMetadata, - pub version: Option, -} - -#[derive(Debug, Default)] -pub struct RegionManifestDataBuilder { - committed_sequence: SequenceNumber, - metadata: RawRegionMetadata, - version: Option, -} - -impl RegionManifestDataBuilder { - pub fn with_checkpoint(checkpoint: Option) -> Self { - if let Some(s) = checkpoint { - Self { - metadata: s.metadata, - version: s.version, - committed_sequence: s.committed_sequence, - } - } else { - Default::default() - } - } - - pub fn apply_change(&mut self, change: RegionChange) { - self.metadata = change.metadata; - self.committed_sequence = change.committed_sequence; - } - - pub fn apply_edit(&mut self, manifest_version: ManifestVersion, edit: RegionEdit) { - if let Some(version) = &mut self.version { - version.manifest_version = manifest_version; - version.flushed_sequence = edit.flushed_sequence; - for file in edit.files_to_add { - let _ = version.files.insert(file.file_id, file); - } - for file in edit.files_to_remove { - let _ = version.files.remove(&file.file_id); - } - } else { - self.version = Some(RegionVersion { - manifest_version, - flushed_sequence: edit.flushed_sequence, - files: edit - .files_to_add - .into_iter() - .map(|f| (f.file_id, f)) - .collect(), - }); - } - } - pub fn build(self) -> RegionManifestData { - RegionManifestData { - metadata: self.metadata, - version: self.version, - committed_sequence: self.committed_sequence, - } - } -} - -// The checkpoint of region manifest, generated by checkpoint. -#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)] -pub struct RegionCheckpoint { - /// The snasphot protocol - pub protocol: ProtocolAction, - /// The last manifest version that this checkpoint compacts(inclusive). - pub last_version: ManifestVersion, - // The number of manifest actions that this checkpoint compacts. - pub compacted_actions: usize, - // The checkpoint data - pub checkpoint: Option, -} - -impl Checkpoint for RegionCheckpoint { - type Error = error::Error; - - fn set_protocol(&mut self, action: ProtocolAction) { - self.protocol = action; - } - - fn last_version(&self) -> ManifestVersion { - self.last_version - } - - fn encode(&self) -> Result> { - helper::encode_checkpoint(self) - } - - fn decode(bs: &[u8], reader_version: ProtocolVersion) -> Result { - helper::decode_checkpoint(bs, reader_version) - } -} - -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] -pub enum RegionMetaAction { - Protocol(ProtocolAction), - Change(RegionChange), - Remove(RegionRemove), - Edit(RegionEdit), - Truncate(RegionTruncate), -} - -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] -pub struct RegionMetaActionList { - pub actions: Vec, - pub prev_version: ManifestVersion, -} - -impl RegionMetaActionList { - pub fn with_action(action: RegionMetaAction) -> Self { - Self { - actions: vec![action], - prev_version: 0, - } - } - - pub fn new(actions: Vec) -> Self { - Self { - actions, - prev_version: 0, - } - } -} - -impl MetaAction for RegionMetaActionList { - type Error = error::Error; - - fn set_protocol(&mut self, action: ProtocolAction) { - // The protocol action should be the first action in action list by convention. - self.actions.insert(0, RegionMetaAction::Protocol(action)); - } - - fn set_prev_version(&mut self, version: ManifestVersion) { - self.prev_version = version; - } - - /// Encode self into json in the form of string lines, starts with prev_version and then action json list. - fn encode(&self) -> Result> { - helper::encode_actions(self.prev_version, &self.actions) - } - - fn decode( - bs: &[u8], - reader_version: ProtocolVersion, - ) -> Result<(Self, Option)> { - let mut lines = BufReader::new(bs).lines(); - - let mut action_list = RegionMetaActionList { - actions: Vec::default(), - prev_version: 0, - }; - - { - let first_line = lines - .next() - .with_context(|| DecodeMetaActionListSnafu { - msg: format!( - "Invalid content in manifest: {}", - std::str::from_utf8(bs).unwrap_or("**invalid bytes**") - ), - })? - .context(ReadlineSnafu)?; - - // Decode prev_version - let v: VersionHeader = json::from_str(&first_line).context(DecodeJsonSnafu)?; - action_list.prev_version = v.prev_version; - } - - // Decode actions - let mut protocol_action = None; - let mut actions = Vec::default(); - for line in lines { - let line = &line.context(ReadlineSnafu)?; - let action: RegionMetaAction = json::from_str(line).context(DecodeJsonSnafu)?; - - if let RegionMetaAction::Protocol(p) = &action { - ensure!( - p.is_readable(reader_version), - ManifestProtocolForbidReadSnafu { - min_version: p.min_reader_version, - supported_version: reader_version, - } - ); - protocol_action = Some(p.clone()); - } - - actions.push(action); - } - action_list.actions = actions; - - Ok((action_list, protocol_action)) - } -} - -#[cfg(test)] -mod tests { - use common_telemetry::logging; - use datatypes::type_id::LogicalTypeId; - - use super::*; - use crate::manifest::test_utils; - use crate::metadata::RegionMetadata; - use crate::sst::FileId; - use crate::test_util::descriptor_util::RegionDescBuilder; - - #[test] - fn test_encode_decode_action_list() { - common_telemetry::init_default_ut_logging(); - let mut protocol = ProtocolAction::new(); - protocol.min_reader_version = 1; - let mut action_list = RegionMetaActionList::new(vec![ - RegionMetaAction::Protocol(protocol.clone()), - RegionMetaAction::Edit(test_utils::build_region_edit( - 99, - &[FileId::random(), FileId::random()], - &[FileId::random()], - )), - ]); - action_list.set_prev_version(3); - - let bs = action_list.encode().unwrap(); - // {"prev_version":3} - // {"Protocol":{"min_reader_version":1,"min_writer_version":0}} - // {"Edit":{"region_version":0,"flush_sequence":99,"files_to_add":[{"file_name":"test1","level":1},{"file_name":"test2","level":2}],"files_to_remove":[{"file_name":"test0","level":0}]}} - - logging::debug!( - "Encoded action list: \r\n{}", - String::from_utf8(bs.clone()).unwrap() - ); - - let e = RegionMetaActionList::decode(&bs, 0); - assert!(e.is_err()); - assert_eq!( - "Manifest protocol forbid to read, min_version: 1, supported_version: 0", - format!("{}", e.err().unwrap()) - ); - - let (decode_list, p) = RegionMetaActionList::decode(&bs, 1).unwrap(); - assert_eq!(decode_list, action_list); - assert_eq!(p.unwrap(), protocol); - } - - // These tests are used to ensure backward compatibility of manifest files. - // DO NOT modify the serialized string when they fail, check if your - // modification to manifest-related structs is compatible with older manifests. - #[test] - fn test_region_manifest_compatibility() { - let region_edit = r#"{"region_version":0,"flushed_sequence":null,"files_to_add":[{"region_id":4402341478400,"file_name":"4b220a70-2b03-4641-9687-b65d94641208.parquet","time_range":[{"value":1451609210000,"unit":"Millisecond"},{"value":1451609520000,"unit":"Millisecond"}],"level":1}],"files_to_remove":[{"region_id":4402341478400,"file_name":"34b6ebb9-b8a5-4a4b-b744-56f67defad02.parquet","time_range":[{"value":1451609210000,"unit":"Millisecond"},{"value":1451609520000,"unit":"Millisecond"}],"level":0}]}"#; - let _ = serde_json::from_str::(region_edit).unwrap(); - - let region_change = r#" {"committed_sequence":42,"metadata":{"id":0,"name":"region-0","columns":{"columns":[{"cf_id":0,"desc":{"id":2,"name":"k1","data_type":{"Int32":{}},"is_nullable":false,"is_time_index":false,"default_constraint":null,"comment":""}},{"cf_id":0,"desc":{"id":1,"name":"timestamp","data_type":{"Timestamp":{"Millisecond":null}},"is_nullable":false,"is_time_index":true,"default_constraint":null,"comment":""}},{"cf_id":1,"desc":{"id":3,"name":"v1","data_type":{"Float32":{}},"is_nullable":true,"is_time_index":false,"default_constraint":null,"comment":""}},{"cf_id":1,"desc":{"id":2147483649,"name":"__sequence","data_type":{"UInt64":{}},"is_nullable":false,"is_time_index":false,"default_constraint":null,"comment":""}},{"cf_id":1,"desc":{"id":2147483650,"name":"__op_type","data_type":{"UInt8":{}},"is_nullable":false,"is_time_index":false,"default_constraint":null,"comment":""}}],"row_key_end":2,"timestamp_key_index":1,"enable_version_column":false,"user_column_end":3},"column_families":{"column_families":[{"name":"default","cf_id":1,"column_index_start":2,"column_index_end":3}]},"version":0}}"#; - let _ = serde_json::from_str::(region_change).unwrap(); - - let region_remove = r#"{"region_id":42}"#; - let _ = serde_json::from_str::(region_remove).unwrap(); - - let protocol_action = r#"{"min_reader_version":1,"min_writer_version":2}"#; - let _ = serde_json::from_str::(protocol_action).unwrap(); - } - - fn mock_file_meta() -> FileMeta { - FileMeta { - region_id: 0.into(), - file_id: FileId::random(), - time_range: None, - level: 0, - file_size: 1024, - } - } - - #[test] - fn test_region_manifest_builder() { - let desc = RegionDescBuilder::new("test_region_manifest_builder") - .push_field_column(("v0", LogicalTypeId::Int64, true)) - .build(); - let region_metadata: RegionMetadata = desc.try_into().unwrap(); - - let mut builder = RegionManifestDataBuilder::with_checkpoint(None); - - builder.apply_change(RegionChange { - committed_sequence: 42, - metadata: RawRegionMetadata::from(®ion_metadata), - }); - let files = vec![mock_file_meta(), mock_file_meta()]; - builder.apply_edit( - 84, - RegionEdit { - region_version: 0, - flushed_sequence: Some(99), - files_to_add: files.clone(), - files_to_remove: vec![], - compaction_time_window: None, - }, - ); - builder.apply_edit( - 85, - RegionEdit { - region_version: 0, - flushed_sequence: Some(100), - files_to_add: vec![], - files_to_remove: vec![files[0].clone()], - compaction_time_window: None, - }, - ); - - let manifest = builder.build(); - assert_eq!(manifest.metadata, RawRegionMetadata::from(®ion_metadata)); - assert_eq!(manifest.committed_sequence, 42); - assert_eq!( - manifest.version, - Some(RegionVersion { - manifest_version: 85, - flushed_sequence: Some(100), - files: files[1..].iter().map(|f| (f.file_id, f.clone())).collect(), - }) - ); - } - - #[test] - fn test_encode_decode_region_checkpoint() { - let region_checkpoint = RegionCheckpoint { - protocol: ProtocolAction::default(), - last_version: 42, - compacted_actions: 10, - checkpoint: Some(RegionManifestData { - committed_sequence: 100, - metadata: RawRegionMetadata::default(), - version: Some(RegionVersion { - manifest_version: 84, - flushed_sequence: Some(99), - files: vec![mock_file_meta(), mock_file_meta()] - .into_iter() - .map(|f| (f.file_id, f)) - .collect(), - }), - }), - }; - - let bytes = region_checkpoint.encode().unwrap(); - assert!(!bytes.is_empty()); - let decoded_checkpoint = RegionCheckpoint::decode(&bytes, 0).unwrap(); - assert_eq!(region_checkpoint, decoded_checkpoint); - } -} diff --git a/src/storage/src/manifest/checkpoint.rs b/src/storage/src/manifest/checkpoint.rs deleted file mode 100644 index 323e8bb92f3a..000000000000 --- a/src/storage/src/manifest/checkpoint.rs +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::any::Any; - -use async_trait::async_trait; -use store_api::manifest::{Checkpoint, MetaAction}; - -use crate::error::{Error, Result}; -use crate::manifest::ManifestImpl; - -#[async_trait] -pub trait Checkpointer: Send + Sync + std::fmt::Debug { - type Checkpoint: Checkpoint; - type MetaAction: MetaAction; - - /// Try to create a checkpoint, return the checkpoint if successes. - async fn do_checkpoint( - &self, - manifest: &ManifestImpl, - ) -> Result>; - - fn as_any(&self) -> &dyn Any; -} diff --git a/src/storage/src/manifest/helper.rs b/src/storage/src/manifest/helper.rs deleted file mode 100644 index 8130f60a8932..000000000000 --- a/src/storage/src/manifest/helper.rs +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::io::Write; - -use serde::Serialize; -use serde_json::to_writer; -use snafu::{ensure, ResultExt}; -use store_api::manifest::action::{ProtocolVersion, VersionHeader}; -use store_api::manifest::ManifestVersion; - -use crate::error::{ - DecodeJsonSnafu, EncodeJsonSnafu, ManifestProtocolForbidReadSnafu, Result, Utf8Snafu, -}; -use crate::manifest::action::RegionCheckpoint; - -pub const NEWLINE: &[u8] = b"\n"; - -pub fn encode_actions( - prev_version: ManifestVersion, - actions: &[T], -) -> Result> { - let mut bytes = Vec::default(); - { - // Encode prev_version - let v = VersionHeader { prev_version }; - - to_writer(&mut bytes, &v).context(EncodeJsonSnafu)?; - // unwrap is fine here, because we write into a buffer. - bytes.write_all(NEWLINE).unwrap(); - } - - for action in actions { - to_writer(&mut bytes, action).context(EncodeJsonSnafu)?; - bytes.write_all(NEWLINE).unwrap(); - } - - Ok(bytes) -} - -pub fn encode_checkpoint(snasphot: &RegionCheckpoint) -> Result> { - let s = serde_json::to_string(snasphot).context(EncodeJsonSnafu)?; - Ok(s.into_bytes()) -} - -pub fn decode_checkpoint(bs: &[u8], reader_version: ProtocolVersion) -> Result { - let s = std::str::from_utf8(bs).context(Utf8Snafu)?; - let checkpoint: RegionCheckpoint = serde_json::from_str(s).context(DecodeJsonSnafu)?; - ensure!( - checkpoint.protocol.is_readable(reader_version), - ManifestProtocolForbidReadSnafu { - min_version: checkpoint.protocol.min_reader_version, - supported_version: reader_version, - } - ); - - Ok(checkpoint) -} diff --git a/src/storage/src/manifest/impl_.rs b/src/storage/src/manifest/impl_.rs deleted file mode 100644 index 835b03feed3e..000000000000 --- a/src/storage/src/manifest/impl_.rs +++ /dev/null @@ -1,405 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::marker::PhantomData; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::Arc; -use std::time::Duration; - -use arc_swap::ArcSwap; -use async_trait::async_trait; -use common_datasource::compression::CompressionType; -use common_runtime::{RepeatedTask, TaskFunction}; -use common_telemetry::{debug, logging, warn}; -use object_store::ObjectStore; -use snafu::{ensure, ResultExt}; -use store_api::manifest::action::{self, ProtocolAction, ProtocolVersion}; -use store_api::manifest::*; - -use crate::error::{ - Error, ManifestProtocolForbidWriteSnafu, Result, StartManifestGcTaskSnafu, - StopManifestGcTaskSnafu, -}; -use crate::manifest::action::RegionCheckpoint; -use crate::manifest::checkpoint::Checkpointer; -use crate::manifest::storage::{ManifestObjectStore, ObjectStoreLogIterator}; - -const CHECKPOINT_ACTIONS_MARGIN: u16 = 10; -const GC_DURATION_SECS: u64 = 600; - -#[derive(Clone, Debug)] -pub struct ManifestImpl, M: MetaAction> { - inner: Arc>, - checkpointer: Option>>, - last_checkpoint_version: Arc, - checkpoint_actions_margin: u16, - gc_task: Option>>, -} - -impl, M: 'static + MetaAction> - ManifestImpl -{ - pub fn new( - manifest_dir: &str, - object_store: ObjectStore, - compress_type: CompressionType, - checkpoint_actions_margin: Option, - gc_duration: Option, - checkpointer: Option>>, - ) -> Self { - let inner = Arc::new(ManifestImplInner::new( - manifest_dir, - object_store, - compress_type, - )); - let gc_task = if checkpointer.is_some() { - // only start gc task when checkpoint is enabled. - Some(Arc::new(RepeatedTask::new( - gc_duration.unwrap_or_else(|| Duration::from_secs(GC_DURATION_SECS)), - Box::new(ManifestGcTask { - inner: inner.clone(), - }), - ))) - } else { - None - }; - - ManifestImpl { - inner, - checkpointer, - checkpoint_actions_margin: checkpoint_actions_margin - .unwrap_or(CHECKPOINT_ACTIONS_MARGIN), - last_checkpoint_version: Arc::new(AtomicU64::new(MIN_VERSION)), - gc_task, - } - } - - pub fn create( - manifest_dir: &str, - object_store: ObjectStore, - compress_type: CompressionType, - ) -> Self { - Self::new(manifest_dir, object_store, compress_type, None, None, None) - } - - #[inline] - pub(crate) fn checkpointer( - &self, - ) -> &Option>> { - &self.checkpointer - } - - #[inline] - pub(crate) fn set_last_checkpoint_version(&self, version: ManifestVersion) { - self.last_checkpoint_version - .store(version, Ordering::Relaxed); - } - - /// Update inner state. - pub fn update_state(&self, version: ManifestVersion, protocol: Option) { - self.inner.update_state(version, protocol); - } - - pub(crate) async fn save_checkpoint(&self, checkpoint: &RegionCheckpoint) -> Result<()> { - ensure!( - checkpoint - .protocol - .is_writable(self.inner.supported_writer_version), - ManifestProtocolForbidWriteSnafu { - min_version: checkpoint.protocol.min_writer_version, - supported_version: self.inner.supported_writer_version, - } - ); - let bytes = checkpoint.encode()?; - self.manifest_store() - .save_checkpoint(checkpoint.last_version, &bytes) - .await - } - - pub(crate) async fn may_do_checkpoint(&self, version: ManifestVersion) -> Result<()> { - if version - self.last_checkpoint_version.load(Ordering::Relaxed) - >= self.checkpoint_actions_margin as u64 - { - let s = self.do_checkpoint().await?; - debug!("Manifest checkpoint, checkpoint: {:#?}", s); - } - - Ok(()) - } - - #[inline] - pub(crate) fn manifest_store(&self) -> &Arc { - self.inner.manifest_store() - } -} - -#[async_trait] -impl, M: 'static + MetaAction> Manifest - for ManifestImpl -{ - type Error = Error; - type Checkpoint = S; - type MetaAction = M; - type MetaActionIterator = MetaActionIteratorImpl; - - async fn update(&self, action_list: M) -> Result { - let version = self.inner.save(action_list).await?; - - self.may_do_checkpoint(version).await?; - Ok(version) - } - - async fn scan( - &self, - start: ManifestVersion, - end: ManifestVersion, - ) -> Result { - self.inner.scan(start, end).await - } - - async fn do_checkpoint(&self) -> Result> { - if let Some(cp) = &self.checkpointer { - let checkpoint = cp.do_checkpoint(self).await?; - if let Some(checkpoint) = &checkpoint { - self.set_last_checkpoint_version(checkpoint.last_version()); - } - return Ok(checkpoint); - } - - Ok(None) - } - - async fn last_checkpoint(&self) -> Result> { - self.inner.last_checkpoint().await - } - - fn last_version(&self) -> ManifestVersion { - self.inner.last_version() - } - - async fn start(&self) -> Result<()> { - if let Some(task) = &self.gc_task { - task.start(common_runtime::bg_runtime()) - .context(StartManifestGcTaskSnafu)?; - } - - Ok(()) - } - - async fn stop(&self) -> Result<()> { - if let Some(task) = &self.gc_task { - task.stop().await.context(StopManifestGcTaskSnafu)?; - } - - Ok(()) - } -} - -#[derive(Debug)] -struct ManifestImplInner, M: MetaAction> { - store: Arc, - version: AtomicU64, - /// Current using protocol - protocol: ArcSwap, - /// Current node supported protocols (reader_version, writer_version) - supported_reader_version: ProtocolVersion, - supported_writer_version: ProtocolVersion, - _phantom: PhantomData<(S, M)>, -} - -pub struct MetaActionIteratorImpl> { - log_iter: ObjectStoreLogIterator, - reader_version: ProtocolVersion, - last_protocol: Option, - _phantom: PhantomData, -} - -impl> MetaActionIteratorImpl { - pub fn last_protocol(&self) -> &Option { - &self.last_protocol - } -} - -#[async_trait] -impl> MetaActionIterator for MetaActionIteratorImpl { - type Error = Error; - type MetaAction = M; - - async fn next_action(&mut self) -> Result> { - match self.log_iter.next_log().await? { - Some((v, bytes)) => { - let (action_list, protocol) = M::decode(&bytes, self.reader_version)?; - - if protocol.is_some() { - self.last_protocol = protocol; - } - - Ok(Some((v, action_list))) - } - None => Ok(None), - } - } -} - -struct ManifestGcTask, M: MetaAction> { - inner: Arc>, -} - -#[async_trait::async_trait] -impl, M: MetaAction> TaskFunction - for ManifestGcTask -{ - fn name(&self) -> &str { - "region-manifest-gc" - } - - async fn call(&mut self) -> Result<()> { - if let Some((last_version, _)) = self.inner.store.load_last_checkpoint().await? { - // Purge all manifest <= last_version and checkpoint files < last_version. - let deleted = self - .inner - .store - .delete_until(last_version + 1, true) - .await?; - debug!( - "Deleted {} logs from region manifest storage(path={}), last_version: {}.", - deleted, - self.inner.store.path(), - last_version, - ); - } - - Ok(()) - } -} - -impl, M: MetaAction> ManifestImplInner { - fn new(manifest_dir: &str, object_store: ObjectStore, compress_type: CompressionType) -> Self { - let (reader_version, writer_version) = action::supported_protocol_version(); - - Self { - store: Arc::new(ManifestObjectStore::new( - manifest_dir, - object_store, - compress_type, - )), - version: AtomicU64::new(0), - protocol: ArcSwap::new(Arc::new(ProtocolAction::new())), - supported_reader_version: reader_version, - supported_writer_version: writer_version, - _phantom: PhantomData, - } - } - - #[inline] - fn manifest_store(&self) -> &Arc { - &self.store - } - - #[inline] - fn inc_version(&self) -> ManifestVersion { - self.version.fetch_add(1, Ordering::Relaxed) - } - - fn update_state(&self, version: ManifestVersion, protocol: Option) { - self.version.store(version, Ordering::Relaxed); - if let Some(p) = protocol { - self.protocol.store(Arc::new(p)); - } - } - - #[inline] - fn last_version(&self) -> ManifestVersion { - self.version.load(Ordering::Relaxed) - } - - async fn save(&self, mut action_list: M) -> Result { - let protocol = self.protocol.load(); - - ensure!( - protocol.is_writable(self.supported_writer_version), - ManifestProtocolForbidWriteSnafu { - min_version: protocol.min_writer_version, - supported_version: self.supported_writer_version, - } - ); - - let version = self.inc_version(); - - if version == 0 || protocol.min_writer_version < self.supported_writer_version { - let new_protocol = ProtocolAction { - min_reader_version: self.supported_reader_version, - min_writer_version: self.supported_writer_version, - }; - action_list.set_protocol(new_protocol.clone()); - - logging::info!( - "Updated manifest protocol from {} to {}.", - protocol, - new_protocol - ); - - self.protocol.store(Arc::new(new_protocol)); - } - - logging::debug!( - "Save region metadata action: {:?}, version: {}", - action_list, - version - ); - - self.store.save(version, &action_list.encode()?).await?; - - Ok(version) - } - - async fn scan( - &self, - start: ManifestVersion, - end: ManifestVersion, - ) -> Result> { - Ok(MetaActionIteratorImpl { - log_iter: self.store.scan(start, end).await?, - reader_version: self.supported_reader_version, - last_protocol: None, - _phantom: PhantomData, - }) - } - - async fn last_checkpoint(&self) -> Result> { - let protocol = self.protocol.load(); - let last_checkpoint = self.store.load_last_checkpoint().await?; - - if let Some((version, bytes)) = last_checkpoint { - let checkpoint = S::decode(&bytes, protocol.min_reader_version)?; - assert!(checkpoint.last_version() >= version); - if checkpoint.last_version() > version { - // It happens when saving checkpoint successfully, but failed at saving checkpoint metadata(the "__last_checkpoint" file). - // Then we try to use the old checkpoint and do the checkpoint next time. - // If the old checkpoint was deleted, it's fine that we return the latest checkpoint. - // The only side effect is leaving some unused checkpoint files, - // and they will be purged by gc task. - warn!("The checkpoint manifest version {} in {} is greater than checkpoint metadata version {}.", self.store.path(), checkpoint.last_version(), version); - - if let Some((_, bytes)) = self.store.load_checkpoint(version).await? { - let old_checkpoint = S::decode(&bytes, protocol.min_reader_version)?; - return Ok(Some(old_checkpoint)); - } - } - Ok(Some(checkpoint)) - } else { - Ok(None) - } - } -} diff --git a/src/storage/src/manifest/region.rs b/src/storage/src/manifest/region.rs deleted file mode 100644 index fa1b9de06ad8..000000000000 --- a/src/storage/src/manifest/region.rs +++ /dev/null @@ -1,690 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Region manifest impl -use std::any::Any; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::Arc; -use std::time::Duration; - -use async_trait::async_trait; -use common_datasource::compression::CompressionType; -use common_telemetry::{info, warn}; -use object_store::ObjectStore; -use store_api::manifest::action::ProtocolAction; -use store_api::manifest::{ - Manifest, ManifestLogStorage, ManifestVersion, MetaActionIterator, MIN_VERSION, -}; - -use crate::error::{ManifestCheckpointSnafu, Result}; -use crate::manifest::action::*; -use crate::manifest::checkpoint::Checkpointer; -use crate::manifest::ManifestImpl; - -pub type RegionManifest = ManifestImpl; - -#[derive(Debug)] -pub struct RegionManifestCheckpointer { - // The latest manifest version when flushing memtables. - // Checkpoint can't exceed over flushed manifest version because we have to keep - // the region metadata for replaying WAL to ensure correct data schema. - flushed_manifest_version: AtomicU64, -} - -impl RegionManifestCheckpointer { - pub(crate) fn set_flushed_manifest_version(&self, manifest_version: ManifestVersion) { - let current = self.flushed_manifest_version.load(Ordering::Relaxed); - - self.flushed_manifest_version - .store(current.max(manifest_version), Ordering::Relaxed); - } -} - -#[async_trait] -impl Checkpointer for RegionManifestCheckpointer { - type Checkpoint = RegionCheckpoint; - type MetaAction = RegionMetaActionList; - - async fn do_checkpoint( - &self, - manifest: &ManifestImpl, - ) -> Result> { - let last_checkpoint = manifest.last_checkpoint().await?; - - let current_version = manifest.last_version(); - let (start_version, mut protocol, mut manifest_builder) = - if let Some(checkpoint) = last_checkpoint { - ( - checkpoint.last_version + 1, - checkpoint.protocol, - RegionManifestDataBuilder::with_checkpoint(checkpoint.checkpoint), - ) - } else { - ( - MIN_VERSION, - ProtocolAction::default(), - RegionManifestDataBuilder::default(), - ) - }; - - let end_version = - current_version.min(self.flushed_manifest_version.load(Ordering::Relaxed)) + 1; - if start_version >= end_version { - return Ok(None); - } - - info!("Begin to do region manifest checkpoint, path: {}, start_version: {}, end_version: {}, flushed_manifest_version: {}", - manifest.manifest_store().path(), - start_version, - end_version, - self.flushed_manifest_version.load(Ordering::Relaxed)); - - let mut iter = manifest.scan(start_version, end_version).await?; - - let mut last_version = start_version; - let mut compacted_actions = 0; - while let Some((version, action_list)) = iter.next_action().await? { - for action in action_list.actions { - match action { - RegionMetaAction::Change(c) => manifest_builder.apply_change(c), - RegionMetaAction::Edit(e) => manifest_builder.apply_edit(version, e), - RegionMetaAction::Protocol(p) => protocol = p, - action => { - return ManifestCheckpointSnafu { - msg: format!("can't apply region action: {:?}", action), - } - .fail(); - } - } - } - last_version = version; - compacted_actions += 1; - } - - if compacted_actions == 0 { - return Ok(None); - } - - let region_manifest = manifest_builder.build(); - let checkpoint = RegionCheckpoint { - protocol, - last_version, - compacted_actions, - checkpoint: Some(region_manifest), - }; - - manifest.save_checkpoint(&checkpoint).await?; - if let Err(e) = manifest - .manifest_store() - .delete(start_version, last_version + 1) - .await - { - // We only log when the error kind isn't `NotFound` - if !e.is_object_to_delete_not_found() { - // It doesn't matter when deletion fails, they will be purged by gc task. - warn!( - "Failed to delete manifest logs [{},{}] in path: {}. err: {}", - start_version, - last_version, - manifest.manifest_store().path(), - e - ); - } - } - - info!("Region manifest checkpoint, path: {}, start_version: {}, last_version: {}, compacted actions: {}", - manifest.manifest_store().path(), - start_version, - last_version, - compacted_actions); - - Ok(Some(checkpoint)) - } - - fn as_any(&self) -> &dyn Any { - self - } -} - -impl RegionManifest { - pub fn with_checkpointer( - manifest_dir: &str, - object_store: ObjectStore, - compress_type: CompressionType, - checkpoint_actions_margin: Option, - gc_duration: Option, - ) -> Self { - Self::new( - manifest_dir, - object_store, - compress_type, - checkpoint_actions_margin, - gc_duration, - Some(Arc::new(RegionManifestCheckpointer { - flushed_manifest_version: AtomicU64::new(0), - })), - ) - } - - // Update flushed manifest version in checkpointer - pub fn set_flushed_manifest_version(&self, manifest_version: ManifestVersion) { - if let Some(checkpointer) = self.checkpointer() { - if let Some(checkpointer) = checkpointer - .as_any() - .downcast_ref::() - { - checkpointer.set_flushed_manifest_version(manifest_version); - } - } - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use common_test_util::temp_dir::create_temp_dir; - use object_store::services::{Fs, S3}; - use object_store::test_util::{s3_test_config, TempFolder}; - use object_store::ObjectStore; - use store_api::manifest::action::ProtocolAction; - use store_api::manifest::{Manifest, MetaActionIterator, MAX_VERSION}; - - use super::*; - use crate::manifest::manifest_compress_type; - use crate::manifest::test_utils::*; - use crate::metadata::RegionMetadata; - use crate::sst::FileId; - - #[tokio::test] - async fn test_fs_region_manifest_compress() { - let manifest = new_fs_manifest(true, None).await; - test_region_manifest(&manifest).await - } - - #[tokio::test] - async fn test_fs_region_manifest_uncompress() { - let manifest = new_fs_manifest(false, None).await; - test_region_manifest(&manifest).await - } - - #[tokio::test] - async fn test_s3_region_manifest_compress() { - if s3_test_config().is_some() { - let (manifest, temp_dir) = new_s3_manifest(true, None).await; - test_region_manifest(&manifest).await; - temp_dir.remove_all().await.unwrap(); - } - } - - #[tokio::test] - async fn test_s3_region_manifest_uncompress() { - if s3_test_config().is_some() { - let (manifest, temp_dir) = new_s3_manifest(false, None).await; - test_region_manifest(&manifest).await; - temp_dir.remove_all().await.unwrap(); - } - } - - async fn new_fs_manifest(compress: bool, gc_duration: Option) -> RegionManifest { - let tmp_dir = create_temp_dir("test_region_manifest"); - let mut builder = Fs::default(); - let _ = builder.root(&tmp_dir.path().to_string_lossy()); - let object_store = ObjectStore::new(builder).unwrap().finish(); - - let manifest = RegionManifest::with_checkpointer( - "/manifest/", - object_store, - manifest_compress_type(compress), - None, - gc_duration, - ); - manifest.start().await.unwrap(); - manifest - } - - async fn new_s3_manifest( - compress: bool, - gc_duration: Option, - ) -> (RegionManifest, TempFolder) { - let s3_config = s3_test_config().unwrap(); - let mut builder = S3::default(); - let _ = builder - .root(&s3_config.root) - .access_key_id(&s3_config.access_key_id) - .secret_access_key(&s3_config.secret_access_key) - .bucket(&s3_config.bucket); - - if s3_config.region.is_some() { - let _ = builder.region(s3_config.region.as_ref().unwrap()); - } - let store = ObjectStore::new(builder).unwrap().finish(); - let temp_folder = TempFolder::new(&store, "/"); - let manifest = RegionManifest::with_checkpointer( - "/manifest/", - store, - manifest_compress_type(compress), - None, - gc_duration, - ); - manifest.start().await.unwrap(); - - (manifest, temp_folder) - } - - async fn test_region_manifest(manifest: &RegionManifest) { - common_telemetry::init_default_ut_logging(); - - let region_meta = Arc::new(build_region_meta()); - - assert_eq!( - None, - manifest - .scan(0, MAX_VERSION) - .await - .unwrap() - .next_action() - .await - .unwrap() - ); - - assert!(manifest - .update(RegionMetaActionList::with_action(RegionMetaAction::Change( - RegionChange { - metadata: region_meta.as_ref().into(), - committed_sequence: 99, - }, - ))) - .await - .is_ok()); - - let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap(); - - let (v, action_list) = iter.next_action().await.unwrap().unwrap(); - assert_eq!(0, v); - assert_eq!(2, action_list.actions.len()); - let protocol = &action_list.actions[0]; - assert!(matches!( - protocol, - RegionMetaAction::Protocol(ProtocolAction { .. }) - )); - - let action = &action_list.actions[1]; - - match action { - RegionMetaAction::Change(c) => { - assert_eq!( - RegionMetadata::try_from(c.metadata.clone()).unwrap(), - *region_meta - ); - assert_eq!(c.committed_sequence, 99); - } - _ => unreachable!(), - } - - // Save some actions - assert!(manifest - .update(RegionMetaActionList::new(vec![ - RegionMetaAction::Edit(build_region_edit(1, &[FileId::random()], &[])), - RegionMetaAction::Edit(build_region_edit( - 2, - &[FileId::random(), FileId::random()], - &[], - )), - ])) - .await - .is_ok()); - - let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap(); - let (v, action_list) = iter.next_action().await.unwrap().unwrap(); - assert_eq!(0, v); - assert_eq!(2, action_list.actions.len()); - let protocol = &action_list.actions[0]; - assert!(matches!( - protocol, - RegionMetaAction::Protocol(ProtocolAction { .. }) - )); - - let action = &action_list.actions[1]; - match action { - RegionMetaAction::Change(c) => { - assert_eq!( - RegionMetadata::try_from(c.metadata.clone()).unwrap(), - *region_meta - ); - assert_eq!(c.committed_sequence, 99); - } - _ => unreachable!(), - } - - let (v, action_list) = iter.next_action().await.unwrap().unwrap(); - assert_eq!(1, v); - assert_eq!(2, action_list.actions.len()); - assert!(matches!(&action_list.actions[0], RegionMetaAction::Edit(_))); - assert!(matches!(&action_list.actions[1], RegionMetaAction::Edit(_))); - - // Reach end - assert!(iter.next_action().await.unwrap().is_none()); - - manifest.stop().await.unwrap(); - } - - async fn assert_scan(manifest: &RegionManifest, start_version: ManifestVersion, expected: u64) { - let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap(); - let mut actions = 0; - while let Some((v, _)) = iter.next_action().await.unwrap() { - assert_eq!(v, start_version + actions); - actions += 1; - } - assert_eq!(expected, actions); - } - - #[tokio::test(flavor = "multi_thread")] - async fn test_fs_region_manifest_checkpoint_compress() { - let duration = Duration::from_millis(50); - let manifest = new_fs_manifest(true, Some(duration)).await; - - test_region_manifest_checkpoint(&manifest, duration).await - } - - #[tokio::test] - async fn test_fs_region_manifest_checkpoint_uncompress() { - let duration = Duration::from_millis(50); - let manifest = new_fs_manifest(false, Some(duration)).await; - - test_region_manifest_checkpoint(&manifest, duration).await - } - - #[tokio::test] - async fn test_s3_region_manifest_checkpoint_compress() { - if s3_test_config().is_some() { - let duration = Duration::from_millis(50); - let (manifest, temp_dir) = new_s3_manifest(true, Some(duration)).await; - - test_region_manifest_checkpoint(&manifest, duration).await; - temp_dir.remove_all().await.unwrap(); - } - } - - #[tokio::test] - async fn test_s3_region_manifest_checkpoint_uncompress() { - if s3_test_config().is_some() { - let duration = Duration::from_millis(50); - let (manifest, temp_dir) = new_s3_manifest(false, Some(duration)).await; - - test_region_manifest_checkpoint(&manifest, duration).await; - temp_dir.remove_all().await.unwrap(); - } - } - - async fn test_region_manifest_checkpoint( - manifest: &RegionManifest, - test_gc_duration: Duration, - ) { - common_telemetry::init_default_ut_logging(); - - let region_meta = Arc::new(build_region_meta()); - let new_region_meta = Arc::new(build_altered_region_meta()); - - let file = FileId::random(); - let file_ids = vec![FileId::random(), FileId::random()]; - - let actions: Vec = vec![ - RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange { - metadata: region_meta.as_ref().into(), - committed_sequence: 1, - })), - RegionMetaActionList::new(vec![ - RegionMetaAction::Edit(build_region_edit(2, &[file], &[])), - RegionMetaAction::Edit(build_region_edit(3, &file_ids, &[file])), - ]), - RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange { - metadata: new_region_meta.as_ref().into(), - committed_sequence: 99, - })), - ]; - - for action in actions { - let _ = manifest.update(action).await.unwrap(); - } - assert!(manifest.last_checkpoint().await.unwrap().is_none()); - assert_scan(manifest, 0, 3).await; - // update flushed manifest version for doing checkpoint - manifest.set_flushed_manifest_version(2); - - let mut checkpoint_versions = vec![]; - - // do a checkpoint - let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap(); - let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap(); - assert_eq!(checkpoint, last_checkpoint); - assert_eq!(checkpoint.compacted_actions, 3); - assert_eq!(checkpoint.last_version, 2); - checkpoint_versions.push(2); - let alterd_raw_meta = RawRegionMetadata::from(new_region_meta.as_ref()); - assert!(matches!(&checkpoint.checkpoint, Some(RegionManifestData { - committed_sequence: 99, - metadata, - version: Some(RegionVersion { - manifest_version: 1, - flushed_sequence: Some(3), - files, - }), - }) if files.len() == 2 && - files.contains_key(&file_ids[0]) && - files.contains_key(&file_ids[1]) && - *metadata == alterd_raw_meta)); - // all actions were compacted - assert_eq!( - None, - manifest - .scan(0, MAX_VERSION) - .await - .unwrap() - .next_action() - .await - .unwrap() - ); - - assert!(manifest.do_checkpoint().await.unwrap().is_none()); - let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap(); - assert_eq!(checkpoint, last_checkpoint); - - // add new actions - let new_file = FileId::random(); - let actions: Vec = vec![ - RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange { - metadata: region_meta.as_ref().into(), - committed_sequence: 200, - })), - RegionMetaActionList::new(vec![RegionMetaAction::Edit(build_region_edit( - 201, - &[new_file], - &file_ids, - ))]), - ]; - for action in actions { - let _ = manifest.update(action).await.unwrap(); - } - - assert_scan(manifest, 3, 2).await; - - // do another checkpoints - // compacted RegionChange - manifest.set_flushed_manifest_version(3); - let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap(); - let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap(); - assert_eq!(checkpoint, last_checkpoint); - assert_eq!(checkpoint.compacted_actions, 1); - assert_eq!(checkpoint.last_version, 3); - checkpoint_versions.push(3); - assert!(matches!(&checkpoint.checkpoint, Some(RegionManifestData { - committed_sequence: 200, - metadata, - version: Some(RegionVersion { - manifest_version: 1, - flushed_sequence: Some(3), - files, - }), - }) if files.len() == 2 && - files.contains_key(&file_ids[0]) && - files.contains_key(&file_ids[1]) && - *metadata == RawRegionMetadata::from(region_meta.as_ref()))); - - assert_scan(manifest, 4, 1).await; - // compacted RegionEdit - manifest.set_flushed_manifest_version(4); - let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap(); - let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap(); - assert_eq!(checkpoint, last_checkpoint); - assert_eq!(checkpoint.compacted_actions, 1); - assert_eq!(checkpoint.last_version, 4); - checkpoint_versions.push(4); - assert!(matches!(&checkpoint.checkpoint, Some(RegionManifestData { - committed_sequence: 200, - metadata, - version: Some(RegionVersion { - manifest_version: 4, - flushed_sequence: Some(201), - files, - }), - }) if files.len() == 1 && - files.contains_key(&new_file) && - *metadata == RawRegionMetadata::from(region_meta.as_ref()))); - - // all actions were compacted - assert_eq!( - None, - manifest - .scan(0, MAX_VERSION) - .await - .unwrap() - .next_action() - .await - .unwrap() - ); - - // wait for gc - tokio::time::sleep(test_gc_duration * 3).await; - - for v in checkpoint_versions { - if v < 4 { - // ensure old checkpoints were purged. - assert!(manifest - .manifest_store() - .load_checkpoint(v) - .await - .unwrap() - .is_none()); - } else { - // the last checkpoints is still exists. - let last_checkpoint = manifest.last_checkpoint().await.unwrap().unwrap(); - assert_eq!(checkpoint, last_checkpoint); - } - } - - manifest.stop().await.unwrap(); - } - - #[tokio::test] - async fn test_region_manifest_truncate() { - common_telemetry::init_default_ut_logging(); - - let manifest = new_fs_manifest(false, None).await; - let region_meta = Arc::new(build_region_meta()); - let committed_sequence = 99; - - let file = FileId::random(); - let file_ids = vec![FileId::random(), FileId::random()]; - - // Save some actions. - let actions: Vec = vec![ - RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange { - metadata: region_meta.as_ref().into(), - committed_sequence: 1, - })), - RegionMetaActionList::new(vec![ - RegionMetaAction::Edit(build_region_edit(2, &[file], &[])), - RegionMetaAction::Edit(build_region_edit(3, &file_ids, &[file])), - ]), - RegionMetaActionList::with_action(RegionMetaAction::Truncate(RegionTruncate { - region_id: 0.into(), - committed_sequence, - })), - RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange { - metadata: region_meta.as_ref().into(), - committed_sequence: 1, - })), - ]; - - for action in actions { - manifest.update(action).await.unwrap(); - } - - // Scan manifest. - let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap(); - - let (v, action_list) = iter.next_action().await.unwrap().unwrap(); - info!("action_list = {:?}", action_list.actions); - assert_eq!(0, v); - assert_eq!(2, action_list.actions.len()); - let protocol = &action_list.actions[0]; - assert!(matches!( - protocol, - RegionMetaAction::Protocol(ProtocolAction { .. }) - )); - - let change = &action_list.actions[1]; - assert!(matches!( - change, - RegionMetaAction::Change(RegionChange { - committed_sequence: 1, - .. - }) - )); - - let (v, action_list) = iter.next_action().await.unwrap().unwrap(); - assert_eq!(1, v); - assert_eq!(2, action_list.actions.len()); - assert!(matches!(&action_list.actions[0], RegionMetaAction::Edit(_))); - assert!(matches!(&action_list.actions[1], RegionMetaAction::Edit(_))); - - let (v, action_list) = iter.next_action().await.unwrap().unwrap(); - assert_eq!(2, v); - assert_eq!(1, action_list.actions.len()); - let truncate = &action_list.actions[0]; - assert!(matches!( - truncate, - RegionMetaAction::Truncate(RegionTruncate { - committed_sequence: 99, - .. - }) - )); - - let (v, action_list) = iter.next_action().await.unwrap().unwrap(); - assert_eq!(3, v); - assert_eq!(1, action_list.actions.len()); - let change = &action_list.actions[0]; - assert!(matches!( - change, - RegionMetaAction::Change(RegionChange { - committed_sequence: 1, - .. - }) - )); - - // Reach end - assert!(iter.next_action().await.unwrap().is_none()); - } -} diff --git a/src/storage/src/manifest/storage.rs b/src/storage/src/manifest/storage.rs deleted file mode 100644 index 806c00bceea8..000000000000 --- a/src/storage/src/manifest/storage.rs +++ /dev/null @@ -1,741 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; -use std::iter::Iterator; -use std::str::FromStr; - -use async_trait::async_trait; -use common_datasource::compression::CompressionType; -use common_telemetry::logging; -use futures::TryStreamExt; -use lazy_static::lazy_static; -use object_store::{raw_normalize_path, util, Entry, ErrorKind, ObjectStore}; -use regex::Regex; -use serde::{Deserialize, Serialize}; -use snafu::{ensure, ResultExt}; -use store_api::manifest::{LogIterator, ManifestLogStorage, ManifestVersion}; - -use crate::error::{ - CompressObjectSnafu, DecodeJsonSnafu, DecompressObjectSnafu, DeleteObjectSnafu, - EncodeJsonSnafu, Error, InvalidScanIndexSnafu, ListObjectsSnafu, ReadObjectSnafu, Result, - Utf8Snafu, WriteObjectSnafu, -}; - -lazy_static! { - static ref DELTA_RE: Regex = Regex::new("^\\d+\\.json").unwrap(); - static ref CHECKPOINT_RE: Regex = Regex::new("^\\d+\\.checkpoint").unwrap(); -} - -const LAST_CHECKPOINT_FILE: &str = "_last_checkpoint"; -const DEFAULT_MANIFEST_COMPRESSION_TYPE: CompressionType = CompressionType::Gzip; -/// Due to backward compatibility, it is possible that the user's manifest file has not been compressed. -/// So when we encounter problems, we need to fall back to `FALL_BACK_COMPRESS_TYPE` for processing. -const FALL_BACK_COMPRESS_TYPE: CompressionType = CompressionType::Uncompressed; - -#[inline] -pub const fn manifest_compress_type(compress: bool) -> CompressionType { - if compress { - DEFAULT_MANIFEST_COMPRESSION_TYPE - } else { - FALL_BACK_COMPRESS_TYPE - } -} - -#[inline] -pub fn delta_file(version: ManifestVersion) -> String { - format!("{version:020}.json") -} - -#[inline] -pub fn checkpoint_file(version: ManifestVersion) -> String { - format!("{version:020}.checkpoint") -} - -#[inline] -pub fn gen_path(path: &str, file: &str, compress_type: CompressionType) -> String { - if compress_type == CompressionType::Uncompressed { - format!("{}{}", path, file) - } else { - format!("{}{}.{}", path, file, compress_type.file_extension()) - } -} - -/// Return's the file manifest version from path -/// -/// # Panics -/// Panics if the file path is not a valid delta or checkpoint file. -#[inline] -pub fn file_version(path: &str) -> ManifestVersion { - let s = path.split('.').next().unwrap(); - s.parse().unwrap_or_else(|_| panic!("Invalid file: {path}")) -} - -/// Return's the file compress algorithm by file extension. -/// -/// for example file -/// `00000000000000000000.json.gz` -> `CompressionType::GZIP` -#[inline] -pub fn file_compress_type(path: &str) -> CompressionType { - let s = path.rsplit('.').next().unwrap_or(""); - CompressionType::from_str(s).unwrap_or(CompressionType::Uncompressed) -} - -#[inline] -pub fn is_delta_file(file_name: &str) -> bool { - DELTA_RE.is_match(file_name) -} - -#[inline] -pub fn is_checkpoint_file(file_name: &str) -> bool { - CHECKPOINT_RE.is_match(file_name) -} - -pub struct ObjectStoreLogIterator { - object_store: ObjectStore, - iter: Box + Send + Sync>, -} - -#[async_trait] -impl LogIterator for ObjectStoreLogIterator { - type Error = Error; - - async fn next_log(&mut self) -> Result)>> { - match self.iter.next() { - Some((v, entry)) => { - let compress_type = file_compress_type(entry.name()); - let bytes = self - .object_store - .read(entry.path()) - .await - .context(ReadObjectSnafu { path: entry.path() })?; - let data = compress_type - .decode(bytes) - .await - .context(DecompressObjectSnafu { - compress_type, - path: entry.path(), - })?; - Ok(Some((v, data))) - } - None => Ok(None), - } - } -} - -#[derive(Clone, Debug)] -pub struct ManifestObjectStore { - object_store: ObjectStore, - compress_type: CompressionType, - path: String, -} - -impl ManifestObjectStore { - pub fn new(path: &str, object_store: ObjectStore, compress_type: CompressionType) -> Self { - Self { - object_store, - compress_type, - path: util::normalize_dir(path), - } - } - - #[inline] - /// Returns the delta file path under the **current** compression algorithm - fn delta_file_path(&self, version: ManifestVersion) -> String { - gen_path(&self.path, &delta_file(version), self.compress_type) - } - - #[inline] - /// Returns the checkpoint file path under the **current** compression algorithm - fn checkpoint_file_path(&self, version: ManifestVersion) -> String { - gen_path(&self.path, &checkpoint_file(version), self.compress_type) - } - - #[inline] - /// Returns the last checkpoint path, because the last checkpoint is not compressed, - /// so its path name has nothing to do with the compression algorithm used by `ManifestObjectStore` - fn last_checkpoint_path(&self) -> String { - format!("{}{}", self.path, LAST_CHECKPOINT_FILE) - } - - /// Return all `R`s in the root directory that meet the `filter` conditions (that is, the `filter` closure returns `Some(R)`), - /// and discard `R` that does not meet the conditions (that is, the `filter` closure returns `None`) - async fn get_paths(&self, filter: F) -> Result> - where - F: Fn(Entry) -> Option, - { - let streamer = self - .object_store - .lister_with(&self.path) - .await - .context(ListObjectsSnafu { path: &self.path })?; - streamer - .try_filter_map(|e| async { Ok(filter(e)) }) - .try_collect::>() - .await - .context(ListObjectsSnafu { path: &self.path }) - } - - pub(crate) fn path(&self) -> &str { - &self.path - } -} - -#[derive(Serialize, Deserialize, Debug)] -struct CheckpointMetadata { - pub size: usize, - /// The latest version this checkpoint contains. - pub version: ManifestVersion, - pub checksum: Option, - pub extend_metadata: Option>, -} - -impl CheckpointMetadata { - fn encode(&self) -> Result> { - serde_json::to_string(self).context(EncodeJsonSnafu) - } - - fn decode(bs: &[u8]) -> Result { - let data = std::str::from_utf8(bs).context(Utf8Snafu)?; - - serde_json::from_str(data).context(DecodeJsonSnafu) - } -} - -#[async_trait] -impl ManifestLogStorage for ManifestObjectStore { - type Error = Error; - type Iter = ObjectStoreLogIterator; - - async fn scan( - &self, - start: ManifestVersion, - end: ManifestVersion, - ) -> Result { - ensure!(start <= end, InvalidScanIndexSnafu { start, end }); - - let mut entries: Vec<(ManifestVersion, Entry)> = self - .get_paths(|entry| { - let file_name = entry.name(); - if is_delta_file(file_name) { - let version = file_version(file_name); - if start <= version && version < end { - return Some((version, entry)); - } - } - None - }) - .await?; - - entries.sort_unstable_by(|(v1, _), (v2, _)| v1.cmp(v2)); - - Ok(ObjectStoreLogIterator { - object_store: self.object_store.clone(), - iter: Box::new(entries.into_iter()), - }) - } - - async fn delete_until( - &self, - end: ManifestVersion, - keep_last_checkpoint: bool, - ) -> Result { - // Stores (entry, is_checkpoint, version) in a Vec. - let entries: Vec<_> = self - .get_paths(|entry| { - let file_name = entry.name(); - let is_checkpoint = is_checkpoint_file(file_name); - if is_delta_file(file_name) || is_checkpoint_file(file_name) { - let version = file_version(file_name); - if version < end { - return Some((entry, is_checkpoint, version)); - } - } - None - }) - .await?; - let checkpoint_version = if keep_last_checkpoint { - // Note that the order of entries is unspecific. - entries - .iter() - .filter_map( - |(_e, is_checkpoint, version)| { - if *is_checkpoint { - Some(version) - } else { - None - } - }, - ) - .max() - } else { - None - }; - let paths: Vec<_> = entries - .iter() - .filter(|(_e, is_checkpoint, version)| { - if let Some(max_version) = checkpoint_version { - if *is_checkpoint { - // We need to keep the checkpoint file. - version < max_version - } else { - // We can delete the log file with max_version as the checkpoint - // file contains the log file's content. - version <= max_version - } - } else { - true - } - }) - .map(|e| e.0.path().to_string()) - .collect(); - let ret = paths.len(); - - logging::debug!( - "Deleting {} logs from manifest storage path {} until {}, checkpoint: {:?}, paths: {:?}", - ret, - self.path, - end, - checkpoint_version, - paths, - ); - - self.object_store - .remove(paths) - .await - .with_context(|_| DeleteObjectSnafu { - path: self.path.clone(), - })?; - - Ok(ret) - } - - async fn delete_all(&self, remove_action_manifest: ManifestVersion) -> Result<()> { - let entries: Vec = self.get_paths(Some).await?; - - // Filter out the latest delta file. - let paths: Vec<_> = entries - .iter() - .filter(|e| { - let name = e.name(); - if is_delta_file(name) && file_version(name) == remove_action_manifest { - return false; - } - true - }) - .map(|e| e.path().to_string()) - .collect(); - - logging::info!( - "Deleting {} from manifest storage path {} paths: {:?}", - paths.len(), - self.path, - paths, - ); - - // Delete all files except the latest delta file. - self.object_store - .remove(paths) - .await - .with_context(|_| DeleteObjectSnafu { - path: self.path.clone(), - })?; - - // Delete the latest delta file and the manifest directory. - self.object_store - .remove_all(&self.path) - .await - .with_context(|_| DeleteObjectSnafu { - path: self.path.clone(), - })?; - logging::info!("Deleted manifest storage path {}", self.path); - - Ok(()) - } - - async fn save(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> { - let path = self.delta_file_path(version); - logging::debug!("Save log to manifest storage, version: {}", version); - let data = self - .compress_type - .encode(bytes) - .await - .context(CompressObjectSnafu { - compress_type: self.compress_type, - path: &path, - })?; - self.object_store - .write(&path, data) - .await - .context(WriteObjectSnafu { path }) - } - - async fn delete(&self, start: ManifestVersion, end: ManifestVersion) -> Result<()> { - ensure!(start <= end, InvalidScanIndexSnafu { start, end }); - - // Due to backward compatibility, it is possible that the user's log between start and end has not been compressed, - // so we need to delete the uncompressed file corresponding to that version, even if the uncompressed file in that version do not exist. - let mut paths = Vec::with_capacity(((end - start) * 2) as usize); - for version in start..end { - paths.push(raw_normalize_path(&self.delta_file_path(version))); - if self.compress_type != FALL_BACK_COMPRESS_TYPE { - paths.push(raw_normalize_path(&gen_path( - &self.path, - &delta_file(version), - FALL_BACK_COMPRESS_TYPE, - ))); - } - } - - logging::debug!( - "Deleting logs from manifest storage, start: {}, end: {}", - start, - end - ); - - self.object_store - .remove(paths.clone()) - .await - .with_context(|_| DeleteObjectSnafu { - path: paths.join(","), - })?; - - Ok(()) - } - - async fn save_checkpoint(&self, version: ManifestVersion, bytes: &[u8]) -> Result<()> { - let path = self.checkpoint_file_path(version); - let data = self - .compress_type - .encode(bytes) - .await - .context(CompressObjectSnafu { - compress_type: self.compress_type, - path: &path, - })?; - self.object_store - .write(&path, data) - .await - .context(WriteObjectSnafu { path })?; - - // Because last checkpoint file only contain size and version, which is tiny, so we don't compress it. - let last_checkpoint_path = self.last_checkpoint_path(); - - let checkpoint_metadata = CheckpointMetadata { - size: bytes.len(), - version, - checksum: None, - extend_metadata: None, - }; - - logging::debug!( - "Save checkpoint in path: {}, metadata: {:?}", - last_checkpoint_path, - checkpoint_metadata - ); - - let bs = checkpoint_metadata.encode()?; - self.object_store - .write(&last_checkpoint_path, bs.as_ref().to_vec()) - .await - .context(WriteObjectSnafu { - path: last_checkpoint_path, - })?; - - Ok(()) - } - - async fn load_checkpoint( - &self, - version: ManifestVersion, - ) -> Result)>> { - let path = self.checkpoint_file_path(version); - // Due to backward compatibility, it is possible that the user's checkpoint not compressed, - // so if we don't find file by compressed type. fall back to checkpoint not compressed find again. - let checkpoint_data = - match self.object_store.read(&path).await { - Ok(checkpoint) => { - let decompress_data = self.compress_type.decode(checkpoint).await.context( - DecompressObjectSnafu { - compress_type: self.compress_type, - path, - }, - )?; - Ok(Some(decompress_data)) - } - Err(e) => { - if e.kind() == ErrorKind::NotFound { - if self.compress_type != FALL_BACK_COMPRESS_TYPE { - let fall_back_path = gen_path( - &self.path, - &checkpoint_file(version), - FALL_BACK_COMPRESS_TYPE, - ); - logging::debug!( - "Failed to load checkpoint from path: {}, fall back to path: {}", - path, - fall_back_path - ); - match self.object_store.read(&fall_back_path).await { - Ok(checkpoint) => { - let decompress_data = FALL_BACK_COMPRESS_TYPE - .decode(checkpoint) - .await - .context(DecompressObjectSnafu { - compress_type: FALL_BACK_COMPRESS_TYPE, - path, - })?; - Ok(Some(decompress_data)) - } - Err(e) if e.kind() == ErrorKind::NotFound => Ok(None), - Err(e) => Err(e).context(ReadObjectSnafu { - path: &fall_back_path, - }), - } - } else { - Ok(None) - } - } else { - Err(e).context(ReadObjectSnafu { path: &path }) - } - } - }?; - Ok(checkpoint_data.map(|data| (version, data))) - } - - async fn delete_checkpoint(&self, version: ManifestVersion) -> Result<()> { - // Due to backward compatibility, it is possible that the user's checkpoint file has not been compressed, - // so we need to delete the uncompressed checkpoint file corresponding to that version, even if the uncompressed checkpoint file in that version do not exist. - let paths = if self.compress_type != FALL_BACK_COMPRESS_TYPE { - vec![ - raw_normalize_path(&self.checkpoint_file_path(version)), - raw_normalize_path(&gen_path( - &self.path, - &checkpoint_file(version), - FALL_BACK_COMPRESS_TYPE, - )), - ] - } else { - vec![raw_normalize_path(&self.checkpoint_file_path(version))] - }; - - self.object_store - .remove(paths.clone()) - .await - .context(DeleteObjectSnafu { - path: paths.join(","), - })?; - Ok(()) - } - - async fn load_last_checkpoint(&self) -> Result)>> { - let last_checkpoint_path = self.last_checkpoint_path(); - let last_checkpoint_data = match self.object_store.read(&last_checkpoint_path).await { - Ok(data) => data, - Err(e) if e.kind() == ErrorKind::NotFound => { - return Ok(None); - } - Err(e) => { - return Err(e).context(ReadObjectSnafu { - path: last_checkpoint_path, - }); - } - }; - - let checkpoint_metadata = CheckpointMetadata::decode(&last_checkpoint_data)?; - - logging::debug!( - "Load checkpoint in path: {}, metadata: {:?}", - last_checkpoint_path, - checkpoint_metadata - ); - - self.load_checkpoint(checkpoint_metadata.version).await - } -} - -#[cfg(test)] -mod tests { - use common_test_util::temp_dir::create_temp_dir; - use object_store::services::Fs; - use object_store::ObjectStore; - - use super::*; - - fn new_test_manifest_store() -> ManifestObjectStore { - common_telemetry::init_default_ut_logging(); - let tmp_dir = create_temp_dir("test_manifest_log_store"); - let mut builder = Fs::default(); - let _ = builder.root(&tmp_dir.path().to_string_lossy()); - let object_store = ObjectStore::new(builder).unwrap().finish(); - ManifestObjectStore::new("/", object_store, CompressionType::Uncompressed) - } - - #[test] - // Define this test mainly to prevent future unintentional changes may break the backward compatibility. - fn test_compress_file_path_generation() { - let path = "/foo/bar/"; - let version: ManifestVersion = 0; - let file_path = gen_path(path, &delta_file(version), CompressionType::Gzip); - assert_eq!(file_path.as_str(), "/foo/bar/00000000000000000000.json.gz") - } - - #[tokio::test] - async fn test_manifest_log_store_uncompress() { - let mut log_store = new_test_manifest_store(); - log_store.compress_type = CompressionType::Uncompressed; - test_manifest_log_store_case(log_store).await; - } - - #[tokio::test] - async fn test_manifest_log_store_compress() { - let mut log_store = new_test_manifest_store(); - log_store.compress_type = CompressionType::Gzip; - test_manifest_log_store_case(log_store).await; - } - - async fn test_manifest_log_store_case(log_store: ManifestObjectStore) { - for v in 0..5 { - log_store - .save(v, format!("hello, {v}").as_bytes()) - .await - .unwrap(); - } - - let mut it = log_store.scan(1, 4).await.unwrap(); - for v in 1..4 { - let (version, bytes) = it.next_log().await.unwrap().unwrap(); - assert_eq!(v, version); - assert_eq!(format!("hello, {v}").as_bytes(), bytes); - } - assert!(it.next_log().await.unwrap().is_none()); - - let mut it = log_store.scan(0, 11).await.unwrap(); - for v in 0..5 { - let (version, bytes) = it.next_log().await.unwrap().unwrap(); - assert_eq!(v, version); - assert_eq!(format!("hello, {v}").as_bytes(), bytes); - } - assert!(it.next_log().await.unwrap().is_none()); - - // Delete [0, 3) - log_store.delete(0, 3).await.unwrap(); - - // [3, 5) remains - let mut it = log_store.scan(0, 11).await.unwrap(); - for v in 3..5 { - let (version, bytes) = it.next_log().await.unwrap().unwrap(); - assert_eq!(v, version); - assert_eq!(format!("hello, {v}").as_bytes(), bytes); - } - assert!(it.next_log().await.unwrap().is_none()); - - // test checkpoint - assert!(log_store.load_last_checkpoint().await.unwrap().is_none()); - log_store - .save_checkpoint(3, "checkpoint".as_bytes()) - .await - .unwrap(); - - let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap(); - assert_eq!(checkpoint, "checkpoint".as_bytes()); - assert_eq!(3, v); - - //delete (,4) logs and keep checkpoint 3. - let _ = log_store.delete_until(4, true).await.unwrap(); - let _ = log_store.load_checkpoint(3).await.unwrap().unwrap(); - let _ = log_store.load_last_checkpoint().await.unwrap().unwrap(); - let mut it = log_store.scan(0, 11).await.unwrap(); - let (version, bytes) = it.next_log().await.unwrap().unwrap(); - assert_eq!(4, version); - assert_eq!("hello, 4".as_bytes(), bytes); - assert!(it.next_log().await.unwrap().is_none()); - - // delete all logs and checkpoints - let _ = log_store.delete_until(11, false).await.unwrap(); - assert!(log_store.load_checkpoint(3).await.unwrap().is_none()); - assert!(log_store.load_last_checkpoint().await.unwrap().is_none()); - let mut it = log_store.scan(0, 11).await.unwrap(); - assert!(it.next_log().await.unwrap().is_none()); - } - - #[tokio::test] - // test ManifestObjectStore can read/delete previously uncompressed data correctly - async fn test_compress_backward_compatible() { - let mut log_store = new_test_manifest_store(); - - // write uncompress data to stimulate previously uncompressed data - log_store.compress_type = CompressionType::Uncompressed; - for v in 0..5 { - log_store - .save(v, format!("hello, {v}").as_bytes()) - .await - .unwrap(); - } - log_store - .save_checkpoint(5, "checkpoint_uncompressed".as_bytes()) - .await - .unwrap(); - - // change compress type - log_store.compress_type = CompressionType::Gzip; - - // test load_last_checkpoint work correctly for previously uncompressed data - let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap(); - assert_eq!(v, 5); - assert_eq!(checkpoint, "checkpoint_uncompressed".as_bytes()); - - // write compressed data to stimulate compress alogorithom take effect - for v in 5..10 { - log_store - .save(v, format!("hello, {v}").as_bytes()) - .await - .unwrap(); - } - log_store - .save_checkpoint(10, "checkpoint_compressed".as_bytes()) - .await - .unwrap(); - - // test data reading - let mut it = log_store.scan(0, 10).await.unwrap(); - for v in 0..10 { - let (version, bytes) = it.next_log().await.unwrap().unwrap(); - assert_eq!(v, version); - assert_eq!(format!("hello, {v}").as_bytes(), bytes); - } - let (v, checkpoint) = log_store.load_checkpoint(5).await.unwrap().unwrap(); - assert_eq!(v, 5); - assert_eq!(checkpoint, "checkpoint_uncompressed".as_bytes()); - let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap(); - assert_eq!(v, 10); - assert_eq!(checkpoint, "checkpoint_compressed".as_bytes()); - - // Delete previously uncompressed checkpoint - log_store.delete_checkpoint(5).await.unwrap(); - assert!(log_store.load_checkpoint(5).await.unwrap().is_none()); - - // Delete [3, 7), contain uncompressed/compressed data - log_store.delete(3, 7).await.unwrap(); - // [3, 7) deleted - let mut it = log_store.scan(3, 7).await.unwrap(); - assert!(it.next_log().await.unwrap().is_none()); - - // Delete util 10, contain uncompressed/compressed data - // log 0, 1, 2, 7, 8, 9 will be delete - assert_eq!(6, log_store.delete_until(10, false).await.unwrap()); - let mut it = log_store.scan(0, 10).await.unwrap(); - assert!(it.next_log().await.unwrap().is_none()); - } -} diff --git a/src/storage/src/manifest/test_utils.rs b/src/storage/src/manifest/test_utils.rs deleted file mode 100644 index 35b6d2fb0b13..000000000000 --- a/src/storage/src/manifest/test_utils.rs +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use datatypes::type_id::LogicalTypeId; -use store_api::storage::SequenceNumber; - -use crate::manifest::action::*; -use crate::metadata::RegionMetadata; -use crate::sst::{FileId, FileMeta}; -use crate::test_util::descriptor_util::RegionDescBuilder; - -pub const DEFAULT_TEST_FILE_SIZE: u64 = 1024; - -pub fn build_region_meta() -> RegionMetadata { - let region_name = "region-0"; - let desc = RegionDescBuilder::new(region_name) - .id(0) - .push_key_column(("k1", LogicalTypeId::Int32, false)) - .push_field_column(("v1", LogicalTypeId::Float32, true)) - .build(); - desc.try_into().unwrap() -} - -pub fn build_altered_region_meta() -> RegionMetadata { - let region_name = "region-0"; - let desc = RegionDescBuilder::new(region_name) - .id(0) - .push_key_column(("k1", LogicalTypeId::Int32, false)) - .push_field_column(("v1", LogicalTypeId::Float32, true)) - .push_field_column(("v2", LogicalTypeId::Float32, true)) - .build(); - desc.try_into().unwrap() -} - -pub fn build_region_edit( - sequence: SequenceNumber, - files_to_add: &[FileId], - files_to_remove: &[FileId], -) -> RegionEdit { - RegionEdit { - region_version: 0, - flushed_sequence: Some(sequence), - files_to_add: files_to_add - .iter() - .map(|f| FileMeta { - region_id: 0.into(), - file_id: *f, - time_range: None, - level: 0, - file_size: DEFAULT_TEST_FILE_SIZE, - }) - .collect(), - files_to_remove: files_to_remove - .iter() - .map(|f| FileMeta { - region_id: 0.into(), - file_id: *f, - time_range: None, - level: 0, - file_size: DEFAULT_TEST_FILE_SIZE, - }) - .collect(), - compaction_time_window: None, - } -} - -pub fn build_region_truncate(committed_sequence: u64) -> RegionTruncate { - RegionTruncate { - region_id: 0.into(), - committed_sequence, - } -} diff --git a/src/storage/src/memtable.rs b/src/storage/src/memtable.rs deleted file mode 100644 index 546c40f383e4..000000000000 --- a/src/storage/src/memtable.rs +++ /dev/null @@ -1,294 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -mod btree; -mod inserter; -#[cfg(test)] -pub mod tests; -mod version; - -use std::fmt; -use std::sync::atomic::{AtomicBool, AtomicU32, AtomicUsize, Ordering}; -use std::sync::Arc; - -use api::v1::OpType; -use common_time::range::TimestampRange; -use common_time::Timestamp; -use datatypes::vectors::VectorRef; -use store_api::storage::{consts, SequenceNumber}; - -use crate::error::Result; -use crate::flush::FlushStrategyRef; -use crate::memtable::btree::BTreeMemtable; -pub use crate::memtable::inserter::Inserter; -pub use crate::memtable::version::MemtableVersion; -use crate::metrics::WRITE_BUFFER_BYTES; -use crate::read::Batch; -use crate::schema::{ProjectedSchemaRef, RegionSchemaRef}; - -/// Unique id for memtables under same region. -pub type MemtableId = u32; - -#[derive(Debug, Default)] -pub struct MemtableStats { - /// The estimated bytes allocated by this memtable from heap. Result - /// of this method may be larger than the estimated based on `num_rows` because - /// of the implementor's pre-alloc behavior. - pub estimated_bytes: usize, - /// The max timestamp that this memtable contains. - pub max_timestamp: Timestamp, - /// The min timestamp that this memtable contains. - pub min_timestamp: Timestamp, -} - -impl MemtableStats { - pub fn bytes_allocated(&self) -> usize { - self.estimated_bytes - } -} - -/// In memory storage. -pub trait Memtable: Send + Sync + fmt::Debug { - /// Returns id of this memtable. - fn id(&self) -> MemtableId; - - /// Returns schema of the memtable. - fn schema(&self) -> RegionSchemaRef; - - /// Write key/values to the memtable. - /// - /// # Panics - /// Panics if the schema of key/value differs from memtable's schema. - fn write(&self, kvs: &KeyValues) -> Result<()>; - - /// Iterates the memtable. - fn iter(&self, ctx: IterContext) -> Result; - - /// Returns the number of rows in the memtable. - fn num_rows(&self) -> usize; - - /// Returns stats of this memtable. - fn stats(&self) -> MemtableStats; - - /// Mark the memtable is immutable. - /// - /// The region MUST call this inside the region writer's write lock. - fn mark_immutable(&self); -} - -pub type MemtableRef = Arc; - -/// Context for iterating memtable. -/// -/// Should be cheap to clone. -#[derive(Debug, Clone)] -pub struct IterContext { - /// The suggested batch size of the iterator. - pub batch_size: usize, - /// Max visible sequence (inclusive). - pub visible_sequence: SequenceNumber, - - /// Schema the reader expect to read. - /// - /// Set to `None` to read all columns. - pub projected_schema: Option, - - /// Timestamp range - pub time_range: Option, -} - -impl Default for IterContext { - fn default() -> Self { - Self { - batch_size: consts::READ_BATCH_SIZE, - // All data in memory is visible by default. - visible_sequence: SequenceNumber::MAX, - projected_schema: None, - time_range: None, - } - } -} - -/// The ordering of the iterator output. -#[derive(Debug, PartialEq, Eq)] -pub enum RowOrdering { - /// The output rows are unordered. - Unordered, - - /// The output rows are ordered by key. - Key, -} - -/// Iterator of memtable. -/// -/// Since data of memtable are stored in memory, so avoid defining this trait -/// as an async trait. -pub trait BatchIterator: Iterator> + Send + Sync { - /// Returns the schema of this iterator. - fn schema(&self) -> ProjectedSchemaRef; - - /// Returns the ordering of the output rows from this iterator. - fn ordering(&self) -> RowOrdering; -} - -pub type BoxedBatchIterator = Box; - -pub trait MemtableBuilder: Send + Sync + fmt::Debug { - fn build(&self, schema: RegionSchemaRef) -> MemtableRef; -} - -pub type MemtableBuilderRef = Arc; - -/// Key-value pairs in columnar format. -pub struct KeyValues { - pub sequence: SequenceNumber, - pub op_type: OpType, - /// Start index of these key-value paris in batch. Each row in the same batch has - /// a unique index to identify it. - pub start_index_in_batch: usize, - pub keys: Vec, - pub values: Vec, - pub timestamp: Option, -} - -impl KeyValues { - // Note that `sequence` is not reset. - fn reset(&mut self, op_type: OpType, index_in_batch: usize) { - self.op_type = op_type; - self.start_index_in_batch = index_in_batch; - self.keys.clear(); - self.values.clear(); - self.timestamp = None; - } - - pub fn len(&self) -> usize { - self.timestamp.as_ref().map(|v| v.len()).unwrap_or_default() - } - - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - pub fn estimated_memory_size(&self) -> usize { - self.keys.iter().fold(0, |acc, v| acc + v.memory_size()) - + self.values.iter().fold(0, |acc, v| acc + v.memory_size()) - + self - .timestamp - .as_ref() - .map(|t| t.memory_size()) - .unwrap_or_default() - } -} - -/// Memtable memory allocation tracker. -pub struct AllocTracker { - flush_strategy: Option, - /// Bytes allocated by the tracker. - bytes_allocated: AtomicUsize, - /// Whether allocating is done. - is_done_allocating: AtomicBool, -} - -impl fmt::Debug for AllocTracker { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("AllocTracker") - .field("bytes_allocated", &self.bytes_allocated) - .field("is_done_allocating", &self.is_done_allocating) - .finish() - } -} - -impl AllocTracker { - /// Returns a new [AllocTracker]. - pub fn new(flush_strategy: Option) -> AllocTracker { - AllocTracker { - flush_strategy, - bytes_allocated: AtomicUsize::new(0), - is_done_allocating: AtomicBool::new(false), - } - } - - /// Tracks `bytes` memory is allocated. - pub(crate) fn on_allocate(&self, bytes: usize) { - let _ = self.bytes_allocated.fetch_add(bytes, Ordering::Relaxed); - WRITE_BUFFER_BYTES.add(bytes as i64); - if let Some(flush_strategy) = &self.flush_strategy { - flush_strategy.reserve_mem(bytes); - } - } - - /// Marks we have finished allocating memory so we can free it from - /// the write buffer's limit. - /// - /// The region MUST ensure that it calls this method inside the region writer's write lock. - pub(crate) fn done_allocating(&self) { - if let Some(flush_strategy) = &self.flush_strategy { - if self - .is_done_allocating - .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed) - .is_ok() - { - flush_strategy.schedule_free_mem(self.bytes_allocated.load(Ordering::Relaxed)); - } - } - } - - /// Returns bytes allocated. - pub(crate) fn bytes_allocated(&self) -> usize { - self.bytes_allocated.load(Ordering::Relaxed) - } -} - -impl Drop for AllocTracker { - fn drop(&mut self) { - if !self.is_done_allocating.load(Ordering::Relaxed) { - self.done_allocating(); - } - - let bytes_allocated = self.bytes_allocated.load(Ordering::Relaxed); - WRITE_BUFFER_BYTES.sub(bytes_allocated as i64); - - // Memory tracked by this tracker is freed. - if let Some(flush_strategy) = &self.flush_strategy { - flush_strategy.free_mem(bytes_allocated); - } - } -} - -/// Default memtable builder that builds `BTreeMemtable`. -#[derive(Debug, Default)] -pub struct DefaultMemtableBuilder { - memtable_id: AtomicU32, - flush_strategy: Option, -} - -impl DefaultMemtableBuilder { - /// Returns a new [DefaultMemtableBuilder] with specific `flush_strategy`. - /// - /// If `flush_strategy` is `Some`, the memtable will report its memory usage - /// to the `flush_strategy`. - pub fn with_flush_strategy(flush_strategy: Option) -> Self { - Self { - memtable_id: AtomicU32::new(0), - flush_strategy, - } - } -} - -impl MemtableBuilder for DefaultMemtableBuilder { - fn build(&self, schema: RegionSchemaRef) -> MemtableRef { - let id = self.memtable_id.fetch_add(1, Ordering::Relaxed); - Arc::new(BTreeMemtable::new(id, schema, self.flush_strategy.clone())) - } -} diff --git a/src/storage/src/memtable/btree.rs b/src/storage/src/memtable/btree.rs deleted file mode 100644 index 0874e96e7096..000000000000 --- a/src/storage/src/memtable/btree.rs +++ /dev/null @@ -1,573 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::cmp::Ordering; -use std::collections::{btree_map, BTreeMap}; -use std::fmt; -use std::ops::Bound; -use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; -use std::sync::{Arc, RwLock}; - -use api::v1::OpType; -use common_time::range::TimestampRange; -use datatypes::data_type::DataType; -use datatypes::prelude::*; -use datatypes::value::Value; -use datatypes::vectors::{UInt64Vector, UInt64VectorBuilder, UInt8Vector, UInt8VectorBuilder}; -use store_api::storage::{SequenceNumber, MIN_OP_TYPE}; - -use crate::error::Result; -use crate::flush::FlushStrategyRef; -use crate::memtable::{ - AllocTracker, BatchIterator, BoxedBatchIterator, IterContext, KeyValues, Memtable, MemtableId, - MemtableStats, RowOrdering, -}; -use crate::read::Batch; -use crate::schema::compat::ReadAdapter; -use crate::schema::{ProjectedSchema, ProjectedSchemaRef, RegionSchemaRef}; - -type RwLockMap = RwLock>; - -/// A simple memtable implementation based on std's [`BTreeMap`]. -/// -/// Mainly for test purpose, don't use in production. -pub struct BTreeMemtable { - id: MemtableId, - schema: RegionSchemaRef, - map: Arc, - alloc_tracker: AllocTracker, - max_timestamp: AtomicI64, - min_timestamp: AtomicI64, -} - -impl BTreeMemtable { - pub fn new( - id: MemtableId, - schema: RegionSchemaRef, - flush_strategy: Option, - ) -> BTreeMemtable { - BTreeMemtable { - id, - schema, - map: Arc::new(RwLock::new(BTreeMap::new())), - alloc_tracker: AllocTracker::new(flush_strategy), - max_timestamp: AtomicI64::new(i64::MIN), - min_timestamp: AtomicI64::new(i64::MAX), - } - } - - /// Updates memtable stats. - /// This function is guarded by `BTreeMemtable::map` so that store-after-load is safe. - fn update_stats(&self, request_size: usize, min: Option, max: Option) { - self.alloc_tracker.on_allocate(request_size); - - if let Some(min) = min { - let min_val = min - .as_timestamp() - .expect("Min timestamp must be a valid timestamp value") - .value(); - let cur_min = self.min_timestamp.load(AtomicOrdering::Relaxed); - if min_val < cur_min { - self.min_timestamp.store(min_val, AtomicOrdering::Relaxed); - } - } - - if let Some(max) = max { - let cur_max = self.max_timestamp.load(AtomicOrdering::Relaxed); - let max_val = max - .as_timestamp() - .expect("Max timestamp must be a valid timestamp value") - .value(); - if max_val > cur_max { - self.max_timestamp.store(max_val, AtomicOrdering::Relaxed); - } - } - } -} - -impl fmt::Debug for BTreeMemtable { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let len = self.map.read().unwrap().len(); - - f.debug_struct("BTreeMemtable") - .field("id", &self.id) - // Only show StoreSchema - .field("schema", &self.schema) - .field("rows", &len) - .field("alloc_tracker", &self.alloc_tracker) - .field("max_timestamp", &self.max_timestamp) - .field("min_timestamp", &self.min_timestamp) - .finish() - } -} - -impl Memtable for BTreeMemtable { - fn id(&self) -> MemtableId { - self.id - } - - fn schema(&self) -> RegionSchemaRef { - self.schema.clone() - } - - fn write(&self, kvs: &KeyValues) -> Result<()> { - debug_assert!(kvs.timestamp.is_some()); - let iter_row = IterRow::new(kvs); - let mut map = self.map.write().unwrap(); - - let mut min_ts = None; - let mut max_ts = None; - for (inner_key, row_value) in iter_row { - let ts = inner_key.timestamp(); - let min_ts = min_ts.get_or_insert_with(|| ts.clone()); - let max_ts = max_ts.get_or_insert_with(|| ts.clone()); - if ts < min_ts { - *min_ts = ts.clone(); - } - if ts > max_ts { - *max_ts = ts.clone(); - } - let _ = map.insert(inner_key, row_value); - } - - self.update_stats(kvs.estimated_memory_size(), min_ts, max_ts); - - Ok(()) - } - - fn iter(&self, ctx: IterContext) -> Result { - assert!(ctx.batch_size > 0); - - let iter = BTreeIterator::new(ctx, self.schema.clone(), self.map.clone())?; - - Ok(Box::new(iter)) - } - - fn num_rows(&self) -> usize { - self.map.read().unwrap().len() - } - - fn stats(&self) -> MemtableStats { - let ts_meta = self.schema.column_metadata(self.schema.timestamp_index()); - - let Some(timestamp_type) = ts_meta.desc.data_type.as_timestamp() else { - // safety: timestamp column always has timestamp type, otherwise it's a bug. - panic!( - "Timestamp column is not a valid timestamp type: {:?}", - self.schema - ); - }; - - MemtableStats { - estimated_bytes: self.alloc_tracker.bytes_allocated(), - max_timestamp: timestamp_type - .create_timestamp(self.max_timestamp.load(AtomicOrdering::Relaxed)), - min_timestamp: timestamp_type - .create_timestamp(self.min_timestamp.load(AtomicOrdering::Relaxed)), - } - } - - fn mark_immutable(&self) { - self.alloc_tracker.done_allocating(); - } -} - -struct BTreeIterator { - ctx: IterContext, - /// Schema of this memtable. - schema: RegionSchemaRef, - /// Projected schema that user expect to read. - projected_schema: ProjectedSchemaRef, - adapter: ReadAdapter, - map: Arc, - last_key: Option, -} - -impl BatchIterator for BTreeIterator { - fn schema(&self) -> ProjectedSchemaRef { - self.projected_schema.clone() - } - - fn ordering(&self) -> RowOrdering { - RowOrdering::Key - } -} - -impl Iterator for BTreeIterator { - type Item = Result; - - fn next(&mut self) -> Option> { - self.next_batch().transpose() - } -} - -impl BTreeIterator { - fn new( - ctx: IterContext, - schema: RegionSchemaRef, - map: Arc, - ) -> Result { - let projected_schema = ctx - .projected_schema - .clone() - .unwrap_or_else(|| Arc::new(ProjectedSchema::no_projection(schema.clone()))); - let adapter = ReadAdapter::new(schema.store_schema().clone(), projected_schema.clone())?; - - Ok(BTreeIterator { - ctx, - schema, - projected_schema, - adapter, - map, - last_key: None, - }) - } - - fn next_batch(&mut self) -> Result> { - let map = self.map.read().unwrap(); - let iter = if let Some(last_key) = &self.last_key { - map.range((Bound::Excluded(last_key), Bound::Unbounded)) - } else { - map.range(..) - }; - - let iter = MapIterWrapper::new(iter, self.ctx.visible_sequence, self.ctx.time_range); - let (keys, sequences, op_types, values) = collect_iter(iter, self.ctx.batch_size); - - if keys.is_empty() { - return Ok(None); - } - self.last_key = keys.last().map(|k| { - let mut last_key = (*k).clone(); - last_key.reset_for_seek(); - last_key - }); - - let key_data_types = self - .schema - .row_key_columns() - .map(|column_meta| column_meta.desc.data_type.clone()); - let value_data_types = self - .schema - .field_columns() - .map(|column_meta| column_meta.desc.data_type.clone()); - - let key_columns = rows_to_vectors( - key_data_types, - self.adapter.source_key_needed(), - keys.as_slice(), - ); - let field_columns = rows_to_vectors( - value_data_types, - self.adapter.source_value_needed(), - values.as_slice(), - ); - - let batch = self.adapter.batch_from_parts( - key_columns, - field_columns, - Arc::new(sequences), - Arc::new(op_types), - )?; - - Ok(Some(batch)) - } -} - -fn collect_iter<'a, I: Iterator>( - iter: I, - batch_size: usize, -) -> ( - Vec<&'a InnerKey>, - UInt64Vector, - UInt8Vector, - Vec<&'a RowValue>, -) { - let mut keys = Vec::with_capacity(batch_size); - let mut sequences = UInt64VectorBuilder::with_capacity(batch_size); - let mut op_types = UInt8VectorBuilder::with_capacity(batch_size); - let mut values = Vec::with_capacity(batch_size); - for (inner_key, row_value) in iter.take(batch_size) { - keys.push(inner_key); - sequences.push(Some(inner_key.sequence)); - op_types.push(Some(inner_key.op_type as u8)); - values.push(row_value); - } - - (keys, sequences.finish(), op_types.finish(), values) -} - -/// `MapIterWrapper` removes same user key with invisible sequence. -struct MapIterWrapper<'a, InnerKey, RowValue> { - iter: btree_map::Range<'a, InnerKey, RowValue>, - prev_key: Option, - visible_sequence: SequenceNumber, - time_range: Option, -} - -impl<'a> MapIterWrapper<'a, InnerKey, RowValue> { - fn new( - iter: btree_map::Range<'a, InnerKey, RowValue>, - visible_sequence: SequenceNumber, - time_range: Option, - ) -> MapIterWrapper<'a, InnerKey, RowValue> { - MapIterWrapper { - iter, - prev_key: None, - visible_sequence, - time_range, - } - } - - fn next_visible_entry(&mut self) -> Option<(&'a InnerKey, &'a RowValue)> { - for (k, v) in self.iter.by_ref() { - if k.is_visible(self.visible_sequence) && k.is_in_time_range(&self.time_range) { - return Some((k, v)); - } - } - - None - } -} - -impl<'a> Iterator for MapIterWrapper<'a, InnerKey, RowValue> { - type Item = (&'a InnerKey, &'a RowValue); - - fn next(&mut self) -> Option<(&'a InnerKey, &'a RowValue)> { - let (mut current_key, mut current_value) = self.next_visible_entry()?; - if self.prev_key.is_none() { - self.prev_key = Some(current_key.clone()); - return Some((current_key, current_value)); - } - - let prev_key = self.prev_key.take().unwrap(); - while prev_key.is_row_key_equal(current_key) { - if let Some((next_key, next_value)) = self.next_visible_entry() { - (current_key, current_value) = (next_key, next_value); - } else { - return None; - } - } - - self.prev_key = Some(current_key.clone()); - - Some((current_key, current_value)) - } -} - -struct IterRow<'a> { - kvs: &'a KeyValues, - index: usize, - len: usize, -} - -impl<'a> IterRow<'a> { - fn new(kvs: &KeyValues) -> IterRow { - IterRow { - kvs, - index: 0, - len: kvs.len(), - } - } - - fn fetch_row(&mut self) -> (InnerKey, RowValue) { - let mut row_key: Vec<_> = self - .kvs - .keys - .iter() - .map(|vector| vector.get(self.index)) - .collect(); - - // unwrap safety: KeyValues always contains a timestamp as guaranteed in [Inserter::write_one_mutation] - row_key.push(self.kvs.timestamp.as_ref().unwrap().get(self.index)); - let inner_key = InnerKey { - row_key, - sequence: self.kvs.sequence, - index_in_batch: self.kvs.start_index_in_batch + self.index, - op_type: self.kvs.op_type, - }; - - let row_value = RowValue { - values: self - .kvs - .values - .iter() - .map(|vector| vector.get(self.index)) - .collect(), - }; - - self.index += 1; - - (inner_key, row_value) - } -} - -impl<'a> Iterator for IterRow<'a> { - type Item = (InnerKey, RowValue); - - fn next(&mut self) -> Option<(InnerKey, RowValue)> { - if self.index >= self.len { - return None; - } - - Some(self.fetch_row()) - } - - fn size_hint(&self) -> (usize, Option) { - (self.kvs.keys.len(), Some(self.kvs.keys.len())) - } -} - -#[derive(Clone, Debug, PartialEq, Eq)] -struct InnerKey { - /// User defined primary keys - row_key: Vec, - /// Sequence number of row - sequence: SequenceNumber, - index_in_batch: usize, - op_type: OpType, -} - -impl Ord for InnerKey { - fn cmp(&self, other: &InnerKey) -> Ordering { - // Order by (row_key asc, sequence desc, index_in_batch desc, op_type desc), though (key, - // sequence, index_in_batch) should be enough to disambiguate. - self.row_key - .cmp(&other.row_key) - .then_with(|| other.sequence.cmp(&self.sequence)) - .then_with(|| other.index_in_batch.cmp(&self.index_in_batch)) - .then_with(|| other.op_type.cmp(&self.op_type)) - } -} - -impl PartialOrd for InnerKey { - fn partial_cmp(&self, other: &InnerKey) -> Option { - Some(self.cmp(other)) - } -} - -impl InnerKey { - #[inline] - fn timestamp(&self) -> &Value { - // safety: row key shall at least contain a timestamp column - self.row_key.last().unwrap() - } - - #[inline] - fn is_row_key_equal(&self, other: &InnerKey) -> bool { - self.row_key == other.row_key - } - - #[inline] - fn is_visible(&self, sequence: SequenceNumber) -> bool { - self.sequence <= sequence - } - - #[inline] - fn is_in_time_range(&self, range: &Option) -> bool { - let Some(range) = range else { - return true; - }; - range.contains( - &self - .timestamp() - .as_timestamp() - .expect("Timestamp field must be a valid timestamp value"), - ) - } - - /// Reset the `InnerKey` so that we can use it to seek next key that - /// has different row key. - fn reset_for_seek(&mut self) { - // sequence, index_in_batch, op_type are ordered in desc order, so - // we can represent the last inner key with same row key by setting them - // to zero (Minimum value). - self.sequence = 0; - self.index_in_batch = 0; - self.op_type = MIN_OP_TYPE; - } -} - -#[derive(Clone, Debug)] -struct RowValue { - values: Vec, -} - -trait RowsProvider { - fn row_num(&self) -> usize; - - fn column_num(&self) -> usize { - self.row_by_index(0).len() - } - - fn is_empty(&self) -> bool { - self.row_num() == 0 - } - - fn row_by_index(&self, idx: usize) -> &Vec; -} - -impl<'a> RowsProvider for &'a [&InnerKey] { - fn row_num(&self) -> usize { - self.len() - } - - fn row_by_index(&self, idx: usize) -> &Vec { - &self[idx].row_key - } -} - -impl<'a> RowsProvider for &'a [&RowValue] { - fn row_num(&self) -> usize { - self.len() - } - - fn row_by_index(&self, idx: usize) -> &Vec { - &self[idx].values - } -} - -fn rows_to_vectors, T: RowsProvider>( - data_types: I, - column_needed: &[bool], - provider: T, -) -> Vec { - if provider.is_empty() { - return Vec::new(); - } - - let column_num = provider.column_num(); - let row_num = provider.row_num(); - let mut builders = Vec::with_capacity(column_num); - for data_type in data_types { - builders.push(data_type.create_mutable_vector(row_num)); - } - - let mut vectors = Vec::with_capacity(column_num); - for (col_idx, builder) in builders.iter_mut().enumerate() { - if !column_needed[col_idx] { - continue; - } - - for row_idx in 0..row_num { - let row = provider.row_by_index(row_idx); - let value = &row[col_idx]; - builder.as_mut().push_value_ref(value.as_value_ref()); - } - - vectors.push(builder.to_vector()); - } - - vectors -} diff --git a/src/storage/src/memtable/inserter.rs b/src/storage/src/memtable/inserter.rs deleted file mode 100644 index b54b4897b321..000000000000 --- a/src/storage/src/memtable/inserter.rs +++ /dev/null @@ -1,251 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use api::v1::OpType; -use store_api::storage::SequenceNumber; - -use super::MemtableRef; -use crate::error::Result; -use crate::memtable::KeyValues; -use crate::metrics::MEMTABLE_WRITE_ELAPSED; -use crate::write_batch::{Mutation, Payload}; - -/// Wraps logic of inserting key/values in [WriteBatch](crate::write_batch::WriteBatch) to [Memtable](crate::memtable::Memtable). -pub struct Inserter { - /// Sequence of the batch to be inserted. - sequence: SequenceNumber, - /// Used to calculate the start index in batch for `KeyValues`. - index_in_batch: usize, -} - -impl Inserter { - pub fn new(sequence: SequenceNumber) -> Inserter { - Inserter { - sequence, - index_in_batch: 0, - } - } - - /// Insert write batch payload into memtable. - /// - /// Won't do schema validation if not configured. Caller (mostly the `RegionWriter` should ensure the - /// schemas of `memtable` are consistent with `payload`'s. - pub fn insert_memtable(&mut self, payload: &Payload, memtable: &MemtableRef) -> Result<()> { - let _timer = MEMTABLE_WRITE_ELAPSED.start_timer(); - - if payload.is_empty() { - return Ok(()); - } - - // This function only makes effect in debug mode. - validate_input_and_memtable_schemas(payload, memtable); - - // Enough to hold all key or value columns. - let total_column_num = payload.schema.num_columns(); - // Reusable KeyValues buffer. - let mut kvs = KeyValues { - sequence: self.sequence, - op_type: OpType::Put, - start_index_in_batch: self.index_in_batch, - keys: Vec::with_capacity(total_column_num), - values: Vec::with_capacity(total_column_num), - timestamp: None, - }; - - for mutation in &payload.mutations { - self.write_one_mutation(mutation, memtable, &mut kvs)?; - } - - Ok(()) - } - - fn write_one_mutation( - &mut self, - mutation: &Mutation, - memtable: &MemtableRef, - kvs: &mut KeyValues, - ) -> Result<()> { - let schema = memtable.schema(); - let num_rows = mutation.record_batch.num_rows(); - - kvs.reset(mutation.op_type, self.index_in_batch); - - let ts_idx = schema.timestamp_index(); - kvs.timestamp = Some(mutation.record_batch.column(ts_idx).clone()); - for key_idx in 0..ts_idx { - kvs.keys.push(mutation.record_batch.column(key_idx).clone()); - } - for value_idx in schema.value_indices() { - kvs.values - .push(mutation.record_batch.column(value_idx).clone()); - } - - memtable.write(kvs)?; - - self.index_in_batch += num_rows; - - Ok(()) - } -} - -fn validate_input_and_memtable_schemas(payload: &Payload, memtable: &MemtableRef) { - if cfg!(debug_assertions) { - let payload_schema = &payload.schema; - let memtable_schema = memtable.schema(); - let user_schema = memtable_schema.user_schema(); - debug_assert_eq!(payload_schema.version(), user_schema.version()); - // Only validate column schemas. - debug_assert_eq!( - payload_schema.column_schemas(), - user_schema.column_schemas() - ); - } -} - -/// Holds `start` and `end` indexes to get a slice `[start, end)` from the vector whose -/// timestamps belong to same time range at `range_index`. -#[derive(Debug, PartialEq)] -struct SliceIndex { - start: usize, - end: usize, - /// Index in time ranges. - range_index: usize, -} - -#[cfg(test)] -mod tests { - use std::collections::HashMap; - use std::sync::Arc; - - use common_time::timestamp::Timestamp; - use datatypes::type_id::LogicalTypeId; - use datatypes::value::Value; - use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef}; - use store_api::storage::WriteRequest; - - use super::*; - use crate::memtable::{DefaultMemtableBuilder, IterContext, MemtableBuilder}; - use crate::metadata::RegionMetadata; - use crate::schema::RegionSchemaRef; - use crate::test_util::descriptor_util::RegionDescBuilder; - use crate::test_util::write_batch_util; - use crate::write_batch::WriteBatch; - - fn new_test_write_batch() -> WriteBatch { - write_batch_util::new_write_batch( - &[ - ("ts", LogicalTypeId::TimestampMillisecond, false), - ("value", LogicalTypeId::Int64, true), - ], - Some(0), - 1, - ) - } - - fn new_region_schema() -> RegionSchemaRef { - let desc = RegionDescBuilder::new("test") - .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false)) - .push_field_column(("value", LogicalTypeId::Int64, true)) - .build(); - let metadata: RegionMetadata = desc.try_into().unwrap(); - - metadata.schema().clone() - } - - fn put_batch(batch: &mut WriteBatch, data: &[(i64, Option)]) { - let ts = TimestampMillisecondVector::from_values(data.iter().map(|v| v.0)); - let value = Int64Vector::from(data.iter().map(|v| v.1).collect::>()); - let put_data = HashMap::from([ - ("ts".to_string(), Arc::new(ts) as VectorRef), - ("value".to_string(), Arc::new(value) as VectorRef), - ]); - - batch.put(put_data).unwrap(); - } - - fn check_memtable_content( - mem: &MemtableRef, - sequence: SequenceNumber, - data: &[(i64, Option)], - max_ts: i64, - min_ts: i64, - ) { - let iter = mem.iter(IterContext::default()).unwrap(); - assert_eq!(min_ts, mem.stats().min_timestamp.value()); - assert_eq!(max_ts, mem.stats().max_timestamp.value()); - - let mut index = 0; - for batch in iter { - let batch = batch.unwrap(); - let row_num = batch.column(0).len(); - for i in 0..row_num { - let ts = batch.column(0).get(i); - let v = batch.column(1).get(i); - assert_eq!( - Value::Timestamp(Timestamp::new_millisecond(data[index].0)), - ts - ); - assert_eq!(Value::from(data[index].1), v); - assert_eq!(Value::from(sequence), batch.column(2).get(i)); - - index += 1; - } - } - - assert_eq!(data.len(), index); - } - - #[test] - fn test_inserter_put_one_memtable() { - let sequence = 11111; - let memtable_schema = new_region_schema(); - let mutable_memtable = DefaultMemtableBuilder::default().build(memtable_schema); - let mut inserter = Inserter::new(sequence); - - let mut batch = new_test_write_batch(); - put_batch(&mut batch, &[(1, Some(1)), (2, None)]); - // Also test multiple put data in one batch. - put_batch( - &mut batch, - &[ - (3, None), - (2, None), // Duplicate entries in same put data. - (2, Some(2)), - (4, Some(4)), - (201, Some(201)), - (102, None), - (101, Some(101)), - ], - ); - - inserter - .insert_memtable(batch.payload(), &mutable_memtable) - .unwrap(); - check_memtable_content( - &mutable_memtable, - sequence, - &[ - (1, Some(1)), - (2, Some(2)), - (3, None), - (4, Some(4)), - (101, Some(101)), - (102, None), - (201, Some(201)), - ], - 201, - 1, - ); - } -} diff --git a/src/storage/src/memtable/tests.rs b/src/storage/src/memtable/tests.rs deleted file mode 100644 index 36bd466134b5..000000000000 --- a/src/storage/src/memtable/tests.rs +++ /dev/null @@ -1,595 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_time::Timestamp; -use datatypes::prelude::*; -use datatypes::timestamp::TimestampMillisecond; -use datatypes::type_id::LogicalTypeId; -use datatypes::vectors::{ - TimestampMillisecondVector, TimestampMillisecondVectorBuilder, UInt64Vector, - UInt64VectorBuilder, UInt8Vector, -}; - -use super::*; -use crate::metadata::RegionMetadata; -use crate::schema::{ProjectedSchema, RegionSchemaRef}; -use crate::test_util::descriptor_util::RegionDescBuilder; - -// Schema for testing memtable: -// - key: Int64(timestamp), UInt64(version), -// - value: UInt64, UInt64 -pub fn schema_for_test() -> RegionSchemaRef { - // Just build a region desc and use its columns metadata. - let desc = RegionDescBuilder::new("test") - .push_field_column(("v0", LogicalTypeId::UInt64, true)) - .push_field_column(("v1", LogicalTypeId::UInt64, true)) - .build(); - let metadata: RegionMetadata = desc.try_into().unwrap(); - - metadata.schema().clone() -} - -fn kvs_for_test_with_index( - sequence: SequenceNumber, - op_type: OpType, - start_index_in_batch: usize, - keys: &[TimestampMillisecond], - values: &[(Option, Option)], -) -> KeyValues { - assert_eq!(keys.len(), values.len()); - - let mut key_builders = TimestampMillisecondVectorBuilder::with_capacity(keys.len()); - for key in keys { - key_builders.push(Some(*key)); - } - let ts_col = Arc::new(key_builders.finish()) as _; - - let mut value_builders = ( - UInt64VectorBuilder::with_capacity(values.len()), - UInt64VectorBuilder::with_capacity(values.len()), - ); - for value in values { - value_builders.0.push(value.0); - value_builders.1.push(value.1); - } - let row_values = vec![ - Arc::new(value_builders.0.finish()) as _, - Arc::new(value_builders.1.finish()) as _, - ]; - - let kvs = KeyValues { - sequence, - op_type, - start_index_in_batch, - keys: vec![], - values: row_values, - timestamp: Some(ts_col), - }; - - assert_eq!(keys.len(), kvs.len()); - assert_eq!(keys.is_empty(), kvs.is_empty()); - - kvs -} - -fn kvs_for_test( - sequence: SequenceNumber, - op_type: OpType, - keys: &[TimestampMillisecond], - values: &[(Option, Option)], -) -> KeyValues { - kvs_for_test_with_index(sequence, op_type, 0, keys, values) -} - -pub fn write_kvs( - memtable: &dyn Memtable, - sequence: SequenceNumber, - op_type: OpType, - keys: &[i64], - values: &[(Option, Option)], -) { - let keys: Vec = keys.iter().map(|l| ((*l).into())).collect(); - - let kvs = kvs_for_test(sequence, op_type, &keys, values); - - memtable.write(&kvs).unwrap(); -} - -fn check_batch_valid(batch: &Batch) { - assert_eq!(5, batch.num_columns()); - let row_num = batch.column(0).len(); - for i in 1..5 { - assert_eq!(row_num, batch.column(i).len()); - } -} - -fn check_iter_content( - iter: &mut dyn BatchIterator, - keys: &[i64], - sequences: &[u64], - op_types: &[OpType], - values: &[(Option, Option)], -) { - let keys: Vec = keys.iter().map(|l| (*l).into()).collect(); - - let mut index = 0; - for batch in iter { - let batch = batch.unwrap(); - check_batch_valid(&batch); - - let row_num = batch.column(0).len(); - for i in 0..row_num { - let k0 = batch.column(0).get(i); - let (v0, v1) = (batch.column(1).get(i), batch.column(2).get(i)); - let sequence = batch.column(3).get(i); - let op_type = batch.column(4).get(i); - - assert_eq!(Value::from(keys[index]), k0); - assert_eq!(Value::from(values[index].0), v0); - assert_eq!(Value::from(values[index].1), v1); - assert_eq!(Value::from(sequences[index]), sequence); - assert_eq!(Value::from(op_types[index] as u8), op_type); - - index += 1; - } - } - - assert_eq!(keys.len(), index); -} - -struct MemtableTester { - schema: RegionSchemaRef, - builders: Vec, -} - -impl Default for MemtableTester { - fn default() -> MemtableTester { - MemtableTester::new() - } -} - -impl MemtableTester { - fn new() -> MemtableTester { - let schema = schema_for_test(); - let builders = vec![Arc::new(DefaultMemtableBuilder::default()) as _]; - - MemtableTester { schema, builders } - } - - fn new_memtables(&self) -> Vec { - self.builders - .iter() - .map(|b| b.build(self.schema.clone())) - .collect() - } - - fn run_testcase(&self, testcase: F) - where - F: Fn(TestContext), - { - for memtable in self.new_memtables() { - let test_ctx = TestContext { - schema: self.schema.clone(), - memtable, - }; - - testcase(test_ctx); - } - } -} - -struct TestContext { - schema: RegionSchemaRef, - memtable: MemtableRef, -} - -fn write_iter_memtable_case(ctx: &TestContext) { - // Test iterating an empty memtable. - let mut iter = ctx.memtable.iter(IterContext::default()).unwrap(); - assert!(iter.next().is_none()); - // Poll the empty iterator again. - assert!(iter.next().is_none()); - assert_eq!(0, ctx.memtable.stats().bytes_allocated()); - - // Init test data. - write_kvs( - &*ctx.memtable, - 10, // sequence - OpType::Put, - &[1000, 1000, 2002, 2003, 2003, 1001], // keys - &[ - (Some(1), None), - (Some(2), None), - (Some(7), None), - (Some(8), None), - (Some(9), None), - (Some(3), None), - ], // values - ); - write_kvs( - &*ctx.memtable, - 11, // sequence - OpType::Put, - &[1002, 1003, 1004], // keys - &[(None, None), (Some(5), None), (None, None)], // values - ); - - // 9 key value pairs (6 + 3). - assert_eq!(576, ctx.memtable.stats().bytes_allocated()); - - let batch_sizes = [1, 4, 8, consts::READ_BATCH_SIZE]; - for batch_size in batch_sizes { - let iter_ctx = IterContext { - batch_size, - ..Default::default() - }; - let mut iter = ctx.memtable.iter(iter_ctx.clone()).unwrap(); - assert_eq!( - ctx.schema.user_schema(), - iter.schema().projected_user_schema() - ); - assert_eq!(RowOrdering::Key, iter.ordering()); - - check_iter_content( - &mut *iter, - &[1000, 1001, 1002, 1003, 1004, 2002, 2003], // keys - &[10, 10, 11, 11, 11, 10, 10], // sequences - &[ - OpType::Put, - OpType::Put, - OpType::Put, - OpType::Put, - OpType::Put, - OpType::Put, - OpType::Put, - ], // op_types - &[ - (Some(2), None), - (Some(3), None), - (None, None), - (Some(5), None), - (None, None), - (Some(7), None), - (Some(9), None), - ], // values - ); - } -} - -#[test] -fn test_iter_context_default() { - let ctx = IterContext::default(); - assert_eq!(SequenceNumber::MAX, ctx.visible_sequence); -} - -#[test] -fn test_write_iter_memtable() { - let tester = MemtableTester::default(); - tester.run_testcase(|ctx| { - write_iter_memtable_case(&ctx); - }); -} - -fn check_iter_batch_size(iter: &mut dyn BatchIterator, total: usize, batch_size: usize) { - let mut remains = total; - for batch in iter { - let batch = batch.unwrap(); - check_batch_valid(&batch); - - let row_num = batch.column(0).len(); - if remains >= batch_size { - assert_eq!(batch_size, row_num); - remains -= batch_size; - } else { - assert_eq!(remains, row_num); - remains = 0; - } - } - - assert_eq!(0, remains); -} - -#[test] -fn test_iter_batch_size() { - let tester = MemtableTester::default(); - tester.run_testcase(|ctx| { - write_kvs( - &*ctx.memtable, - 10, // sequence - OpType::Put, - &[1000, 1000, 1001, 2002, 2003, 2003], // keys - &[ - (Some(1), None), - (Some(2), None), - (Some(3), None), - (Some(4), None), - (None, None), - (None, None), - ], // values - ); - - let total = 4; - // Batch size [less than, equal to, greater than] total - let batch_sizes = [1, 6, 8]; - for batch_size in batch_sizes { - let iter_ctx = IterContext { - batch_size, - ..Default::default() - }; - - let mut iter = ctx.memtable.iter(iter_ctx.clone()).unwrap(); - check_iter_batch_size(&mut *iter, total, batch_size); - } - }); -} - -#[test] -fn test_duplicate_key_across_batch() { - let tester = MemtableTester::default(); - tester.run_testcase(|ctx| { - write_kvs( - &*ctx.memtable, - 10, // sequence - OpType::Put, - &[1000, 1001, 2000, 2001], // keys - &[(Some(1), None), (None, None), (None, None), (None, None)], // values - ); - - write_kvs( - &*ctx.memtable, - 11, // sequence - OpType::Put, - &[1000, 2001], // keys - &[(Some(1231), None), (Some(1232), None)], // values - ); - - let batch_sizes = [1, 2, 3, 4, 5]; - for batch_size in batch_sizes { - let iter_ctx = IterContext { - batch_size, - ..Default::default() - }; - - let mut iter = ctx.memtable.iter(iter_ctx.clone()).unwrap(); - check_iter_content( - &mut *iter, - &[1000, 1001, 2000, 2001], // keys - &[11, 10, 10, 11], // sequences - &[OpType::Put, OpType::Put, OpType::Put, OpType::Put], // op_types - &[ - (Some(1231), None), - (None, None), - (None, None), - (Some(1232), None), - ], // values - ); - } - }); -} - -#[test] -fn test_duplicate_key_in_batch() { - let tester = MemtableTester::default(); - tester.run_testcase(|ctx| { - write_kvs( - &*ctx.memtable, - 10, // sequence - OpType::Put, - &[1000, 1000, 1001, 2001], // keys - &[(None, None), (None, None), (Some(1234), None), (None, None)], // values - ); - - let batch_sizes = [1, 2, 3, 4, 5]; - for batch_size in batch_sizes { - let iter_ctx = IterContext { - batch_size, - ..Default::default() - }; - - let mut iter = ctx.memtable.iter(iter_ctx.clone()).unwrap(); - check_iter_content( - &mut *iter, - &[1000, 1001, 2001], // keys - &[10, 10, 10], // sequences - &[OpType::Put, OpType::Put, OpType::Put], // op_types - &[(None, None), (Some(1234), None), (None, None)], // values - ); - } - }); -} - -#[test] -fn test_sequence_visibility() { - let tester = MemtableTester::default(); - tester.run_testcase(|ctx| { - write_kvs( - &*ctx.memtable, - 10, // sequence - OpType::Put, - &[1000, 1000], // keys - &[(Some(1), None), (Some(2), None)], // values - ); - - write_kvs( - &*ctx.memtable, - 11, // sequence - OpType::Put, - &[1000, 1000], // keys - &[(Some(11), None), (Some(12), None)], // values - ); - - write_kvs( - &*ctx.memtable, - 12, // sequence - OpType::Put, - &[1000, 1000], // keys - &[(Some(21), None), (Some(22), None)], // values - ); - - { - let iter_ctx = IterContext { - batch_size: 1, - visible_sequence: 9, - projected_schema: None, - time_range: None, - }; - - let mut iter = ctx.memtable.iter(iter_ctx).unwrap(); - check_iter_content( - &mut *iter, - &[], // keys - &[], // sequences - &[], // op_types - &[], // values - ); - } - - { - let iter_ctx = IterContext { - batch_size: 1, - visible_sequence: 10, - projected_schema: None, - time_range: None, - }; - - let mut iter = ctx.memtable.iter(iter_ctx).unwrap(); - check_iter_content( - &mut *iter, - &[1000], // keys - &[10], // sequences - &[OpType::Put, OpType::Put], // op_types - &[(Some(2), None)], // values - ); - } - - { - let iter_ctx = IterContext { - batch_size: 1, - visible_sequence: 11, - projected_schema: None, - time_range: None, - }; - - let mut iter = ctx.memtable.iter(iter_ctx).unwrap(); - check_iter_content( - &mut *iter, - &[1000], // keys - &[11], // sequences - &[OpType::Put, OpType::Put], // op_types - &[(Some(12), None)], // values - ); - } - }); -} - -#[test] -fn test_iter_after_none() { - let tester = MemtableTester::default(); - tester.run_testcase(|ctx| { - write_kvs( - &*ctx.memtable, - 10, // sequence - OpType::Put, - &[1000, 1001, 1002], // keys - &[(Some(0), None), (Some(1), None), (Some(2), None)], // values - ); - - let iter_ctx = IterContext { - batch_size: 4, - ..Default::default() - }; - - let mut iter = ctx.memtable.iter(iter_ctx).unwrap(); - let _ = iter.next().unwrap(); - assert!(iter.next().is_none()); - assert!(iter.next().is_none()); - }); -} - -#[test] -fn test_filter_memtable() { - let tester = MemtableTester::default(); - tester.run_testcase(|ctx| { - write_kvs( - &*ctx.memtable, - 10, // sequence - OpType::Put, - &[1000, 1001, 1002], // keys - &[(Some(0), None), (Some(1), None), (Some(2), None)], // values - ); - - let iter_ctx = IterContext { - batch_size: 4, - time_range: Some( - TimestampRange::new( - Timestamp::new_millisecond(0), - Timestamp::new_millisecond(1001), - ) - .unwrap(), - ), - ..Default::default() - }; - - let mut iter = ctx.memtable.iter(iter_ctx).unwrap(); - let batch = iter.next().unwrap().unwrap(); - assert_eq!(5, batch.columns.len()); - assert_eq!( - Arc::new(TimestampMillisecondVector::from_slice([1000])) as Arc<_>, - batch.columns[0] - ); - }); -} - -#[test] -fn test_memtable_projection() { - let tester = MemtableTester::default(); - // Only need v0, but row key columns and internal columns would also be read. - let projected_schema = - Arc::new(ProjectedSchema::new(tester.schema.clone(), Some(vec![2])).unwrap()); - - tester.run_testcase(|ctx| { - write_kvs( - &*ctx.memtable, - 9, // sequence - OpType::Put, - &[1000, 1001, 1002], // keys - &[ - (Some(10), Some(20)), - (Some(11), Some(21)), - (Some(12), Some(22)), - ], // values - ); - - let iter_ctx = IterContext { - batch_size: 4, - projected_schema: Some(projected_schema.clone()), - ..Default::default() - }; - - let mut iter = ctx.memtable.iter(iter_ctx).unwrap(); - let batch = iter.next().unwrap().unwrap(); - assert!(iter.next().is_none()); - - assert_eq!(4, batch.num_columns()); - let k0 = Arc::new(TimestampMillisecondVector::from_slice([1000, 1001, 1002])) as VectorRef; - let v0 = Arc::new(UInt64Vector::from_slice([20, 21, 22])) as VectorRef; - let sequences = Arc::new(UInt64Vector::from_slice([9, 9, 9])) as VectorRef; - let op_types = Arc::new(UInt8Vector::from_slice([1, 1, 1])) as VectorRef; - - assert_eq!(k0, *batch.column(0)); - assert_eq!(v0, *batch.column(1)); - assert_eq!(sequences, *batch.column(2)); - assert_eq!(op_types, *batch.column(3)); - }); -} diff --git a/src/storage/src/memtable/version.rs b/src/storage/src/memtable/version.rs deleted file mode 100644 index 0efac7d627e6..000000000000 --- a/src/storage/src/memtable/version.rs +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::cmp::Ordering; - -use common_time::RangeMillis; - -use crate::memtable::{MemtableId, MemtableRef}; - -/// A version of all memtables. -/// -/// This structure is immutable now. -#[derive(Debug)] -pub struct MemtableVersion { - mutable: MemtableRef, - /// Immutable memtables. - immutables: Vec, -} - -impl MemtableVersion { - pub fn new(mutable: MemtableRef) -> MemtableVersion { - Self { - mutable, - immutables: vec![], - } - } - - #[inline] - pub fn mutable_memtable(&self) -> &MemtableRef { - &self.mutable - } - - #[inline] - pub fn immutable_memtables(&self) -> &[MemtableRef] { - &self.immutables - } - - pub fn num_memtables(&self) -> usize { - // the last `1` is for `mutable` - self.immutable_memtables().len() + 1 - } - - /// Clone current memtable version and freeze its mutable memtables, which moves - /// all mutable memtables to immutable memtable list. - /// - /// This method also calls [Memtable::mark_immutable()](crate::memtable::Memtable::mark_immutable()) to - /// mark the mutable memtable as immutable. - pub fn freeze_mutable(&self, new_mutable: MemtableRef) -> MemtableVersion { - let mut immutables = self.immutables.clone(); - // Marks the mutable memtable as immutable so it can free the memory usage from our - // soft limit. - self.mutable.mark_immutable(); - immutables.push(self.mutable.clone()); - - MemtableVersion { - mutable: new_mutable, - immutables, - } - } - - pub fn mutable_bytes_allocated(&self) -> usize { - self.mutable.stats().bytes_allocated() - } - - pub fn total_bytes_allocated(&self) -> usize { - self.immutables - .iter() - .map(|m| m.stats().bytes_allocated()) - .sum::() - + self.mutable.stats().bytes_allocated() - } - - /// Creates a new `MemtableVersion` that removes immutable memtables - /// less than or equal to max_memtable_id. - pub fn remove_immutables(&self, max_memtable_id: MemtableId) -> MemtableVersion { - let immutables = self - .immutables - .iter() - .filter(|immem| immem.id() > max_memtable_id) - .cloned() - .collect(); - - MemtableVersion { - mutable: self.mutable.clone(), - immutables, - } - } - - pub fn memtables_to_flush(&self) -> (Option, Vec) { - let max_memtable_id = self.immutables.iter().map(|immem| immem.id()).max(); - let memtables = self.immutables.clone(); - - (max_memtable_id, memtables) - } -} - -// We use a new type to order time ranges by (end, start). -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -struct RangeKey(RangeMillis); - -impl Ord for RangeKey { - fn cmp(&self, other: &RangeKey) -> Ordering { - self.0 - .end() - .cmp(other.0.end()) - .then_with(|| self.0.start().cmp(other.0.start())) - } -} - -impl PartialOrd for RangeKey { - fn partial_cmp(&self, other: &RangeKey) -> Option { - Some(self.cmp(other)) - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use super::*; - use crate::memtable::{DefaultMemtableBuilder, MemtableBuilder}; - use crate::test_util::schema_util; - - #[test] - fn test_memtable_version() { - let memtable_builder = DefaultMemtableBuilder::default(); - let region_schema = Arc::new(schema_util::new_region_schema(1, 1)); - - let memtable_1 = memtable_builder.build(region_schema.clone()); - let v1 = MemtableVersion::new(memtable_1); - assert_eq!(1, v1.num_memtables()); - - // Freeze and add new mutable. - let memtable_2 = memtable_builder.build(region_schema.clone()); - let v2 = v1.freeze_mutable(memtable_2); - let v2_immutables = v2.immutable_memtables(); - assert_eq!(1, v2_immutables.len()); - assert_eq!(0, v2_immutables[0].id()); - assert_eq!(1, v2.mutable_memtable().id()); - assert_eq!(2, v2.num_memtables()); - - // Add another one and check immutable memtables that need flush - let memtable_3 = memtable_builder.build(region_schema); - let v3 = v2.freeze_mutable(memtable_3); - let (max_table_id, immutables) = v3.memtables_to_flush(); - assert_eq!(1, max_table_id.unwrap()); - assert_eq!(2, immutables.len()); - - // Remove memtables - let v4 = v3.remove_immutables(1); - assert_eq!(1, v4.num_memtables()); - assert_eq!(0, v4.immutable_memtables().len()); - assert_eq!(2, v4.mutable_memtable().id()); - } -} diff --git a/src/storage/src/metadata.rs b/src/storage/src/metadata.rs deleted file mode 100644 index 2351674125c3..000000000000 --- a/src/storage/src/metadata.rs +++ /dev/null @@ -1,1313 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::{HashMap, HashSet}; -use std::num::ParseIntError; -use std::str::FromStr; -use std::sync::Arc; - -use common_error::ext::ErrorExt; -use common_error::status_code::StatusCode; -use common_macro::stack_trace_debug; -use datatypes::data_type::ConcreteDataType; -use datatypes::schema::{ColumnSchema, Metadata, COMMENT_KEY}; -use serde::{Deserialize, Serialize}; -use snafu::{ensure, Location, OptionExt, ResultExt, Snafu}; -use store_api::storage::consts::{self, ReservedColumnId}; -use store_api::storage::{ - AddColumn, AlterOperation, AlterRequest, ColumnDescriptor, ColumnDescriptorBuilder, - ColumnDescriptorBuilderError, ColumnFamilyDescriptor, ColumnFamilyDescriptorBuilder, - ColumnFamilyId, ColumnId, RegionDescriptor, RegionDescriptorBuilder, RegionId, RegionMeta, - RowKeyDescriptor, RowKeyDescriptorBuilder, Schema, SchemaRef, -}; - -use crate::manifest::action::{RawColumnFamiliesMetadata, RawColumnsMetadata, RawRegionMetadata}; -use crate::schema::{RegionSchema, RegionSchemaRef}; - -/// Error for handling metadata. -#[derive(Snafu)] -#[snafu(visibility(pub(crate)))] -#[stack_trace_debug] -pub enum Error { - #[snafu(display("Column name {} already exists", name))] - ColNameExists { name: String, location: Location }, - - #[snafu(display("Column family name {} already exists", name))] - CfNameExists { name: String, location: Location }, - - #[snafu(display("Column family id {} already exists", id))] - CfIdExists { id: ColumnId, location: Location }, - - #[snafu(display("Column id {} already exists", id))] - ColIdExists { id: ColumnId, location: Location }, - - #[snafu(display("Failed to build schema"))] - InvalidSchema { - location: Location, - source: datatypes::error::Error, - }, - - #[snafu(display("Column name {} is reserved by the system", name))] - ReservedColumn { name: String, location: Location }, - - #[snafu(display("Missing timestamp key column"))] - MissingTimestamp { location: Location }, - - // Variants for validating `AlterRequest`, which won't have a backtrace. - #[snafu(display("Expect altering metadata with version {}, given {}", expect, given))] - InvalidAlterVersion { - expect: VersionNumber, - given: VersionNumber, - }, - - #[snafu(display("Failed to add column as there is already a column named {}", name))] - AddExistColumn { name: String }, - - #[snafu(display("Failed to add a non null column {}", name))] - AddNonNullColumn { name: String }, - - #[snafu(display("Failed to drop column as there is no column named {}", name))] - DropAbsentColumn { name: String }, - - #[snafu(display("Failed to drop column {} as it is part of key", name))] - DropKeyColumn { name: String }, - - #[snafu(display("Failed to drop column {} as it is an internal column", name))] - DropInternalColumn { name: String }, - - // End of variants for validating `AlterRequest`. - #[snafu(display("Failed to convert to column schema"))] - ToColumnSchema { - location: Location, - source: datatypes::error::Error, - }, - - #[snafu(display("Failed to parse metadata to int, key_value: {}", key_value))] - ParseMetaInt { - // Store key and value in one string to reduce the enum size. - key_value: String, - #[snafu(source)] - error: std::num::ParseIntError, - location: Location, - }, - - #[snafu(display("Metadata of {} not found", key))] - MetaNotFound { key: String, location: Location }, - - #[snafu(display("Failed to build column descriptor"))] - BuildColumnDescriptor { - #[snafu(source)] - error: ColumnDescriptorBuilderError, - location: Location, - }, - - #[snafu(display("Failed to convert from arrow schema"))] - ConvertArrowSchema { - location: Location, - source: datatypes::error::Error, - }, - - #[snafu(display("Invalid internal column index in arrow schema"))] - InvalidIndex { location: Location }, - - #[snafu(display("Failed to convert arrow chunk to batch, name: {}", name))] - ConvertChunk { - name: String, - location: Location, - source: datatypes::error::Error, - }, - - #[snafu(display("Failed to convert schema"))] - ConvertSchema { - location: Location, - source: datatypes::error::Error, - }, - - #[snafu(display("Invalid projection, {}", msg))] - InvalidProjection { msg: String, location: Location }, -} - -pub type Result = std::result::Result; - -impl ErrorExt for Error { - fn status_code(&self) -> StatusCode { - StatusCode::InvalidArguments - } - - fn as_any(&self) -> &dyn std::any::Any { - self - } -} - -/// Implementation of [RegionMeta]. -/// -/// Holds a snapshot of region metadata. -#[derive(Clone, Debug)] -pub struct RegionMetaImpl { - metadata: RegionMetadataRef, -} - -impl RegionMetaImpl { - pub fn new(metadata: RegionMetadataRef) -> RegionMetaImpl { - RegionMetaImpl { metadata } - } -} - -impl RegionMeta for RegionMetaImpl { - fn schema(&self) -> &SchemaRef { - self.metadata.user_schema() - } - - fn version(&self) -> u32 { - self.metadata.version - } -} - -pub type VersionNumber = u32; - -// TODO(yingwen): We may need to hold a list of history schema. - -/// In memory metadata of region. -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct RegionMetadata { - // The following fields are immutable. - id: RegionId, - name: String, - - // The following fields are mutable. - /// Latest schema of the region. - schema: RegionSchemaRef, - pub columns: ColumnsMetadataRef, - column_families: ColumnFamiliesMetadata, - version: VersionNumber, -} - -impl RegionMetadata { - #[inline] - pub fn id(&self) -> RegionId { - self.id - } - - #[inline] - pub fn name(&self) -> &str { - &self.name - } - - #[inline] - pub fn schema(&self) -> &RegionSchemaRef { - &self.schema - } - - #[inline] - pub fn user_schema(&self) -> &SchemaRef { - self.schema.user_schema() - } - - #[inline] - pub fn version(&self) -> u32 { - self.schema.version() - } - - /// Checks whether the `req` is valid, returns `Err` if it is invalid. - pub fn validate_alter(&self, req: &AlterRequest) -> Result<()> { - ensure!( - req.version == self.version, - InvalidAlterVersionSnafu { - expect: req.version, - given: self.version, - } - ); - - match &req.operation { - AlterOperation::AddColumns { columns } => { - for col in columns { - self.validate_add_column(col)?; - } - } - AlterOperation::DropColumns { names } => { - for name in names { - self.validate_drop_column(name)?; - } - } - } - - Ok(()) - } - - /// Returns a new [RegionMetadata] after alteration, leave `self` unchanged. - /// - /// Caller should use [RegionMetadata::validate_alter] to validate the `req` and - /// ensure the version of the `req` is equal to the version of the metadata. - /// - /// # Panics - /// Panics if `req.version != self.version`. - pub fn alter(&self, req: &AlterRequest) -> Result { - // The req should have been validated before. - assert_eq!(req.version, self.version); - - let mut desc = self.to_descriptor(); - // Apply the alter operation to the descriptor. - req.operation.apply(&mut desc); - - RegionMetadataBuilder::try_from(desc)? - .version(self.version + 1) // Bump the metadata version. - .build() - } - - fn validate_add_column(&self, add_column: &AddColumn) -> Result<()> { - // We don't check the case that the column is not nullable but default constraint is null. The - // caller should guarantee this. - ensure!( - add_column.desc.is_nullable() || add_column.desc.default_constraint().is_some(), - AddNonNullColumnSnafu { - name: &add_column.desc.name, - } - ); - - // Use the store schema to check the column as it contains all internal columns. - let store_schema = self.schema.store_schema(); - ensure!( - !store_schema.contains_column(&add_column.desc.name), - AddExistColumnSnafu { - name: &add_column.desc.name, - } - ); - - Ok(()) - } - - fn validate_drop_column(&self, name: &str) -> Result<()> { - let store_schema = self.schema.store_schema(); - ensure!( - store_schema.contains_column(name), - DropAbsentColumnSnafu { name } - ); - ensure!( - !store_schema.is_key_column(name), - DropKeyColumnSnafu { name } - ); - ensure!( - store_schema.is_user_column(name), - DropInternalColumnSnafu { name } - ); - - Ok(()) - } - - fn to_descriptor(&self) -> RegionDescriptor { - let row_key = self.columns.to_row_key_descriptor(); - let mut builder = RegionDescriptorBuilder::default() - .id(self.id) - .name(&self.name) - .row_key(row_key); - - for (cf_id, cf) in &self.column_families.id_to_cfs { - let mut cf_builder = ColumnFamilyDescriptorBuilder::default() - .cf_id(*cf_id) - .name(&cf.name); - for column in &self.columns.columns[cf.column_index_start..cf.column_index_end] { - cf_builder = cf_builder.push_column(column.desc.clone()); - } - // It should always be able to build the descriptor back. - let desc = cf_builder.build().unwrap(); - if *cf_id == consts::DEFAULT_CF_ID { - builder = builder.default_cf(desc); - } else { - builder = builder.push_extra_column_family(desc); - } - } - - // We could ensure all fields are set here. - builder.build().unwrap() - } -} - -pub type RegionMetadataRef = Arc; - -impl From<&RegionMetadata> for RawRegionMetadata { - fn from(data: &RegionMetadata) -> RawRegionMetadata { - RawRegionMetadata { - id: data.id, - name: data.name.clone(), - columns: RawColumnsMetadata::from(&*data.columns), - column_families: RawColumnFamiliesMetadata::from(&data.column_families), - version: data.version, - } - } -} - -impl TryFrom for RegionMetadata { - type Error = Error; - - fn try_from(raw: RawRegionMetadata) -> Result { - let columns = Arc::new(ColumnsMetadata::from(raw.columns)); - let schema = Arc::new(RegionSchema::new(columns.clone(), raw.version)?); - - Ok(RegionMetadata { - id: raw.id, - name: raw.name, - schema, - columns, - column_families: raw.column_families.into(), - version: raw.version, - }) - } -} - -const METADATA_CF_ID_KEY: &str = "greptime:storage:cf_id"; -const METADATA_COLUMN_ID_KEY: &str = "greptime:storage:column_id"; - -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] -pub struct ColumnMetadata { - pub cf_id: ColumnFamilyId, - pub desc: ColumnDescriptor, -} - -impl ColumnMetadata { - #[inline] - pub fn id(&self) -> ColumnId { - self.desc.id - } - - #[inline] - pub fn name(&self) -> &str { - &self.desc.name - } - - /// Convert `self` to [`ColumnSchema`] for building a [`StoreSchema`](crate::schema::StoreSchema). This - /// would store additional metadatas to the ColumnSchema. - pub fn to_column_schema(&self) -> Result { - let desc = &self.desc; - - ColumnSchema::new(&desc.name, desc.data_type.clone(), desc.is_nullable()) - .with_metadata(self.to_metadata()) - .with_time_index(self.desc.is_time_index()) - .with_default_constraint(desc.default_constraint().cloned()) - .context(ToColumnSchemaSnafu) - } - - /// Convert [`ColumnSchema`] in [`StoreSchema`](crate::schema::StoreSchema) to [`ColumnMetadata`]. - pub fn from_column_schema(column_schema: &ColumnSchema) -> Result { - let metadata = column_schema.metadata(); - let cf_id = try_parse_int(metadata, METADATA_CF_ID_KEY, Some(consts::DEFAULT_CF_ID))?; - let column_id = try_parse_int(metadata, METADATA_COLUMN_ID_KEY, None)?; - let comment = metadata.get(COMMENT_KEY).cloned().unwrap_or_default(); - - let desc = ColumnDescriptorBuilder::new( - column_id, - &column_schema.name, - column_schema.data_type.clone(), - ) - .is_nullable(column_schema.is_nullable()) - .is_time_index(column_schema.is_time_index()) - .default_constraint(column_schema.default_constraint().cloned()) - .comment(comment) - .build() - .context(BuildColumnDescriptorSnafu)?; - - Ok(ColumnMetadata { cf_id, desc }) - } - - fn to_metadata(&self) -> Metadata { - let mut metadata = Metadata::new(); - if self.cf_id != consts::DEFAULT_CF_ID { - let _ = metadata.insert(METADATA_CF_ID_KEY.to_string(), self.cf_id.to_string()); - } - let _ = metadata.insert(METADATA_COLUMN_ID_KEY.to_string(), self.desc.id.to_string()); - if !self.desc.comment.is_empty() { - let _ = metadata.insert(COMMENT_KEY.to_string(), self.desc.comment.clone()); - } - - metadata - } -} - -fn try_parse_int(metadata: &Metadata, key: &str, default_value: Option) -> Result -where - T: FromStr, -{ - if let Some(value) = metadata.get(key) { - return value.parse().with_context(|_| ParseMetaIntSnafu { - key_value: format!("{key}={value}"), - }); - } - // No such key in metadata. - - default_value.context(MetaNotFoundSnafu { key }) -} - -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct ColumnsMetadata { - /// All columns. - /// - /// Columns are organized in the following order: - /// ```text - /// key columns, timestamp, [version,] value columns, internal columns - /// ``` - /// - /// The key columns, timestamp and version forms the row key. - columns: Vec, - /// Maps column name to index of columns, used to fast lookup column by name. - name_to_col_index: HashMap, - /// Exclusive end index of row key columns. - row_key_end: usize, - /// Index of timestamp key column. - timestamp_key_index: usize, - /// Exclusive end index of user columns. - /// - /// Columns in `[user_column_end..)` are internal columns. - user_column_end: usize, -} - -impl ColumnsMetadata { - /// Returns an iterator to all row key columns. - /// - /// Row key columns includes all key columns, the timestamp column and the - /// optional version column. - pub fn iter_row_key_columns(&self) -> impl Iterator { - self.columns.iter().take(self.row_key_end) - } - - /// Returns an iterator to all value columns (internal columns are excluded). - pub fn iter_field_columns(&self) -> impl Iterator { - self.columns[self.row_key_end..self.user_column_end].iter() - } - - pub fn iter_user_columns(&self) -> impl Iterator { - self.columns.iter().take(self.user_column_end) - } - - #[inline] - pub fn columns(&self) -> &[ColumnMetadata] { - &self.columns - } - - #[inline] - pub fn num_row_key_columns(&self) -> usize { - self.row_key_end - } - - #[inline] - pub fn num_field_columns(&self) -> usize { - self.user_column_end - self.row_key_end - } - - #[inline] - pub fn timestamp_key_index(&self) -> usize { - self.timestamp_key_index - } - - #[inline] - pub fn row_key_end(&self) -> usize { - self.row_key_end - } - - #[inline] - pub fn user_column_end(&self) -> usize { - self.user_column_end - } - - #[inline] - pub fn column_metadata(&self, idx: usize) -> &ColumnMetadata { - &self.columns[idx] - } - - fn to_row_key_descriptor(&self) -> RowKeyDescriptor { - let mut builder = RowKeyDescriptorBuilder::default(); - for (idx, column) in self.iter_row_key_columns().enumerate() { - // Not a timestamp column. - if idx != self.timestamp_key_index { - builder = builder.push_column(column.desc.clone()); - } - } - builder = builder.timestamp(self.column_metadata(self.timestamp_key_index).desc.clone()); - // Since the metadata is built from descriptor, so it should always be able to build the descriptor back. - builder.build().unwrap() - } -} - -pub type ColumnsMetadataRef = Arc; - -impl From<&ColumnsMetadata> for RawColumnsMetadata { - fn from(data: &ColumnsMetadata) -> RawColumnsMetadata { - RawColumnsMetadata { - columns: data.columns.clone(), - row_key_end: data.row_key_end, - timestamp_key_index: data.timestamp_key_index, - user_column_end: data.user_column_end, - } - } -} - -impl From for ColumnsMetadata { - fn from(raw: RawColumnsMetadata) -> ColumnsMetadata { - let name_to_col_index = raw - .columns - .iter() - .enumerate() - .map(|(i, col)| (col.desc.name.clone(), i)) - .collect(); - - ColumnsMetadata { - columns: raw.columns, - name_to_col_index, - row_key_end: raw.row_key_end, - timestamp_key_index: raw.timestamp_key_index, - user_column_end: raw.user_column_end, - } - } -} - -#[derive(Clone, Debug, PartialEq, Eq)] -pub struct ColumnFamiliesMetadata { - /// Map column family id to column family metadata. - id_to_cfs: HashMap, -} - -impl ColumnFamiliesMetadata { - pub fn cf_by_id(&self, cf_id: ColumnFamilyId) -> Option<&ColumnFamilyMetadata> { - self.id_to_cfs.get(&cf_id) - } -} - -impl From<&ColumnFamiliesMetadata> for RawColumnFamiliesMetadata { - fn from(data: &ColumnFamiliesMetadata) -> RawColumnFamiliesMetadata { - let column_families = data.id_to_cfs.values().cloned().collect(); - RawColumnFamiliesMetadata { column_families } - } -} - -impl From for ColumnFamiliesMetadata { - fn from(raw: RawColumnFamiliesMetadata) -> ColumnFamiliesMetadata { - let id_to_cfs = raw - .column_families - .into_iter() - .map(|cf| (cf.cf_id, cf)) - .collect(); - ColumnFamiliesMetadata { id_to_cfs } - } -} - -#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] -pub struct ColumnFamilyMetadata { - /// Column family name. - pub name: String, - pub cf_id: ColumnFamilyId, - /// Inclusive start index of columns in the column family. - pub column_index_start: usize, - /// Exclusive end index of columns in the column family. - pub column_index_end: usize, -} - -impl TryFrom for RegionMetadataBuilder { - type Error = Error; - - fn try_from(desc: RegionDescriptor) -> Result { - let mut builder = RegionMetadataBuilder::new() - .name(desc.name) - .id(desc.id) - .row_key(desc.row_key)? - .add_column_family(desc.default_cf)?; - for cf in desc.extra_cfs { - builder = builder.add_column_family(cf)?; - } - - Ok(builder) - } -} - -impl TryFrom for RegionMetadata { - type Error = Error; - - fn try_from(desc: RegionDescriptor) -> Result { - // Doesn't set version explicitly here, because this is a new region meta - // created from descriptor, using initial version is reasonable. - let builder = RegionMetadataBuilder::try_from(desc)?; - - builder.build() - } -} - -#[derive(Default)] -struct ColumnsMetadataBuilder { - columns: Vec, - name_to_col_index: HashMap, - /// Column id set, used to validate column id uniqueness. - column_ids: HashSet, - - // Row key metadata: - row_key_end: usize, - timestamp_key_index: Option, -} - -impl ColumnsMetadataBuilder { - fn row_key(&mut self, key: RowKeyDescriptor) -> Result<&mut Self> { - for col in key.columns { - let _ = self.push_row_key_column(col)?; - } - - // TODO(yingwen): Validate this is a timestamp column. - self.timestamp_key_index = Some(self.columns.len()); - let _ = self.push_row_key_column(key.timestamp)?; - self.row_key_end = self.columns.len(); - - Ok(self) - } - - fn push_row_key_column(&mut self, desc: ColumnDescriptor) -> Result<&mut Self> { - self.push_field_column(consts::KEY_CF_ID, desc) - } - - fn push_field_column( - &mut self, - cf_id: ColumnFamilyId, - desc: ColumnDescriptor, - ) -> Result<&mut Self> { - ensure!( - !is_internal_field_column(&desc.name), - ReservedColumnSnafu { name: &desc.name } - ); - - self.push_new_column(cf_id, desc) - } - - fn push_new_column( - &mut self, - cf_id: ColumnFamilyId, - desc: ColumnDescriptor, - ) -> Result<&mut Self> { - ensure!( - !self.name_to_col_index.contains_key(&desc.name), - ColNameExistsSnafu { name: &desc.name } - ); - ensure!( - !self.column_ids.contains(&desc.id), - ColIdExistsSnafu { id: desc.id } - ); - - let column_name = desc.name.clone(); - let column_id = desc.id; - let meta = ColumnMetadata { cf_id, desc }; - - let column_index = self.columns.len(); - self.columns.push(meta); - let _ = self.name_to_col_index.insert(column_name, column_index); - let _ = self.column_ids.insert(column_id); - - Ok(self) - } - - fn build(mut self) -> Result { - let timestamp_key_index = self.timestamp_key_index.context(MissingTimestampSnafu)?; - - let user_column_end = self.columns.len(); - // Setup internal columns. - for internal_desc in internal_column_descs() { - let _ = self.push_new_column(consts::DEFAULT_CF_ID, internal_desc)?; - } - - Ok(ColumnsMetadata { - columns: self.columns, - name_to_col_index: self.name_to_col_index, - row_key_end: self.row_key_end, - timestamp_key_index, - user_column_end, - }) - } -} - -#[derive(Default)] -struct ColumnFamiliesMetadataBuilder { - id_to_cfs: HashMap, - cf_names: HashSet, -} - -impl ColumnFamiliesMetadataBuilder { - fn add_column_family(&mut self, cf: ColumnFamilyMetadata) -> Result<&mut Self> { - ensure!( - !self.id_to_cfs.contains_key(&cf.cf_id), - CfIdExistsSnafu { id: cf.cf_id } - ); - - ensure!( - !self.cf_names.contains(&cf.name), - CfNameExistsSnafu { name: &cf.name } - ); - - let _ = self.cf_names.insert(cf.name.clone()); - let _ = self.id_to_cfs.insert(cf.cf_id, cf); - - Ok(self) - } - - fn build(self) -> ColumnFamiliesMetadata { - ColumnFamiliesMetadata { - id_to_cfs: self.id_to_cfs, - } - } -} - -struct RegionMetadataBuilder { - id: RegionId, - name: String, - columns_meta_builder: ColumnsMetadataBuilder, - cfs_meta_builder: ColumnFamiliesMetadataBuilder, - version: VersionNumber, -} - -impl Default for RegionMetadataBuilder { - fn default() -> RegionMetadataBuilder { - RegionMetadataBuilder::new() - } -} - -impl RegionMetadataBuilder { - fn new() -> RegionMetadataBuilder { - RegionMetadataBuilder { - id: 0.into(), - name: String::new(), - columns_meta_builder: ColumnsMetadataBuilder::default(), - cfs_meta_builder: ColumnFamiliesMetadataBuilder::default(), - version: Schema::INITIAL_VERSION, - } - } - - fn name(mut self, name: impl Into) -> Self { - self.name = name.into(); - self - } - - fn id(mut self, id: RegionId) -> Self { - self.id = id; - self - } - - fn version(mut self, version: VersionNumber) -> Self { - self.version = version; - self - } - - fn row_key(mut self, key: RowKeyDescriptor) -> Result { - let _ = self.columns_meta_builder.row_key(key)?; - - Ok(self) - } - - fn add_column_family(mut self, cf: ColumnFamilyDescriptor) -> Result { - let column_index_start = self.columns_meta_builder.columns.len(); - let column_index_end = column_index_start + cf.columns.len(); - let cf_meta = ColumnFamilyMetadata { - name: cf.name.clone(), - cf_id: cf.cf_id, - column_index_start, - column_index_end, - }; - - let _ = self.cfs_meta_builder.add_column_family(cf_meta)?; - - for col in cf.columns { - let _ = self.columns_meta_builder.push_field_column(cf.cf_id, col)?; - } - - Ok(self) - } - - fn build(self) -> Result { - let columns = Arc::new(self.columns_meta_builder.build()?); - let schema = Arc::new(RegionSchema::new(columns.clone(), self.version)?); - - Ok(RegionMetadata { - id: self.id, - name: self.name, - schema, - columns, - column_families: self.cfs_meta_builder.build(), - version: self.version, - }) - } -} - -fn internal_column_descs() -> [ColumnDescriptor; 2] { - [ - ColumnDescriptorBuilder::new( - ReservedColumnId::sequence(), - consts::SEQUENCE_COLUMN_NAME.to_string(), - ConcreteDataType::uint64_datatype(), - ) - .is_nullable(false) - .build() - .unwrap(), - ColumnDescriptorBuilder::new( - ReservedColumnId::op_type(), - consts::OP_TYPE_COLUMN_NAME.to_string(), - ConcreteDataType::uint8_datatype(), - ) - .is_nullable(false) - .build() - .unwrap(), - ] -} - -/// Returns true if this is an internal column for value column. -#[inline] -fn is_internal_field_column(column_name: &str) -> bool { - matches!( - column_name, - consts::SEQUENCE_COLUMN_NAME | consts::OP_TYPE_COLUMN_NAME - ) -} - -#[cfg(test)] -mod tests { - use datatypes::schema::ColumnDefaultConstraint; - use datatypes::type_id::LogicalTypeId; - use datatypes::value::Value; - use store_api::storage::{ - AddColumn, AlterOperation, ColumnDescriptorBuilder, ColumnFamilyDescriptorBuilder, - RowKeyDescriptorBuilder, - }; - - use super::*; - use crate::test_util::descriptor_util::RegionDescBuilder; - use crate::test_util::schema_util; - - const TEST_REGION: &str = "test-region"; - - #[test] - fn test_descriptor_to_region_metadata() { - let region_name = "region-0"; - let desc = RegionDescBuilder::new(region_name) - .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false)) - .push_key_column(("k1", LogicalTypeId::Int32, false)) - .push_field_column(("v1", LogicalTypeId::Float32, true)) - .build(); - - let expect_schema = schema_util::new_schema_ref( - &[ - ("k1", LogicalTypeId::Int32, false), - ("ts", LogicalTypeId::TimestampMillisecond, false), - ("v1", LogicalTypeId::Float32, true), - ], - Some(1), - ); - - let metadata = RegionMetadata::try_from(desc).unwrap(); - assert_eq!(region_name, metadata.name); - assert_eq!(expect_schema, *metadata.user_schema()); - assert_eq!(2, metadata.columns.num_row_key_columns()); - assert_eq!(1, metadata.columns.num_field_columns()); - } - - #[test] - fn test_build_empty_region_metadata() { - let err = RegionMetadataBuilder::default().build().err().unwrap(); - assert!(matches!(err, Error::MissingTimestamp { .. })); - } - - #[test] - fn test_build_metadata_duplicate_name() { - let cf = ColumnFamilyDescriptorBuilder::default() - .push_column( - ColumnDescriptorBuilder::new(4, "v1", ConcreteDataType::int64_datatype()) - .build() - .unwrap(), - ) - .push_column( - ColumnDescriptorBuilder::new(5, "v1", ConcreteDataType::int64_datatype()) - .build() - .unwrap(), - ) - .build() - .unwrap(); - let err = RegionMetadataBuilder::new() - .add_column_family(cf) - .err() - .unwrap(); - assert!(matches!(err, Error::ColNameExists { .. })); - } - - #[test] - fn test_build_metadata_internal_name() { - let names = [consts::SEQUENCE_COLUMN_NAME, consts::OP_TYPE_COLUMN_NAME]; - for name in names { - let cf = ColumnFamilyDescriptorBuilder::default() - .push_column( - ColumnDescriptorBuilder::new(5, name, ConcreteDataType::int64_datatype()) - .build() - .unwrap(), - ) - .build() - .unwrap(); - let err = RegionMetadataBuilder::new() - .add_column_family(cf) - .err() - .unwrap(); - assert!(matches!(err, Error::ReservedColumn { .. })); - } - } - - #[test] - fn test_build_metadata_duplicate_id() { - let cf = ColumnFamilyDescriptorBuilder::default() - .push_column( - ColumnDescriptorBuilder::new(4, "v1", ConcreteDataType::int64_datatype()) - .build() - .unwrap(), - ) - .push_column( - ColumnDescriptorBuilder::new(4, "v2", ConcreteDataType::int64_datatype()) - .build() - .unwrap(), - ) - .build() - .unwrap(); - let err = RegionMetadataBuilder::new() - .add_column_family(cf) - .err() - .unwrap(); - assert!(matches!(err, Error::ColIdExists { .. })); - - let timestamp = ColumnDescriptorBuilder::new(2, "ts", ConcreteDataType::int64_datatype()) - .is_nullable(false) - .is_time_index(true) - .build() - .unwrap(); - let row_key = RowKeyDescriptorBuilder::new(timestamp) - .push_column( - ColumnDescriptorBuilder::new(2, "k1", ConcreteDataType::int64_datatype()) - .is_nullable(false) - .build() - .unwrap(), - ) - .build() - .unwrap(); - let err = RegionMetadataBuilder::new().row_key(row_key).err().unwrap(); - assert!(matches!(err, Error::ColIdExists { .. })); - } - - fn new_metadata() -> RegionMetadata { - let timestamp = ColumnDescriptorBuilder::new( - 2, - "ts", - ConcreteDataType::timestamp_millisecond_datatype(), - ) - .is_nullable(false) - .is_time_index(true) - .build() - .unwrap(); - let row_key = RowKeyDescriptorBuilder::new(timestamp) - .push_column( - ColumnDescriptorBuilder::new(3, "k1", ConcreteDataType::int64_datatype()) - .is_nullable(false) - .build() - .unwrap(), - ) - .build() - .unwrap(); - let cf = ColumnFamilyDescriptorBuilder::default() - .push_column( - ColumnDescriptorBuilder::new(4, "v1", ConcreteDataType::int64_datatype()) - .build() - .unwrap(), - ) - .build() - .unwrap(); - RegionMetadataBuilder::new() - .name(TEST_REGION) - .row_key(row_key) - .unwrap() - .add_column_family(cf) - .unwrap() - .build() - .unwrap() - } - - #[test] - fn test_build_metedata_disable_version() { - let metadata = new_metadata(); - assert_eq!(TEST_REGION, metadata.name); - - let expect_schema = schema_util::new_schema_ref( - &[ - ("k1", LogicalTypeId::Int64, false), - ("ts", LogicalTypeId::TimestampMillisecond, false), - ("v1", LogicalTypeId::Int64, true), - ], - Some(1), - ); - - assert_eq!(expect_schema, *metadata.user_schema()); - - // 3 user columns and 2 internal columns - assert_eq!(5, metadata.columns.columns.len()); - // 2 row key columns - assert_eq!(2, metadata.columns.num_row_key_columns()); - let row_key_names: Vec<_> = metadata - .columns - .iter_row_key_columns() - .map(|column| &column.desc.name) - .collect(); - assert_eq!(["k1", "ts"], &row_key_names[..]); - // 1 value column - assert_eq!(1, metadata.columns.num_field_columns()); - let value_names: Vec<_> = metadata - .columns - .iter_field_columns() - .map(|column| &column.desc.name) - .collect(); - assert_eq!(["v1"], &value_names[..]); - // Check timestamp index. - assert_eq!(1, metadata.columns.timestamp_key_index); - - assert!(metadata - .column_families - .cf_by_id(consts::DEFAULT_CF_ID) - .is_some()); - - assert_eq!(0, metadata.version); - } - - #[test] - fn test_convert_between_raw() { - let metadata = new_metadata(); - let raw = RawRegionMetadata::from(&metadata); - - let converted = RegionMetadata::try_from(raw).unwrap(); - assert_eq!(metadata, converted); - } - - #[test] - fn test_alter_metadata_add_columns() { - let region_name = "region-0"; - let builder = RegionDescBuilder::new(region_name) - .push_key_column(("k1", LogicalTypeId::Int32, false)) - .push_field_column(("v1", LogicalTypeId::Float32, true)); - let last_column_id = builder.last_column_id(); - let metadata: RegionMetadata = builder.build().try_into().unwrap(); - - let req = AlterRequest { - operation: AlterOperation::AddColumns { - columns: vec![ - AddColumn { - desc: ColumnDescriptorBuilder::new( - last_column_id + 1, - "k2", - ConcreteDataType::int32_datatype(), - ) - .build() - .unwrap(), - is_key: true, - }, - AddColumn { - desc: ColumnDescriptorBuilder::new( - last_column_id + 2, - "v2", - ConcreteDataType::float32_datatype(), - ) - .build() - .unwrap(), - is_key: false, - }, - ], - }, - version: 0, - }; - metadata.validate_alter(&req).unwrap(); - let metadata = metadata.alter(&req).unwrap(); - - let builder: RegionMetadataBuilder = RegionDescBuilder::new(region_name) - .push_key_column(("k1", LogicalTypeId::Int32, false)) - .push_field_column(("v1", LogicalTypeId::Float32, true)) - .push_key_column(("k2", LogicalTypeId::Int32, true)) - .push_field_column(("v2", LogicalTypeId::Float32, true)) - .build() - .try_into() - .unwrap(); - let expect = builder.version(1).build().unwrap(); - assert_eq!(expect, metadata); - } - - #[test] - fn test_alter_metadata_drop_columns() { - let region_name = "region-0"; - let metadata: RegionMetadata = RegionDescBuilder::new(region_name) - .push_key_column(("k1", LogicalTypeId::Int32, false)) - .push_key_column(("k2", LogicalTypeId::Int32, false)) - .push_field_column(("v1", LogicalTypeId::Float32, true)) - .push_field_column(("v2", LogicalTypeId::Float32, true)) - .build() - .try_into() - .unwrap(); - - let req = AlterRequest { - operation: AlterOperation::DropColumns { - names: vec![ - String::from("k1"), // k1 would be ignored. - String::from("v1"), - ], - }, - version: 0, - }; - let metadata = metadata.alter(&req).unwrap(); - - let builder = RegionDescBuilder::new(region_name) - .push_key_column(("k1", LogicalTypeId::Int32, false)) - .push_key_column(("k2", LogicalTypeId::Int32, false)); - let last_column_id = builder.last_column_id() + 1; - let builder: RegionMetadataBuilder = builder - .set_last_column_id(last_column_id) // This id is reserved for v1 - .push_field_column(("v2", LogicalTypeId::Float32, true)) - .build() - .try_into() - .unwrap(); - let expect = builder.version(1).build().unwrap(); - assert_eq!(expect, metadata); - } - - #[test] - fn test_validate_alter_request() { - let builder = RegionDescBuilder::new("region-alter") - .timestamp(("ts", LogicalTypeId::TimestampMillisecond, false)) - .push_key_column(("k0", LogicalTypeId::Int32, false)) - .push_field_column(("v0", LogicalTypeId::Float32, true)) - .push_field_column(("v1", LogicalTypeId::Float32, true)); - let last_column_id = builder.last_column_id(); - let metadata: RegionMetadata = builder.build().try_into().unwrap(); - - // Test request with different version. - let mut req = AlterRequest { - operation: AlterOperation::AddColumns { - columns: vec![AddColumn { - desc: ColumnDescriptorBuilder::new( - last_column_id + 1, - "k2", - ConcreteDataType::int32_datatype(), - ) - .build() - .unwrap(), - is_key: true, - }], - }, - version: 1, - }; - assert!(matches!( - metadata.validate_alter(&req).err().unwrap(), - Error::InvalidAlterVersion { .. } - )); - req.version = 0; - - // Add existing column. - req.operation = AlterOperation::AddColumns { - columns: vec![AddColumn { - desc: ColumnDescriptorBuilder::new( - last_column_id + 1, - "ts", - ConcreteDataType::int32_datatype(), - ) - .build() - .unwrap(), - is_key: false, - }], - }; - assert!(matches!( - metadata.validate_alter(&req).err().unwrap(), - Error::AddExistColumn { .. } - )); - - // Add non null column. - req.operation = AlterOperation::AddColumns { - columns: vec![AddColumn { - desc: ColumnDescriptorBuilder::new( - last_column_id + 1, - "v2", - ConcreteDataType::int32_datatype(), - ) - .is_nullable(false) - .build() - .unwrap(), - is_key: false, - }], - }; - assert!(matches!( - metadata.validate_alter(&req).err().unwrap(), - Error::AddNonNullColumn { .. } - )); - - // Drop absent column. - let mut req = AlterRequest { - operation: AlterOperation::DropColumns { - names: vec![String::from("v2")], - }, - version: 0, - }; - assert!(matches!( - metadata.validate_alter(&req).err().unwrap(), - Error::DropAbsentColumn { .. } - )); - - // Drop key column. - req.operation = AlterOperation::DropColumns { - names: vec![String::from("ts")], - }; - assert!(matches!( - metadata.validate_alter(&req).err().unwrap(), - Error::DropKeyColumn { .. } - )); - req.operation = AlterOperation::DropColumns { - names: vec![String::from("k0")], - }; - assert!(matches!( - metadata.validate_alter(&req).err().unwrap(), - Error::DropKeyColumn { .. } - )); - - // Drop internal column. - req.operation = AlterOperation::DropColumns { - names: vec![String::from(consts::SEQUENCE_COLUMN_NAME)], - }; - assert!(matches!( - metadata.validate_alter(&req).err().unwrap(), - Error::DropInternalColumn { .. } - )); - - // Valid request - req.operation = AlterOperation::DropColumns { - names: vec![String::from("v0")], - }; - metadata.validate_alter(&req).unwrap(); - } - - #[test] - fn test_column_metadata_conversion() { - let desc = ColumnDescriptorBuilder::new(123, "test", ConcreteDataType::int32_datatype()) - .is_nullable(false) - .default_constraint(Some(ColumnDefaultConstraint::Value(Value::Int32(321)))) - .comment("hello") - .build() - .unwrap(); - - let meta = ColumnMetadata { - cf_id: consts::DEFAULT_CF_ID, - desc: desc.clone(), - }; - let column_schema = meta.to_column_schema().unwrap(); - let new_meta = ColumnMetadata::from_column_schema(&column_schema).unwrap(); - assert_eq!(meta, new_meta); - - let meta = ColumnMetadata { cf_id: 567, desc }; - let column_schema = meta.to_column_schema().unwrap(); - let new_meta = ColumnMetadata::from_column_schema(&column_schema).unwrap(); - assert_eq!(meta, new_meta); - } -} diff --git a/src/storage/src/metrics.rs b/src/storage/src/metrics.rs deleted file mode 100644 index 605e0b492709..000000000000 --- a/src/storage/src/metrics.rs +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! storage metrics - -use lazy_static::lazy_static; -use prometheus::*; - -/// Reason to flush. -pub const FLUSH_REASON: &str = "reason"; - -lazy_static! { - /// Elapsed time of updating manifest when creating regions. - pub static ref CREATE_REGION_UPDATE_MANIFEST: Histogram = - register_histogram!("storage_create_region_update_manifest", "storage create region update manifest").unwrap(); - /// Counter of scheduled flush requests. - pub static ref FLUSH_REQUESTS_TOTAL: IntCounterVec = - register_int_counter_vec!("storage_flush_requests_total", "storage flush requests total", &[FLUSH_REASON]).unwrap(); - /// Counter of scheduled failed flush jobs. - pub static ref FLUSH_ERRORS_TOTAL: IntCounter = - register_int_counter!("storage_flush_errors_total", "storage flush errors total").unwrap(); - //// Elapsed time of a flush job. - pub static ref FLUSH_ELAPSED: Histogram = - register_histogram!("storage_flush_elapsed", "storage flush elapsed").unwrap(); - /// Counter of flushed bytes. - pub static ref FLUSH_BYTES_TOTAL: IntCounter = - register_int_counter!("storage_flush_bytes_total", "storage flush bytes total").unwrap(); - /// Gauge for open regions - pub static ref REGION_COUNT: IntGauge = - register_int_gauge!("storage_region_count", "storage region count").unwrap(); - /// Timer for logstore write - pub static ref LOG_STORE_WRITE_ELAPSED: Histogram = - register_histogram!("storage_logstore_write_elapsed", "storage logstore write elapsed").unwrap(); - /// Elapsed time of a compact job. - pub static ref COMPACT_ELAPSED: Histogram = - register_histogram!("storage_compact_elapsed", "storage compact elapsed").unwrap(); - /// Elapsed time for merging SST files. - pub static ref MERGE_ELAPSED: Histogram = - register_histogram!("storage_compaction_merge_elapsed", "storage compaction merge elapsed").unwrap(); - /// Global write buffer size in bytes. - pub static ref WRITE_BUFFER_BYTES: IntGauge = - register_int_gauge!("storage_write_buffer_bytes", "storage write buffer bytes").unwrap(); - /// Elapsed time of inserting memtable. - pub static ref MEMTABLE_WRITE_ELAPSED: Histogram = - register_histogram!("storage_memtable_write_elapsed", "storage memtable write elapsed").unwrap(); - /// Elapsed time of preprocessing write batch. - pub static ref PREPROCESS_ELAPSED: Histogram = - register_histogram!("storage_write_preprocess_elapsed", "storage write preprocess elapsed").unwrap(); - /// Elapsed time for windowed scan - pub static ref WINDOW_SCAN_ELAPSED: Histogram = - register_histogram!("query_scan_window_scan_elapsed", "query scan window scan elapsed").unwrap(); - /// Rows per window during window scan - pub static ref WINDOW_SCAN_ROWS_PER_WINDOW: Histogram = - register_histogram!("query_scan_window_scan_window_row_size", "query scan window scan window row size").unwrap(); -} diff --git a/src/storage/src/proto.rs b/src/storage/src/proto.rs deleted file mode 100644 index 7f85132d1835..000000000000 --- a/src/storage/src/proto.rs +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub mod wal; diff --git a/src/storage/src/proto/wal.rs b/src/storage/src/proto/wal.rs deleted file mode 100644 index c07fd5e9f091..000000000000 --- a/src/storage/src/proto/wal.rs +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#![allow(clippy::all)] -tonic::include_proto!("greptime.storage.wal.v1"); - -use api::v1::OpType; - -use crate::write_batch::Payload; - -pub fn gen_mutation_types(payload: &Payload) -> Vec { - payload - .mutations - .iter() - .map(|m| match m.op_type { - OpType::Delete => MutationType::Delete.into(), - OpType::Put => MutationType::Put.into(), - }) - .collect::>() -} - -impl WalHeader { - pub fn with_last_manifest_version(last_manifest_version: u64) -> Self { - Self { - last_manifest_version, - ..Default::default() - } - } -} diff --git a/src/storage/src/read.rs b/src/storage/src/read.rs deleted file mode 100644 index fb8710c05fd1..000000000000 --- a/src/storage/src/read.rs +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Common structs and utilities for read. - -mod chain; -mod dedup; -mod merge; -mod windowed; - -use std::cmp::Ordering; - -use async_trait::async_trait; -use common_base::BitVec; -use datatypes::data_type::DataType; -use datatypes::prelude::ConcreteDataType; -use datatypes::vectors::{BooleanVector, MutableVector, VectorRef}; -use snafu::{ensure, ResultExt}; - -use crate::error::{self, Result}; -pub use crate::read::chain::ChainReader; -pub use crate::read::dedup::DedupReader; -pub use crate::read::merge::{MergeReader, MergeReaderBuilder}; -pub use crate::read::windowed::WindowedReader; - -/// Storage internal representation of a batch of rows. -// Now the structure of `Batch` is still unstable, all pub fields may be changed. -#[derive(Debug, Default, PartialEq, Eq, Clone)] -pub struct Batch { - /// Rows organized in columnar format. - /// - /// Columns follow the same order convention of region schema: - /// key, value, internal columns. - pub columns: Vec, -} - -impl Batch { - /// Create a new `Batch` from `columns`. - /// - /// # Panics - /// Panics if vectors in `columns` have different length. - pub fn new(columns: Vec) -> Batch { - Self::assert_columns(&columns); - - Batch { columns } - } - - #[inline] - pub fn num_columns(&self) -> usize { - self.columns.len() - } - - #[inline] - pub fn num_rows(&self) -> usize { - self.columns.get(0).map(|v| v.len()).unwrap_or(0) - } - - #[inline] - pub fn is_empty(&self) -> bool { - self.num_rows() == 0 - } - - #[inline] - pub fn columns(&self) -> &[VectorRef] { - &self.columns - } - - #[inline] - pub fn column(&self, idx: usize) -> &VectorRef { - &self.columns[idx] - } - - /// Slice the batch, returning a new batch. - /// - /// # Panics - /// Panics if `offset + length > self.num_rows()`. - fn slice(&self, offset: usize, length: usize) -> Batch { - let columns = self - .columns - .iter() - .map(|v| v.slice(offset, length)) - .collect(); - Batch { columns } - } - - fn assert_columns(columns: &[VectorRef]) { - if columns.is_empty() { - return; - } - - let length = columns[0].len(); - assert!(columns.iter().all(|col| col.len() == length)); - } -} - -/// Compute operations for Batch. -pub trait BatchOp { - /// Compare `i-th` in `left` to `j-th` row in `right` by key (row key + internal columns). - /// - /// The caller should ensure `left` and `right` have same schema as `self`. - /// - /// # Panics - /// Panics if - /// - `i` or `j` is out of bound. - /// - `left` or `right` has insufficient column num. - fn compare_row(&self, left: &Batch, i: usize, right: &Batch, j: usize) -> Ordering; - - /// Find unique rows in `batch` by row key. - /// - /// If `prev` is `Some` and not empty, the last row of `prev` would be used to dedup - /// current `batch`. Set `i-th` bit of `selected` to `true` if `i-th` row is unique, - /// which means the row key of `i-th` row is different from `i+1-th`'s. - /// - /// The caller could use `selected` to build a [BooleanVector] to filter the - /// batch, and must ensure `selected` is initialized by filling `batch.num_rows()` bits - /// to zero. - /// - /// # Panics - /// Panics if - /// - `batch` and `prev` have different number of columns (unless `prev` is - /// empty). - /// - `selected.len()` is less than the number of rows. - fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>); - - /// Filters the `batch`, returns elements matching the `filter` (i.e. where the values - /// are true). - /// - /// Note that the nulls of `filter` are interpreted as `false` will lead to these elements - /// being masked out. - fn filter(&self, batch: &Batch, filter: &BooleanVector) -> Result; - - /// Unselect deleted rows according to the [`OpType`](api::v1::OpType). - /// - /// # Panics - /// Panics if - /// - `batch` doesn't have a valid op type column. - /// - `selected.len()` is less than the number of rows. - fn unselect_deleted(&self, batch: &Batch, selected: &mut BitVec); -} - -/// Reusable [Batch] builder. -pub struct BatchBuilder { - builders: Vec>, -} - -impl BatchBuilder { - /// Create a new `BatchBuilder` from data types with given `capacity`. - /// - /// # Panics - /// Panics if `types` is empty. - pub fn with_capacity<'a, I>(types: I, capacity: usize) -> BatchBuilder - where - I: IntoIterator, - { - let builders: Vec<_> = types - .into_iter() - .map(|t| t.create_mutable_vector(capacity)) - .collect(); - assert!(!builders.is_empty()); - - BatchBuilder { builders } - } - - /// Returns number of rows already in this builder. - #[inline] - pub fn num_rows(&self) -> usize { - self.builders[0].len() - } - - /// Returns true if no rows in this builder. - #[inline] - pub fn is_empty(&self) -> bool { - self.num_rows() == 0 - } - - /// Extend the builder by slice of batch. - /// - /// # Panics - /// Panics if - /// - `offset + length > batch.num_rows()`. - /// - Number of columns in `batch` is not equal to the builder's. - pub fn extend_slice_of(&mut self, batch: &Batch, offset: usize, length: usize) -> Result<()> { - assert_eq!(self.builders.len(), batch.num_columns()); - - for (builder, column) in self.builders.iter_mut().zip(batch.columns()) { - builder - .extend_slice_of(&**column, offset, length) - .context(error::PushBatchSnafu)?; - } - - Ok(()) - } - - /// Push `i-th` row of batch into the builder. - /// - /// # Panics - /// Panics if - /// - `i` is out of bound. - /// - Number of columns in `batch` is not equal to the builder's. - pub fn push_row_of(&mut self, batch: &Batch, i: usize) -> Result<()> { - assert_eq!(self.builders.len(), batch.num_columns()); - - for (builder, column) in self.builders.iter_mut().zip(batch.columns()) { - let value = column.get_ref(i); - builder - .try_push_value_ref(value) - .context(error::PushBatchSnafu)?; - } - - Ok(()) - } - - /// Create a new [Batch] and reset this builder. - pub fn build(&mut self) -> Result { - // Checks length of each builder. - let rows = self.num_rows(); - for (i, builder) in self.builders.iter().enumerate() { - ensure!( - rows == builder.len(), - error::BuildBatchSnafu { - msg: format!( - "expect row num {} but builder {} has {}", - rows, - i, - builder.len() - ), - } - ); - } - - let columns = self.builders.iter_mut().map(|b| b.to_vector()).collect(); - - Ok(Batch { columns }) - } -} - -/// Async batch reader. -#[async_trait] -pub trait BatchReader: Send { - // TODO(yingwen): Schema of batch. - - /// Fetch next [Batch]. - /// - /// Returns `Ok(None)` when the reader has reached its end and calling `next_batch()` - /// again won't return batch again. - /// - /// If `Err` is returned, caller **must** not call this method again, the implementor - /// may or may not panic in such case. - async fn next_batch(&mut self) -> Result>; -} - -/// Pointer to [BatchReader]. -pub type BoxedBatchReader = Box; - -#[async_trait::async_trait] -impl BatchReader for Box { - async fn next_batch(&mut self) -> Result> { - (**self).next_batch().await - } -} diff --git a/src/storage/src/read/chain.rs b/src/storage/src/read/chain.rs deleted file mode 100644 index 5701682325ee..000000000000 --- a/src/storage/src/read/chain.rs +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use crate::error::Result; -use crate::read::{Batch, BatchReader}; -use crate::schema::ProjectedSchemaRef; - -/// A reader that simply chain the outputs of input readers. -pub struct ChainReader { - /// Schema to read - pub schema: ProjectedSchemaRef, - /// Each reader reads a slice of time window - pub readers: Vec, -} - -impl ChainReader { - /// Returns a new [ChainReader] with specific input `readers`. - pub fn new(schema: ProjectedSchemaRef, mut readers: Vec) -> Self { - // Reverse readers since we iter them backward. - readers.reverse(); - Self { schema, readers } - } -} - -#[async_trait::async_trait] -impl BatchReader for ChainReader -where - R: BatchReader, -{ - async fn next_batch(&mut self) -> Result> { - while let Some(reader) = self.readers.last_mut() { - if let Some(batch) = reader.next_batch().await? { - return Ok(Some(batch)); - } else { - // Remove the exhausted reader. - self.readers.pop(); - } - } - Ok(None) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::test_util::read_util::{self, Batches, VecBatchReader}; - - fn build_chain_reader(sources: &[Batches]) -> ChainReader { - let schema = read_util::new_projected_schema(); - let readers = sources - .iter() - .map(|source| read_util::build_vec_reader(source)) - .collect(); - - ChainReader::new(schema, readers) - } - - async fn check_chain_reader_result( - mut reader: ChainReader, - input: &[Batches<'_>], - ) { - let expect: Vec<_> = input - .iter() - .flat_map(|v| v.iter()) - .flat_map(|v| v.iter().copied()) - .collect(); - - let result = read_util::collect_kv_batch(&mut reader).await; - assert_eq!(expect, result); - - // Call next_batch() again is allowed. - assert!(reader.next_batch().await.unwrap().is_none()); - } - - #[tokio::test] - async fn test_chain_empty() { - let mut reader = build_chain_reader(&[]); - - assert!(reader.next_batch().await.unwrap().is_none()); - // Call next_batch() again is allowed. - assert!(reader.next_batch().await.unwrap().is_none()); - } - - #[tokio::test] - async fn test_chain_one() { - let input: &[Batches] = &[&[ - &[(1, Some(1)), (2, Some(2))], - &[(3, Some(3)), (4, Some(4))], - &[(5, Some(5))], - ]]; - - let reader = build_chain_reader(input); - - check_chain_reader_result(reader, input).await; - } - - #[tokio::test] - async fn test_chain_multi() { - let input: &[Batches] = &[ - &[ - &[(1, Some(1)), (2, Some(2))], - &[(3, Some(3)), (4, Some(4))], - &[(5, Some(5))], - ], - &[&[(6, Some(3)), (7, Some(4)), (8, Some(8))], &[(9, Some(9))]], - &[&[(10, Some(10)), (11, Some(11))], &[(12, Some(12))]], - ]; - - let reader = build_chain_reader(input); - - check_chain_reader_result(reader, input).await; - } -} diff --git a/src/storage/src/read/dedup.rs b/src/storage/src/read/dedup.rs deleted file mode 100644 index d08415aac2c3..000000000000 --- a/src/storage/src/read/dedup.rs +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use async_trait::async_trait; -use common_base::BitVec; -use datatypes::prelude::ScalarVector; -use datatypes::vectors::BooleanVector; - -use crate::error::Result; -use crate::read::{Batch, BatchOp, BatchReader}; -use crate::schema::ProjectedSchemaRef; - -/// A reader that dedup rows from inner reader. -pub struct DedupReader { - /// Projected schema to read. - schema: ProjectedSchemaRef, - /// The inner reader. - reader: R, - /// Previous batch from the reader. - prev_batch: Option, - /// Reused bitmap buffer. - selected: BitVec, -} - -impl DedupReader { - pub fn new(schema: ProjectedSchemaRef, reader: R) -> DedupReader { - DedupReader { - schema, - reader, - prev_batch: None, - selected: BitVec::default(), - } - } - - /// Take `batch` and then returns a new batch with no duplicated rows. - /// - /// This method may returns empty `Batch`. - fn dedup_batch(&mut self, batch: Batch) -> Result { - if batch.is_empty() { - // No need to update `prev_batch` if current batch is empty. - return Ok(batch); - } - - // Reinitialize the bit map to zeros. - self.selected.clear(); - self.selected.resize(batch.num_rows(), false); - self.schema - .find_unique(&batch, &mut self.selected, self.prev_batch.as_ref()); - - // Store current batch to `prev_batch` so we could compare the next batch - // with this batch. We store batch before filtering it mainly for correctness, as - // once we supports `DELETE`, rows with `OpType::Delete` would be removed from the - // batch after filter, then we may store an incorrect `last row` of previous batch. - self.prev_batch - .get_or_insert_with(Batch::default) - .clone_from(&batch); // Use `clone_from` to reuse allocated memory if possible. - - // Find all rows whose op_types are `OpType::Delete`, mark their `selected` to false. - self.schema.unselect_deleted(&batch, &mut self.selected); - - let filter = BooleanVector::from_iterator(self.selected.iter().by_vals()); - // Filter duplicate rows. - self.schema.filter(&batch, &filter) - } -} - -#[async_trait] -impl BatchReader for DedupReader { - async fn next_batch(&mut self) -> Result> { - while let Some(batch) = self.reader.next_batch().await? { - let filtered = self.dedup_batch(batch)?; - // Skip empty batch. - if !filtered.is_empty() { - return Ok(Some(filtered)); - } - } - - Ok(None) - } -} - -#[cfg(test)] -mod tests { - use api::v1::OpType; - - use super::*; - use crate::test_util::read_util; - - #[tokio::test] - async fn test_dedup_reader_empty() { - let schema = read_util::new_projected_schema(); - let reader = read_util::build_vec_reader(&[]); - let mut reader = DedupReader::new(schema, reader); - - assert!(reader.next_batch().await.unwrap().is_none()); - // Call next_batch() again is allowed. - assert!(reader.next_batch().await.unwrap().is_none()); - } - - #[tokio::test] - async fn test_dedup_by_sequence() { - let schema = read_util::new_projected_schema(); - let reader = read_util::build_full_vec_reader(&[ - // key, value, sequence, op_type - &[ - (100, 1, 1000, OpType::Put), - (100, 2, 999, OpType::Put), - (100, 3, 998, OpType::Put), - (101, 1, 1000, OpType::Put), - ], - &[ - (101, 2, 999, OpType::Put), - (102, 12, 1000, OpType::Put), - (103, 13, 1000, OpType::Put), - ], - &[(103, 2, 999, OpType::Put)], - ]); - let mut reader = DedupReader::new(schema, reader); - - let result = read_util::collect_kv_batch(&mut reader).await; - let expect = [ - (100, Some(1)), - (101, Some(1)), - (102, Some(12)), - (103, Some(13)), - ]; - assert_eq!(&expect, &result[..]); - } - - #[tokio::test] - async fn test_dedup_contains_empty_input() { - let schema = read_util::new_projected_schema(); - let reader = read_util::build_full_vec_reader(&[ - // key, value, sequence, op_type - &[ - (100, 1, 1000, OpType::Put), - (100, 2, 999, OpType::Put), - (101, 1, 1000, OpType::Put), - ], - &[], - &[(101, 2, 999, OpType::Put), (102, 12, 1000, OpType::Put)], - ]); - let mut reader = DedupReader::new(schema, reader); - - let result = read_util::collect_kv_batch(&mut reader).await; - let expect = [(100, Some(1)), (101, Some(1)), (102, Some(12))]; - assert_eq!(&expect, &result[..]); - } - - #[tokio::test] - async fn test_dedup_contains_empty_output() { - let schema = read_util::new_projected_schema(); - let reader = read_util::build_full_vec_reader(&[ - // key, value, sequence, op_type - &[ - (100, 1, 1000, OpType::Put), - (100, 2, 999, OpType::Put), - (101, 1, 1000, OpType::Put), - ], - &[(101, 2, 999, OpType::Put)], - &[(101, 3, 998, OpType::Put), (101, 4, 997, OpType::Put)], - &[(102, 12, 998, OpType::Put)], - ]); - let mut reader = DedupReader::new(schema, reader); - - let result = read_util::collect_kv_batch(&mut reader).await; - let expect = [(100, Some(1)), (101, Some(1)), (102, Some(12))]; - assert_eq!(&expect, &result[..]); - } -} diff --git a/src/storage/src/read/merge.rs b/src/storage/src/read/merge.rs deleted file mode 100644 index d27d05b47b06..000000000000 --- a/src/storage/src/read/merge.rs +++ /dev/null @@ -1,828 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Merge reader. -//! -//! The implementation of [`MergeReader`] is inspired by -//! [`kudu's MergeIterator`](https://github.com/apache/kudu/blob/9021f275824faa2bdfe699786957c40c219697c1/src/kudu/common/generic_iterators.cc#L107) -//! and [`CeresDB's MergeIterator`](https://github.com/CeresDB/ceresdb/blob/02a7e3100f47cf16aa6c245ed529a6978be20fbd/analytic_engine/src/row_iter/merge.rs) -//! -//! The main idea of the merge algorithm is to maintain a `merge window`. The window describes, -//! at any given time, the key range where we expect to find the row with the smallest key. -//! A [`Node`] (known as the sub-iterator in kudu) whose NEXT overlaps with the `merge window` -//! is said to be actively participating in the merge. -//! -//! The `merge window` is defined as follows: -//! 1. The window's start is the smallest lower bound of all nodes. We -//! refer to the node that owns this lower bound as LOW. -//! 2. The window’s end is the smallest upper bound of all nodes whose -//! lower bounds are less than or equal to LOW's upper bound. -//! 2a. The window's end could be LOW's upper bound itself, if it is the smallest -//! upper bound, but this isn't necessarily the case. -//! 3. The merge window's dimensions change as the merge proceeds, though it -//! only ever moves "to the right" (i.e. the window start/end only increase). -//! -//! We can divide the nodes into two sets, one for whose next rows overlap with the `merge window`, -//! another for whose next rows do not. The merge steady state resembles that of a traditional -//! heap-based merge: the top-most node is popped from HOT, the lower bound is copied to the output -//! and advanced, and the node is pushed back to HOT. -//! -//! In the steady state, we need to move nodes from COLD to HOT whenever the end of the merge window -//! moves; that's a sign that the window may now overlap with a NEXT belonging to a nodes in the -//! second set (COLD). The end of the merge window moves when a node is fully exhausted (i.e. all rows have -//! been copied to the output), or when a node finishes its NEXT and needs to peek again. -//! -//! At any given time, the NEXT belonging to the top-most node in COLD is nearest the merge window. -//! When the merge window's end has moved and we need to refill HOT, the top-most node in COLD is -//! the best candidate. To figure out whether it should be moved, we compare its NEXT's lower bound -//! against the upper bound in HOT's first node: if the lower bound is less than or equal to the key, -//! we move the node from COLD to HOT. On the flip side, when a node from HOT finishes its NEXT and peeks -//! again, we also need to check whether it has exited the merge window. The approach is similar: if -//! its NEXT's lower bound is greater than the upper bound of HOT'S first node, it's time to move it to COLD. -//! -//! A full description of the merge algorithm could be found in [`kudu's comment`](https://github.com/apache/kudu/blob/9021f275824faa2bdfe699786957c40c219697c1/src/kudu/common/generic_iterators.cc#L349) -//! and the [google doc](https://docs.google.com/document/d/1uP0ubjM6ulnKVCRrXtwT_dqrTWjF9tlFSRk0JN2e_O0/edit#). - -use std::cmp::Ordering; -use std::collections::BinaryHeap; -use std::fmt; - -use async_trait::async_trait; -use store_api::storage::consts; - -use crate::error::Result; -use crate::memtable::BoxedBatchIterator; -use crate::read::{Batch, BatchBuilder, BatchOp, BatchReader, BoxedBatchReader}; -use crate::schema::{ProjectedSchema, ProjectedSchemaRef}; - -/// Batch data source. -enum Source { - // To avoid the overhead of async-trait (typically a heap allocation), wraps the - // BatchIterator into an enum instead of converting the iterator into a BatchReader. - Iter(BoxedBatchIterator), - Reader(BoxedBatchReader), -} - -impl Source { - async fn next_batch(&mut self) -> Result> { - match self { - Source::Iter(iter) => iter.next().transpose(), - Source::Reader(reader) => reader.next_batch().await, - } - } - - /// Fetch next non empty batch. - async fn next_non_empty_batch(&mut self) -> Result> { - while let Some(batch) = self.next_batch().await? { - if !batch.is_empty() { - return Ok(Some(batch)); - } - } - Ok(None) - } -} - -impl fmt::Debug for Source { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - Source::Iter(_) => write!(f, "Iter(..)"), - Source::Reader(_) => write!(f, "Reader(..)"), - } - } -} - -/// Reference to a row in [BatchCursor]. -#[derive(Debug)] -struct RowCursor<'a> { - batch: &'a Batch, - pos: usize, -} - -impl<'a> RowCursor<'a> { - #[inline] - fn compare(&self, schema: &ProjectedSchema, other: &RowCursor) -> Ordering { - schema.compare_row(self.batch, self.pos, other.batch, other.pos) - } -} - -/// A `BatchCursor` wraps the `Batch` and allows reading the `Batch` by row. -#[derive(Debug)] -struct BatchCursor { - /// Current buffered `Batch`. - /// - /// `Batch` must contains at least one row. - batch: Batch, - /// Index of current row. - /// - /// `pos == batch.num_rows()` indicates no more rows to read. - pos: usize, -} - -impl BatchCursor { - /// Create a new `BatchCursor`. - /// - /// # Panics - /// Panics if `batch` is empty. - fn new(batch: Batch) -> BatchCursor { - assert!(!batch.is_empty()); - - BatchCursor { batch, pos: 0 } - } - - /// Returns true if there are remaining rows to read. - #[inline] - fn is_valid(&self) -> bool { - !self.is_empty() - } - - /// Returns first row of current batch. - /// - /// # Panics - /// Panics if `self` is invalid. - fn first_row(&self) -> RowCursor { - assert!(self.is_valid()); - - RowCursor { - batch: &self.batch, - pos: self.pos, - } - } - - /// Returns last row of current batch. - /// - /// # Panics - /// Panics if `self` is invalid. - fn last_row(&self) -> RowCursor { - assert!(self.is_valid()); - - RowCursor { - batch: &self.batch, - pos: self.batch.num_rows() - 1, - } - } - - #[inline] - fn is_empty(&self) -> bool { - self.pos >= self.batch.num_rows() - } - - /// Take slice of batch with at most `length` rows from the cursor, then - /// advance the cursor. - /// - /// # Panics - /// Panics if `self` is invalid. - fn take_batch_slice(&mut self, length: usize) -> Batch { - let length = length.min(self.batch.num_rows() - self.pos); - let batch = self.batch.slice(self.pos, length); - self.pos += batch.num_rows(); - - batch - } - - /// Push at most `length` rows from `self` to the `builder` and advance the cursor. - /// - /// # Panics - /// Panics if `self` is invalid. - fn push_rows_to(&mut self, builder: &mut BatchBuilder, length: usize) -> Result<()> { - let length = length.min(self.batch.num_rows() - self.pos); - builder.extend_slice_of(&self.batch, self.pos, length)?; - self.pos += length; - - Ok(()) - } - - /// Push next row from `self` to the `builder` and advance the cursor. - /// - /// # Panics - /// Panics if `self` is invalid. - fn push_next_row_to(&mut self, builder: &mut BatchBuilder) -> Result<()> { - builder.push_row_of(&self.batch, self.pos)?; - self.pos += 1; - - Ok(()) - } -} - -/// A `Node` represent an individual input data source to be merged. -struct Node { - /// Schema of data source. - schema: ProjectedSchemaRef, - /// Data source of this `Node`. - source: Source, - /// Current batch to be read. - /// - /// `None` means the `source` has reached EOF. - cursor: Option, -} - -impl fmt::Debug for Node { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("Node") - .field("source", &self.source) - .field("cursor", &self.cursor) - .finish_non_exhaustive() - } -} - -impl Node { - async fn new(schema: ProjectedSchemaRef, mut source: Source) -> Result { - let cursor = source.next_non_empty_batch().await?.map(BatchCursor::new); - Ok(Node { - schema, - source, - cursor, - }) - } - - /// Returns the reference to the cursor. - /// - /// # Panics - /// Panics if `self` is EOF. - fn cursor_ref(&self) -> &BatchCursor { - self.cursor.as_ref().unwrap() - } - - /// Returns first row in cursor. - /// - /// # Panics - /// Panics if `self` is EOF. - fn first_row(&self) -> RowCursor { - self.cursor_ref().first_row() - } - - /// Returns last row in cursor. - /// - /// # Panics - /// Panics if `self` is EOF. - fn last_row(&self) -> RowCursor { - self.cursor_ref().last_row() - } - - /// Compare first row of two nodes. - /// - /// # Panics - /// Panics if - /// - either `self` or `other` is EOF. - fn compare_first_row(&self, other: &Node) -> Ordering { - self.first_row().compare(&self.schema, &other.first_row()) - } - - /// Returns true if no more batch could be fetched from this node. - fn is_eof(&self) -> bool { - self.cursor.is_none() - } - - /// Returns true if the key range of current batch in `self` is behind (exclusive) current - /// batch in `other`. - /// - /// # Panics - /// Panics if - /// - either `self` or `other` is EOF. - fn is_behind(&self, other: &Node) -> bool { - let first = self.first_row(); - let last = other.last_row(); - // `self` is after `other` if min (first) row of `self` is greater than - // max (last) row of `other`. - first.compare(&self.schema, &last) == Ordering::Greater - } - - /// Fetch next batch and reset its cursor if `self` isn't EOF and the cursor - /// is empty. - /// - /// Returns true if a new batch has been fetched. - async fn maybe_fetch_next_batch(&mut self) -> Result { - let need_fetch = !self.is_eof() && self.cursor_ref().is_empty(); - if !need_fetch { - // Still has remaining rows, no need to fetch. - return Ok(false); - } - - // This ensure the cursor is either non empty or None (EOF). - match self.source.next_non_empty_batch().await? { - Some(batch) => { - self.cursor = Some(BatchCursor::new(batch)); - Ok(true) - } - None => { - // EOF - self.cursor = None; - Ok(false) - } - } - } - - /// Returns the mutable reference to the cursor. - /// - /// # Panics - /// Panics if `self` is EOF. - fn cursor_mut(&mut self) -> &mut BatchCursor { - self.cursor.as_mut().unwrap() - } - - /// Take batch from this node. - /// - /// # Panics - /// Panics if `self` is EOF. - fn take_batch_slice(&mut self, length: usize) -> Batch { - self.cursor_mut().take_batch_slice(length) - } - - /// Push at most `length` rows from `self` to the `builder`. - /// - /// # Panics - /// Panics if `self` is EOF. - fn push_rows_to(&mut self, builder: &mut BatchBuilder, length: usize) -> Result<()> { - self.cursor_mut().push_rows_to(builder, length) - } - - /// Push next row from `self` to the `builder`. - /// - /// # Panics - /// Panics if `self` is EOF. - fn push_next_row_to(&mut self, builder: &mut BatchBuilder) -> Result<()> { - self.cursor_mut().push_next_row_to(builder) - } -} - -impl PartialEq for Node { - fn eq(&self, other: &Node) -> bool { - self.compare_first_row(other) == Ordering::Equal - } -} - -impl Eq for Node {} - -impl PartialOrd for Node { - fn partial_cmp(&self, other: &Node) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for Node { - fn cmp(&self, other: &Node) -> Ordering { - // The std binary heap is a max heap, but we want the nodes are ordered in - // ascend order, so we compare the nodes in reverse order. - other.compare_first_row(self) - } -} - -/// A reader that would sort and merge `Batch` from multiple sources by key. -/// -/// `Batch` from each `Source` **must** be sorted. -pub struct MergeReader { - /// Whether the reader has been initialized. - initialized: bool, - /// Schema of data source. - schema: ProjectedSchemaRef, - /// Input data sources. - /// - /// All data source must have same schema. Initialize the reader would - /// convert all `Source`s into `Node`s and then clear this vector. - sources: Vec, - /// Holds `Node` whose key range of current batch **is** overlapped with the merge window. - /// - /// `Node` in this heap **must** not be empty. A `merge window` is the key range of the - /// root node in the `hot` heap. - hot: BinaryHeap, - /// Holds `Node` whose key range of current batch **isn't** overlapped with the merge window. - /// - /// `Node` in this heap **must** not be empty. - cold: BinaryHeap, - /// Suggested row number of each batch. - /// - /// The size of the batch yield from this reader may not always equal to this suggested size. - batch_size: usize, - /// Buffered batch. - batch_builder: BatchBuilder, -} - -#[async_trait] -impl BatchReader for MergeReader { - async fn next_batch(&mut self) -> Result> { - self.fetch_next_batch().await - } -} - -pub struct MergeReaderBuilder { - schema: ProjectedSchemaRef, - sources: Vec, - batch_size: usize, -} - -impl MergeReaderBuilder { - pub fn new(schema: ProjectedSchemaRef) -> Self { - MergeReaderBuilder::with_capacity(schema, 0) - } - - pub fn with_capacity(schema: ProjectedSchemaRef, capacity: usize) -> Self { - MergeReaderBuilder { - schema, - sources: Vec::with_capacity(capacity), - batch_size: consts::READ_BATCH_SIZE, - } - } - - pub fn push_batch_iter(mut self, iter: BoxedBatchIterator) -> Self { - self.sources.push(Source::Iter(iter)); - self - } - - pub fn push_batch_reader(mut self, reader: BoxedBatchReader) -> Self { - self.sources.push(Source::Reader(reader)); - self - } - - pub fn batch_size(mut self, size: usize) -> Self { - self.batch_size = size; - self - } - - pub fn build(self) -> MergeReader { - let num_sources = self.sources.len(); - let column_schemas = self.schema.schema_to_read().schema().column_schemas(); - let batch_builder = BatchBuilder::with_capacity( - column_schemas.iter().map(|c| &c.data_type), - self.batch_size, - ); - - MergeReader { - initialized: false, - schema: self.schema, - sources: self.sources, - hot: BinaryHeap::with_capacity(num_sources), - cold: BinaryHeap::with_capacity(num_sources), - batch_size: self.batch_size, - batch_builder, - } - } -} - -impl MergeReader { - /// Initialize the reader if it has not yet been initialized. - async fn try_init(&mut self) -> Result<()> { - if self.initialized { - return Ok(()); - } - - if self.sources.is_empty() { - self.initialized = true; - return Ok(()); - } - - for source in self.sources.drain(..) { - let node = Node::new(self.schema.clone(), source).await?; - - if !node.is_eof() { - self.cold.push(node); - } - } - - self.refill_hot(); - - self.initialized = true; - - Ok(()) - } - - async fn fetch_next_batch(&mut self) -> Result> { - self.try_init().await?; - - while !self.hot.is_empty() && self.batch_builder.num_rows() < self.batch_size { - if self.hot.len() == 1 { - // No need to do merge sort if only one batch in the hot heap. - let fetch_row_num = self.batch_size - self.batch_builder.num_rows(); - if let Some(batch) = self.fetch_batch_from_hottest(fetch_row_num).await? { - // The builder is empty and we have fetched a new batch from this node. - return Ok(Some(batch)); - } - // Otherwise, some rows may have been pushed into the builder. - } else { - // We could only fetch one row from the hottest node. - self.fetch_one_row_from_hottest().await?; - } - } - - // Check buffered rows in the builder. - if self.batch_builder.is_empty() { - Ok(None) - } else { - self.batch_builder.build().map(Some) - } - } - - /// Move nodes in `cold` heap, whose key range is overlapped with current merge - /// window to `hot` heap. - fn refill_hot(&mut self) { - while !self.cold.is_empty() { - if let Some(merge_window) = self.hot.peek() { - let warmest = self.cold.peek().unwrap(); - if warmest.is_behind(merge_window) { - // if the warmest node in the `cold` heap is totally after the - // `merge_window`, then no need to add more nodes into the `hot` - // heap for merge sorting. - break; - } - } - - let warmest = self.cold.pop().unwrap(); - self.hot.push(warmest); - } - } - - /// Fetch at most `fetch_row_num` from the hottest node and attempt to return them directly - /// instead of pushing into the builder if the `self.batch_builder` is empty. - async fn fetch_batch_from_hottest(&mut self, fetch_row_num: usize) -> Result> { - assert_eq!(1, self.hot.len()); - - let mut hottest = self.hot.pop().unwrap(); - let batch = if self.batch_builder.is_empty() { - Some(hottest.take_batch_slice(fetch_row_num)) - } else { - hottest.push_rows_to(&mut self.batch_builder, fetch_row_num)?; - - None - }; - - self.reheap(hottest).await?; - - Ok(batch) - } - - /// Fetch one row from the hottest node. - async fn fetch_one_row_from_hottest(&mut self) -> Result<()> { - let mut hottest = self.hot.pop().unwrap(); - hottest.push_next_row_to(&mut self.batch_builder)?; - - self.reheap(hottest).await - } - - /// Fetch next batch from this node and reset its cursor, then push the node back to a - /// proper heap. - async fn reheap(&mut self, mut node: Node) -> Result<()> { - let fetched_new_batch = node.maybe_fetch_next_batch().await?; - - if node.is_eof() { - // The merge window would be updated, need to refill the hot heap. - self.refill_hot(); - } else if fetched_new_batch { - // A new batch has been fetched from the node, thus the key range of this node - // has been changed. Try to find a proper heap for this node. - let node_is_cold = if let Some(hottest) = self.hot.peek() { - // Now key range of this node is behind the hottest node's. - node.is_behind(hottest) - } else { - // Setting this to false should not affect correctness but performance because - // `refille_hot()` ensures the hottest node is correct. - true - }; - - if node_is_cold { - self.cold.push(node); - } else { - self.hot.push(node); - } - // Anyway, the merge window has been changed, we need to refill the hot heap. - self.refill_hot(); - } else { - // No new batch has been fetched, so the end key of merge window has not been - // changed, we could just put the node back to the hot heap. - self.hot.push(node); - } - - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use datatypes::prelude::ScalarVector; - use datatypes::vectors::{Int64Vector, TimestampMillisecondVector}; - - use super::*; - use crate::test_util::read_util::{self, Batches}; - - #[tokio::test] - async fn test_merge_reader_empty() { - let schema = read_util::new_projected_schema(); - - let mut reader = MergeReaderBuilder::new(schema).build(); - - assert!(reader.next_batch().await.unwrap().is_none()); - // Call next_batch() again is allowed. - assert!(reader.next_batch().await.unwrap().is_none()); - } - - #[tokio::test] - async fn test_node() { - let schema = read_util::new_projected_schema(); - let left_source = read_util::build_boxed_iter(&[&[(1, None), (3, None), (5, None)]]); - let mut left = Node::new(schema.clone(), Source::Iter(left_source)) - .await - .unwrap(); - - let right_source = read_util::build_boxed_reader(&[&[(2, None), (3, None), (6, None)]]); - let mut right = Node::new(schema.clone(), Source::Reader(right_source)) - .await - .unwrap(); - - // We use reverse order for a node. - assert!(left > right); - assert_ne!(left, right); - - // Advance the left and right node. - left.cursor_mut().pos += 1; - right.cursor_mut().pos += 1; - assert_eq!(left, right); - - // Check Debug is implemented. - let output = format!("{left:?}"); - assert!(output.contains("cursor")); - assert!(output.contains("pos: 1")); - let output = format!("{right:?}"); - assert!(output.contains("cursor")); - let output = format!("{:?}", left.first_row()); - assert!(output.contains("pos: 1")); - } - - fn build_merge_reader(sources: &[Batches], num_iter: usize, batch_size: usize) -> MergeReader { - let schema = read_util::new_projected_schema(); - let mut builder = - MergeReaderBuilder::with_capacity(schema, sources.len()).batch_size(batch_size); - - for (i, source) in sources.iter().enumerate() { - if i < num_iter { - builder = builder.push_batch_iter(read_util::build_boxed_iter(source)); - } else { - builder = builder.push_batch_reader(read_util::build_boxed_reader(source)); - } - } - - builder.build() - } - - async fn check_merge_reader_result(mut reader: MergeReader, input: &[Batches<'_>]) { - let mut expect: Vec<_> = input - .iter() - .flat_map(|v| v.iter()) - .flat_map(|v| v.iter().copied()) - .collect(); - expect.sort_by_key(|k| k.0); - - let result = read_util::collect_kv_batch(&mut reader).await; - assert_eq!(expect, result); - - // Call next_batch() again is allowed. - assert!(reader.next_batch().await.unwrap().is_none()); - } - - async fn check_merge_reader_by_batch(mut reader: MergeReader, expect_batches: Batches<'_>) { - let mut result = Vec::new(); - while let Some(batch) = reader.next_batch().await.unwrap() { - let key = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - let value = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - - let batch: Vec<_> = key - .iter_data() - .zip(value.iter_data()) - .map(|(k, v)| (k.unwrap().into(), v)) - .collect(); - result.push(batch); - } - - for (expect, actual) in expect_batches.iter().zip(result.iter()) { - assert_eq!(expect, actual); - } - } - - #[tokio::test] - async fn test_merge_multiple_interleave() { - common_telemetry::init_default_ut_logging(); - - let input: &[Batches] = &[ - &[&[(1, Some(1)), (5, Some(5)), (9, Some(9))]], - &[&[(2, Some(2)), (3, Some(3)), (8, Some(8))]], - &[&[(7, Some(7)), (12, Some(12))]], - ]; - let reader = build_merge_reader(input, 1, 3); - check_merge_reader_result(reader, input).await; - - let input: &[Batches] = &[ - &[ - &[(1, Some(1)), (2, Some(2))], - &[(3, Some(3)), (4, Some(4))], - &[(5, Some(5)), (12, Some(12))], - ], - &[&[(6, Some(6)), (7, Some(7)), (18, Some(18))]], - &[&[(13, Some(13)), (15, Some(15))]], - ]; - let reader = build_merge_reader(input, 1, 3); - check_merge_reader_by_batch( - reader, - &[ - // The former two batches could be returned directly. - &[(1, Some(1)), (2, Some(2))], - &[(3, Some(3)), (4, Some(4))], - &[(5, Some(5)), (6, Some(6)), (7, Some(7))], - &[(12, Some(12)), (13, Some(13)), (15, Some(15))], - &[(18, Some(18))], - ], - ) - .await; - - let input: &[Batches] = &[ - &[ - &[(1, Some(1)), (2, Some(2))], - &[(5, Some(5)), (9, Some(9))], - &[(14, Some(14)), (17, Some(17))], - ], - &[&[(6, Some(6)), (7, Some(7))], &[(15, Some(15))]], - ]; - let reader = build_merge_reader(input, 1, 2); - check_merge_reader_by_batch( - reader, - &[ - &[(1, Some(1)), (2, Some(2))], - // Could not return batch (6, 7) directly. - &[(5, Some(5)), (6, Some(6))], - &[(7, Some(7)), (9, Some(9))], - &[(14, Some(14)), (15, Some(15))], - &[(17, Some(17))], - ], - ) - .await; - } - - #[tokio::test] - async fn test_merge_one_source() { - common_telemetry::init_default_ut_logging(); - - let input: &[Batches] = &[&[ - &[(1, Some(1)), (2, Some(2)), (3, Some(3))], - &[(4, Some(4)), (5, Some(5)), (6, Some(6))], - ]]; - let reader = build_merge_reader(input, 1, 2); - - check_merge_reader_result(reader, input).await; - } - - #[tokio::test] - async fn test_merge_with_empty_batch() { - let input: &[Batches] = &[ - &[ - &[(1, Some(1)), (2, Some(2))], - &[(3, Some(3)), (6, Some(6))], - &[], - &[], - &[(8, Some(8)), (12, Some(12))], - &[], - ], - &[ - &[(4, Some(4)), (5, Some(5))], - &[], - &[(15, None), (18, None), (20, None)], - ], - &[&[(13, Some(13)), (19, None)], &[], &[]], - ]; - let reader = build_merge_reader(input, 1, 2); - - check_merge_reader_result(reader, input).await; - } - - #[tokio::test] - async fn test_merge_duplicate_key() { - let input: &[Batches] = &[ - &[ - &[(1, Some(1)), (5, Some(5)), (8, Some(8))], - &[(9, None), (11, None)], - &[(12, Some(12)), (15, None)], - ], - &[&[(1, Some(1)), (3, Some(3)), (8, Some(8))], &[(16, None)]], - &[ - &[(7, Some(7)), (12, Some(12))], - &[(15, None), (16, None), (17, None)], - ], - &[&[(15, None)]], - ]; - let reader = build_merge_reader(input, 2, 2); - check_merge_reader_result(reader, input).await; - } -} diff --git a/src/storage/src/read/windowed.rs b/src/storage/src/read/windowed.rs deleted file mode 100644 index c9828ad62930..000000000000 --- a/src/storage/src/read/windowed.rs +++ /dev/null @@ -1,171 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use arrow::compute::SortOptions; -use arrow::row::{RowConverter, SortField}; -use arrow_array::{Array, ArrayRef}; -use common_recordbatch::OrderOption; -use datatypes::data_type::DataType; -use datatypes::vectors::Helper; -use snafu::ResultExt; - -use crate::error::{self, Result}; -use crate::read::{Batch, BatchReader}; -use crate::schema::{ProjectedSchemaRef, StoreSchema}; - -/// [WindowedReader] provides a windowed record batch reader that scans all rows within a window -/// at a time and sort these rows ordered in `[, ]` order. -pub struct WindowedReader { - /// Schema to read - pub schema: ProjectedSchemaRef, - /// Each reader reads a slice of time window - pub readers: Vec, - /// `order_options` defines how records within windows are sorted. - pub order_options: Vec, -} - -impl WindowedReader { - /// Creates a new [WindowedReader] from given schema and a set of boxed readers. - /// - /// ### Note - /// [WindowedReader] always reads the readers in a reverse order. The last reader in `readers` - /// gets polled first. - pub fn new( - schema: ProjectedSchemaRef, - readers: Vec, - order_options: Vec, - ) -> Self { - Self { - schema, - readers, - order_options, - } - } -} - -#[async_trait::async_trait] -impl BatchReader for WindowedReader -where - R: BatchReader, -{ - async fn next_batch(&mut self) -> Result> { - let _window_scan_elapsed = crate::metrics::WINDOW_SCAN_ELAPSED.start_timer(); - let Some(mut reader) = self.readers.pop() else { - return Ok(None); - }; - - let store_schema = self.schema.schema_to_read(); - let mut batches = vec![]; - while let Some(batch) = reader.next_batch().await? { - batches.push( - batch - .columns - .into_iter() - .map(|v| v.to_arrow_array()) - .collect::>(), - ); - } - - let Some(num_columns) = batches.get(0).map(|b| b.len()) else { - // the reader does not yield data, a batch of empty vectors must be returned instead of - // an empty batch without any column. - let empty_columns = store_schema - .columns() - .iter() - .map(|s| s.desc.data_type.create_mutable_vector(0).to_vector()) - .collect(); - return Ok(Some(Batch::new(empty_columns))); - }; - let mut vectors_in_batch = Vec::with_capacity(num_columns); - - for idx in 0..num_columns { - let columns: Vec<&dyn Array> = - batches.iter().map(|b| b[idx].as_ref()).collect::>(); - vectors_in_batch - .push(arrow::compute::concat(&columns).context(error::ConvertColumnsToRowsSnafu)?); - } - if let Some(v) = vectors_in_batch.get(0) { - crate::metrics::WINDOW_SCAN_ROWS_PER_WINDOW.observe(v.len() as f64); - } - let sorted = sort_by_rows(&self.schema, vectors_in_batch, &self.order_options)?; - let vectors = sorted - .iter() - .zip(store_schema.columns().iter().map(|c| &c.desc.name)) - .map(|(arr, name)| { - Helper::try_into_vector(arr).context(error::ConvertChunkSnafu { name }) - }) - .collect::>()?; - Ok(Some(Batch::new(vectors))) - } -} - -fn sort_by_rows( - schema: &ProjectedSchemaRef, - arrays: Vec, - order_options: &[OrderOption], -) -> Result> { - let store_schema = schema.schema_to_read(); - let sort_columns = build_sorted_columns(store_schema, order_options); - // Convert columns to rows to speed lexicographic sort - // TODO(hl): maybe optimize to lexsort_to_index when only timestamp column is involved. - let row_converter = RowConverter::new( - sort_columns - .iter() - .map(|(idx, descending)| { - SortField::new_with_options( - store_schema.columns()[*idx].desc.data_type.as_arrow_type(), - SortOptions { - descending: *descending, - nulls_first: true, - }, - ) - }) - .collect(), - ) - .context(error::ConvertColumnsToRowsSnafu)?; - - let columns_to_sort = sort_columns - .into_iter() - .map(|(idx, _)| arrays[idx].clone()) - .collect::>(); - - let rows_to_sort = row_converter - .convert_columns(&columns_to_sort) - .context(error::ConvertColumnsToRowsSnafu)?; - - let mut sort_pairs = rows_to_sort.iter().enumerate().collect::>(); - sort_pairs.sort_unstable_by(|(_, a), (_, b)| a.cmp(b)); - - let idx = - arrow::array::UInt32Array::from_iter_values(sort_pairs.iter().map(|(i, _)| *i as u32)); - - let sorted = arrays - .iter() - .map(|arr| arrow::compute::take(arr, &idx, None)) - .collect::>>() - .context(error::SortArraysSnafu)?; - - debug_assert_eq!(sorted.len(), store_schema.num_columns()); - - Ok(sorted) -} - -/// Builds sorted columns from `order_options`. -/// Returns a vector of columns indices to sort and sort orders (true means descending order). -fn build_sorted_columns(schema: &StoreSchema, order_options: &[OrderOption]) -> Vec<(usize, bool)> { - order_options - .iter() - .map(|o| (schema.column_index(&o.name), o.options.descending)) - .collect() -} diff --git a/src/storage/src/region.rs b/src/storage/src/region.rs deleted file mode 100644 index 3106adb48ff1..000000000000 --- a/src/storage/src/region.rs +++ /dev/null @@ -1,808 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#[cfg(test)] -mod tests; -mod writer; - -use std::collections::BTreeMap; -use std::fmt; -use std::sync::atomic::{AtomicI64, Ordering}; -use std::sync::Arc; -use std::time::Duration; - -use async_trait::async_trait; -use common_telemetry::{info, logging}; -use common_time::util; -use snafu::ResultExt; -use store_api::logstore::LogStore; -use store_api::manifest::{ - self, Manifest, ManifestLogStorage, ManifestVersion, MetaActionIterator, -}; -use store_api::storage::{ - AlterRequest, CloseContext, CompactContext, CompactionStrategy, FlushContext, FlushReason, - OpenOptions, ReadContext, Region, RegionId, SequenceNumber, WriteContext, WriteResponse, -}; - -use crate::compaction::{ - compaction_strategy_to_picker, CompactionPickerRef, CompactionSchedulerRef, -}; -use crate::config::EngineConfig; -use crate::error::{self, Error, Result}; -use crate::file_purger::FilePurgerRef; -use crate::flush::{FlushSchedulerRef, FlushStrategyRef}; -use crate::manifest::action::{ - RawRegionMetadata, RegionChange, RegionCheckpoint, RegionMetaAction, RegionMetaActionList, -}; -use crate::manifest::region::RegionManifest; -use crate::memtable::{MemtableBuilderRef, MemtableVersion}; -use crate::metadata::{RegionMetaImpl, RegionMetadata, RegionMetadataRef}; -pub(crate) use crate::region::writer::schedule_compaction; -pub use crate::region::writer::{ - AlterContext, RegionWriter, RegionWriterRef, WriterCompactRequest, WriterContext, -}; -use crate::region::writer::{DropContext, TruncateContext}; -use crate::schema::compat::CompatWrite; -use crate::snapshot::SnapshotImpl; -use crate::sst::{AccessLayerRef, LevelMetas}; -use crate::version::{ - Version, VersionControl, VersionControlRef, VersionEdit, INIT_COMMITTED_SEQUENCE, -}; -use crate::wal::Wal; -use crate::write_batch::WriteBatch; - -/// [Region] implementation. -pub struct RegionImpl { - inner: Arc>, -} - -impl Clone for RegionImpl { - fn clone(&self) -> Self { - Self { - inner: self.inner.clone(), - } - } -} - -impl fmt::Debug for RegionImpl { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("RegionImpl") - .field("id", &self.inner.shared.id) - .field("name", &self.inner.shared.name) - .field("wal", &self.inner.wal) - .field("flush_strategy", &self.inner.flush_strategy) - .field("compaction_scheduler", &self.inner.compaction_scheduler) - .field("sst_layer", &self.inner.sst_layer) - .field("manifest", &self.inner.manifest) - .finish() - } -} - -#[async_trait] -impl Region for RegionImpl { - type Error = Error; - type Meta = RegionMetaImpl; - type WriteRequest = WriteBatch; - type Snapshot = SnapshotImpl; - - fn id(&self) -> RegionId { - self.inner.shared.id - } - - fn name(&self) -> &str { - &self.inner.shared.name - } - - fn in_memory_metadata(&self) -> RegionMetaImpl { - self.inner.in_memory_metadata() - } - - async fn write(&self, ctx: &WriteContext, mut request: WriteBatch) -> Result { - // Compat the schema of the write batch outside of the write lock. - self.inner.compat_write_batch(&mut request)?; - - self.inner.write(ctx, request).await - } - - fn snapshot(&self, _ctx: &ReadContext) -> Result { - Ok(self.inner.create_snapshot()) - } - - fn write_request(&self) -> Self::WriteRequest { - let metadata = self.inner.version_control().metadata(); - let user_schema = metadata.user_schema().clone(); - let row_key_end = metadata.schema().store_schema().row_key_end(); - - WriteBatch::new(user_schema, row_key_end) - } - - async fn alter(&self, request: AlterRequest) -> Result<()> { - self.inner.alter(request).await - } - - async fn drop_region(&self) -> Result<()> { - crate::metrics::REGION_COUNT.dec(); - self.inner.drop_region().await - } - - fn disk_usage_bytes(&self) -> u64 { - let version = self.inner.version_control().current(); - version - .ssts() - .levels() - .iter() - .map(|level_ssts| level_ssts.files().map(|sst| sst.file_size()).sum::()) - .sum() - } - - async fn flush(&self, ctx: &FlushContext) -> Result<()> { - self.inner.flush(ctx).await - } - - async fn compact(&self, ctx: &CompactContext) -> std::result::Result<(), Self::Error> { - self.inner.compact(ctx).await - } - - async fn truncate(&self) -> Result<()> { - self.inner.truncate().await - } -} - -/// Storage related config for region. -/// -/// Contains all necessary storage related components needed by the region, such as logstore, -/// manifest, memtable builder. -pub struct StoreConfig { - pub log_store: Arc, - pub sst_layer: AccessLayerRef, - pub manifest: RegionManifest, - pub memtable_builder: MemtableBuilderRef, - pub flush_scheduler: FlushSchedulerRef, - pub flush_strategy: FlushStrategyRef, - pub compaction_scheduler: CompactionSchedulerRef, - pub engine_config: Arc, - pub file_purger: FilePurgerRef, - pub ttl: Option, - pub write_buffer_size: usize, - pub compaction_strategy: CompactionStrategy, -} - -pub type RecoveredMetadata = (SequenceNumber, (ManifestVersion, RawRegionMetadata)); -pub type RecoveredMetadataMap = BTreeMap; - -impl RegionImpl { - /// Create a new region and also persist the region metadata to manifest. - /// - /// The caller should avoid calling this method simultaneously. - pub async fn create( - metadata: RegionMetadata, - store_config: StoreConfig, - ) -> Result> { - let metadata = Arc::new(metadata); - - // Try to persist region data to manifest, ensure the new region could be recovered from - // the manifest. - let manifest_version = { - let _timer = crate::metrics::CREATE_REGION_UPDATE_MANIFEST.start_timer(); - store_config - .manifest - .update(RegionMetaActionList::with_action(RegionMetaAction::Change( - RegionChange { - metadata: metadata.as_ref().into(), - committed_sequence: INIT_COMMITTED_SEQUENCE, - }, - ))) - .await? - }; - - let mutable_memtable = store_config - .memtable_builder - .build(metadata.schema().clone()); - let version = Version::with_manifest_version( - metadata, - manifest_version, - mutable_memtable, - store_config.sst_layer.clone(), - store_config.file_purger.clone(), - ); - let region = RegionImpl::new(version, store_config); - crate::metrics::REGION_COUNT.inc(); - - Ok(region) - } - - /// Create a new region without persisting manifest. - fn new(version: Version, store_config: StoreConfig) -> RegionImpl { - let metadata = version.metadata(); - let id = metadata.id(); - let name = metadata.name().to_string(); - let version_control = VersionControl::with_version(version); - let wal = Wal::new(id, store_config.log_store); - - let compaction_picker = compaction_strategy_to_picker(&store_config.compaction_strategy); - let inner = Arc::new(RegionInner { - shared: Arc::new(SharedData { - id, - name, - version_control: Arc::new(version_control), - last_flush_millis: AtomicI64::new(0), - }), - writer: Arc::new(RegionWriter::new( - store_config.memtable_builder, - store_config.engine_config.clone(), - store_config.ttl, - store_config.write_buffer_size, - store_config.compaction_scheduler.clone(), - compaction_picker.clone(), - )), - wal, - flush_strategy: store_config.flush_strategy, - flush_scheduler: store_config.flush_scheduler, - compaction_scheduler: store_config.compaction_scheduler, - compaction_picker, - sst_layer: store_config.sst_layer, - manifest: store_config.manifest, - }); - - RegionImpl { inner } - } - - /// Open an existing region and recover its data. - /// - /// The caller should avoid calling this method simultaneously. - pub async fn open( - name: String, - store_config: StoreConfig, - _opts: &OpenOptions, - ) -> Result>> { - // Load version meta data from manifest. - let (version, mut recovered_metadata) = match Self::recover_from_manifest( - &store_config.manifest, - &store_config.memtable_builder, - &store_config.sst_layer, - &store_config.file_purger, - ) - .await? - { - (None, _) => return Ok(None), - (Some(v), m) => (v, m), - }; - - logging::debug!( - "Region recovered version from manifest, version: {:?}", - version - ); - - let metadata = version.metadata().clone(); - let flushed_sequence = version.flushed_sequence(); - let version_control = Arc::new(VersionControl::with_version(version)); - - let recovered_metadata_after_flushed = - recovered_metadata.split_off(&(flushed_sequence + 1)); - // apply the last flushed metadata - if let Some((sequence, (manifest_version, metadata))) = recovered_metadata.pop_last() { - let metadata: RegionMetadataRef = Arc::new( - metadata - .try_into() - .context(error::InvalidRawRegionSnafu { region: &name })?, - ); - let mutable_memtable = store_config - .memtable_builder - .build(metadata.schema().clone()); - version_control.freeze_mutable_and_apply_metadata( - metadata, - manifest_version, - mutable_memtable, - ); - - logging::debug!( - "Applied the last flushed metadata to region: {}, sequence: {}, manifest: {}", - name, - sequence, - manifest_version, - ); - } - - let wal = Wal::new(metadata.id(), store_config.log_store); - wal.obsolete(flushed_sequence).await?; - info!( - "Obsolete WAL entries on startup, region: {}, flushed sequence: {}", - metadata.id(), - flushed_sequence - ); - - let shared = Arc::new(SharedData { - id: metadata.id(), - name, - version_control, - last_flush_millis: AtomicI64::new(0), - }); - - let compaction_picker = compaction_strategy_to_picker(&store_config.compaction_strategy); - let writer = Arc::new(RegionWriter::new( - store_config.memtable_builder, - store_config.engine_config.clone(), - store_config.ttl, - store_config.write_buffer_size, - store_config.compaction_scheduler.clone(), - compaction_picker.clone(), - )); - - let writer_ctx = WriterContext { - shared: &shared, - flush_strategy: &store_config.flush_strategy, - flush_scheduler: &store_config.flush_scheduler, - compaction_scheduler: &store_config.compaction_scheduler, - sst_layer: &store_config.sst_layer, - wal: &wal, - writer: &writer, - manifest: &store_config.manifest, - compaction_picker: compaction_picker.clone(), - }; - // Replay all unflushed data. - writer - .replay(recovered_metadata_after_flushed, writer_ctx) - .await?; - - let inner = Arc::new(RegionInner { - shared, - writer, - wal, - flush_strategy: store_config.flush_strategy, - flush_scheduler: store_config.flush_scheduler, - compaction_scheduler: store_config.compaction_scheduler, - compaction_picker, - sst_layer: store_config.sst_layer, - manifest: store_config.manifest, - }); - - crate::metrics::REGION_COUNT.inc(); - Ok(Some(RegionImpl { inner })) - } - - /// Get ID of this region. - pub fn id(&self) -> RegionId { - self.inner.shared.id() - } - - /// Returns last flush timestamp in millis. - pub(crate) fn last_flush_millis(&self) -> i64 { - self.inner.shared.last_flush_millis() - } - - /// Returns the [VersionControl] of the region. - pub(crate) fn version_control(&self) -> &VersionControl { - self.inner.version_control() - } - - fn create_version_with_checkpoint( - checkpoint: RegionCheckpoint, - memtable_builder: &MemtableBuilderRef, - sst_layer: &AccessLayerRef, - file_purger: &FilePurgerRef, - ) -> Result> { - if checkpoint.checkpoint.is_none() { - return Ok(None); - } - // Safety: it's safe to unwrap here, checking it above. - let s = checkpoint.checkpoint.unwrap(); - - let region = s.metadata.name.clone(); - let region_metadata: RegionMetadata = s - .metadata - .try_into() - .context(error::InvalidRawRegionSnafu { region })?; - - let memtable = memtable_builder.build(region_metadata.schema().clone()); - let mut version = Version::with_manifest_version( - Arc::new(region_metadata), - checkpoint.last_version, - memtable, - sst_layer.clone(), - file_purger.clone(), - ); - - if let Some(v) = s.version { - version.apply_checkpoint( - v.flushed_sequence, - v.manifest_version, - v.files.into_values(), - ); - } - - Ok(Some(version)) - } - - async fn recover_from_manifest( - manifest: &RegionManifest, - memtable_builder: &MemtableBuilderRef, - sst_layer: &AccessLayerRef, - file_purger: &FilePurgerRef, - ) -> Result<(Option, RecoveredMetadataMap)> { - let checkpoint = manifest.last_checkpoint().await?; - - let (start, end, mut version) = if let Some(checkpoint) = checkpoint { - ( - checkpoint.last_version + 1, - manifest::MAX_VERSION, - Self::create_version_with_checkpoint( - checkpoint, - memtable_builder, - sst_layer, - file_purger, - )?, - ) - } else { - (manifest::MIN_VERSION, manifest::MAX_VERSION, None) - }; - - let mut iter = manifest.scan(start, end).await?; - - let mut actions = Vec::new(); - let mut last_manifest_version = manifest::MIN_VERSION; - let mut recovered_metadata = BTreeMap::new(); - - while let Some((manifest_version, action_list)) = iter.next_action().await? { - last_manifest_version = manifest_version; - - for action in action_list.actions { - match (action, version) { - (RegionMetaAction::Change(c), None) => { - let region = c.metadata.name.clone(); - let region_metadata: RegionMetadata = c - .metadata - .try_into() - .context(error::InvalidRawRegionSnafu { region })?; - // Use current schema to build a memtable. This might be replaced later - // in `freeze_mutable_and_apply_metadata()`. - let memtable = memtable_builder.build(region_metadata.schema().clone()); - version = Some(Version::with_manifest_version( - Arc::new(region_metadata), - last_manifest_version, - memtable, - sst_layer.clone(), - file_purger.clone(), - )); - for (manifest_version, action) in actions.drain(..) { - version = Self::replay_edit(manifest_version, action, version); - } - } - (RegionMetaAction::Change(c), Some(v)) => { - let _ = recovered_metadata - .insert(c.committed_sequence, (manifest_version, c.metadata)); - version = Some(v); - } - (RegionMetaAction::Remove(r), Some(v)) => { - manifest.stop().await?; - - let files = v.ssts().mark_all_files_deleted(); - logging::info!( - "Try to remove all SSTs, region: {}, files: {:?}", - r.region_id, - files - ); - - manifest - .manifest_store() - .delete_all(v.manifest_version()) - .await?; - return Ok((None, recovered_metadata)); - } - (RegionMetaAction::Truncate(t), Some(mut v)) => { - let files = v.ssts().mark_all_files_deleted(); - logging::info!( - "Try to remove all SSTs on truncate, region: {}, files: {:?}", - t.region_id, - files - ); - let region_metadata = v.metadata().clone(); - let memtables = Arc::new(MemtableVersion::new( - memtable_builder.build(region_metadata.schema().clone()), - )); - let ssts = - Arc::new(LevelMetas::new(sst_layer.clone(), file_purger.clone())); - v.reset( - v.manifest_version() + 1, - memtables, - ssts, - t.committed_sequence, - ); - version = Some(v); - } - (action, None) => { - actions.push((manifest_version, action)); - version = None; - } - (action, Some(v)) => { - version = Self::replay_edit(manifest_version, action, Some(v)); - } - } - } - } - - assert!(actions.is_empty() || version.is_none()); - - if let Some(version) = &version { - // update manifest state after recovering - let protocol = iter.last_protocol(); - manifest.update_state(last_manifest_version + 1, protocol.clone()); - manifest.set_flushed_manifest_version(version.manifest_version()); - } - - Ok((version, recovered_metadata)) - } - - fn replay_edit( - manifest_version: ManifestVersion, - action: RegionMetaAction, - version: Option, - ) -> Option { - if let RegionMetaAction::Edit(e) = action { - let edit = VersionEdit { - files_to_add: e.files_to_add, - files_to_remove: e.files_to_remove, - flushed_sequence: e.flushed_sequence, - manifest_version, - max_memtable_id: None, - compaction_time_window: e.compaction_time_window, - }; - version.map(|mut v| { - v.apply_edit(edit); - v - }) - } else { - version - } - } - - /// Compact the region manually. - pub async fn compact(&self, ctx: &CompactContext) -> Result<()> { - self.inner.compact(ctx).await - } - - pub async fn close(&self, ctx: &CloseContext) -> Result<()> { - crate::metrics::REGION_COUNT.dec(); - self.inner.close(ctx).await - } -} - -// Private methods for tests. -#[cfg(test)] -impl RegionImpl { - #[inline] - fn committed_sequence(&self) -> store_api::storage::SequenceNumber { - self.inner.version_control().committed_sequence() - } - - fn current_manifest_version(&self) -> ManifestVersion { - self.inner.version_control().current_manifest_version() - } - - /// Write to inner, also the `RegionWriter` directly. - async fn write_inner(&self, ctx: &WriteContext, request: WriteBatch) -> Result { - self.inner.write(ctx, request).await - } - - // Replay metadata to inner. - async fn replay_inner(&self, recovered_metadata: RecoveredMetadataMap) -> Result<()> { - let inner = &self.inner; - let writer_ctx = WriterContext { - shared: &inner.shared, - flush_strategy: &inner.flush_strategy, - flush_scheduler: &inner.flush_scheduler, - compaction_scheduler: &inner.compaction_scheduler, - sst_layer: &inner.sst_layer, - wal: &inner.wal, - writer: &inner.writer, - manifest: &inner.manifest, - compaction_picker: inner.compaction_picker.clone(), - }; - - inner.writer.replay(recovered_metadata, writer_ctx).await - } - - pub(crate) async fn write_buffer_size(&self) -> usize { - self.inner.writer.write_buffer_size().await - } -} - -/// Shared data of region. -#[derive(Debug)] -pub struct SharedData { - // Region id and name is immutable, so we cache them in shared data to avoid loading - // current version from `version_control` each time we need to access them. - id: RegionId, - name: String, - // TODO(yingwen): Maybe no need to use Arc for version control. - pub version_control: VersionControlRef, - - /// Last flush time in millis. - last_flush_millis: AtomicI64, -} - -impl SharedData { - #[inline] - pub fn id(&self) -> RegionId { - self.id - } - - #[inline] - pub fn name(&self) -> &str { - &self.name - } - - /// Update flush time to current time. - pub(crate) fn update_flush_millis(&self) { - let now = util::current_time_millis(); - self.last_flush_millis.store(now, Ordering::Relaxed); - } - - /// Returns last flush timestamp in millis. - fn last_flush_millis(&self) -> i64 { - self.last_flush_millis.load(Ordering::Relaxed) - } -} - -pub type SharedDataRef = Arc; - -struct RegionInner { - shared: SharedDataRef, - writer: RegionWriterRef, - wal: Wal, - flush_strategy: FlushStrategyRef, - flush_scheduler: FlushSchedulerRef, - compaction_scheduler: CompactionSchedulerRef, - compaction_picker: CompactionPickerRef, - sst_layer: AccessLayerRef, - manifest: RegionManifest, -} - -impl RegionInner { - #[inline] - fn version_control(&self) -> &VersionControl { - &self.shared.version_control - } - - fn in_memory_metadata(&self) -> RegionMetaImpl { - let metadata = self.version_control().metadata(); - - RegionMetaImpl::new(metadata) - } - - fn create_snapshot(&self) -> SnapshotImpl { - let version = self.version_control().current(); - let sequence = self.version_control().committed_sequence(); - - SnapshotImpl::new(version, sequence, self.sst_layer.clone()) - } - - fn compat_write_batch(&self, request: &mut WriteBatch) -> Result<()> { - let metadata = self.version_control().metadata(); - let schema = metadata.schema(); - - // Try to make request schema compatible with region's outside of write lock. Note that - // schema might be altered after this step. - request.compat_write(schema.user_schema()) - } - - /// Write to writer directly. - async fn write(&self, ctx: &WriteContext, request: WriteBatch) -> Result { - let writer_ctx = WriterContext { - shared: &self.shared, - flush_strategy: &self.flush_strategy, - flush_scheduler: &self.flush_scheduler, - compaction_scheduler: &self.compaction_scheduler, - sst_layer: &self.sst_layer, - wal: &self.wal, - writer: &self.writer, - manifest: &self.manifest, - compaction_picker: self.compaction_picker.clone(), - }; - // The writer would also try to compat the schema of write batch if it finds out the - // schema version of request is less than current schema version. - self.writer.write(ctx, request, writer_ctx).await - } - - async fn alter(&self, request: AlterRequest) -> Result<()> { - logging::info!( - "Alter region {}, name: {}, request: {:?}", - self.shared.id, - self.shared.name, - request - ); - - let alter_ctx = AlterContext { - shared: &self.shared, - wal: &self.wal, - manifest: &self.manifest, - }; - - self.writer.alter(alter_ctx, request).await - } - - async fn close(&self, ctx: &CloseContext) -> Result<()> { - self.writer.close().await?; - if ctx.flush { - let ctx = FlushContext { - wait: true, - reason: FlushReason::Manually, - force: true, - }; - self.flush(&ctx).await?; - } - self.manifest.stop().await - } - - async fn drop_region(&self) -> Result<()> { - logging::info!("Drop region {}, name: {}", self.shared.id, self.shared.name); - let drop_ctx = DropContext { - shared: &self.shared, - wal: &self.wal, - manifest: &self.manifest, - flush_scheduler: &self.flush_scheduler, - compaction_scheduler: &self.compaction_scheduler, - sst_layer: &self.sst_layer, - }; - - self.manifest.stop().await?; - self.writer.on_drop(drop_ctx).await - } - - async fn flush(&self, ctx: &FlushContext) -> Result<()> { - let writer_ctx = WriterContext { - shared: &self.shared, - flush_strategy: &self.flush_strategy, - flush_scheduler: &self.flush_scheduler, - compaction_scheduler: &self.compaction_scheduler, - sst_layer: &self.sst_layer, - wal: &self.wal, - writer: &self.writer, - manifest: &self.manifest, - compaction_picker: self.compaction_picker.clone(), - }; - self.writer.flush(writer_ctx, ctx).await - } - - /// Compact the region manually. - async fn compact(&self, compact_ctx: &CompactContext) -> Result<()> { - self.writer - .compact(WriterCompactRequest { - shared_data: self.shared.clone(), - sst_layer: self.sst_layer.clone(), - manifest: self.manifest.clone(), - wal: self.wal.clone(), - region_writer: self.writer.clone(), - compact_ctx: *compact_ctx, - }) - .await - } - - async fn truncate(&self) -> Result<()> { - logging::info!( - "Truncate region {}, name: {}", - self.shared.id, - self.shared.name - ); - - let ctx = TruncateContext { - shared: &self.shared, - wal: &self.wal, - manifest: &self.manifest, - sst_layer: &self.sst_layer, - }; - - self.writer.truncate(&ctx).await?; - Ok(()) - } -} diff --git a/src/storage/src/region/tests.rs b/src/storage/src/region/tests.rs deleted file mode 100644 index 1db8484b74c5..000000000000 --- a/src/storage/src/region/tests.rs +++ /dev/null @@ -1,833 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Region tests. - -use std::collections::{HashMap, HashSet}; - -use arrow::compute::SortOptions; -use common_base::readable_size::ReadableSize; -use common_datasource::compression::CompressionType; -use common_recordbatch::OrderOption; -use common_telemetry::logging; -use common_test_util::temp_dir::{create_temp_dir, TempDir}; -use datatypes::prelude::{LogicalTypeId, ScalarVector, WrapperType}; -use datatypes::timestamp::TimestampMillisecond; -use datatypes::vectors::{ - BooleanVector, Int64Vector, StringVector, TimestampMillisecondVector, VectorRef, -}; -use log_store::raft_engine::log_store::RaftEngineLogStore; -use log_store::NoopLogStore; -use object_store::services::Fs; -use object_store::ObjectStore; -use store_api::manifest::{Manifest, MAX_VERSION}; -use store_api::storage::{ - Chunk, ChunkReader, FlushContext, FlushReason, ReadContext, Region, RegionMeta, ScanRequest, - SequenceNumber, Snapshot, WriteContext, WriteRequest, -}; - -use super::*; -use crate::chunk::ChunkReaderImpl; -use crate::compaction::noop::NoopCompactionScheduler; -use crate::engine; -use crate::engine::RegionMap; -use crate::file_purger::noop::NoopFilePurgeHandler; -use crate::flush::{FlushScheduler, PickerConfig, SizeBasedStrategy}; -use crate::manifest::action::{RegionChange, RegionMetaActionList}; -use crate::manifest::manifest_compress_type; -use crate::manifest::region::RegionManifest; -use crate::manifest::test_utils::*; -use crate::memtable::DefaultMemtableBuilder; -use crate::metadata::RegionMetadata; -use crate::region::{RegionImpl, StoreConfig}; -use crate::scheduler::{LocalScheduler, SchedulerConfig}; -use crate::sst::{FileId, FsAccessLayer}; -use crate::test_util::descriptor_util::RegionDescBuilder; -use crate::test_util::{self, config_util, schema_util, write_batch_util}; - -mod alter; -mod basic; -mod close; -mod compact; -mod drop; -mod flush; -mod projection; -mod truncate; - -/// Create metadata of a region with schema: (timestamp, v0). -pub fn new_metadata(region_name: &str) -> RegionMetadata { - let desc = RegionDescBuilder::new(region_name) - .id(123) - .push_field_column(("v0", LogicalTypeId::String, true)) - .build(); - desc.try_into().unwrap() -} - -/// Test region with schema (timestamp, v0). -pub struct TesterBase { - pub region: RegionImpl, - pub write_ctx: WriteContext, - pub read_ctx: ReadContext, -} - -impl TesterBase { - pub fn with_region(region: RegionImpl) -> TesterBase { - TesterBase { - region, - write_ctx: WriteContext::default(), - read_ctx: ReadContext::default(), - } - } - - pub async fn checkpoint_manifest(&self) { - let manifest = &self.region.inner.manifest; - manifest.set_flushed_manifest_version(manifest.last_version() - 1); - let _ = manifest.do_checkpoint().await.unwrap().unwrap(); - } - - pub async fn close(&self) { - self.region.inner.flush_scheduler.stop().await.unwrap(); - self.region - .inner - .compaction_scheduler - .stop(true) - .await - .unwrap(); - self.region.close(&CloseContext::default()).await.unwrap(); - self.region.inner.wal.close().await.unwrap(); - } - - /// Put without version specified. - /// - /// Format of data: (timestamp, v0), timestamp is key, v0 is value. - pub async fn put(&self, data: &[(i64, Option)]) -> WriteResponse { - self.try_put(data).await.unwrap() - } - - /// Put without version specified, returns [`Result`] - /// - /// Format of data: (timestamp, v0), timestamp is key, v0 is value. - pub async fn try_put(&self, data: &[(i64, Option)]) -> Result { - let data: Vec<(TimestampMillisecond, Option)> = - data.iter().map(|(l, r)| ((*l).into(), r.clone())).collect(); - // Build a batch without version. - let mut batch = new_write_batch_for_test(false); - let put_data = new_put_data(&data); - batch.put(put_data).unwrap(); - - self.region.write(&self.write_ctx, batch).await - } - - /// Put without version specified directly to inner writer. - pub async fn put_inner(&self, data: &[(i64, Option)]) -> WriteResponse { - let data: Vec<(TimestampMillisecond, Option)> = - data.iter().map(|(l, r)| ((*l).into(), r.clone())).collect(); - let mut batch = new_write_batch_for_test(false); - let put_data = new_put_data(&data); - batch.put(put_data).unwrap(); - - self.region - .write_inner(&self.write_ctx, batch) - .await - .unwrap() - } - - pub async fn replay_inner(&self, recovered_metadata: RecoveredMetadataMap) { - self.region.replay_inner(recovered_metadata).await.unwrap() - } - - /// Scan all data. - pub async fn full_scan(&self) -> Vec<(i64, Option)> { - logging::info!("Full scan with ctx {:?}", self.read_ctx); - let snapshot = self.region.snapshot(&self.read_ctx).unwrap(); - - let resp = snapshot - .scan(&self.read_ctx, ScanRequest::default()) - .await - .unwrap(); - let mut reader = resp.reader; - - let metadata = self.region.in_memory_metadata(); - assert_eq!(metadata.schema(), reader.user_schema()); - - let mut dst = Vec::new(); - while let Some(chunk) = reader.next_chunk().await.unwrap() { - let chunk = reader.project_chunk(chunk); - append_chunk_to(&chunk, &mut dst); - } - - dst - } - - pub async fn scan(&self, req: ScanRequest) -> Vec<(i64, Option)> { - logging::info!("Full scan with ctx {:?}", self.read_ctx); - let snapshot = self.region.snapshot(&self.read_ctx).unwrap(); - - let resp = snapshot.scan(&self.read_ctx, req).await.unwrap(); - let mut reader = resp.reader; - - let metadata = self.region.in_memory_metadata(); - assert_eq!(metadata.schema(), reader.user_schema()); - - let mut dst = Vec::new(); - while let Some(chunk) = reader.next_chunk().await.unwrap() { - let chunk = reader.project_chunk(chunk); - append_chunk_to(&chunk, &mut dst); - } - dst - } - - pub fn committed_sequence(&self) -> SequenceNumber { - self.region.committed_sequence() - } - - /// Delete by keys (timestamp). - pub async fn delete(&self, keys: &[i64]) -> WriteResponse { - let keys: Vec = keys.iter().map(|v| (*v).into()).collect(); - // Build a batch without version. - let mut batch = new_write_batch_for_test(false); - let keys = new_delete_data(&keys); - batch.delete(keys).unwrap(); - - self.region.write(&self.write_ctx, batch).await.unwrap() - } - - /// Returns a reader to scan all data. - pub async fn full_scan_reader(&self) -> ChunkReaderImpl { - let snapshot = self.region.snapshot(&self.read_ctx).unwrap(); - - let resp = snapshot - .scan(&self.read_ctx, ScanRequest::default()) - .await - .unwrap(); - resp.reader - } - - /// Collect data from the reader. - pub async fn collect_reader(&self, mut reader: ChunkReaderImpl) -> Vec<(i64, Option)> { - let mut dst = Vec::new(); - while let Some(chunk) = reader.next_chunk().await.unwrap() { - let chunk = reader.project_chunk(chunk); - append_chunk_to(&chunk, &mut dst); - } - - dst - } -} - -pub type FileTesterBase = TesterBase; - -fn new_write_batch_for_test(enable_version_column: bool) -> WriteBatch { - if enable_version_column { - write_batch_util::new_write_batch( - &[ - ( - test_util::TIMESTAMP_NAME, - LogicalTypeId::TimestampMillisecond, - false, - ), - ("v0", LogicalTypeId::String, true), - ], - Some(0), - 2, - ) - } else { - write_batch_util::new_write_batch( - &[ - ( - test_util::TIMESTAMP_NAME, - LogicalTypeId::TimestampMillisecond, - false, - ), - ("v0", LogicalTypeId::String, true), - ], - Some(0), - 1, - ) - } -} - -fn new_put_data(data: &[(TimestampMillisecond, Option)]) -> HashMap { - let timestamps = - TimestampMillisecondVector::from_vec(data.iter().map(|v| v.0.into()).collect()); - let values = StringVector::from(data.iter().map(|kv| kv.1.clone()).collect::>()); - - HashMap::from([ - ( - test_util::TIMESTAMP_NAME.to_string(), - Arc::new(timestamps) as VectorRef, - ), - ("v0".to_string(), Arc::new(values) as VectorRef), - ]) -} - -fn new_delete_data(keys: &[TimestampMillisecond]) -> HashMap { - let timestamps = - TimestampMillisecondVector::from_vec(keys.iter().map(|v| v.0.into()).collect()); - HashMap::from([( - test_util::TIMESTAMP_NAME.to_string(), - Arc::new(timestamps) as VectorRef, - )]) -} - -fn append_chunk_to(chunk: &Chunk, dst: &mut Vec<(i64, Option)>) { - assert_eq!(2, chunk.columns.len()); - - let timestamps = chunk.columns[0] - .as_any() - .downcast_ref::() - .unwrap(); - let values = chunk.columns[1] - .as_any() - .downcast_ref::() - .unwrap(); - for (ts, value) in timestamps.iter_data().zip(values.iter_data()) { - dst.push((ts.unwrap().into_native(), value.map(|s| s.to_string()))); - } -} - -#[tokio::test] -async fn test_new_region() { - let region_name = "region-0"; - let desc = RegionDescBuilder::new(region_name) - .push_key_column(("k1", LogicalTypeId::Int32, false)) - .push_field_column(("v0", LogicalTypeId::Float32, true)) - .build(); - let metadata: RegionMetadata = desc.try_into().unwrap(); - - let dir = create_temp_dir("test_new_region"); - let store_dir = dir.path().to_str().unwrap(); - - let store_config = - config_util::new_store_config(region_name, store_dir, EngineConfig::default()).await; - let placeholder_memtable = store_config - .memtable_builder - .build(metadata.schema().clone()); - - let region = RegionImpl::new( - Version::new(Arc::new(metadata), placeholder_memtable), - store_config, - ); - - let expect_schema = schema_util::new_schema_ref( - &[ - ("k1", LogicalTypeId::Int32, false), - ( - test_util::TIMESTAMP_NAME, - LogicalTypeId::TimestampMillisecond, - false, - ), - ("v0", LogicalTypeId::Float32, true), - ], - Some(1), - ); - - assert_eq!(region_name, region.name()); - assert_eq!(expect_schema, *region.in_memory_metadata().schema()); -} - -#[tokio::test] -async fn test_recover_region_manifets_compress() { - test_recover_region_manifets(true).await; -} - -#[tokio::test] -async fn test_recover_region_manifets_uncompress() { - test_recover_region_manifets(false).await; -} - -async fn test_recover_region_manifets(compress: bool) { - common_telemetry::init_default_ut_logging(); - let tmp_dir = create_temp_dir("test_recover_region_manifets"); - let memtable_builder = Arc::new(DefaultMemtableBuilder::default()) as _; - - let mut builder = Fs::default(); - let _ = builder.root(&tmp_dir.path().to_string_lossy()); - let object_store = ObjectStore::new(builder).unwrap().finish(); - - let manifest = RegionManifest::with_checkpointer( - "/manifest/", - object_store.clone(), - manifest_compress_type(compress), - None, - None, - ); - let region_meta = Arc::new(build_region_meta()); - - let sst_layer = Arc::new(FsAccessLayer::new("sst", object_store)) as _; - let file_purger = Arc::new(LocalScheduler::new( - SchedulerConfig::default(), - NoopFilePurgeHandler, - )); - // Recover from empty - assert!(RegionImpl::::recover_from_manifest( - &manifest, - &memtable_builder, - &sst_layer, - &file_purger, - ) - .await - .unwrap() - .0 - .is_none()); - - let file_id_a = FileId::random(); - let file_id_b = FileId::random(); - let file_id_c = FileId::random(); - - { - // save some actions into region_meta - assert!(manifest - .update(RegionMetaActionList::with_action(RegionMetaAction::Change( - RegionChange { - metadata: region_meta.as_ref().into(), - committed_sequence: 40, - }, - ))) - .await - .is_ok()); - - assert!(manifest - .update(RegionMetaActionList::new(vec![ - RegionMetaAction::Edit(build_region_edit(1, &[file_id_a], &[])), - RegionMetaAction::Edit(build_region_edit(2, &[file_id_b, file_id_c], &[])), - ])) - .await - .is_ok()); - - assert!(manifest - .update(RegionMetaActionList::with_action(RegionMetaAction::Change( - RegionChange { - metadata: region_meta.as_ref().into(), - committed_sequence: 42, - }, - ))) - .await - .is_ok()); - } - - // try to recover - let (version, recovered_metadata) = RegionImpl::::recover_from_manifest( - &manifest, - &memtable_builder, - &sst_layer, - &file_purger, - ) - .await - .unwrap(); - - assert_recovered_manifest( - version, - recovered_metadata, - &file_id_a, - &file_id_b, - &file_id_c, - ®ion_meta, - ); - - // do a manifest checkpoint - let checkpoint = manifest.do_checkpoint().await.unwrap().unwrap(); - assert_eq!(1, checkpoint.last_version); - assert_eq!(2, checkpoint.compacted_actions); - assert_eq!( - manifest.last_checkpoint().await.unwrap().unwrap(), - checkpoint - ); - // recover from checkpoint - let (version, recovered_metadata) = RegionImpl::::recover_from_manifest( - &manifest, - &memtable_builder, - &sst_layer, - &file_purger, - ) - .await - .unwrap(); - - assert_recovered_manifest( - version, - recovered_metadata, - &file_id_a, - &file_id_b, - &file_id_c, - ®ion_meta, - ); - - // check manifest state - assert_eq!(3, manifest.last_version()); - let mut iter = manifest.scan(0, MAX_VERSION).await.unwrap(); - let (version, action) = iter.next_action().await.unwrap().unwrap(); - assert_eq!(2, version); - assert!(matches!(action.actions[0], RegionMetaAction::Change(..))); - assert!(iter.next_action().await.unwrap().is_none()); -} - -fn assert_recovered_manifest( - version: Option, - recovered_metadata: RecoveredMetadataMap, - file_id_a: &FileId, - file_id_b: &FileId, - file_id_c: &FileId, - region_meta: &Arc, -) { - assert_eq!(42, *recovered_metadata.first_key_value().unwrap().0); - let version = version.unwrap(); - assert_eq!(*version.metadata(), *region_meta); - assert_eq!(version.flushed_sequence(), 2); - assert_eq!(version.manifest_version(), 1); - let ssts = version.ssts(); - let files = ssts.levels()[0] - .files() - .map(|f| f.file_name()) - .collect::>(); - assert_eq!(3, files.len()); - assert_eq!( - HashSet::from([ - file_id_a.as_parquet(), - file_id_b.as_parquet(), - file_id_c.as_parquet() - ]), - files - ); -} - -fn create_region_meta(region_name: &str) -> RegionMetadata { - let desc = RegionDescBuilder::new(region_name) - .push_field_column(("v0", LogicalTypeId::Int64, true)) - .push_field_column(("v1", LogicalTypeId::String, true)) - .push_field_column(("v2", LogicalTypeId::Boolean, true)) - .build(); - desc.try_into().unwrap() -} - -async fn create_store_config(region_name: &str, root: &str) -> StoreConfig { - let mut builder = Fs::default(); - let _ = builder.root(root); - let object_store = ObjectStore::new(builder).unwrap().finish(); - let parent_dir = ""; - let sst_dir = engine::region_sst_dir(parent_dir, region_name); - let manifest_dir = engine::region_manifest_dir(parent_dir, region_name); - - let sst_layer = Arc::new(FsAccessLayer::new(&sst_dir, object_store.clone())); - let manifest = RegionManifest::with_checkpointer( - &manifest_dir, - object_store, - CompressionType::Uncompressed, - None, - None, - ); - manifest.start().await.unwrap(); - - let compaction_scheduler = Arc::new(NoopCompactionScheduler::default()); - - let regions = Arc::new(RegionMap::new()); - - let flush_scheduler = Arc::new( - FlushScheduler::new( - SchedulerConfig::default(), - compaction_scheduler.clone(), - regions, - PickerConfig::default(), - ) - .unwrap(), - ); - - let log_store = Arc::new(NoopLogStore); - - let file_purger = Arc::new(LocalScheduler::new( - SchedulerConfig::default(), - NoopFilePurgeHandler, - )); - StoreConfig { - log_store, - sst_layer, - manifest, - memtable_builder: Arc::new(DefaultMemtableBuilder::default()), - flush_scheduler, - flush_strategy: Arc::new(SizeBasedStrategy::default()), - compaction_scheduler, - engine_config: Default::default(), - file_purger, - ttl: None, - write_buffer_size: ReadableSize::mb(32).0 as usize, - compaction_strategy: Default::default(), - } -} - -struct WindowedReaderTester { - data_written: Vec>, - expected: Vec<(i64, i64, String, bool)>, - region: RegionImpl, - _temp_dir: TempDir, -} - -impl WindowedReaderTester { - async fn new( - region_name: &'static str, - data_written: Vec>, - expected: Vec<(i64, i64, String, bool)>, - ) -> Self { - let temp_dir = create_temp_dir(&format!("write_and_read_windowed_{}", region_name)); - let root = temp_dir.path().to_str().unwrap(); - let metadata = create_region_meta(region_name); - let store_config = create_store_config(region_name, root).await; - let region = RegionImpl::create(metadata, store_config).await.unwrap(); - - let tester = Self { - data_written, - expected, - region, - _temp_dir: temp_dir, - }; - tester.prepare().await; - tester - } - - async fn prepare(&self) { - for batch in &self.data_written { - let mut write_batch = self.region.write_request(); - let ts = TimestampMillisecondVector::from_iterator( - batch - .iter() - .map(|(v, _, _, _)| TimestampMillisecond::new(*v)), - ); - let v0 = Int64Vector::from_iterator(batch.iter().map(|(_, v, _, _)| *v)); - let v1 = StringVector::from_iterator(batch.iter().map(|(_, _, v, _)| v.as_str())); - let v2 = BooleanVector::from_iterator(batch.iter().map(|(_, _, _, v)| *v)); - - let columns = [ - ("timestamp".to_string(), Arc::new(ts) as VectorRef), - ("v0".to_string(), Arc::new(v0) as VectorRef), - ("v1".to_string(), Arc::new(v1) as VectorRef), - ("v2".to_string(), Arc::new(v2) as VectorRef), - ] - .into_iter() - .collect::>(); - write_batch.put(columns).unwrap(); - - assert!(self - .region - .write(&WriteContext {}, write_batch) - .await - .is_ok()); - - // flush the region to ensure data resides across SST files. - self.region - .flush(&FlushContext { - wait: true, - reason: FlushReason::Others, - ..Default::default() - }) - .await - .unwrap(); - } - } - - async fn check(&self, order_options: Vec) { - let read_context = ReadContext::default(); - let snapshot = self.region.snapshot(&read_context).unwrap(); - let response = snapshot - .scan( - &read_context, - ScanRequest { - sequence: None, - projection: None, - filters: vec![], - limit: None, - output_ordering: Some(order_options), - }, - ) - .await - .unwrap(); - - let mut timestamps = Vec::with_capacity(self.expected.len()); - let mut col1 = Vec::with_capacity(self.expected.len()); - let mut col2 = Vec::with_capacity(self.expected.len()); - let mut col3 = Vec::with_capacity(self.expected.len()); - - let mut reader = response.reader; - let ts_index = reader.user_schema().timestamp_index().unwrap(); - while let Some(chunk) = reader.next_chunk().await.unwrap() { - let ts_col = &chunk.columns[ts_index]; - let ts_col = ts_col - .as_any() - .downcast_ref::() - .unwrap(); - let v1_col = chunk.columns[1] - .as_any() - .downcast_ref::() - .unwrap(); - let v2_col = chunk.columns[2] - .as_any() - .downcast_ref::() - .unwrap(); - let v3_col = chunk.columns[3] - .as_any() - .downcast_ref::() - .unwrap(); - - for ts in ts_col.iter_data() { - timestamps.push(ts.unwrap().0.value()); - } - for v in v1_col.iter_data() { - col1.push(v.unwrap()); - } - for v in v2_col.iter_data() { - col2.push(v.unwrap().to_string()); - } - for v in v3_col.iter_data() { - col3.push(v.unwrap()); - } - } - - assert_eq!( - timestamps, - self.expected - .iter() - .map(|(v, _, _, _)| *v) - .collect::>() - ); - assert_eq!( - col1, - self.expected - .iter() - .map(|(_, v, _, _)| *v) - .collect::>() - ); - assert_eq!( - col2, - self.expected - .iter() - .map(|(_, _, v, _)| v.clone()) - .collect::>() - ); - assert_eq!( - col3, - self.expected - .iter() - .map(|(_, _, _, v)| *v) - .collect::>() - ); - } -} - -#[tokio::test] -async fn test_read_by_chunk_reader() { - common_telemetry::init_default_ut_logging(); - - WindowedReaderTester::new( - "test_region", - vec![vec![(1, 1, "1".to_string(), false)]], - vec![(1, 1, "1".to_string(), false)], - ) - .await - .check(vec![OrderOption { - name: "timestamp".to_string(), - options: SortOptions { - descending: true, - nulls_first: true, - }, - }]) - .await; - - WindowedReaderTester::new( - "test_region", - vec![ - vec![ - (1, 1, "1".to_string(), false), - (2, 2, "2".to_string(), false), - ], - vec![ - (3, 3, "3".to_string(), false), - (4, 4, "4".to_string(), false), - ], - ], - vec![ - (4, 4, "4".to_string(), false), - (3, 3, "3".to_string(), false), - (2, 2, "2".to_string(), false), - (1, 1, "1".to_string(), false), - ], - ) - .await - .check(vec![OrderOption { - name: "timestamp".to_string(), - options: SortOptions { - descending: true, - nulls_first: true, - }, - }]) - .await; - - WindowedReaderTester::new( - "test_region", - vec![ - vec![ - (1, 1, "1".to_string(), false), - (2, 2, "2".to_string(), false), - (60000, 60000, "60".to_string(), false), - ], - vec![ - (3, 3, "3".to_string(), false), - (61000, 61000, "61".to_string(), false), - ], - ], - vec![ - (61000, 61000, "61".to_string(), false), - (60000, 60000, "60".to_string(), false), - (3, 3, "3".to_string(), false), - (2, 2, "2".to_string(), false), - (1, 1, "1".to_string(), false), - ], - ) - .await - .check(vec![OrderOption { - name: "timestamp".to_string(), - options: SortOptions { - descending: true, - nulls_first: true, - }, - }]) - .await; - - WindowedReaderTester::new( - "test_region", - vec![ - vec![ - (1, 1, "1".to_string(), false), - (2, 2, "2".to_string(), false), - (60000, 60000, "60".to_string(), false), - ], - vec![ - (3, 3, "3".to_string(), false), - (61000, 61000, "61".to_string(), false), - ], - ], - vec![ - (1, 1, "1".to_string(), false), - (2, 2, "2".to_string(), false), - (3, 3, "3".to_string(), false), - (60000, 60000, "60".to_string(), false), - (61000, 61000, "61".to_string(), false), - ], - ) - .await - .check(vec![OrderOption { - name: "timestamp".to_string(), - options: SortOptions { - descending: false, - nulls_first: true, - }, - }]) - .await; -} diff --git a/src/storage/src/region/tests/alter.rs b/src/storage/src/region/tests/alter.rs deleted file mode 100644 index 432239a8a853..000000000000 --- a/src/storage/src/region/tests/alter.rs +++ /dev/null @@ -1,491 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::{BTreeMap, HashMap}; -use std::sync::Arc; - -use common_test_util::temp_dir::create_temp_dir; -use datatypes::prelude::*; -use datatypes::timestamp::TimestampMillisecond; -use datatypes::vectors::{Int64Vector, StringVector, TimestampMillisecondVector, VectorRef}; -use log_store::raft_engine::log_store::RaftEngineLogStore; -use store_api::storage::{ - AddColumn, AlterOperation, AlterRequest, Chunk, ChunkReader, ColumnDescriptor, - ColumnDescriptorBuilder, ColumnId, FlushContext, FlushReason, Region, RegionMeta, ScanRequest, - SchemaRef, Snapshot, WriteRequest, -}; - -use crate::config::EngineConfig; -use crate::region::tests::{self, FileTesterBase}; -use crate::region::{OpenOptions, RawRegionMetadata, RegionImpl, RegionMetadata}; -use crate::test_util; -use crate::test_util::config_util; -use crate::test_util::descriptor_util::RegionDescBuilder; - -const REGION_NAME: &str = "region-alter-0"; - -async fn create_region_for_alter(store_dir: &str) -> RegionImpl { - // Always disable version column in this test. - let metadata = tests::new_metadata(REGION_NAME); - - let store_config = - config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await; - - RegionImpl::create(metadata, store_config).await.unwrap() -} - -/// Tester for region alter. -struct AlterTester { - store_dir: String, - base: Option, -} - -#[derive(Debug, Clone, PartialEq)] -struct DataRow { - key: Option, - ts: TimestampMillisecond, - v0: Option, - v1: Option, -} - -impl DataRow { - fn new_with_string(key: Option, ts: i64, v0: Option, v1: Option) -> Self { - DataRow { - key, - ts: ts.into(), - v0, - v1, - } - } - - fn new(key: Option, ts: i64, v0: Option, v1: Option) -> Self { - Self::new_with_string(key, ts, v0.map(|s| s.to_string()), v1) - } -} - -fn new_put_data(data: &[DataRow]) -> HashMap { - let keys = Int64Vector::from(data.iter().map(|v| v.key).collect::>()); - let timestamps = TimestampMillisecondVector::from( - data.iter() - .map(|v| Some(v.ts.into_native())) - .collect::>(), - ); - let values1 = StringVector::from(data.iter().map(|v| v.v0.clone()).collect::>()); - let values2 = Int64Vector::from(data.iter().map(|kv| kv.v1).collect::>()); - - HashMap::from([ - ("k0".to_string(), Arc::new(keys) as VectorRef), - ( - test_util::TIMESTAMP_NAME.to_string(), - Arc::new(timestamps) as VectorRef, - ), - ("v0".to_string(), Arc::new(values1) as VectorRef), - ("v1".to_string(), Arc::new(values2) as VectorRef), - ]) -} - -impl AlterTester { - async fn new(store_dir: &str) -> AlterTester { - let region = create_region_for_alter(store_dir).await; - - AlterTester { - base: Some(FileTesterBase::with_region(region)), - store_dir: store_dir.to_string(), - } - } - - async fn reopen(&mut self) { - // Close the old region. - if let Some(base) = self.base.as_ref() { - base.close().await; - } - self.base = None; - // Reopen the region. - let store_config = - config_util::new_store_config(REGION_NAME, &self.store_dir, EngineConfig::default()) - .await; - let opts = OpenOptions::default(); - let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts) - .await - .unwrap() - .unwrap(); - self.base = Some(FileTesterBase::with_region(region)); - } - - async fn flush(&self, wait: Option) { - let ctx = wait - .map(|wait| FlushContext { - wait, - reason: FlushReason::Manually, - ..Default::default() - }) - .unwrap_or_default(); - self.base().region.flush(&ctx).await.unwrap(); - } - - async fn checkpoint_manifest(&self) { - self.base().checkpoint_manifest().await - } - - #[inline] - fn base(&self) -> &FileTesterBase { - self.base.as_ref().unwrap() - } - - fn schema(&self) -> SchemaRef { - let metadata = self.base().region.in_memory_metadata(); - metadata.schema().clone() - } - - // Put with schema k0, ts, v0, v1 - async fn put(&self, data: &[DataRow]) { - let mut batch = self.base().region.write_request(); - let put_data = new_put_data(data); - batch.put(put_data).unwrap(); - - assert!(self - .base() - .region - .write(&self.base().write_ctx, batch) - .await - .is_ok()); - } - - /// Put data with initial schema. - async fn put_with_init_schema(&self, data: &[(i64, Option)]) { - // put of FileTesterBase always use initial schema version. - let data = data - .iter() - .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string()))) - .collect::>(); - let _ = self.base().put(&data).await; - } - - /// Put data to inner writer with initial schema. - async fn put_inner_with_init_schema(&self, data: &[(i64, Option)]) { - let data = data - .iter() - .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string()))) - .collect::>(); - // put of FileTesterBase always use initial schema version. - let _ = self.base().put_inner(&data).await; - } - - async fn alter(&self, mut req: AlterRequest) { - let version = self.version(); - req.version = version; - - self.base().region.alter(req).await.unwrap(); - } - - fn version(&self) -> u32 { - let metadata = self.base().region.in_memory_metadata(); - metadata.version() - } - - async fn full_scan_with_init_schema(&self) -> Vec<(i64, Option)> { - self.base().full_scan().await - } - - async fn full_scan(&self) -> Vec { - let read_ctx = &self.base().read_ctx; - let snapshot = self.base().region.snapshot(read_ctx).unwrap(); - - let resp = snapshot - .scan(read_ctx, ScanRequest::default()) - .await - .unwrap(); - let mut reader = resp.reader; - - let metadata = self.base().region.in_memory_metadata(); - assert_eq!(metadata.schema(), reader.user_schema()); - - let mut dst = Vec::new(); - while let Some(chunk) = reader.next_chunk().await.unwrap() { - let chunk = reader.project_chunk(chunk); - append_chunk_to(&chunk, &mut dst); - } - - dst - } -} - -fn append_chunk_to(chunk: &Chunk, dst: &mut Vec) { - assert_eq!(4, chunk.columns.len()); - - let k0_vector = chunk.columns[0] - .as_any() - .downcast_ref::() - .unwrap(); - let ts_vector = chunk.columns[1] - .as_any() - .downcast_ref::() - .unwrap(); - let v0_vector = chunk.columns[2] - .as_any() - .downcast_ref::() - .unwrap(); - let v1_vector = chunk.columns[3] - .as_any() - .downcast_ref::() - .unwrap(); - for i in 0..k0_vector.len() { - dst.push(DataRow::new_with_string( - k0_vector.get_data(i), - ts_vector.get_data(i).unwrap().into(), - v0_vector.get_data(i).map(|s| s.to_string()), - v1_vector.get_data(i), - )); - } -} - -fn new_column_desc(id: ColumnId, name: &str) -> ColumnDescriptor { - ColumnDescriptorBuilder::new(id, name, ConcreteDataType::int64_datatype()) - .is_nullable(true) - .build() - .unwrap() -} - -fn add_column_req(desc_and_is_key: &[(ColumnDescriptor, bool)]) -> AlterRequest { - let columns = desc_and_is_key - .iter() - .map(|(desc, is_key)| AddColumn { - desc: desc.clone(), - is_key: *is_key, - }) - .collect(); - let operation = AlterOperation::AddColumns { columns }; - - AlterRequest { - operation, - version: 0, - } -} - -fn drop_column_req(names: &[&str]) -> AlterRequest { - let names = names.iter().map(|s| s.to_string()).collect(); - let operation = AlterOperation::DropColumns { names }; - - AlterRequest { - operation, - version: 0, - } -} - -fn check_schema_names(schema: &SchemaRef, names: &[&str]) { - assert_eq!(names.len(), schema.num_columns()); - - for (idx, name) in names.iter().enumerate() { - assert_eq!(*name, schema.column_name_by_index(idx)); - let _ = schema.column_schema_by_name(name).unwrap(); - } -} - -#[tokio::test] -async fn test_alter_region_with_reopen() { - test_alter_region_with_reopen0(true).await; - test_alter_region_with_reopen0(false).await; -} - -async fn test_alter_region_with_reopen0(flush_and_checkpoint: bool) { - common_telemetry::init_default_ut_logging(); - - let dir = create_temp_dir("alter-region"); - let store_dir = dir.path().to_str().unwrap(); - let mut tester = AlterTester::new(store_dir).await; - - let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))]; - tester.put_with_init_schema(&data).await; - assert_eq!(3, tester.full_scan_with_init_schema().await.len()); - - let req = add_column_req(&[ - (new_column_desc(4, "k0"), true), // key column k0 - (new_column_desc(5, "v1"), false), // value column v1 - ]); - tester.alter(req).await; - - let schema = tester.schema(); - check_schema_names(&schema, &["k0", "timestamp", "v0", "v1"]); - - // Put data after schema altered. - let data = vec![ - DataRow::new(Some(10000), 1003, Some(103), Some(201)), - DataRow::new(Some(10001), 1004, Some(104), Some(202)), - DataRow::new(Some(10002), 1005, Some(105), Some(203)), - ]; - tester.put(&data).await; - - if flush_and_checkpoint { - tester.flush(None).await; - tester.checkpoint_manifest().await; - } - - // Scan with new schema before reopen. - let mut expect = vec![ - DataRow::new(None, 1000, Some(100), None), - DataRow::new(None, 1001, Some(101), None), - DataRow::new(None, 1002, Some(102), None), - ]; - expect.extend_from_slice(&data); - let scanned = tester.full_scan().await; - assert_eq!(expect, scanned); - - // Reopen and put more data. - tester.reopen().await; - let data = vec![ - DataRow::new(Some(10003), 1006, Some(106), Some(204)), - DataRow::new(Some(10004), 1007, Some(107), Some(205)), - DataRow::new(Some(10005), 1008, Some(108), Some(206)), - ]; - tester.put(&data).await; - // Extend expected result. - expect.extend_from_slice(&data); - - // add columns,then remove them without writing data. - let req = add_column_req(&[ - (new_column_desc(6, "v2"), false), // key column k0 - (new_column_desc(7, "v3"), false), // value column v1 - ]); - tester.alter(req).await; - - let req = drop_column_req(&["v2", "v3"]); - tester.alter(req).await; - - if flush_and_checkpoint { - tester.flush(None).await; - tester.checkpoint_manifest().await; - } - - // reopen and write again - tester.reopen().await; - let schema = tester.schema(); - check_schema_names(&schema, &["k0", "timestamp", "v0", "v1"]); - - let data = vec![DataRow::new(Some(10006), 1009, Some(109), Some(207))]; - tester.put(&data).await; - expect.extend_from_slice(&data); - - // Scan with new schema after reopen and write. - let scanned = tester.full_scan().await; - assert_eq!(expect, scanned); -} - -#[tokio::test] -async fn test_alter_region() { - let dir = create_temp_dir("alter-region"); - let store_dir = dir.path().to_str().unwrap(); - let tester = AlterTester::new(store_dir).await; - - let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))]; - - tester.put_with_init_schema(&data).await; - - let schema = tester.schema(); - check_schema_names(&schema, &["timestamp", "v0"]); - - let req = add_column_req(&[ - (new_column_desc(4, "k0"), true), // key column k0 - (new_column_desc(5, "v1"), false), // value column v1 - ]); - tester.alter(req).await; - - let schema = tester.schema(); - check_schema_names(&schema, &["k0", "timestamp", "v0", "v1"]); - - let req = add_column_req(&[ - (new_column_desc(6, "v2"), false), - (new_column_desc(7, "v3"), false), - ]); - tester.alter(req).await; - - let schema = tester.schema(); - check_schema_names(&schema, &["k0", "timestamp", "v0", "v1", "v2", "v3"]); - - // Remove v0, v1 - let req = drop_column_req(&["v0", "v1"]); - tester.alter(req).await; - - let schema = tester.schema(); - check_schema_names(&schema, &["k0", "timestamp", "v2", "v3"]); -} - -#[tokio::test] -async fn test_put_old_schema_after_alter() { - let dir = create_temp_dir("put-old"); - let store_dir = dir.path().to_str().unwrap(); - let tester = AlterTester::new(store_dir).await; - - let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))]; - - tester.put_with_init_schema(&data).await; - - let req = add_column_req(&[ - (new_column_desc(4, "k0"), true), // key column k0 - (new_column_desc(5, "v1"), false), // value column v1 - ]); - tester.alter(req).await; - - // Put with old schema. - let data = vec![(1005, Some(105)), (1006, Some(106))]; - tester.put_with_init_schema(&data).await; - - // Put data with old schema directly to the inner writer, to check that the region - // writer could compat the schema of write batch. - let data = vec![(1003, Some(103)), (1004, Some(104))]; - tester.put_inner_with_init_schema(&data).await; - - let expect = vec![ - DataRow::new(None, 1000, Some(100), None), - DataRow::new(None, 1001, Some(101), None), - DataRow::new(None, 1002, Some(102), None), - DataRow::new(None, 1003, Some(103), None), - DataRow::new(None, 1004, Some(104), None), - DataRow::new(None, 1005, Some(105), None), - DataRow::new(None, 1006, Some(106), None), - ]; - let scanned = tester.full_scan().await; - assert_eq!(expect, scanned); -} - -#[tokio::test] -async fn test_replay_metadata_after_open() { - let dir = create_temp_dir("replay-metadata-after-open"); - let store_dir = dir.path().to_str().unwrap(); - let mut tester = AlterTester::new(store_dir).await; - - let data = vec![(1000, Some(100)), (1001, Some(101)), (1002, Some(102))]; - - tester.put_with_init_schema(&data).await; - - tester.reopen().await; - - let committed_sequence = tester.base().committed_sequence(); - let manifest_version = tester.base().region.current_manifest_version(); - let version = tester.version(); - - let desc = RegionDescBuilder::new(REGION_NAME) - .push_key_column(("k1", LogicalTypeId::Int32, false)) - .push_field_column(("v0", LogicalTypeId::Float32, true)) - .build(); - let metadata: &RegionMetadata = &desc.try_into().unwrap(); - let mut raw_metadata: RawRegionMetadata = metadata.into(); - raw_metadata.version = version + 1; - - let recovered_metadata = - BTreeMap::from([(committed_sequence, (manifest_version + 1, raw_metadata))]); - - tester.base().replay_inner(recovered_metadata).await; - let schema = tester.schema(); - check_schema_names(&schema, &["k1", "timestamp", "v0"]); -} diff --git a/src/storage/src/region/tests/basic.rs b/src/storage/src/region/tests/basic.rs deleted file mode 100644 index 13565fba682c..000000000000 --- a/src/storage/src/region/tests/basic.rs +++ /dev/null @@ -1,288 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Region read/write tests. - -use common_telemetry::info; -use common_test_util::temp_dir::create_temp_dir; -use log_store::raft_engine::log_store::RaftEngineLogStore; -use store_api::storage::{OpenOptions, SequenceNumber}; - -use crate::config::EngineConfig; -use crate::error::Result; -use crate::region::tests::{self, FileTesterBase}; -use crate::region::RegionImpl; -use crate::test_util::config_util; - -const REGION_NAME: &str = "region-basic-0"; - -/// Create a new region for basic tests. -async fn create_region_for_basic( - region_name: &str, - store_dir: &str, -) -> RegionImpl { - let metadata = tests::new_metadata(region_name); - let store_config = - config_util::new_store_config(region_name, store_dir, EngineConfig::default()).await; - RegionImpl::create(metadata, store_config).await.unwrap() -} - -/// Tester for basic tests. -struct Tester { - region_name: String, - store_dir: String, - base: Option, -} - -impl Tester { - async fn new(region_name: &str, store_dir: &str) -> Tester { - let region = create_region_for_basic(region_name, store_dir).await; - - Tester { - region_name: region_name.to_string(), - store_dir: store_dir.to_string(), - base: Some(FileTesterBase::with_region(region)), - } - } - - async fn empty(region_name: &str, store_dir: &str) -> Tester { - Tester { - region_name: region_name.to_string(), - store_dir: store_dir.to_string(), - base: None, - } - } - - async fn reopen(&mut self) { - let _ = self.try_reopen().await.unwrap(); - } - - async fn try_reopen(&mut self) -> Result { - // Close the old region. - if let Some(base) = self.base.as_ref() { - info!("Reopen tester base"); - base.close().await; - } - - self.base = None; - // Reopen the region. - let store_config = config_util::new_store_config( - &self.region_name, - &self.store_dir, - EngineConfig::default(), - ) - .await; - let opts = OpenOptions::default(); - let region = RegionImpl::open(self.region_name.clone(), store_config, &opts).await?; - match region { - None => Ok(false), - Some(region) => { - let base = FileTesterBase::with_region(region); - self.base = Some(base); - Ok(true) - } - } - } - - #[inline] - fn base(&self) -> &FileTesterBase { - self.base.as_ref().unwrap() - } - - #[inline] - fn set_batch_size(&mut self, batch_size: usize) { - self.base.as_mut().unwrap().read_ctx.batch_size = batch_size; - } - - async fn put(&self, data: &[(i64, Option)]) { - let _ = self.base().put(data).await; - } - - async fn full_scan(&self) -> Vec<(i64, Option)> { - self.base().full_scan().await - } - - fn committed_sequence(&self) -> SequenceNumber { - self.base().committed_sequence() - } - - async fn delete(&self, keys: &[i64]) { - let _ = self.base().delete(keys).await; - } -} - -#[tokio::test] -async fn test_simple_put_scan() { - let dir = create_temp_dir("put-scan"); - let store_dir = dir.path().to_str().unwrap(); - let tester = Tester::new(REGION_NAME, store_dir).await; - - let data = vec![ - (1000, Some(100.to_string())), - (1001, Some(101.to_string())), - (1002, None), - (1003, Some(103.to_string())), - (1004, Some(104.to_string())), - ]; - - tester.put(&data).await; - - let output = tester.full_scan().await; - assert_eq!(data, output); -} - -#[tokio::test] -async fn test_sequence_increase() { - let dir = create_temp_dir("sequence"); - let store_dir = dir.path().to_str().unwrap(); - let tester = Tester::new(REGION_NAME, store_dir).await; - - let mut committed_sequence = tester.committed_sequence(); - for i in 0..100 { - tester.put(&[(i, Some(1234.to_string()))]).await; - committed_sequence += 1; - - assert_eq!(committed_sequence, tester.committed_sequence()); - } -} - -#[tokio::test] -async fn test_reopen() { - common_telemetry::logging::init_default_ut_logging(); - - let dir = create_temp_dir("reopen"); - let store_dir = dir.path().to_str().unwrap(); - let mut tester = Tester::new(REGION_NAME, store_dir).await; - - let mut all_data = Vec::new(); - // Reopen region multiple times. - for i in 0..5 { - let data = (i, Some(i.to_string())); - tester.put(&[data.clone()]).await; - all_data.push(data.clone()); - - let output = tester.full_scan().await; - assert_eq!(all_data, output); - - tester.reopen().await; - - // Scan after reopen. - let output = tester.full_scan().await; - assert_eq!(all_data, output); - - // Check committed sequence. - assert_eq!(i + 1, tester.committed_sequence() as i64); - } -} - -#[tokio::test] -async fn test_open_empty() { - let dir = create_temp_dir("open-empty"); - let store_dir = dir.path().to_str().unwrap(); - let mut tester = Tester::empty(REGION_NAME, store_dir).await; - - let ret = tester.try_reopen().await; - assert!(!ret.unwrap()); -} - -#[tokio::test] -async fn test_scan_different_batch() { - let dir = create_temp_dir("different-batch"); - let store_dir = dir.path().to_str().unwrap(); - let mut tester = Tester::new(REGION_NAME, store_dir).await; - - let data: Vec<_> = (0..=2000).map(|i| (i, Some(i.to_string()))).collect(); - - for chunk in data.chunks(100) { - tester.put(chunk).await; - } - - let batch_sizes = [1, 2, 4, 16, 64, 128, 256, 512]; - for batch_size in batch_sizes { - tester.set_batch_size(batch_size); - - let output = tester.full_scan().await; - assert_eq!(data, output); - } -} - -#[tokio::test] -async fn test_put_delete_scan() { - common_telemetry::init_default_ut_logging(); - let dir = create_temp_dir("put-delete-scan"); - let store_dir = dir.path().to_str().unwrap(); - let mut tester = Tester::new(REGION_NAME, store_dir).await; - - let data = vec![ - (1000, Some(100.to_string())), - (1001, Some(101.to_string())), - (1002, None), - (1003, None), - (1004, Some(104.to_string())), - ]; - - tester.put(&data).await; - - let keys = [1001, 1003]; - - tester.delete(&keys).await; - - let output = tester.full_scan().await; - let expect = vec![ - (1000, Some(100.to_string())), - (1002, None), - (1004, Some(104.to_string())), - ]; - assert_eq!(expect, output); - - // Deletion is also persistent. - let _ = tester.try_reopen().await.unwrap(); - let output = tester.full_scan().await; - assert_eq!(expect, output); -} - -#[tokio::test] -async fn test_put_delete_absent_key() { - let dir = create_temp_dir("put-delete-scan"); - let store_dir = dir.path().to_str().unwrap(); - let mut tester = Tester::new(REGION_NAME, store_dir).await; - - let data = vec![ - (1000, Some(100.to_string())), - (1001, Some(101.to_string())), - (1002, None), - (1003, None), - (1004, Some(104.to_string())), - ]; - - tester.put(&data).await; - - // 999 and 1006 is absent. - let keys = [999, 1002, 1004, 1006]; - - tester.delete(&keys).await; - - let output = tester.full_scan().await; - let expect = vec![ - (1000, Some(100.to_string())), - (1001, Some(101.to_string())), - (1003, None), - ]; - assert_eq!(expect, output); - - // Deletion is also persistent. - let _ = tester.try_reopen().await.unwrap(); - let output = tester.full_scan().await; - assert_eq!(expect, output); -} diff --git a/src/storage/src/region/tests/close.rs b/src/storage/src/region/tests/close.rs deleted file mode 100644 index 75f7ab032db4..000000000000 --- a/src/storage/src/region/tests/close.rs +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Region close tests. - -use std::sync::Arc; - -use common_test_util::temp_dir::create_temp_dir; -use log_store::raft_engine::log_store::RaftEngineLogStore; -use store_api::storage::{ - AlterOperation, AlterRequest, CloseContext, Region, RegionMeta, WriteResponse, -}; - -use crate::config::EngineConfig; -use crate::engine; -use crate::error::Error; -use crate::flush::FlushStrategyRef; -use crate::region::tests::{self, FileTesterBase}; -use crate::region::RegionImpl; -use crate::test_util::config_util; -use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch}; - -const REGION_NAME: &str = "region-close-0"; - -/// Tester for region close -struct CloseTester { - base: Option, -} - -/// Create a new region for close test -async fn create_region_for_close( - store_dir: &str, - flush_strategy: FlushStrategyRef, -) -> RegionImpl { - let metadata = tests::new_metadata(REGION_NAME); - - let mut store_config = - config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await; - store_config.flush_strategy = flush_strategy; - - RegionImpl::create(metadata, store_config).await.unwrap() -} - -impl CloseTester { - async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> CloseTester { - let region = create_region_for_close(store_dir, flush_strategy.clone()).await; - - CloseTester { - base: Some(FileTesterBase::with_region(region)), - } - } - - #[inline] - fn base(&self) -> &FileTesterBase { - self.base.as_ref().unwrap() - } - - async fn put(&self, data: &[(i64, Option)]) { - let data = data - .iter() - .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string()))) - .collect::>(); - let _ = self.base().put(&data).await; - } - - async fn try_put(&self, data: &[(i64, Option)]) -> Result { - let data = data - .iter() - .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string()))) - .collect::>(); - self.base().try_put(&data).await - } - - async fn try_alter(&self, mut req: AlterRequest) -> Result<(), Error> { - let version = self.version(); - req.version = version; - - self.base().region.alter(req).await - } - - fn version(&self) -> u32 { - let metadata = self.base().region.in_memory_metadata(); - metadata.version() - } -} - -#[tokio::test] -async fn test_close_basic() { - common_telemetry::init_default_ut_logging(); - let dir = create_temp_dir("close-basic"); - let store_dir = dir.path().to_str().unwrap(); - - let flush_switch = Arc::new(FlushSwitch::default()); - let tester = CloseTester::new(store_dir, flush_switch).await; - - tester - .base() - .region - .close(&CloseContext::default()) - .await - .unwrap(); - - let data = [(1000, Some(100))]; - - let closed_region_error = "Try to write the closed region".to_string(); - // Put one element should return ClosedRegion error - assert_eq!( - tester.try_put(&data).await.unwrap_err().to_string(), - closed_region_error - ); - - // Alter table should return ClosedRegion error - assert_eq!( - tester - .try_alter(AlterRequest { - operation: AlterOperation::AddColumns { - columns: Vec::new(), - }, - version: 0, - }) - .await - .unwrap_err() - .to_string(), - closed_region_error - ); -} - -#[tokio::test] -async fn test_close_wait_flush_done() { - common_telemetry::init_default_ut_logging(); - let dir = create_temp_dir("close-basic"); - let store_dir = dir.path().to_str().unwrap(); - - let flush_switch = Arc::new(FlushSwitch::default()); - let tester = CloseTester::new(store_dir, flush_switch.clone()).await; - - let data = [(1000, Some(100))]; - - // Now set should flush to true to trigger flush. - flush_switch.set_should_flush(true); - - // Put one element so we have content to flush. - tester.put(&data).await; - - let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME)); - assert!(!has_parquet_file(&sst_dir)); - - // Close should cancel the flush. - tester - .base() - .region - .close(&CloseContext::default()) - .await - .unwrap(); - - assert!(!has_parquet_file(&sst_dir)); -} diff --git a/src/storage/src/region/tests/compact.rs b/src/storage/src/region/tests/compact.rs deleted file mode 100644 index 1cd946f4ab0a..000000000000 --- a/src/storage/src/region/tests/compact.rs +++ /dev/null @@ -1,458 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Region compaction tests. - -use std::env; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; - -use common_telemetry::logging; -use common_test_util::temp_dir::create_temp_dir; -use log_store::raft_engine::log_store::RaftEngineLogStore; -use object_store::services::{Fs, S3}; -use object_store::ObjectStore; -use store_api::storage::{FlushContext, FlushReason, OpenOptions, Region}; -use tokio::sync::{Notify, RwLock}; - -use crate::compaction::CompactionHandler; -use crate::config::EngineConfig; -use crate::error::Result; -use crate::file_purger::{FilePurgeHandler, FilePurgeRequest}; -use crate::region::tests::{self, FileTesterBase}; -use crate::region::{CompactContext, FlushStrategyRef, RegionImpl}; -use crate::scheduler::rate_limit::BoxedRateLimitToken; -use crate::scheduler::{Handler, LocalScheduler, SchedulerConfig}; -use crate::test_util::config_util; -use crate::test_util::flush_switch::FlushSwitch; - -const REGION_NAME: &str = "region-compact-0"; - -fn new_object_store(store_dir: &str, s3_bucket: Option) -> ObjectStore { - if let Some(bucket) = s3_bucket { - if !bucket.is_empty() { - logging::info!("Use S3 object store"); - - let root = uuid::Uuid::new_v4().to_string(); - - let mut builder = S3::default(); - let _ = builder - .root(&root) - .access_key_id(&env::var("GT_S3_ACCESS_KEY_ID").unwrap()) - .secret_access_key(&env::var("GT_S3_ACCESS_KEY").unwrap()) - .region(&env::var("GT_S3_REGION").unwrap()) - .bucket(&bucket); - - return ObjectStore::new(builder).unwrap().finish(); - } - } - - logging::info!("Use local fs object store"); - - let mut builder = Fs::default(); - let _ = builder.root(store_dir); - ObjectStore::new(builder).unwrap().finish() -} - -/// Create a new region for compaction test -async fn create_region_for_compaction< - H: Handler + Send + Sync + 'static, ->( - store_dir: &str, - engine_config: EngineConfig, - purge_handler: H, - flush_strategy: FlushStrategyRef, - s3_bucket: Option, -) -> ( - RegionImpl, - ObjectStore, - Arc>>>, -) { - let metadata = tests::new_metadata(REGION_NAME); - - let object_store = new_object_store(store_dir, s3_bucket); - - let (mut store_config, _) = config_util::new_store_config_with_object_store( - REGION_NAME, - store_dir, - object_store.clone(), - EngineConfig::default(), - ) - .await; - store_config.engine_config = Arc::new(engine_config); - store_config.flush_strategy = flush_strategy; - - let pending_compaction_tasks = Arc::new(RwLock::new(vec![])); - let handler = CompactionHandler::new_with_pending_tasks(pending_compaction_tasks.clone()); - let config = SchedulerConfig::default(); - // Overwrite test compaction scheduler and file purger. - store_config.compaction_scheduler = Arc::new(LocalScheduler::new(config, handler)); - store_config.file_purger = Arc::new(LocalScheduler::new( - SchedulerConfig { - max_inflight_tasks: store_config.engine_config.max_purge_tasks, - }, - purge_handler, - )); - - ( - RegionImpl::create(metadata, store_config).await.unwrap(), - object_store, - pending_compaction_tasks, - ) -} - -#[derive(Debug, Default, Clone)] -struct MockFilePurgeHandler { - num_deleted: Arc, -} - -#[async_trait::async_trait] -impl Handler for MockFilePurgeHandler { - type Request = FilePurgeRequest; - - async fn handle_request( - &self, - req: Self::Request, - token: BoxedRateLimitToken, - finish_notifier: Arc, - ) -> Result<()> { - logging::info!( - "Try to delete file: {:?}, num_deleted: {:?}", - req.file_id, - self.num_deleted - ); - - let handler = FilePurgeHandler; - handler - .handle_request(req, token, finish_notifier) - .await - .unwrap(); - - let _ = self.num_deleted.fetch_add(1, Ordering::Relaxed); - - Ok(()) - } -} - -impl MockFilePurgeHandler { - fn num_deleted(&self) -> usize { - self.num_deleted.load(Ordering::Relaxed) - } -} - -/// Tester for region compaction. -struct CompactionTester { - base: Option, - purge_handler: MockFilePurgeHandler, - object_store: ObjectStore, - store_dir: String, - engine_config: EngineConfig, - flush_strategy: FlushStrategyRef, - pending_tasks: Arc>>>, -} - -impl CompactionTester { - async fn new( - store_dir: &str, - engine_config: EngineConfig, - flush_strategy: FlushStrategyRef, - s3_bucket: Option, - ) -> CompactionTester { - let purge_handler = MockFilePurgeHandler::default(); - let (region, object_store, pending_tasks) = create_region_for_compaction( - store_dir, - engine_config.clone(), - purge_handler.clone(), - flush_strategy.clone(), - s3_bucket, - ) - .await; - - CompactionTester { - base: Some(FileTesterBase::with_region(region)), - purge_handler, - object_store, - store_dir: store_dir.to_string(), - engine_config, - flush_strategy, - pending_tasks, - } - } - - #[inline] - fn base(&self) -> &FileTesterBase { - self.base.as_ref().unwrap() - } - - #[inline] - fn base_mut(&mut self) -> &mut FileTesterBase { - self.base.as_mut().unwrap() - } - - async fn put(&self, data: &[(i64, Option)]) { - let data = data - .iter() - .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string()))) - .collect::>(); - let _ = self.base().put(&data).await; - } - - async fn flush(&self, wait: Option) { - let ctx = wait - .map(|wait| FlushContext { - wait, - reason: FlushReason::Manually, - ..Default::default() - }) - .unwrap_or_default(); - self.base().region.flush(&ctx).await.unwrap(); - } - - async fn compact(&self) { - // Trigger compaction and wait until it is done. - self.base() - .region - .compact(&CompactContext::default()) - .await - .unwrap(); - } - - /// Close region and clean up files. - async fn clean_up(mut self) { - self.base = None; - - self.object_store.remove_all("/").await.unwrap(); - } - - async fn reopen(&mut self) -> Result { - // Close the old region. - if let Some(base) = self.base.take() { - let _ = futures::future::join_all(self.pending_tasks.write().await.drain(..)).await; - base.close().await; - } - - // Reopen the region. - let object_store = new_object_store(&self.store_dir, None); - let (mut store_config, _) = config_util::new_store_config_with_object_store( - REGION_NAME, - &self.store_dir, - object_store.clone(), - EngineConfig { - max_files_in_l0: usize::MAX, - ..Default::default() - }, - ) - .await; - store_config.engine_config = Arc::new(self.engine_config.clone()); - store_config.flush_strategy = self.flush_strategy.clone(); - - let handler = CompactionHandler::new_with_pending_tasks(Arc::new(Default::default())); - let config = SchedulerConfig::default(); - // Overwrite test compaction scheduler and file purger. - store_config.compaction_scheduler = Arc::new(LocalScheduler::new(config, handler)); - store_config.file_purger = Arc::new(LocalScheduler::new( - SchedulerConfig { - max_inflight_tasks: store_config.engine_config.max_purge_tasks, - }, - MockFilePurgeHandler::default(), - )); - - let Some(region) = RegionImpl::open( - REGION_NAME.to_string(), - store_config, - &OpenOptions::default(), - ) - .await? - else { - return Ok(false); - }; - self.base = Some(FileTesterBase::with_region(region)); - Ok(true) - } -} - -async fn compact_during_read(s3_bucket: Option) { - let dir = create_temp_dir("compact_read"); - let store_dir = dir.path().to_str().unwrap(); - - // Use a large max_files_in_l0 to avoid compaction automatically. - let mut tester = CompactionTester::new( - store_dir, - EngineConfig { - max_files_in_l0: 100, - ..Default::default() - }, - // Disable auto-flush. - Arc::new(FlushSwitch::default()), - s3_bucket, - ) - .await; - - let expect: Vec<_> = (0..200).map(|v| (v, Some(v))).collect(); - // Put elements so we have content to flush (In SST1). - tester.put(&expect[0..100]).await; - - // Flush content to SST1. - tester.flush(None).await; - - // Put element (In SST2). - tester.put(&expect[100..200]).await; - - // Flush content to SST2. - tester.flush(None).await; - - tester.base_mut().read_ctx.batch_size = 1; - // Create a reader. - let reader = tester.base().full_scan_reader().await; - - assert_eq!(0, tester.purge_handler.num_deleted()); - - // Trigger compaction. - tester.compact().await; - - // The files are still referenced. - assert_eq!(0, tester.purge_handler.num_deleted()); - - // Read from the reader. - let output = tester.base().collect_reader(reader).await; - - assert_eq!(expect.len(), output.len()); - - tester.clean_up().await; -} - -#[tokio::test] -async fn test_compact_during_read_on_fs() { - common_telemetry::init_default_ut_logging(); - - compact_during_read(None).await; -} - -#[tokio::test] -async fn test_compact_during_read_on_s3() { - common_telemetry::init_default_ut_logging(); - - if let Ok(bucket) = env::var("GT_S3_BUCKET") { - if !bucket.is_empty() { - compact_during_read(Some(bucket)).await; - } - } -} - -#[tokio::test] -async fn test_persist_region_compaction_time_window() { - common_telemetry::init_default_ut_logging(); - let dir = create_temp_dir("put-delete-scan"); - let store_dir = dir.path().to_str().unwrap(); - let mut tester = CompactionTester::new( - store_dir, - EngineConfig { - max_files_in_l0: 100, - ..Default::default() - }, - // Disable auto-flush. - Arc::new(FlushSwitch::default()), - None, - ) - .await; - - // initially the time window is not present since no compaction ever happened. - assert_eq!( - None, - tester - .base - .as_ref() - .unwrap() - .region - .inner - .shared - .version_control - .current() - .ssts() - .compaction_time_window() - ); - - // write some data with one hour span - for idx in 0..10 { - tester - .put(&[(idx * 1000, Some(idx)), ((idx + 360) * 1000, Some(idx))]) - .await; - tester.flush(Some(true)).await; - } - - tester.compact().await; - // the inferred and persisted compaction time window should be 3600 seconds. - assert_eq!( - 3600, - tester - .base - .as_ref() - .unwrap() - .region - .inner - .shared - .version_control - .current() - .ssts() - .compaction_time_window() - .unwrap() - ); - - // try write data with a larger time window - for idx in 0..10 { - tester - .put(&[ - (idx * 1000, Some(idx)), - ((idx + 2 * 60 * 60) * 1000, Some(idx)), - ]) - .await; - tester.flush(Some(true)).await; - } - tester.compact().await; - - // but we won't changed persisted compaction window for now, so it remains unchanged. - assert_eq!( - 3600, - tester - .base - .as_ref() - .unwrap() - .region - .inner - .shared - .version_control - .current() - .ssts() - .compaction_time_window() - .unwrap() - ); - - let reopened = tester.reopen().await.unwrap(); - assert!(reopened); - assert_eq!( - 3600, - tester - .base - .as_ref() - .unwrap() - .region - .inner - .shared - .version_control - .current() - .ssts() - .compaction_time_window() - .unwrap() - ); -} diff --git a/src/storage/src/region/tests/drop.rs b/src/storage/src/region/tests/drop.rs deleted file mode 100644 index 8fc7b8550f49..000000000000 --- a/src/storage/src/region/tests/drop.rs +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Region drop tests. - -use std::path::Path; -use std::sync::Arc; - -use common_telemetry::info; -use common_test_util::temp_dir::create_temp_dir; -use log_store::raft_engine::log_store::RaftEngineLogStore; -use store_api::manifest::{Manifest, MetaAction}; -use store_api::storage::{FlushContext, OpenOptions, Region}; - -use crate::config::EngineConfig; -use crate::engine; -use crate::flush::FlushStrategyRef; -use crate::manifest::action::{RegionMetaAction, RegionMetaActionList, RegionRemove}; -use crate::region::tests::{self, FileTesterBase}; -use crate::region::RegionImpl; -use crate::test_util::config_util; -use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch}; - -const REGION_NAME: &str = "region-drop-0"; - -/// Create a new region for drop tests. -async fn create_region_for_drop( - store_dir: &str, - flush_strategy: FlushStrategyRef, -) -> RegionImpl { - let metadata = tests::new_metadata(REGION_NAME); - - let mut store_config = - config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await; - store_config.flush_strategy = flush_strategy; - - RegionImpl::create(metadata, store_config).await.unwrap() -} - -/// Tester for drop tests. -struct DropTester { - base: Option, -} - -impl DropTester { - async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> DropTester { - let region = create_region_for_drop(store_dir, flush_strategy).await; - DropTester { - base: Some(FileTesterBase::with_region(region)), - } - } - - #[inline] - fn base(&self) -> &FileTesterBase { - self.base.as_ref().unwrap() - } - - async fn put(&self, data: &[(i64, Option)]) { - let data = data - .iter() - .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string()))) - .collect::>(); - let _ = self.base().put(&data).await; - } - - async fn flush(&self) { - let ctx = FlushContext::default(); - self.base().region.flush(&ctx).await.unwrap(); - } - - async fn close(&mut self) { - if let Some(base) = self.base.take() { - base.close().await; - } - } -} - -fn get_all_files(path: &str) -> Vec { - let mut files = Vec::new(); - for entry in std::fs::read_dir(path).unwrap() { - let entry = entry.unwrap(); - let path = entry.path(); - if path.is_file() { - files.push(path.to_str().unwrap().to_string()); - } else if path.is_dir() { - files.extend(get_all_files(path.to_str().unwrap())); - } - } - files -} - -#[tokio::test] -async fn test_drop_basic() { - let dir = create_temp_dir("drop-basic"); - common_telemetry::init_default_ut_logging(); - let store_dir = dir.path().to_str().unwrap(); - - let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME)); - let manifest_dir = format!( - "{}/{}", - store_dir, - engine::region_manifest_dir("", REGION_NAME) - ); - let flush_switch = Arc::new(FlushSwitch::default()); - let mut tester = DropTester::new(store_dir, flush_switch.clone()).await; - - let data = [(1000, Some(100))]; - - // Put one element so we have content to flush. - tester.put(&data).await; - - // Manually trigger flush. - tester.flush().await; - - assert!(has_parquet_file(&sst_dir)); - - tester.base().checkpoint_manifest().await; - let manifest_files = get_all_files(&manifest_dir); - info!("manifest_files: {:?}", manifest_files); - - tester.base().region.drop_region().await.unwrap(); - tester.close().await; - - assert!(!Path::new(&manifest_dir).exists()); -} - -#[tokio::test] -async fn test_drop_reopen() { - let dir = create_temp_dir("drop-basic"); - common_telemetry::init_default_ut_logging(); - let store_dir = dir.path().to_str().unwrap(); - - let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME)); - let manifest_dir = format!( - "{}/{}", - store_dir, - engine::region_manifest_dir("", REGION_NAME) - ); - let flush_switch = Arc::new(FlushSwitch::default()); - let mut tester = DropTester::new(store_dir, flush_switch.clone()).await; - - let data = [(1000, Some(100))]; - - // Put one element so we have content to flush. - tester.put(&data).await; - // Manually trigger flush. - tester.flush().await; - - assert!(has_parquet_file(&sst_dir)); - - tester.base().checkpoint_manifest().await; - let version_control = tester.base().region.version_control(); - - let mut action_list = - RegionMetaActionList::with_action(RegionMetaAction::Remove(RegionRemove { - region_id: tester.base().region.id(), - })); - let prev_version = version_control.current_manifest_version(); - action_list.set_prev_version(prev_version); - let manifest = &tester.base().region.inner.manifest; - let _ = manifest.update(action_list).await.unwrap(); - tester.close().await; - - // Reopen the region. - let store_config = config_util::new_store_config( - REGION_NAME, - store_dir, - EngineConfig { - max_files_in_l0: usize::MAX, - ..Default::default() - }, - ) - .await; - - let opts = OpenOptions::default(); - let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts) - .await - .unwrap(); - assert!(region.is_none()); - assert!(!Path::new(&manifest_dir).exists()); -} diff --git a/src/storage/src/region/tests/flush.rs b/src/storage/src/region/tests/flush.rs deleted file mode 100644 index 7095e1268862..000000000000 --- a/src/storage/src/region/tests/flush.rs +++ /dev/null @@ -1,462 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Region flush tests. - -use std::sync::Arc; -use std::time::Duration; - -use arrow::compute::SortOptions; -use common_query::prelude::Expr; -use common_recordbatch::OrderOption; -use common_test_util::temp_dir::create_temp_dir; -use common_time::timestamp::TimeUnit; -use datafusion_common::Column; -use datatypes::value::timestamp_to_scalar_value; -use log_store::raft_engine::log_store::RaftEngineLogStore; -use store_api::storage::{FlushContext, FlushReason, OpenOptions, Region, ScanRequest}; - -use crate::config::EngineConfig; -use crate::engine::{self, RegionMap}; -use crate::flush::{FlushStrategyRef, FlushType}; -use crate::region::tests::{self, FileTesterBase}; -use crate::region::RegionImpl; -use crate::test_util::config_util; -use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch}; - -const REGION_NAME: &str = "region-flush-0"; - -/// Create a new region for flush test -async fn create_region_for_flush( - store_dir: &str, - flush_strategy: FlushStrategyRef, -) -> ( - RegionImpl, - Arc>, -) { - let metadata = tests::new_metadata(REGION_NAME); - - let (mut store_config, regions) = config_util::new_store_config_and_region_map( - REGION_NAME, - store_dir, - EngineConfig { - max_files_in_l0: usize::MAX, - ..Default::default() - }, - ) - .await; - store_config.flush_strategy = flush_strategy; - - ( - RegionImpl::create(metadata, store_config).await.unwrap(), - regions, - ) -} - -/// Tester for region flush. -struct FlushTester { - base: Option, - store_dir: String, - flush_strategy: FlushStrategyRef, - regions: Arc>, -} - -impl FlushTester { - async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> FlushTester { - let (region, regions) = create_region_for_flush(store_dir, flush_strategy.clone()).await; - - FlushTester { - base: Some(FileTesterBase::with_region(region)), - store_dir: store_dir.to_string(), - flush_strategy: flush_strategy.clone(), - regions, - } - } - - async fn reopen(&mut self) { - self.regions.clear(); - // Close the old region. - if let Some(base) = self.base.take() { - base.close().await; - } - // Reopen the region. - let mut store_config = config_util::new_store_config( - REGION_NAME, - &self.store_dir, - EngineConfig { - max_files_in_l0: usize::MAX, - ..Default::default() - }, - ) - .await; - store_config.flush_strategy = self.flush_strategy.clone(); - let opts = OpenOptions::default(); - let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts) - .await - .unwrap() - .unwrap(); - self.base = Some(FileTesterBase::with_region(region)); - } - - #[inline] - fn base(&self) -> &FileTesterBase { - self.base.as_ref().unwrap() - } - - async fn put(&self, data: &[(i64, Option)]) { - let data = data - .iter() - .map(|(ts, v0)| (*ts, v0.map(|v| v.to_string()))) - .collect::>(); - let _ = self.base().put(&data).await; - } - - async fn full_scan(&self) -> Vec<(i64, Option)> { - self.base().full_scan().await - } - - async fn scan(&self, req: ScanRequest) -> Vec<(i64, Option)> { - self.base().scan(req).await - } - - async fn flush(&self, wait: Option) { - let ctx = wait - .map(|wait| FlushContext { - wait, - reason: FlushReason::Manually, - ..Default::default() - }) - .unwrap_or_default(); - self.base().region.flush(&ctx).await.unwrap(); - } -} - -impl Drop for FlushTester { - fn drop(&mut self) { - self.regions.clear(); - } -} - -#[tokio::test] -async fn test_flush_and_stall() { - common_telemetry::init_default_ut_logging(); - - let dir = create_temp_dir("flush-stall"); - let store_dir = dir.path().to_str().unwrap(); - - let flush_switch = Arc::new(FlushSwitch::default()); - let tester = FlushTester::new(store_dir, flush_switch.clone()).await; - - let data = [(1000, Some(100))]; - // Put one element so we have content to flush. - tester.put(&data).await; - - // Now set should flush to true to trigger flush. - flush_switch.set_should_flush(true); - // Put element to trigger flush. - tester.put(&data).await; - - // Now put another data to trigger write stall and wait until last flush done to - // ensure at least one parquet file is generated. - tester.put(&data).await; - - // Check parquet files. - let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME)); - assert!(has_parquet_file(&sst_dir)); -} - -#[tokio::test] -async fn test_manual_flush() { - common_telemetry::init_default_ut_logging(); - let dir = create_temp_dir("manual_flush"); - - let store_dir = dir.path().to_str().unwrap(); - - let flush_switch = Arc::new(FlushSwitch::default()); - let tester = FlushTester::new(store_dir, flush_switch.clone()).await; - - let data = [(1000, Some(100))]; - // Put one element so we have content to flush. - tester.put(&data).await; - - // No parquet file should be flushed. - let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME)); - assert!(!has_parquet_file(&sst_dir)); - - tester.flush(None).await; - - assert!(has_parquet_file(&sst_dir)); -} - -#[tokio::test] -async fn test_flush_and_reopen() { - common_telemetry::init_default_ut_logging(); - let dir = create_temp_dir("manual_flush"); - let store_dir = dir.path().to_str().unwrap(); - let flush_switch = Arc::new(FlushSwitch::default()); - let mut tester = FlushTester::new(store_dir, flush_switch.clone()).await; - - tester.put(&[(1000, Some(100))]).await; - tester.flush(Some(true)).await; - tester.reopen().await; - let i = tester - .base() - .region - .inner - .shared - .version_control - .committed_sequence(); - - // we wrote a request and flushed the region (involving writing a manifest), thus - // committed_sequence should be 2. - assert_eq!(2, i); -} - -#[tokio::test] -async fn test_flush_empty() { - let dir = create_temp_dir("flush-empty"); - let store_dir = dir.path().to_str().unwrap(); - - let flush_switch = Arc::new(FlushSwitch::default()); - let tester = FlushTester::new(store_dir, flush_switch.clone()).await; - - // Flush empty table. - tester.flush(None).await; - let data = [(1000, Some(100))]; - // Put element to trigger flush. - tester.put(&data).await; - - // Put again. - let data = [(2000, Some(200))]; - tester.put(&data).await; - - // No parquet file should be flushed. - let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME)); - assert!(!has_parquet_file(&sst_dir)); - - let expect = vec![(1000, Some(100.to_string())), (2000, Some(200.to_string()))]; - - let output = tester.full_scan().await; - assert_eq!(expect, output); -} - -#[tokio::test] -async fn test_read_after_flush_across_window() { - common_telemetry::init_default_ut_logging(); - - let dir = create_temp_dir("read-flush"); - let store_dir = dir.path().to_str().unwrap(); - - let flush_switch = Arc::new(FlushSwitch::default()); - let tester = FlushTester::new(store_dir, flush_switch.clone()).await; - - // Put elements so we have content to flush. - tester.put(&[(1000, Some(100))]).await; - tester.put(&[(2000, Some(200))]).await; - - // Flush. - tester.flush(None).await; - - // Put element again. - tester.put(&[(3000, Some(300))]).await; - - let expect = vec![ - (1000, Some(100.to_string())), - (2000, Some(200.to_string())), - (3000, Some(300.to_string())), - ]; - - let output = tester.full_scan().await; - assert_eq!(expect, output); - - // Reopen - let mut tester = tester; - tester.reopen().await; - - // Scan after reopen. - let output = tester.full_scan().await; - assert_eq!(expect, output); -} - -#[tokio::test] -async fn test_read_after_flush_same_window() { - common_telemetry::init_default_ut_logging(); - - let dir = create_temp_dir("read-flush"); - let store_dir = dir.path().to_str().unwrap(); - - let flush_switch = Arc::new(FlushSwitch::default()); - let tester = FlushTester::new(store_dir, flush_switch.clone()).await; - - // Put elements so we have content to flush. - tester.put(&[(1000, Some(100))]).await; - tester.put(&[(2000, Some(200))]).await; - - // Flush. - tester.flush(None).await; - - // Put element again. - tester.put(&[(1003, Some(300))]).await; - - let expect = vec![ - (1000, Some(100.to_string())), - (1003, Some(300.to_string())), - (2000, Some(200.to_string())), - ]; - - let output = tester.full_scan().await; - assert_eq!(expect, output); - - // Reopen - let mut tester = tester; - tester.reopen().await; - - // Scan after reopen. - let output = tester.full_scan().await; - assert_eq!(expect, output); -} - -#[tokio::test] -async fn test_merge_read_after_flush() { - let dir = create_temp_dir("merge-read-flush"); - let store_dir = dir.path().to_str().unwrap(); - - let flush_switch = Arc::new(FlushSwitch::default()); - let tester = FlushTester::new(store_dir, flush_switch.clone()).await; - - // Put elements so we have content to flush (In SST1). - tester.put(&[(3000, Some(300))]).await; - tester.put(&[(2000, Some(200))]).await; - - // Flush content to SST1. - tester.flush(None).await; - - // Put element (In SST2). - tester.put(&[(2000, Some(201))]).await; - - // In SST2. - tester.put(&[(2000, Some(202))]).await; - tester.put(&[(1000, Some(100))]).await; - - // Trigger flush. - tester.flush(None).await; - - // Overwrite row (In memtable). - tester.put(&[(2000, Some(203))]).await; - - let expect = vec![ - (1000, Some(100.to_string())), - (2000, Some(203.to_string())), - (3000, Some(300.to_string())), - ]; - - let output = tester.full_scan().await; - assert_eq!(expect, output); - - // Reopen - let mut tester = tester; - tester.reopen().await; - - // Scan after reopen. - let output = tester.full_scan().await; - assert_eq!(expect, output); -} - -#[tokio::test] -async fn test_schedule_engine_flush() { - common_telemetry::init_default_ut_logging(); - - let dir = create_temp_dir("engine-flush"); - let store_dir = dir.path().to_str().unwrap(); - - let flush_switch = Arc::new(FlushSwitch::default()); - let tester = FlushTester::new(store_dir, flush_switch.clone()).await; - assert_eq!(0, tester.base().region.last_flush_millis()); - - // Insert the region to the region map. - let _ = tester.regions.get_or_occupy_slot( - REGION_NAME, - engine::RegionSlot::Ready(tester.base().region.clone()), - ); - - // Put elements so we have content to flush. - tester.put(&[(1000, Some(100))]).await; - tester.put(&[(2000, Some(200))]).await; - - flush_switch.set_flush_type(FlushType::Engine); - - // Put element and trigger an engine level flush. - tester.put(&[(3000, Some(300))]).await; - - // Wait for flush. - let mut count = 0; - while tester.base().region.last_flush_millis() == 0 && count < 50 { - tokio::time::sleep(Duration::from_millis(100)).await; - count += 1; - } - - // Check parquet files. - let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME)); - assert!(has_parquet_file(&sst_dir)); -} - -#[tokio::test] -async fn test_flush_and_query_empty() { - common_telemetry::init_default_ut_logging(); - let dir = create_temp_dir("flush_and_query_empty_range"); - let store_dir = dir.path().to_str().unwrap(); - let flush_switch = Arc::new(FlushSwitch::default()); - let tester = FlushTester::new(store_dir, flush_switch.clone()).await; - - tester - .put( - &(20000..30000) - .map(|v| (v as i64, Some(v as i64))) - .collect::>(), - ) - .await; - tester.flush(Some(true)).await; - - tester - .put( - &(20100..20200) - .map(|v| (v as i64, Some(v as i64))) - .collect::>(), - ) - .await; - tester.flush(Some(true)).await; - - use datafusion_expr::Expr as DfExpr; - let req = ScanRequest { - sequence: None, - projection: None, - filters: vec![Expr::from(datafusion_expr::binary_expr( - DfExpr::Column(Column::from("timestamp")), - datafusion_expr::Operator::GtEq, - datafusion_expr::lit(timestamp_to_scalar_value( - TimeUnit::Millisecond, - Some(20000), - )), - ))], - output_ordering: Some(vec![OrderOption { - name: "timestamp".to_string(), - options: SortOptions { - descending: true, - nulls_first: true, - }, - }]), - limit: Some(1), - }; - let _ = tester.scan(req).await; -} diff --git a/src/storage/src/region/tests/projection.rs b/src/storage/src/region/tests/projection.rs deleted file mode 100644 index 74b6d25374b9..000000000000 --- a/src/storage/src/region/tests/projection.rs +++ /dev/null @@ -1,206 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; -use std::sync::Arc; - -use common_test_util::temp_dir::create_temp_dir; -use datatypes::data_type::ConcreteDataType; -use datatypes::prelude::ScalarVector; -use datatypes::type_id::LogicalTypeId; -use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, VectorRef}; -use log_store::raft_engine::log_store::RaftEngineLogStore; -use store_api::logstore::LogStore; -use store_api::storage::{ - Chunk, ChunkReader, ReadContext, Region, ScanRequest, Snapshot, WriteContext, WriteRequest, -}; - -use crate::config::EngineConfig; -use crate::region::{RegionImpl, RegionMetadata}; -use crate::test_util::{self, config_util, descriptor_util, write_batch_util}; -use crate::write_batch::WriteBatch; - -/// Create metadata with schema (k0, timestamp, v0, v1) -fn new_metadata(region_name: &str) -> RegionMetadata { - let desc = descriptor_util::desc_with_field_columns(region_name, 2); - desc.try_into().unwrap() -} - -fn new_write_batch_for_test() -> WriteBatch { - write_batch_util::new_write_batch( - &[ - ("k0", LogicalTypeId::Int64, false), - ( - test_util::TIMESTAMP_NAME, - LogicalTypeId::TimestampMillisecond, - false, - ), - ("v0", LogicalTypeId::Int64, true), - ("v1", LogicalTypeId::Int64, true), - ], - Some(1), - 2, - ) -} - -/// Build put data -/// -/// ```text -/// k0: [key_start, key_start + 1, ... key_start + len - 1] -/// timestamp: [ts_start, ts_start + 1, ... ts_start + len - 1] -/// v0: [initial_value, ...., initial_value] -/// v1: [initial_value, ..., initial_value + len - 1] -/// ``` -fn new_put_data( - len: usize, - key_start: i64, - ts_start: i64, - initial_value: i64, -) -> HashMap { - let k0 = Arc::new(Int64Vector::from_values( - (0..len).map(|v| key_start + v as i64), - )) as VectorRef; - let ts = Arc::new(TimestampMillisecondVector::from_values( - (0..len).map(|v| ts_start + v as i64), - )) as VectorRef; - let v0 = Arc::new(Int64Vector::from_values( - std::iter::repeat(initial_value).take(len), - )) as VectorRef; - let v1 = Arc::new(Int64Vector::from_values( - (0..len).map(|v| initial_value + v as i64), - )) as VectorRef; - - HashMap::from([ - ("k0".to_string(), k0), - (test_util::TIMESTAMP_NAME.to_string(), ts), - ("v0".to_string(), v0), - ("v1".to_string(), v1), - ]) -} - -fn append_chunk_to(chunk: &Chunk, dst: &mut Vec>) { - if chunk.columns.is_empty() { - return; - } - let num_rows = chunk.columns[0].len(); - dst.resize(num_rows, Vec::new()); - for (i, row) in dst.iter_mut().enumerate() { - for col in &chunk.columns { - match col.data_type() { - ConcreteDataType::Int64(_) => { - let val = col - .as_any() - .downcast_ref::() - .unwrap() - .get_data(i) - .unwrap(); - row.push(val); - } - ConcreteDataType::Timestamp(_) => { - let val = col - .as_any() - .downcast_ref::() - .unwrap() - .get_data(i) - .unwrap(); - row.push(val.into()); - } - _ => unreachable!(), - } - } - } -} - -struct ProjectionTester { - region: RegionImpl, - write_ctx: WriteContext, - read_ctx: ReadContext, -} - -impl ProjectionTester { - fn with_region(region: RegionImpl) -> ProjectionTester { - ProjectionTester { - region, - write_ctx: WriteContext::default(), - read_ctx: ReadContext::default(), - } - } - - async fn put(&self, len: usize, key_start: i64, ts_start: i64, initial_value: i64) { - let mut batch = new_write_batch_for_test(); - let put_data = new_put_data(len, key_start, ts_start, initial_value); - batch.put(put_data).unwrap(); - - let _ = self.region.write(&self.write_ctx, batch).await.unwrap(); - } - - async fn scan(&self, projection: Option>) -> Vec> { - let snapshot = self.region.snapshot(&self.read_ctx).unwrap(); - - let request = ScanRequest { - projection, - ..Default::default() - }; - let resp = snapshot.scan(&self.read_ctx, request).await.unwrap(); - let mut reader = resp.reader; - - let mut dst = Vec::new(); - while let Some(chunk) = reader.next_chunk().await.unwrap() { - let chunk = reader.project_chunk(chunk); - append_chunk_to(&chunk, &mut dst); - } - - dst - } -} - -const REGION_NAME: &str = "region-projection-0"; - -async fn new_tester(store_dir: &str) -> ProjectionTester { - let metadata = new_metadata(REGION_NAME); - - let store_config = - config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await; - let region = RegionImpl::create(metadata, store_config).await.unwrap(); - - ProjectionTester::with_region(region) -} - -#[tokio::test] -async fn test_projection_ordered() { - let dir = create_temp_dir("projection-ordered"); - let store_dir = dir.path().to_str().unwrap(); - - let tester = new_tester(store_dir).await; - tester.put(4, 1, 10, 100).await; - - // timestamp, v1 - let output = tester.scan(Some(vec![1, 3])).await; - let expect = vec![vec![10, 100], vec![11, 101], vec![12, 102], vec![13, 103]]; - assert_eq!(expect, output); -} - -#[tokio::test] -async fn test_projection_unordered() { - let dir = create_temp_dir("projection-unordered"); - let store_dir = dir.path().to_str().unwrap(); - - let tester = new_tester(store_dir).await; - tester.put(4, 1, 10, 100).await; - - // v1, k0 - let output = tester.scan(Some(vec![3, 0])).await; - let expect = vec![vec![100, 1], vec![101, 2], vec![102, 3], vec![103, 4]]; - assert_eq!(expect, output); -} diff --git a/src/storage/src/region/tests/truncate.rs b/src/storage/src/region/tests/truncate.rs deleted file mode 100644 index 71d2da5bc655..000000000000 --- a/src/storage/src/region/tests/truncate.rs +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Region truncate tests. - -use std::sync::Arc; - -use common_test_util::temp_dir::create_temp_dir; -use log_store::raft_engine::log_store::RaftEngineLogStore; -use store_api::manifest::{Manifest, MetaAction}; -use store_api::storage::{FlushContext, OpenOptions, Region}; - -use crate::config::EngineConfig; -use crate::engine; -use crate::flush::FlushStrategyRef; -use crate::manifest::action::{RegionMetaAction, RegionMetaActionList, RegionTruncate}; -use crate::region::tests::{self, FileTesterBase}; -use crate::region::RegionImpl; -use crate::test_util::config_util; -use crate::test_util::flush_switch::{has_parquet_file, FlushSwitch}; - -const REGION_NAME: &str = "region-truncate-0"; - -/// Create a new region for truncate tests. -async fn create_region_for_truncate( - store_dir: &str, - flush_strategy: FlushStrategyRef, -) -> RegionImpl { - let metadata = tests::new_metadata(REGION_NAME); - - let mut store_config = - config_util::new_store_config(REGION_NAME, store_dir, EngineConfig::default()).await; - store_config.flush_strategy = flush_strategy; - - RegionImpl::create(metadata, store_config).await.unwrap() -} - -/// Tester for truncate tests. -struct TruncateTester { - store_dir: String, - base: Option, -} - -impl TruncateTester { - async fn new(store_dir: &str, flush_strategy: FlushStrategyRef) -> TruncateTester { - let region = create_region_for_truncate(store_dir, flush_strategy).await; - TruncateTester { - store_dir: store_dir.to_string(), - base: Some(FileTesterBase::with_region(region)), - } - } - - #[inline] - fn base(&self) -> &FileTesterBase { - self.base.as_ref().unwrap() - } - - async fn flush(&self) { - let ctx = FlushContext::default(); - self.base().region.flush(&ctx).await.unwrap(); - } - - async fn truncate(&self) { - self.base().region.truncate().await.unwrap(); - } - - async fn reopen(&mut self) { - // Close the old region. - if let Some(base) = self.base.as_ref() { - base.close().await; - } - self.base = None; - // Reopen the region. - let store_config = config_util::new_store_config( - REGION_NAME, - &self.store_dir, - EngineConfig { - max_files_in_l0: usize::MAX, - ..Default::default() - }, - ) - .await; - - let opts = OpenOptions::default(); - let region = RegionImpl::open(REGION_NAME.to_string(), store_config, &opts) - .await - .unwrap() - .unwrap(); - - self.base = Some(FileTesterBase::with_region(region)); - } -} - -#[tokio::test] -async fn test_truncate_basic() { - let dir = create_temp_dir("truncate-basic"); - common_telemetry::init_default_ut_logging(); - let store_dir = dir.path().to_str().unwrap(); - - let flush_switch = Arc::new(FlushSwitch::default()); - let tester = TruncateTester::new(store_dir, flush_switch.clone()).await; - - let data = [ - (1000, Some("1000".to_string())), - (1001, Some("1001".to_string())), - (1002, Some("1002".to_string())), - (1003, Some("1003".to_string())), - ]; - - // Data in Memtable - tester.base().put(&data).await; - let res = tester.base().full_scan().await; - assert_eq!(4, res.len()); - - // Truncate region. - tester.truncate().await; - - let res = tester.base().full_scan().await; - assert_eq!(0, res.len()); -} - -#[tokio::test] -async fn test_put_data_after_truncate() { - let dir = create_temp_dir("put_data_after_truncate"); - common_telemetry::init_default_ut_logging(); - let store_dir = dir.path().to_str().unwrap(); - - let sst_dir = format!("{}/{}", store_dir, engine::region_sst_dir("", REGION_NAME)); - let flush_switch = Arc::new(FlushSwitch::default()); - let tester = TruncateTester::new(store_dir, flush_switch.clone()).await; - - let data = [ - (1000, Some("1000".to_string())), - (1001, Some("1001".to_string())), - (1002, None), - (1003, Some("1003".to_string())), - ]; - - tester.base().put(&data).await; - - // Manually trigger flush. - tester.flush().await; - assert!(has_parquet_file(&sst_dir)); - - let data = [ - (1002, Some("1002".to_string())), - (1004, Some("1004".to_string())), - (1005, Some("1005".to_string())), - ]; - tester.base().put(&data).await; - - // Truncate region. - tester.truncate().await; - let res = tester.base().full_scan().await; - assert_eq!(0, res.len()); - - let new_data = [ - (1010, Some("0".to_string())), - (1011, Some("1".to_string())), - (1012, Some("2".to_string())), - (1013, Some("3".to_string())), - ]; - tester.base().put(&new_data).await; - - let res = tester.base().full_scan().await; - assert_eq!(new_data, res.as_slice()); -} - -#[tokio::test] -async fn test_truncate_reopen() { - let dir = create_temp_dir("put_data_after_truncate"); - common_telemetry::init_default_ut_logging(); - let store_dir = dir.path().to_str().unwrap(); - - let flush_switch = Arc::new(FlushSwitch::default()); - let mut tester = TruncateTester::new(store_dir, flush_switch.clone()).await; - - let data = [ - (1000, Some("1000".to_string())), - (1001, Some("1001".to_string())), - (1002, None), - (1003, Some("1003".to_string())), - ]; - - tester.base().put(&data).await; - - // Manually trigger flush. - tester.flush().await; - - let data = [ - (1002, Some("1002".to_string())), - (1004, Some("1004".to_string())), - (1005, Some("1005".to_string())), - ]; - tester.base().put(&data).await; - - let manifest = &tester.base().region.inner.manifest; - let manifest_version = tester - .base() - .region - .version_control() - .current_manifest_version(); - - let committed_sequence = tester.base().committed_sequence(); - let mut action_list = - RegionMetaActionList::with_action(RegionMetaAction::Truncate(RegionTruncate { - region_id: 0.into(), - committed_sequence, - })); - - // Persist the meta action. - let prev_version = manifest_version; - action_list.set_prev_version(prev_version); - manifest.update(action_list).await.unwrap(); - - // Reopen and put data. - tester.reopen().await; - let res = tester.base().full_scan().await; - assert_eq!(0, res.len()); - - let new_data = [ - (0, Some("0".to_string())), - (1, Some("1".to_string())), - (2, Some("2".to_string())), - (3, Some("3".to_string())), - ]; - - tester.base().put(&new_data).await; - let res = tester.base().full_scan().await; - assert_eq!(new_data, res.as_slice()); -} diff --git a/src/storage/src/region/writer.rs b/src/storage/src/region/writer.rs deleted file mode 100644 index a14ada258898..000000000000 --- a/src/storage/src/region/writer.rs +++ /dev/null @@ -1,984 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; -use std::time::Duration; - -use common_base::readable_size::ReadableSize; -use common_telemetry::logging; -use futures::TryStreamExt; -use snafu::{ensure, ResultExt}; -use store_api::logstore::LogStore; -use store_api::manifest::{Manifest, ManifestLogStorage, ManifestVersion, MetaAction}; -use store_api::storage::{ - AlterRequest, FlushContext, FlushReason, SequenceNumber, WriteContext, WriteResponse, -}; -use tokio::sync::{oneshot, Mutex}; - -use crate::compaction::{CompactionPickerRef, CompactionRequestImpl, CompactionSchedulerRef}; -use crate::config::EngineConfig; -use crate::error::{self, Result}; -use crate::flush::{ - FlushHandle, FlushRegionRequest, FlushSchedulerRef, FlushStrategyRef, FlushType, RegionStatus, -}; -use crate::manifest::action::{ - RawRegionMetadata, RegionChange, RegionEdit, RegionMetaAction, RegionMetaActionList, - RegionRemove, RegionTruncate, -}; -use crate::memtable::{Inserter, MemtableBuilderRef, MemtableId, MemtableRef, MemtableVersion}; -use crate::metadata::RegionMetadataRef; -use crate::metrics::{FLUSH_REQUESTS_TOTAL, PREPROCESS_ELAPSED}; -use crate::proto::wal::WalHeader; -use crate::region::{ - CompactContext, RecoveredMetadata, RecoveredMetadataMap, RegionManifest, SharedDataRef, -}; -use crate::schema::compat::CompatWrite; -use crate::sst::{AccessLayerRef, LevelMetas}; -use crate::version::{VersionControl, VersionControlRef, VersionEdit}; -use crate::wal::Wal; -use crate::write_batch::WriteBatch; - -pub type RegionWriterRef = Arc>; - -// TODO(yingwen): Add benches for write and support group commit to improve write throughput. - -/// Region writer manages all write operations to the region. -#[derive(Debug)] -pub struct RegionWriter { - // To avoid dead lock, we need to ensure the lock order is: inner -> version_mutex. - /// Inner writer guarded by write lock, the write lock is used to ensure - /// all write operations are serialized. - inner: Mutex, - /// Version lock, protects read-write-update to region `Version`. - /// - /// Increasing committed sequence should be guarded by this lock. - version_mutex: Mutex<()>, - - compaction_scheduler: CompactionSchedulerRef, - compaction_picker: CompactionPickerRef, -} - -impl RegionWriter -where - S: LogStore, -{ - pub fn new( - memtable_builder: MemtableBuilderRef, - config: Arc, - ttl: Option, - write_buffer_size: usize, - compaction_scheduler: CompactionSchedulerRef, - compaction_picker: CompactionPickerRef, - ) -> RegionWriter { - RegionWriter { - inner: Mutex::new(WriterInner::new( - memtable_builder, - config, - ttl, - write_buffer_size, - )), - version_mutex: Mutex::new(()), - compaction_scheduler, - compaction_picker, - } - } - - /// Write to region in the write lock. - pub async fn write( - &self, - ctx: &WriteContext, - request: WriteBatch, - writer_ctx: WriterContext<'_, S>, - ) -> Result { - let mut inner = self.inner.lock().await; - - ensure!(!inner.is_closed(), error::ClosedRegionSnafu); - - inner - .write(&self.version_mutex, ctx, request, writer_ctx) - .await - } - - /// Replay data to memtables. - pub async fn replay( - &self, - recovered_metadata: RecoveredMetadataMap, - writer_ctx: WriterContext<'_, S>, - ) -> Result<()> { - let mut inner = self.inner.lock().await; - inner - .replay(&self.version_mutex, recovered_metadata, writer_ctx) - .await - } - - /// Write and apply the region edit. - pub(crate) async fn write_edit_and_apply( - &self, - wal: &Wal, - shared: &SharedDataRef, - manifest: &RegionManifest, - edit: RegionEdit, - max_memtable_id: Option, - ) -> Result<()> { - let _lock = self.version_mutex.lock().await; - // HACK: We won't acquire the write lock here because write stall would hold - // write lock thus we have no chance to get the lock and apply the version edit. - // So we add a version lock to ensure modification to `VersionControl` is - // serialized. - let version_control = &shared.version_control; - let prev_version = version_control.current_manifest_version(); - - logging::debug!( - "Write region edit: {:?} to manifest, prev_version: {}.", - edit, - prev_version, - ); - - let files_to_add = edit.files_to_add.clone(); - let files_to_remove = edit.files_to_remove.clone(); - let flushed_sequence = edit.flushed_sequence; - let compaction_time_window = edit.compaction_time_window; - // Persist the meta action. - let mut action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit)); - action_list.set_prev_version(prev_version); - let manifest_version = manifest.update(action_list).await?; - - // Notify checkpointer the flushed manifest version after flushing memtable - if flushed_sequence.is_some() { - manifest.set_flushed_manifest_version(manifest_version); - } - - let version_edit = VersionEdit { - files_to_add, - files_to_remove, - flushed_sequence, - manifest_version, - max_memtable_id, - compaction_time_window, - }; - - // We could tolerate failure during persisting manifest version to the WAL, since it won't - // affect how we applying the edit to the version. - version_control.apply_edit(version_edit); - // TODO(yingwen): We should set the flush handle to `None`, but we can't acquire - // write lock here. - - // Persist the manifest version to notify subscriber of the wal that the manifest has been - // updated. This should be done at the end of the method. - self.persist_manifest_version(wal, version_control, manifest_version) - .await - } - - /// Alter schema of the region. - pub async fn alter(&self, alter_ctx: AlterContext<'_, S>, request: AlterRequest) -> Result<()> { - // To alter the schema, we need to acquire the write lock first, so we could - // avoid other writers write to the region and switch the memtable safely. - // Another potential benefit is that the write lock also protect against concurrent - // alter request to the region. - let inner = self.inner.lock().await; - - ensure!(!inner.is_closed(), error::ClosedRegionSnafu); - - let version_control = alter_ctx.version_control(); - - let old_metadata = version_control.metadata(); - old_metadata - .validate_alter(&request) - .context(error::InvalidAlterRequestSnafu)?; - - // The write lock protects us against other alter request, so we could build the new - // metadata struct outside of the version mutex. - let new_metadata = old_metadata - .alter(&request) - .context(error::AlterMetadataSnafu)?; - - let raw = RawRegionMetadata::from(&new_metadata); - - // Acquire the version lock before altering the metadata. - let _lock = self.version_mutex.lock().await; - - let committed_sequence = version_control.committed_sequence(); - let mut action_list = - RegionMetaActionList::with_action(RegionMetaAction::Change(RegionChange { - metadata: raw, - committed_sequence, - })); - let new_metadata = Arc::new(new_metadata); - - // Persist the meta action. - let prev_version = version_control.current_manifest_version(); - action_list.set_prev_version(prev_version); - - logging::debug!( - "Try to alter schema of region {}, region_id: {}, action_list: {:?}", - new_metadata.name(), - new_metadata.id(), - action_list - ); - - let manifest_version = alter_ctx.manifest.update(action_list).await?; - - // Now we could switch memtables and apply the new metadata to the version. - let new_mutable = inner.memtable_builder.build(new_metadata.schema().clone()); - version_control.freeze_mutable_and_apply_metadata( - new_metadata, - manifest_version, - new_mutable, - ); - - self.persist_manifest_version(alter_ctx.wal, version_control, manifest_version) - .await - } - - /// Allocate a sequence and persist the manifest version using that sequence to the wal. - /// - /// This method should be protected by the `version_mutex`. - async fn persist_manifest_version( - &self, - wal: &Wal, - version_control: &VersionControlRef, - manifest_version: ManifestVersion, - ) -> Result<()> { - // We always bump the committed sequence regardless whether persisting the manifest version - // to wal is success, to avoid RegionMetaAction use same committed sequence in accident. - let next_sequence = version_control.committed_sequence() + 1; - version_control.set_committed_sequence(next_sequence); - - let header = WalHeader::with_last_manifest_version(manifest_version); - let _ = wal.write_to_wal(next_sequence, header, None).await?; - - Ok(()) - } - - pub async fn close(&self) -> Result<()> { - // In order to close a writer - // 1. Acquires the write lock. - // 2. Sets a memory flag to reject any potential writing. - // 3. Waits for the pending flush task. - { - let mut inner = self.inner.lock().await; - - if inner.is_closed() { - return Ok(()); - } - - inner.mark_closed(); - } - // we release the writer lock once for rejecting any following potential writing requests immediately. - - self.wait_flush().await?; - - // TODO: cancel the compaction task - - Ok(()) - } - - pub async fn on_drop(&self, drop_ctx: DropContext<'_, S>) -> Result<()> { - // 1. Acquires the write lock. - // 2. Close writer reject any potential writing. - // 3. Waits or cancels the flush job. - // 4. Add `RegionMetaAction::Remove` to recover from manifest in case of failure. - // The main task is to restore the cleaning of sst files. If there is a failure - // in the previous stops, it can be restored through the `Procedure` framework. - // 5. Mark all data obsolete in the WAL. - // 6. Delete the namespace of the region from the WAL. - // 7. Mark all SSTs deleted. - // 8. Remove all manifests. - let mut inner = self.inner.lock().await; - inner.mark_closed(); - - if let Some(handle) = inner.flush_handle.take() { - handle.wait().await?; - } - - let version_control = drop_ctx.version_control(); - - let _lock = self.version_mutex.lock().await; - let committed_sequence = version_control.committed_sequence(); - let current_version = version_control.current(); - - let mut action_list = - RegionMetaActionList::with_action(RegionMetaAction::Remove(RegionRemove { - region_id: drop_ctx.shared.id, - })); - - // Persist the meta action. - let prev_version = version_control.current_manifest_version(); - action_list.set_prev_version(prev_version); - - logging::info!( - "Try to remove region {}, action_list: {:?}", - drop_ctx.shared.id(), - action_list - ); - - let remove_action_version = drop_ctx.manifest.update(action_list).await?; - - // Mark all data obsolete and delete the namespace in the WAL - drop_ctx.wal.obsolete(committed_sequence).await?; - drop_ctx.wal.delete_namespace().await?; - logging::info!( - "Remove WAL entries in region: {}, committed sequence: {}", - drop_ctx.shared.id(), - committed_sequence - ); - - // Mark all SSTs deleted - let files = current_version.ssts().mark_all_files_deleted(); - logging::info!( - "Try to remove all SSTs, region: {}, files: {:?}", - drop_ctx.shared.id(), - files - ); - - drop_ctx - .manifest - .manifest_store() - .delete_all(remove_action_version) - .await?; - Ok(()) - } - - /// Flush task manually - pub async fn flush(&self, writer_ctx: WriterContext<'_, S>, ctx: &FlushContext) -> Result<()> { - let mut inner = self.inner.lock().await; - - if !ctx.force { - ensure!(!inner.is_closed(), error::ClosedRegionSnafu); - } - - inner.manual_flush(writer_ctx, ctx.reason).await?; - - if ctx.wait { - if let Some(handle) = inner.flush_handle.take() { - handle.wait().await?; - } - } - - Ok(()) - } - - /// Compact manually. - pub async fn compact(&self, request: WriterCompactRequest) -> Result<()> { - let mut inner = self.inner.lock().await; - - ensure!(!inner.is_closed(), error::ClosedRegionSnafu); - let sst_write_buffer_size = ReadableSize::mb(8); // deprecated usage - - inner - .manual_compact( - request, - self.compaction_picker.clone(), - self.compaction_scheduler.clone(), - sst_write_buffer_size, - ) - .await - } - - /// Wait flush task if any - async fn wait_flush(&self) -> Result<()> { - let mut inner = self.inner.lock().await; - - if let Some(handle) = inner.flush_handle.take() { - handle.wait().await?; - } - - Ok(()) - } - - pub async fn truncate(&self, ctx: &TruncateContext<'_, S>) -> Result<()> { - // Acquires the write lock. - let mut inner = self.inner.lock().await; - ensure!(!inner.is_closed(), error::ClosedRegionSnafu); - - if let Some(handle) = inner.flush_handle.take() { - handle.wait().await?; - } - - let version_control = ctx.version_control(); - let _lock = self.version_mutex.lock().await; - let committed_sequence = version_control.committed_sequence(); - - // Add `RegionMetaAction::Truncate` to recover from manifest in case of failure. - let mut action_list = - RegionMetaActionList::with_action(RegionMetaAction::Truncate(RegionTruncate { - region_id: ctx.shared.id, - committed_sequence, - })); - - // Persist the meta action. - let current_version = version_control.current(); - let manifest_version = version_control.current_manifest_version(); - let prev_version = manifest_version; - action_list.set_prev_version(prev_version); - ctx.manifest.update(action_list).await?; - - // Mark all data obsolete - ctx.wal.obsolete(committed_sequence).await?; - - // Mark all SSTs deleted - let files = current_version.ssts().mark_all_files_deleted(); - logging::info!( - "Try to remove all SSTs, region: {}, files: {:?}", - ctx.shared.id(), - files - ); - - // Reset version - let memtables = Arc::new(MemtableVersion::new(inner.alloc_memtable(version_control))); - let ssts = Arc::new(LevelMetas::new( - ctx.sst_layer.clone(), - current_version.ssts().file_purger(), - )); - version_control.reset_version(manifest_version + 1, memtables, ssts); - - Ok(()) - } -} - -// Methods for tests. -#[cfg(test)] -impl RegionWriter -where - S: LogStore, -{ - pub(crate) async fn write_buffer_size(&self) -> usize { - self.inner.lock().await.write_buffer_size - } -} - -/// Structs needed by triggering a compaction. -pub struct WriterCompactRequest { - pub shared_data: SharedDataRef, - pub sst_layer: AccessLayerRef, - pub manifest: RegionManifest, - pub wal: Wal, - pub region_writer: RegionWriterRef, - pub compact_ctx: CompactContext, -} - -pub struct WriterContext<'a, S: LogStore> { - pub shared: &'a SharedDataRef, - pub flush_strategy: &'a FlushStrategyRef, - pub flush_scheduler: &'a FlushSchedulerRef, - pub compaction_scheduler: &'a CompactionSchedulerRef, - pub sst_layer: &'a AccessLayerRef, - pub wal: &'a Wal, - pub writer: &'a RegionWriterRef, - pub manifest: &'a RegionManifest, - pub compaction_picker: CompactionPickerRef, -} - -impl<'a, S: LogStore> WriterContext<'a, S> { - #[inline] - fn version_control(&self) -> &VersionControlRef { - &self.shared.version_control - } -} - -pub struct AlterContext<'a, S: LogStore> { - pub shared: &'a SharedDataRef, - pub wal: &'a Wal, - pub manifest: &'a RegionManifest, -} - -impl<'a, S: LogStore> AlterContext<'a, S> { - #[inline] - fn version_control(&self) -> &VersionControlRef { - &self.shared.version_control - } -} - -pub struct DropContext<'a, S: LogStore> { - pub shared: &'a SharedDataRef, - pub wal: &'a Wal, - pub manifest: &'a RegionManifest, - pub flush_scheduler: &'a FlushSchedulerRef, - pub compaction_scheduler: &'a CompactionSchedulerRef, - pub sst_layer: &'a AccessLayerRef, -} - -impl<'a, S: LogStore> DropContext<'a, S> { - #[inline] - fn version_control(&self) -> &VersionControlRef { - &self.shared.version_control - } -} - -pub struct TruncateContext<'a, S: LogStore> { - pub shared: &'a SharedDataRef, - pub wal: &'a Wal, - pub manifest: &'a RegionManifest, - pub sst_layer: &'a AccessLayerRef, -} - -impl<'a, S: LogStore> TruncateContext<'a, S> { - #[inline] - fn version_control(&self) -> &VersionControlRef { - &self.shared.version_control - } -} - -#[derive(Debug)] -struct WriterInner { - memtable_builder: MemtableBuilderRef, - flush_handle: Option, - - /// `WriterInner` will reject any future writing, if the closed flag is set. - /// - /// It should protected by upper mutex - closed: bool, - engine_config: Arc, - ttl: Option, - /// Size in bytes to freeze the mutable memtable. - write_buffer_size: usize, -} - -impl WriterInner { - fn new( - memtable_builder: MemtableBuilderRef, - engine_config: Arc, - ttl: Option, - write_buffer_size: usize, - ) -> WriterInner { - WriterInner { - memtable_builder, - flush_handle: None, - engine_config, - closed: false, - ttl, - write_buffer_size, - } - } - - /// Write `WriteBatch` to region, now the schema of batch needs to be validated outside. - /// - /// Mutable reference of writer ensure no other reference of this writer can modify the - /// version control (write is exclusive). - async fn write( - &mut self, - version_mutex: &Mutex<()>, - _ctx: &WriteContext, - mut request: WriteBatch, - writer_ctx: WriterContext<'_, S>, - ) -> Result { - self.preprocess_write(&writer_ctx).await?; - let version_control = writer_ctx.version_control(); - - let _lock = version_mutex.lock().await; - - let metadata = version_control.metadata(); - // We need to check the schema again since it might has been altered. We need - // to compat request's schema before writing it into the WAL otherwise some - // default constraint like `current_timestamp()` would yield different value - // during replay. - request.compat_write(metadata.schema().user_schema())?; - - let committed_sequence = version_control.committed_sequence(); - // Sequence for current write batch. - let next_sequence = committed_sequence + 1; - - let version = version_control.current(); - let wal_header = WalHeader::with_last_manifest_version(version.manifest_version()); - let _ = writer_ctx - .wal - .write_to_wal(next_sequence, wal_header, Some(request.payload())) - .await?; - - // Insert batch into memtable. - let mut inserter = Inserter::new(next_sequence); - inserter.insert_memtable(request.payload(), version.mutable_memtable())?; - - // Update committed_sequence to make current batch visible. The `&mut self` of WriterInner - // guarantees the writer is exclusive. - version_control.set_committed_sequence(next_sequence); - - Ok(WriteResponse {}) - } - - async fn replay( - &mut self, - version_mutex: &Mutex<()>, - mut recovered_metadata: RecoveredMetadataMap, - writer_ctx: WriterContext<'_, S>, - ) -> Result<()> { - let version_control = writer_ctx.version_control(); - - let (flushed_sequence, mut last_sequence); - let mut num_requests = 0; - let mut num_recovered_metadata = 0; - let mut next_apply_metadata = recovered_metadata.pop_first(); - { - let _lock = version_mutex.lock().await; - - // Data after flushed sequence need to be recovered. - flushed_sequence = version_control.current().flushed_sequence(); - last_sequence = flushed_sequence; - // Read starts from the first entry after last flushed entry, so the start sequence - // should be flushed_sequence + 1. - let mut stream = writer_ctx.wal.read_from_wal(flushed_sequence + 1).await?; - while let Some((req_sequence, _header, payload)) = stream.try_next().await? { - while let Some((sequence_before_alter, _)) = next_apply_metadata { - // There might be multiple metadata changes to be applied, so a loop is necessary. - if req_sequence > sequence_before_alter { - // This is the first request that use the new metadata. - self.apply_metadata( - &writer_ctx, - sequence_before_alter, - next_apply_metadata, - version_control, - )?; - - num_recovered_metadata += 1; - next_apply_metadata = recovered_metadata.pop_first(); - } else { - // Keep the next_apply_metadata until req_sequence > sequence_before_alter - break; - } - } - - if req_sequence > last_sequence { - last_sequence = req_sequence; - } else { - logging::error!( - "Sequence should not decrease during replay, found {} <= {}, \ - region_id: {}, region_name: {}, flushed_sequence: {}, num_requests: {}", - req_sequence, - last_sequence, - writer_ctx.shared.id, - writer_ctx.shared.name, - flushed_sequence, - num_requests, - ); - - error::SequenceNotMonotonicSnafu { - prev: last_sequence, - given: req_sequence, - } - .fail()?; - } - - if let Some(payload) = payload { - num_requests += 1; - // Note that memtables of `Version` may be updated during replay. - let version = version_control.current(); - // TODO(yingwen): Trigger flush if the size of memtables reach the flush threshold to avoid - // out of memory during replay, but we need to do it carefully to avoid dead lock. - let mut inserter = Inserter::new(last_sequence); - inserter.insert_memtable(&payload, version.mutable_memtable())?; - } - } - - // Apply metadata after last WAL entry - while let Some((sequence_before_alter, _)) = next_apply_metadata { - assert!( - sequence_before_alter >= last_sequence, - "The sequence in metadata after last WAL entry is less than last sequence, \ - metadata sequence: {}, last_sequence: {}, region_id: {}, region_name: {}", - sequence_before_alter, - last_sequence, - writer_ctx.shared.id, - writer_ctx.shared.name - ); - - self.apply_metadata( - &writer_ctx, - sequence_before_alter, - next_apply_metadata, - version_control, - )?; - - num_recovered_metadata += 1; - next_apply_metadata = recovered_metadata.pop_first(); - } - - version_control.set_committed_sequence(last_sequence); - } - - logging::info!( - "Region replay finished, region_id: {}, region_name: {}, flushed_sequence: {}, last_sequence: {}, num_requests: {}, num_recovered_metadata: {}", - writer_ctx.shared.id, - writer_ctx.shared.name, - flushed_sequence, - last_sequence, - num_requests, - num_recovered_metadata, - ); - - Ok(()) - } - - fn apply_metadata( - &self, - writer_ctx: &WriterContext<'_, S>, - sequence: SequenceNumber, - mut metadata: Option, - version_control: &VersionControl, - ) -> Result<()> { - // It's safe to unwrap here, it's checked outside. - // Move out metadata to avoid cloning it. - - let (_, (manifest_version, metadata)) = metadata.take().unwrap(); - let region_metadata: RegionMetadataRef = - Arc::new(metadata.try_into().context(error::InvalidRawRegionSnafu { - region: &writer_ctx.shared.name, - })?); - let new_mutable = self - .memtable_builder - .build(region_metadata.schema().clone()); - version_control.freeze_mutable_and_apply_metadata( - region_metadata, - manifest_version, - new_mutable, - ); - logging::debug!( - "Applied metadata to region: {} when replaying WAL: sequence={} manifest={} ", - writer_ctx.shared.name, - sequence, - manifest_version - ); - - Ok(()) - } - - /// Preprocess before write. - /// - /// Creates needed mutable memtables, ensures there is enough capacity in memtable and trigger - /// flush if necessary. Returns time ranges of the input write batch. - async fn preprocess_write( - &mut self, - writer_ctx: &WriterContext<'_, S>, - ) -> Result<()> { - let _timer = PREPROCESS_ELAPSED.start_timer(); - - let version_control = writer_ctx.version_control(); - // Check whether memtable is full or flush should be triggered. We need to do this first since - // switching memtables will clear all mutable memtables. - if let Some(flush_type) = self.should_flush( - writer_ctx.shared, - version_control, - writer_ctx.flush_strategy, - ) { - // Trigger flush according to the flush type. - match flush_type { - FlushType::Region => { - // Trigger flush for current region. - self.trigger_flush(writer_ctx, FlushReason::MemtableFull) - .await?; - } - FlushType::Engine => { - // Trigger engine level flush. This wakeup the flush handler - // to pick region to flush. - writer_ctx.flush_scheduler.schedule_engine_flush()?; - } - } - } - - Ok(()) - } - - /// Create a new mutable memtable. - fn alloc_memtable(&self, version_control: &VersionControlRef) -> MemtableRef { - let memtable_schema = version_control.current().schema().clone(); - self.memtable_builder.build(memtable_schema) - } - - fn should_flush( - &self, - shared: &SharedDataRef, - version_control: &VersionControlRef, - flush_strategy: &FlushStrategyRef, - ) -> Option { - let current = version_control.current(); - let memtables = current.memtables(); - let status = RegionStatus { - region_id: shared.id(), - bytes_mutable: memtables.mutable_bytes_allocated(), - write_buffer_size: self.write_buffer_size, - }; - flush_strategy.should_flush(status) - } - - async fn trigger_flush( - &mut self, - ctx: &WriterContext<'_, S>, - reason: FlushReason, - ) -> Result<()> { - let version_control = &ctx.shared.version_control; - let new_mutable = self.alloc_memtable(version_control); - // Freeze all mutable memtables so we can flush them later. - version_control.freeze_mutable(new_mutable); - - FLUSH_REQUESTS_TOTAL - .with_label_values(&[reason.as_str()]) - .inc(); - - if let Some(flush_handle) = self.flush_handle.take() { - // Previous flush job is incomplete, wait util it is finished. - // However the last flush job may fail, in which case, we just return error - // and abort current write request. The flush handle is left empty, so the next - // time we still have chance to trigger a new flush. - // TODO(yingwen): We should release the write lock during waiting flush done, which - // needs something like async condvar. - flush_handle.wait().await.map_err(|e| { - logging::error!(e; "Previous flush job failed, region: {}", ctx.shared.name); - e - })?; - } - - let current_version = version_control.current(); - let (max_memtable_id, mem_to_flush) = current_version.memtables().memtables_to_flush(); - - if max_memtable_id.is_none() { - // We still update the flush time to avoid the picker picks this region again. - ctx.shared.update_flush_millis(); - - logging::info!("No memtables to flush in region: {}", ctx.shared.name); - return Ok(()); - } - - let flush_req = FlushRegionRequest { - max_memtable_id: max_memtable_id.unwrap(), - memtables: mem_to_flush, - // In write thread, safe to use current committed sequence. - flush_sequence: version_control.committed_sequence(), - shared: ctx.shared.clone(), - sst_layer: ctx.sst_layer.clone(), - writer: ctx.writer.clone(), - wal: ctx.wal.clone(), - manifest: ctx.manifest.clone(), - engine_config: self.engine_config.clone(), - ttl: self.ttl, - compaction_time_window: current_version.ssts().compaction_time_window(), - compaction_picker: ctx.compaction_picker.clone(), - }; - - let flush_handle = ctx - .flush_scheduler - .schedule_region_flush(flush_req) - .map_err(|e| { - logging::error!(e; "Failed to schedule flush request"); - e - })?; - self.flush_handle = Some(flush_handle); - - Ok(()) - } - - async fn manual_compact( - &mut self, - request: WriterCompactRequest, - compaction_picker: CompactionPickerRef, - compaction_scheduler: CompactionSchedulerRef, - sst_write_buffer_size: ReadableSize, - ) -> Result<()> { - let region_id = request.shared_data.id(); - let compaction_time_window = request - .shared_data - .version_control - .current() - .ssts() - .compaction_time_window(); - let mut compaction_request = CompactionRequestImpl { - region_id, - sst_layer: request.sst_layer, - writer: request.region_writer, - shared: request.shared_data.clone(), - manifest: request.manifest, - wal: request.wal, - ttl: self.ttl, - compaction_time_window, - sender: None, - picker: compaction_picker, - sst_write_buffer_size, - // manual compaction does not reschedule itself. - reschedule_on_finish: false, - }; - - let compaction_scheduler = compaction_scheduler.clone(); - logging::info!( - "Manual compact, region_id: {}, compact_ctx: {:?}", - region_id, - request.compact_ctx - ); - - if request.compact_ctx.wait { - let (sender, receiver) = oneshot::channel(); - compaction_request.sender = Some(sender); - - if schedule_compaction( - request.shared_data, - compaction_scheduler, - compaction_request, - ) { - receiver - .await - .context(error::CompactTaskCancelSnafu { region_id })??; - } - } else { - let _ = schedule_compaction( - request.shared_data, - compaction_scheduler, - compaction_request, - ); - } - - Ok(()) - } - - async fn manual_flush( - &mut self, - writer_ctx: WriterContext<'_, S>, - reason: FlushReason, - ) -> Result<()> { - self.trigger_flush(&writer_ctx, reason).await?; - Ok(()) - } - - #[inline] - fn is_closed(&self) -> bool { - self.closed - } - - #[inline] - fn mark_closed(&mut self) { - self.closed = true; - } -} - -/// Schedule compaction task, returns whether the task is scheduled. -pub(crate) fn schedule_compaction( - shared_data: SharedDataRef, - compaction_scheduler: CompactionSchedulerRef, - compaction_request: CompactionRequestImpl, -) -> bool { - let region_id = shared_data.id(); - - match compaction_scheduler.schedule(compaction_request) { - Ok(scheduled) => { - logging::info!( - "Schedule region {} compaction request result: {}", - region_id, - scheduled - ); - - scheduled - } - Err(e) => { - logging::error!(e;"Failed to schedule region compaction request {}", region_id); - - false - } - } -} diff --git a/src/storage/src/scheduler.rs b/src/storage/src/scheduler.rs deleted file mode 100644 index 0fdbce6aa089..000000000000 --- a/src/storage/src/scheduler.rs +++ /dev/null @@ -1,652 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::fmt::{Debug, Formatter}; -use std::hash::Hash; -use std::sync::atomic::{AtomicU8, Ordering}; -use std::sync::{Arc, Mutex, RwLock}; - -use async_trait::async_trait; -use common_telemetry::{debug, error, info}; -use snafu::{ensure, ResultExt}; -use tokio::sync::Notify; -use tokio::task::JoinHandle; -use tokio_util::sync::CancellationToken; - -use crate::error::{IllegalSchedulerStateSnafu, Result, StopSchedulerSnafu}; -use crate::scheduler::dedup_deque::DedupDeque; -use crate::scheduler::rate_limit::{ - BoxedRateLimitToken, CascadeRateLimiter, MaxInflightTaskLimiter, RateLimiter, -}; - -pub mod dedup_deque; -pub mod rate_limit; - -/// Request that can be scheduled. -/// It must contain a key for deduplication. -pub trait Request: Send + Sync + 'static { - /// Type of request key. - type Key: Eq + Hash + Clone + Debug + Send + Sync; - - /// Returns the request key. - fn key(&self) -> Self::Key; - - /// Notify the request result. - fn complete(self, result: Result<()>); -} - -#[async_trait::async_trait] -pub trait Handler { - type Request; - - async fn handle_request( - &self, - req: Self::Request, - token: BoxedRateLimitToken, - finish_notifier: Arc, - ) -> Result<()>; -} - -/// [Scheduler] defines a set of API to schedule requests. -#[async_trait] -pub trait Scheduler: Debug { - type Request; - - /// Schedules a request. - /// Returns true if request is scheduled. Returns false if task queue already - /// contains the request with same key. - fn schedule(&self, request: Self::Request) -> Result; - - /// Stops scheduler. If `await_termination` is set to true, the scheduler will - /// wait until all queued requests are processed. - async fn stop(&self, await_termination: bool) -> Result<()>; -} - -/// Scheduler config. -#[derive(Debug)] -pub struct SchedulerConfig { - pub max_inflight_tasks: usize, -} - -impl Default for SchedulerConfig { - fn default() -> Self { - Self { - max_inflight_tasks: 4, - } - } -} - -const STATE_RUNNING: u8 = 0; -const STATE_STOP: u8 = 1; -const STATE_AWAIT_TERMINATION: u8 = 2; - -/// Request scheduler based on local state. -pub struct LocalScheduler { - /// Request FIFO with key deduplication. - request_queue: Arc>>, - /// Token used to halt the scheduler. - cancel_token: CancellationToken, - /// Tasks use a cooperative manner to notify scheduler that another request can be scheduled. - task_notifier: Arc, - /// Join handle of spawned request handling loop. - join_handle: Mutex>>, - /// State of scheduler. - state: Arc, -} - -impl Debug for LocalScheduler -where - R: Request + Send + Sync, -{ - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("LocalScheduler") - .field("state", &self.state) - .finish() - } -} - -impl Drop for LocalScheduler -where - R: Request, -{ - fn drop(&mut self) { - self.state.store(STATE_STOP, Ordering::Relaxed); - - self.cancel_token.cancel(); - - // Clear all requests - self.request_queue.write().unwrap().clear(); - } -} - -#[async_trait] -impl Scheduler for LocalScheduler -where - R: Request + Send, -{ - type Request = R; - - fn schedule(&self, request: Self::Request) -> Result { - ensure!(self.running(), IllegalSchedulerStateSnafu); - debug!( - "Schedule request: {:?}, queue size: {}", - request.key(), - self.remaining_requests() - ); - let mut queue = self.request_queue.write().unwrap(); - let res = queue.push_back(request.key(), request); - self.task_notifier.notify_one(); - Ok(res) - } - - async fn stop(&self, await_termination: bool) -> Result<()> { - let state = if await_termination { - STATE_AWAIT_TERMINATION - } else { - STATE_STOP - }; - self.state.store(state, Ordering::Relaxed); - - self.cancel_token.cancel(); - let handle = { self.join_handle.lock().unwrap().take() }; - if let Some(handle) = handle { - handle.await.context(StopSchedulerSnafu)?; - } - Ok(()) - } -} - -impl LocalScheduler -where - R: Request, -{ - /// Creates a new scheduler instance with given config and request handler. - pub fn new(config: SchedulerConfig, handler: H) -> Self - where - H: Handler + Send + Sync + 'static, - { - let request_queue = Arc::new(RwLock::new(DedupDeque::default())); - let cancel_token = CancellationToken::new(); - let task_notifier = Arc::new(Notify::new()); - let state = Arc::new(AtomicU8::new(STATE_RUNNING)); - let handle_loop = HandlerLoop { - task_notifier: task_notifier.clone(), - req_queue: request_queue.clone(), - cancel_token: cancel_token.child_token(), - limiter: Arc::new(CascadeRateLimiter::new(vec![Box::new( - MaxInflightTaskLimiter::new(config.max_inflight_tasks), - )])), - request_handler: handler, - state: state.clone(), - }; - let join_handle = common_runtime::spawn_bg(async move { - debug!("Task handler loop spawned"); - handle_loop.run().await; - }); - Self { - join_handle: Mutex::new(Some(join_handle)), - request_queue, - cancel_token, - task_notifier, - state, - } - } - - /// Returns remaining requests number. - #[inline] - fn remaining_requests(&self) -> usize { - self.request_queue.read().unwrap().len() - } - - #[inline] - fn running(&self) -> bool { - self.state.load(Ordering::Relaxed) == STATE_RUNNING - } -} - -pub struct HandlerLoop { - pub req_queue: Arc>>, - pub cancel_token: CancellationToken, - pub task_notifier: Arc, - pub request_handler: H, - pub limiter: Arc>, - pub state: Arc, -} - -impl HandlerLoop -where - R: Request, - H: Handler, -{ - /// Runs scheduled requests dispatch loop. - pub async fn run(&self) { - let limiter = self.limiter.clone(); - while self.running() { - tokio::select! { - _ = self.task_notifier.notified() => { - debug!("Notified, queue size: {:?}",self.req_queue.read().unwrap().len()); - self.poll_and_execute(&limiter).await; - } - _ = self.cancel_token.cancelled() => { - info!("Task scheduler cancelled."); - break; - } - } - } - // For correctness, we need to poll requests from fifo again. - if self.state.load(Ordering::Relaxed) == STATE_AWAIT_TERMINATION { - info!("Waiting for all pending tasks to finish."); - self.poll_and_execute(&limiter).await; - self.state.store(STATE_STOP, Ordering::Relaxed); - } - info!("Task scheduler stopped"); - } - - /// Polls and executes requests as many as possible until rate limited. - async fn poll_and_execute(&self, limiter: &Arc>) { - while let Some((task_key, req)) = self.poll_task().await { - if let Ok(token) = limiter.acquire_token(&req) { - debug!("Executing request: {:?}", task_key); - if let Err(e) = self - .handle_request(req, token, self.task_notifier.clone()) - .await - { - error!(e; "Failed to submit request: {:?}", task_key); - } else { - info!("Submitted task: {:?}", task_key); - } - } else { - // rate limited, put back to req queue to wait for next schedule - debug!( - "Put back request {:?}, queue size: {}", - task_key, - self.req_queue.read().unwrap().len() - ); - self.put_back_req(task_key, req).await; - break; - } - } - } - - #[inline] - async fn poll_task(&self) -> Option<(R::Key, R)> { - let mut queue = self.req_queue.write().unwrap(); - queue.pop_front() - } - - /// Puts request back to the front of request queue. - #[inline] - async fn put_back_req(&self, key: R::Key, req: R) { - let mut queue = self.req_queue.write().unwrap(); - let _ = queue.push_front(key, req); - } - - // Handles request, submit task to bg runtime. - async fn handle_request( - &self, - req: R, - token: BoxedRateLimitToken, - finish_notifier: Arc, - ) -> Result<()> { - self.request_handler - .handle_request(req, token, finish_notifier) - .await - } - - #[inline] - fn running(&self) -> bool { - self.state.load(Ordering::Relaxed) == STATE_RUNNING - } -} - -#[cfg(test)] -mod tests { - use std::sync::atomic::{AtomicBool, AtomicI32}; - use std::time::Duration; - - use futures_util::future::BoxFuture; - use store_api::storage::RegionId; - - use super::*; - use crate::scheduler::dedup_deque::DedupDeque; - use crate::scheduler::rate_limit::{ - BoxedRateLimitToken, CascadeRateLimiter, MaxInflightTaskLimiter, - }; - use crate::scheduler::{HandlerLoop, LocalScheduler, Scheduler, SchedulerConfig}; - - struct CountdownLatch { - counter: std::sync::Mutex, - notify: Notify, - } - - impl CountdownLatch { - fn new(size: usize) -> Self { - Self { - counter: std::sync::Mutex::new(size), - notify: Notify::new(), - } - } - - fn countdown(&self) { - let mut counter = self.counter.lock().unwrap(); - if *counter >= 1 { - *counter -= 1; - if *counter == 0 { - self.notify.notify_one(); - } - } - } - - /// Users should only call this once. - async fn wait(&self) { - self.notify.notified().await - } - } - - #[tokio::test] - async fn test_schedule_handler() { - common_telemetry::init_default_ut_logging(); - let queue = Arc::new(std::sync::RwLock::new(DedupDeque::default())); - let latch = Arc::new(CountdownLatch::new(2)); - let latch_cloned = latch.clone(); - let handler = Arc::new(HandlerLoop { - req_queue: queue.clone(), - cancel_token: Default::default(), - task_notifier: Arc::new(Default::default()), - request_handler: MockHandler { - cb: move || { - latch_cloned.countdown(); - }, - }, - limiter: Arc::new(CascadeRateLimiter::new(vec![Box::new( - MaxInflightTaskLimiter::new(3), - )])), - state: Arc::new(AtomicU8::default()), - }); - - let handler_cloned = handler.clone(); - let _handle = common_runtime::spawn_bg(async move { handler_cloned.run().await }); - - let _ = queue - .write() - .unwrap() - .push_back(1.into(), MockRequest::default()); - handler.task_notifier.notify_one(); - let _ = queue - .write() - .unwrap() - .push_back(2.into(), MockRequest::default()); - handler.task_notifier.notify_one(); - - tokio::time::timeout(Duration::from_secs(1), latch.wait()) - .await - .unwrap(); - } - - #[derive(Default, Debug)] - struct MockRequest { - region_id: RegionId, - } - - struct MockHandler { - cb: F, - } - - #[async_trait::async_trait] - impl Handler for MockHandler - where - F: Fn() + Send + Sync, - { - type Request = MockRequest; - - async fn handle_request( - &self, - _req: Self::Request, - token: BoxedRateLimitToken, - finish_notifier: Arc, - ) -> Result<()> { - (self.cb)(); - token.try_release(); - finish_notifier.notify_one(); - Ok(()) - } - } - - impl Request for MockRequest { - type Key = RegionId; - - fn key(&self) -> Self::Key { - self.region_id - } - - fn complete(self, _result: Result<()>) {} - } - - #[tokio::test] - async fn test_scheduler() { - let latch = Arc::new(CountdownLatch::new(2)); - let latch_cloned = latch.clone(); - - let handler = MockHandler { - cb: move || { - latch_cloned.countdown(); - }, - }; - let scheduler: LocalScheduler = LocalScheduler::new( - SchedulerConfig { - max_inflight_tasks: 3, - }, - handler, - ); - - let _ = scheduler - .schedule(MockRequest { - region_id: 1.into(), - }) - .unwrap(); - let _ = scheduler - .schedule(MockRequest { - region_id: 2.into(), - }) - .unwrap(); - - tokio::time::timeout(Duration::from_secs(1), latch.wait()) - .await - .unwrap(); - } - - #[tokio::test] - async fn test_scheduler_many() { - common_telemetry::init_default_ut_logging(); - let task_size = 100; - - let latch = Arc::new(CountdownLatch::new(task_size)); - let latch_clone = latch.clone(); - - let handler = MockHandler { - cb: move || { - latch_clone.countdown(); - }, - }; - - let config = SchedulerConfig { - max_inflight_tasks: 3, - }; - let scheduler = LocalScheduler::new(config, handler); - - for i in 0..task_size { - assert!(scheduler - .schedule(MockRequest { - region_id: RegionId::from(i as u64), - }) - .is_ok()); - } - - tokio::time::timeout(Duration::from_secs(3), latch.wait()) - .await - .unwrap(); - } - - #[tokio::test] - async fn test_scheduler_interval() { - common_telemetry::init_default_ut_logging(); - let task_size = 100; - let latch = Arc::new(CountdownLatch::new(task_size)); - let latch_clone = latch.clone(); - - let handler = MockHandler { - cb: move || { - latch_clone.countdown(); - }, - }; - - let config = SchedulerConfig { - max_inflight_tasks: 3, - }; - let scheduler = LocalScheduler::new(config, handler); - - for i in 0..task_size / 2 { - assert!(scheduler - .schedule(MockRequest { - region_id: RegionId::from(i as u64), - }) - .is_ok()); - } - - tokio::time::sleep(Duration::from_millis(100)).await; - for i in task_size / 2..task_size { - assert!(scheduler - .schedule(MockRequest { - region_id: RegionId::from(i as u64), - }) - .is_ok()); - } - - tokio::time::timeout(Duration::from_secs(6), latch.wait()) - .await - .unwrap(); - } - - struct MockAsyncHandler { - cb: F, - } - - #[async_trait::async_trait] - impl Handler for MockAsyncHandler - where - F: Fn() -> BoxFuture<'static, ()> + Send + Sync, - { - type Request = MockRequest; - - async fn handle_request( - &self, - _req: Self::Request, - token: BoxedRateLimitToken, - finish_notifier: Arc, - ) -> Result<()> { - let fut = (self.cb)(); - fut.await; - token.try_release(); - finish_notifier.notify_one(); - Ok(()) - } - } - - #[tokio::test] - async fn test_schedule_duplicate_tasks() { - common_telemetry::init_default_ut_logging(); - let (tx, rx) = tokio::sync::watch::channel(false); - let handler = MockAsyncHandler { - cb: move || { - let mut rx = rx.clone(); - Box::pin(async move { - // Block the handler so it can't handle more requests. - loop { - rx.changed().await.unwrap(); - if *rx.borrow() { - break; - } - } - }) as _ // Casts the Pin> to Pin> - }, - }; - let config = SchedulerConfig { - max_inflight_tasks: 30, - }; - let scheduler = LocalScheduler::new(config, handler); - - let mut scheduled_task = 0; - for _ in 0..10 { - if scheduler - .schedule(MockRequest { - region_id: 1.into(), - }) - .unwrap() - { - scheduled_task += 1; - } - } - tx.send(true).unwrap(); - scheduler.stop(true).await.unwrap(); - debug!("Schedule tasks: {}", scheduled_task); - assert!(scheduled_task < 10); - } - - #[tokio::test] - async fn test_await_termination() { - common_telemetry::init_default_ut_logging(); - - let finished = Arc::new(AtomicI32::new(0)); - let finished_clone = finished.clone(); - let handler = MockHandler { - cb: move || { - let _ = finished_clone.fetch_add(1, Ordering::Relaxed); - }, - }; - - let config = SchedulerConfig { - max_inflight_tasks: 3, - }; - let scheduler = Arc::new(LocalScheduler::new(config, handler)); - let scheduler_cloned = scheduler.clone(); - let task_scheduled = Arc::new(AtomicI32::new(0)); - let task_scheduled_cloned = task_scheduled.clone(); - - let scheduling = Arc::new(AtomicBool::new(true)); - let scheduling_clone = scheduling.clone(); - let handle = common_runtime::spawn_write(async move { - for i in 0..10000 { - if let Ok(res) = scheduler_cloned.schedule(MockRequest { - region_id: RegionId::from(i as u64), - }) { - if res { - let _ = task_scheduled_cloned.fetch_add(1, Ordering::Relaxed); - } - } - - if !scheduling_clone.load(Ordering::Relaxed) { - break; - } - } - }); - - scheduler.stop(true).await.unwrap(); - scheduling.store(false, Ordering::Relaxed); - - let finished = finished.load(Ordering::Relaxed); - handle.await.unwrap(); - - assert_eq!(finished, task_scheduled.load(Ordering::Relaxed)); - } -} diff --git a/src/storage/src/scheduler/dedup_deque.rs b/src/storage/src/scheduler/dedup_deque.rs deleted file mode 100644 index edb2184d8e2b..000000000000 --- a/src/storage/src/scheduler/dedup_deque.rs +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::hash_map::Entry; -use std::collections::{HashMap, VecDeque}; -use std::fmt::{Debug, Formatter}; -use std::hash::Hash; - -/// Deque with key deduplication. -pub struct DedupDeque { - deque: VecDeque, - existing: HashMap, -} - -impl Default for DedupDeque { - fn default() -> Self { - Self { - deque: VecDeque::new(), - existing: HashMap::new(), - } - } -} - -impl DedupDeque { - /// Pushes a key value to the back of deque. - /// Returns true if the deque does not already contain value with the same key, otherwise - /// returns false. - pub fn push_back(&mut self, key: K, value: V) -> bool { - debug_assert_eq!(self.deque.len(), self.existing.len()); - if let Entry::Vacant(entry) = self.existing.entry(key.clone()) { - let _ = entry.insert(value); - self.deque.push_back(key); - return true; - } - false - } - - /// Pushes a key value to the front of deque. - /// Returns true if the deque does not already contain value with the same key, otherwise - /// returns false. - pub fn push_front(&mut self, key: K, value: V) -> bool { - if let Entry::Vacant(entry) = self.existing.entry(key.clone()) { - let _ = entry.insert(value); - self.deque.push_front(key); - return true; - } - false - } - - /// Pops a pair from the back of deque. Returns [None] if the deque is empty. - pub fn pop_front(&mut self) -> Option<(K, V)> { - debug_assert_eq!(self.deque.len(), self.existing.len()); - let key = self.deque.pop_front()?; - let value = self.existing.remove(&key)?; - Some((key, value)) - } - - #[inline] - pub fn len(&self) -> usize { - debug_assert_eq!(self.deque.len(), self.existing.len()); - self.deque.len() - } - - #[inline] - pub fn is_empty(&self) -> bool { - self.deque.is_empty() - } - - #[inline] - pub fn clear(&mut self) { - self.deque.clear(); - self.existing.clear(); - } -} - -impl Debug for DedupDeque -where - K: Debug, - V: Debug, -{ - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - f.debug_struct("DedupDeque") - .field("deque", &self.deque) - .field("existing", &self.existing) - .finish() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_dedup_deque() { - let mut deque = DedupDeque::default(); - assert!(deque.push_back(1, "hello".to_string())); - assert_eq!(1, deque.len()); - assert!(deque.push_back(2, "world".to_string())); - assert_eq!(2, deque.len()); - assert_eq!((1, "hello".to_string()), deque.pop_front().unwrap()); - assert_eq!(1, deque.len()); - assert_eq!((2, "world".to_string()), deque.pop_front().unwrap()); - assert_eq!(0, deque.len()); - - // insert duplicated item - assert!(deque.push_back(1, "hello".to_string())); - assert!(!deque.push_back(1, "world".to_string())); - assert_eq!((1, "hello".to_string()), deque.pop_front().unwrap()); - - deque.clear(); - assert!(deque.is_empty()); - } -} diff --git a/src/storage/src/scheduler/rate_limit.rs b/src/storage/src/scheduler/rate_limit.rs deleted file mode 100644 index 63776b43f7a0..000000000000 --- a/src/storage/src/scheduler/rate_limit.rs +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::marker::PhantomData; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; -use std::sync::Arc; - -use crate::error::{RateLimitedSnafu, Result}; - -pub trait RateLimitToken { - /// Releases the token. - /// ### Note - /// Implementation should guarantee the idempotency. - fn try_release(&self); -} - -pub type BoxedRateLimitToken = Box; - -impl RateLimitToken for Box { - fn try_release(&self) { - (**self).try_release() - } -} - -/// Rate limiter -pub trait RateLimiter { - type Request; - - /// Acquires a token from rate limiter. Returns `Err` on failure. - fn acquire_token(&self, req: &Self::Request) -> Result; -} - -pub type BoxedRateLimiter = Box + Send + Sync>; - -/// Limits max inflight tasks number. -pub struct MaxInflightTaskLimiter { - max_inflight_tasks: usize, - inflight_tasks: Arc, - _phantom_data: PhantomData, -} - -impl MaxInflightTaskLimiter { - pub fn new(max_inflight_tasks: usize) -> Self { - Self { - max_inflight_tasks, - inflight_tasks: Arc::new(AtomicUsize::new(0)), - _phantom_data: Default::default(), - } - } -} - -impl RateLimiter for MaxInflightTaskLimiter { - type Request = R; - - fn acquire_token(&self, _: &Self::Request) -> Result { - if self.inflight_tasks.fetch_add(1, Ordering::Relaxed) >= self.max_inflight_tasks { - let _ = self.inflight_tasks.fetch_sub(1, Ordering::Relaxed); - return RateLimitedSnafu { - msg: format!( - "Max inflight task num exceeds, current: {}, max: {}", - self.inflight_tasks.load(Ordering::Relaxed), - self.max_inflight_tasks - ), - } - .fail(); - } - - Ok(Box::new(MaxInflightLimiterToken::new( - self.inflight_tasks.clone(), - ))) - } -} - -pub struct MaxInflightLimiterToken { - counter: Arc, - released: AtomicBool, -} - -impl MaxInflightLimiterToken { - pub fn new(counter: Arc) -> Self { - Self { - counter, - released: AtomicBool::new(false), - } - } -} - -impl RateLimitToken for MaxInflightLimiterToken { - fn try_release(&self) { - if self - .released - .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed) - .is_ok() - { - let _ = self.counter.fetch_sub(1, Ordering::Relaxed); - } - } -} - -/// A composite rate limiter that allows token acquisition only when all internal limiters allow. -pub struct CascadeRateLimiter { - limits: Vec>, -} - -impl CascadeRateLimiter { - pub fn new(limits: Vec>) -> Self { - Self { limits } - } -} - -impl RateLimiter for CascadeRateLimiter { - type Request = T; - - fn acquire_token(&self, req: &Self::Request) -> Result { - let mut res = vec![]; - for limit in &self.limits { - match limit.acquire_token(req) { - Ok(token) => { - res.push(token); - } - Err(e) => { - res.iter().for_each(RateLimitToken::try_release); - return Err(e); - } - } - } - Ok(Box::new(CompositeToken { tokens: res })) - } -} - -/// Composite token that releases all acquired token when released. -pub struct CompositeToken { - tokens: Vec, -} - -impl RateLimitToken for CompositeToken { - fn try_release(&self) { - for token in &self.tokens { - token.try_release(); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_max_inflight_limiter() { - let limiter = MaxInflightTaskLimiter::new(3); - let t1 = limiter.acquire_token(&1).unwrap(); - assert_eq!(1, limiter.inflight_tasks.load(Ordering::Relaxed)); - let _t2 = limiter.acquire_token(&1).unwrap(); - assert_eq!(2, limiter.inflight_tasks.load(Ordering::Relaxed)); - let _t3 = limiter.acquire_token(&1).unwrap(); - assert_eq!(3, limiter.inflight_tasks.load(Ordering::Relaxed)); - assert!(limiter.acquire_token(&1).is_err()); - t1.try_release(); - assert_eq!(2, limiter.inflight_tasks.load(Ordering::Relaxed)); - let _t4 = limiter.acquire_token(&1).unwrap(); - } - - #[test] - fn test_cascade_limiter() { - let limiter: CascadeRateLimiter = - CascadeRateLimiter::new(vec![Box::new(MaxInflightTaskLimiter::new(3))]); - let t1 = limiter.acquire_token(&1).unwrap(); - let _t2 = limiter.acquire_token(&1).unwrap(); - let _t3 = limiter.acquire_token(&1).unwrap(); - assert!(limiter.acquire_token(&1).is_err()); - t1.try_release(); - let _t4 = limiter.acquire_token(&1).unwrap(); - } -} diff --git a/src/storage/src/schema.rs b/src/storage/src/schema.rs deleted file mode 100644 index a4be72e5b4ef..000000000000 --- a/src/storage/src/schema.rs +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub mod compat; -mod projected; -mod region; -mod store; - -pub use crate::schema::projected::{ProjectedSchema, ProjectedSchemaRef}; -pub use crate::schema::region::{RegionSchema, RegionSchemaRef}; -pub use crate::schema::store::{StoreSchema, StoreSchemaRef}; - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use datatypes::vectors::{ - Int64Vector, TimestampMillisecondVector, UInt64Vector, UInt8Vector, VectorRef, - }; - - use crate::read::Batch; - - pub const REGION_NAME: &str = "test"; - - pub(crate) fn new_batch() -> Batch { - new_batch_with_num_values(1) - } - - pub(crate) fn new_batch_with_num_values(num_field_columns: usize) -> Batch { - let k0 = Int64Vector::from_slice([1, 2, 3]); - let timestamp = TimestampMillisecondVector::from_vec(vec![4, 5, 6]); - - let mut columns: Vec = vec![Arc::new(k0), Arc::new(timestamp)]; - - for i in 0..num_field_columns { - let vi = Int64Vector::from_slice([i as i64, i as i64, i as i64]); - columns.push(Arc::new(vi)); - } - - let sequences = UInt64Vector::from_slice([100, 100, 100]); - let op_types = UInt8Vector::from_slice([0, 0, 0]); - - columns.push(Arc::new(sequences)); - columns.push(Arc::new(op_types)); - - Batch::new(columns) - } -} diff --git a/src/storage/src/schema/compat.rs b/src/storage/src/schema/compat.rs deleted file mode 100644 index 2deefaddb27d..000000000000 --- a/src/storage/src/schema/compat.rs +++ /dev/null @@ -1,611 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Utilities for resolving schema compatibility problems. - -use datatypes::arrow::record_batch::RecordBatch; -use datatypes::schema::SchemaRef; -use datatypes::vectors::{Helper, VectorRef}; -use snafu::{ensure, OptionExt, ResultExt}; - -use crate::error::{self, Result}; -use crate::metadata::ColumnMetadata; -use crate::read::Batch; -use crate::schema::{ProjectedSchemaRef, StoreSchemaRef}; - -/// Make schema compatible to write to target with another schema. -pub trait CompatWrite { - /// Makes the schema of `self` compatible with `dest_schema`. - /// - /// For column in `dest_schema` but not in `self`, this method would insert a - /// vector with default value. - /// - /// If there are columns not in `dest_schema`, an error would be returned. - fn compat_write(&mut self, dest_schema: &SchemaRef) -> Result<()>; -} - -/// Checks whether column with `source_column` could be read as a column with `dest_column`. -/// -/// Returns -/// - `Ok(true)` if `source_column` is compatible to read using `dest_column` as schema. -/// - `Ok(false)` if they are considered different columns. -/// - `Err` if there is incompatible issue that could not be resolved. -fn is_source_column_compatible( - source_column: &ColumnMetadata, - dest_column: &ColumnMetadata, -) -> Result { - ensure!( - source_column.name() == dest_column.name(), - error::CompatReadSnafu { - reason: format!( - "try to use column in {} for column {}", - source_column.name(), - dest_column.name() - ), - } - ); - - if source_column.id() != dest_column.id() { - return Ok(false); - } - - ensure!( - source_column.desc.data_type == dest_column.desc.data_type, - error::CompatReadSnafu { - reason: format!( - "could not read column {} from {:?} type as {:?} type", - dest_column.name(), - source_column.desc.data_type, - dest_column.desc.data_type - ), - } - ); - - ensure!( - dest_column.desc.is_nullable() || !source_column.desc.is_nullable(), - error::CompatReadSnafu { - reason: format!( - "unable to read nullable data for non null column {}", - dest_column.name() - ), - } - ); - - Ok(true) -} - -/// Adapter to help reading data with source schema as data with dest schema. -#[derive(Debug)] -pub struct ReadAdapter { - /// Schema of data source. - source_schema: StoreSchemaRef, - /// Schema user expects to read. - dest_schema: ProjectedSchemaRef, - /// For each column in dest schema, stores the index in read result for - /// this column, or None if the column is not in result. - /// - /// This vec would be left empty if `source_version == dest_version`. - indices_in_result: Vec>, - /// For each column in source schema, stores whether we need to read that column. All - /// columns are needed by default. - is_source_needed: Vec, -} - -impl ReadAdapter { - /// Creates a new [ReadAdapter] that could convert data with `source_schema` into data - /// with `dest_schema`. - pub fn new( - source_schema: StoreSchemaRef, - dest_schema: ProjectedSchemaRef, - ) -> Result { - if source_schema.version() == dest_schema.schema_to_read().version() { - ReadAdapter::from_same_version(source_schema, dest_schema) - } else { - ReadAdapter::from_different_version(source_schema, dest_schema) - } - } - - fn from_same_version( - source_schema: StoreSchemaRef, - dest_schema: ProjectedSchemaRef, - ) -> Result { - let mut is_source_needed = vec![true; source_schema.num_columns()]; - if source_schema.num_columns() != dest_schema.schema_to_read().num_columns() { - // `dest_schema` might be projected, so we need to find out value columns that not be read - // by the `dest_schema`. - - for (offset, field_column) in source_schema.field_columns().iter().enumerate() { - // Iterate value columns in source and mark those not in destination as unneeded. - if !dest_schema.is_needed(field_column.id()) { - is_source_needed[source_schema.field_column_index_by_offset(offset)] = false; - } - } - } - - Ok(ReadAdapter { - source_schema, - dest_schema, - indices_in_result: Vec::new(), - is_source_needed, - }) - } - - fn from_different_version( - source_schema: StoreSchemaRef, - dest_schema: ProjectedSchemaRef, - ) -> Result { - let schema_to_read = dest_schema.schema_to_read(); - let mut indices_in_result = vec![None; schema_to_read.num_columns()]; - let mut is_source_needed = vec![true; source_schema.num_columns()]; - // Number of columns in result from source data. - let mut num_columns_in_result = 0; - - for (idx, source_column) in source_schema.columns().iter().enumerate() { - // For each column in source schema, check whether we need to read it. - if let Some(dest_idx) = schema_to_read - .schema() - .column_index_by_name(source_column.name()) - { - let dest_column = &schema_to_read.columns()[dest_idx]; - // Check whether we could read this column. - if is_source_column_compatible(source_column, dest_column)? { - // Mark that this column could be read from source data, since some - // columns in source schema would be skipped, we should not use - // the source column's index directly. - indices_in_result[dest_idx] = Some(num_columns_in_result); - num_columns_in_result += 1; - } else { - // This column is not the same column in dest schema, should be fill by default value - // instead of reading from source data. - is_source_needed[idx] = false; - } - } else { - // The column is not in `dest_schema`, we don't need to read it. - is_source_needed[idx] = false; - } - } - - Ok(ReadAdapter { - source_schema, - dest_schema, - indices_in_result, - is_source_needed, - }) - } - - /// Returns a bool slice to denote which key column in source is needed. - #[inline] - pub fn source_key_needed(&self) -> &[bool] { - &self.is_source_needed[..self.source_schema.row_key_end()] - } - - /// Returns a bool slice to denote which value column in source is needed. - #[inline] - pub fn source_value_needed(&self) -> &[bool] { - &self.is_source_needed - [self.source_schema.row_key_end()..self.source_schema.user_column_end()] - } - - /// Construct a new [Batch] from row key, value, sequence and op_type. - /// - /// # Panics - /// Panics if input `VectorRef` is empty. - pub fn batch_from_parts( - &self, - row_key_columns: Vec, - mut field_columns: Vec, - sequences: VectorRef, - op_types: VectorRef, - ) -> Result { - // Each vector should has same length, so here we just use the length of `sequence`. - let num_rows = sequences.len(); - - let mut source = row_key_columns; - // Reserve space for value, sequence and op_type - source.reserve(field_columns.len() + 2); - source.append(&mut field_columns); - // Internal columns are push in sequence, op_type order. - source.push(sequences); - source.push(op_types); - - if !self.need_compat() { - return Ok(Batch::new(source)); - } - - self.source_columns_to_batch(source, num_rows) - } - - /// Returns list of fields indices need to read from the parquet file. - pub fn fields_to_read(&self) -> Vec { - self.is_source_needed - .iter() - .enumerate() - .filter_map(|(idx, needed)| if *needed { Some(idx) } else { None }) - .collect::>() - } - - /// Convert [RecordBatch] read from the parquet file into [Batch]. - /// - /// The [RecordBatch] should have the same schema as [`ReadAdapter::fields_to_read()`]. - pub fn arrow_record_batch_to_batch(&self, record_batch: &RecordBatch) -> Result { - let names = self - .source_schema - .schema() - .column_schemas() - .iter() - .zip(self.is_source_needed.iter()) - .filter_map(|(column_schema, is_needed)| { - if *is_needed { - Some(&column_schema.name) - } else { - None - } - }); - let source = record_batch - .columns() - .iter() - .zip(names) - .map(|(column, name)| { - Helper::try_into_vector(column.clone()).context(error::ConvertChunkSnafu { name }) - }) - .collect::>()?; - - if !self.need_compat() || record_batch.num_rows() == 0 { - return Ok(Batch::new(source)); - } - - let num_rows = record_batch.num_rows(); - self.source_columns_to_batch(source, num_rows) - } - - #[inline] - fn need_compat(&self) -> bool { - self.source_schema.version() != self.dest_schema.schema_to_read().version() - } - - fn source_columns_to_batch(&self, source: Vec, num_rows: usize) -> Result { - let column_schemas = self.dest_schema.schema_to_read().schema().column_schemas(); - let columns = self - .indices_in_result - .iter() - .zip(column_schemas) - .map(|(index_opt, column_schema)| { - if let Some(idx) = index_opt { - Ok(source[*idx].clone()) - } else { - let vector = column_schema - .create_default_vector(num_rows) - .context(error::CreateDefaultToReadSnafu { - column: &column_schema.name, - })? - .context(error::NoDefaultToReadSnafu { - column: &column_schema.name, - })?; - Ok(vector) - } - }) - .collect::>>()?; - - Ok(Batch::new(columns)) - } -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use datatypes::data_type::ConcreteDataType; - use datatypes::schema::Schema; - use store_api::storage::ColumnDescriptorBuilder; - - use super::*; - use crate::error::Error; - use crate::metadata::RegionMetadata; - use crate::schema::{tests, ProjectedSchema, RegionSchema}; - use crate::test_util::{descriptor_util, schema_util}; - - fn call_batch_from_parts( - adapter: &ReadAdapter, - batch: &Batch, - num_field_columns: usize, - ) -> Batch { - let key = batch.columns()[0..2].to_vec(); - let value = batch.columns()[2..2 + num_field_columns].to_vec(); - let sequence = batch.column(2 + num_field_columns).clone(); - let op_type = batch.column(2 + num_field_columns + 1).clone(); - - adapter - .batch_from_parts(key, value, sequence, op_type) - .unwrap() - } - - fn check_batch_from_parts_without_padding( - adapter: &ReadAdapter, - batch: &Batch, - num_field_columns: usize, - ) { - let new_batch = call_batch_from_parts(adapter, batch, num_field_columns); - assert_eq!(*batch, new_batch); - } - - fn call_arrow_chunk_to_batch(adapter: &ReadAdapter, batch: &Batch) -> Batch { - let columns_schema = adapter - .source_schema - .columns() - .iter() - .zip(adapter.is_source_needed.iter()) - .filter_map(|(field, is_needed)| { - if *is_needed { - Some(field.to_column_schema().unwrap()) - } else { - None - } - }) - .collect::>(); - let arrow_schema = Schema::try_new(columns_schema) - .unwrap() - .arrow_schema() - .clone(); - let arrays = batch.columns().iter().map(|v| v.to_arrow_array()).collect(); - let chunk = RecordBatch::try_new(arrow_schema, arrays).unwrap(); - adapter.arrow_record_batch_to_batch(&chunk).unwrap() - } - - fn check_arrow_chunk_to_batch_without_padding(adapter: &ReadAdapter, batch: &Batch) { - let new_batch = call_arrow_chunk_to_batch(adapter, batch); - assert_eq!(*batch, new_batch); - } - - fn check_batch_with_null_padding(batch: &Batch, new_batch: &Batch, null_columns: &[usize]) { - assert_eq!( - batch.num_columns() + null_columns.len(), - new_batch.num_columns() - ); - - let columns_from_source = new_batch - .columns() - .iter() - .enumerate() - .filter_map(|(i, v)| { - if null_columns.contains(&i) { - None - } else { - Some(v.clone()) - } - }) - .collect::>(); - - assert_eq!(batch.columns(), &columns_from_source); - - for idx in null_columns { - assert!(new_batch.column(*idx).only_null()); - } - } - - #[test] - fn test_compat_same_schema() { - // (k0, timestamp, v0, v1) with version 0. - let region_schema = Arc::new(schema_util::new_region_schema(0, 2)); - let projected_schema = Arc::new(ProjectedSchema::no_projection(region_schema.clone())); - let source_schema = region_schema.store_schema().clone(); - let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap(); - - assert_eq!(&[true, true], adapter.source_key_needed()); - assert_eq!(&[true, true], adapter.source_value_needed()); - - let batch = tests::new_batch_with_num_values(2); - check_batch_from_parts_without_padding(&adapter, &batch, 2); - - assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 3, 4, 5],); - - check_arrow_chunk_to_batch_without_padding(&adapter, &batch); - } - - #[test] - fn test_compat_same_version_with_projection() { - // (k0, timestamp, v0, v1) with version 0. - let region_schema = Arc::new(schema_util::new_region_schema(0, 2)); - // Just read v0, k0. - let projected_schema = - Arc::new(ProjectedSchema::new(region_schema.clone(), Some(vec![2, 0])).unwrap()); - - let source_schema = region_schema.store_schema().clone(); - let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap(); - - assert_eq!(&[true, true], adapter.source_key_needed()); - assert_eq!(&[true, false], adapter.source_value_needed()); - - // One value column has been filtered out, so the result batch should only contains one value column. - let batch = tests::new_batch_with_num_values(1); - check_batch_from_parts_without_padding(&adapter, &batch, 1); - - assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 4, 5]); - - check_arrow_chunk_to_batch_without_padding(&adapter, &batch); - } - - #[test] - fn test_compat_old_column() { - // (k0, timestamp, v0) with version 0. - let region_schema_old = Arc::new(schema_util::new_region_schema(0, 1)); - // (k0, timestamp, v0, v1) with version 1. - let region_schema_new = Arc::new(schema_util::new_region_schema(1, 1)); - - // Just read v0, k0 - let projected_schema = - Arc::new(ProjectedSchema::new(region_schema_new, Some(vec![2, 0])).unwrap()); - - let source_schema = region_schema_old.store_schema().clone(); - let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap(); - - assert_eq!(&[true, true], adapter.source_key_needed()); - assert_eq!(&[true], adapter.source_value_needed()); - - let batch = tests::new_batch_with_num_values(1); - check_batch_from_parts_without_padding(&adapter, &batch, 1); - - assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 3, 4],); - - check_arrow_chunk_to_batch_without_padding(&adapter, &batch); - } - - #[test] - fn test_compat_new_column() { - // (k0, timestamp, v0, v1) with version 0. - let region_schema_old = Arc::new(schema_util::new_region_schema(0, 2)); - // (k0, timestamp, v0, v1, v2) with version 1. - let region_schema_new = Arc::new(schema_util::new_region_schema(1, 3)); - - // Just read v2, v0, k0 - let projected_schema = - Arc::new(ProjectedSchema::new(region_schema_new, Some(vec![4, 2, 0])).unwrap()); - - let source_schema = region_schema_old.store_schema().clone(); - let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap(); - - assert_eq!(&[true, true], adapter.source_key_needed()); - assert_eq!(&[true, false], adapter.source_value_needed()); - - // Only read one value column from source. - let batch = tests::new_batch_with_num_values(1); - // New batch should contains k0, timestamp, v0, sequence, op_type. - let new_batch = call_batch_from_parts(&adapter, &batch, 1); - // v2 is filled by null. - check_batch_with_null_padding(&batch, &new_batch, &[3]); - - assert_eq!(&adapter.fields_to_read(), &[0, 1, 2, 4, 5],); - - let new_batch = call_arrow_chunk_to_batch(&adapter, &batch); - check_batch_with_null_padding(&batch, &new_batch, &[3]); - } - - #[test] - fn test_compat_different_column() { - // (k0, timestamp, v0, v1) with version 0. - let region_schema_old = Arc::new(schema_util::new_region_schema(0, 2)); - - let mut descriptor = descriptor_util::desc_with_field_columns(tests::REGION_NAME, 2); - // Assign a much larger column id to v0. - descriptor.default_cf.columns[0].id = descriptor.default_cf.columns.last().unwrap().id + 10; - let metadata: RegionMetadata = descriptor.try_into().unwrap(); - let columns = metadata.columns; - // (k0, timestamp, v0, v1) with version 2, and v0 has different column id. - let region_schema_new = Arc::new(RegionSchema::new(columns, 2).unwrap()); - - let projected_schema = Arc::new(ProjectedSchema::no_projection(region_schema_new)); - let source_schema = region_schema_old.store_schema().clone(); - let adapter = ReadAdapter::new(source_schema, projected_schema).unwrap(); - - assert_eq!(&[true, true], adapter.source_key_needed()); - // v0 is discarded as it has different column id than new schema's. - assert_eq!(&[false, true], adapter.source_value_needed()); - - // New batch should contains k0, timestamp, v1, sequence, op_type, so we need to remove v0 - // from the created batch. - let batch = tests::new_batch_with_num_values(2); - let mut columns = batch.columns().to_vec(); - // Remove v0. - let _ = columns.remove(2); - let batch = Batch::new(columns); - - let new_batch = call_batch_from_parts(&adapter, &batch, 1); - // v0 is filled by null. - check_batch_with_null_padding(&batch, &new_batch, &[2]); - - assert_eq!(&adapter.fields_to_read(), &[0, 1, 3, 4, 5],); - - let new_batch = call_arrow_chunk_to_batch(&adapter, &batch); - check_batch_with_null_padding(&batch, &new_batch, &[2]); - } - - #[inline] - fn new_column_desc_builder() -> ColumnDescriptorBuilder { - ColumnDescriptorBuilder::new(10, "test", ConcreteDataType::int32_datatype()) - } - - #[test] - fn test_is_source_column_compatible() { - let desc = new_column_desc_builder().build().unwrap(); - let source = ColumnMetadata { cf_id: 1, desc }; - - // Same column is always compatible, also tests read nullable column - // as a nullable column. - assert!(is_source_column_compatible(&source, &source).unwrap()); - - // Different id. - let desc = new_column_desc_builder() - .id(source.desc.id + 1) - .build() - .unwrap(); - let dest = ColumnMetadata { cf_id: 1, desc }; - assert!(!is_source_column_compatible(&source, &dest).unwrap()); - } - - #[test] - fn test_nullable_column_read_by_not_null() { - let desc = new_column_desc_builder().build().unwrap(); - assert!(desc.is_nullable()); - let source = ColumnMetadata { cf_id: 1, desc }; - - let desc = new_column_desc_builder() - .is_nullable(false) - .build() - .unwrap(); - let dest = ColumnMetadata { cf_id: 1, desc }; - - let err = is_source_column_compatible(&source, &dest).unwrap_err(); - assert!( - matches!(err, Error::CompatRead { .. }), - "{err:?} is not CompatRead", - ); - } - - #[test] - fn test_read_not_null_column() { - let desc = new_column_desc_builder() - .is_nullable(false) - .build() - .unwrap(); - let source = ColumnMetadata { cf_id: 1, desc }; - - let desc = new_column_desc_builder() - .is_nullable(false) - .build() - .unwrap(); - let not_null_dest = ColumnMetadata { cf_id: 1, desc }; - assert!(is_source_column_compatible(&source, ¬_null_dest).unwrap()); - - let desc = new_column_desc_builder().build().unwrap(); - let null_dest = ColumnMetadata { cf_id: 1, desc }; - assert!(is_source_column_compatible(&source, &null_dest).unwrap()); - } - - #[test] - fn test_read_column_with_different_name() { - let desc = new_column_desc_builder().build().unwrap(); - let source = ColumnMetadata { cf_id: 1, desc }; - - let desc = new_column_desc_builder() - .name(format!("{}_other", source.desc.name)) - .build() - .unwrap(); - let dest = ColumnMetadata { cf_id: 1, desc }; - - let err = is_source_column_compatible(&source, &dest).unwrap_err(); - assert!( - matches!(err, Error::CompatRead { .. }), - "{err:?} is not CompatRead", - ); - } -} diff --git a/src/storage/src/schema/projected.rs b/src/storage/src/schema/projected.rs deleted file mode 100644 index 756c9a877728..000000000000 --- a/src/storage/src/schema/projected.rs +++ /dev/null @@ -1,590 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::cmp::Ordering; -use std::collections::{BTreeSet, HashMap}; -use std::sync::Arc; - -use api::v1::OpType; -use common_base::BitVec; -use datatypes::prelude::ScalarVector; -use datatypes::schema::{SchemaBuilder, SchemaRef}; -use datatypes::vectors::{BooleanVector, UInt8Vector}; -use snafu::{ensure, ResultExt}; -use store_api::storage::{Chunk, ColumnId}; - -use crate::error; -use crate::metadata::{self, Result}; -use crate::read::{Batch, BatchOp}; -use crate::schema::{RegionSchema, RegionSchemaRef, StoreSchema, StoreSchemaRef}; - -/// Metadata about projection. -#[derive(Debug, Default)] -struct Projection { - /// Column indices of projection. - projected_columns: Vec, - /// Sorted and deduplicated indices of columns to read, includes all row key columns - /// and internal columns. - /// - /// We use these indices to read from data sources. - columns_to_read: Vec, - /// Maps column id to its index in `columns_to_read`. - /// - /// Used to ask whether the column with given column id is needed in projection. - id_to_read_idx: HashMap, - /// Maps index of `projected_columns` to index of the column in `columns_to_read`. - /// - /// Invariant: - /// - `projected_idx_to_read_idx.len() == projected_columns.len()` - projected_idx_to_read_idx: Vec, - /// Number of user columns to read. - num_user_columns: usize, -} - -impl Projection { - fn new(region_schema: &RegionSchema, projected_columns: Vec) -> Projection { - // Get a sorted list of column indices to read. - let mut column_indices: BTreeSet<_> = projected_columns.iter().cloned().collect(); - column_indices.extend(region_schema.row_key_indices()); - let num_user_columns = column_indices.len(); - // Now insert internal columns. - column_indices.extend([ - region_schema.sequence_index(), - region_schema.op_type_index(), - ]); - let columns_to_read: Vec<_> = column_indices.into_iter().collect(); - - // The region schema ensure that last two column must be internal columns. - assert_eq!( - region_schema.sequence_index(), - columns_to_read[num_user_columns] - ); - assert_eq!( - region_schema.op_type_index(), - columns_to_read[num_user_columns + 1] - ); - - // Mapping: => - let id_to_read_idx: HashMap<_, _> = columns_to_read - .iter() - .enumerate() - .map(|(idx, col_idx)| (region_schema.column_metadata(*col_idx).id(), idx)) - .collect(); - // Use column id to find index in `columns_to_read` of a column in `projected_columns`. - let projected_idx_to_read_idx = projected_columns - .iter() - .map(|col_idx| { - let column_id = region_schema.column_metadata(*col_idx).id(); - // This unwrap() should be safe since `columns_to_read` must contains all columns in `projected_columns`. - let read_idx = id_to_read_idx.get(&column_id).unwrap(); - *read_idx - }) - .collect(); - - Projection { - projected_columns, - columns_to_read, - id_to_read_idx, - projected_idx_to_read_idx, - num_user_columns, - } - } -} - -/// Schema with projection info. -#[derive(Debug)] -pub struct ProjectedSchema { - /// Projection info, `None` means don't need to do projection. - projection: Option, - /// Schema used to read from data sources. - schema_to_read: StoreSchemaRef, - /// User schema after projection. - projected_user_schema: SchemaRef, -} - -pub type ProjectedSchemaRef = Arc; - -impl ProjectedSchema { - /// Create a new `ProjectedSchema` with given `projected_columns`. - /// - /// If `projected_columns` is None, then all columns would be read. If `projected_columns` is - /// `Some`, then the `Vec` in it contains the indices of columns need to be read. - /// - /// If the `Vec` is empty or contains invalid index, `Err` would be returned. - pub fn new( - region_schema: RegionSchemaRef, - projected_columns: Option>, - ) -> Result { - match projected_columns { - Some(indices) => { - Self::validate_projection(®ion_schema, &indices)?; - - let projection = Projection::new(®ion_schema, indices); - - let schema_to_read = Self::build_schema_to_read(®ion_schema, &projection)?; - let projected_user_schema = - Self::build_projected_user_schema(®ion_schema, &projection)?; - - Ok(ProjectedSchema { - projection: Some(projection), - schema_to_read, - projected_user_schema, - }) - } - None => Ok(ProjectedSchema::no_projection(region_schema)), - } - } - - /// Create a `ProjectedSchema` that read all columns. - pub fn no_projection(region_schema: RegionSchemaRef) -> ProjectedSchema { - // We could just reuse the StoreSchema and user schema. - ProjectedSchema { - projection: None, - schema_to_read: region_schema.store_schema().clone(), - projected_user_schema: region_schema.user_schema().clone(), - } - } - - #[inline] - pub fn projected_user_schema(&self) -> &SchemaRef { - &self.projected_user_schema - } - - #[inline] - pub fn schema_to_read(&self) -> &StoreSchemaRef { - &self.schema_to_read - } - - /// Convert [Batch] into [Chunk]. - /// - /// This will remove all internal columns. The input `batch` should has the - /// same schema as [`self.schema_to_read()`](ProjectedSchema::schema_to_read). - /// The output [Chunk] has the same schema as - /// [`self.projected_user_schema()`](ProjectedSchema::projected_user_schema). - pub fn batch_to_chunk(&self, batch: &Batch) -> Chunk { - let columns = match &self.projection { - Some(projection) => projection - .projected_idx_to_read_idx - .iter() - .map(|col_idx| batch.column(*col_idx)) - .cloned() - .collect(), - None => { - let num_user_columns = self.projected_user_schema.num_columns(); - batch - .columns() - .iter() - .take(num_user_columns) - .cloned() - .collect() - } - }; - Chunk::new(columns) - } - - /// Returns true if column with given `column_id` is needed (in projection). - pub fn is_needed(&self, column_id: ColumnId) -> bool { - self.projection - .as_ref() - .map(|p| p.id_to_read_idx.contains_key(&column_id)) - .unwrap_or(true) - } - - fn build_schema_to_read( - region_schema: &RegionSchema, - projection: &Projection, - ) -> Result { - // Reorder columns according to the projection. - let columns: Vec<_> = projection - .columns_to_read - .iter() - .map(|col_idx| region_schema.column_metadata(*col_idx)) - .cloned() - .collect(); - // All row key columns are reserved in this schema, so we can use the row_key_end - // and timestamp_key_index from region schema. - let store_schema = StoreSchema::new( - columns, - region_schema.version(), - region_schema.row_key_end(), - projection.num_user_columns, - )?; - - Ok(Arc::new(store_schema)) - } - - fn build_projected_user_schema( - region_schema: &RegionSchema, - projection: &Projection, - ) -> Result { - let column_schemas: Vec<_> = projection - .projected_columns - .iter() - .map(|col_idx| { - region_schema - .column_metadata(*col_idx) - .desc - .to_column_schema() - }) - .collect(); - - let schema = SchemaBuilder::try_from(column_schemas) - .context(metadata::ConvertSchemaSnafu)? - .version(region_schema.version()) - .build() - .context(metadata::InvalidSchemaSnafu)?; - - Ok(Arc::new(schema)) - } - - fn validate_projection(region_schema: &RegionSchema, indices: &[usize]) -> Result<()> { - // The projection indices should not be empty, at least the timestamp column - // should be always read, and the `StoreSchema` also requires the timestamp column. - ensure!( - !indices.is_empty(), - metadata::InvalidProjectionSnafu { - msg: "at least one column should be read", - } - ); - - // Now only allowed to read user columns. - let user_schema = region_schema.user_schema(); - for i in indices { - ensure!( - *i < user_schema.num_columns(), - metadata::InvalidProjectionSnafu { - msg: format!( - "index {} out of bound, only contains {} columns", - i, - user_schema.num_columns() - ), - } - ); - } - - Ok(()) - } -} - -impl BatchOp for ProjectedSchema { - fn compare_row(&self, left: &Batch, i: usize, right: &Batch, j: usize) -> Ordering { - // Ordered by (row_key asc, sequence desc, op_type desc). - let indices = self.schema_to_read.row_key_indices(); - for idx in indices { - let (left_col, right_col) = (left.column(idx), right.column(idx)); - // Comparison of vector is done by virtual method calls currently. Consider using - // enum dispatch if this becomes bottleneck. - let order = left_col.get_ref(i).cmp(&right_col.get_ref(j)); - if order != Ordering::Equal { - return order; - } - } - let (sequence_index, op_type_index) = ( - self.schema_to_read.sequence_index(), - self.schema_to_read.op_type_index(), - ); - right - .column(sequence_index) - .get_ref(j) - .cmp(&left.column(sequence_index).get_ref(i)) - .then_with(|| { - right - .column(op_type_index) - .get_ref(j) - .cmp(&left.column(op_type_index).get_ref(i)) - }) - } - - fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>) { - if let Some(prev) = prev { - assert_eq!(batch.num_columns(), prev.num_columns()); - } - let indices = self.schema_to_read.row_key_indices(); - for idx in indices { - let (current, prev_col) = ( - batch.column(idx), - prev.map(|prev| prev.column(idx).as_ref()), - ); - current.find_unique(selected, prev_col); - } - } - - fn filter(&self, batch: &Batch, filter: &BooleanVector) -> error::Result { - let columns = batch - .columns() - .iter() - .enumerate() - .map(|(i, v)| { - v.filter(filter).context(error::FilterColumnSnafu { - name: self.schema_to_read.column_name(i), - }) - }) - .collect::>>()?; - - Ok(Batch::new(columns)) - } - - fn unselect_deleted(&self, batch: &Batch, selected: &mut BitVec) { - let op_types = batch.column(self.schema_to_read.op_type_index()); - // Safety: We expect the batch has the same schema as `self.schema_to_read`. The - // read procedure should guarantee this, otherwise this is a critical bug and it - // should be fine to panic. - let op_types = op_types - .as_any() - .downcast_ref::() - .unwrap_or_else(|| { - panic!( - "Expect op_type (UInt8) column at index {}, given {:?}", - self.schema_to_read.op_type_index(), - op_types.data_type() - ); - }); - - for (i, op_type) in op_types.iter_data().enumerate() { - if op_type == Some(OpType::Delete as u8) { - selected.set(i, false); - } - } - } -} - -#[cfg(test)] -mod tests { - use api::v1::OpType; - use datatypes::prelude::ScalarVector; - use datatypes::type_id::LogicalTypeId; - use datatypes::vectors::{TimestampMillisecondVector, VectorRef}; - - use super::*; - use crate::metadata::Error; - use crate::schema::tests; - use crate::test_util::{read_util, schema_util}; - - #[test] - fn test_projection() { - // Build a region schema with 2 value columns. So the final user schema is - // (k0, timestamp, v0, v1) - let region_schema = schema_util::new_region_schema(0, 2); - - // Projection, but still keep column order. - // After projection: (timestamp, v0) - let projected_columns = vec![1, 2]; - let projection = Projection::new(®ion_schema, projected_columns.clone()); - assert_eq!(projected_columns, projection.projected_columns); - // Need to read (k0, timestamp, v0, sequence, op_type) - assert_eq!(&[0, 1, 2, 4, 5], &projection.columns_to_read[..]); - assert_eq!(5, projection.id_to_read_idx.len()); - // Index of timestamp, v0 in `columns_to_read` - assert_eq!(&[1, 2], &projection.projected_idx_to_read_idx[..]); - // 3 columns: k0, timestamp, v0 - assert_eq!(3, projection.num_user_columns); - - // Projection, unordered. - // After projection: (timestamp, v1, k0) - let projected_columns = vec![1, 3, 0]; - let projection = Projection::new(®ion_schema, projected_columns.clone()); - assert_eq!(projected_columns, projection.projected_columns); - // Need to read (k0, timestamp, v1, sequence, op_type) - assert_eq!(&[0, 1, 3, 4, 5], &projection.columns_to_read[..]); - assert_eq!(5, projection.id_to_read_idx.len()); - // Index of timestamp, v1, k0 in `columns_to_read` - assert_eq!(&[1, 2, 0], &projection.projected_idx_to_read_idx[..]); - // 3 columns: k0, timestamp, v1 - assert_eq!(3, projection.num_user_columns); - - // Empty projection. - let projection = Projection::new(®ion_schema, Vec::new()); - assert!(projection.projected_columns.is_empty()); - // Still need to read row keys. - assert_eq!(&[0, 1, 4, 5], &projection.columns_to_read[..]); - assert_eq!(4, projection.id_to_read_idx.len()); - assert!(projection.projected_idx_to_read_idx.is_empty()); - assert_eq!(2, projection.num_user_columns); - } - - #[test] - fn test_projected_schema_with_projection() { - // (k0, timestamp, v0, v1, v2) - let region_schema = Arc::new(schema_util::new_region_schema(123, 3)); - - // After projection: (v1, timestamp) - let projected_schema = - ProjectedSchema::new(region_schema.clone(), Some(vec![3, 1])).unwrap(); - let expect_user = schema_util::new_schema_with_version( - &[ - ("v1", LogicalTypeId::Int64, true), - ("timestamp", LogicalTypeId::TimestampMillisecond, false), - ], - Some(1), - 123, - ); - assert_eq!(expect_user, **projected_schema.projected_user_schema()); - - // Test is_needed - let needed: Vec<_> = region_schema - .columns() - .iter() - .enumerate() - .filter_map(|(idx, column_meta)| { - if projected_schema.is_needed(column_meta.id()) { - Some(idx) - } else { - None - } - }) - .collect(); - // (k0, timestamp, v1, sequence, op_type) - assert_eq!(&[0, 1, 3, 5, 6], &needed[..]); - - // Use another projection. - // After projection: (v0, timestamp) - let projected_schema = ProjectedSchema::new(region_schema, Some(vec![2, 1])).unwrap(); - - // The schema to read should be same as region schema with (k0, timestamp, v0). - // We can't use `new_schema_with_version()` because the StoreSchema also store other - // metadata that `new_schema_with_version()` can't store. - let expect_schema = schema_util::new_region_schema(123, 1); - assert_eq!( - expect_schema.store_schema(), - projected_schema.schema_to_read() - ); - - // (k0, timestamp, v0, sequence, op_type) - let batch = tests::new_batch(); - // Test Batch to our Chunk. - // (v0, timestamp) - let chunk = projected_schema.batch_to_chunk(&batch); - assert_eq!(2, chunk.columns.len()); - assert_eq!(&chunk.columns[0], batch.column(2)); - assert_eq!(&chunk.columns[1], batch.column(1)); - } - - #[test] - fn test_projected_schema_no_projection() { - // (k0, timestamp, v0) - let region_schema = Arc::new(schema_util::new_region_schema(123, 1)); - - let projected_schema = ProjectedSchema::no_projection(region_schema.clone()); - - assert_eq!( - region_schema.user_schema(), - projected_schema.projected_user_schema() - ); - assert_eq!( - region_schema.store_schema(), - projected_schema.schema_to_read() - ); - - for column in region_schema.columns() { - assert!(projected_schema.is_needed(column.id())); - } - - // (k0, timestamp, v0, sequence, op_type) - let batch = tests::new_batch(); - // Test Batch to our Chunk. - // (k0, timestamp, v0) - let chunk = projected_schema.batch_to_chunk(&batch); - assert_eq!(3, chunk.columns.len()); - } - - #[test] - fn test_projected_schema_empty_projection() { - // (k0, timestamp, v0) - let region_schema = Arc::new(schema_util::new_region_schema(123, 1)); - - let err = ProjectedSchema::new(region_schema, Some(Vec::new())) - .err() - .unwrap(); - assert!(matches!(err, Error::InvalidProjection { .. })); - } - - #[test] - fn test_compare_batch() { - let schema = read_util::new_projected_schema(); - let left = read_util::new_full_kv_batch(&[(1000, 1, 1000, OpType::Put)]); - let right = read_util::new_full_kv_batch(&[ - (999, 1, 1000, OpType::Put), - (1000, 1, 999, OpType::Put), - (1000, 1, 1000, OpType::Put), - ]); - - assert_eq!(Ordering::Greater, schema.compare_row(&left, 0, &right, 0)); - assert_eq!(Ordering::Less, schema.compare_row(&left, 0, &right, 1)); - assert_eq!(Ordering::Equal, schema.compare_row(&left, 0, &right, 2)); - } - - #[test] - fn test_batch_find_unique() { - let schema = read_util::new_projected_schema(); - let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (2000, Some(2))]); - - let mut selected = BitVec::repeat(false, 3); - schema.find_unique(&batch, &mut selected, None); - assert!(selected[0]); - assert!(selected[1]); - assert!(!selected[2]); - - let mut selected = BitVec::repeat(false, 3); - let prev = read_util::new_kv_batch(&[(1000, Some(1))]); - schema.find_unique(&batch, &mut selected, Some(&prev)); - assert!(!selected[0]); - assert!(selected[1]); - assert!(!selected[2]); - } - - #[test] - fn test_find_unique_with_op() { - let schema = read_util::new_projected_schema(); - let mut selected = BitVec::repeat(false, 3); - let batch = read_util::new_full_kv_batch(&[ - (1001, 1, 3, OpType::Put), - (1000, 1, 2, OpType::Delete), - (1000, 1, 1, OpType::Put), - ]); - schema.find_unique(&batch, &mut selected, None); - assert!(selected[0]); - assert!(selected[1]); - assert!(!selected[2]); - } - - #[test] - fn test_filter_batch() { - let schema = read_util::new_projected_schema(); - let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (3000, Some(3))]); - let filter = BooleanVector::from_slice(&[true, false, true]); - - let res = schema.filter(&batch, &filter).unwrap(); - let expect: VectorRef = Arc::new(TimestampMillisecondVector::from_values([1000, 3000])); - assert_eq!(expect, *res.column(0)); - } - - #[test] - fn test_unselect_deleted() { - let schema = read_util::new_projected_schema(); - let batch = read_util::new_full_kv_batch(&[ - (100, 1, 1000, OpType::Put), - (101, 1, 999, OpType::Delete), - (102, 1, 1000, OpType::Put), - (103, 1, 999, OpType::Put), - (104, 1, 1000, OpType::Delete), - ]); - - let mut selected = BitVec::repeat(true, batch.num_rows()); - schema.unselect_deleted(&batch, &mut selected); - assert_eq!( - BitVec::from_iter([true, false, true, true, false]), - selected - ); - } -} diff --git a/src/storage/src/schema/region.rs b/src/storage/src/schema/region.rs deleted file mode 100644 index e601da45a1ed..000000000000 --- a/src/storage/src/schema/region.rs +++ /dev/null @@ -1,214 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::fmt; -use std::sync::Arc; - -use datatypes::schema::{Schema, SchemaBuilder, SchemaRef}; -use snafu::ResultExt; - -use crate::metadata::{self, ColumnMetadata, ColumnsMetadata, ColumnsMetadataRef, Result}; -use crate::schema::{StoreSchema, StoreSchemaRef}; - -/// Schema of region. -/// -/// The `RegionSchema` has the knowledge of reserved and internal columns. -/// Reserved columns are columns that their names, ids are reserved by the storage -/// engine, and could not be used by the user. Reserved columns usually have -/// special usage. Reserved columns expect the version columns are also -/// called internal columns (though the version could also be thought as a -/// special kind of internal column), are not visible to user, such as our -/// internal sequence, op_type columns. -/// -/// The user schema is the schema that only contains columns that user could visit, -/// as well as what the schema user created. -#[derive(PartialEq, Eq)] -pub struct RegionSchema { - /// Schema that only contains columns that user defined, excluding internal columns - /// that are reserved and used by the storage engine. - /// - /// Holding a [SchemaRef] to allow converting into `SchemaRef`/`arrow::SchemaRef` - /// conveniently. The fields order in `SchemaRef` **must** be consistent with - /// columns order in [ColumnsMetadata] to ensure the projection index of a field - /// is correct. - user_schema: SchemaRef, - /// store schema contains all columns of the region, including all internal columns. - store_schema: StoreSchemaRef, - /// Metadata of columns. - columns: ColumnsMetadataRef, -} - -impl fmt::Debug for RegionSchema { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("RegionSchema") - .field("columns", &self.columns) - .finish() - } -} - -impl RegionSchema { - pub fn new(columns: ColumnsMetadataRef, version: u32) -> Result { - let user_schema = Arc::new(build_user_schema(&columns, version)?); - let store_schema = Arc::new(StoreSchema::from_columns_metadata(&columns, version)?); - - debug_assert_eq!(user_schema.version(), store_schema.version()); - debug_assert_eq!(version, user_schema.version()); - - Ok(RegionSchema { - user_schema, - store_schema, - columns, - }) - } - - /// Returns the schema of the region, excluding internal columns that used by - /// the storage engine. - #[inline] - pub fn user_schema(&self) -> &SchemaRef { - &self.user_schema - } - - /// Returns the schema actually stores, which would also contains all internal columns. - #[inline] - pub fn store_schema(&self) -> &StoreSchemaRef { - &self.store_schema - } - - #[inline] - pub fn row_key_columns(&self) -> impl Iterator { - self.columns.iter_row_key_columns() - } - - #[inline] - pub fn field_columns(&self) -> impl Iterator { - self.columns.iter_field_columns() - } - - #[inline] - pub fn num_row_key_columns(&self) -> usize { - self.columns.num_row_key_columns() - } - - #[inline] - pub fn num_field_columns(&self) -> usize { - self.columns.num_field_columns() - } - - #[inline] - pub fn version(&self) -> u32 { - self.user_schema.version() - } - - #[inline] - pub(crate) fn row_key_end(&self) -> usize { - self.columns.row_key_end() - } - - #[inline] - pub(crate) fn sequence_index(&self) -> usize { - self.store_schema.sequence_index() - } - - #[inline] - pub(crate) fn op_type_index(&self) -> usize { - self.store_schema.op_type_index() - } - - #[inline] - pub(crate) fn row_key_indices(&self) -> impl Iterator { - self.store_schema.row_key_indices() - } - - #[inline] - pub fn timestamp_index(&self) -> usize { - self.store_schema.timestamp_index() - } - - #[inline] - pub(crate) fn timestamp_column_name(&self) -> &str { - self.store_schema.column_name(self.timestamp_index()) - } - - #[inline] - pub(crate) fn value_indices(&self) -> impl Iterator { - self.store_schema.value_indices() - } - - #[inline] - pub fn column_metadata(&self, idx: usize) -> &ColumnMetadata { - self.columns.column_metadata(idx) - } - - #[cfg(test)] - pub(crate) fn columns(&self) -> &[ColumnMetadata] { - self.columns.columns() - } -} - -pub type RegionSchemaRef = Arc; - -// Now user schema don't have extra metadata like store schema. -fn build_user_schema(columns: &ColumnsMetadata, version: u32) -> Result { - let column_schemas: Vec<_> = columns - .iter_user_columns() - .map(|col| col.desc.to_column_schema()) - .collect(); - - SchemaBuilder::try_from(column_schemas) - .context(metadata::ConvertSchemaSnafu)? - .version(version) - .build() - .context(metadata::InvalidSchemaSnafu) -} - -#[cfg(test)] -mod tests { - use datatypes::type_id::LogicalTypeId; - - use super::*; - use crate::test_util::schema_util; - - #[test] - fn test_region_schema() { - let region_schema = Arc::new(schema_util::new_region_schema(123, 1)); - - let expect_schema = schema_util::new_schema_with_version( - &[ - ("k0", LogicalTypeId::Int64, false), - ("timestamp", LogicalTypeId::TimestampMillisecond, false), - ("v0", LogicalTypeId::Int64, true), - ], - Some(1), - 123, - ); - - assert_eq!(expect_schema, **region_schema.user_schema()); - - // Checks row key column. - let mut row_keys = region_schema.row_key_columns(); - assert_eq!("k0", row_keys.next().unwrap().desc.name); - assert_eq!("timestamp", row_keys.next().unwrap().desc.name); - assert_eq!(None, row_keys.next()); - assert_eq!(2, region_schema.num_row_key_columns()); - - // Checks value column. - let mut values = region_schema.field_columns(); - assert_eq!("v0", values.next().unwrap().desc.name); - assert_eq!(None, values.next()); - assert_eq!(1, region_schema.num_field_columns()); - - // Checks version. - assert_eq!(123, region_schema.version()); - } -} diff --git a/src/storage/src/schema/store.rs b/src/storage/src/schema/store.rs deleted file mode 100644 index 69ac1a45b018..000000000000 --- a/src/storage/src/schema/store.rs +++ /dev/null @@ -1,323 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashMap; -use std::sync::Arc; - -use datatypes::arrow::datatypes::Schema as ArrowSchema; -use datatypes::arrow::record_batch::RecordBatch; -use datatypes::schema::{Schema, SchemaBuilder, SchemaRef}; -use snafu::{ensure, OptionExt, ResultExt}; -use store_api::storage::consts; - -use crate::error::NewRecordBatchSnafu; -use crate::metadata::{self, ColumnMetadata, ColumnsMetadata, Error, Result}; -use crate::read::Batch; - -const ROW_KEY_END_KEY: &str = "greptime:storage:row_key_end"; -const USER_COLUMN_END_KEY: &str = "greptime:storage:user_column_end"; - -/// Schema that contains storage engine specific metadata, such as internal columns. -/// -/// Used internally, contains all row key columns, internal columns and a sub set of -/// value columns in a region. The columns are organized in `key, value, internal` order. -#[derive(Debug, PartialEq, Eq)] -pub struct StoreSchema { - columns: Vec, - schema: SchemaRef, - row_key_end: usize, - user_column_end: usize, -} - -pub type StoreSchemaRef = Arc; - -impl StoreSchema { - #[inline] - pub fn version(&self) -> u32 { - self.schema.version() - } - - #[inline] - pub fn schema(&self) -> &SchemaRef { - &self.schema - } - - #[inline] - pub fn arrow_schema(&self) -> &Arc { - self.schema.arrow_schema() - } - - // TODO(yingwen): Remove this method. - pub fn batch_to_arrow_record_batch( - &self, - batch: &Batch, - ) -> std::result::Result { - assert_eq!(self.schema.num_columns(), batch.num_columns(),); - RecordBatch::try_new( - self.schema.arrow_schema().clone(), - batch.columns().iter().map(|v| v.to_arrow_array()).collect(), - ) - .context(NewRecordBatchSnafu) - } - - /// Returns the ending index of row key columns. - /// - /// The ending index has the same value as the number of the row key columns. - #[inline] - pub fn row_key_end(&self) -> usize { - self.row_key_end - } - - /// Returns the index of timestamp column. - /// We always assume that timestamp is the last column in [StoreSchema]. - #[inline] - pub fn timestamp_index(&self) -> usize { - self.row_key_end - 1 - } - - pub(crate) fn contains_column(&self, name: &str) -> bool { - self.schema.column_schema_by_name(name).is_some() - } - - pub(crate) fn is_key_column(&self, name: &str) -> bool { - self.schema - .column_index_by_name(name) - .map(|idx| idx < self.row_key_end) - .unwrap_or(false) - } - - pub(crate) fn is_user_column(&self, name: &str) -> bool { - self.schema - .column_index_by_name(name) - .map(|idx| idx < self.user_column_end) - .unwrap_or(false) - } - - pub(crate) fn from_columns_metadata( - columns: &ColumnsMetadata, - version: u32, - ) -> Result { - StoreSchema::new( - columns.columns().to_vec(), - version, - columns.row_key_end(), - columns.user_column_end(), - ) - } - - pub(crate) fn new( - columns: Vec, - version: u32, - row_key_end: usize, - user_column_end: usize, - ) -> Result { - let column_schemas = columns - .iter() - .map(|meta| meta.to_column_schema()) - .collect::>>()?; - - let schema = SchemaBuilder::try_from(column_schemas) - .context(metadata::ConvertSchemaSnafu)? - .version(version) - .add_metadata(ROW_KEY_END_KEY, row_key_end.to_string()) - .add_metadata(USER_COLUMN_END_KEY, user_column_end.to_string()) - .build() - .context(metadata::InvalidSchemaSnafu)?; - - assert_eq!( - consts::SEQUENCE_COLUMN_NAME, - schema.column_schemas()[user_column_end].name - ); - assert_eq!( - consts::OP_TYPE_COLUMN_NAME, - schema.column_schemas()[user_column_end + 1].name - ); - - Ok(StoreSchema { - columns, - schema: Arc::new(schema), - row_key_end, - user_column_end, - }) - } - - #[inline] - pub(crate) fn sequence_index(&self) -> usize { - self.user_column_end - } - - #[inline] - pub(crate) fn op_type_index(&self) -> usize { - self.user_column_end + 1 - } - - #[inline] - pub(crate) fn row_key_indices(&self) -> impl Iterator { - 0..self.row_key_end - } - - #[inline] - pub(crate) fn value_indices(&self) -> impl Iterator { - self.row_key_end..self.user_column_end - } - - #[inline] - pub(crate) fn column_name(&self, idx: usize) -> &str { - &self.schema.column_schemas()[idx].name - } - - /// # Panic - /// Panics if `name` is not a valid column name. - #[inline] - pub(crate) fn column_index(&self, name: &str) -> usize { - self.schema.column_index_by_name(name).unwrap() - } - - #[inline] - pub(crate) fn num_columns(&self) -> usize { - self.schema.num_columns() - } - - #[inline] - pub(crate) fn user_column_end(&self) -> usize { - self.user_column_end - } - - #[inline] - pub(crate) fn field_columns(&self) -> &[ColumnMetadata] { - &self.columns[self.row_key_end..self.user_column_end] - } - - /// Returns the index of the value column according its `offset`. - #[inline] - pub(crate) fn field_column_index_by_offset(&self, offset: usize) -> usize { - self.row_key_end + offset - } - - #[inline] - pub(crate) fn columns(&self) -> &[ColumnMetadata] { - &self.columns - } -} - -impl TryFrom> for StoreSchema { - type Error = Error; - - fn try_from(arrow_schema: Arc) -> std::result::Result { - let schema = Schema::try_from(arrow_schema).context(metadata::ConvertArrowSchemaSnafu)?; - // Recover other metadata from schema. - let row_key_end = parse_index_from_metadata(schema.metadata(), ROW_KEY_END_KEY)?; - let user_column_end = parse_index_from_metadata(schema.metadata(), USER_COLUMN_END_KEY)?; - - // There should be sequence and op_type columns. - ensure!( - consts::SEQUENCE_COLUMN_NAME == schema.column_schemas()[user_column_end].name, - metadata::InvalidIndexSnafu - ); - ensure!( - consts::OP_TYPE_COLUMN_NAME == schema.column_schemas()[user_column_end + 1].name, - metadata::InvalidIndexSnafu - ); - - // Recover ColumnMetadata from schema. - let columns = schema - .column_schemas() - .iter() - .map(ColumnMetadata::from_column_schema) - .collect::>()?; - - Ok(StoreSchema { - columns, - schema: Arc::new(schema), - row_key_end, - user_column_end, - }) - } -} - -impl TryFrom for StoreSchema { - type Error = Error; - - fn try_from(arrow_schema: ArrowSchema) -> std::result::Result { - StoreSchema::try_from(Arc::new(arrow_schema)) - } -} - -fn parse_index_from_metadata(metadata: &HashMap, key: &str) -> Result { - let value = metadata - .get(key) - .context(metadata::MetaNotFoundSnafu { key })?; - value.parse().with_context(|_| metadata::ParseMetaIntSnafu { - key_value: format!("{key}={value}"), - }) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::read::Batch; - use crate::schema::tests; - use crate::test_util::schema_util; - - fn check_chunk_batch(record_batch: &RecordBatch, batch: &Batch) { - assert_eq!(5, record_batch.num_columns()); - assert_eq!(3, record_batch.num_rows()); - - for i in 0..5 { - assert_eq!(record_batch.column(i), &batch.column(i).to_arrow_array()); - } - } - - #[test] - fn test_store_schema() { - let region_schema = Arc::new(schema_util::new_region_schema(123, 1)); - - // Checks StoreSchema. - let store_schema = region_schema.store_schema(); - assert_eq!(123, store_schema.version()); - let sst_arrow_schema = store_schema.arrow_schema(); - let converted_store_schema = StoreSchema::try_from((**sst_arrow_schema).clone()).unwrap(); - - assert_eq!(**store_schema, converted_store_schema); - - let column_schemas: Vec<_> = region_schema - .columns() - .iter() - .map(|meta| meta.to_column_schema().unwrap()) - .collect(); - let expect_schema = SchemaBuilder::try_from(column_schemas) - .unwrap() - .version(123) - .build() - .unwrap(); - // Only compare column schemas since SchemaRef in StoreSchema also contains other metadata that only used - // by StoreSchema. - assert_eq!( - expect_schema.column_schemas(), - store_schema.schema().column_schemas(), - ); - assert_eq!(3, store_schema.sequence_index()); - assert_eq!(4, store_schema.op_type_index()); - let row_key_indices: Vec<_> = store_schema.row_key_indices().collect(); - assert_eq!([0, 1], &row_key_indices[..]); - let value_indices: Vec<_> = store_schema.value_indices().collect(); - assert_eq!([2], &value_indices[..]); - - // Test batch and chunk conversion. - let batch = tests::new_batch(); - // Convert batch to chunk. - let chunk = store_schema.batch_to_arrow_record_batch(&batch).unwrap(); - check_chunk_batch(&chunk, &batch); - } -} diff --git a/src/storage/src/snapshot.rs b/src/storage/src/snapshot.rs deleted file mode 100644 index 4c3daac86ace..000000000000 --- a/src/storage/src/snapshot.rs +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::cmp; - -use async_trait::async_trait; -use store_api::storage::{ - GetRequest, GetResponse, ReadContext, ScanRequest, ScanResponse, SchemaRef, SequenceNumber, - Snapshot, -}; - -use crate::chunk::{ChunkReaderBuilder, ChunkReaderImpl}; -use crate::error::{Error, Result}; -use crate::sst::AccessLayerRef; -use crate::version::VersionRef; - -/// [Snapshot] implementation. -pub struct SnapshotImpl { - version: VersionRef, - /// Max sequence number (inclusive) visible to user. - visible_sequence: SequenceNumber, - sst_layer: AccessLayerRef, -} - -#[async_trait] -impl Snapshot for SnapshotImpl { - type Error = Error; - type Reader = ChunkReaderImpl; - - fn schema(&self) -> &SchemaRef { - self.version.user_schema() - } - - async fn scan( - &self, - ctx: &ReadContext, - request: ScanRequest, - ) -> Result> { - let visible_sequence = self.sequence_to_read(request.sequence); - let memtable_version = self.version.memtables(); - - let mutables = memtable_version.mutable_memtable(); - let immutables = memtable_version.immutable_memtables(); - - let mut builder = ChunkReaderBuilder::new( - self.version.metadata().id(), - self.version.schema().clone(), - self.sst_layer.clone(), - ) - .reserve_num_memtables(memtable_version.num_memtables()) - .projection(request.projection) - .filters(request.filters) - .batch_size(ctx.batch_size) - .output_ordering(request.output_ordering) - .visible_sequence(visible_sequence) - .pick_memtables(mutables.clone()) - .use_chain_reader(true); - - for memtable in immutables { - builder = builder.pick_memtables(memtable.clone()); - } - - let reader = builder.pick_all_ssts(self.version.ssts())?.build().await?; - - Ok(ScanResponse { reader }) - } - - async fn get(&self, _ctx: &ReadContext, _request: GetRequest) -> Result { - unimplemented!() - } -} - -impl SnapshotImpl { - pub fn new( - version: VersionRef, - visible_sequence: SequenceNumber, - sst_layer: AccessLayerRef, - ) -> SnapshotImpl { - SnapshotImpl { - version, - visible_sequence, - sst_layer, - } - } - - #[inline] - fn sequence_to_read(&self, request_sequence: Option) -> SequenceNumber { - request_sequence - .map(|s| cmp::min(s, self.visible_sequence)) - .unwrap_or(self.visible_sequence) - } -} diff --git a/src/storage/src/sst.rs b/src/storage/src/sst.rs deleted file mode 100644 index b3b914d03694..000000000000 --- a/src/storage/src/sst.rs +++ /dev/null @@ -1,830 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub(crate) mod parquet; -mod pruning; -mod stream_writer; - -use std::collections::HashMap; -use std::fmt; -use std::fmt::{Debug, Formatter}; -use std::str::FromStr; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; - -use async_trait::async_trait; -use common_base::readable_size::ReadableSize; -use common_recordbatch::SendableRecordBatchStream; -use common_telemetry::{debug, error}; -use common_time::range::TimestampRange; -use common_time::Timestamp; -use datatypes::schema::SchemaRef; -use futures_util::StreamExt; -use object_store::{util, ObjectStore}; -use serde::{Deserialize, Deserializer, Serialize}; -use snafu::{ResultExt, Snafu}; -use store_api::storage::{ChunkReader, RegionId}; -use table::predicate::Predicate; -use uuid::Uuid; - -use crate::chunk::ChunkReaderImpl; -use crate::error; -use crate::error::{DeleteSstSnafu, Result}; -use crate::file_purger::{FilePurgeRequest, FilePurgerRef}; -use crate::memtable::BoxedBatchIterator; -use crate::read::{Batch, BatchReader, BoxedBatchReader}; -use crate::scheduler::Scheduler; -use crate::schema::ProjectedSchemaRef; -use crate::sst::parquet::{ChunkStream, ParquetReader, ParquetWriter}; - -/// Maximum level of SSTs. -pub const MAX_LEVEL: u8 = 2; - -pub type Level = u8; - -pub use crate::sst::stream_writer::BufferedWriter; - -// We only has fixed number of level, so we use array to hold elements. This implementation -// detail of LevelMetaVec should not be exposed to the user of [LevelMetas]. -type LevelMetaVec = [LevelMeta; MAX_LEVEL as usize]; - -/// Metadata of all SSTs under a region. -/// -/// Files are organized into multiple level, though there may be only one level. -#[derive(Clone)] -pub struct LevelMetas { - levels: LevelMetaVec, - sst_layer: AccessLayerRef, - file_purger: FilePurgerRef, - /// Compaction time window in seconds - compaction_time_window: Option, -} - -impl std::fmt::Debug for LevelMetas { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("LevelMetas") - .field("levels", &self.levels) - .field("compaction_time_window", &self.compaction_time_window) - .finish() - } -} - -impl LevelMetas { - /// Create a new LevelMetas and initialized each level. - pub fn new(sst_layer: AccessLayerRef, file_purger: FilePurgerRef) -> LevelMetas { - LevelMetas { - levels: new_level_meta_vec(), - sst_layer, - file_purger, - compaction_time_window: Default::default(), - } - } - - /// Returns total level number. - #[inline] - pub fn level_num(&self) -> usize { - self.levels.len() - } - - pub fn compaction_time_window(&self) -> Option { - self.compaction_time_window - } - - #[inline] - pub fn level(&self, level: Level) -> &LevelMeta { - &self.levels[level as usize] - } - - /// Merge `self` with files to add/remove to create a new [LevelMetas]. - /// - /// # Panics - /// Panics if level of [FileHandle] is greater than [MAX_LEVEL]. - pub fn merge( - &self, - files_to_add: impl Iterator, - files_to_remove: impl Iterator, - compaction_time_window: Option, - ) -> LevelMetas { - let mut merged = self.clone(); - for file in files_to_add { - let level = file.level; - let handle = FileHandle::new(file, self.sst_layer.clone(), self.file_purger.clone()); - merged.levels[level as usize].add_file(handle); - } - - for file in files_to_remove { - let level = file.level; - if let Some(removed_file) = merged.levels[level as usize].remove_file(file.file_id) { - removed_file.mark_deleted(); - } - } - // we only update region's compaction time window iff region's window is not set and VersionEdit's - // compaction time window is present. - if let Some(window) = compaction_time_window { - let _ = merged.compaction_time_window.get_or_insert(window); - } - merged - } - - pub fn mark_all_files_deleted(&self) -> Vec { - self.levels().iter().fold(vec![], |mut files, level| { - files.extend(level.files().map(|f| { - f.mark_deleted(); - f.file_id() - })); - files - }) - } - - pub fn levels(&self) -> &[LevelMeta] { - &self.levels - } - - pub fn file_purger(&self) -> FilePurgerRef { - self.file_purger.clone() - } -} - -/// Metadata of files in same SST level. -#[derive(Default, Clone)] -pub struct LevelMeta { - level: Level, - /// Handles to the files in this level. - // TODO(yingwen): Now for simplicity, files are unordered, maybe sort the files by time range - // or use another structure to hold them. - files: HashMap, -} - -impl std::fmt::Debug for LevelMeta { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("LevelMeta") - .field("level", &self.level) - .field("files", &self.files.keys()) - .finish() - } -} - -impl LevelMeta { - pub fn new(level: Level) -> Self { - Self { - level, - files: HashMap::new(), - } - } - - fn add_file(&mut self, file: FileHandle) { - let _ = self.files.insert(file.file_id(), file); - } - - fn remove_file(&mut self, file_to_remove: FileId) -> Option { - self.files.remove(&file_to_remove) - } - - /// Returns the level of level meta. - #[inline] - pub fn level(&self) -> Level { - self.level - } - - /// Returns number of SST files in level. - #[inline] - pub fn file_num(&self) -> usize { - self.files.len() - } - - /// Returns expired SSTs from current level. - pub fn get_expired_files(&self, expire_time: &Timestamp) -> Vec { - self.files - .iter() - .filter_map(|(_, v)| { - let Some((_, end)) = v.time_range() else { - return None; - }; - if end < expire_time { - Some(v.clone()) - } else { - None - } - }) - .collect() - } - - pub fn files(&self) -> impl Iterator { - self.files.values() - } -} - -fn new_level_meta_vec() -> LevelMetaVec { - (0u8..MAX_LEVEL) - .map(LevelMeta::new) - .collect::>() - .try_into() - .unwrap() // safety: LevelMetaVec is a fixed length array with length MAX_LEVEL -} - -#[derive(Clone)] -pub struct FileHandle { - inner: Arc, -} - -impl Debug for FileHandle { - fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - f.debug_struct("FileHandle") - .field("file_id", &self.inner.meta.file_id) - .field("region_id", &self.inner.meta.region_id) - .field("time_range", &self.inner.meta.time_range) - .field("size", &self.inner.meta.file_size) - .field("level", &self.inner.meta.level) - .field("compacting", &self.inner.compacting) - .field("deleted", &self.inner.deleted) - .finish() - } -} - -impl FileHandle { - pub fn new( - meta: FileMeta, - sst_layer: AccessLayerRef, - file_purger: FilePurgerRef, - ) -> FileHandle { - FileHandle { - inner: Arc::new(FileHandleInner::new(meta, sst_layer, file_purger)), - } - } - - /// Returns level as usize so it can be used as index. - #[inline] - pub fn level(&self) -> Level { - self.inner.meta.level - } - - #[inline] - pub fn file_name(&self) -> String { - self.inner.meta.file_id.as_parquet() - } - - #[inline] - pub fn file_path(&self) -> String { - self.inner - .sst_layer - .sst_file_path(&self.inner.meta.file_id.as_parquet()) - } - - #[inline] - pub fn file_id(&self) -> FileId { - self.inner.meta.file_id - } - - #[inline] - pub fn time_range(&self) -> &Option<(Timestamp, Timestamp)> { - &self.inner.meta.time_range - } - - /// Returns true if current file is under compaction. - #[inline] - pub fn compacting(&self) -> bool { - self.inner.compacting.load(Ordering::Relaxed) - } - - /// Sets the compacting flag. - #[inline] - pub fn mark_compacting(&self, compacting: bool) { - self.inner.compacting.store(compacting, Ordering::Relaxed); - } - - #[inline] - pub fn deleted(&self) -> bool { - self.inner.deleted.load(Ordering::Relaxed) - } - - #[inline] - pub fn mark_deleted(&self) { - self.inner.deleted.store(true, Ordering::Relaxed); - } - - #[inline] - pub fn meta(&self) -> FileMeta { - self.inner.meta.clone() - } - - #[inline] - pub fn file_size(&self) -> u64 { - self.inner.meta.file_size - } -} - -/// Actually data of [FileHandle]. -/// -/// Contains meta of the file, and other mutable info like metrics. -struct FileHandleInner { - meta: FileMeta, - compacting: AtomicBool, - deleted: AtomicBool, - sst_layer: AccessLayerRef, - file_purger: FilePurgerRef, -} - -impl fmt::Debug for FileHandleInner { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("FileHandleInner") - .field("meta", &self.meta) - .field("compacting", &self.compacting) - .field("deleted", &self.deleted) - .finish() - } -} - -impl Drop for FileHandleInner { - fn drop(&mut self) { - if self.deleted.load(Ordering::Relaxed) { - let request = FilePurgeRequest { - sst_layer: self.sst_layer.clone(), - file_id: self.meta.file_id, - region_id: self.meta.region_id, - }; - match self.file_purger.schedule(request) { - Ok(res) => { - debug!( - "Scheduled SST purge task, region: {}, name: {}, res: {}", - self.meta.region_id, - self.meta.file_id.as_parquet(), - res - ); - } - Err(e) => { - error!(e; "Failed to schedule SST purge task, region: {}, name: {}", - self.meta.region_id, self.meta.file_id.as_parquet()); - } - } - } - } -} - -impl FileHandleInner { - fn new( - meta: FileMeta, - sst_layer: AccessLayerRef, - file_purger: FilePurgerRef, - ) -> FileHandleInner { - FileHandleInner { - meta, - compacting: AtomicBool::new(false), - deleted: AtomicBool::new(false), - sst_layer, - file_purger, - } - } -} - -#[derive(Debug, Snafu, PartialEq)] -pub struct ParseIdError { - source: uuid::Error, -} - -/// Unique id for [SST File]. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] -pub struct FileId(Uuid); - -impl FileId { - /// Returns a new unique [FileId] randomly. - pub fn random() -> FileId { - FileId(Uuid::new_v4()) - } - - /// Parses id from string. - pub fn parse_str(input: &str) -> std::result::Result { - Uuid::parse_str(input).map(FileId).context(ParseIdSnafu) - } - - /// Append `.parquet` to file id to make a complete file name - pub fn as_parquet(&self) -> String { - format!("{}{}", self.0.hyphenated(), ".parquet") - } -} - -impl fmt::Display for FileId { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.0) - } -} - -impl FromStr for FileId { - type Err = ParseIdError; - - fn from_str(s: &str) -> std::result::Result { - FileId::parse_str(s) - } -} - -/// Immutable metadata of a sst file. -#[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] -#[serde(default)] -pub struct FileMeta { - /// Region of file. - pub region_id: RegionId, - /// Compared to normal file names, FileId ignore the extension - #[serde(deserialize_with = "deserialize_from_string")] - #[serde(alias = "file_name")] - pub file_id: FileId, - /// Timestamp range of file. - pub time_range: Option<(Timestamp, Timestamp)>, - /// SST level of the file. - pub level: Level, - /// Size of the file. - pub file_size: u64, -} - -fn deserialize_from_string<'de, D>(deserializer: D) -> std::result::Result -where - D: Deserializer<'de>, -{ - let s: &str = Deserialize::deserialize(deserializer)?; - let stripped = s.strip_suffix(".parquet").unwrap_or(s); // strip parquet suffix if needed. - FileId::from_str(stripped).map_err(::custom) -} - -#[derive(Debug)] -pub struct WriteOptions { - // TODO(yingwen): [flush] row group size. - pub sst_write_buffer_size: ReadableSize, -} - -impl Default for WriteOptions { - fn default() -> Self { - Self { - sst_write_buffer_size: ReadableSize::mb(8), - } - } -} - -pub struct ReadOptions { - /// Suggested size of each batch. - pub batch_size: usize, - /// The schema that user expected to read, might not the same as the - /// schema of the SST file. - pub projected_schema: ProjectedSchemaRef, - - pub predicate: Predicate, - pub time_range: TimestampRange, -} - -#[derive(Debug, PartialEq)] -pub struct SstInfo { - pub time_range: Option<(Timestamp, Timestamp)>, - pub file_size: u64, - pub num_rows: usize, -} - -/// SST access layer. -#[async_trait] -pub trait AccessLayer: Send + Sync + std::fmt::Debug { - /// Returns the sst file path. - fn sst_file_path(&self, file_name: &str) -> String; - - /// Writes SST file with given `file_id` and returns the SST info. - /// If source does not contain any data, `write_sst` will return `Ok(None)`. - async fn write_sst( - &self, - file_id: FileId, - source: Source, - opts: &WriteOptions, - ) -> Result>; - - /// Read SST file with given `file_handle` and schema. - async fn read_sst( - &self, - file_handle: FileHandle, - opts: &ReadOptions, - ) -> Result; - - /// Deletes a SST file with given name. - async fn delete_sst(&self, file_id: FileId) -> Result<()>; -} - -pub type AccessLayerRef = Arc; - -/// Parquet writer data source. -pub enum Source { - /// Writes rows from memtable to parquet - Iter(BoxedBatchIterator), - /// Writes row from ChunkReaderImpl (maybe a set of SSTs) to parquet. - Reader(ChunkReaderImpl), - /// Record batch stream yielded by table scan - Stream(SendableRecordBatchStream), -} - -impl Source { - async fn next_batch(&mut self) -> Result> { - match self { - Source::Iter(iter) => iter.next().transpose(), - Source::Reader(reader) => reader - .next_chunk() - .await - .map(|p| p.map(|chunk| Batch::new(chunk.columns))), - Source::Stream(stream) => stream - .next() - .await - .transpose() - .map(|r| r.map(|r| Batch::new(r.columns().to_vec()))) - .context(error::CreateRecordBatchSnafu), - } - } - - fn schema(&self) -> SchemaRef { - match self { - Source::Iter(iter) => { - let projected_schema = iter.schema(); - projected_schema.schema_to_read().schema().clone() - } - Source::Reader(reader) => reader.projected_schema().schema_to_read().schema().clone(), - Source::Stream(stream) => stream.schema(), - } - } -} - -/// Sst access layer. -pub struct FsAccessLayer { - sst_dir: String, - object_store: ObjectStore, -} - -impl fmt::Debug for FsAccessLayer { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("FsAccessLayer") - .field("sst_dir", &self.sst_dir) - .finish() - } -} - -impl FsAccessLayer { - pub fn new(sst_dir: &str, object_store: ObjectStore) -> FsAccessLayer { - FsAccessLayer { - sst_dir: util::normalize_dir(sst_dir), - object_store, - } - } -} - -#[async_trait] -impl AccessLayer for FsAccessLayer { - fn sst_file_path(&self, file_name: &str) -> String { - format!("{}{}", self.sst_dir, file_name) - } - - /// Writes SST file with given `file_id`. - async fn write_sst( - &self, - file_id: FileId, - source: Source, - opts: &WriteOptions, - ) -> Result> { - // Now we only supports parquet format. We may allow caller to specific SST format in - // WriteOptions in the future. - let file_path = self.sst_file_path(&file_id.as_parquet()); - let writer = ParquetWriter::new(&file_path, source, self.object_store.clone()); - writer.write_sst(opts).await - } - - /// Read SST file with given `file_handle` and schema. - async fn read_sst( - &self, - file_handle: FileHandle, - opts: &ReadOptions, - ) -> Result { - let reader = ParquetReader::new( - file_handle, - self.object_store.clone(), - opts.projected_schema.clone(), - opts.predicate.clone(), - opts.time_range, - ); - - Ok(Box::new(LazyParquetBatchReader::new(reader))) - } - - /// Deletes a SST file with given file id. - async fn delete_sst(&self, file_id: FileId) -> Result<()> { - let path = self.sst_file_path(&file_id.as_parquet()); - self.object_store - .delete(&path) - .await - .context(DeleteSstSnafu) - } -} - -struct LazyParquetBatchReader { - inner: ParquetReader, - stream: Option, -} - -impl LazyParquetBatchReader { - fn new(inner: ParquetReader) -> Self { - Self { - inner, - stream: None, - } - } -} - -#[async_trait] -impl BatchReader for LazyParquetBatchReader { - async fn next_batch(&mut self) -> Result> { - if let Some(s) = &mut self.stream { - s.next_batch().await - } else { - let mut stream = self.inner.chunk_stream().await?; - let res = stream.next_batch().await; - self.stream = Some(stream); - res - } - } -} - -#[cfg(test)] -mod tests { - use std::collections::HashSet; - - use super::*; - use crate::file_purger::noop::NoopFilePurgeHandler; - use crate::scheduler::{LocalScheduler, SchedulerConfig}; - - #[test] - fn test_file_id() { - let id = FileId::random(); - let uuid_str = id.to_string(); - assert_eq!(id.0.to_string(), uuid_str); - - let parsed = FileId::parse_str(&uuid_str).unwrap(); - assert_eq!(id, parsed); - let parsed = uuid_str.parse().unwrap(); - assert_eq!(id, parsed); - } - - #[test] - fn test_file_id_serialization() { - let id = FileId::random(); - let json = serde_json::to_string(&id).unwrap(); - assert_eq!(format!("\"{id}\""), json); - - let parsed = serde_json::from_str(&json).unwrap(); - assert_eq!(id, parsed); - } - - #[test] - fn test_deserialize_file_meta() { - let file_meta = create_file_meta(FileId::random(), 0); - let serialized_file_meta = serde_json::to_string(&file_meta).unwrap(); - let deserialized_file_meta = serde_json::from_str(&serialized_file_meta); - assert_eq!(file_meta, deserialized_file_meta.unwrap()); - } - - #[test] - fn test_deserialize_from_string() { - let json_file_meta = "{\"region_id\":0,\"file_id\":\"bc5896ec-e4d8-4017-a80d-f2de73188d55\",\"time_range\":null,\"level\":0}"; - let file_meta = create_file_meta( - FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap(), - 0, - ); - let deserialized_file_meta: FileMeta = serde_json::from_str(json_file_meta).unwrap(); - assert_eq!(file_meta, deserialized_file_meta); - } - #[test] - fn test_deserialize_from_string_parquet() { - let json_file_meta = "{\"region_id\":0,\"file_id\":\"bc5896ec-e4d8-4017-a80d-f2de73188d55.parquet\",\"time_range\":null,\"level\":0}"; - let file_meta = create_file_meta( - FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap(), - 0, - ); - let deserialized_file_meta: FileMeta = serde_json::from_str(json_file_meta).unwrap(); - assert_eq!(file_meta, deserialized_file_meta); - } - - #[test] - fn test_deserialize_from_string_parquet_file_name() { - let json_file_meta = "{\"region_id\":0,\"file_name\":\"bc5896ec-e4d8-4017-a80d-f2de73188d55.parquet\",\"time_range\":null,\"level\":0}"; - let file_meta = create_file_meta( - FileId::from_str("bc5896ec-e4d8-4017-a80d-f2de73188d55").unwrap(), - 0, - ); - let deserialized_file_meta: FileMeta = serde_json::from_str(json_file_meta).unwrap(); - assert_eq!(file_meta, deserialized_file_meta); - } - - #[test] - fn test_file_id_as_parquet() { - let id = FileId::from_str("67e55044-10b1-426f-9247-bb680e5fe0c8").unwrap(); - assert_eq!( - "67e55044-10b1-426f-9247-bb680e5fe0c8.parquet", - id.as_parquet() - ); - } - - fn create_file_meta(file_id: FileId, level: Level) -> FileMeta { - FileMeta { - region_id: 0.into(), - file_id, - time_range: None, - level, - file_size: 0, - } - } - - #[test] - fn test_level_metas_add_and_remove() { - let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {}); - let purger = Arc::new(LocalScheduler::new( - SchedulerConfig::default(), - NoopFilePurgeHandler, - )); - let file_ids = [ - FileId::random(), - FileId::random(), - FileId::random(), - FileId::random(), - ]; - - let metas = LevelMetas::new(layer, purger); - let merged = metas.merge( - vec![ - create_file_meta(file_ids[0], 0), - create_file_meta(file_ids[1], 0), - ] - .into_iter(), - vec![].into_iter(), - None, - ); - - assert_eq!( - HashSet::from([file_ids[0], file_ids[1]]), - merged.level(0).files().map(|f| f.file_id()).collect() - ); - - let merged1 = merged.merge( - vec![ - create_file_meta(file_ids[2], 1), - create_file_meta(file_ids[3], 1), - ] - .into_iter(), - vec![].into_iter(), - None, - ); - assert_eq!( - HashSet::from([file_ids[0], file_ids[1]]), - merged1.level(0).files().map(|f| f.file_id()).collect() - ); - - assert_eq!( - HashSet::from([file_ids[2], file_ids[3]]), - merged1.level(1).files().map(|f| f.file_id()).collect() - ); - - let removed1 = merged1.merge( - vec![].into_iter(), - vec![ - create_file_meta(file_ids[0], 0), - create_file_meta(file_ids[2], 0), - ] - .into_iter(), - None, - ); - assert_eq!( - HashSet::from([file_ids[1]]), - removed1.level(0).files().map(|f| f.file_id()).collect() - ); - - assert_eq!( - HashSet::from([file_ids[2], file_ids[3]]), - removed1.level(1).files().map(|f| f.file_id()).collect() - ); - - let removed2 = removed1.merge( - vec![].into_iter(), - vec![ - create_file_meta(file_ids[2], 1), - create_file_meta(file_ids[3], 1), - ] - .into_iter(), - None, - ); - assert_eq!( - HashSet::from([file_ids[1]]), - removed2.level(0).files().map(|f| f.file_id()).collect() - ); - - assert_eq!( - HashSet::new(), - removed2.level(1).files().map(|f| f.file_id()).collect() - ); - } -} diff --git a/src/storage/src/sst/parquet.rs b/src/storage/src/sst/parquet.rs deleted file mode 100644 index fa0cb9c56e0e..000000000000 --- a/src/storage/src/sst/parquet.rs +++ /dev/null @@ -1,819 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Parquet sst format. - -use std::collections::HashMap; -use std::pin::Pin; -use std::sync::Arc; - -use async_compat::CompatExt; -use async_stream::try_stream; -use async_trait::async_trait; -use common_telemetry::{debug, error}; -use common_time::range::TimestampRange; -use common_time::Timestamp; -use datatypes::arrow::record_batch::RecordBatch; -use datatypes::prelude::ConcreteDataType; -use futures_util::{Stream, StreamExt, TryStreamExt}; -use object_store::ObjectStore; -use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask}; -use parquet::basic::{Compression, Encoding, ZstdLevel}; -use parquet::file::metadata::KeyValue; -use parquet::file::properties::WriterProperties; -use parquet::format::FileMetaData; -use parquet::schema::types::ColumnPath; -use snafu::{OptionExt, ResultExt}; -use store_api::storage::consts::SEQUENCE_COLUMN_NAME; -use table::predicate::Predicate; -use tokio::io::BufReader; - -use crate::error::{self, DecodeParquetTimeRangeSnafu, ReadObjectSnafu, ReadParquetSnafu, Result}; -use crate::read::{Batch, BatchReader}; -use crate::schema::compat::ReadAdapter; -use crate::schema::{ProjectedSchemaRef, StoreSchema}; -use crate::sst; -use crate::sst::pruning::build_row_filter; -use crate::sst::stream_writer::BufferedWriter; -use crate::sst::{FileHandle, Source, SstInfo}; - -/// Parquet sst writer. -pub struct ParquetWriter<'a> { - file_path: &'a str, - source: Source, - object_store: ObjectStore, - max_row_group_size: usize, -} - -impl<'a> ParquetWriter<'a> { - pub fn new(file_path: &'a str, source: Source, object_store: ObjectStore) -> ParquetWriter { - ParquetWriter { - file_path, - source, - object_store, - max_row_group_size: 4096, // TODO(hl): make this configurable - } - } - - pub async fn write_sst(self, opts: &sst::WriteOptions) -> Result> { - self.write_rows(None, opts).await - } - - /// Iterates memtable and writes rows to Parquet file. - /// A chunk of records yielded from each iteration with a size given - /// in config will be written to a single row group. - async fn write_rows( - mut self, - extra_meta: Option>, - opts: &sst::WriteOptions, - ) -> Result> { - let schema = self.source.schema(); - - let mut props_builder = WriterProperties::builder() - .set_compression(Compression::ZSTD(ZstdLevel::default())) - .set_encoding(Encoding::PLAIN) - .set_max_row_group_size(self.max_row_group_size) - .set_key_value_metadata(extra_meta.map(|map| { - map.iter() - .map(|(k, v)| KeyValue::new(k.clone(), v.clone())) - .collect::>() - })) - .set_column_encoding( - ColumnPath::new(vec![SEQUENCE_COLUMN_NAME.to_string()]), - Encoding::DELTA_BINARY_PACKED, - ) - .set_column_dictionary_enabled( - ColumnPath::new(vec![SEQUENCE_COLUMN_NAME.to_string()]), - false, - ); - - if let Some(ts_col) = schema.timestamp_column() { - props_builder = props_builder.set_column_encoding( - ColumnPath::new(vec![ts_col.name.clone()]), - Encoding::DELTA_BINARY_PACKED, - ); - } - - let writer_props = props_builder.build(); - - let mut buffered_writer = BufferedWriter::try_new( - self.file_path.to_string(), - self.object_store.clone(), - &schema, - Some(writer_props), - opts.sst_write_buffer_size.as_bytes() as usize, - ) - .await?; - let mut rows_written = 0; - - while let Some(batch) = self.source.next_batch().await? { - buffered_writer.write(&batch).await?; - rows_written += batch.num_rows(); - } - - if rows_written == 0 { - debug!("No data written, try abort writer: {}", self.file_path); - let _ = buffered_writer.close().await?; - return Ok(None); - } - - let (file_meta, file_size) = buffered_writer.close().await?; - let time_range = decode_timestamp_range(&file_meta, &schema).ok().flatten(); - - // object_store.write will make sure all bytes are written or an error is raised. - Ok(Some(SstInfo { - time_range, - file_size, - num_rows: rows_written, - })) - } -} - -fn decode_timestamp_range( - file_meta: &FileMetaData, - schema: &datatypes::schema::SchemaRef, -) -> Result> { - let (Some(ts_col_idx), Some(ts_col)) = (schema.timestamp_index(), schema.timestamp_column()) - else { - return Ok(None); - }; - let ts_datatype = &ts_col.data_type; - decode_timestamp_range_inner(file_meta, ts_col_idx, ts_datatype) -} - -fn decode_timestamp_range_inner( - file_meta: &FileMetaData, - ts_index: usize, - ts_datatype: &ConcreteDataType, -) -> Result> { - let mut start = i64::MAX; - let mut end = i64::MIN; - - let unit = match ts_datatype { - ConcreteDataType::Timestamp(type_) => type_.unit(), - _ => { - return DecodeParquetTimeRangeSnafu { - msg: format!("Unexpected timestamp column datatype: {ts_datatype:?}"), - } - .fail(); - } - }; - - for rg in &file_meta.row_groups { - let Some(ref metadata) = rg - .columns - .get(ts_index) - .context(DecodeParquetTimeRangeSnafu { - msg: format!("Cannot find ts column by index: {ts_index}"), - })? - .meta_data - else { - return Ok(None); - }; - let Some(stats) = &metadata.statistics else { - return Ok(None); - }; - let (Some(min_value), Some(max_value)) = (&stats.min_value, &stats.max_value) else { - return Ok(None); - }; - - // according to [parquet's spec](https://parquet.apache.org/docs/file-format/data-pages/encodings/), min/max value in stats uses plain encoding with little endian. - // also see https://github.com/apache/arrow-rs/blob/5fb337db04a1a19f7d40da46f19b7b5fd4051593/parquet/src/file/statistics.rs#L172 - let min = i64::from_le_bytes(min_value[..8].try_into().map_err(|e| { - error!( - "Failed to decode min value from stats, bytes: {:?}, source: {:?}", - min_value, e - ); - DecodeParquetTimeRangeSnafu { - msg: "decode min value", - } - .build() - })?); - let max = i64::from_le_bytes(max_value[..8].try_into().map_err(|e| { - error!( - "Failed to decode max value from stats, bytes: {:?}, source: {:?}", - max_value, e - ); - DecodeParquetTimeRangeSnafu { - msg: "decode max value", - } - .build() - })?); - start = start.min(min); - end = end.max(max); - } - - assert!( - start <= end, - "Illegal timestamp range decoded from SST file {:?}, start: {}, end: {}", - file_meta, - start, - end - ); - Ok(Some(( - Timestamp::new(start, unit), - Timestamp::new(end, unit), - ))) -} - -pub struct ParquetReader { - // Holds the file handle to avoid the file purge purge it. - file_handle: FileHandle, - object_store: ObjectStore, - projected_schema: ProjectedSchemaRef, - predicate: Predicate, - time_range: TimestampRange, -} - -impl ParquetReader { - pub fn new( - file_handle: FileHandle, - object_store: ObjectStore, - projected_schema: ProjectedSchemaRef, - predicate: Predicate, - time_range: TimestampRange, - ) -> ParquetReader { - ParquetReader { - file_handle, - object_store, - projected_schema, - predicate, - time_range, - } - } - - pub async fn chunk_stream(&self) -> Result { - let file_path = self.file_handle.file_path(); - let operator = self.object_store.clone(); - - let reader = operator - .reader(&file_path) - .await - .context(ReadObjectSnafu { path: &file_path })? - .compat(); - let buf_reader = BufReader::new(reader); - let builder = ParquetRecordBatchStreamBuilder::new(buf_reader) - .await - .context(ReadParquetSnafu { file: &file_path })?; - let arrow_schema = builder.schema().clone(); - - let store_schema = Arc::new( - StoreSchema::try_from(arrow_schema) - .context(error::ConvertStoreSchemaSnafu { file: &file_path })?, - ); - - let adapter = ReadAdapter::new(store_schema.clone(), self.projected_schema.clone())?; - - let pruned_row_groups = self - .predicate - .prune_row_groups( - builder.metadata().row_groups(), - store_schema.schema().clone(), - ) - .into_iter() - .enumerate() - .filter_map(|(idx, valid)| if valid { Some(idx) } else { None }) - .collect::>(); - - let parquet_schema_desc = builder.metadata().file_metadata().schema_descr_ptr(); - - let projection_mask = ProjectionMask::roots(&parquet_schema_desc, adapter.fields_to_read()); - let mut builder = builder - .with_projection(projection_mask.clone()) - .with_row_groups(pruned_row_groups); - - if let Some(row_filter) = build_row_filter( - self.time_range, - &self.predicate, - &store_schema, - &parquet_schema_desc, - projection_mask, - ) { - builder = builder.with_row_filter(row_filter); - } - - let mut stream = builder - .build() - .context(ReadParquetSnafu { file: &file_path })?; - - let chunk_stream = try_stream!({ - while let Some(res) = stream.next().await { - yield res.context(ReadParquetSnafu { file: &file_path })? - } - }); - - ChunkStream::new(self.file_handle.clone(), adapter, Box::pin(chunk_stream)) - } -} - -pub type SendableChunkStream = Pin> + Send>>; - -pub struct ChunkStream { - // Holds the file handle in the stream to avoid the purger purge it. - _file_handle: FileHandle, - adapter: ReadAdapter, - stream: SendableChunkStream, -} - -impl ChunkStream { - pub fn new( - file_handle: FileHandle, - adapter: ReadAdapter, - stream: SendableChunkStream, - ) -> Result { - Ok(Self { - _file_handle: file_handle, - adapter, - stream, - }) - } -} - -#[async_trait] -impl BatchReader for ChunkStream { - async fn next_batch(&mut self) -> Result> { - self.stream - .try_next() - .await? - .map(|rb| self.adapter.arrow_record_batch_to_batch(&rb)) - .transpose() - } -} - -#[cfg(test)] -mod tests { - use std::ops::Range; - use std::sync::Arc; - - use api::v1::OpType; - use common_base::readable_size::ReadableSize; - use common_test_util::temp_dir::create_temp_dir; - use common_time::timestamp::TimeUnit; - use datatypes::arrow::array::{Array, UInt64Array, UInt8Array}; - use datatypes::prelude::{ScalarVector, Vector}; - use datatypes::types::{TimestampMillisecondType, TimestampType}; - use datatypes::vectors::TimestampMillisecondVector; - use object_store::services::Fs; - - use super::*; - use crate::file_purger::noop::new_noop_file_purger; - use crate::memtable::{ - tests as memtable_tests, DefaultMemtableBuilder, IterContext, MemtableBuilder, - }; - use crate::schema::ProjectedSchema; - use crate::sst::{FileId, FileMeta}; - - fn create_object_store(root: &str) -> ObjectStore { - let mut builder = Fs::default(); - let _ = builder.root(root); - ObjectStore::new(builder).unwrap().finish() - } - - #[tokio::test] - async fn test_parquet_writer() { - common_telemetry::init_default_ut_logging(); - let schema = memtable_tests::schema_for_test(); - let memtable = DefaultMemtableBuilder::default().build(schema); - - memtable_tests::write_kvs( - &*memtable, - 10, // sequence - OpType::Put, - &[1000, 1002, 2002, 2003, 2003, 1001], // keys - &[ - (Some(1), Some(1234)), - (Some(2), Some(1234)), - (Some(7), Some(1234)), - (Some(8), Some(1234)), - (Some(9), Some(1234)), - (Some(3), Some(1234)), - ], // values - ); - - let dir = create_temp_dir("write_parquet"); - let path = dir.path().to_str().unwrap(); - - let object_store = create_object_store(path); - let sst_file_name = "test-flush.parquet"; - let iter = memtable.iter(IterContext::default()).unwrap(); - let writer = ParquetWriter::new(sst_file_name, Source::Iter(iter), object_store.clone()); - - assert!(writer - .write_sst(&sst::WriteOptions::default()) - .await - .is_ok()); - - // verify parquet file - let reader = BufReader::new(object_store.reader(sst_file_name).await.unwrap().compat()); - - let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap(); - - let mut stream = builder.build().unwrap(); - // chunk schema: timestamp, v1, __sequence, __op_type - let chunk = stream.next().await.unwrap().unwrap(); - assert_eq!(5, chunk.columns().len()); - - // timestamp - assert_eq!( - &TimestampMillisecondVector::from_slice([ - 1000.into(), - 1001.into(), - 1002.into(), - 2002.into(), - 2003.into(), - ]) - .to_arrow_array(), - chunk.column(0) - ); - - // v0 - assert_eq!( - &(Arc::new(UInt64Array::from(vec![1, 3, 2, 7, 9])) as Arc), - chunk.column(1) - ); - - // v1 - assert_eq!( - &(Arc::new(UInt64Array::from(vec![1234; 5])) as Arc), - chunk.column(2) - ); - - // sequence - assert_eq!( - &(Arc::new(UInt64Array::from(vec![10; 5])) as Arc), - chunk.column(3) - ); - - // op_type - assert_eq!( - &(Arc::new(UInt8Array::from(vec![1; 5])) as Arc), - chunk.column(4) - ); - } - - #[tokio::test] - async fn test_write_large_data() { - common_telemetry::init_default_ut_logging(); - let schema = memtable_tests::schema_for_test(); - let memtable = DefaultMemtableBuilder::default().build(schema); - - let mut rows_written = 0; - for i in 0..16 { - let range: Range = i * 1024..(i + 1) * 1024; - let keys = range.clone().collect::>(); - let values = range - .map(|idx| (Some(idx as u64), Some(idx as u64))) - .collect::>(); - memtable_tests::write_kvs(&*memtable, i as u64, OpType::Put, &keys, &values); - rows_written += keys.len(); - } - - let dir = create_temp_dir("write_large_parquet"); - let path = dir.path().to_str().unwrap(); - - let object_store = create_object_store(path); - let sst_file_name = "test-large.parquet"; - let iter = memtable.iter(IterContext::default()).unwrap(); - let writer = ParquetWriter::new(sst_file_name, Source::Iter(iter), object_store.clone()); - - let sst_info = writer - .write_sst(&sst::WriteOptions { - sst_write_buffer_size: ReadableSize::kb(4), - }) - .await - .unwrap() - .unwrap(); - let file_meta = object_store.stat(sst_file_name).await.unwrap(); - assert!(file_meta.is_file()); - assert_eq!(sst_info.file_size, file_meta.content_length()); - assert_eq!(rows_written, sst_info.num_rows); - } - - #[tokio::test] - async fn test_parquet_read_large_batch() { - common_telemetry::init_default_ut_logging(); - let schema = memtable_tests::schema_for_test(); - let memtable = DefaultMemtableBuilder::default().build(schema.clone()); - - let rows_total = 4096 * 4; - let mut keys_vec = Vec::with_capacity(rows_total); - let mut values_vec = Vec::with_capacity(rows_total); - - for i in 0..rows_total { - keys_vec.push(i as i64); - values_vec.push((Some(i as u64), Some(i as u64))); - } - - memtable_tests::write_kvs( - &*memtable, - 10, // sequence - OpType::Put, - &keys_vec, // keys - &values_vec, // values - ); - - let dir = create_temp_dir("write_parquet"); - let path = dir.path().to_str().unwrap(); - let object_store = create_object_store(path); - let sst_file_handle = new_file_handle(FileId::random()); - let sst_file_name = sst_file_handle.file_name(); - let iter = memtable.iter(IterContext::default()).unwrap(); - let writer = ParquetWriter::new(&sst_file_name, Source::Iter(iter), object_store.clone()); - - let SstInfo { - time_range, - file_size, - .. - } = writer - .write_sst(&sst::WriteOptions::default()) - .await - .unwrap() - .unwrap(); - - assert_eq!( - Some(( - Timestamp::new_millisecond(0), - Timestamp::new_millisecond((rows_total - 1) as i64) - )), - time_range - ); - assert_ne!(file_size, 0); - let operator = create_object_store(dir.path().to_str().unwrap()); - - let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap()); - let reader = ParquetReader::new( - sst_file_handle, - operator, - projected_schema, - Predicate::empty(), - TimestampRange::min_to_max(), - ); - - let mut rows_fetched = 0; - let mut stream = reader.chunk_stream().await.unwrap(); - while let Some(res) = stream.next_batch().await.unwrap() { - rows_fetched += res.num_rows(); - } - assert_eq!(rows_total, rows_fetched); - } - - fn new_file_handle(file_id: FileId) -> FileHandle { - let file_purger = new_noop_file_purger(); - let layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer {}); - FileHandle::new( - FileMeta { - region_id: 0.into(), - file_id, - time_range: Some(( - Timestamp::new_millisecond(0), - Timestamp::new_millisecond(1000), - )), - level: 0, - file_size: 0, - }, - layer, - file_purger, - ) - } - - #[tokio::test] - async fn test_parquet_reader() { - common_telemetry::init_default_ut_logging(); - let schema = memtable_tests::schema_for_test(); - let memtable = DefaultMemtableBuilder::default().build(schema.clone()); - - memtable_tests::write_kvs( - &*memtable, - 10, // sequence - OpType::Put, - &[1000, 1002, 2002, 2003, 2003, 1001], // keys - &[ - (Some(1), Some(1234)), - (Some(2), Some(1234)), - (Some(7), Some(1234)), - (Some(8), Some(1234)), - (Some(9), Some(1234)), - (Some(3), Some(1234)), - ], // values - ); - - let dir = create_temp_dir("write_parquet"); - let path = dir.path().to_str().unwrap(); - - let object_store = create_object_store(path); - let file_handle = new_file_handle(FileId::random()); - let sst_file_name = file_handle.file_name(); - let iter = memtable.iter(IterContext::default()).unwrap(); - let writer = ParquetWriter::new(&sst_file_name, Source::Iter(iter), object_store.clone()); - - let SstInfo { - time_range, - file_size, - .. - } = writer - .write_sst(&sst::WriteOptions::default()) - .await - .unwrap() - .unwrap(); - - assert_eq!( - Some(( - Timestamp::new_millisecond(1000), - Timestamp::new_millisecond(2003) - )), - time_range - ); - assert_ne!(file_size, 0); - let operator = create_object_store(dir.path().to_str().unwrap()); - - let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1])).unwrap()); - let reader = ParquetReader::new( - file_handle, - operator, - projected_schema, - Predicate::empty(), - TimestampRange::min_to_max(), - ); - - let mut stream = reader.chunk_stream().await.unwrap(); - assert_eq!( - 5, - stream - .next_batch() - .await - .transpose() - .unwrap() - .unwrap() - .num_rows() - ); - } - - async fn check_range_read( - file_handle: FileHandle, - object_store: ObjectStore, - schema: ProjectedSchemaRef, - range: TimestampRange, - expect: Vec, - ) { - let reader = - ParquetReader::new(file_handle, object_store, schema, Predicate::empty(), range); - let mut stream = reader.chunk_stream().await.unwrap(); - let result = stream.next_batch().await; - - let Some(batch) = result.unwrap() else { - // if batch does not contain any row - assert!(expect.is_empty()); - return; - }; - - assert_eq!( - ConcreteDataType::Timestamp(TimestampType::Millisecond(TimestampMillisecondType)), - batch.column(0).data_type() - ); - - let ts = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap() - .iter_data() - .map(|t| t.unwrap().0.value()) - .collect::>(); - assert_eq!(expect, ts); - } - - #[tokio::test] - async fn test_parquet_reader_with_time_range_filter() { - common_telemetry::init_default_ut_logging(); - let schema = memtable_tests::schema_for_test(); - let memtable = DefaultMemtableBuilder::default().build(schema.clone()); - - memtable_tests::write_kvs( - &*memtable, - 10, // sequence - OpType::Put, - &[1000, 1002, 2002, 2003, 2003, 1001, 3001], // keys - &[ - (Some(1), Some(1234)), - (Some(2), Some(1234)), - (Some(7), Some(1234)), - (Some(8), Some(1234)), - (Some(9), Some(1234)), - (Some(3), Some(1234)), - (Some(7), Some(1234)), - ], // values - ); - - let dir = create_temp_dir("read-parquet-by-range"); - let path = dir.path().to_str().unwrap(); - let object_store = create_object_store(path); - let sst_file_handle = new_file_handle(FileId::random()); - let sst_file_name = sst_file_handle.file_name(); - let iter = memtable.iter(IterContext::default()).unwrap(); - let writer = ParquetWriter::new(&sst_file_name, Source::Iter(iter), object_store.clone()); - - let SstInfo { - time_range, - file_size, - .. - } = writer - .write_sst(&sst::WriteOptions::default()) - .await - .unwrap() - .unwrap(); - - assert_eq!( - Some(( - Timestamp::new_millisecond(1000), - Timestamp::new_millisecond(3001) - )), - time_range - ); - assert_ne!(file_size, 0); - - let projected_schema = Arc::new(ProjectedSchema::new(schema, Some(vec![1, 0, 2])).unwrap()); - - check_range_read( - sst_file_handle.clone(), - object_store.clone(), - projected_schema.clone(), - TimestampRange::with_unit(1000, 2003, TimeUnit::Millisecond).unwrap(), - vec![1000, 1001, 1002, 2002], - ) - .await; - - check_range_read( - sst_file_handle.clone(), - object_store.clone(), - projected_schema.clone(), - TimestampRange::with_unit(2002, 3001, TimeUnit::Millisecond).unwrap(), - vec![2002, 2003], - ) - .await; - - // read a range without any rows. - check_range_read( - sst_file_handle.clone(), - object_store.clone(), - projected_schema.clone(), - TimestampRange::with_unit(3002, 3003, TimeUnit::Millisecond).unwrap(), - vec![], - ) - .await; - - // - check_range_read( - sst_file_handle.clone(), - object_store.clone(), - projected_schema.clone(), - TimestampRange::with_unit(1000, 3000, TimeUnit::Millisecond).unwrap(), - vec![1000, 1001, 1002, 2002, 2003], - ) - .await; - - // read full range - check_range_read( - sst_file_handle, - object_store, - projected_schema, - TimestampRange::min_to_max(), - vec![1000, 1001, 1002, 2002, 2003, 3001], - ) - .await; - } - - #[tokio::test] - async fn test_write_empty_file() { - common_telemetry::init_default_ut_logging(); - let schema = memtable_tests::schema_for_test(); - let memtable = DefaultMemtableBuilder::default().build(schema.clone()); - - let dir = create_temp_dir("write-empty-file"); - let path = dir.path().to_str().unwrap(); - let mut builder = Fs::default(); - let _ = builder.root(path); - let object_store = ObjectStore::new(builder).unwrap().finish(); - let sst_file_name = "test-empty.parquet"; - let iter = memtable.iter(IterContext::default()).unwrap(); - let writer = ParquetWriter::new(sst_file_name, Source::Iter(iter), object_store.clone()); - - let sst_info_opt = writer - .write_sst(&sst::WriteOptions::default()) - .await - .unwrap(); - assert!(sst_info_opt.is_none()); - // The file should not exist when no row has been written. - assert!(!object_store.is_exist(sst_file_name).await.unwrap()); - } -} diff --git a/src/storage/src/sst/pruning.rs b/src/storage/src/sst/pruning.rs deleted file mode 100644 index 6cfbd105f211..000000000000 --- a/src/storage/src/sst/pruning.rs +++ /dev/null @@ -1,415 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use arrow::array::{ - TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, - TimestampSecondArray, -}; -use arrow::datatypes::DataType; -use arrow::error::ArrowError; -use arrow_array::{Array, BooleanArray, RecordBatch}; -use common_time::range::TimestampRange; -use common_time::timestamp::TimeUnit; -use common_time::Timestamp; -use datafusion::physical_plan::PhysicalExpr; -use datatypes::prelude::ConcreteDataType; -use parquet::arrow::arrow_reader::{ArrowPredicate, RowFilter}; -use parquet::arrow::ProjectionMask; -use parquet::schema::types::SchemaDescriptor; -use snafu::ResultExt; -use table::predicate::Predicate; - -use crate::error; -use crate::error::BuildPredicateSnafu; -use crate::schema::StoreSchema; - -/// Builds row filters according to predicates. -pub(crate) fn build_row_filter( - time_range: TimestampRange, - predicate: &Predicate, - store_schema: &Arc, - schema_desc: &SchemaDescriptor, - projection_mask: ProjectionMask, -) -> Option { - let ts_col_idx = store_schema.timestamp_index(); - let ts_col = store_schema.columns().get(ts_col_idx)?; - let ts_col_unit = match &ts_col.desc.data_type { - ConcreteDataType::Timestamp(ts_type) => ts_type.unit(), - _ => unreachable!(), - }; - - let ts_col_projection = ProjectionMask::roots(schema_desc, vec![ts_col_idx]); - - // checks if converting time range unit into ts col unit will result into rounding error. - if time_unit_lossy(&time_range, ts_col_unit) { - let filter = RowFilter::new(vec![Box::new(PlainTimestampRowFilter::new( - time_range, - ts_col_projection, - ))]); - return Some(filter); - } - - // If any of the conversion overflows, we cannot use arrow's computation method, instead - // we resort to plain filter that compares timestamp with given range, less efficient, - // but simpler. - // TODO(hl): If the range is gt_eq/lt, we also use PlainTimestampRowFilter, but these cases - // can also use arrow's gt_eq_scalar/lt_scalar methods. - let time_range_row_filter = if let (Some(lower), Some(upper)) = ( - time_range - .start() - .and_then(|s| s.convert_to(ts_col_unit)) - .map(|t| t.value()), - time_range - .end() - .and_then(|s| s.convert_to(ts_col_unit)) - .map(|t| t.value()), - ) { - Box::new(FastTimestampRowFilter::new(ts_col_projection, lower, upper)) as _ - } else { - Box::new(PlainTimestampRowFilter::new(time_range, ts_col_projection)) as _ - }; - let mut predicates = vec![time_range_row_filter]; - if let Ok(datafusion_filters) = predicate_to_row_filter( - predicate, - projection_mask, - store_schema.schema().arrow_schema(), - ) { - predicates.extend(datafusion_filters); - } - let filter = RowFilter::new(predicates); - Some(filter) -} - -fn predicate_to_row_filter( - predicate: &Predicate, - projection_mask: ProjectionMask, - schema: &arrow::datatypes::SchemaRef, -) -> error::Result>> { - let physical_exprs = predicate - .to_physical_exprs(schema) - .context(BuildPredicateSnafu)?; - let mut datafusion_predicates = Vec::with_capacity(physical_exprs.len()); - for expr in &physical_exprs { - datafusion_predicates.push(Box::new(DatafusionArrowPredicate { - projection_mask: projection_mask.clone(), - physical_expr: expr.clone(), - }) as _); - } - - Ok(datafusion_predicates) -} - -#[derive(Debug)] -struct DatafusionArrowPredicate { - projection_mask: ProjectionMask, - physical_expr: Arc, -} - -impl ArrowPredicate for DatafusionArrowPredicate { - fn projection(&self) -> &ProjectionMask { - &self.projection_mask - } - - fn evaluate(&mut self, batch: RecordBatch) -> Result { - match self - .physical_expr - .evaluate(&batch) - .map(|v| v.into_array(batch.num_rows())) - { - Ok(array) => { - let bool_arr = array - .as_any() - .downcast_ref::() - .ok_or(ArrowError::CastError( - "Physical expr evaluated res is not a boolean array".to_string(), - ))? - .clone(); - Ok(bool_arr) - } - Err(e) => Err(ArrowError::ComputeError(format!( - "Error evaluating filter predicate: {e:?}" - ))), - } - } -} - -fn time_unit_lossy(range: &TimestampRange, ts_col_unit: TimeUnit) -> bool { - range - .start() - .map(|start| start.unit().factor() < ts_col_unit.factor()) - .unwrap_or(false) - || range - .end() - .map(|end| end.unit().factor() < ts_col_unit.factor()) - .unwrap_or(false) -} - -/// `FastTimestampRowFilter` is used to filter rows within given timestamp range when reading -/// row groups from parquet files, while avoids fetching all columns from SSTs file. -struct FastTimestampRowFilter { - lower_bound: i64, - upper_bound: i64, - projection: ProjectionMask, -} - -impl FastTimestampRowFilter { - fn new(projection: ProjectionMask, lower_bound: i64, upper_bound: i64) -> Self { - Self { - lower_bound, - upper_bound, - projection, - } - } -} - -impl ArrowPredicate for FastTimestampRowFilter { - fn projection(&self) -> &ProjectionMask { - &self.projection - } - - /// Selects the rows matching given time range. - fn evaluate(&mut self, batch: RecordBatch) -> Result { - // the projection has only timestamp column, so we can safely take the first column in batch. - let ts_col = batch.column(0); - - macro_rules! downcast_and_compute { - ($typ: ty) => { - { - let ts_col = ts_col - .as_any() - .downcast_ref::<$typ>() - .unwrap(); // safety: we've checked the data type of timestamp column. - let lower_bound = <$typ>::new_scalar(self.lower_bound); - let upper_bound = <$typ>::new_scalar(self.upper_bound); - let left = arrow::compute::kernels::cmp::gt_eq(ts_col, &lower_bound)?; - let right = arrow::compute::kernels::cmp::lt(ts_col, &upper_bound)?; - arrow::compute::and(&left, &right) - } - }; - } - - match ts_col.data_type() { - DataType::Timestamp(unit, _) => match unit { - arrow::datatypes::TimeUnit::Second => { - downcast_and_compute!(TimestampSecondArray) - } - arrow::datatypes::TimeUnit::Millisecond => { - downcast_and_compute!(TimestampMillisecondArray) - } - arrow::datatypes::TimeUnit::Microsecond => { - downcast_and_compute!(TimestampMicrosecondArray) - } - arrow::datatypes::TimeUnit::Nanosecond => { - downcast_and_compute!(TimestampNanosecondArray) - } - }, - _ => { - unreachable!() - } - } - } -} - -/// [PlainTimestampRowFilter] iterates each element in timestamp column, build a [Timestamp] struct -/// and checks if given time range contains the timestamp. -struct PlainTimestampRowFilter { - time_range: TimestampRange, - projection: ProjectionMask, -} - -impl PlainTimestampRowFilter { - fn new(time_range: TimestampRange, projection: ProjectionMask) -> Self { - Self { - time_range, - projection, - } - } -} - -impl ArrowPredicate for PlainTimestampRowFilter { - fn projection(&self) -> &ProjectionMask { - &self.projection - } - - fn evaluate(&mut self, batch: RecordBatch) -> Result { - // the projection has only timestamp column, so we can safely take the first column in batch. - let ts_col = batch.column(0); - - macro_rules! downcast_and_compute { - ($array_ty: ty, $unit: ident) => {{ - let ts_col = ts_col - .as_any() - .downcast_ref::<$array_ty>() - .unwrap(); // safety: we've checked the data type of timestamp column. - Ok(BooleanArray::from_iter(ts_col.iter().map(|ts| { - ts.map(|val| { - Timestamp::new(val, TimeUnit::$unit) - }).map(|ts| { - self.time_range.contains(&ts) - }) - }))) - - }}; - } - - match ts_col.data_type() { - DataType::Timestamp(unit, _) => match unit { - arrow::datatypes::TimeUnit::Second => { - downcast_and_compute!(TimestampSecondArray, Second) - } - arrow::datatypes::TimeUnit::Millisecond => { - downcast_and_compute!(TimestampMillisecondArray, Millisecond) - } - arrow::datatypes::TimeUnit::Microsecond => { - downcast_and_compute!(TimestampMicrosecondArray, Microsecond) - } - arrow::datatypes::TimeUnit::Nanosecond => { - downcast_and_compute!(TimestampNanosecondArray, Nanosecond) - } - }, - _ => { - unreachable!() - } - } - } -} - -#[cfg(test)] -mod tests { - use arrow_array::ArrayRef; - use datafusion_common::ToDFSchema; - use datafusion_expr::Operator; - use datafusion_physical_expr::create_physical_expr; - use datafusion_physical_expr::execution_props::ExecutionProps; - use datatypes::arrow_array::StringArray; - use datatypes::schema::{ColumnSchema, Schema}; - use datatypes::value::timestamp_to_scalar_value; - use parquet::arrow::arrow_to_parquet_schema; - - use super::*; - - fn check_unit_lossy(range_unit: TimeUnit, col_unit: TimeUnit, expect: bool) { - assert_eq!( - expect, - time_unit_lossy( - &TimestampRange::with_unit(0, 1, range_unit).unwrap(), - col_unit - ) - ) - } - - #[test] - fn test_time_unit_lossy() { - // converting a range with unit second to millisecond will not cause rounding error - check_unit_lossy(TimeUnit::Second, TimeUnit::Second, false); - check_unit_lossy(TimeUnit::Second, TimeUnit::Millisecond, false); - check_unit_lossy(TimeUnit::Second, TimeUnit::Microsecond, false); - check_unit_lossy(TimeUnit::Second, TimeUnit::Nanosecond, false); - - check_unit_lossy(TimeUnit::Millisecond, TimeUnit::Second, true); - check_unit_lossy(TimeUnit::Millisecond, TimeUnit::Millisecond, false); - check_unit_lossy(TimeUnit::Millisecond, TimeUnit::Microsecond, false); - check_unit_lossy(TimeUnit::Millisecond, TimeUnit::Nanosecond, false); - - check_unit_lossy(TimeUnit::Microsecond, TimeUnit::Second, true); - check_unit_lossy(TimeUnit::Microsecond, TimeUnit::Millisecond, true); - check_unit_lossy(TimeUnit::Microsecond, TimeUnit::Microsecond, false); - check_unit_lossy(TimeUnit::Microsecond, TimeUnit::Nanosecond, false); - - check_unit_lossy(TimeUnit::Nanosecond, TimeUnit::Second, true); - check_unit_lossy(TimeUnit::Nanosecond, TimeUnit::Millisecond, true); - check_unit_lossy(TimeUnit::Nanosecond, TimeUnit::Microsecond, true); - check_unit_lossy(TimeUnit::Nanosecond, TimeUnit::Nanosecond, false); - } - - fn check_arrow_predicate( - schema: Schema, - expr: datafusion_expr::Expr, - columns: Vec, - expected: Vec>, - ) { - let arrow_schema = schema.arrow_schema(); - let df_schema = arrow_schema.clone().to_dfschema().unwrap(); - let physical_expr = create_physical_expr( - &expr, - &df_schema, - arrow_schema.as_ref(), - &ExecutionProps::default(), - ) - .unwrap(); - let parquet_schema = arrow_to_parquet_schema(arrow_schema).unwrap(); - let mut predicate = DatafusionArrowPredicate { - physical_expr, - projection_mask: ProjectionMask::roots(&parquet_schema, vec![0, 1]), - }; - - let batch = arrow_array::RecordBatch::try_new(arrow_schema.clone(), columns).unwrap(); - - let res = predicate.evaluate(batch).unwrap(); - assert_eq!(expected, res.iter().collect::>()); - } - - #[test] - fn test_datafusion_predicate() { - let schema = Schema::new(vec![ - ColumnSchema::new( - "ts", - ConcreteDataType::timestamp_datatype(TimeUnit::Nanosecond), - false, - ), - ColumnSchema::new("name", ConcreteDataType::string_datatype(), true), - ]); - - let expr = datafusion_expr::and( - datafusion_expr::binary_expr( - datafusion_expr::col("ts"), - Operator::GtEq, - datafusion_expr::lit(timestamp_to_scalar_value(TimeUnit::Nanosecond, Some(10))), - ), - datafusion_expr::binary_expr( - datafusion_expr::col("name"), - Operator::Lt, - datafusion_expr::lit("Bob"), - ), - ); - - let ts_arr = Arc::new(TimestampNanosecondArray::from(vec![9, 11])) as Arc<_>; - let name_arr = Arc::new(StringArray::from(vec![Some("Alice"), Some("Charlie")])) as Arc<_>; - - let columns = vec![ts_arr, name_arr]; - check_arrow_predicate( - schema.clone(), - expr, - columns.clone(), - vec![Some(false), Some(false)], - ); - - let expr = datafusion_expr::and( - datafusion_expr::binary_expr( - datafusion_expr::col("ts"), - Operator::Lt, - datafusion_expr::lit(timestamp_to_scalar_value(TimeUnit::Nanosecond, Some(10))), - ), - datafusion_expr::binary_expr( - datafusion_expr::col("name"), - Operator::Lt, - datafusion_expr::lit("Bob"), - ), - ); - - check_arrow_predicate(schema, expr, columns, vec![Some(true), Some(false)]); - } -} diff --git a/src/storage/src/sst/stream_writer.rs b/src/storage/src/sst/stream_writer.rs deleted file mode 100644 index b1c14b89d0c9..000000000000 --- a/src/storage/src/sst/stream_writer.rs +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::future::Future; -use std::pin::Pin; - -use arrow_array::RecordBatch; -use common_datasource::buffered_writer::LazyBufferedWriter as DatasourceBufferedWriter; -use common_datasource::share_buffer::SharedBuffer; -use datatypes::schema::SchemaRef; -use object_store::ObjectStore; -use parquet::arrow::ArrowWriter; -use parquet::file::properties::WriterProperties; -use parquet::format::FileMetaData; -use snafu::ResultExt; - -use crate::error; -use crate::error::{NewRecordBatchSnafu, WriteParquetSnafu}; -use crate::read::Batch; -/// Parquet writer that buffers row groups in memory and writes buffered data to an underlying -/// storage by chunks to reduce memory consumption. -pub struct BufferedWriter { - inner: InnerBufferedWriter, - arrow_schema: arrow::datatypes::SchemaRef, -} - -type InnerBufferedWriter = DatasourceBufferedWriter< - object_store::Writer, - ArrowWriter, - Box< - dyn FnMut( - String, - ) -> Pin< - Box< - dyn Future> - + Send, - >, - > + Send, - >, ->; - -impl BufferedWriter { - pub async fn try_new( - path: String, - store: ObjectStore, - schema: &SchemaRef, - props: Option, - buffer_threshold: usize, - ) -> error::Result { - let arrow_schema = schema.arrow_schema(); - let buffer = SharedBuffer::with_capacity(buffer_threshold); - - let arrow_writer = ArrowWriter::try_new(buffer.clone(), arrow_schema.clone(), props) - .context(WriteParquetSnafu)?; - - Ok(Self { - inner: DatasourceBufferedWriter::new( - buffer_threshold, - buffer, - arrow_writer, - &path, - Box::new(move |path| { - let store = store.clone(); - Box::pin(async move { - store - .writer(&path) - .await - .context(common_datasource::error::WriteObjectSnafu { path }) - }) - }), - ), - arrow_schema: arrow_schema.clone(), - }) - } - - /// Write a record batch to stream writer. - pub async fn write(&mut self, batch: &Batch) -> error::Result<()> { - let arrow_batch = RecordBatch::try_new( - self.arrow_schema.clone(), - batch - .columns() - .iter() - .map(|v| v.to_arrow_array()) - .collect::>(), - ) - .context(NewRecordBatchSnafu)?; - - self.inner - .write(&arrow_batch) - .await - .context(error::WriteBufferSnafu)?; - let _ = self - .inner - .try_flush(false) - .await - .context(error::WriteBufferSnafu)?; - - Ok(()) - } - - /// Close parquet writer. - pub async fn close(self) -> error::Result<(FileMetaData, u64)> { - self.inner - .close_with_arrow_writer() - .await - .context(error::WriteBufferSnafu) - } -} diff --git a/src/storage/src/sync.rs b/src/storage/src/sync.rs deleted file mode 100644 index 6e71b616fe9f..000000000000 --- a/src/storage/src/sync.rs +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Synchronization utilities - -use std::ops::{Deref, DerefMut}; -use std::sync::{Arc, Mutex, MutexGuard}; - -use arc_swap::ArcSwap; - -/// A thread safe clone-on-write cell. -/// -/// Each read returns a read only clone of the internal data and won't block -/// write. Write to the cell data needs to acquire a lock txn first and -/// modifications are not visible to others until the txn is committed. -#[derive(Debug)] -pub struct CowCell { - inner: ArcSwap, - mutex: Mutex<()>, -} - -impl CowCell { - /// Create a new cell. - pub fn new(data: T) -> CowCell { - CowCell { - inner: ArcSwap::from(Arc::new(data)), - mutex: Mutex::new(()), - } - } - - /// Get a read only clone from the cell. - pub fn get(&self) -> Arc { - self.inner.load_full() - } -} - -impl CowCell { - /// Acquire a write txn, blocking the current thread. - /// - /// Note that this will clone the inner data. - pub fn lock(&self) -> TxnGuard { - let _guard = self.mutex.lock().unwrap(); - // Acquire a clone of data inside lock. - let data = (*self.get()).clone(); - - TxnGuard { - inner: &self.inner, - data, - _guard, - } - } -} - -/// A RAII implementation of a write transaction of the [CowCell]. -/// -/// When this txn is dropped (falls out of scope or committed), the lock will be -/// unlocked, but updates to the content won't be visible unless the txn is committed. -#[must_use = "if unused the CowCell will immediately unlock"] -pub struct TxnGuard<'a, T: Clone> { - inner: &'a ArcSwap, - data: T, - _guard: MutexGuard<'a, ()>, -} - -impl TxnGuard<'_, T> { - /// Commit updates to the cell and release the lock. - pub fn commit(self) { - let data = Arc::new(self.data); - self.inner.store(data); - } -} - -impl Deref for TxnGuard<'_, T> { - type Target = T; - - fn deref(&self) -> &T { - &self.data - } -} - -impl DerefMut for TxnGuard<'_, T> { - fn deref_mut(&mut self) -> &mut T { - &mut self.data - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_cow_cell_commit() { - let cell = CowCell::new(10); - assert_eq!(10, *cell.get()); - - let mut data = cell.lock(); - assert_eq!(10, *data); - - // It's okay to get read only clone from the cell during lock is held. - assert_eq!(10, *cell.get()); - - *data += 2; - - assert_eq!(*data, 12); - // The modification is still not visible. - assert_eq!(10, *cell.get()); - - // Commit the txn. - data.commit(); - - // Once the guard is committed, the new data is visible. - assert_eq!(12, *cell.get()); - } - - #[test] - fn test_cow_cell_cancel() { - let cell = CowCell::new(10); - assert_eq!(10, *cell.get()); - - { - let mut data = cell.lock(); - *data += 2; - } - - // The update is not committed, should not be visible. - assert_eq!(10, *cell.get()); - } -} diff --git a/src/storage/src/test_util.rs b/src/storage/src/test_util.rs deleted file mode 100644 index 1acbda14876b..000000000000 --- a/src/storage/src/test_util.rs +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub mod access_layer_util; -pub mod config_util; -pub mod descriptor_util; -pub mod flush_switch; -pub mod read_util; -pub mod schema_util; -pub mod write_batch_util; - -pub const TIMESTAMP_NAME: &str = "timestamp"; diff --git a/src/storage/src/test_util/access_layer_util.rs b/src/storage/src/test_util/access_layer_util.rs deleted file mode 100644 index faad942cb70c..000000000000 --- a/src/storage/src/test_util/access_layer_util.rs +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use crate::read::BoxedBatchReader; -use crate::sst::{AccessLayer, FileHandle, FileId, ReadOptions, Source, SstInfo, WriteOptions}; - -#[derive(Debug)] -pub struct MockAccessLayer; - -#[async_trait::async_trait] -impl AccessLayer for MockAccessLayer { - fn sst_file_path(&self, file_name: &str) -> String { - file_name.to_string() - } - - async fn write_sst( - &self, - _file_id: FileId, - _source: Source, - _opts: &WriteOptions, - ) -> crate::error::Result> { - unimplemented!() - } - - async fn read_sst( - &self, - _file_handle: FileHandle, - _opts: &ReadOptions, - ) -> crate::error::Result { - unimplemented!() - } - - async fn delete_sst(&self, _file_id: FileId) -> crate::error::Result<()> { - Ok(()) - } -} diff --git a/src/storage/src/test_util/config_util.rs b/src/storage/src/test_util/config_util.rs deleted file mode 100644 index c71a23feb765..000000000000 --- a/src/storage/src/test_util/config_util.rs +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use common_config::WalConfig; -use common_datasource::compression::CompressionType; -use log_store::raft_engine::log_store::RaftEngineLogStore; -use object_store::services::Fs; -use object_store::ObjectStore; -use store_api::manifest::Manifest; -use store_api::storage::{CompactionStrategy, TwcsOptions}; - -use crate::compaction::CompactionHandler; -use crate::config::{EngineConfig, DEFAULT_REGION_WRITE_BUFFER_SIZE}; -use crate::engine::{self, RegionMap}; -use crate::file_purger::noop::NoopFilePurgeHandler; -use crate::flush::{FlushScheduler, PickerConfig, SizeBasedStrategy}; -use crate::manifest::region::RegionManifest; -use crate::memtable::DefaultMemtableBuilder; -use crate::region::StoreConfig; -use crate::scheduler::{LocalScheduler, SchedulerConfig}; -use crate::sst::FsAccessLayer; - -fn log_store_dir(store_dir: &str) -> String { - format!("{store_dir}/wal") -} - -/// Create a new StoreConfig for test. -pub async fn new_store_config( - region_name: &str, - store_dir: &str, - engine_config: EngineConfig, -) -> StoreConfig { - let mut builder = Fs::default(); - let _ = builder.root(store_dir); - let object_store = ObjectStore::new(builder).unwrap().finish(); - - new_store_config_with_object_store(region_name, store_dir, object_store, engine_config) - .await - .0 -} - -/// Create a new StoreConfig and region map for test. -pub async fn new_store_config_and_region_map( - region_name: &str, - store_dir: &str, - engine_config: EngineConfig, -) -> ( - StoreConfig, - Arc>, -) { - let mut builder = Fs::default(); - let _ = builder.root(store_dir); - let object_store = ObjectStore::new(builder).unwrap().finish(); - - new_store_config_with_object_store(region_name, store_dir, object_store, engine_config).await -} - -/// Create a new StoreConfig with given object store. -pub async fn new_store_config_with_object_store( - region_name: &str, - store_dir: &str, - object_store: ObjectStore, - engine_config: EngineConfig, -) -> ( - StoreConfig, - Arc>, -) { - let parent_dir = ""; - let sst_dir = engine::region_sst_dir(parent_dir, region_name); - let manifest_dir = engine::region_manifest_dir(parent_dir, region_name); - - let sst_layer = Arc::new(FsAccessLayer::new(&sst_dir, object_store.clone())); - let manifest = RegionManifest::with_checkpointer( - &manifest_dir, - object_store, - CompressionType::Uncompressed, - None, - None, - ); - manifest.start().await.unwrap(); - let log_store = Arc::new( - RaftEngineLogStore::try_new(log_store_dir(store_dir), WalConfig::default()) - .await - .unwrap(), - ); - - let compaction_scheduler = Arc::new(LocalScheduler::new( - SchedulerConfig::default(), - CompactionHandler::default(), - )); - // We use an empty region map so actually the background worker of the picker is disabled. - let regions = Arc::new(RegionMap::new()); - let flush_scheduler = Arc::new( - FlushScheduler::new( - SchedulerConfig::default(), - compaction_scheduler.clone(), - regions.clone(), - PickerConfig::default(), - ) - .unwrap(), - ); - let file_purger = Arc::new(LocalScheduler::new( - SchedulerConfig::default(), - NoopFilePurgeHandler, - )); - ( - StoreConfig { - log_store, - sst_layer, - manifest, - memtable_builder: Arc::new(DefaultMemtableBuilder::default()), - flush_scheduler, - flush_strategy: Arc::new(SizeBasedStrategy::default()), - compaction_scheduler, - engine_config: Arc::new(engine_config), - file_purger, - ttl: None, - write_buffer_size: DEFAULT_REGION_WRITE_BUFFER_SIZE.as_bytes() as usize, - compaction_strategy: CompactionStrategy::Twcs(TwcsOptions::default()), - }, - regions, - ) -} diff --git a/src/storage/src/test_util/descriptor_util.rs b/src/storage/src/test_util/descriptor_util.rs deleted file mode 100644 index ad7777dbff51..000000000000 --- a/src/storage/src/test_util/descriptor_util.rs +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use datatypes::prelude::ConcreteDataType; -use datatypes::type_id::LogicalTypeId; -use store_api::storage::{ - ColumnDescriptor, ColumnDescriptorBuilder, ColumnFamilyDescriptorBuilder, ColumnId, - RegionDescriptor, RegionId, RowKeyDescriptorBuilder, -}; - -use crate::test_util; -use crate::test_util::schema_util::ColumnDef; - -/// A RegionDescriptor builder for test. -pub struct RegionDescBuilder { - id: RegionId, - name: String, - last_column_id: ColumnId, - key_builder: RowKeyDescriptorBuilder, - default_cf_builder: ColumnFamilyDescriptorBuilder, -} - -impl RegionDescBuilder { - pub fn new>(name: T) -> Self { - let key_builder = RowKeyDescriptorBuilder::new( - ColumnDescriptorBuilder::new( - 1, - test_util::TIMESTAMP_NAME, - ConcreteDataType::timestamp_millisecond_datatype(), - ) - .is_nullable(false) - .is_time_index(true) - .build() - .unwrap(), - ); - - Self { - id: 0.into(), - name: name.into(), - last_column_id: 1, - key_builder, - default_cf_builder: ColumnFamilyDescriptorBuilder::default(), - } - } - - pub fn id(mut self, id: impl Into) -> Self { - self.id = id.into(); - self - } - - pub fn timestamp(mut self, column_def: ColumnDef) -> Self { - let column = self.new_ts_column(column_def); - self.key_builder = self.key_builder.timestamp(column); - self - } - - pub fn push_key_column(mut self, column_def: ColumnDef) -> Self { - let column = self.new_column(column_def); - self.key_builder = self.key_builder.push_column(column); - self - } - - pub fn push_field_column(mut self, column_def: ColumnDef) -> Self { - let column = self.new_column(column_def); - self.default_cf_builder = self.default_cf_builder.push_column(column); - self - } - - pub fn set_last_column_id(mut self, column_id: ColumnId) -> Self { - self.last_column_id = column_id; - self - } - - pub fn build(self) -> RegionDescriptor { - RegionDescriptor { - id: self.id, - name: self.name, - row_key: self.key_builder.build().unwrap(), - default_cf: self.default_cf_builder.build().unwrap(), - extra_cfs: Vec::new(), - } - } - - pub fn last_column_id(&self) -> ColumnId { - self.last_column_id - } - - fn alloc_column_id(&mut self) -> ColumnId { - self.last_column_id += 1; - self.last_column_id - } - - fn new_ts_column(&mut self, column_def: ColumnDef) -> ColumnDescriptor { - let datatype = column_def.1.data_type(); - ColumnDescriptorBuilder::new(self.alloc_column_id(), column_def.0, datatype) - .is_nullable(column_def.2) - .is_time_index(true) - .build() - .unwrap() - } - - fn new_column(&mut self, column_def: ColumnDef) -> ColumnDescriptor { - let datatype = column_def.1.data_type(); - ColumnDescriptorBuilder::new(self.alloc_column_id(), column_def.0, datatype) - .is_nullable(column_def.2) - .build() - .unwrap() - } -} - -/// Create desc with schema (k0, timestamp, v0, ... vn-1) -pub fn desc_with_field_columns(region_name: &str, num_field_columns: usize) -> RegionDescriptor { - let mut builder = - RegionDescBuilder::new(region_name).push_key_column(("k0", LogicalTypeId::Int64, false)); - for i in 0..num_field_columns { - let name = format!("v{i}"); - builder = builder.push_field_column((&name, LogicalTypeId::Int64, true)); - } - builder.build() -} diff --git a/src/storage/src/test_util/flush_switch.rs b/src/storage/src/test_util/flush_switch.rs deleted file mode 100644 index 58124fa896dd..000000000000 --- a/src/storage/src/test_util/flush_switch.rs +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Mutex; - -use crate::flush::{FlushStrategy, FlushType, RegionStatus}; - -/// Controls whether to flush a region while writing the region. -/// Disable flush by default. -#[derive(Debug, Default)] -pub struct FlushSwitch { - flush_type: Mutex>, -} - -impl FlushSwitch { - pub fn set_should_flush(&self, should_flush: bool) { - if should_flush { - *self.flush_type.lock().unwrap() = Some(FlushType::Region); - } else { - *self.flush_type.lock().unwrap() = None; - } - } - - pub fn set_flush_type(&self, flush_type: FlushType) { - *self.flush_type.lock().unwrap() = Some(flush_type); - } -} - -impl FlushStrategy for FlushSwitch { - fn should_flush(&self, _status: RegionStatus) -> Option { - *self.flush_type.lock().unwrap() - } - - fn reserve_mem(&self, _mem: usize) {} - - fn schedule_free_mem(&self, _mem: usize) {} - - fn free_mem(&self, _mem: usize) {} -} - -pub fn has_parquet_file(sst_dir: &str) -> bool { - for entry in std::fs::read_dir(sst_dir).unwrap() { - let entry = entry.unwrap(); - let path = entry.path(); - if !path.is_dir() { - assert_eq!("parquet", path.extension().unwrap()); - return true; - } - } - - false -} diff --git a/src/storage/src/test_util/read_util.rs b/src/storage/src/test_util/read_util.rs deleted file mode 100644 index 23cfc6c3022a..000000000000 --- a/src/storage/src/test_util/read_util.rs +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use api::v1::OpType; -use async_trait::async_trait; -use datatypes::prelude::ScalarVector; -use datatypes::type_id::LogicalTypeId; -use datatypes::vectors::{Int64Vector, TimestampMillisecondVector, UInt64Vector, UInt8Vector}; - -use crate::error::Result; -use crate::memtable::{BatchIterator, BoxedBatchIterator, RowOrdering}; -use crate::metadata::RegionMetadata; -use crate::read::{Batch, BatchReader, BoxedBatchReader}; -use crate::schema::{ProjectedSchema, ProjectedSchemaRef, RegionSchemaRef}; -use crate::test_util::descriptor_util::RegionDescBuilder; - -/// Create a new region schema (timestamp, v0). -fn new_region_schema() -> RegionSchemaRef { - let desc = RegionDescBuilder::new("read-util") - .push_field_column(("v0", LogicalTypeId::Int64, true)) - .build(); - let metadata: RegionMetadata = desc.try_into().unwrap(); - metadata.schema().clone() -} - -/// Create a new projected schema (timestamp, v0). -pub fn new_projected_schema() -> ProjectedSchemaRef { - let region_schema = new_region_schema(); - Arc::new(ProjectedSchema::new(region_schema, None).unwrap()) -} - -/// Build a new batch, with 0 sequence and op_type. -pub fn new_kv_batch(key_values: &[(i64, Option)]) -> Batch { - let key = Arc::new(TimestampMillisecondVector::from_values( - key_values.iter().map(|v| v.0), - )); - let value = Arc::new(Int64Vector::from( - key_values.iter().map(|v| v.1).collect::>(), - )); - let sequences = Arc::new(UInt64Vector::from_vec(vec![0; key_values.len()])); - let op_types = Arc::new(UInt8Vector::from_vec(vec![0; key_values.len()])); - - Batch::new(vec![key, value, sequences, op_types]) -} - -/// Build a new batch from (key, value, sequence, op_type) -pub fn new_full_kv_batch(all_values: &[(i64, i64, u64, OpType)]) -> Batch { - let key = Arc::new(TimestampMillisecondVector::from_values( - all_values.iter().map(|v| v.0), - )); - let value = Arc::new(Int64Vector::from_values(all_values.iter().map(|v| v.1))); - let sequences = Arc::new(UInt64Vector::from_values(all_values.iter().map(|v| v.2))); - let op_types = Arc::new(UInt8Vector::from_values( - all_values.iter().map(|v| v.3 as u8), - )); - - Batch::new(vec![key, value, sequences, op_types]) -} - -pub async fn collect_kv_batch(reader: &mut dyn BatchReader) -> Vec<(i64, Option)> { - let mut result = Vec::new(); - while let Some(batch) = reader.next_batch().await.unwrap() { - let key = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - let value = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - - for (k, v) in key.iter_data().zip(value.iter_data()) { - result.push((k.unwrap().into(), v)); - } - } - - result -} - -pub type Batches<'a> = &'a [&'a [(i64, Option)]]; - -/// A reader for test that pop batch from Vec. -pub struct VecBatchReader { - schema: ProjectedSchemaRef, - batches: Vec, -} - -impl VecBatchReader { - fn new(mut batches: Vec) -> VecBatchReader { - batches.reverse(); - - VecBatchReader { - schema: new_projected_schema(), - batches, - } - } -} - -#[async_trait] -impl BatchReader for VecBatchReader { - async fn next_batch(&mut self) -> Result> { - Ok(self.batches.pop()) - } -} - -impl Iterator for VecBatchReader { - type Item = Result; - - fn next(&mut self) -> Option> { - self.batches.pop().map(Ok) - } -} - -impl BatchIterator for VecBatchReader { - fn schema(&self) -> ProjectedSchemaRef { - self.schema.clone() - } - - fn ordering(&self) -> RowOrdering { - // TODO(yingwen): Allow setting the row ordering. - RowOrdering::Key - } -} - -pub fn build_vec_reader(batches: &[&[(i64, Option)]]) -> VecBatchReader { - let batches: Vec<_> = batches - .iter() - .map(|key_values| new_kv_batch(key_values)) - .collect(); - - VecBatchReader::new(batches) -} - -pub fn build_full_vec_reader(batches: &[&[(i64, i64, u64, OpType)]]) -> VecBatchReader { - let batches: Vec<_> = batches - .iter() - .map(|key_values| new_full_kv_batch(key_values)) - .collect(); - - VecBatchReader::new(batches) -} - -pub fn build_boxed_reader(batches: &[&[(i64, Option)]]) -> BoxedBatchReader { - Box::new(build_vec_reader(batches)) -} - -pub fn build_boxed_iter(batches: &[&[(i64, Option)]]) -> BoxedBatchIterator { - Box::new(build_vec_reader(batches)) -} diff --git a/src/storage/src/test_util/schema_util.rs b/src/storage/src/test_util/schema_util.rs deleted file mode 100644 index ad93213c0969..000000000000 --- a/src/storage/src/test_util/schema_util.rs +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::sync::Arc; - -use datatypes::prelude::*; -use datatypes::schema::{ColumnSchema, Schema, SchemaBuilder, SchemaRef}; - -use super::descriptor_util; -use crate::metadata::RegionMetadata; -use crate::schema::RegionSchema; - -/// Column definition: (name, datatype, is_nullable) -pub type ColumnDef<'a> = (&'a str, LogicalTypeId, bool); - -pub fn new_schema(column_defs: &[ColumnDef], timestamp_index: Option) -> Schema { - new_schema_with_version(column_defs, timestamp_index, 0) -} - -pub fn new_schema_with_version( - column_defs: &[ColumnDef], - timestamp_index: Option, - version: u32, -) -> Schema { - let column_schemas: Vec<_> = column_defs - .iter() - .enumerate() - .map(|(index, column_def)| { - let datatype = column_def.1.data_type(); - if let Some(timestamp_index) = timestamp_index { - ColumnSchema::new(column_def.0, datatype, column_def.2) - .with_time_index(index == timestamp_index) - } else { - ColumnSchema::new(column_def.0, datatype, column_def.2) - } - }) - .collect(); - - SchemaBuilder::try_from(column_schemas) - .unwrap() - .version(version) - .build() - .unwrap() -} - -pub fn new_schema_ref(column_defs: &[ColumnDef], timestamp_index: Option) -> SchemaRef { - Arc::new(new_schema(column_defs, timestamp_index)) -} - -pub fn new_region_schema(version: u32, num_field_columns: usize) -> RegionSchema { - let metadata: RegionMetadata = - descriptor_util::desc_with_field_columns("REGION_NAME", num_field_columns) - .try_into() - .unwrap(); - - let columns = metadata.columns; - RegionSchema::new(columns, version).unwrap() -} diff --git a/src/storage/src/test_util/write_batch_util.rs b/src/storage/src/test_util/write_batch_util.rs deleted file mode 100644 index 596e4228f193..000000000000 --- a/src/storage/src/test_util/write_batch_util.rs +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use crate::test_util::schema_util::{self, ColumnDef}; -use crate::write_batch::WriteBatch; - -pub fn new_write_batch( - column_defs: &[ColumnDef], - timestamp_index: Option, - row_key_end: usize, -) -> WriteBatch { - let schema = schema_util::new_schema_ref(column_defs, timestamp_index); - - WriteBatch::new(schema, row_key_end) -} diff --git a/src/storage/src/version.rs b/src/storage/src/version.rs deleted file mode 100644 index f614f320a91d..000000000000 --- a/src/storage/src/version.rs +++ /dev/null @@ -1,359 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -//! Version control of storage. -//! -//! To read latest data from `VersionControl`, we need to -//! 1. Acquire `Version` from `VersionControl`. -//! 2. Then acquire last sequence. -//! -//! Reason: data may be flushed/compacted and some data with old sequence may be removed -//! and became invisible between step 1 and 2, so need to acquire version at first. - -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::Arc; - -use common_telemetry::{debug, info}; -use store_api::manifest::ManifestVersion; -use store_api::storage::{SchemaRef, SequenceNumber}; - -use crate::file_purger::FilePurgerRef; -use crate::memtable::{MemtableId, MemtableRef, MemtableVersion}; -use crate::metadata::RegionMetadataRef; -use crate::schema::RegionSchemaRef; -use crate::sst::{AccessLayerRef, FileMeta, LevelMetas}; -use crate::sync::CowCell; -pub const INIT_COMMITTED_SEQUENCE: u64 = 0; - -/// Controls version of in memory state for a region. -#[derive(Debug)] -pub struct VersionControl { - // TODO(yingwen): If all modification to version must acquire the region writer lock first, - // then we may just use ArcSwap to hold version. But some operations may only require the - // version lock, instead of the writer lock, since we can use the version lock the protect - // the read-modify-write of version. - version: CowCell, - /// Latest sequence that is committed and visible to user. - committed_sequence: AtomicU64, -} - -impl VersionControl { - /// Construct a new version control from existing `version`. - pub fn with_version(version: Version) -> VersionControl { - VersionControl { - version: CowCell::new(version), - committed_sequence: AtomicU64::new(INIT_COMMITTED_SEQUENCE), - } - } - - /// Returns current version. - #[inline] - pub fn current(&self) -> VersionRef { - self.version.get() - } - - #[inline] - pub fn current_manifest_version(&self) -> ManifestVersion { - self.current().manifest_version - } - - /// Metadata of current version. - pub fn metadata(&self) -> RegionMetadataRef { - let version = self.current(); - version.metadata.clone() - } - - #[inline] - pub fn committed_sequence(&self) -> SequenceNumber { - self.committed_sequence.load(Ordering::Relaxed) - } - - /// Set committed sequence to `value`. - /// - /// External synchronization is required to ensure only one thread can update the - /// last sequence. - #[inline] - pub fn set_committed_sequence(&self, value: SequenceNumber) { - // Relaxed ordering is enough for this update as this method requires external synchoronization. - self.committed_sequence.store(value, Ordering::Relaxed); - } - - /// Freeze all mutable memtables. - pub fn freeze_mutable(&self, new_memtable: MemtableRef) { - let mut version_to_update = self.version.lock(); - - let memtable_version = version_to_update.memtables(); - let freezed = memtable_version.freeze_mutable(new_memtable); - version_to_update.memtables = Arc::new(freezed); - - version_to_update.commit(); - } - - /// Apply [VersionEdit] to the version. - pub fn apply_edit(&self, edit: VersionEdit) { - let mut version_to_update = self.version.lock(); - version_to_update.apply_edit(edit); - version_to_update.commit(); - } - - /// Freeze all mutable memtables and then apply the new metadata to the version. - pub fn freeze_mutable_and_apply_metadata( - &self, - metadata: RegionMetadataRef, - manifest_version: ManifestVersion, - mutable_memtable: MemtableRef, - ) { - let mut version_to_update = self.version.lock(); - - let memtable_version = version_to_update.memtables(); - // When applying metadata, mutable memtable set might be empty and there is no - // need to freeze it. - let freezed = memtable_version.freeze_mutable(mutable_memtable); - version_to_update.memtables = Arc::new(freezed); - - version_to_update.apply_metadata(metadata, manifest_version); - version_to_update.commit(); - } - - pub fn reset_version( - &self, - manifest_version: ManifestVersion, - memtables: MemtableVersionRef, - ssts: LevelMetasRef, - ) { - let mut version_to_update = self.version.lock(); - version_to_update.reset(manifest_version, memtables, ssts, 0); - version_to_update.commit(); - } -} - -#[derive(Debug)] -pub struct VersionEdit { - pub files_to_add: Vec, - pub files_to_remove: Vec, - pub flushed_sequence: Option, - pub manifest_version: ManifestVersion, - pub max_memtable_id: Option, - pub compaction_time_window: Option, -} - -pub type VersionControlRef = Arc; -pub type VersionRef = Arc; -type MemtableVersionRef = Arc; -pub type LevelMetasRef = Arc; - -/// Version contains metadata and state of region. -#[derive(Clone, Debug)] -pub struct Version { - /// Metadata of the region. - /// - /// Altering metadata isn't frequent, storing metadata in Arc to allow sharing - /// metadata and reuse metadata when creating a new `Version`. - metadata: RegionMetadataRef, - /// Mutable and immutable memtables. - /// - /// Wrapped in Arc to make clone of `Version` much cheaper. - memtables: MemtableVersionRef, - /// SSTs of the region. - ssts: LevelMetasRef, - /// Inclusive max sequence of flushed data. - flushed_sequence: SequenceNumber, - /// Current version of manifest. - manifest_version: ManifestVersion, - // TODO(yingwen): Maybe also store last sequence to this version when switching - // version, so we can know the newest data can read from this version. -} - -impl Version { - /// Create a new `Version` with given `metadata`. - #[cfg(test)] - pub fn new(metadata: RegionMetadataRef, memtable: MemtableRef) -> Version { - let sst_layer = Arc::new(crate::test_util::access_layer_util::MockAccessLayer) as Arc<_>; - let file_purger = Arc::new(crate::scheduler::LocalScheduler::new( - crate::scheduler::SchedulerConfig::default(), - crate::file_purger::noop::NoopFilePurgeHandler, - )); - Version::with_manifest_version(metadata, 0, memtable, sst_layer, file_purger) - } - - /// Create a new `Version` with given `metadata` and initial `manifest_version`. - pub fn with_manifest_version( - metadata: RegionMetadataRef, - manifest_version: ManifestVersion, - mutable_memtable: MemtableRef, - sst_layer: AccessLayerRef, - file_purger: FilePurgerRef, - ) -> Version { - Version { - metadata, - memtables: Arc::new(MemtableVersion::new(mutable_memtable)), - ssts: Arc::new(LevelMetas::new(sst_layer, file_purger)), - flushed_sequence: 0, - manifest_version, - } - } - - #[inline] - pub fn metadata(&self) -> &RegionMetadataRef { - &self.metadata - } - - #[inline] - pub fn schema(&self) -> &RegionSchemaRef { - self.metadata.schema() - } - - #[inline] - pub fn user_schema(&self) -> &SchemaRef { - self.metadata.user_schema() - } - - #[inline] - pub fn mutable_memtable(&self) -> &MemtableRef { - self.memtables.mutable_memtable() - } - - #[inline] - pub fn memtables(&self) -> &MemtableVersionRef { - &self.memtables - } - - #[inline] - pub fn ssts(&self) -> &LevelMetasRef { - &self.ssts - } - - #[inline] - pub fn flushed_sequence(&self) -> SequenceNumber { - self.flushed_sequence - } - - pub fn apply_checkpoint( - &mut self, - flushed_sequence: Option, - manifest_version: ManifestVersion, - files: impl Iterator, - ) { - self.flushed_sequence = flushed_sequence.unwrap_or(self.flushed_sequence); - self.manifest_version = manifest_version; - let ssts = self.ssts.merge(files, std::iter::empty(), None); - info!( - "After applying checkpoint, region: {}, id: {}, flushed_sequence: {}, manifest_version: {}", - self.metadata.name(), - self.metadata.id(), - self.flushed_sequence, - self.manifest_version, - ); - - self.ssts = Arc::new(ssts); - } - - pub fn apply_edit(&mut self, edit: VersionEdit) { - let flushed_sequence = edit.flushed_sequence.unwrap_or(self.flushed_sequence); - if self.flushed_sequence < flushed_sequence { - self.flushed_sequence = flushed_sequence; - } - if self.manifest_version < edit.manifest_version { - self.manifest_version = edit.manifest_version; - } - - if let Some(max_memtable_id) = edit.max_memtable_id { - // Remove flushed memtables - let memtable_version = self.memtables(); - let removed = memtable_version.remove_immutables(max_memtable_id); - self.memtables = Arc::new(removed); - } - - let handles_to_add = edit.files_to_add.into_iter(); - let merged_ssts = self.ssts.merge( - handles_to_add, - edit.files_to_remove.into_iter(), - edit.compaction_time_window, - ); - - debug!( - "After applying edit, region: {}, id: {}, SST files: {:?}", - self.metadata.name(), - self.metadata.id(), - merged_ssts, - ); - self.ssts = Arc::new(merged_ssts); - } - - /// Updates metadata of the version. - /// - /// # Panics - /// Panics if `metadata.version() <= self.metadata.version()`. - pub fn apply_metadata( - &mut self, - metadata: RegionMetadataRef, - manifest_version: ManifestVersion, - ) { - assert!( - metadata.version() > self.metadata.version(), - "Updating metadata from version {} to {} is not allowed", - self.metadata.version(), - metadata.version() - ); - - if self.manifest_version < manifest_version { - self.manifest_version = manifest_version; - } - - self.metadata = metadata; - } - - #[inline] - pub fn manifest_version(&self) -> ManifestVersion { - self.manifest_version - } - - pub fn reset( - &mut self, - manifest_version: ManifestVersion, - memtables: MemtableVersionRef, - ssts: LevelMetasRef, - flushed_sequence: SequenceNumber, - ) { - self.memtables = memtables; - self.ssts = ssts; - self.manifest_version = manifest_version; - self.flushed_sequence = flushed_sequence; - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::memtable::{DefaultMemtableBuilder, MemtableBuilder}; - use crate::test_util::descriptor_util::RegionDescBuilder; - - fn new_version_control() -> VersionControl { - let desc = RegionDescBuilder::new("version-test").build(); - let metadata: RegionMetadataRef = Arc::new(desc.try_into().unwrap()); - let memtable = DefaultMemtableBuilder::default().build(metadata.schema().clone()); - - let version = Version::new(metadata, memtable); - VersionControl::with_version(version) - } - - #[test] - fn test_version_control() { - let version_control = new_version_control(); - - assert_eq!(0, version_control.committed_sequence()); - version_control.set_committed_sequence(12345); - assert_eq!(12345, version_control.committed_sequence()); - } -} diff --git a/src/storage/src/wal.rs b/src/storage/src/wal.rs deleted file mode 100644 index 54629cb7f938..000000000000 --- a/src/storage/src/wal.rs +++ /dev/null @@ -1,324 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::pin::Pin; -use std::sync::Arc; - -use common_error::ext::BoxedError; -use futures::{stream, Stream, TryStreamExt}; -use prost::Message; -use snafu::{ensure, Location, ResultExt}; -use store_api::logstore::entry::{Entry, Id}; -use store_api::logstore::LogStore; -use store_api::storage::{RegionId, SequenceNumber}; - -use crate::codec::{Decoder, Encoder}; -use crate::error::{ - DecodeWalHeaderSnafu, DeleteWalNamespaceSnafu, EncodeWalHeaderSnafu, Error, - MarkWalObsoleteSnafu, ReadWalSnafu, Result, WalDataCorruptedSnafu, WriteWalSnafu, -}; -use crate::proto::wal::{self, WalHeader}; -use crate::write_batch::codec::{PayloadDecoder, PayloadEncoder}; -use crate::write_batch::Payload; - -#[derive(Debug)] -pub struct Wal { - region_id: RegionId, - namespace: S::Namespace, - store: Arc, -} - -pub type PayloadStream<'a> = - Pin)>> + Send + 'a>>; - -// Wal should be cheap to clone, so avoid holding things like String, Vec. -impl Clone for Wal { - fn clone(&self) -> Self { - Self { - region_id: self.region_id, - namespace: self.namespace.clone(), - store: self.store.clone(), - } - } -} - -impl Wal { - pub fn new(region_id: RegionId, store: Arc) -> Self { - let namespace = store.namespace(region_id.into()); - Self { - region_id, - namespace, - store, - } - } - - pub async fn obsolete(&self, seq: SequenceNumber) -> Result<()> { - self.store - .obsolete(self.namespace.clone(), seq) - .await - .map_err(BoxedError::new) - .context(MarkWalObsoleteSnafu { - region_id: self.region_id, - }) - } - - pub async fn delete_namespace(&self) -> Result<()> { - self.store - .delete_namespace(&self.namespace) - .await - .map_err(BoxedError::new) - .context(DeleteWalNamespaceSnafu { - region_id: self.region_id, - }) - } - - #[inline] - pub fn region_id(&self) -> RegionId { - self.region_id - } - - #[cfg(test)] - pub async fn close(&self) -> Result<()> { - let _ = self.store.stop().await; - Ok(()) - } -} - -impl Wal { - /// Data format: - /// - /// ```text - /// | | - /// |--------------------------> Header Len <-----------------------------| Arrow IPC format - /// | | - /// v v - /// +---------------------+----------------------------------------------------+--------------+-------------+--------------+ - /// | | Header | | | | - /// | Header Len(varint) | (last_manifest_version + mutation_types + ...) | Payload 0 | Payload 1 | ... | - /// | | | | | | - /// +---------------------+----------------------------------------------------+--------------+-------------+--------------+ - /// ``` - /// - pub async fn write_to_wal( - &self, - seq: SequenceNumber, - mut header: WalHeader, - payload: Option<&Payload>, - ) -> Result { - let _timer = crate::metrics::LOG_STORE_WRITE_ELAPSED.start_timer(); - if let Some(p) = payload { - header.mutation_types = wal::gen_mutation_types(p); - } - - let mut buf = vec![]; - - // Encode header - let wal_header_encoder = WalHeaderEncoder {}; - wal_header_encoder.encode(&header, &mut buf)?; - - // Encode payload - if let Some(p) = payload { - let encoder = PayloadEncoder::new(); - // TODO(jiachun): provide some way to compute data size before encode, so we can preallocate an exactly sized buf. - encoder - .encode(p, &mut buf) - .map_err(BoxedError::new) - .context(WriteWalSnafu { - region_id: self.region_id(), - })?; - } - - // write bytes to wal - self.write(seq, &buf).await - } - - pub async fn read_from_wal(&self, start_seq: SequenceNumber) -> Result> { - let stream = self - .store - .read(&self.namespace, start_seq) - .await - .map_err(BoxedError::new) - .context(ReadWalSnafu { - region_id: self.region_id(), - })? - // Handle the error when reading from the stream. - .map_err(|e| Error::ReadWal { - region_id: self.region_id(), - source: BoxedError::new(e), - location: Location::default(), - }) - .and_then(|entries| async { - let iter = entries.into_iter().map(|x| self.decode_entry(x)); - - Ok(stream::iter(iter)) - }) - .try_flatten(); - - Ok(Box::pin(stream)) - } - - async fn write(&self, seq: SequenceNumber, bytes: &[u8]) -> Result { - let e = self.store.entry(bytes, seq, self.namespace.clone()); - - let response = self - .store - .append(e) - .await - .map_err(BoxedError::new) - .context(WriteWalSnafu { - region_id: self.region_id(), - })?; - - Ok(response.entry_id) - } - - fn decode_entry( - &self, - entry: E, - ) -> Result<(SequenceNumber, WalHeader, Option)> { - let seq_num = entry.id(); - let input = entry.data(); - - let wal_header_decoder = WalHeaderDecoder {}; - let (data_pos, header) = wal_header_decoder.decode(input)?; - - ensure!( - data_pos <= input.len(), - WalDataCorruptedSnafu { - region_id: self.region_id(), - message: format!( - "Not enough input buffer, expected data position={}, actual buffer length={}", - data_pos, - input.len() - ), - } - ); - - if header.mutation_types.is_empty() { - return Ok((seq_num, header, None)); - } - - let decoder = PayloadDecoder::new(&header.mutation_types); - let payload = decoder - .decode(&input[data_pos..]) - .map_err(BoxedError::new) - .context(ReadWalSnafu { - region_id: self.region_id(), - })?; - - Ok((seq_num, header, Some(payload))) - } -} - -pub struct WalHeaderEncoder {} - -impl Encoder for WalHeaderEncoder { - type Item = WalHeader; - type Error = Error; - - fn encode(&self, item: &WalHeader, dst: &mut Vec) -> Result<()> { - item.encode_length_delimited(dst) - .map_err(|err| err.into()) - .context(EncodeWalHeaderSnafu) - } -} - -pub struct WalHeaderDecoder {} - -impl Decoder for WalHeaderDecoder { - type Item = (usize, WalHeader); - type Error = Error; - - fn decode(&self, src: &[u8]) -> Result<(usize, WalHeader)> { - let mut data_pos = prost::decode_length_delimiter(src) - .map_err(|err| err.into()) - .context(DecodeWalHeaderSnafu)?; - data_pos += prost::length_delimiter_len(data_pos); - - let wal_header = WalHeader::decode_length_delimited(src) - .map_err(|err| err.into()) - .context(DecodeWalHeaderSnafu)?; - - Ok((data_pos, wal_header)) - } -} - -#[cfg(test)] -mod tests { - use common_test_util::temp_dir::create_temp_dir; - use log_store::test_util; - - use super::*; - - #[tokio::test] - pub async fn test_write_wal() { - let log_file_dir = create_temp_dir("wal_test"); - let log_file_dir_path = log_file_dir.path().to_str().unwrap(); - let log_store = - test_util::log_store_util::create_tmp_local_file_log_store(log_file_dir_path).await; - let wal = Wal::new(RegionId::from(0), Arc::new(log_store)); - - let res = wal.write(0, b"test1").await.unwrap(); - - assert_eq!(0, res); - let res = wal.write(1, b"test2").await.unwrap(); - assert_eq!(1, res); - } - - #[tokio::test] - pub async fn test_read_wal_only_header() -> Result<()> { - common_telemetry::init_default_ut_logging(); - let log_file_dir = create_temp_dir("wal_test"); - let log_file_dir_path = log_file_dir.path().to_str().unwrap(); - let log_store = - test_util::log_store_util::create_tmp_local_file_log_store(log_file_dir_path).await; - let wal = Wal::new(RegionId::from(0), Arc::new(log_store)); - let header = WalHeader::with_last_manifest_version(111); - let seq_num = 3; - let _ = wal.write_to_wal(seq_num, header, None).await?; - - let mut stream = wal.read_from_wal(seq_num).await?; - let mut data = vec![]; - while let Some((seq_num, header, write_batch)) = stream.try_next().await? { - data.push((seq_num, header, write_batch)); - } - assert_eq!(1, data.len()); - assert_eq!(111, data[0].1.last_manifest_version); - assert!(data[0].2.is_none()); - - Ok(()) - } - - #[test] - pub fn test_wal_header_codec() { - let wal_header = WalHeader { - last_manifest_version: 99999999, - mutation_types: vec![], - }; - - let mut buf: Vec = vec![]; - let wal_encoder = WalHeaderEncoder {}; - wal_encoder.encode(&wal_header, &mut buf).unwrap(); - - buf.push(1u8); // data - buf.push(2u8); // data - buf.push(3u8); // data - - let decoder = WalHeaderDecoder {}; - let res = decoder.decode(&buf).unwrap(); - - let data_pos = res.0; - assert_eq!(buf.len() - 3, data_pos); - } -} diff --git a/src/storage/src/window_infer.rs b/src/storage/src/window_infer.rs deleted file mode 100644 index 35c06bb14470..000000000000 --- a/src/storage/src/window_infer.rs +++ /dev/null @@ -1,401 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::HashSet; - -use common_time::range::TimestampRange; -use common_time::timestamp::TimeUnit; -use common_time::timestamp_millis::BucketAligned; -use itertools::Itertools; - -use crate::memtable::MemtableStats; -use crate::sst::FileMeta; - -/// A set of predefined time windows. -const TIME_WINDOW_SIZE: [i64; 14] = [ - 1, // 1 second - 60, // 1 minute - 60 * 10, // 10 minutes - 60 * 30, // 30 minutes - 60 * 60, // 1 hour - 2 * 60 * 60, // 2 hours - 6 * 60 * 60, // 6 hours - 12 * 60 * 60, // 12 hours - 24 * 60 * 60, // 1 day - 7 * 24 * 60 * 60, // 1 week - 30 * 24 * 60 * 60, // 1 month - 12 * 30 * 24 * 60 * 60, // 1 year - 10 * 12 * 30 * 24 * 60 * 60, // 10 years - 100 * 12 * 30 * 24 * 60 * 60, // 100 years -]; - -/// [WindowInfer] infers the time windows that can be used to optimize table scans ordered by -/// timestamp column or have explicit time windows. By splitting time spans of tables into -/// time windows, we can scan entries window by window. -pub(crate) trait WindowInfer { - /// Infers time windows according to the SST files and memtables. - /// - /// ### Note - /// The order of returned vector defines how records are yielded. - fn infer_window( - &self, - files: &[FileMeta], - mem_tables: &[MemtableStats], - ts_desc: bool, - ) -> Vec; -} - -/// [PlainWindowInference] simply finds the minimum time span within all SST files in level 0 and -/// memtables, matches that time span into a set of predefined time windows. -pub(crate) struct PlainWindowInference; - -impl WindowInfer for PlainWindowInference { - fn infer_window( - &self, - files: &[FileMeta], - mem_tables: &[MemtableStats], - ts_desc: bool, - ) -> Vec { - let mut min_duration_sec = i64::MAX; - let mut max_durations_sec = i64::MIN; - let mut durations = Vec::with_capacity(files.len() + mem_tables.len()); - - for meta in files { - if let Some((start, end)) = &meta.time_range { - // unwrap safety: converting timestamps with any unit to seconds won't overflow. - let start_sec = start.convert_to(TimeUnit::Second).unwrap().value(); - // file timestamp range end is inclusive - let end_sec = end.convert_to_ceil(TimeUnit::Second).unwrap().value(); - debug_assert!(end_sec >= start_sec); - if meta.level == 0 { - // only level 0 is involved when calculating time windows. - min_duration_sec = min_duration_sec.min(end_sec - start_sec); - max_durations_sec = max_durations_sec.max(end_sec - start_sec); - } - durations.push((start_sec, end_sec)); - } - } - - for stats in mem_tables { - // unwrap safety: converting timestamps with any unit to seconds won't overflow. - let start_sec = stats - .min_timestamp - .convert_to(TimeUnit::Second) - .unwrap() - .value(); - let end_sec = stats - .max_timestamp - .convert_to_ceil(TimeUnit::Second) - .unwrap() - .value(); - min_duration_sec = min_duration_sec.min(end_sec - start_sec); - max_durations_sec = max_durations_sec.max(end_sec - start_sec); - durations.push((start_sec, end_sec)); - } - - let window_size = duration_to_window_size(min_duration_sec, max_durations_sec); - align_time_spans_to_windows(&durations, window_size) - .into_iter() - .sorted_by(|(l_start, _), (r_start, _)| { - if ts_desc { - l_start.cmp(r_start) - } else { - r_start.cmp(l_start) - } - }) // sort time windows in descending order - // unwrap safety: we ensure that end>=start so that TimestampRange::with_unit won't return None - .map(|(start, end)| TimestampRange::with_unit(start, end, TimeUnit::Second).unwrap()) - .collect() - } -} - -/// Given a set of time spans and a min duration, this function aligns the time spans to windows that -/// collectively covers all the time spans. -/// -/// For example, given time span `[1, 6)` and duration 5, the span can be aligned and split to -/// two windows with length 5: `[0, 5)` and `[5, 10]`, and these two windows can cover the original -/// span `[1, 6)`. -fn align_time_spans_to_windows(durations: &[(i64, i64)], min_duration: i64) -> HashSet<(i64, i64)> { - let mut res = HashSet::new(); - for (start, end) in durations { - let mut next = *start; - while next <= *end { - let next_aligned = next.align_by_bucket(min_duration).unwrap_or(i64::MIN); - if let Some(next_end_aligned) = next_aligned.checked_add(min_duration) { - let _ = res.insert((next_aligned, next_end_aligned)); - next = next_end_aligned; - } else { - // arithmetic overflow, clamp to i64::MAX and break the loop. - let _ = res.insert((next_aligned, i64::MAX)); - break; - } - } - } - res -} - -/// Find the most suitable time window size according to the `min_duration` and `max_duration` -/// found across all SST files and memtables through a binary search. -fn duration_to_window_size(min_duration: i64, max_duration: i64) -> i64 { - let max_bucket = max_duration >> 7; - let target = min_duration.max(max_bucket); - match TIME_WINDOW_SIZE.binary_search(&target) { - Ok(idx) => TIME_WINDOW_SIZE[idx], - Err(idx) => { - if idx < TIME_WINDOW_SIZE.len() { - TIME_WINDOW_SIZE[idx] - } else { - TIME_WINDOW_SIZE.last().copied().unwrap() - } - } - } -} - -#[cfg(test)] -mod tests { - use common_time::Timestamp; - - use super::*; - - #[test] - fn test_get_time_window_size() { - assert_eq!(1, duration_to_window_size(0, 0)); - for window in TIME_WINDOW_SIZE { - assert_eq!(window, duration_to_window_size(window, window)); - } - assert_eq!(1, duration_to_window_size(1, 1)); - assert_eq!(60, duration_to_window_size(60, 60)); - assert_eq!(60 * 10, duration_to_window_size(100, 100)); - assert_eq!(60 * 30, duration_to_window_size(1800, 1800)); - assert_eq!(60 * 60, duration_to_window_size(3000, 3000)); - assert_eq!(2 * 60 * 60, duration_to_window_size(4000, 4000)); - assert_eq!(6 * 60 * 60, duration_to_window_size(21599, 21599)); - assert_eq!(12 * 60 * 60, duration_to_window_size(21601, 21601)); - assert_eq!(24 * 60 * 60, duration_to_window_size(43201, 43201)); - assert_eq!(7 * 24 * 60 * 60, duration_to_window_size(604799, 604799)); - assert_eq!(311040000, duration_to_window_size(31535999, 31535999)); - assert_eq!(3110400000, duration_to_window_size(i64::MAX, i64::MAX)); - } - - #[test] - fn test_get_time_window_size_with_diff() { - assert_eq!(600, duration_to_window_size(60, 10000)); - assert_eq!( - TIME_WINDOW_SIZE.last().copied().unwrap(), - duration_to_window_size(60, i64::MAX) - ); - } - - fn check_align_durations_to_windows( - durations: &[(i64, i64)], - min_duration: i64, - expected: &[(i64, i64)], - ) { - let res = align_time_spans_to_windows(durations, min_duration); - let expected = expected.iter().copied().collect::>(); - assert_eq!(res, expected); - } - - #[test] - fn test_duration_to_windows() { - check_align_durations_to_windows(&[(0, 1)], 2, &[(0, 2)]); - check_align_durations_to_windows(&[(-3, 1)], 2, &[(-4, -2), (-2, 0), (0, 2)]); - check_align_durations_to_windows(&[(1, 3)], 2, &[(0, 2), (2, 4)]); - check_align_durations_to_windows( - &[(i64::MIN, i64::MIN + 3)], - 2, - &[(i64::MIN, i64::MIN + 2), (i64::MIN + 2, i64::MIN + 4)], - ); - - check_align_durations_to_windows( - &[(i64::MAX - 3, i64::MAX)], - 2, - &[(i64::MAX - 3, i64::MAX - 1), (i64::MAX - 1, i64::MAX)], - ); - - check_align_durations_to_windows(&[(-3, 10)], 7, &[(-7, 0), (0, 7), (7, 14)]); - } - - #[test] - fn test_multiple_duration_to_windows() { - check_align_durations_to_windows(&[(0, 1), (1, 3)], 3, &[(0, 3), (3, 6)]); - check_align_durations_to_windows(&[(0, 1), (1, 2), (7, 11)], 3, &[(0, 3), (6, 9), (9, 12)]); - - check_align_durations_to_windows( - &[(-2, 1), (i64::MAX - 2, i64::MAX)], - 3, - &[ - (-3, 0), - (0, 3), - (i64::MAX - 4, i64::MAX - 1), - (i64::MAX - 1, i64::MAX), - ], - ); - } - - #[test] - fn test_plain_window_inference() { - let window_inference = PlainWindowInference {}; - - let res = window_inference.infer_window( - &[FileMeta { - time_range: Some(( - Timestamp::new(1000, TimeUnit::Millisecond), - Timestamp::new(3000, TimeUnit::Millisecond), - )), - ..Default::default() - }], - &[MemtableStats { - max_timestamp: Timestamp::new(3001, TimeUnit::Millisecond), - min_timestamp: Timestamp::new(2001, TimeUnit::Millisecond), - ..Default::default() - }], - true, - ); - assert_eq!( - vec![TimestampRange::with_unit(0, 60, TimeUnit::Second).unwrap(),], - res - ); - - let res = window_inference.infer_window( - &[FileMeta { - time_range: Some(( - Timestamp::new(0, TimeUnit::Millisecond), - Timestamp::new(60 * 1000 + 1, TimeUnit::Millisecond), - )), - ..Default::default() - }], - &[MemtableStats { - max_timestamp: Timestamp::new(3001, TimeUnit::Millisecond), - min_timestamp: Timestamp::new(2001, TimeUnit::Millisecond), - ..Default::default() - }], - true, - ); - assert_eq!( - vec![ - TimestampRange::with_unit(0, 60, TimeUnit::Second).unwrap(), - TimestampRange::with_unit(60, 120, TimeUnit::Second).unwrap(), - ], - res - ); - - let res = window_inference.infer_window( - &[ - FileMeta { - time_range: Some(( - Timestamp::new(0, TimeUnit::Millisecond), - Timestamp::new(60 * 1000 + 1, TimeUnit::Millisecond), - )), - ..Default::default() - }, - FileMeta { - time_range: Some(( - Timestamp::new(60 * 60 * 1000, TimeUnit::Millisecond), - Timestamp::new(60 * 60 * 1000 + 1, TimeUnit::Millisecond), - )), - ..Default::default() - }, - ], - &[MemtableStats { - max_timestamp: Timestamp::new(3001, TimeUnit::Millisecond), - min_timestamp: Timestamp::new(2001, TimeUnit::Millisecond), - ..Default::default() - }], - true, - ); - - let mut expect = (0..=61) - .map(|s| TimestampRange::with_unit(s, s + 1, TimeUnit::Second).unwrap()) - .collect::>(); - expect.push(TimestampRange::with_unit(60 * 60, 60 * 60 + 1, TimeUnit::Second).unwrap()); - expect.push(TimestampRange::with_unit(60 * 60 + 1, 60 * 60 + 2, TimeUnit::Second).unwrap()); - - assert_eq!(expect, res); - - let res = window_inference.infer_window( - &[ - FileMeta { - time_range: Some(( - Timestamp::new(0, TimeUnit::Millisecond), - Timestamp::new(60 * 1000, TimeUnit::Millisecond), - )), - level: 1, // this SST will be ignored - ..Default::default() - }, - FileMeta { - time_range: Some(( - Timestamp::new(0, TimeUnit::Millisecond), - Timestamp::new(10 * 60 * 1000, TimeUnit::Millisecond), - )), - ..Default::default() - }, - ], - &[MemtableStats { - max_timestamp: Timestamp::new(60 * 30 * 1000 + 1, TimeUnit::Millisecond), - min_timestamp: Timestamp::new(0, TimeUnit::Millisecond), - ..Default::default() - }], - true, - ); - - // inferred window size should be 600 sec - assert_eq!( - vec![ - TimestampRange::with_unit(0, 600, TimeUnit::Second).unwrap(), - TimestampRange::with_unit(600, 1200, TimeUnit::Second).unwrap(), - TimestampRange::with_unit(1200, 1800, TimeUnit::Second).unwrap(), - TimestampRange::with_unit(1800, 2400, TimeUnit::Second).unwrap(), - ], - res - ); - - let res = window_inference.infer_window( - &[ - FileMeta { - time_range: Some(( - Timestamp::new(0, TimeUnit::Millisecond), - Timestamp::new(60 * 1000, TimeUnit::Millisecond), - )), - level: 1, // this SST will be ignored - ..Default::default() - }, - FileMeta { - time_range: Some(( - Timestamp::new(0, TimeUnit::Millisecond), - Timestamp::new(10 * 60 * 1000, TimeUnit::Millisecond), - )), - ..Default::default() - }, - ], - &[MemtableStats { - max_timestamp: Timestamp::new(60 * 30 * 1000 + 1, TimeUnit::Millisecond), - min_timestamp: Timestamp::new(0, TimeUnit::Millisecond), - ..Default::default() - }], - false, - ); - - // timestamp asc order - assert_eq!( - vec![ - TimestampRange::with_unit(1800, 2400, TimeUnit::Second).unwrap(), - TimestampRange::with_unit(1200, 1800, TimeUnit::Second).unwrap(), - TimestampRange::with_unit(600, 1200, TimeUnit::Second).unwrap(), - TimestampRange::with_unit(0, 600, TimeUnit::Second).unwrap(), - ], - res - ); - } -} diff --git a/src/storage/src/write_batch.rs b/src/storage/src/write_batch.rs deleted file mode 100644 index 4d58c0695695..000000000000 --- a/src/storage/src/write_batch.rs +++ /dev/null @@ -1,580 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -pub mod codec; -mod compat; - -use std::collections::HashMap; - -use api::v1::OpType; -use common_recordbatch::RecordBatch; -use datatypes::schema::{ColumnSchema, SchemaRef}; -use datatypes::vectors::VectorRef; -use snafu::{ensure, OptionExt, ResultExt}; -use store_api::storage::WriteRequest; - -use crate::error::{ - BatchMissingColumnSnafu, CreateDefaultSnafu, CreateRecordBatchSnafu, Error, HasNullSnafu, - MoreColumnThanExpectedSnafu, RequestTooLargeSnafu, Result, TypeMismatchSnafu, - UnequalLengthsSnafu, UnknownColumnSnafu, -}; - -/// Max number of updates in a write batch. -pub(crate) const MAX_BATCH_SIZE: usize = 1_000_000; - -/// Data of [WriteBatch]. -/// -/// We serialize this struct to the WAL instead of the whole `WriteBatch` to avoid -/// storing unnecessary information. -#[derive(Debug, PartialEq)] -pub struct Payload { - /// Schema of the payload. - /// - /// This schema doesn't contain internal columns. - pub schema: SchemaRef, - pub mutations: Vec, -} - -impl Payload { - /// Creates a new payload with given `schema`. - fn new(schema: SchemaRef) -> Payload { - Payload { - schema, - mutations: Vec::new(), - } - } - - /// Returns true if there is no mutation in the payload. - #[inline] - pub fn is_empty(&self) -> bool { - self.mutations.is_empty() - } -} - -/// A write operation to the region. -#[derive(Debug, PartialEq)] -pub struct Mutation { - /// Type of the mutation. - pub op_type: OpType, - /// Data of the mutation. - pub record_batch: RecordBatch, -} - -/// Implementation of [WriteRequest]. -#[derive(Debug)] -pub struct WriteBatch { - payload: Payload, - /// Number of rows this batch need to mutate (put, delete, etc). - /// - /// We use it to check whether this batch is too large. - num_rows_to_mutate: usize, - /// The ending index of row key columns. - /// - /// The `WriteBatch` use this index to locate all row key columns from - /// the schema. - row_key_end: usize, -} - -impl WriteRequest for WriteBatch { - type Error = Error; - - fn put(&mut self, data: HashMap) -> Result<()> { - let data = NameToVector::new(data)?; - if data.is_empty() { - return Ok(()); - } - - let record_batch = self.process_put_data(data)?; - - self.add_num_rows_to_mutate(record_batch.num_rows())?; - self.payload.mutations.push(Mutation { - op_type: OpType::Put, - record_batch, - }); - - Ok(()) - } - - fn delete(&mut self, keys: HashMap) -> Result<()> { - let data = NameToVector::new(keys)?; - if data.is_empty() { - return Ok(()); - } - - let record_batch = self.process_delete_data(data)?; - - self.add_num_rows_to_mutate(record_batch.num_rows())?; - self.payload.mutations.push(Mutation { - op_type: OpType::Delete, - record_batch, - }); - - Ok(()) - } -} - -// WriteBatch pub methods. -impl WriteBatch { - /// Creates a new `WriteBatch`. - /// - /// The `schema` is the user schema of the region (no internal columns) and - /// the `row_key_end` is the ending index of row key columns. - /// - /// # Panics - /// Panics if `row_key_end <= schema.num_columns()`. - pub fn new(schema: SchemaRef, row_key_end: usize) -> Self { - assert!(row_key_end <= schema.num_columns()); - - Self { - payload: Payload::new(schema), - num_rows_to_mutate: 0, - row_key_end, - } - } - - #[inline] - pub fn schema(&self) -> &SchemaRef { - &self.payload.schema - } - - #[inline] - pub fn payload(&self) -> &Payload { - &self.payload - } -} - -impl WriteBatch { - /// Validates `data` and converts it into a [RecordBatch]. - /// - /// It fills missing columns by schema's default values. - fn process_put_data(&self, data: NameToVector) -> Result { - let num_rows = data.num_rows(); - let mut columns = Vec::with_capacity(self.schema().num_columns()); - - for column_schema in self.schema().column_schemas() { - match data.0.get(&column_schema.name) { - Some(col) => { - validate_column(column_schema, col)?; - columns.push(col.clone()); - } - None => { - // If column is not provided, fills it by default value. - let col = new_column_with_default_value(column_schema, num_rows)?; - columns.push(col); - } - } - } - - // Check all columns in data also exists in schema, which means we - // are not inserting unknown columns. - for name in data.0.keys() { - ensure!( - self.schema().contains_column(name), - UnknownColumnSnafu { name } - ); - } - - RecordBatch::new(self.schema().clone(), columns).context(CreateRecordBatchSnafu) - } - - /// Validates `data` and converts it into a [RecordBatch]. - /// - /// It fills value columns by null, ignoring whether the column is nullable as the contents - /// of value columns won't be read. - fn process_delete_data(&self, data: NameToVector) -> Result { - // Ensure row key columns are provided. - for column_schema in self.row_key_column_schemas() { - ensure!( - data.0.contains_key(&column_schema.name), - BatchMissingColumnSnafu { - column: &column_schema.name, - } - ); - } - // Ensure only provides row key columns. - ensure!( - data.0.len() == self.row_key_column_schemas().len(), - MoreColumnThanExpectedSnafu - ); - - let num_rows = data.num_rows(); - let mut columns = Vec::with_capacity(self.schema().num_columns()); - for column_schema in self.schema().column_schemas() { - match data.0.get(&column_schema.name) { - Some(col) => { - validate_column(column_schema, col)?; - columns.push(col.clone()); - } - None => { - // Fills value columns by default value, these columns are just placeholders to ensure - // the schema of the record batch is correct. - let col = column_schema.create_default_vector_for_padding(num_rows); - columns.push(col); - } - } - } - - RecordBatch::new(self.schema().clone(), columns).context(CreateRecordBatchSnafu) - } - - fn add_num_rows_to_mutate(&mut self, len: usize) -> Result<()> { - let num_rows = self.num_rows_to_mutate + len; - ensure!( - num_rows <= MAX_BATCH_SIZE, - RequestTooLargeSnafu { num_rows } - ); - self.num_rows_to_mutate = num_rows; - Ok(()) - } - - /// Returns all row key columns in the schema. - fn row_key_column_schemas(&self) -> &[ColumnSchema] { - &self.payload.schema.column_schemas()[..self.row_key_end] - } -} - -/// Returns the length of the first vector in `data`. -fn first_vector_len(data: &HashMap) -> usize { - data.values().next().map(|col| col.len()).unwrap_or(0) -} - -/// Checks whether `col` matches given `column_schema`. -fn validate_column(column_schema: &ColumnSchema, col: &VectorRef) -> Result<()> { - if !col.data_type().is_null() { - // This allow us to use NullVector for columns that only have null value. - // TODO(yingwen): Let NullVector supports different logical type so we could - // check data type directly. - ensure!( - col.data_type() == column_schema.data_type, - TypeMismatchSnafu { - name: &column_schema.name, - expect: column_schema.data_type.clone(), - given: col.data_type(), - } - ); - } - - ensure!( - column_schema.is_nullable() || col.null_count() == 0, - HasNullSnafu { - name: &column_schema.name, - } - ); - - Ok(()) -} - -/// Creates a new column and fills it by default value. -/// -/// `num_rows` MUST be greater than 0. This function will also validate the schema. -pub(crate) fn new_column_with_default_value( - column_schema: &ColumnSchema, - num_rows: usize, -) -> Result { - // If column is not provided, fills it by default value. - let vector = column_schema - .create_default_vector(num_rows) - .context(CreateDefaultSnafu { - name: &column_schema.name, - })? - .context(BatchMissingColumnSnafu { - column: &column_schema.name, - })?; - - validate_column(column_schema, &vector)?; - - Ok(vector) -} - -/// Vectors in [NameToVector] have same length. -/// -/// MUST construct it via [`NameToVector::new()`] to ensure the vector lengths are validated. -struct NameToVector(HashMap); - -impl NameToVector { - fn new(data: HashMap) -> Result { - let num_rows = first_vector_len(&data); - for (name, vector) in &data { - ensure!( - num_rows == vector.len(), - UnequalLengthsSnafu { - name, - expect: num_rows, - given: vector.len(), - } - ); - } - - Ok(NameToVector(data)) - } - - fn num_rows(&self) -> usize { - first_vector_len(&self.0) - } - - fn is_empty(&self) -> bool { - self.num_rows() == 0 - } -} - -#[cfg(test)] -pub(crate) fn new_test_batch() -> WriteBatch { - use datatypes::type_id::LogicalTypeId; - - use crate::test_util::write_batch_util; - - write_batch_util::new_write_batch( - &[ - ("k1", LogicalTypeId::UInt64, false), - ("ts", LogicalTypeId::TimestampMillisecond, false), - ("v1", LogicalTypeId::Boolean, true), - ], - Some(1), - 2, - ) -} - -#[cfg(test)] -mod tests { - use std::iter; - use std::sync::Arc; - - use common_error::ext::ErrorExt; - use common_error::status_code::StatusCode; - use datatypes::prelude::ScalarVector; - use datatypes::type_id::LogicalTypeId; - use datatypes::vectors::{ - BooleanVector, Int32Vector, Int64Vector, TimestampMillisecondVector, UInt64Vector, - }; - - use super::*; - use crate::test_util::write_batch_util; - - #[test] - fn test_name_to_vector_basic() { - let columns = NameToVector::new(HashMap::new()).unwrap(); - assert!(columns.is_empty()); - - let vector1 = Arc::new(Int32Vector::from_slice([1, 2, 3, 4, 5])) as VectorRef; - - let put_data = HashMap::from([ - ("k1".to_string(), vector1.clone()), - ("v1".to_string(), vector1), - ]); - - let columns = NameToVector::new(put_data).unwrap(); - assert_eq!(5, columns.num_rows()); - assert!(!columns.is_empty()); - } - - #[test] - fn test_name_to_vector_empty_vector() { - let vector1 = Arc::new(Int32Vector::from_slice([])) as VectorRef; - let put_data = HashMap::from([("k1".to_string(), vector1)]); - - let columns = NameToVector::new(put_data).unwrap(); - assert_eq!(0, columns.num_rows()); - assert!(columns.is_empty()); - } - - #[test] - fn test_write_batch_put() { - let intv = Arc::new(UInt64Vector::from_slice([1, 2, 3])) as VectorRef; - let boolv = Arc::new(BooleanVector::from(vec![true, false, true])) as VectorRef; - let tsv = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef; - let put_data = HashMap::from([ - ("k1".to_string(), intv), - ("v1".to_string(), boolv), - ("ts".to_string(), tsv), - ]); - - let mut batch = new_test_batch(); - batch.put(put_data).unwrap(); - assert!(!batch.payload().is_empty()); - - let mutation = &batch.payload().mutations[0]; - assert_eq!(3, mutation.record_batch.num_rows()); - } - - fn check_err(err: Error, msg: &str) { - assert_eq!(StatusCode::InvalidArguments, err.status_code()); - assert!( - err.to_string().contains(msg), - "<{err}> does not contain {msg}", - ); - } - - #[test] - fn test_write_batch_too_large() { - let boolv = Arc::new(BooleanVector::from_iterator( - iter::repeat(true).take(MAX_BATCH_SIZE + 1), - )) as VectorRef; - let put_data = HashMap::from([("k1".to_string(), boolv)]); - - let mut batch = - write_batch_util::new_write_batch(&[("k1", LogicalTypeId::Boolean, false)], None, 1); - let err = batch.put(put_data).unwrap_err(); - check_err(err, "Request is too large"); - } - - #[test] - fn test_put_data_different_len() { - let intv = Arc::new(UInt64Vector::from_slice([1, 2, 3])) as VectorRef; - let tsv = Arc::new(TimestampMillisecondVector::from_slice([0, 0])) as VectorRef; - let boolv = Arc::new(BooleanVector::from(vec![true, false, true])) as VectorRef; - let put_data = HashMap::from([ - ("k1".to_string(), intv), - ("v1".to_string(), boolv), - ("ts".to_string(), tsv), - ]); - - let mut batch = new_test_batch(); - let err = batch.put(put_data).unwrap_err(); - check_err(err, "not equals to other columns"); - } - - #[test] - fn test_put_type_mismatch() { - let boolv = Arc::new(BooleanVector::from(vec![true, false, true])) as VectorRef; - let tsv = Arc::new(Int64Vector::from_slice([0, 0, 0])) as VectorRef; - let put_data = HashMap::from([("k1".to_string(), boolv), ("ts".to_string(), tsv)]); - - let mut batch = new_test_batch(); - let err = batch.put(put_data).unwrap_err(); - check_err(err, "Type of column k1 does not match"); - } - - #[test] - fn test_put_type_has_null() { - let intv = Arc::new(UInt64Vector::from(vec![Some(1), None, Some(3)])) as VectorRef; - let tsv = Arc::new(Int64Vector::from_slice([0, 0, 0])) as VectorRef; - let put_data = HashMap::from([("k1".to_string(), intv), ("ts".to_string(), tsv)]); - - let mut batch = new_test_batch(); - let err = batch.put(put_data).unwrap_err(); - check_err(err, "Column k1 is not null"); - } - - #[test] - fn test_put_missing_column() { - let boolv = Arc::new(BooleanVector::from(vec![true, false, true])) as VectorRef; - let tsv = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef; - let put_data = HashMap::from([("v1".to_string(), boolv), ("ts".to_string(), tsv)]); - - let mut batch = new_test_batch(); - let err = batch.put(put_data).unwrap_err(); - check_err(err, "Missing column k1"); - } - - #[test] - fn test_put_unknown_column() { - let intv = Arc::new(UInt64Vector::from_slice([1, 2, 3])) as VectorRef; - let tsv = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef; - let boolv = Arc::new(BooleanVector::from(vec![true, false, true])) as VectorRef; - let put_data = HashMap::from([ - ("k1".to_string(), intv.clone()), - ("v1".to_string(), boolv.clone()), - ("ts".to_string(), tsv), - ("v2".to_string(), boolv), - ]); - - let mut batch = new_test_batch(); - let err = batch.put(put_data).unwrap_err(); - assert_eq!(StatusCode::TableColumnNotFound, err.status_code()); - } - - #[test] - fn test_put_empty() { - let mut batch = new_test_batch(); - batch.put(HashMap::new()).unwrap(); - assert!(batch.payload().is_empty()); - } - - #[test] - fn test_delete_empty() { - let mut batch = new_test_batch(); - batch.delete(HashMap::new()).unwrap(); - assert!(batch.payload().is_empty()); - } - - #[test] - fn test_write_batch_delete() { - let intv = Arc::new(UInt64Vector::from_slice([1, 2, 3])) as VectorRef; - let tsv = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef; - let keys = HashMap::from([("k1".to_string(), intv), ("ts".to_string(), tsv)]); - - let mut batch = new_test_batch(); - batch.delete(keys).unwrap(); - - let record_batch = &batch.payload().mutations[0].record_batch; - assert_eq!(3, record_batch.num_rows()); - assert_eq!(3, record_batch.num_columns()); - let v1 = record_batch.column_by_name("v1").unwrap(); - assert!(v1.only_null()); - } - - #[test] - fn test_delete_missing_column() { - let intv = Arc::new(UInt64Vector::from_slice([1, 2, 3])) as VectorRef; - let keys = HashMap::from([("k1".to_string(), intv)]); - - let mut batch = new_test_batch(); - let err = batch.delete(keys).unwrap_err(); - check_err(err, "Missing column ts"); - } - - #[test] - fn test_delete_columns_more_than_row_key() { - let intv = Arc::new(UInt64Vector::from_slice([1, 2, 3])) as VectorRef; - let tsv = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef; - let keys = HashMap::from([ - ("k1".to_string(), intv.clone()), - ("ts".to_string(), tsv), - ("v2".to_string(), intv), - ]); - - let mut batch = new_test_batch(); - let err = batch.delete(keys).unwrap_err(); - check_err(err, "More columns than expected"); - } - - #[test] - fn test_delete_type_mismatch() { - let intv = Arc::new(UInt64Vector::from_slice([1, 2, 3])) as VectorRef; - let boolv = Arc::new(BooleanVector::from(vec![true, false, true])) as VectorRef; - let keys = HashMap::from([("k1".to_string(), intv.clone()), ("ts".to_string(), boolv)]); - - let mut batch = new_test_batch(); - let err = batch.delete(keys).unwrap_err(); - check_err(err, "Type of column ts does not match"); - } - - #[test] - fn test_delete_non_null_value() { - let intv = Arc::new(UInt64Vector::from_slice([1, 2, 3])) as VectorRef; - let tsv = Arc::new(TimestampMillisecondVector::from_slice([0, 0, 0])) as VectorRef; - let keys = HashMap::from([("k1".to_string(), intv.clone()), ("ts".to_string(), tsv)]); - - let mut batch = write_batch_util::new_write_batch( - &[ - ("k1", LogicalTypeId::UInt64, false), - ("ts", LogicalTypeId::TimestampMillisecond, false), - ("v1", LogicalTypeId::Boolean, false), - ], - Some(1), - 2, - ); - batch.delete(keys).unwrap(); - } -} diff --git a/src/storage/src/write_batch/codec.rs b/src/storage/src/write_batch/codec.rs deleted file mode 100644 index c299fb51ea22..000000000000 --- a/src/storage/src/write_batch/codec.rs +++ /dev/null @@ -1,212 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::io::Cursor; -use std::sync::Arc; - -use api::v1::OpType; -use common_recordbatch::RecordBatch; -use datatypes::arrow::ipc::reader::StreamReader; -use datatypes::arrow::ipc::writer::{IpcWriteOptions, StreamWriter}; -use datatypes::schema::Schema; -use snafu::{ensure, ResultExt}; - -use crate::codec::{Decoder, Encoder}; -use crate::error::{ - BatchCorruptedSnafu, CreateRecordBatchSnafu, DecodeArrowSnafu, EncodeArrowSnafu, Error, - ParseSchemaSnafu, Result, -}; -use crate::proto::wal::MutationType; -use crate::write_batch::{Mutation, Payload}; - -#[derive(Default)] -pub struct PayloadEncoder {} - -impl PayloadEncoder { - pub fn new() -> Self { - Self::default() - } -} - -impl Encoder for PayloadEncoder { - type Item = Payload; - type Error = Error; - - fn encode(&self, item: &Payload, dst: &mut Vec) -> Result<()> { - let arrow_schema = item.schema.arrow_schema(); - - let opts = IpcWriteOptions::default(); - let mut writer = StreamWriter::try_new_with_options(dst, arrow_schema, opts) - .context(EncodeArrowSnafu)?; - - for mutation in &item.mutations { - let record_batch = mutation.record_batch.df_record_batch(); - writer.write(record_batch).context(EncodeArrowSnafu)?; - } - writer.finish().context(EncodeArrowSnafu)?; - - Ok(()) - } -} - -pub struct PayloadDecoder<'a> { - mutation_types: &'a [i32], -} - -impl<'a> PayloadDecoder<'a> { - pub fn new(mutation_types: &'a [i32]) -> Self { - Self { mutation_types } - } -} - -impl<'a> Decoder for PayloadDecoder<'a> { - type Item = Payload; - type Error = Error; - - fn decode(&self, src: &[u8]) -> Result { - let reader = Cursor::new(src); - let mut reader = StreamReader::try_new(reader, None).context(DecodeArrowSnafu)?; - let arrow_schema = reader.schema(); - - // We could let the decoder takes a schema as input if possible, then we don't - // need to rebuild the schema here. - let schema = Arc::new(Schema::try_from(arrow_schema).context(ParseSchemaSnafu)?); - let mut mutations = Vec::with_capacity(self.mutation_types.len()); - - for (record_batch, mutation_type) in reader.by_ref().zip(self.mutation_types) { - let record_batch = record_batch.context(DecodeArrowSnafu)?; - let record_batch = RecordBatch::try_from_df_record_batch(schema.clone(), record_batch) - .context(CreateRecordBatchSnafu)?; - let op_type = match MutationType::try_from(*mutation_type) { - Ok(MutationType::Delete) => OpType::Delete, - Ok(MutationType::Put) => OpType::Put, - Err(e) => { - return BatchCorruptedSnafu { - message: format!("Unexpceted decode error for mutation type: {e}"), - } - .fail() - } - }; - mutations.push(Mutation { - op_type, - record_batch, - }); - } - - // check if exactly finished - ensure!( - reader.is_finished(), - BatchCorruptedSnafu { - message: "The num of data chunks is different than expected." - } - ); - - ensure!( - mutations.len() == self.mutation_types.len(), - BatchCorruptedSnafu { - message: format!( - "expected {} mutations, but got {}", - self.mutation_types.len(), - mutations.len() - ) - } - ); - - Ok(Payload { schema, mutations }) - } -} - -#[cfg(test)] -mod tests { - use std::collections::HashMap; - use std::sync::Arc; - - use datatypes::vectors::{BooleanVector, TimestampMillisecondVector, UInt64Vector, VectorRef}; - use store_api::storage::WriteRequest; - - use super::*; - use crate::write_batch::WriteBatch; - use crate::{proto, write_batch}; - - fn gen_new_batch_and_types() -> (WriteBatch, Vec) { - let mut batch = write_batch::new_test_batch(); - for i in 0..10 { - let intv = Arc::new(UInt64Vector::from_slice([1, 2, 3])) as VectorRef; - let boolv = - Arc::new(BooleanVector::from(vec![Some(true), Some(false), None])) as VectorRef; - let tsv = Arc::new(TimestampMillisecondVector::from_vec(vec![i, i, i])) as VectorRef; - - let put_data = HashMap::from([ - ("k1".to_string(), intv), - ("v1".to_string(), boolv), - ("ts".to_string(), tsv), - ]); - - batch.put(put_data).unwrap(); - } - - let types = proto::wal::gen_mutation_types(batch.payload()); - - (batch, types) - } - - #[test] - fn test_codec_arrow() -> Result<()> { - let (batch, mutation_types) = gen_new_batch_and_types(); - - let encoder = PayloadEncoder::new(); - let mut dst = vec![]; - encoder.encode(batch.payload(), &mut dst).unwrap(); - - let decoder = PayloadDecoder::new(&mutation_types); - let result = decoder.decode(&dst); - let payload = result?; - assert_eq!(*batch.payload(), payload); - - Ok(()) - } - - fn gen_new_batch_and_types_with_none_column() -> (WriteBatch, Vec) { - let mut batch = write_batch::new_test_batch(); - for _ in 0..10 { - let intv = Arc::new(UInt64Vector::from_slice([1, 2, 3])) as VectorRef; - let tsv = Arc::new(TimestampMillisecondVector::from_vec(vec![0, 0, 0])) as VectorRef; - - let put_data = - HashMap::from([("k1".to_string(), intv.clone()), ("ts".to_string(), tsv)]); - - batch.put(put_data).unwrap(); - } - - let types = proto::wal::gen_mutation_types(batch.payload()); - - (batch, types) - } - - #[test] - fn test_codec_with_none_column_arrow() -> Result<()> { - let (batch, mutation_types) = gen_new_batch_and_types_with_none_column(); - - let encoder = PayloadEncoder::new(); - let mut dst = vec![]; - encoder.encode(batch.payload(), &mut dst).unwrap(); - - let decoder = PayloadDecoder::new(&mutation_types); - let result = decoder.decode(&dst); - let payload = result?; - assert_eq!(*batch.payload(), payload); - - Ok(()) - } -} diff --git a/src/storage/src/write_batch/compat.rs b/src/storage/src/write_batch/compat.rs deleted file mode 100644 index 4ec0c08fd27f..000000000000 --- a/src/storage/src/write_batch/compat.rs +++ /dev/null @@ -1,226 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use common_recordbatch::RecordBatch; -use datatypes::schema::{ColumnSchema, SchemaRef}; -use snafu::{ensure, ResultExt}; - -use crate::error::{self, Result}; -use crate::schema::compat::CompatWrite; -use crate::write_batch::{self, Mutation, WriteBatch}; - -impl CompatWrite for WriteBatch { - fn compat_write(&mut self, dest_schema: &SchemaRef) -> Result<()> { - let data_version = dest_schema.version(); - let schema_version = self.schema().version(); - // Fast path, nothing to do if schema version of the write batch is equal to version - // of destination. - if data_version == schema_version { - debug_assert_eq!(dest_schema.column_schemas(), self.schema().column_schemas()); - - return Ok(()); - } - - ensure!( - data_version > schema_version, - error::WriteToOldVersionSnafu { - data_version, - schema_version, - } - ); - - // For columns not in schema, returns error instead of discarding the column silently. - let column_not_in = column_not_in_schema(dest_schema, self.schema().column_schemas()); - ensure!( - column_not_in.is_none(), - error::NotInSchemaToCompatSnafu { - column: column_not_in.unwrap(), - version: data_version, - } - ); - - for mutation in &mut self.payload.mutations { - mutation.compat_write(dest_schema)?; - } - - // Change schema to `dest_schema`. - self.payload.schema = dest_schema.clone(); - - Ok(()) - } -} - -impl CompatWrite for Mutation { - fn compat_write(&mut self, dest_schema: &SchemaRef) -> Result<()> { - if self.record_batch.num_rows() == 0 { - return Ok(()); - } - - let num_rows = self.record_batch.num_rows(); - let mut columns = Vec::with_capacity(dest_schema.num_columns()); - for column_schema in dest_schema.column_schemas() { - if let Some(vector) = self.record_batch.column_by_name(&column_schema.name) { - columns.push(vector.clone()); - } else { - // We need to fill the column by null or its default value. - let vector = write_batch::new_column_with_default_value(column_schema, num_rows)?; - columns.push(vector); - } - } - - // Using dest schema to build RecordBatch. - self.record_batch = RecordBatch::new(dest_schema.clone(), columns) - .context(error::CreateRecordBatchSnafu)?; - - Ok(()) - } -} - -fn column_not_in_schema(schema: &SchemaRef, column_schemas: &[ColumnSchema]) -> Option { - column_schemas.iter().find_map(|col| { - if schema.column_schema_by_name(&col.name).is_none() { - Some(col.name.clone()) - } else { - None - } - }) -} - -#[cfg(test)] -mod tests { - use std::collections::HashMap; - use std::sync::Arc; - - use datatypes::data_type::ConcreteDataType; - use datatypes::schema::{ColumnDefaultConstraint, SchemaBuilder}; - use datatypes::vectors::{Int32Vector, TimestampMillisecondVector, VectorRef}; - use store_api::storage::WriteRequest; - - use super::*; - use crate::error::Error; - - // Test schema only has two row key columns: k0, ts. - const TEST_ROW_KEY_END: usize = 2; - - fn new_test_schema_builder( - v0_constraint: Option>, - ) -> SchemaBuilder { - let mut column_schemas = vec![ - ColumnSchema::new("k0", ConcreteDataType::int32_datatype(), false), - ColumnSchema::new( - "ts", - ConcreteDataType::timestamp_millisecond_datatype(), - false, - ) - .with_time_index(true), - ]; - - if let Some(v0_constraint) = v0_constraint { - column_schemas.push( - ColumnSchema::new("v0", ConcreteDataType::int32_datatype(), true) - .with_default_constraint(v0_constraint) - .unwrap(), - ); - } - - SchemaBuilder::try_from(column_schemas).unwrap() - } - - fn new_test_schema(v0_constraint: Option>) -> SchemaRef { - let schema = new_test_schema_builder(v0_constraint).build().unwrap(); - - Arc::new(schema) - } - - fn new_put_data() -> HashMap { - let k0 = Arc::new(Int32Vector::from_slice([1, 2, 3])) as VectorRef; - let ts = Arc::new(TimestampMillisecondVector::from_values([11, 12, 13])) as VectorRef; - HashMap::from([("k0".to_string(), k0), ("ts".to_string(), ts)]) - } - - #[test] - fn test_mutation_compat_write() { - let put_data = new_put_data(); - let schema_old = new_test_schema(None); - // Mutation doesn't check schema version, so we don't have to bump the version here. - let schema = new_test_schema(Some(Some(ColumnDefaultConstraint::null_value()))); - // Use WriteBatch to build a payload and its mutation. - let mut batch = WriteBatch::new(schema_old, TEST_ROW_KEY_END); - batch.put(put_data).unwrap(); - - let mutation = &mut batch.payload.mutations[0]; - mutation.compat_write(&schema).unwrap(); - - let v0 = mutation.record_batch.column_by_name("v0").unwrap(); - assert!(v0.only_null()); - } - - #[test] - fn test_write_batch_compat_write() { - let schema_old = new_test_schema(None); - let mut batch = WriteBatch::new(schema_old, TEST_ROW_KEY_END); - let put_data = new_put_data(); - batch.put(put_data).unwrap(); - - let schema_new = Arc::new( - new_test_schema_builder(Some(Some(ColumnDefaultConstraint::null_value()))) - .version(1) - .build() - .unwrap(), - ); - batch.compat_write(&schema_new).unwrap(); - assert_eq!(schema_new, *batch.schema()); - - let mutation = &batch.payload().mutations[0]; - let _ = mutation.record_batch.column_by_name("v0").unwrap(); - } - - #[test] - fn test_write_batch_compat_to_old() { - let schema_old = new_test_schema(None); - let schema_new = Arc::new( - new_test_schema_builder(None) - .version(1) // Bump the version - .build() - .unwrap(), - ); - - let mut batch = WriteBatch::new(schema_new, TEST_ROW_KEY_END); - let err = batch.compat_write(&schema_old).unwrap_err(); - assert!( - matches!(err, Error::WriteToOldVersion { .. }), - "err {err} is not WriteToOldVersion", - ); - } - - #[test] - fn test_write_batch_skip_compat() { - let schema = new_test_schema(None); - let mut batch = WriteBatch::new(schema.clone(), TEST_ROW_KEY_END); - batch.compat_write(&schema).unwrap(); - } - - #[test] - fn test_write_batch_compat_columns_not_in_schema() { - let schema_has_column = new_test_schema(Some(None)); - let mut batch = WriteBatch::new(schema_has_column, TEST_ROW_KEY_END); - - let schema_no_column = Arc::new(new_test_schema_builder(None).version(1).build().unwrap()); - let err = batch.compat_write(&schema_no_column).unwrap_err(); - assert!( - matches!(err, Error::NotInSchemaToCompat { .. }), - "err {err} is not NotInSchemaToCompat", - ); - } -} diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index e5b398e2a673..b6caafdad641 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -713,22 +713,6 @@ sync_write = false [datanode.storage] type = "{}" -[datanode.storage.compaction] -max_inflight_tasks = 4 -max_files_in_level0 = 8 -max_purge_tasks = 32 - -[datanode.storage.manifest] -checkpoint_margin = 10 -gc_duration = "10m" -compress = false - -[datanode.storage.flush] -max_flush_tasks = 8 -region_write_buffer_size = "32MiB" -picker_schedule_interval = "5m" -auto_flush_interval = "1h" - [[datanode.region_engine]] [datanode.region_engine.mito]