From 94618bd01b4abefeab8373658a4f39b36d536761 Mon Sep 17 00:00:00 2001
From: Zhenchi
Date: Wed, 27 Dec 2023 10:45:51 +0000
Subject: [PATCH 01/27] index integration

Signed-off-by: Zhenchi
---
 Cargo.lock                                         |  15 +-
 Cargo.toml                                         |   4 +-
 src/datatypes/src/value.rs                         |  19 +-
 src/index/src/inverted_index/create.rs             |   6 +-
 src/index/src/inverted_index/create/sort.rs        |   4 +-
 src/index/src/inverted_index/error.rs              |   2 +-
 .../search/fst_apply/intersection_apply.rs         | 109 +++++----
 .../src/inverted_index/search/index_apply.rs       |   6 +-
 .../search/index_apply/predicates_apply.rs         |  14 +-
 src/mito2/Cargo.toml                               |   3 +
 src/mito2/src/access_layer.rs                      |  32 ++-
 src/mito2/src/error.rs                             |  22 +-
 src/mito2/src/read/scan_region.rs                  |  26 ++-
 src/mito2/src/read/seq_scan.rs                     |  12 +
 src/mito2/src/row_converter.rs                     |   8 +-
 src/mito2/src/sst.rs                               |   2 +
 src/mito2/src/sst/file.rs                          |   5 +
 src/mito2/src/sst/index.rs                         |  17 ++
 src/mito2/src/sst/index/applier.rs                 |  84 +++++++
 src/mito2/src/sst/index/applier/builder.rs         | 151 ++++++++++++
 .../src/sst/index/applier/builder/between.rs       |  64 +++++
 .../sst/index/applier/builder/comparison.rs        | 168 ++++++++++++++
 .../src/sst/index/applier/builder/in_list.rs       |  57 +++++
 .../sst/index/applier/builder/or_eq_list.rs        | 100 ++++++++
 .../sst/index/applier/builder/regex_match.rs       |  45 ++++
 src/mito2/src/sst/index/codec.rs                   |  87 +++++++
 src/mito2/src/sst/index/creator.rs                 | 218 ++++++++++++++++++
 src/mito2/src/sst/location.rs                      |  39 ++++
 src/mito2/src/sst/parquet.rs                       |  19 +-
 src/mito2/src/sst/parquet/reader.rs                |  36 ++-
 src/mito2/src/sst/parquet/writer.rs                |  62 ++++-
 src/puffin/src/file_format/writer.rs               |   8 +-
 src/puffin/src/file_format/writer/file.rs          |  38 +--
 src/store-api/src/metadata.rs                      |   4 +
 34 files changed, 1357 insertions(+), 129 deletions(-)
 create mode 100644 src/mito2/src/sst/index.rs
 create mode 100644 src/mito2/src/sst/index/applier.rs
 create mode 100644 src/mito2/src/sst/index/applier/builder.rs
 create mode 100644 src/mito2/src/sst/index/applier/builder/between.rs
 create mode 100644 src/mito2/src/sst/index/applier/builder/comparison.rs
 create mode 100644 src/mito2/src/sst/index/applier/builder/in_list.rs
 create mode 100644 src/mito2/src/sst/index/applier/builder/or_eq_list.rs
 create mode 100644 src/mito2/src/sst/index/applier/builder/regex_match.rs
 create mode 100644 src/mito2/src/sst/index/codec.rs
 create mode 100644 src/mito2/src/sst/index/creator.rs
 create mode 100644 src/mito2/src/sst/location.rs

diff --git a/Cargo.lock b/Cargo.lock
index 78f7e5c7470e..c6e88a554486 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4006,7 +4006,7 @@ dependencies = [
  "prost 0.12.2",
  "rand",
  "regex",
- "regex-automata 0.1.10",
+ "regex-automata 0.2.0",
  "snafu",
  "tokio",
  "tokio-util",
@@ -4940,6 +4940,7 @@ dependencies = [
  "datatypes",
  "futures",
  "humantime-serde",
+ "index",
  "lazy_static",
  "log-store",
  "memcomparable",
@@ -4948,8 +4949,10 @@ dependencies = [
  "object-store",
  "parquet",
  "paste",
+ "pin-project",
  "prometheus",
  "prost 0.12.2",
+ "puffin",
  "regex",
  "serde",
  "serde_json",
@@ -7097,8 +7100,18 @@ name = "regex-automata"
 version = "0.1.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+dependencies = [
+ "regex-syntax 0.6.29",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e9368763f5a9b804326f3af749e16f9abf378d227bcdee7634b13d8f17793782"
 dependencies = [
  "fst",
+ "memchr",
  "regex-syntax 0.6.29",
 ]

diff --git a/Cargo.toml b/Cargo.toml
index 87985d74935b..541b93ee4798 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -110,7 +110,7 @@ prost = "0.12"
 raft-engine = { git = "https://github.com/tikv/raft-engine.git", rev = "22dfb426cd994602b57725ef080287d3e53db479" }
 rand = "0.8"
 regex = "1.8"
-regex-automata = { version = "0.1", features = ["transducer"] }
+regex-automata = { version = "0.2", features = ["transducer"] }
 reqwest = { version = "0.11", default-features = false, features = [
     "json",
     "rustls-tls-native-roots",
@@ -167,6 +167,8 @@ datanode = { path = "src/datanode" }
 datatypes = { path = "src/datatypes" }
 file-engine = { path = "src/file-engine" }
 frontend = { path = "src/frontend" }
+index = { path = "src/index" }
+puffin = { path = "src/puffin" }
 log-store = { path = "src/log-store" }
 meta-client = { path = "src/meta-client" }
 meta-srv = { path = "src/meta-srv" }

diff --git a/src/datatypes/src/value.rs b/src/datatypes/src/value.rs
index c198fde9a7b1..cef7ea124e62 100644
--- a/src/datatypes/src/value.rs
+++ b/src/datatypes/src/value.rs
@@ -827,8 +827,10 @@ impl TryFrom<ScalarValue> for Value {
             ScalarValue::DurationNanosecond(d) => d
                 .map(|x| Value::Duration(Duration::new(x, TimeUnit::Nanosecond)))
                 .unwrap_or(Value::Null),
-            ScalarValue::Decimal128(_, _, _)
-            | ScalarValue::Decimal256(_, _, _)
+            ScalarValue::Decimal128(v, precision, scale) => v
+                .map(|x| Value::Decimal128(Decimal128::new(x, precision, scale)))
+                .unwrap_or(Value::Null),
+            ScalarValue::Decimal256(_, _, _)
             | ScalarValue::Struct(_, _)
             | ScalarValue::Dictionary(_, _) => {
                 return error::UnsupportedArrowTypeSnafu {
@@ -1474,11 +1476,14 @@ mod tests {
             ScalarValue::DurationNanosecond(None).try_into().unwrap()
         );

-        let result: Result<Value> = ScalarValue::Decimal128(Some(1), 0, 0).try_into();
-        assert!(result
-            .unwrap_err()
-            .to_string()
-            .contains("Unsupported arrow data type, type: Decimal128(0, 0)"));
+        assert_eq!(
+            Value::Decimal128(Decimal128::new(1, 38, 10)),
+            ScalarValue::Decimal128(Some(1), 38, 10).try_into().unwrap()
+        );
+        assert_eq!(
+            Value::Null,
+            ScalarValue::Decimal128(None, 0, 0).try_into().unwrap()
+        );
     }

     #[test]
diff --git a/src/index/src/inverted_index/create.rs b/src/index/src/inverted_index/create.rs
index db6bf1ad2595..15674d696cd6 100644
--- a/src/index/src/inverted_index/create.rs
+++ b/src/index/src/inverted_index/create.rs
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-mod sort;
-mod sort_create;
+pub mod sort;
+pub mod sort_create;

 use async_trait::async_trait;

@@ -23,7 +23,7 @@ use crate::inverted_index::BytesRef;

 /// `InvertedIndexCreator` provides functionality to construct an inverted index
 #[async_trait]
-pub trait InvertedIndexCreator {
+pub trait InvertedIndexCreator: Send {
     /// Adds a value to the named index. A `None` value represents an absence of data (null)
     ///
     /// - `index_name`: Identifier for the index being built
diff --git a/src/index/src/inverted_index/create/sort.rs b/src/index/src/inverted_index/create/sort.rs
index 53a70fc7b5c0..369017835643 100644
--- a/src/index/src/inverted_index/create/sort.rs
+++ b/src/index/src/inverted_index/create/sort.rs
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-mod external_provider;
-mod external_sort;
+pub mod external_provider;
+pub mod external_sort;
 mod intermediate_rw;
 mod merge_stream;

diff --git a/src/index/src/inverted_index/error.rs b/src/index/src/inverted_index/error.rs
index b795e33003b7..230d0569af05 100644
--- a/src/index/src/inverted_index/error.rs
+++ b/src/index/src/inverted_index/error.rs
@@ -113,7 +113,7 @@ pub enum Error {
     #[snafu(display("Failed to parse regex DFA"))]
     ParseDFA {
         #[snafu(source)]
-        error: regex_automata::Error,
+        error: regex_automata::dfa::Error,
         location: Location,
     },

diff --git a/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs b/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs
index a0ae0d7b9afb..1b22d2344a69 100644
--- a/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs
+++ b/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs
@@ -14,7 +14,7 @@

 use fst::map::OpBuilder;
 use fst::{IntoStreamer, Streamer};
-use regex_automata::DenseDFA;
+use regex_automata::dfa::dense::DFA;
 use snafu::{ensure, ResultExt};

 use crate::inverted_index::error::{
@@ -24,15 +24,13 @@ use crate::inverted_index::search::fst_apply::FstApplier;
 use crate::inverted_index::search::predicate::{Predicate, Range};
 use crate::inverted_index::FstMap;

-type Dfa = DenseDFA<Vec<u8>, usize>;
-
 /// `IntersectionFstApplier` applies intersection operations on an FstMap using specified ranges and regex patterns.
 pub struct IntersectionFstApplier {
     /// A list of `Range` which define inclusive or exclusive ranges for keys to be queried in the FstMap.
     ranges: Vec<Range>,

     /// A list of `Dfa` compiled from regular expression patterns.
-    dfas: Vec<Dfa>,
+    dfas: Vec<DFA<Vec<u32>>>,
 }

 impl FstApplier for IntersectionFstApplier {
@@ -88,7 +86,7 @@ impl IntersectionFstApplier {
             match predicate {
                 Predicate::Range(range) => ranges.push(range.range),
                 Predicate::RegexMatch(regex) => {
-                    let dfa = DenseDFA::new(&regex.pattern);
+                    let dfa = DFA::new(&regex.pattern);
                     let dfa = dfa.context(ParseDFASnafu)?;
                     dfas.push(dfa);
                 }
@@ -210,47 +208,66 @@ mod tests {

     #[test]
     fn test_intersection_fst_applier_with_valid_pattern() {
-        let test_fst = FstMap::from_iter([("aa", 1), ("bb", 2), ("cc", 3)]).unwrap();
-
-        let applier = create_applier_from_pattern("a.?").unwrap();
-        let results = applier.apply(&test_fst);
-        assert_eq!(results, vec![1]);
-
-        let applier = create_applier_from_pattern("b.?").unwrap();
-        let results = applier.apply(&test_fst);
-        assert_eq!(results, vec![2]);
-
-        let applier = create_applier_from_pattern("c.?").unwrap();
-        let results = applier.apply(&test_fst);
-        assert_eq!(results, vec![3]);
-
-        let applier = create_applier_from_pattern("a.*").unwrap();
-        let results = applier.apply(&test_fst);
-        assert_eq!(results, vec![1]);
-
-        let applier = create_applier_from_pattern("b.*").unwrap();
-        let results = applier.apply(&test_fst);
-        assert_eq!(results, vec![2]);
-
-        let applier = create_applier_from_pattern("c.*").unwrap();
-        let results = applier.apply(&test_fst);
-        assert_eq!(results, vec![3]);
-
-        let applier = create_applier_from_pattern("d.?").unwrap();
-        let results = applier.apply(&test_fst);
-        assert!(results.is_empty());
-
-        let applier = create_applier_from_pattern("a.?|b.?").unwrap();
-        let results = applier.apply(&test_fst);
-        assert_eq!(results, vec![1, 2]);
-
-        let applier = create_applier_from_pattern("d.?|a.?").unwrap();
-        let results = applier.apply(&test_fst);
-        assert_eq!(results, vec![1]);
-
-        let applier = create_applier_from_pattern(".*").unwrap();
-        let results = applier.apply(&test_fst);
-        assert_eq!(results, vec![1, 2, 3]);
+        let test_fst = FstMap::from_iter([("123", 1), ("abc", 2)]).unwrap();
+
+        let cases = vec![
+            ("1", vec![1]),
+            ("2", vec![1]),
+            ("3", vec![1]),
+            ("^1", vec![1]),
+            ("^2", vec![]),
+            ("^3", vec![]),
+            ("^1.*", vec![1]),
+            ("^.*2", vec![1]),
+            ("^.*3", vec![1]),
+            ("1$", vec![]),
+            ("2$", vec![]),
+            ("3$", vec![1]),
+            ("1.*$", vec![1]),
+            ("2.*$", vec![1]),
+            ("3.*$", vec![1]),
+            ("^1..$", vec![1]),
+            ("^.2.$", vec![1]),
+            ("^..3$", vec![1]),
+            ("^[0-9]", vec![1]),
+            ("^[0-9]+$", vec![1]),
+            ("^[0-9][0-9]$", vec![]),
+            ("^[0-9][0-9][0-9]$", vec![1]),
+            ("^123$", vec![1]),
+            ("a", vec![2]),
+            ("b", vec![2]),
+            ("c", vec![2]),
+            ("^a", vec![2]),
+            ("^b", vec![]),
+            ("^c", vec![]),
+            ("^a.*", vec![2]),
+            ("^.*b", vec![2]),
+            ("^.*c", vec![2]),
+            ("a$", vec![]),
+            ("b$", vec![]),
+            ("c$", vec![2]),
+            ("a.*$", vec![2]),
+            ("b.*$", vec![2]),
+            ("c.*$", vec![2]),
+            ("^.[a-z]", vec![2]),
+            ("^abc$", vec![2]),
+            ("^ab$", vec![]),
+            ("abc$", vec![2]),
+            ("^a.c$", vec![2]),
+            ("^..c$", vec![2]),
+            ("ab", vec![2]),
+            (".*", vec![1, 2]),
+            ("", vec![1, 2]),
+            ("^$", vec![]),
+            ("1|a", vec![1, 2]),
+            ("^123$|^abc$", vec![1, 2]),
+        ];
+
+        for (pattern, expected) in cases {
+            let applier = create_applier_from_pattern(pattern).unwrap();
+            let results = applier.apply(&test_fst);
+            assert_eq!(results, expected);
+        }
     }

     #[test]
diff --git a/src/index/src/inverted_index/search/index_apply.rs b/src/index/src/inverted_index/search/index_apply.rs
index 35d8c387a2d6..6701f03cac3d 100644
--- a/src/index/src/inverted_index/search/index_apply.rs
+++ b/src/index/src/inverted_index/search/index_apply.rs
@@ -14,6 +14,8 @@

 mod predicates_apply;

+use std::collections::BTreeSet;
+
 use async_trait::async_trait;
 pub use predicates_apply::PredicatesIndexApplier;

@@ -25,14 +27,14 @@ use crate::inverted_index::format::reader::InvertedIndexReader;
 /// Applier instances are reusable and work with various `InvertedIndexReader` instances,
 /// avoiding repeated compilation of fixed predicates such as regex patterns.
 #[async_trait]
-pub trait IndexApplier {
+pub trait IndexApplier: Send + Sync {
     /// Applies the predefined predicates to the data read by the given index reader, returning
     /// a list of relevant indices (e.g., post IDs, group IDs, row IDs).
     async fn apply(
         &self,
         context: SearchContext,
         reader: &mut dyn InvertedIndexReader,
-    ) -> Result<Vec<usize>>;
+    ) -> Result<BTreeSet<usize>>;
 }

 /// A context for searching the inverted index.
diff --git a/src/index/src/inverted_index/search/index_apply/predicates_apply.rs b/src/index/src/inverted_index/search/index_apply/predicates_apply.rs
index e2bea2756a7f..2331d8af6ffd 100644
--- a/src/index/src/inverted_index/search/index_apply/predicates_apply.rs
+++ b/src/index/src/inverted_index/search/index_apply/predicates_apply.rs
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
+use std::collections::BTreeSet;
+
 use async_trait::async_trait;
 use common_base::BitVec;
 use greptime_proto::v1::index::InvertedIndexMetas;
@@ -45,7 +47,7 @@ impl IndexApplier for PredicatesIndexApplier {
         &self,
         context: SearchContext,
         reader: &mut dyn InvertedIndexReader,
-    ) -> Result<Vec<usize>> {
+    ) -> Result<BTreeSet<usize>> {
         let metadata = reader.metadata().await?;

         let mut bitmap = Self::bitmap_full_range(&metadata);
@@ -58,7 +60,7 @@ impl IndexApplier for PredicatesIndexApplier {
             let Some(meta) = metadata.metas.get(name) else {
                 match context.index_not_found_strategy {
                     IndexNotFoundStrategy::ReturnEmpty => {
-                        return Ok(vec![]);
+                        return Ok(BTreeSet::default());
                     }
                     IndexNotFoundStrategy::Ignore => {
                         continue;
@@ -197,7 +199,7 @@ mod tests {
             .apply(SearchContext::default(), &mut mock_reader)
             .await
             .unwrap();
-        assert_eq!(indices, vec![0, 2, 4, 6]);
+        assert_eq!(indices, BTreeSet::from_iter([0, 2, 4, 6]));

         // An index reader with a single tag "tag-0" but without value "tag-0_value-0"
         let mut mock_reader = MockInvertedIndexReader::new();
@@ -251,7 +253,7 @@ mod tests {
             .apply(SearchContext::default(), &mut mock_reader)
             .await
             .unwrap();
-        assert_eq!(indices, vec![0, 4, 6]);
+        assert_eq!(indices, BTreeSet::from_iter([0, 4, 6]));
     }

     #[tokio::test]
@@ -269,7 +271,7 @@ mod tests {
             .apply(SearchContext::default(), &mut mock_reader)
             .await
             .unwrap();
-        assert_eq!(indices, vec![0, 1, 2, 3, 4, 5, 6, 7]); // full range to scan
+        assert_eq!(indices, BTreeSet::from_iter([0, 1, 2, 3, 4, 5, 6, 7])); // full range to scan
     }

     #[tokio::test]
@@ -341,6 +343,6 @@ mod tests {
             )
             .await
             .unwrap();
-        assert_eq!(indices, vec![0, 1, 2, 3, 4, 5, 6, 7]);
+        assert_eq!(indices, BTreeSet::from_iter([0, 1, 2, 3, 4, 5, 6, 7]));
     }
 }
diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml
index 0ee17bb85861..2a5ba3fbb2a1 100644
--- a/src/mito2/Cargo.toml
+++ b/src/mito2/Cargo.toml
@@ -39,6 +39,7 @@ datafusion.workspace = true
 datatypes.workspace = true
 futures.workspace = true
 humantime-serde.workspace = true
+index.workspace = true
 lazy_static = "1.4"
 log-store = { workspace = true, optional = true }
 memcomparable = "0.2"
@@ -62,6 +63,8 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 tokio.workspace = true
 uuid.workspace = true
+pin-project.workspace = true
+puffin.workspace = true

 [dev-dependencies]
 common-procedure-test.workspace = true
diff --git a/src/mito2/src/access_layer.rs b/src/mito2/src/access_layer.rs
index 4c0e8bbde50b..7ef6909ad9b1 100644
--- a/src/mito2/src/access_layer.rs
+++ b/src/mito2/src/access_layer.rs
@@ -14,13 +14,14 @@

 use std::sync::Arc;

-use object_store::{util, ObjectStore};
+use object_store::{ErrorKind, ObjectStore};
 use snafu::ResultExt;
 use store_api::metadata::RegionMetadataRef;

-use crate::error::{DeleteSstSnafu, Result};
+use crate::error::{DeleteSstSnafu, OpenDalSnafu, Result};
 use crate::read::Source;
 use crate::sst::file::{FileHandle, FileId};
+use crate::sst::location;
 use crate::sst::parquet::reader::ParquetReaderBuilder;
 use crate::sst::parquet::writer::ParquetWriter;

@@ -61,11 +62,18 @@ impl AccessLayer {

     /// Deletes a SST file with given file id.
     pub(crate) async fn delete_sst(&self, file_id: FileId) -> Result<()> {
-        let path = self.sst_file_path(&file_id.as_parquet());
+        let sst_path = location::sst_file_path(&self.region_dir, &file_id);
         self.object_store
-            .delete(&path)
+            .delete(&sst_path)
             .await
-            .context(DeleteSstSnafu { file_id })
+            .context(DeleteSstSnafu { file_id })?;
+
+        let index_path = location::index_file_path(&self.region_dir, &file_id);
+        self.object_store
+            .delete(&index_path)
+            .await
+            .context(OpenDalSnafu)
+            .or_else(|e| e.is_object_not_found().then_some(()).ok_or(e))
     }

     /// Returns a reader builder for specific `file`.
@@ -81,12 +89,12 @@ impl AccessLayer {
         metadata: RegionMetadataRef,
         source: Source,
     ) -> ParquetWriter {
-        let path = self.sst_file_path(&file_id.as_parquet());
-        ParquetWriter::new(path, metadata, source, self.object_store.clone())
-    }
-
-    /// Returns the `file_path` for the `file_name` in the object store.
-    fn sst_file_path(&self, file_name: &str) -> String {
-        util::join_path(&self.region_dir, file_name)
+        ParquetWriter::new(
+            self.region_dir.clone(),
+            file_id,
+            metadata,
+            source,
+            self.object_store.clone(),
+        )
     }
 }
diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs
index d29450e50a22..f23dd11612a8 100644
--- a/src/mito2/src/error.rs
+++ b/src/mito2/src/error.rs
@@ -399,6 +399,23 @@ pub enum Error {
         error: ArrowError,
         location: Location,
     },
+
+    #[snafu(display("Column not found, column: {column}"))]
+    ColumnNotFound { column: String, location: Location },
+
+    #[snafu(display("Failed to build index applier"))]
+    BuildIndexApplier {
+        #[snafu(source)]
+        error: index::inverted_index::error::Error,
+        location: Location,
+    },
+
+    #[snafu(display("Failed to push index value"))]
+    PushIndexValue {
+        #[snafu(source)]
+        error: index::inverted_index::error::Error,
+        location: Location,
+    },
 }

 pub type Result<T> = std::result::Result<T, Error>;
@@ -443,13 +460,16 @@ impl ErrorExt for Error {
             | InvalidRequest { .. }
             | FillDefault { .. }
             | ConvertColumnDataType { .. }
+            | ColumnNotFound { .. }
+            | BuildIndexApplier { .. }
             | InvalidMetadata { .. } => StatusCode::InvalidArguments,
             RegionMetadataNotFound { .. }
             | Join { .. }
             | WorkerStopped { .. }
             | Recv { .. }
             | EncodeWal { .. }
-            | DecodeWal { .. } => StatusCode::Internal,
+            | DecodeWal { .. }
+            | PushIndexValue { .. } => StatusCode::Internal,
             WriteBuffer { source, .. } => source.status_code(),
             WriteGroup { source, .. } => source.status_code(),
             FieldTypeMismatch { source, .. } => source.status_code(),
diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs
index 4a8c7028357b..d1e74639175d 100644
--- a/src/mito2/src/read/scan_region.rs
+++ b/src/mito2/src/read/scan_region.rs
@@ -14,9 +14,12 @@

 //! Scans a region according to the scan request.

+use std::sync::Arc;
+
 use common_recordbatch::SendableRecordBatchStream;
-use common_telemetry::debug;
+use common_telemetry::{debug, logging};
 use common_time::range::TimestampRange;
+use index::inverted_index::search::index_apply::IndexApplier;
 use store_api::storage::ScanRequest;
 use table::predicate::{Predicate, TimeRangePredicateBuilder};

@@ -27,6 +30,8 @@ use crate::read::projection::ProjectionMapper;
 use crate::read::seq_scan::SeqScan;
 use crate::region::version::VersionRef;
 use crate::sst::file::FileHandle;
+use crate::sst::index::applier::builder::SstIndexApplierBuilder;
+use crate::sst::index::applier::SstIndexApplier;

 /// A scanner scans a region and returns a [SendableRecordBatchStream].
 pub(crate) enum Scanner {
@@ -194,6 +199,7 @@ impl ScanRegion {
             total_ssts
         );

+        let index_applier = self.build_index_applier();
         let predicate = Predicate::new(self.request.filters.clone());
         // The mapper always computes projected column ids as the schema of SSTs may change.
         let mapper = match &self.request.projection {
@@ -204,6 +210,7 @@
         let seq_scan = SeqScan::new(self.access_layer.clone(), mapper)
             .with_time_range(Some(time_range))
             .with_predicate(Some(predicate))
+            .with_index_applier(index_applier)
             .with_memtables(memtables)
             .with_files(files)
             .with_cache(self.cache_manager)
@@ -224,6 +231,23 @@
         TimeRangePredicateBuilder::new(&time_index.column_schema.name, unit, &self.request.filters)
             .build()
     }
+
+    /// Use the latest schema to build the index applier.
+    ///
+    /// To use this fixed schema to apply to different versions of SSTs, we have to make sure:
+    /// 1. Type of column cannot be changed.
+    /// 2. Column cannot be renamed.
+    fn build_index_applier(&self) -> Option<SstIndexApplier> {
+        SstIndexApplierBuilder::new(
+            self.access_layer.region_dir().to_string(),
+            self.access_layer.object_store().clone(),
+            self.version.metadata.as_ref(),
+        )
+        .build(&self.request.filters)
+        .inspect_err(|e| logging::warn!("Failed to build index applier: {}", e))
+        .ok()
+        .flatten()
+    }
 }

 /// Config for parallel scan.
diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs
index f963568cad59..b22f66c3382a 100644
--- a/src/mito2/src/read/seq_scan.rs
+++ b/src/mito2/src/read/seq_scan.rs
@@ -23,6 +23,7 @@ use common_recordbatch::error::ExternalSnafu;
 use common_recordbatch::{RecordBatch, RecordBatchStreamWrapper, SendableRecordBatchStream};
 use common_telemetry::{debug, error};
 use common_time::range::TimestampRange;
+use index::inverted_index::search::index_apply::IndexApplier;
 use snafu::ResultExt;
 use table::predicate::Predicate;
 use tokio::sync::{mpsc, Semaphore};
@@ -39,6 +40,7 @@ use crate::read::projection::ProjectionMapper;
 use crate::read::scan_region::ScanParallism;
 use crate::read::{BatchReader, BoxedBatchReader, BoxedBatchStream, Source};
 use crate::sst::file::FileHandle;
+use crate::sst::index::applier::SstIndexApplier;

 /// Scans a region and returns rows in a sorted sequence.
 ///
@@ -62,6 +64,8 @@ pub struct SeqScan {
     ignore_file_not_found: bool,
     /// Parallelism to scan data.
     parallelism: ScanParallism,
+
+    index_applier: Option<SstIndexApplier>,
 }

 impl SeqScan {
@@ -73,6 +77,7 @@ impl SeqScan {
             mapper: Arc::new(mapper),
             time_range: None,
             predicate: None,
+            index_applier: None,
             memtables: Vec::new(),
             files: Vec::new(),
             cache_manager: None,
@@ -95,6 +100,12 @@ impl SeqScan {
         self
     }

+    #[must_use]
+    pub(crate) fn with_index_applier(mut self, index_applier: Option<SstIndexApplier>) -> Self {
+        self.index_applier = index_applier;
+        self
+    }
+
     /// Sets memtables to read.
     #[must_use]
     pub(crate) fn with_memtables(mut self, memtables: Vec<MemtableRef>) -> Self {
@@ -210,6 +221,7 @@ impl SeqScan {
             .access_layer
             .read_sst(file.clone())
             .predicate(self.predicate.clone())
+            .index_applier(self.index_applier.clone())
             .time_range(self.time_range)
             .projection(Some(self.mapper.column_ids().to_vec()))
             .cache(self.cache_manager.clone())
diff --git a/src/mito2/src/row_converter.rs b/src/mito2/src/row_converter.rs
index 4cc6fd3274ac..f22022e39ab8 100644
--- a/src/mito2/src/row_converter.rs
+++ b/src/mito2/src/row_converter.rs
@@ -48,7 +48,7 @@ pub trait RowCodec {
     fn decode(&self, bytes: &[u8]) -> Result<Vec<Value>>;
 }

-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct SortField {
     data_type: ConcreteDataType,
 }
@@ -84,7 +84,11 @@ impl SortField {
 }

 impl SortField {
-    fn serialize(&self, serializer: &mut Serializer<&mut Vec<u8>>, value: &ValueRef) -> Result<()> {
+    pub(crate) fn serialize(
+        &self,
+        serializer: &mut Serializer<&mut Vec<u8>>,
+        value: &ValueRef,
+    ) -> Result<()> {
         macro_rules! cast_value_and_serialize {
             (
                 $self: ident;
diff --git a/src/mito2/src/sst.rs b/src/mito2/src/sst.rs
index 32c7b4951a55..94e0cb205bc2 100644
--- a/src/mito2/src/sst.rs
+++ b/src/mito2/src/sst.rs
@@ -16,5 +16,7 @@

 pub mod file;
 pub mod file_purger;
+pub mod index;
+pub mod location;
 pub mod parquet;
 pub(crate) mod version;
diff --git a/src/mito2/src/sst/file.rs b/src/mito2/src/sst/file.rs
index a16987690d09..a68a5d81c87d 100644
--- a/src/mito2/src/sst/file.rs
+++ b/src/mito2/src/sst/file.rs
@@ -57,6 +57,11 @@ impl FileId {
     pub fn as_parquet(&self) -> String {
         format!("{}{}", self, ".parquet")
     }
+
+    /// Append `.puffin` to file id to make a complete file name
+    pub fn as_puffin(&self) -> String {
+        format!("{}{}", self, ".puffin")
+    }
 }

 impl fmt::Display for FileId {
diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs
new file mode 100644
index 000000000000..c26e3aee1dae
--- /dev/null
+++ b/src/mito2/src/sst/index.rs
@@ -0,0 +1,17 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pub mod applier;
+mod codec;
+pub mod creator;
diff --git a/src/mito2/src/sst/index/applier.rs b/src/mito2/src/sst/index/applier.rs
new file mode 100644
index 000000000000..9a5d72b6eb7d
--- /dev/null
+++ b/src/mito2/src/sst/index/applier.rs
@@ -0,0 +1,84 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+pub mod builder;
+
+use std::collections::BTreeSet;
+use std::sync::Arc;
+
+use index::inverted_index::format::reader::InvertedIndexBlobReader;
+use index::inverted_index::search::index_apply::{
+    IndexApplier, IndexNotFoundStrategy, SearchContext,
+};
+use object_store::ObjectStore;
+use puffin::file_format::reader::{PuffinAsyncReader, PuffinFileReader};
+use snafu::ResultExt;
+
+use crate::error::{OpenDalSnafu, Result};
+use crate::sst::file::FileId;
+use crate::sst::location;
+
+#[derive(Clone)]
+pub struct SstIndexApplier {
+    region_dir: String,
+    object_store: ObjectStore,
+
+    index_applier: Arc<dyn IndexApplier>,
+}
+
+impl SstIndexApplier {
+    pub fn new(
+        region_dir: String,
+        object_store: ObjectStore,
+        index_applier: Arc<dyn IndexApplier>,
+    ) -> Self {
+        Self {
+            region_dir,
+            object_store,
+            index_applier,
+        }
+    }
+
+    pub async fn apply(&self, file_id: FileId) -> Result<BTreeSet<usize>> {
+        let file_name = location::index_file_path(&self.region_dir, &file_id);
+
+        let file_reader = self
+            .object_store
+            .reader(&file_name)
+            .await
+            .context(OpenDalSnafu)?;
+        let mut puffin_reader = PuffinFileReader::new(file_reader);
+
+        let file_meta = puffin_reader.metadata().await.unwrap();
+        let blob_meta = file_meta
+            .blobs
+            .iter()
+            .find(|blob| blob.blob_type == "greptime-inverted-index-v1".to_string())
+            .unwrap();
+
+        let blob_reader = puffin_reader.blob_reader(blob_meta).unwrap();
+        let mut index_reader = InvertedIndexBlobReader::new(blob_reader);
+
+        let context = SearchContext {
+            index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty,
+        };
+        let res = self
+            .index_applier
+            .apply(context, &mut index_reader)
+            .await
+            .unwrap();
+
+        Ok(res)
+    }
+}
diff --git a/src/mito2/src/sst/index/applier/builder.rs b/src/mito2/src/sst/index/applier/builder.rs
new file mode 100644
index 000000000000..4df9e6c43620
--- /dev/null
+++ b/src/mito2/src/sst/index/applier/builder.rs
@@ -0,0 +1,151 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+mod between;
+mod comparison;
+mod in_list;
+mod or_eq_list;
+mod regex_match;
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use common_query::logical_plan::Expr;
+use datafusion_common::ScalarValue;
+use datafusion_expr::{BinaryExpr, Expr as DfExpr, Operator};
+use datatypes::data_type::ConcreteDataType;
+use datatypes::schema::Schema;
+use datatypes::value::Value;
+use index::inverted_index::search::index_apply::PredicatesIndexApplier;
+use index::inverted_index::search::predicate::Predicate;
+use object_store::ObjectStore;
+use snafu::{OptionExt, ResultExt};
+use store_api::metadata::{ColumnMetadata, RegionMetadata};
+
+use crate::error::{BuildIndexApplierSnafu, ColumnNotFoundSnafu, Result};
+use crate::row_converter::SortField;
+use crate::sst::index::applier::SstIndexApplier;
+use crate::sst::index::codec::IndexValueCodec;
+
+type ColumnName = String;
+
+pub struct SstIndexApplierBuilder<'a> {
+    region_dir: String,
+    object_store: ObjectStore,
+    metadata: &'a RegionMetadata,
+    output: HashMap<ColumnName, Vec<Predicate>>,
+}
+
+impl<'a> SstIndexApplierBuilder<'a> {
+    pub fn new(
+        region_dir: String,
+        object_store: ObjectStore,
+        metadata: &'a RegionMetadata,
+    ) -> Self {
+        Self {
+            region_dir,
+            object_store,
+            metadata,
+            output: HashMap::default(),
+        }
+    }
+
+    pub fn build(mut self, exprs: &[Expr]) -> Result<Option<SstIndexApplier>> {
+        for expr in exprs {
+            self.traverse_and_collect(expr.df_expr())?;
+        }
+
+        if self.output.is_empty() {
+            return Ok(None);
+        }
+
+        let predicates = self.output.into_iter().collect::<Vec<_>>();
+        let applier =
+            PredicatesIndexApplier::try_from(predicates).context(BuildIndexApplierSnafu)?;
+        Ok(Some(SstIndexApplier::new(
+            self.region_dir,
+            self.object_store,
+            Arc::new(applier),
+        )))
+    }
+
+    fn traverse_and_collect(&mut self, expr: &DfExpr) -> Result<()> {
+        match expr {
+            DfExpr::Between(between) => self.collect_between(between),
+            DfExpr::InList(in_list) => self.collect_inlist(in_list),
+
+            DfExpr::BinaryExpr(BinaryExpr {
+                left,
+                op: Operator::And,
+                right,
+            }) => {
+                self.traverse_and_collect(left)?;
+                self.traverse_and_collect(right)
+            }
+
+            DfExpr::BinaryExpr(BinaryExpr {
+                left,
+                op: Operator::Or,
+                right,
+            }) => self.collect_or_eq_list(left, right),
+
+            DfExpr::BinaryExpr(BinaryExpr {
+                left,
+                op: Operator::RegexMatch,
+                right,
+            }) => self.collect_regex_match(left, right),
+
+            DfExpr::BinaryExpr(
+                b @ BinaryExpr {
+                    left,
+                    op: Operator::Eq | Operator::Lt | Operator::LtEq | Operator::Gt | Operator::GtEq,
+                    right,
+                },
+            ) => self.collect_comparison_expr(left, &b.op, right),
+
+            // TODO(zhongzc): support more expressions, e.g. IsNull, IsNotNull, ...
+            _ => Ok(()),
+        }
+    }
+
+    fn add_predicate(&mut self, column_name: &str, predicate: Predicate) {
+        match self.output.get_mut(column_name) {
+            Some(predicates) => predicates.push(predicate),
+            None => {
+                self.output.insert(column_name.to_string(), vec![predicate]);
+            }
+        }
+    }
+
+    fn tag_column_type(&self, column_name: &str) -> Result<Option<ConcreteDataType>> {
+        let column = self
+            .metadata
+            .column_by_name(column_name)
+            .context(ColumnNotFoundSnafu {
+                column: column_name,
+            })?;
+
+        Ok(column
+            .is_tag()
+            .then_some(column.column_schema.data_type.clone()))
+    }
+
+    fn encode_lit(lit: &ScalarValue, data_type: ConcreteDataType) -> Result<Vec<u8>> {
+        let value = Value::try_from(lit.clone()).unwrap();
+        let mut bytes = vec![];
+        let field = SortField::new(data_type);
+        IndexValueCodec::encode_value(value.as_value_ref(), &field, &mut bytes)?;
+        Ok(bytes)
+    }
+}
diff --git a/src/mito2/src/sst/index/applier/builder/between.rs b/src/mito2/src/sst/index/applier/builder/between.rs
new file mode 100644
index 000000000000..26e30bcbe1d0
--- /dev/null
+++ b/src/mito2/src/sst/index/applier/builder/between.rs
@@ -0,0 +1,64 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use datafusion_expr::{Between, Expr as DfExpr};
+use index::inverted_index::search::predicate::{Bound, Predicate, Range, RangePredicate};
+
+use crate::error::Result;
+use crate::sst::index::applier::builder::SstIndexApplierBuilder;
+
+impl<'a> SstIndexApplierBuilder<'a> {
+    /// ```sql
+    /// column_name BETWEEN literal1 AND literal2
+    /// ```
+    pub(crate) fn collect_between(&mut self, between: &Between) -> Result<()> {
+        if between.negated {
+            return Ok(());
+        }
+
+        let DfExpr::Column(c) = between.expr.as_ref() else {
+            return Ok(());
+        };
+        let low = match between.low.as_ref() {
+            DfExpr::Literal(lit) if !lit.is_null() => lit,
+            _ => return Ok(()),
+        };
+        let high = match between.high.as_ref() {
+            DfExpr::Literal(lit) if !lit.is_null() => lit,
+            _ => return Ok(()),
+        };
+
+        let Some(data_type) = self.tag_column_type(&c.name)? else {
+            return Ok(());
+        };
+        let low = Self::encode_lit(low, data_type.clone())?;
+        let high = Self::encode_lit(high, data_type)?;
+
+        let predicate = Predicate::Range(RangePredicate {
+            range: Range {
+                lower: Some(Bound {
+                    inclusive: true,
+                    value: low,
+                }),
+                upper: Some(Bound {
+                    inclusive: true,
+                    value: high,
+                }),
+            },
+        });
+
+        self.add_predicate(&c.name, predicate);
+        Ok(())
+    }
+}
diff --git a/src/mito2/src/sst/index/applier/builder/comparison.rs b/src/mito2/src/sst/index/applier/builder/comparison.rs
new file mode 100644
index 000000000000..c640bfe40a36
--- /dev/null
+++ b/src/mito2/src/sst/index/applier/builder/comparison.rs
@@ -0,0 +1,168 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::iter;
+
+use datafusion_expr::{Expr as DfExpr, Operator};
+use index::inverted_index::search::predicate::{
+    Bound, InListPredicate, Predicate, Range, RangePredicate,
+};
+use index::inverted_index::Bytes;
+
+use crate::error::Result;
+use crate::sst::index::applier::builder::SstIndexApplierBuilder;
+
+impl<'a> SstIndexApplierBuilder<'a> {
+    pub(crate) fn collect_comparison_expr(
+        &mut self,
+        left: &DfExpr,
+        op: &Operator,
+        right: &DfExpr,
+    ) -> Result<()> {
+        match op {
+            Operator::Eq => self.collect_eq(left, right),
+            Operator::Lt => {
+                if matches!(right, DfExpr::Column(_)) {
+                    self.collect_gt(right, left)
+                } else {
+                    self.collect_lt(left, right)
+                }
+            }
+            Operator::LtEq => {
+                if matches!(right, DfExpr::Column(_)) {
+                    self.collect_ge(right, left)
+                } else {
+                    self.collect_le(left, right)
+                }
+            }
+            Operator::Gt => {
+                if matches!(right, DfExpr::Column(_)) {
+                    self.collect_lt(right, left)
+                } else {
+                    self.collect_gt(left, right)
+                }
+            }
+            Operator::GtEq => {
+                if matches!(right, DfExpr::Column(_)) {
+                    self.collect_le(right, left)
+                } else {
+                    self.collect_ge(left, right)
+                }
+            }
+            _ => Ok(()),
+        }
+    }
+
+    /// ```sql
+    /// column_name = literal
+    /// # or
+    /// literal = column_name
+    /// ```
+    fn collect_eq(&mut self, left: &DfExpr, right: &DfExpr) -> Result<()> {
+        let (column, lit) = match (left, right) {
+            (DfExpr::Column(c), DfExpr::Literal(lit)) if !lit.is_null() => (c, lit),
+            (DfExpr::Literal(lit), DfExpr::Column(c)) if !lit.is_null() => (c, lit),
+            _ => return Ok(()),
+        };
+
+        let Some(data_type) = self.tag_column_type(&column.name)? else {
+            return Ok(());
+        };
+        let bytes = Self::encode_lit(lit, data_type)?;
+
+        let predicate = Predicate::InList(InListPredicate {
+            list: iter::once(bytes).collect(),
+        });
+        self.add_predicate(&column.name, predicate);
+        Ok(())
+    }
+
+    /// ```sql
+    /// column_name < literal
+    /// ```
+    fn collect_lt(&mut self, left: &DfExpr, right: &DfExpr) -> Result<()> {
+        self.collect_cmp(left, right, |value| Range {
+            lower: None,
+            upper: Some(Bound {
+                inclusive: false,
+                value,
+            }),
+        })
+    }
+
+    /// ```sql
+    /// column_name > literal
+    /// ```
+    fn collect_gt(&mut self, left: &DfExpr, right: &DfExpr) -> Result<()> {
+        self.collect_cmp(left, right, |value| Range {
+            lower: Some(Bound {
+                inclusive: false,
+                value,
+            }),
+            upper: None,
+        })
+    }
+
+    /// ```sql
+    /// column_name <= literal
+    /// ```
+    fn collect_le(&mut self, left: &DfExpr, right: &DfExpr) -> Result<()> {
+        self.collect_cmp(left, right, |value| Range {
+            lower: None,
+            upper: Some(Bound {
+                inclusive: true,
+                value,
+            }),
+        })
+    }
+
+    /// ```sql
+    /// column_name >= literal
+    /// ```
+    fn collect_ge(&mut self, left: &DfExpr, right: &DfExpr) -> Result<()> {
+        self.collect_cmp(left, right, |value| Range {
+            lower: Some(Bound {
+                inclusive: true,
+                value,
+            }),
+            upper: None,
+        })
+    }
+
+    fn collect_cmp(
+        &mut self,
+        left: &DfExpr,
+        right: &DfExpr,
+        range: impl FnOnce(Bytes) -> Range,
+    ) -> Result<()> {
+        let DfExpr::Column(c) = left else {
+            return Ok(());
+        };
+        let lit = match right {
+            DfExpr::Literal(lit) if !lit.is_null() => lit,
+            _ => return Ok(()),
+        };
+
+        let Some(data_type) = self.tag_column_type(&c.name)? else {
+            return Ok(());
+        };
+        let bytes = Self::encode_lit(lit, data_type)?;
+
+        let predicate = Predicate::Range(RangePredicate {
+            range: range(bytes),
+        });
+        self.add_predicate(&c.name, predicate);
+        Ok(())
+    }
+}
diff --git a/src/mito2/src/sst/index/applier/builder/in_list.rs b/src/mito2/src/sst/index/applier/builder/in_list.rs
new file mode 100644
index 000000000000..3b5aaf3f3ab6
--- /dev/null
+++ b/src/mito2/src/sst/index/applier/builder/in_list.rs
@@ -0,0 +1,57 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashSet;
+
+use datafusion_expr::expr::InList;
+use datafusion_expr::Expr as DfExpr;
+use index::inverted_index::search::predicate::{InListPredicate, Predicate};
+
+use crate::error::Result;
+use crate::sst::index::applier::builder::SstIndexApplierBuilder;
+
+impl<'a> SstIndexApplierBuilder<'a> {
+    /// ```sql
+    /// column_name IN (literal1, literal2, ...)
+    /// ```
+    pub(crate) fn collect_inlist(&mut self, inlist: &InList) -> Result<()> {
+        if inlist.negated {
+            return Ok(());
+        }
+
+        let DfExpr::Column(c) = inlist.expr.as_ref() else {
+            return Ok(());
+        };
+
+        let mut predicate = InListPredicate {
+            list: HashSet::with_capacity(inlist.list.len()),
+        };
+
+        let Some(data_type) = self.tag_column_type(&c.name)? else {
+            return Ok(());
+        };
+        for lit in &inlist.list {
+            let lit = match lit {
+                DfExpr::Literal(lit) if !lit.is_null() => lit,
+                _ => return Ok(()),
+            };
+
+            let bytes = Self::encode_lit(lit, data_type.clone())?;
+            predicate.list.insert(bytes);
+        }
+
+        self.add_predicate(&c.name, Predicate::InList(predicate));
+        Ok(())
+    }
+}
diff --git a/src/mito2/src/sst/index/applier/builder/or_eq_list.rs b/src/mito2/src/sst/index/applier/builder/or_eq_list.rs
new file mode 100644
index 000000000000..f75612f8a601
--- /dev/null
+++ b/src/mito2/src/sst/index/applier/builder/or_eq_list.rs
@@ -0,0 +1,100 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashSet;
+
+use datafusion_expr::{BinaryExpr, Expr as DfExpr, Operator};
+use datatypes::data_type::ConcreteDataType;
+use index::inverted_index::search::predicate::{InListPredicate, Predicate};
+use index::inverted_index::Bytes;
+
+use crate::error::Result;
+use crate::sst::index::applier::builder::SstIndexApplierBuilder;
+
+impl<'a> SstIndexApplierBuilder<'a> {
+    /// ```sql
+    /// column_name = literal1 OR column_name = literal2 OR ...
+    /// ```
+    pub(crate) fn collect_or_eq_list(&mut self, eq_expr: &DfExpr, or_list: &DfExpr) -> Result<()> {
+        let (column_name, lit) = match eq_expr {
+            DfExpr::BinaryExpr(BinaryExpr {
+                left,
+                op: Operator::Eq,
+                right,
+            }) => match (left.as_ref(), right.as_ref()) {
+                // column_name = literal
+                (DfExpr::Column(c), DfExpr::Literal(lit)) if !lit.is_null() => (&c.name, lit),
+                // literal = column_name
+                (DfExpr::Literal(lit), DfExpr::Column(c)) if !lit.is_null() => (&c.name, lit),
+                _ => return Ok(()),
+            },
+            _ => return Ok(()),
+        };
+
+        let Some(data_type) = self.tag_column_type(column_name)? else {
+            return Ok(());
+        };
+        let bytes = Self::encode_lit(lit, data_type.clone())?;
+
+        let mut inlist = HashSet::from_iter([bytes]);
+        if Self::collect_eq_list_inner(column_name, &data_type, or_list, &mut inlist)? {
+            let predicate = Predicate::InList(InListPredicate { list: inlist });
+            self.add_predicate(column_name, predicate);
+        }
+
+        Ok(())
+    }
+
+    fn collect_eq_list_inner(
+        column_name: &str,
+        data_type: &ConcreteDataType,
+        expr: &DfExpr,
+        inlist: &mut HashSet<Bytes>,
+    ) -> Result<bool> {
+        let DfExpr::BinaryExpr(BinaryExpr { left, op, right }) = expr else {
+            return Ok(false);
+        };
+        match op {
+            Operator::Eq => {
+                let lit = match (left.as_ref(), right.as_ref()) {
+                    (DfExpr::Column(c), DfExpr::Literal(lit))
+                        if c.name == column_name && !lit.is_null() =>
+                    {
+                        lit
+                    }
+                    (DfExpr::Literal(lit), DfExpr::Column(c))
+                        if c.name == column_name && !lit.is_null() =>
+                    {
+                        lit
+                    }
+                    _ => return Ok(false),
+                };
+                let bytes = Self::encode_lit(lit, data_type.clone())?;
+                inlist.insert(bytes);
+                Ok(true)
+            }
+
+            Operator::Or => {
+                let (left, right) = (left.as_ref(), right.as_ref());
+                if Self::collect_eq_list_inner(column_name, data_type, left, inlist)? {
+                    Self::collect_eq_list_inner(column_name, data_type, right, inlist)
+                } else {
+                    Ok(false)
+                }
+            }
+
+            _ => return Ok(false),
+        }
+    }
+}
diff --git a/src/mito2/src/sst/index/applier/builder/regex_match.rs b/src/mito2/src/sst/index/applier/builder/regex_match.rs
new file mode 100644
index 000000000000..c6ccc8bdf09b
--- /dev/null
+++ b/src/mito2/src/sst/index/applier/builder/regex_match.rs
@@ -0,0 +1,45 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use datafusion_common::ScalarValue;
+use datafusion_expr::Expr as DfExpr;
+use index::inverted_index::search::predicate::{Predicate, RegexMatchPredicate};
+
+use crate::error::Result;
+use crate::sst::index::applier::builder::SstIndexApplierBuilder;
+
+impl<'a> SstIndexApplierBuilder<'a> {
+    /// ```sql
+    /// column_name REGEXP literal
+    /// ```
+    pub(crate) fn collect_regex_match(&mut self, left: &DfExpr, right: &DfExpr) -> Result<()> {
+        let (column, pattern) = match (left, right) {
+            (DfExpr::Column(c), DfExpr::Literal(ScalarValue::Utf8(Some(pattern)))) => (c, pattern),
+            _ => return Ok(()),
+        };
+
+        let Some(data_type) = self.tag_column_type(&column.name)? else {
+            return Ok(());
+        };
+        if !data_type.is_string() {
+            return Ok(());
+        }
+
+        let predicate = Predicate::RegexMatch(RegexMatchPredicate {
+            pattern: pattern.clone(),
+        });
+        self.add_predicate(&column.name, predicate);
+        Ok(())
+    }
+}
diff --git a/src/mito2/src/sst/index/codec.rs b/src/mito2/src/sst/index/codec.rs
new file mode 100644
index 000000000000..71526f76cdbd
--- /dev/null
+++ b/src/mito2/src/sst/index/codec.rs
@@ -0,0 +1,87 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::iter;
+use std::pin::{pin, Pin};
+
+use datatypes::data_type::ConcreteDataType;
+use datatypes::value::{Value, ValueRef};
+use futures::Future;
+use index::inverted_index::BytesRef;
+use memcomparable::Serializer;
+use pin_project::pin_project;
+use store_api::metadata::ColumnMetadata;
+
+use crate::error::Result;
+use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
+
+type ColumnName = String;
+
+pub struct IndexValueCodec;
+
+impl IndexValueCodec {
+    pub fn encode_value(value: ValueRef, field: &SortField, buffer: &mut Vec<u8>) -> Result<()> {
+        buffer.reserve(field.estimated_size());
+        let mut serializer = Serializer::new(buffer);
+        field.serialize(&mut serializer, &value)
+    }
+}
+
+#[pin_project]
+pub struct IndexValuesCodec {
+    column_names: Vec<ColumnName>,
+    fields: Vec<SortField>,
+    decoder: McmpRowCodec,
+}
+
+impl IndexValuesCodec {
+    pub fn from_tag_columns<'a>(tag_columns: impl Iterator<Item = &'a ColumnMetadata>) -> Self {
+        let (column_names, fields): (Vec<_>, Vec<_>) = tag_columns
+            .map(|column| {
+                (
+                    column.column_schema.name.clone(),
+                    SortField::new(column.column_schema.data_type.clone()),
+                )
+            })
+            .unzip();
+
+        let decoder = McmpRowCodec::new(fields.clone());
+        Self {
+            column_names,
+            fields,
+            decoder,
+        }
+    }
+
+    pub fn decode(
+        &self,
+        primary_key: &[u8],
+    ) -> Result<impl Iterator<Item = (&ColumnName, &SortField, Option<Value>)>> {
+        let values = self.decoder.decode(primary_key)?;
+
+        let iter = values
+            .into_iter()
+            .zip(&self.column_names)
+            .zip(&self.fields)
+            .map(|((value, column_name), encoder)| {
+                if value.is_null() {
+                    (column_name, encoder, None)
+                } else {
+                    (column_name, encoder, Some(value))
+                }
+            });
+
+        Ok(iter)
+    }
+}
diff --git a/src/mito2/src/sst/index/creator.rs b/src/mito2/src/sst/index/creator.rs
new file mode 100644
index 000000000000..c9aa7bc908b4
--- /dev/null
+++ b/src/mito2/src/sst/index/creator.rs
@@ -0,0 +1,218 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashMap;
+use std::num::NonZeroUsize;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use common_telemetry::logging;
+use futures::{AsyncRead, AsyncWrite};
+use index::inverted_index::create::sort::external_provider::ExternalTempFileProvider;
+use index::inverted_index::create::sort::external_sort::ExternalSorter;
+use index::inverted_index::create::sort_create::SortIndexCreator;
+use index::inverted_index::create::InvertedIndexCreator;
+use index::inverted_index::error::Result as IndexResult;
+use index::inverted_index::format::writer::InvertedIndexBlobWriter;
+use object_store::{util, ObjectStore};
+use puffin::file_format::writer::{Blob, PuffinAsyncWriter, PuffinFileWriter};
+use snafu::ResultExt;
+use store_api::metadata::RegionMetadataRef;
+use tokio::io::duplex;
+use tokio_util::compat::{TokioAsyncReadCompatExt, TokioAsyncWriteCompatExt};
+
+use crate::error::{PushIndexValueSnafu, Result};
+use crate::read::Batch;
+use crate::sst::file::FileId;
+use crate::sst::index::codec::{IndexValueCodec, IndexValuesCodec};
+use crate::sst::location;
+
+type ByteCount = usize;
+type RowCount = usize;
+
+pub struct SstIndexCreator {
+    region_dir: String,
+    sst_file_id: FileId,
+    object_store: ObjectStore,
+
+    codec: IndexValuesCodec,
+    index_creator: Box<dyn InvertedIndexCreator>,
+
+    temp_file_provider: Arc<TempFileProvider>,
+    value_buf: Vec<u8>,
+
+    row_count: RowCount,
+}
+
+impl SstIndexCreator {
+    pub fn new(
+        region_dir: String,
+        sst_file_id: FileId,
+        metadata: &RegionMetadataRef,
+        object_store: ObjectStore,
+        memory_usage_threshold: Option<usize>,
+        row_group_size: NonZeroUsize,
+    ) -> Self {
+        let temp_file_provider = Arc::new(TempFileProvider {
+            temp_file_dir: location::index_creation_temp_dir(&region_dir, &sst_file_id),
+            object_store: object_store.clone(),
+        });
+        let memory_usage_threshold = memory_usage_threshold
+            .map(|threshold| (threshold / metadata.primary_key.len()).max(1024));
+        let sorter =
+            ExternalSorter::factory(temp_file_provider.clone() as _, memory_usage_threshold);
+        let index_creator = Box::new(SortIndexCreator::new(sorter, row_group_size));
+
+        let codec = IndexValuesCodec::from_tag_columns(metadata.primary_key_columns());
+        Self {
+            region_dir,
+            sst_file_id,
+            object_store,
+            codec,
+            index_creator,
+            temp_file_provider,
+            value_buf: vec![],
+            row_count: 0,
+        }
+    }
+
+    pub async fn update(&mut self, batch: &Batch) -> Result<()> {
+        if batch.is_empty() {
+            return Ok(());
+        }
+
+        if let Err(err) = self.do_update(batch).await {
+            if let Err(err) = self.cleanup().await {
+                logging::warn!(
+                    "Failed to clean up index creator, region_dir: {}, sst_file_id: {}, error: {err}",
+                    self.region_dir, self.sst_file_id
+                );
+            }
+            return Err(err);
+        }
+
+        Ok(())
+    }
+
+    pub async fn finish(&mut self) -> Result<(RowCount, ByteCount)> {
+        if self.row_count == 0 {
+            return Ok((0, 0));
+        }
+
+        let res = self.do_finish().await;
+
+        if let Err(err) = self.cleanup().await {
+            logging::warn!(
+                "Failed to clean up index creator, region_dir: {}, sst_file_id: {}, error: {err}",
+                self.region_dir,
+                self.sst_file_id
+            );
+        }
+
+        res.map(|bytes| (self.row_count, bytes))
+    }
+
+    async fn do_update(&mut self, batch: &Batch) -> Result<()> {
+        let n = batch.num_rows();
+        self.row_count += n;
+        for (column_name, field, value) in self.codec.decode(batch.primary_key())? {
+            if let Some(value) = value.as_ref() {
+                self.value_buf.clear();
+                IndexValueCodec::encode_value(value.as_value_ref(), field, &mut self.value_buf)?;
+            }
+
+            let v = value.is_some().then(|| self.value_buf.as_slice());
+            self.index_creator
+                .push_with_name_n(&column_name, v, n)
+                .await
+                .context(PushIndexValueSnafu)?;
+        }
+
+        Ok(())
+    }
+
+    async fn do_finish(&mut self) -> Result<ByteCount> {
+        let file_path = location::index_file_path(&self.region_dir, &self.sst_file_id);
+        let writer = self.object_store.writer(&file_path).await.unwrap();
+        let mut puffin_writer = PuffinFileWriter::new(writer);
+
+        let (tx, rx) = duplex(8 * 1024);
+
+        let blob = Blob {
+            blob_type: "greptime-inverted-index-v1".to_string(),
+            data: rx.compat(),
+            properties: HashMap::default(),
+        };
+
+        let mut index_writer = InvertedIndexBlobWriter::new(tx.compat_write());
+        let (source, sink) = futures::join!(
+            self.index_creator.finish(&mut index_writer),
+            puffin_writer.add_blob(blob)
+        );
+
+        source.unwrap();
+        sink.unwrap();
+
+        Ok(puffin_writer.finish().await.unwrap())
+    }
+
+    async fn cleanup(&mut self) -> Result<()> {
+        self.temp_file_provider.cleanup().await
+    }
+}
+
+struct TempFileProvider {
+    temp_file_dir: String,
+    object_store: ObjectStore,
+}
+
+#[async_trait]
+impl ExternalTempFileProvider for TempFileProvider {
+    async fn create(
+        &self,
+        index_name: &str,
+        tmp_file_id: &str,
+    ) -> IndexResult<Box<dyn AsyncWrite + Unpin + Send>> {
+        let child = format!("{index_name}/{tmp_file_id}.im");
+        let path = util::join_path(&self.temp_file_dir, &child);
+        Ok(Box::new(self.object_store.writer(&path).await.unwrap()))
+    }
+
+    async fn read_all(
+        &self,
+        index_name: &str,
+    ) -> IndexResult<Vec<Box<dyn AsyncRead + Unpin + Send>>> {
+        let dir = util::join_dir(&self.temp_file_dir, index_name);
+
+        let entries = self.object_store.list(&dir).await.unwrap();
+        let mut readers = Vec::with_capacity(entries.len());
+
+        for entry in entries {
+            let path = entry.path();
+            readers.push(Box::new(self.object_store.reader(path).await.unwrap()) as _);
+        }
+
+        Ok(readers)
+    }
+}
+
+impl TempFileProvider {
+    async fn cleanup(&self) -> Result<()> {
+        self.object_store
+            .remove_all(&self.temp_file_dir)
+            .await
+            .unwrap();
+        Ok(())
+    }
+}
diff --git a/src/mito2/src/sst/location.rs b/src/mito2/src/sst/location.rs
new file mode 100644
index 000000000000..b342fa15b283
--- /dev/null
+++ b/src/mito2/src/sst/location.rs
@@ -0,0 +1,39 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use object_store::util;
+use uuid::Uuid;
+
+use crate::sst::file::FileId;
+
+/// Returns the path of the SST file in the object store:
+/// `{region_dir}/{sst_file_id}.parquet`
+pub fn sst_file_path(region_dir: &str, sst_file_id: &FileId) -> String {
+    util::join_path(region_dir, &sst_file_id.as_parquet())
+}
+
+/// Returns the path of the index file in the object store:
+/// `{region_dir}/index/{sst_file_id}.puffin`
+pub fn index_file_path(region_dir: &str, sst_file_id: &FileId) -> String {
+    let dir = util::join_dir(region_dir, "index");
+    util::join_path(&dir, &sst_file_id.as_puffin())
+}
+
+/// Returns the path of the index creation temp directory in the object store:
+/// `{region_dir}/index/__intermediate/{sst_file_id}/{uuid}/`
+pub fn index_creation_temp_dir(region_dir: &str, sst_file_id: &FileId) -> String {
+    let uuid = Uuid::new_v4();
+    let child = format!("index/__intermediate/{sst_file_id}/{uuid}");
+    util::join_dir(region_dir, &child)
+}
diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs
index af3f8479f39c..5289dc5cb00b 100644
--- a/src/mito2/src/sst/parquet.rs
+++ b/src/mito2/src/sst/parquet.rs
@@ -97,7 +97,6 @@ mod tests {
         let mut env = TestEnv::new();
         let object_store = env.init_object_store_manager();
         let handle = sst_file_handle(0, 1000);
-        let file_path = handle.file_path(FILE_DIR);
         let metadata = Arc::new(sst_region_metadata());
         let source = new_source(&[
             new_batch_by_range(&["a", "d"], 0, 60),
@@ -110,7 +109,13 @@ mod tests {
             ..Default::default()
         };

-        let mut writer = ParquetWriter::new(file_path, metadata, source, object_store.clone());
+        let mut writer = ParquetWriter::new(
+            FILE_DIR.to_string(),
+            handle.file_id(),
+            metadata,
+            source,
+            object_store.clone(),
+        );
         let info = writer.write_all(&write_opts).await.unwrap().unwrap();
         assert_eq!(200, info.num_rows);
         assert!(info.file_size > 0);
@@ -142,7 +147,6 @@ mod tests {
         let mut env = TestEnv::new();
         let object_store = env.init_object_store_manager();
         let handle = sst_file_handle(0, 1000);
-        let file_path = handle.file_path(FILE_DIR);
         let metadata = Arc::new(sst_region_metadata());
         let source = new_source(&[
             new_batch_by_range(&["a", "d"], 0, 60),
@@ -155,8 +159,13 @@ mod tests {
             ..Default::default()
         };
         // Prepare data.
-        let mut writer =
-            ParquetWriter::new(file_path, metadata.clone(), source, object_store.clone());
+        let mut writer = ParquetWriter::new(
+            FILE_DIR.to_string(),
+            handle.file_id(),
+            metadata.clone(),
+            source,
+            object_store.clone(),
+        );
         writer.write_all(&write_opts).await.unwrap().unwrap();

         let cache = Some(Arc::new(CacheManager::new(0, 0, 64 * 1024 * 1024)));
diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs
index 45e36786d41d..bdd9b968c217 100644
--- a/src/mito2/src/sst/parquet/reader.rs
+++ b/src/mito2/src/sst/parquet/reader.rs
@@ -14,7 +14,7 @@

 //! Parquet reader.

-use std::collections::{HashSet, VecDeque};
+use std::collections::{BTreeSet, HashSet, VecDeque};
 use std::sync::Arc;
 use std::time::{Duration, Instant};

@@ -42,6 +42,7 @@ use crate::error::{
 use crate::metrics::{READ_ROWS_TOTAL, READ_STAGE_ELAPSED};
 use crate::read::{Batch, BatchReader};
 use crate::sst::file::FileHandle;
+use crate::sst::index::applier::SstIndexApplier;
 use crate::sst::parquet::format::ReadFormat;
 use crate::sst::parquet::row_group::InMemoryRowGroup;
 use crate::sst::parquet::stats::RowGroupPruningStats;
@@ -64,6 +65,8 @@ pub struct ParquetReaderBuilder {
     projection: Option<Vec<ColumnId>>,
     /// Manager that caches SST data.
cache_manager: Option<CacheManagerRef>, + + index_applier: Option<SstIndexApplier>, } impl ParquetReaderBuilder { @@ -81,6 +84,7 @@ impl ParquetReaderBuilder { time_range: None, projection: None, cache_manager: None, + index_applier: None, } } @@ -110,6 +114,11 @@ impl ParquetReaderBuilder { self } + pub fn index_applier(mut self, index_applier: Option<SstIndexApplier>) -> Self { + self.index_applier = index_applier; + self + } + /// Builds and initializes a [ParquetReader]. /// /// This needs to perform IO operation. @@ -143,19 +152,28 @@ impl ParquetReaderBuilder { }); let read_format = ReadFormat::new(Arc::new(region_meta)); + let mut row_groups: BTreeSet<_> = (0..parquet_meta.num_row_groups()).collect(); + if let Some(index_applier) = self.index_applier.as_ref() { + match index_applier.apply(self.file_handle.file_id()).await { + Ok(rgs) => row_groups = rgs, + Err(err) => debug!("Failed to apply index: {err}"), + } + } + // Prunes row groups by metadata. - let row_groups: VecDeque<_> = if let Some(predicate) = &self.predicate { + if let Some(predicate) = &self.predicate { let stats = RowGroupPruningStats::new(parquet_meta.row_groups(), &read_format, column_ids); - predicate + for (row_group, valid) in predicate .prune_with_stats(&stats, read_format.metadata().schema.arrow_schema()) .into_iter() .enumerate() - .filter_map(|(idx, valid)| if valid { Some(idx) } else { None }) - .collect() - } else { - (0..parquet_meta.num_row_groups()).collect() + { + if !valid { + row_groups.remove(&row_group); + } + } }; // Computes the projection mask. @@ -335,7 +353,7 @@ impl RowGroupReaderBuilder { /// Parquet batch reader to read our SST format. pub struct ParquetReader { /// Indices of row groups to read. - row_groups: VecDeque<usize>, + row_groups: BTreeSet<usize>, /// Helper to read record batches. /// /// Not `None` if [ParquetReader::stream] is not `None`. @@ -430,7 +448,7 @@ impl ParquetReader { } // No more items in current row group, reads next row group. - while let Some(row_group_idx) = self.row_groups.pop_front() { + while let Some(row_group_idx) = self.row_groups.pop_first() { let mut row_group_reader = self.reader_builder.build(row_group_idx).await?; let Some(record_batch) = row_group_reader diff --git a/src/mito2/src/sst/parquet/writer.rs index d776b3ac627d..d26a197558c8 100644 --- a/src/mito2/src/sst/parquet/writer.rs +++ b/src/mito2/src/sst/parquet/writer.rs @@ -14,8 +14,10 @@ //! Parquet writer. +use std::num::NonZeroUsize; + use common_datasource::file_format::parquet::BufferedWriter; -use common_telemetry::debug; +use common_telemetry::{debug, warn}; use common_time::Timestamp; use object_store::ObjectStore; use parquet::basic::{Compression, Encoding, ZstdLevel}; @@ -28,13 +30,18 @@ use store_api::storage::consts::SEQUENCE_COLUMN_NAME; use crate::error::{InvalidMetadataSnafu, Result, WriteBufferSnafu}; use crate::read::{Batch, Source}; +use crate::sst::file::FileId; +use crate::sst::index::creator::SstIndexCreator; +use crate::sst::location; use crate::sst::parquet::format::WriteFormat; use crate::sst::parquet::{SstInfo, WriteOptions, PARQUET_METADATA_KEY}; /// Parquet SST writer. pub struct ParquetWriter { - /// SST output file path. - file_path: String, + /// Directory of the region. + region_dir: String, + /// SST file id. + file_id: FileId, /// Input data source. source: Source, /// Region metadata of the source and the target SST. @@ -45,13 +52,15 @@ pub struct ParquetWriter { impl ParquetWriter { /// Creates a new parquet SST writer.
pub fn new( - file_path: String, + region_dir: String, + file_id: FileId, metadata: RegionMetadataRef, source: Source, object_store: ObjectStore, ) -> ParquetWriter { ParquetWriter { - file_path, + region_dir, + file_id, source, metadata, object_store, @@ -75,9 +84,10 @@ impl ParquetWriter { let props_builder = Self::customize_column_config(props_builder, &self.metadata); let writer_props = props_builder.build(); + let file_path = location::sst_file_path(&self.region_dir, &self.file_id); let write_format = WriteFormat::new(self.metadata.clone()); let mut buffered_writer = BufferedWriter::try_new( - self.file_path.clone(), + file_path.clone(), self.object_store.clone(), write_format.arrow_schema(), Some(writer_props), @@ -87,6 +97,17 @@ impl ParquetWriter { .context(WriteBufferSnafu)?; let mut stats = SourceStats::default(); + let mut index_creator = (!self.metadata.primary_key.is_empty()).then(|| { + SstIndexCreator::new( + self.region_dir.clone(), + self.file_id, + &self.metadata, + self.object_store.clone(), + Some(4 * 1024 * 1024), + NonZeroUsize::new(opts.row_group_size).unwrap(), + ) + }); + while let Some(batch) = self.source.next_batch().await? { stats.update(&batch); let arrow_batch = write_format.convert_batch(&batch)?; @@ -95,13 +116,34 @@ impl ParquetWriter { .write(&arrow_batch) .await .context(WriteBufferSnafu)?; + + if let Some(creator) = index_creator.as_mut() { + if let Err(err) = creator.update(&batch).await { + debug!("Failed to update index: {}", err); + + // Skip index creation if failed to update. + index_creator = None; + } + } + } + + if let Some(mut creator) = index_creator { + match creator.finish().await { + Ok((row_count, byte_count)) => { + debug!( + "Finished index, region_dir: {}, sst_file_id: {}, bytes: {}", + self.region_dir, self.file_id, byte_count + ); + } + Err(err) => { + warn!("Failed to finish index: {}", err); + return Ok(None); + } + } } if stats.num_rows == 0 { - debug!( - "No data written, try to stop the writer: {}", - self.file_path - ); + debug!("No data written, try to stop the writer: {file_path}"); buffered_writer.close().await.context(WriteBufferSnafu)?; return Ok(None); diff --git a/src/puffin/src/file_format/writer.rs index 95760df0fea3..24d1fb0e9f16 100644 --- a/src/puffin/src/file_format/writer.rs +++ b/src/puffin/src/file_format/writer.rs @@ -44,8 +44,8 @@ pub trait PuffinSyncWriter { /// Add a blob to the Puffin file fn add_blob(&mut self, blob: Blob) -> Result<()>; - /// Finish writing the Puffin file - fn finish(&mut self) -> Result<()>; + /// Finish writing the Puffin file, returns the number of bytes written + fn finish(&mut self) -> Result<u64>; } /// The trait for writing Puffin files asynchronously @@ -57,6 +57,6 @@ pub trait PuffinAsyncWriter { /// Add a blob to the Puffin file async fn add_blob(&mut self, blob: Blob) -> Result<()>; - /// Finish writing the Puffin file - async fn finish(&mut self) -> Result<()>; + /// Finish writing the Puffin file, returns the number of bytes written + async fn finish(&mut self) -> Result<u64>; } diff --git a/src/puffin/src/file_format/writer/file.rs index 3f65b9c89d1c..6251d15c9d31 100644 --- a/src/puffin/src/file_format/writer/file.rs +++ b/src/puffin/src/file_format/writer/file.rs @@ -36,8 +36,8 @@ pub struct PuffinFileWriter { /// The metadata of the blobs blob_metadata: Vec<BlobMetadata>, - /// The offset of the next blob - next_blob_offset: u64, + /// The number of bytes written + written_bytes: u64, } impl PuffinFileWriter { @@
-46,7 +46,7 @@ impl PuffinFileWriter { writer, properties: HashMap::new(), blob_metadata: Vec::new(), - next_blob_offset: 0, + written_bytes: 0, } } @@ -59,7 +59,7 @@ impl PuffinFileWriter { BlobMetadataBuilder::default() .blob_type(typ) .properties(properties) - .offset(self.next_blob_offset as _) + .offset(self.written_bytes as _) .length(size as _) .build() .expect("Required fields are not set") @@ -79,14 +79,15 @@ impl PuffinSyncWriter for PuffinFileWriter { let blob_metadata = self.create_blob_metadata(blob.blob_type, blob.properties, size); self.blob_metadata.push(blob_metadata); - self.next_blob_offset += size; + self.written_bytes += size; Ok(()) } - fn finish(&mut self) -> Result<()> { + fn finish(&mut self) -> Result<u64> { self.write_header_if_needed_sync()?; self.write_footer_sync()?; - self.writer.flush().context(FlushSnafu) + self.writer.flush().context(FlushSnafu)?; + Ok(self.written_bytes as _) } } @@ -106,23 +107,24 @@ impl PuffinAsyncWriter for PuffinFileWriter { let blob_metadata = self.create_blob_metadata(blob.blob_type, blob.properties, size); self.blob_metadata.push(blob_metadata); - self.next_blob_offset += size; + self.written_bytes += size; Ok(()) } - async fn finish(&mut self) -> Result<()> { + async fn finish(&mut self) -> Result<u64> { self.write_header_if_needed_async().await?; self.write_footer_async().await?; self.writer.flush().await.context(FlushSnafu)?; - self.writer.close().await.context(CloseSnafu) + self.writer.close().await.context(CloseSnafu)?; + Ok(self.written_bytes as _) } } impl PuffinFileWriter { fn write_header_if_needed_sync(&mut self) -> Result<()> { - if self.next_blob_offset == 0 { + if self.written_bytes == 0 { self.writer.write_all(&MAGIC).context(WriteSnafu)?; - self.next_blob_offset += MAGIC.len() as u64; + self.written_bytes += MAGIC.len() as u64; } Ok(()) } @@ -134,15 +136,17 @@ impl PuffinFileWriter { ) .into_footer_bytes()?; - self.writer.write_all(&bytes).context(WriteSnafu) + self.writer.write_all(&bytes).context(WriteSnafu)?; + self.written_bytes += bytes.len() as u64; + Ok(()) } } impl PuffinFileWriter { async fn write_header_if_needed_async(&mut self) -> Result<()> { - if self.next_blob_offset == 0 { + if self.written_bytes == 0 { self.writer.write_all(&MAGIC).await.context(WriteSnafu)?; - self.next_blob_offset += MAGIC.len() as u64; + self.written_bytes += MAGIC.len() as u64; } Ok(()) } @@ -154,6 +158,8 @@ impl PuffinFileWriter { ) .into_footer_bytes()?; - self.writer.write_all(&bytes).await.context(WriteSnafu) + self.writer.write_all(&bytes).await.context(WriteSnafu)?; + self.written_bytes += bytes.len() as u64; + Ok(()) } } diff --git a/src/store-api/src/metadata.rs index 1ab1feaf2def..cd4e0463ec56 100644 --- a/src/store-api/src/metadata.rs +++ b/src/store-api/src/metadata.rs @@ -95,6 +95,10 @@ impl ColumnMetadata { column_id, }) } + + pub fn is_tag(&self) -> bool { + self.semantic_type == SemanticType::Tag + } } #[cfg_attr(doc, aquamarine::aquamarine)] From a97fad3f2a9c6edff5ea063befad9711f8290658 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Thu, 28 Dec 2023 03:50:26 +0000 Subject: [PATCH 02/27] index integration Signed-off-by: Zhenchi --- Cargo.lock | 1 - src/index/src/inverted_index/error.rs | 10 +- src/mito2/Cargo.toml | 1 - src/mito2/src/access_layer.rs | 3 +- src/mito2/src/read/scan_region.rs | 2 - src/mito2/src/read/seq_scan.rs | 1 - src/mito2/src/sst/index.rs | 7 ++ src/mito2/src/sst/index/applier.rs | 7 +- src/mito2/src/sst/index/applier/builder.rs | 3 +- src/mito2/src/sst/index/codec.rs | 8 --
src/mito2/src/sst/index/creator.rs | 105 +++++++++++++-------- src/mito2/src/sst/location.rs | 33 +++++-- src/mito2/src/sst/parquet.rs | 1 - src/mito2/src/sst/parquet/reader.rs | 90 ++++++++++-------- src/mito2/src/sst/parquet/writer.rs | 15 +-- 15 files changed, 177 insertions(+), 110 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b70b7853da9..3fff439cb99e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4986,7 +4986,6 @@ dependencies = [ "object-store", "parquet", "paste", - "pin-project", "prometheus", "prost 0.12.3", "puffin", diff --git a/src/index/src/inverted_index/error.rs b/src/index/src/inverted_index/error.rs index 230d0569af05..50f24a1b96ea 100644 --- a/src/index/src/inverted_index/error.rs +++ b/src/index/src/inverted_index/error.rs @@ -15,7 +15,7 @@ use std::any::Any; use std::io::Error as IoError; -use common_error::ext::ErrorExt; +use common_error::ext::{BoxedError, ErrorExt}; use common_error::status_code::StatusCode; use common_macro::stack_trace_debug; use snafu::{Location, Snafu}; @@ -167,6 +167,12 @@ pub enum Error { total_row_count: usize, expected_row_count: usize, }, + + #[snafu(display("External error"))] + External { + source: BoxedError, + location: Location, + }, } impl ErrorExt for Error { @@ -197,6 +203,8 @@ impl ErrorExt for Error { | FstInsert { .. } | InconsistentRowCount { .. } | IndexNotFound { .. } => StatusCode::InvalidArguments, + + External { source, .. } => source.status_code(), } } diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index 5c185bff8ffb..db8a1eb97d3a 100644 --- a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -63,7 +63,6 @@ tokio-stream.workspace = true tokio-util.workspace = true tokio.workspace = true uuid.workspace = true -pin-project.workspace = true puffin.workspace = true [dev-dependencies] diff --git a/src/mito2/src/access_layer.rs b/src/mito2/src/access_layer.rs index 7ef6909ad9b1..9f62e2bcda28 100644 --- a/src/mito2/src/access_layer.rs +++ b/src/mito2/src/access_layer.rs @@ -14,7 +14,7 @@ use std::sync::Arc; -use object_store::{ErrorKind, ObjectStore}; +use object_store::ObjectStore; use snafu::ResultExt; use store_api::metadata::RegionMetadataRef; @@ -73,6 +73,7 @@ impl AccessLayer { .delete(&index_path) .await .context(OpenDalSnafu) + // ignore error if index file not found for compatibility .or_else(|e| e.is_object_not_found().then_some(()).ok_or(e)) } diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index d1e74639175d..25f1866bae6f 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -14,12 +14,10 @@ //! Scans a region according to the scan request. 
-use std::sync::Arc; use common_recordbatch::SendableRecordBatchStream; use common_telemetry::{debug, logging}; use common_time::range::TimestampRange; -use index::inverted_index::search::index_apply::IndexApplier; use store_api::storage::ScanRequest; use table::predicate::{Predicate, TimeRangePredicateBuilder}; diff --git a/src/mito2/src/read/seq_scan.rs index b22f66c3382a..3ad7e21155b5 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -23,7 +23,6 @@ use common_recordbatch::error::ExternalSnafu; use common_recordbatch::{RecordBatch, RecordBatchStreamWrapper, SendableRecordBatchStream}; use common_telemetry::{debug, error}; use common_time::range::TimestampRange; -use index::inverted_index::search::index_apply::IndexApplier; use snafu::ResultExt; use table::predicate::Predicate; use tokio::sync::{mpsc, Semaphore}; diff --git a/src/mito2/src/sst/index.rs index c26e3aee1dae..1a0365309b85 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -15,3 +15,10 @@ pub mod applier; mod codec; pub mod creator; + +const INDEX_BLOB_TYPE: &str = "greptime-inverted-index-v1"; + +// TODO(zhongzc): how to determine this value? +const MIN_MEMORY_USAGE_THRESHOLD: usize = 8192; + +const PIPE_BUFFER_SIZE_FOR_SENDING_BLOB: usize = 8192; diff --git a/src/mito2/src/sst/index/applier.rs index 9a5d72b6eb7d..38c5f7450166 100644 --- a/src/mito2/src/sst/index/applier.rs +++ b/src/mito2/src/sst/index/applier.rs @@ -25,6 +25,7 @@ use object_store::ObjectStore; use puffin::file_format::reader::{PuffinAsyncReader, PuffinFileReader}; use snafu::ResultExt; +use crate::sst::index::INDEX_BLOB_TYPE; use crate::error::{OpenDalSnafu, Result}; use crate::sst::file::FileId; use crate::sst::location; @@ -51,11 +52,11 @@ impl SstIndexApplier { } pub async fn apply(&self, file_id: FileId) -> Result<BTreeSet<usize>> { - let file_name = location::index_file_path(&self.region_dir, &file_id); + let file_path = location::index_file_path(&self.region_dir, &file_id); let file_reader = self .object_store - .reader(&file_name) + .reader(&file_path) .await .context(OpenDalSnafu)?; let mut puffin_reader = PuffinFileReader::new(file_reader); @@ -64,7 +65,7 @@ impl SstIndexApplier { let blob_meta = file_meta .blobs .iter() - .find(|blob| blob.blob_type == "greptime-inverted-index-v1".to_string()) + .find(|blob| blob.blob_type == INDEX_BLOB_TYPE.to_string()) .unwrap(); let blob_reader = puffin_reader.blob_reader(blob_meta).unwrap(); diff --git a/src/mito2/src/sst/index/applier/builder.rs index 4df9e6c43620..4118c793f041 100644 --- a/src/mito2/src/sst/index/applier/builder.rs +++ b/src/mito2/src/sst/index/applier/builder.rs @@ -25,13 +25,12 @@ use common_query::logical_plan::Expr; use datafusion_common::ScalarValue; use datafusion_expr::{BinaryExpr, Expr as DfExpr, Operator}; use datatypes::data_type::ConcreteDataType; -use datatypes::schema::Schema; use datatypes::value::Value; use index::inverted_index::search::index_apply::PredicatesIndexApplier; use index::inverted_index::search::predicate::Predicate; use object_store::ObjectStore; use snafu::{OptionExt, ResultExt}; -use store_api::metadata::{ColumnMetadata, RegionMetadata}; +use store_api::metadata::RegionMetadata; use crate::error::{BuildIndexApplierSnafu, ColumnNotFoundSnafu, Result}; use crate::row_converter::SortField; diff --git a/src/mito2/src/sst/index/codec.rs index
71526f76cdbd..ea20d581fe68 100644 --- a/src/mito2/src/sst/index/codec.rs +++ b/src/mito2/src/sst/index/codec.rs @@ -12,15 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::iter; -use std::pin::{pin, Pin}; - -use datatypes::data_type::ConcreteDataType; use datatypes::value::{Value, ValueRef}; -use futures::Future; -use index::inverted_index::BytesRef; use memcomparable::Serializer; -use pin_project::pin_project; use store_api::metadata::ColumnMetadata; use crate::error::Result; @@ -38,7 +31,6 @@ impl IndexValueCodec { } } -#[pin_project] pub struct IndexValuesCodec { column_names: Vec<String>, fields: Vec<SortField>, diff --git a/src/mito2/src/sst/index/creator.rs index c9aa7bc908b4..36dd5ab62421 100644 --- a/src/mito2/src/sst/index/creator.rs +++ b/src/mito2/src/sst/index/creator.rs @@ -17,26 +17,31 @@ use std::num::NonZeroUsize; use std::sync::Arc; use async_trait::async_trait; -use common_telemetry::logging; +use common_error::ext::BoxedError; +use common_telemetry::warn; use futures::{AsyncRead, AsyncWrite}; use index::inverted_index::create::sort::external_provider::ExternalTempFileProvider; use index::inverted_index::create::sort::external_sort::ExternalSorter; use index::inverted_index::create::sort_create::SortIndexCreator; use index::inverted_index::create::InvertedIndexCreator; +use index::inverted_index::error as index_error; use index::inverted_index::error::Result as IndexResult; use index::inverted_index::format::writer::InvertedIndexBlobWriter; -use object_store::{util, ObjectStore}; +use object_store::ObjectStore; use puffin::file_format::writer::{Blob, PuffinAsyncWriter, PuffinFileWriter}; use snafu::ResultExt; use store_api::metadata::RegionMetadataRef; use tokio::io::duplex; use tokio_util::compat::{TokioAsyncReadCompatExt, TokioAsyncWriteCompatExt}; -use crate::error::{PushIndexValueSnafu, Result}; +use crate::error::{OpenDalSnafu, PushIndexValueSnafu, Result}; use crate::read::Batch; use crate::sst::file::FileId; use crate::sst::index::codec::{IndexValueCodec, IndexValuesCodec}; -use crate::sst::location; +use crate::sst::index::{ + INDEX_BLOB_TYPE, MIN_MEMORY_USAGE_THRESHOLD, PIPE_BUFFER_SIZE_FOR_SENDING_BLOB, +}; +use crate::sst::location::{self, IntermediateLocation}; type ByteCount = usize; type RowCount = usize; @@ -65,11 +70,12 @@ impl SstIndexCreator { row_group_size: NonZeroUsize, ) -> Self { let temp_file_provider = Arc::new(TempFileProvider { - temp_file_dir: location::index_creation_temp_dir(&region_dir, &sst_file_id), + location: IntermediateLocation::new(&region_dir, &sst_file_id), object_store: object_store.clone(), }); - let memory_usage_threshold = memory_usage_threshold - .map(|threshold| (threshold / metadata.primary_key.len()).max(1024)); + let memory_usage_threshold = memory_usage_threshold.map(|threshold| { + (threshold / metadata.primary_key.len()).max(MIN_MEMORY_USAGE_THRESHOLD) + }); let sorter = ExternalSorter::factory(temp_file_provider.clone() as _, memory_usage_threshold); let index_creator = Box::new(SortIndexCreator::new(sorter, row_group_size)); @@ -93,11 +99,11 @@ impl SstIndexCreator { } if let Err(err) = self.do_update(batch).await { - if let Err(err) = self.cleanup().await { - logging::warn!( - "Failed to clean up index creator, region_dir: {}, sst_file_id: {}, error: {err}", - self.region_dir, self.sst_file_id - ); + // clean up garbage if failed to update + if let Err(err) = self.do_cleanup().await { + let region_dir = &self.region_dir; + let
sst_file_id = &self.sst_file_id; + warn!("Failed to clean up index creator, region_dir: {region_dir}, sst_file_id: {sst_file_id}, error: {err}"); } return Err(err); } @@ -107,20 +113,21 @@ impl SstIndexCreator { pub async fn finish(&mut self) -> Result<(RowCount, ByteCount)> { if self.row_count == 0 { + // Everything is clean, no IO is performed. return Ok((0, 0)); } - let res = self.do_finish().await; + let finish_res = self.do_finish().await; + // clean up garbage no matter finish success or not + let cleanup_res = self.do_cleanup().await; - if let Err(err) = self.cleanup().await { - logging::warn!( - "Failed to clean up index creator, region_dir: {}, sst_file_id: {}, error: {err}", - self.region_dir, - self.sst_file_id - ); + if let Err(err) = cleanup_res { + let region_dir = &self.region_dir; + let sst_file_id = &self.sst_file_id; + warn!("Failed to clean up index creator, region_dir: {region_dir}, sst_file_id: {sst_file_id}, error: {err}"); } - res.map(|bytes| (self.row_count, bytes)) + finish_res.map(|bytes| (self.row_count, bytes)) } async fn do_update(&mut self, batch: &Batch) -> Result<()> { @@ -144,13 +151,17 @@ impl SstIndexCreator { async fn do_finish(&mut self) -> Result<ByteCount> { let file_path = location::index_file_path(&self.region_dir, &self.sst_file_id); - let writer = self.object_store.writer(&file_path).await.unwrap(); + let writer = self + .object_store + .writer(&file_path) + .await + .context(OpenDalSnafu)?; let mut puffin_writer = PuffinFileWriter::new(writer); - let (tx, rx) = duplex(8 * 1024); + let (tx, rx) = duplex(PIPE_BUFFER_SIZE_FOR_SENDING_BLOB); let blob = Blob { - blob_type: "greptime-inverted-index-v1".to_string(), + blob_type: INDEX_BLOB_TYPE.to_string(), data: rx.compat(), properties: HashMap::default(), }; @@ -167,13 +178,13 @@ impl SstIndexCreator { Ok(puffin_writer.finish().await.unwrap()) } - async fn cleanup(&mut self) -> Result<()> { + async fn do_cleanup(&mut self) -> Result<()> { self.temp_file_provider.cleanup().await } } struct TempFileProvider { - temp_file_dir: String, + location: IntermediateLocation, object_store: ObjectStore, } @@ -181,26 +192,43 @@ struct TempFileProvider { impl ExternalTempFileProvider for TempFileProvider { async fn create( &self, - index_name: &str, - tmp_file_id: &str, + column_name: &str, + file_id: &str, ) -> IndexResult<Box<dyn AsyncWrite + Unpin + Send>> { - let child = format!("{index_name}/{tmp_file_id}.im"); - let path = util::join_path(&self.temp_file_dir, &child); - Ok(Box::new(self.object_store.writer(&path).await.unwrap())) + let path = self.location.file_path(column_name, file_id); + let writer = self + .object_store + .writer(&path) + .await + .context(OpenDalSnafu) + .map_err(BoxedError::new) + .context(index_error::ExternalSnafu)?; + Ok(Box::new(writer)) } async fn read_all( &self, - index_name: &str, + column_name: &str, ) -> IndexResult<Vec<Box<dyn AsyncRead + Unpin + Send>>> { - let dir = util::join_dir(&self.temp_file_dir, index_name); - - let entries = self.object_store.list(&dir).await.unwrap(); + let dir = self.location.column_dir(column_name); + let entries = self + .object_store + .list(&dir) + .await + .context(OpenDalSnafu) + .map_err(BoxedError::new) + .context(index_error::ExternalSnafu)?; let mut readers = Vec::with_capacity(entries.len()); for entry in entries { - let path = entry.path(); - readers.push(Box::new(self.object_store.reader(path).await.unwrap()) as _); + let reader = self + .object_store + .reader(entry.path()) + .await + .context(OpenDalSnafu) + .map_err(BoxedError::new) + .context(index_error::ExternalSnafu)?; + readers.push(Box::new(reader) as _); }
Ok(readers) @@ -210,9 +238,8 @@ impl ExternalTempFileProvider for TempFileProvider { impl TempFileProvider { async fn cleanup(&self) -> Result<()> { self.object_store - .remove_all(&self.temp_file_dir) + .remove_all(&self.location.root_dir()) .await - .unwrap(); - Ok(()) + .context(OpenDalSnafu) } } diff --git a/src/mito2/src/sst/location.rs b/src/mito2/src/sst/location.rs index b342fa15b283..646c3c854bf3 100644 --- a/src/mito2/src/sst/location.rs +++ b/src/mito2/src/sst/location.rs @@ -30,10 +30,31 @@ pub fn index_file_path(region_dir: &str, sst_file_id: &FileId) -> String { util::join_path(&dir, &sst_file_id.as_puffin()) } -/// Returns the path of the index creation temp directory in the object store: -/// `{region_dir}/index/__intermediate/{sst_file_id}/{uuid}/` -pub fn index_creation_temp_dir(region_dir: &str, sst_file_id: &FileId) -> String { - let uuid = Uuid::new_v4(); - let child = format!("index/__intermediate/{sst_file_id}/{uuid}"); - util::join_dir(region_dir, &child) +pub struct IntermediateLocation { + root_dir: String, +} + +impl IntermediateLocation { + /// `{region_dir}/index/__intermediate/{sst_file_id}/{uuid}/` + pub fn new(region_dir: &str, sst_file_id: &FileId) -> Self { + let uuid = Uuid::new_v4(); + let child = format!("index/__intermediate/{sst_file_id}/{uuid}"); + Self { + root_dir: util::join_dir(region_dir, &child), + } + } + + pub fn root_dir(&self) -> &str { + &self.root_dir + } + + /// `{region_dir}/index/__intermediate/{sst_file_id}/{uuid}/{column_name}/` + pub fn column_dir(&self, column_name: &str) -> String { + util::join_dir(&self.root_dir, column_name) + } + + /// `{region_dir}/index/__intermediate/{sst_file_id}/{uuid}/{column_name}/{im_file_id}.im` + pub fn file_path(&self, column_name: &str, im_file_id: &str) -> String { + util::join_path(&self.column_dir(column_name), &format!("{im_file_id}.im")) + } } diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 715b9f3b476c..5bcc8b545d23 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -215,7 +215,6 @@ mod tests { let mut env = crate::test_util::TestEnv::new(); let object_store = env.init_object_store_manager(); let handle = sst_file_handle(0, 1000); - let file_path = handle.file_path(FILE_DIR); let metadata = Arc::new(sst_region_metadata()); let source = new_source(&[ new_batch_by_range(&["a", "d"], 0, 60), diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 2d098f097df4..e8404b7112fa 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -14,7 +14,7 @@ //! Parquet reader. -use std::collections::{BTreeSet, HashSet, VecDeque}; +use std::collections::{BTreeSet, VecDeque}; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -138,44 +138,8 @@ impl ParquetReaderBuilder { // Decodes region metadata. let key_value_meta = parquet_meta.file_metadata().key_value_metadata(); let region_meta = Self::get_region_metadata(&file_path, key_value_meta)?; - // Computes column ids to read. 
-        let column_ids: HashSet<_> = self - .projection - .as_ref() - .map(|p| p.iter().cloned().collect()) - .unwrap_or_else(|| { - region_meta - .column_metadatas - .iter() - .map(|c| c.column_id) - .collect() - }); let read_format = ReadFormat::new(Arc::new(region_meta)); - let mut row_groups: BTreeSet<_> = (0..parquet_meta.num_row_groups()).collect(); - if let Some(index_applier) = self.index_applier.as_ref() { - match index_applier.apply(self.file_handle.file_id()).await { - Ok(rgs) => row_groups = rgs, - Err(err) => debug!("Failed to apply index: {err}"), - } - } - - // Prunes row groups by metadata. - if let Some(predicate) = &self.predicate { - let stats = - RowGroupPruningStats::new(parquet_meta.row_groups(), &read_format, column_ids); - - for (row_group, valid) in predicate - .prune_with_stats(&stats, read_format.metadata().schema.arrow_schema()) - .into_iter() - .enumerate() - { - if !valid { - row_groups.remove(&row_group); - } - } - }; - // Computes the projection mask. @@ -192,6 +156,9 @@ parquet_to_arrow_field_levels(parquet_schema_desc, projection_mask.clone(), hint) .context(ReadParquetSnafu { path: &file_path })?; + // Computes row groups to read. + let row_groups = self.row_groups_to_read(&read_format, &parquet_meta).await; + let reader_builder = RowGroupReaderBuilder { file_handle: self.file_handle.clone(), file_path, @@ -274,6 +241,55 @@ Ok(metadata) } + + /// Computes row groups to read. + async fn row_groups_to_read( + &self, + read_format: &ReadFormat, + parquet_meta: &ParquetMetaData, + ) -> BTreeSet<usize> { + let mut row_group_ids = (0..parquet_meta.num_row_groups()).collect(); + + // Applies index to prune row groups. + if let Some(index_applier) = &self.index_applier { + match index_applier.apply(self.file_handle.file_id()).await { + Ok(row_groups) => row_group_ids = row_groups, + Err(err) => { + if !err.is_object_not_found() { + debug!("Failed to apply index: {err}"); + } + // Ignores the error since it won't affect correctness. + } + } + } + + // Prunes row groups by metadata. + if let Some(predicate) = &self.predicate { + let region_meta = read_format.metadata(); + let column_ids = match &self.projection { + Some(ids) => ids.iter().cloned().collect(), + None => region_meta + .column_metadatas + .iter() + .map(|c| c.column_id) + .collect(), + }; + + let row_groups = parquet_meta.row_groups(); + let stats = RowGroupPruningStats::new(row_groups, read_format, column_ids); + let row_groups_to_prune = predicate + .prune_with_stats(&stats, region_meta.schema.arrow_schema()) + .into_iter() + .enumerate() + .filter_map(|(id, remain)| (!remain).then_some(id)); + + for row_group_id in row_groups_to_prune { + row_group_ids.remove(&row_group_id); + } + }; + + row_group_ids + } } /// Parquet reader metrics.
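The selection logic above composes two independent filters by set intersection: the index applier proposes a set of row group ids (falling back to the full set on error), and statistics-based pruning then removes the groups whose min/max metadata rule the predicate out. A minimal, self-contained sketch of that composition, with plain `usize` ids and a boolean keep-list standing in for the real `SstIndexApplier` and `Predicate` types:

use std::collections::BTreeSet;

// Start from all row group ids, as `row_groups_to_read` does.
fn prune(
    num_row_groups: usize,
    index_hits: Option<BTreeSet<usize>>,
    stats_keep: &[bool],
) -> BTreeSet<usize> {
    // The index result replaces the full set when present; on failure the full set is kept.
    let mut ids: BTreeSet<usize> = index_hits.unwrap_or_else(|| (0..num_row_groups).collect());
    // Statistics-based pruning then removes the groups the predicate rules out.
    for (id, keep) in stats_keep.iter().enumerate() {
        if !*keep {
            ids.remove(&id);
        }
    }
    ids
}

fn main() {
    // Four row groups; the index matched {1, 2}; stats rule out group 2.
    let picked = prune(4, Some(BTreeSet::from([1, 2])), &[true, true, false, true]);
    assert_eq!(picked, BTreeSet::from([1]));
}

Because both filters only ever shrink the set, applying them in either order yields the same result; the index is consulted first simply because it can discard whole groups before any statistics are decoded.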
diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs index e983a65ff8ac..de8b6e07c300 100644 --- a/src/mito2/src/sst/parquet/writer.rs +++ b/src/mito2/src/sst/parquet/writer.rs @@ -120,24 +120,25 @@ impl ParquetWriter { if let Some(creator) = index_creator.as_mut() { if let Err(err) = creator.update(&batch).await { - debug!("Failed to update index: {}", err); + let region_id = &self.metadata.region_id; + let file_id = &self.file_id; + warn!("Failed to update index, error: {err}, region_id: {region_id}, file_id: {file_id}"); - // Skip index creation if failed to update. + // Skip index creation if error occurs. index_creator = None; } } } if let Some(mut creator) = index_creator { + let region_id = &self.metadata.region_id; + let file_id = &self.file_id; match creator.finish().await { Ok((row_count, byte_count)) => { - debug!( - "Finished index, region_dir: {}, sst_file_id: {}, bytes: {}", - self.region_dir, self.file_id, byte_count - ); + debug!("Create index successfully, region_id: {region_id}, file_id: {file_id}, bytes: {byte_count}, rows: {row_count}"); } Err(err) => { - warn!("Failed to finish index: {}", err); + warn!("Failed to create index, error: {err}, region_id: {region_id}, file_id: {file_id}"); return Ok(None); } } From ff2de5b247b0b496b9fa404ad6386d7b48b67567 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Thu, 28 Dec 2023 10:11:05 +0000 Subject: [PATCH 03/27] feat(inverted_index.integration): Add applier builder to convert Expr to Predicates (Part 1) Signed-off-by: Zhenchi --- Cargo.lock | 13 +- Cargo.toml | 3 +- src/index/src/inverted_index/error.rs | 2 +- .../search/fst_apply/intersection_apply.rs | 112 ++++---- src/mito2/Cargo.toml | 1 + src/mito2/src/error.rs | 12 + src/mito2/src/row_converter.rs | 6 +- src/mito2/src/sst.rs | 1 + src/mito2/src/sst/index.rs | 16 ++ src/mito2/src/sst/index/applier.rs | 42 +++ src/mito2/src/sst/index/applier/builder.rs | 258 ++++++++++++++++++ .../src/sst/index/applier/builder/between.rs | 171 ++++++++++++ src/mito2/src/sst/index/codec.rs | 65 +++++ 13 files changed, 651 insertions(+), 51 deletions(-) create mode 100644 src/mito2/src/sst/index.rs create mode 100644 src/mito2/src/sst/index/applier.rs create mode 100644 src/mito2/src/sst/index/applier/builder.rs create mode 100644 src/mito2/src/sst/index/applier/builder/between.rs create mode 100644 src/mito2/src/sst/index/codec.rs diff --git a/Cargo.lock b/Cargo.lock index abe0acb61213..b22b67875cc9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4029,7 +4029,7 @@ dependencies = [ "prost 0.12.3", "rand", "regex", - "regex-automata 0.1.10", + "regex-automata 0.2.0", "snafu", "tokio", "tokio-util", @@ -4977,6 +4977,7 @@ dependencies = [ "datatypes", "futures", "humantime-serde", + "index", "lazy_static", "log-store", "memcomparable", @@ -7134,8 +7135,18 @@ name = "regex-automata" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", +] + +[[package]] +name = "regex-automata" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9368763f5a9b804326f3af749e16f9abf378d227bcdee7634b13d8f17793782" dependencies = [ "fst", + "memchr", "regex-syntax 0.6.29", ] diff --git a/Cargo.toml b/Cargo.toml index 0e38d914eccb..a3413aa9d48d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -111,7 +111,7 @@ prost = "0.12" raft-engine = { git = "https://github.com/tikv/raft-engine.git", rev = 
"22dfb426cd994602b57725ef080287d3e53db479" } rand = "0.8" regex = "1.8" -regex-automata = { version = "0.1", features = ["transducer"] } +regex-automata = { version = "0.2", features = ["transducer"] } reqwest = { version = "0.11", default-features = false, features = [ "json", "rustls-tls-native-roots", @@ -169,6 +169,7 @@ datanode = { path = "src/datanode" } datatypes = { path = "src/datatypes" } file-engine = { path = "src/file-engine" } frontend = { path = "src/frontend" } +index = { path = "src/index" } log-store = { path = "src/log-store" } meta-client = { path = "src/meta-client" } meta-srv = { path = "src/meta-srv" } diff --git a/src/index/src/inverted_index/error.rs b/src/index/src/inverted_index/error.rs index b795e33003b7..6e5f39006eb9 100644 --- a/src/index/src/inverted_index/error.rs +++ b/src/index/src/inverted_index/error.rs @@ -113,7 +113,7 @@ pub enum Error { #[snafu(display("Failed to parse regex DFA"))] ParseDFA { #[snafu(source)] - error: regex_automata::Error, + error: Box, location: Location, }, diff --git a/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs b/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs index a0ae0d7b9afb..a608acd0bab5 100644 --- a/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs +++ b/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs @@ -14,7 +14,7 @@ use fst::map::OpBuilder; use fst::{IntoStreamer, Streamer}; -use regex_automata::DenseDFA; +use regex_automata::dfa::dense::DFA; use snafu::{ensure, ResultExt}; use crate::inverted_index::error::{ @@ -24,15 +24,13 @@ use crate::inverted_index::search::fst_apply::FstApplier; use crate::inverted_index::search::predicate::{Predicate, Range}; use crate::inverted_index::FstMap; -type Dfa = DenseDFA, usize>; - /// `IntersectionFstApplier` applies intersection operations on an FstMap using specified ranges and regex patterns. pub struct IntersectionFstApplier { /// A list of `Range` which define inclusive or exclusive ranges for keys to be queried in the FstMap. ranges: Vec, /// A list of `Dfa` compiled from regular expression patterns. - dfas: Vec, + dfas: Vec>>, } impl FstApplier for IntersectionFstApplier { @@ -88,8 +86,8 @@ impl IntersectionFstApplier { match predicate { Predicate::Range(range) => ranges.push(range.range), Predicate::RegexMatch(regex) => { - let dfa = DenseDFA::new(®ex.pattern); - let dfa = dfa.context(ParseDFASnafu)?; + let dfa = DFA::new(®ex.pattern); + let dfa = dfa.map_err(Box::new).context(ParseDFASnafu)?; dfas.push(dfa); } // Rejection of `InList` predicates is enforced here. 
@@ -210,47 +208,67 @@ mod tests { #[test] fn test_intersection_fst_applier_with_valid_pattern() { - let test_fst = FstMap::from_iter([("aa", 1), ("bb", 2), ("cc", 3)]).unwrap(); - - let applier = create_applier_from_pattern("a.?").unwrap(); - let results = applier.apply(&test_fst); - assert_eq!(results, vec![1]); - - let applier = create_applier_from_pattern("b.?").unwrap(); - let results = applier.apply(&test_fst); - assert_eq!(results, vec![2]); - - let applier = create_applier_from_pattern("c.?").unwrap(); - let results = applier.apply(&test_fst); - assert_eq!(results, vec![3]); - - let applier = create_applier_from_pattern("a.*").unwrap(); - let results = applier.apply(&test_fst); - assert_eq!(results, vec![1]); - - let applier = create_applier_from_pattern("b.*").unwrap(); - let results = applier.apply(&test_fst); - assert_eq!(results, vec![2]); - - let applier = create_applier_from_pattern("c.*").unwrap(); - let results = applier.apply(&test_fst); - assert_eq!(results, vec![3]); - - let applier = create_applier_from_pattern("d.?").unwrap(); - let results = applier.apply(&test_fst); - assert!(results.is_empty()); - - let applier = create_applier_from_pattern("a.?|b.?").unwrap(); - let results = applier.apply(&test_fst); - assert_eq!(results, vec![1, 2]); - - let applier = create_applier_from_pattern("d.?|a.?").unwrap(); - let results = applier.apply(&test_fst); - assert_eq!(results, vec![1]); - - let applier = create_applier_from_pattern(".*").unwrap(); - let results = applier.apply(&test_fst); - assert_eq!(results, vec![1, 2, 3]); + let test_fst = FstMap::from_iter([("123", 1), ("abc", 2)]).unwrap(); + + let cases = vec![ + ("1", vec![1]), + ("2", vec![1]), + ("3", vec![1]), + ("^1", vec![1]), + ("^2", vec![]), + ("^3", vec![]), + ("^1.*", vec![1]), + ("^.*2", vec![1]), + ("^.*3", vec![1]), + ("1$", vec![]), + ("2$", vec![]), + ("3$", vec![1]), + ("1.*$", vec![1]), + ("2.*$", vec![1]), + ("3.*$", vec![1]), + ("^1..$", vec![1]), + ("^.2.$", vec![1]), + ("^..3$", vec![1]), + ("^[0-9]", vec![1]), + ("^[0-9]+$", vec![1]), + ("^[0-9][0-9]$", vec![]), + ("^[0-9][0-9][0-9]$", vec![1]), + ("^123$", vec![1]), + ("a", vec![2]), + ("b", vec![2]), + ("c", vec![2]), + ("^a", vec![2]), + ("^b", vec![]), + ("^c", vec![]), + ("^a.*", vec![2]), + ("^.*b", vec![2]), + ("^.*c", vec![2]), + ("a$", vec![]), + ("b$", vec![]), + ("c$", vec![2]), + ("a.*$", vec![2]), + ("b.*$", vec![2]), + ("c.*$", vec![2]), + ("^.[a-z]", vec![2]), + ("^abc$", vec![2]), + ("^ab$", vec![]), + ("abc$", vec![2]), + ("^a.c$", vec![2]), + ("^..c$", vec![2]), + ("ab", vec![2]), + (".*", vec![1, 2]), + ("", vec![1, 2]), + ("^$", vec![]), + ("1|a", vec![1, 2]), + ("^123$|^abc$", vec![1, 2]), + ("^123$|d", vec![1]), + ]; + + for (pattern, expected) in cases { + let applier = create_applier_from_pattern(pattern).unwrap(); + let results = applier.apply(&test_fst); + assert_eq!(results, expected); + } } #[test] diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index 8c3ef50ec2c7..a28e4f0426ea 100644 --- a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -39,6 +39,7 @@ datafusion.workspace = true datatypes.workspace = true futures.workspace = true humantime-serde.workspace = true +index.workspace = true lazy_static = "1.4" log-store = { workspace = true, optional = true } memcomparable = "0.2" diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index 39457281d76b..a54956442596 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -423,6 +423,16 @@ pub enum Error { #[snafu(source)] error: 
parquet::errors::ParquetError, }, + + #[snafu(display("Column not found, column: {column}"))] + ColumnNotFound { column: String, location: Location }, + + #[snafu(display("Failed to build index applier"))] + BuildIndexApplier { + #[snafu(source)] + error: index::inverted_index::error::Error, + location: Location, + }, } pub type Result<T, E = Error> = std::result::Result<T, E>; @@ -468,6 +478,8 @@ impl ErrorExt for Error { | InvalidRequest { .. } | FillDefault { .. } | ConvertColumnDataType { .. } + | ColumnNotFound { .. } + | BuildIndexApplier { .. } | InvalidMetadata { .. } => StatusCode::InvalidArguments, RegionMetadataNotFound { .. } | Join { .. } diff --git a/src/mito2/src/row_converter.rs index 4cc6fd3274ac..33ef05433521 100644 --- a/src/mito2/src/row_converter.rs +++ b/src/mito2/src/row_converter.rs @@ -84,7 +84,11 @@ impl SortField { } impl SortField { - fn serialize(&self, serializer: &mut Serializer<&mut Vec<u8>>, value: &ValueRef) -> Result<()> { + pub(crate) fn serialize( + &self, + serializer: &mut Serializer<&mut Vec<u8>>, + value: &ValueRef, + ) -> Result<()> { macro_rules! cast_value_and_serialize { ( $self: ident; diff --git a/src/mito2/src/sst.rs index 32c7b4951a55..55939c2d246a 100644 --- a/src/mito2/src/sst.rs +++ b/src/mito2/src/sst.rs @@ -16,5 +16,6 @@ pub mod file; pub mod file_purger; +mod index; pub mod parquet; pub(crate) mod version; diff --git a/src/mito2/src/sst/index.rs new file mode 100644 index 000000000000..34ccd1d0f71f --- /dev/null +++ b/src/mito2/src/sst/index.rs @@ -0,0 +1,16 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub mod applier; +mod codec; diff --git a/src/mito2/src/sst/index/applier.rs new file mode 100644 index 000000000000..d4c9350caa4f --- /dev/null +++ b/src/mito2/src/sst/index/applier.rs @@ -0,0 +1,42 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +pub mod builder; + +use std::sync::Arc; + +use index::inverted_index::search::index_apply::IndexApplier; +use object_store::ObjectStore; + +#[allow(dead_code)] +#[derive(Clone)] +pub struct SstIndexApplier { + region_dir: String, + object_store: ObjectStore, + index_applier: Arc<dyn IndexApplier>, +} + +impl SstIndexApplier { + pub fn new( + region_dir: String, + object_store: ObjectStore, + index_applier: Arc<dyn IndexApplier>, + ) -> Self { + Self { + region_dir, + object_store, + index_applier, + } + } +} diff --git a/src/mito2/src/sst/index/applier/builder.rs new file mode 100644 index 000000000000..5034c757caee --- /dev/null +++ b/src/mito2/src/sst/index/applier/builder.rs @@ -0,0 +1,258 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod between; + +// TODO(zhongzc): This PR is too large. The following modules are comming soon. + +// mod comparison; +// mod eq_list; +// mod in_list; +// mod regex_match; + +use std::collections::HashMap; +use std::sync::Arc; + +use api::v1::SemanticType; +use common_query::logical_plan::Expr; +use datafusion_common::ScalarValue; +use datafusion_expr::Expr as DfExpr; +use datatypes::data_type::ConcreteDataType; +use datatypes::value::Value; +use index::inverted_index::search::index_apply::PredicatesIndexApplier; +use index::inverted_index::search::predicate::Predicate; +use object_store::ObjectStore; +use snafu::{OptionExt, ResultExt}; +use store_api::metadata::RegionMetadata; + +use crate::error::{BuildIndexApplierSnafu, ColumnNotFoundSnafu, Result}; +use crate::row_converter::SortField; +use crate::sst::index::applier::SstIndexApplier; +use crate::sst::index::codec::IndexValueCodec; + +type ColumnName = String; + +/// Constructs an [`SstIndexApplier`] which applies predicates to SST files during scan. +pub struct SstIndexApplierBuilder<'a> { + /// Directory of the region, required argument for constructing [`SstIndexApplier`]. + region_dir: String, + + /// Object store, required argument for constructing [`SstIndexApplier`]. + object_store: ObjectStore, + + /// Metadata of the region, used to get metadata like column type. + metadata: &'a RegionMetadata, + + /// Stores predicates during traversal on the Expr tree. + output: HashMap<ColumnName, Vec<Predicate>>, +} + +impl<'a> SstIndexApplierBuilder<'a> { + /// Creates a new [`SstIndexApplierBuilder`]. + #[allow(dead_code)] + pub fn new( + region_dir: String, + object_store: ObjectStore, + metadata: &'a RegionMetadata, + ) -> Self { + Self { + region_dir, + object_store, + metadata, + output: HashMap::default(), + } + } + + /// Consumes the builder to construct an [`SstIndexApplier`], optionally returned based on + /// the expressions provided. If no predicates match, returns `None`.
+ #[allow(dead_code)] + pub fn build(mut self, exprs: &[Expr]) -> Result<Option<SstIndexApplier>> { + for expr in exprs { + self.traverse_and_collect(expr.df_expr())?; + } + + if self.output.is_empty() { + return Ok(None); + } + + let predicates = self.output.into_iter().collect(); + let applier = PredicatesIndexApplier::try_from(predicates); + Ok(Some(SstIndexApplier::new( + self.region_dir, + self.object_store, + Arc::new(applier.context(BuildIndexApplierSnafu)?), + ))) + } + + /// Recursively traverses expressions to collect predicates. + /// Results are stored in `self.output`. + fn traverse_and_collect(&mut self, expr: &DfExpr) -> Result<()> { + match expr { + DfExpr::Between(between) => self.collect_between(between), + + // TODO(zhongzc): This PR is too large. The following arms are comming soon. + + // DfExpr::InList(in_list) => self.collect_inlist(in_list), + // DfExpr::BinaryExpr(BinaryExpr { left, op, right }) => match op { + // Operator::And => { + // self.traverse_and_collect(left)?; + // self.traverse_and_collect(right) + // } + // Operator::Or => self.collect_or_eq_list(left, right), + // Operator::Eq => self.collect_eq(left, right), + // Operator::Lt | Operator::LtEq | Operator::Gt | Operator::GtEq => { + // self.collect_comparison_expr(left, op, right) + // } + // Operator::RegexMatch => self.collect_regex_match(left, right), + // _ => Ok(()), + // }, + + // TODO(zhongzc): support more expressions, e.g. IsNull, IsNotNull, ... + _ => Ok(()), + } + } + + /// Helper function to add a predicate to the output. + fn add_predicate(&mut self, column_name: &str, predicate: Predicate) { + match self.output.get_mut(column_name) { + Some(predicates) => predicates.push(predicate), + None => { + self.output.insert(column_name.to_string(), vec![predicate]); + } + } + } + + /// Helper function to get the column type of a tag column. + /// Returns `None` if the column is not a tag column. + fn tag_column_type(&self, column_name: &str) -> Result<Option<ConcreteDataType>> { + let column = self + .metadata + .column_by_name(column_name) + .context(ColumnNotFoundSnafu { + column: column_name, + })?; + + Ok((column.semantic_type == SemanticType::Tag) + .then(|| column.column_schema.data_type.clone())) + } + + /// Helper funtion to get a non-null literal. + fn nonnull_lit(expr: &DfExpr) -> Option<&ScalarValue> { + match expr { + DfExpr::Literal(lit) if !lit.is_null() => Some(lit), + _ => None, + } + } + + /// Helper function to get the column name of a column expression. + fn column_name(expr: &DfExpr) -> Option<&str> { + match expr { + DfExpr::Column(column) => Some(&column.name), + _ => None, + } + } + + /// Helper function to encode a literal into bytes.
+ fn encode_lit(lit: &ScalarValue, data_type: ConcreteDataType) -> Result<Vec<u8>> { + let value = Value::try_from(lit.clone()).unwrap(); + let mut bytes = vec![]; + let field = SortField::new(data_type); + IndexValueCodec::encode_value(value.as_value_ref(), &field, &mut bytes)?; + Ok(bytes) + } +} + +#[cfg(test)] +mod tests { + use api::v1::SemanticType; + use datafusion_common::Column; + use datatypes::data_type::ConcreteDataType; + use datatypes::schema::ColumnSchema; + use object_store::services::Memory; + use object_store::ObjectStore; + use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder}; + use store_api::storage::RegionId; + + use super::*; + + pub(crate) fn test_region_metadata() -> RegionMetadata { + let mut builder = RegionMetadataBuilder::new(RegionId::new(1234, 5678)); + builder + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new("a", ConcreteDataType::string_datatype(), false), + semantic_type: SemanticType::Tag, + column_id: 1, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new("b", ConcreteDataType::string_datatype(), false), + semantic_type: SemanticType::Field, + column_id: 2, + }) + .push_column_metadata(ColumnMetadata { + column_schema: ColumnSchema::new( + "c", + ConcreteDataType::timestamp_millisecond_datatype(), + false, + ), + semantic_type: SemanticType::Timestamp, + column_id: 3, + }) + .primary_key(vec![1]); + builder.build().unwrap() + } + + pub(crate) fn test_object_store() -> ObjectStore { + ObjectStore::new(Memory::default()).unwrap().finish() + } + + pub(crate) fn tag_column() -> DfExpr { + DfExpr::Column(Column { + relation: None, + name: "a".to_string(), + }) + } + + pub(crate) fn field_column() -> DfExpr { + DfExpr::Column(Column { + relation: None, + name: "b".to_string(), + }) + } + + pub(crate) fn nonexistent_column() -> DfExpr { + DfExpr::Column(Column { + relation: None, + name: "nonexistent".to_string(), + }) + } + + pub(crate) fn string_lit(s: impl Into<String>) -> DfExpr { + DfExpr::Literal(ScalarValue::Utf8(Some(s.into()))) + } + + pub(crate) fn int64_lit(i: impl Into<i64>) -> DfExpr { + DfExpr::Literal(ScalarValue::Int64(Some(i.into()))) + } + + pub(crate) fn encoded_string(s: impl Into<String>) -> Vec<u8> { + let mut bytes = vec![]; + IndexValueCodec::encode_value( + Value::from(s.into()).as_value_ref(), + &SortField::new(ConcreteDataType::string_datatype()), + &mut bytes, + ) + .unwrap(); + bytes + } +} diff --git a/src/mito2/src/sst/index/applier/builder/between.rs new file mode 100644 index 000000000000..50ae7073b2db --- /dev/null +++ b/src/mito2/src/sst/index/applier/builder/between.rs @@ -0,0 +1,171 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +use datafusion_expr::Between; +use index::inverted_index::search::predicate::{Bound, Predicate, Range, RangePredicate}; + +use crate::error::Result; +use crate::sst::index::applier::builder::SstIndexApplierBuilder; + +impl<'a> SstIndexApplierBuilder<'a> { + /// Collects a `BETWEEN` expression in the form of `column BETWEEN lit AND lit`. + pub(crate) fn collect_between(&mut self, between: &Between) -> Result<()> { + if between.negated { + return Ok(()); + } + + let Some(column_name) = Self::column_name(&between.expr) else { + return Ok(()); + }; + let Some(data_type) = self.tag_column_type(column_name)? else { + return Ok(()); + }; + let Some(low) = Self::nonnull_lit(&between.low) else { + return Ok(()); + }; + let Some(high) = Self::nonnull_lit(&between.high) else { + return Ok(()); + }; + + let predicate = Predicate::Range(RangePredicate { + range: Range { + lower: Some(Bound { + inclusive: true, + value: Self::encode_lit(low, data_type.clone())?, + }), + upper: Some(Bound { + inclusive: true, + value: Self::encode_lit(high, data_type)?, + }), + }, + }); + + self.add_predicate(column_name, predicate); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::error::Error; + use crate::sst::index::applier::builder::tests::{ + encoded_string, field_column, int64_lit, nonexistent_column, string_lit, tag_column, + test_object_store, test_region_metadata, + }; + + #[test] + fn test_collect_between_basic() { + let metadata = test_region_metadata(); + let mut builder = + SstIndexApplierBuilder::new("test".to_string(), test_object_store(), &metadata); + + let between = Between { + negated: false, + expr: Box::new(tag_column()), + low: Box::new(string_lit("abc")), + high: Box::new(string_lit("def")), + }; + + builder.collect_between(&between).unwrap(); + + let predicates = builder.output.get("a").unwrap(); + assert_eq!(predicates.len(), 1); + assert_eq!( + predicates[0], + Predicate::Range(RangePredicate { + range: Range { + lower: Some(Bound { + inclusive: true, + value: encoded_string("abc"), + }), + upper: Some(Bound { + inclusive: true, + value: encoded_string("def"), + }), + } + }) + ); + } + + #[test] + fn test_collect_between_negated() { + let metadata = test_region_metadata(); + let mut builder = + SstIndexApplierBuilder::new("test".to_string(), test_object_store(), &metadata); + + let between = Between { + negated: true, + expr: Box::new(tag_column()), + low: Box::new(string_lit("abc")), + high: Box::new(string_lit("def")), + }; + + builder.collect_between(&between).unwrap(); + assert!(builder.output.is_empty()); + } + + #[test] + fn test_collect_between_field_column() { + let metadata = test_region_metadata(); + let mut builder = + SstIndexApplierBuilder::new("test".to_string(), test_object_store(), &metadata); + + let between = Between { + negated: false, + expr: Box::new(field_column()), + low: Box::new(string_lit("abc")), + high: Box::new(string_lit("def")), + }; + + builder.collect_between(&between).unwrap(); + assert!(builder.output.is_empty()); + } + + #[test] + fn test_collect_between_type_mismatch() { + let metadata = test_region_metadata(); + let mut builder = + SstIndexApplierBuilder::new("test".to_string(), test_object_store(), &metadata); + + let between = Between { + negated: false, + expr: Box::new(tag_column()), + low: Box::new(int64_lit(123)), + high: Box::new(int64_lit(456)), + }; + + let res = builder.collect_between(&between); + assert!(matches!(res, Err(Error::FieldTypeMismatch { .. 
}))); + assert!(builder.output.is_empty()); + } + + #[test] + fn test_collect_between_nonexistent_column() { + let metadata = test_region_metadata(); + let mut builder = + SstIndexApplierBuilder::new("test".to_string(), test_object_store(), &metadata); + + let between = Between { + negated: false, + expr: Box::new(nonexistent_column()), + low: Box::new(string_lit("abc")), + high: Box::new(string_lit("def")), + }; + + let res = builder.collect_between(&between); + assert!(matches!(res, Err(Error::ColumnNotFound { .. }))); + assert!(builder.output.is_empty()); + } +} diff --git a/src/mito2/src/sst/index/codec.rs new file mode 100644 index 000000000000..ada5ac07cbfc --- /dev/null +++ b/src/mito2/src/sst/index/codec.rs @@ -0,0 +1,65 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datatypes::value::ValueRef; +use memcomparable::Serializer; + +use crate::error::Result; +use crate::row_converter::SortField; + +/// Encodes index values according to their data types for sorting and storage use. +pub struct IndexValueCodec; + +impl IndexValueCodec { + /// Serializes a `ValueRef` using the data type defined in `SortField` and writes + /// the result into a buffer. + /// + /// # Arguments + /// * `value` - The value to be encoded. + /// * `field` - Contains data type to guide serialization. + /// * `buffer` - Destination buffer for the serialized value. + pub fn encode_value(value: ValueRef, field: &SortField, buffer: &mut Vec<u8>) -> Result<()> { + buffer.reserve(field.estimated_size()); + let mut serializer = Serializer::new(buffer); + field.serialize(&mut serializer, &value) + } +} + +#[cfg(test)] +mod tests { + use datatypes::data_type::ConcreteDataType; + + use super::*; + use crate::error::Error; + + #[test] + fn test_encode_value_basic() { + let value = ValueRef::from("hello"); + let field = SortField::new(ConcreteDataType::string_datatype()); + + let mut buffer = Vec::new(); + IndexValueCodec::encode_value(value, &field, &mut buffer).unwrap(); + assert!(!buffer.is_empty()); + } + + #[test] + fn test_encode_value_type_mismatch() { + let value = ValueRef::from("hello"); + let field = SortField::new(ConcreteDataType::int64_datatype()); + + let mut buffer = Vec::new(); + let res = IndexValueCodec::encode_value(value, &field, &mut buffer); + assert!(matches!(res, Err(Error::FieldTypeMismatch { ..
}))); + } +} From 55fc1f9583e3d5e5938c0b46abf53317e4ac1d56 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Thu, 28 Dec 2023 10:19:24 +0000 Subject: [PATCH 04/27] chore: add docs Signed-off-by: Zhenchi --- src/mito2/src/sst/index.rs | 2 ++ src/mito2/src/sst/index/applier.rs | 11 +++++++++-- src/mito2/src/sst/index/applier/builder.rs | 2 -- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs index 34ccd1d0f71f..baffda27aa6e 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -12,5 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#![allow(dead_code)] + pub mod applier; mod codec; diff --git a/src/mito2/src/sst/index/applier.rs b/src/mito2/src/sst/index/applier.rs index d4c9350caa4f..564a20d33c0f 100644 --- a/src/mito2/src/sst/index/applier.rs +++ b/src/mito2/src/sst/index/applier.rs @@ -19,15 +19,22 @@ use std::sync::Arc; use index::inverted_index::search::index_apply::IndexApplier; use object_store::ObjectStore; -#[allow(dead_code)] -#[derive(Clone)] +/// The [`SstIndexApplier`] is responsible for applying predicates to the provided SST files +/// and returning the relevant row group ids for further scan. pub struct SstIndexApplier { + /// The root directory of the region. region_dir: String, + + /// Object store responsible for accessing SST files. object_store: ObjectStore, + + /// Predifined index applier used to apply predicates to index files + /// and return the relevant row group ids for further scan. index_applier: Arc, } impl SstIndexApplier { + /// Creates a new [`SstIndexApplier`]. pub fn new( region_dir: String, object_store: ObjectStore, diff --git a/src/mito2/src/sst/index/applier/builder.rs b/src/mito2/src/sst/index/applier/builder.rs index 5034c757caee..6ecd0b0e5d5c 100644 --- a/src/mito2/src/sst/index/applier/builder.rs +++ b/src/mito2/src/sst/index/applier/builder.rs @@ -60,7 +60,6 @@ pub struct SstIndexApplierBuilder<'a> { impl<'a> SstIndexApplierBuilder<'a> { /// Creates a new [`SstIndexApplierBuilder`]. - #[allow(dead_code)] pub fn new( region_dir: String, object_store: ObjectStore, @@ -76,7 +75,6 @@ impl<'a> SstIndexApplierBuilder<'a> { /// Consumes the builder to construct an [`SstIndexApplier`], optionally returned based on /// the expressions provided. If no predicates match, returns `None`. - #[allow(dead_code)] pub fn build(mut self, exprs: &[Expr]) -> Result> { for expr in exprs { self.traverse_and_collect(expr.df_expr())?; From 09639a7412228a3bc14d7f30f7c7bcc417a83f63 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Thu, 28 Dec 2023 10:36:03 +0000 Subject: [PATCH 05/27] fix: typos Signed-off-by: Zhenchi --- src/common/config/src/wal/kafka.rs | 2 +- src/mito2/src/sst/index/applier.rs | 2 +- src/mito2/src/sst/index/applier/builder.rs | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/common/config/src/wal/kafka.rs b/src/common/config/src/wal/kafka.rs index e93aa6cb2271..858991264bb6 100644 --- a/src/common/config/src/wal/kafka.rs +++ b/src/common/config/src/wal/kafka.rs @@ -42,7 +42,7 @@ pub struct KafkaConfig { #[serde(skip)] #[serde(default)] pub compression: RsKafkaCompression, - /// The maximum log size a kakfa batch producer could buffer. + /// The maximum log size a kafka batch producer could buffer. pub max_batch_size: ReadableSize, /// The linger duration of a kafka batch producer. 
#[serde(with = "humantime_serde")] diff --git a/src/mito2/src/sst/index/applier.rs b/src/mito2/src/sst/index/applier.rs index 564a20d33c0f..633aad007f1a 100644 --- a/src/mito2/src/sst/index/applier.rs +++ b/src/mito2/src/sst/index/applier.rs @@ -28,7 +28,7 @@ pub struct SstIndexApplier { /// Object store responsible for accessing SST files. object_store: ObjectStore, - /// Predifined index applier used to apply predicates to index files + /// Predefined index applier used to apply predicates to index files /// and return the relevant row group ids for further scan. index_applier: Arc, } diff --git a/src/mito2/src/sst/index/applier/builder.rs b/src/mito2/src/sst/index/applier/builder.rs index 6ecd0b0e5d5c..6414081ac713 100644 --- a/src/mito2/src/sst/index/applier/builder.rs +++ b/src/mito2/src/sst/index/applier/builder.rs @@ -14,7 +14,7 @@ mod between; -// TODO(zhongzc): This PR is too large. The following modules are comming soon. +// TODO(zhongzc): This PR is too large. The following modules are coming soon. // mod comparison; // mod eq_list; @@ -99,7 +99,7 @@ impl<'a> SstIndexApplierBuilder<'a> { match expr { DfExpr::Between(between) => self.collect_between(between), - // TODO(zhongzc): This PR is too large. The following arms are comming soon. + // TODO(zhongzc): This PR is too large. The following arms are coming soon. // DfExpr::InList(in_list) => self.collect_inlist(in_list), // DfExpr::BinaryExpr(BinaryExpr { left, op, right }) => match op { @@ -145,7 +145,7 @@ impl<'a> SstIndexApplierBuilder<'a> { .then(|| column.column_schema.data_type.clone())) } - /// Helper funtion to get a non-null literal. + /// Helper function to get a non-null literal. fn nonnull_lit(expr: &DfExpr) -> Option<&ScalarValue> { match expr { DfExpr::Literal(lit) if !lit.is_null() => Some(lit), From 19c5f38f1057837f06ae6c0d7014950376d3907e Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Fri, 29 Dec 2023 04:29:34 +0000 Subject: [PATCH 06/27] fix: address comments Signed-off-by: Zhenchi --- src/mito2/src/sst/index/applier.rs | 6 ++---- src/mito2/src/sst/index/applier/builder.rs | 19 ++++++++++++------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/mito2/src/sst/index/applier.rs b/src/mito2/src/sst/index/applier.rs index 633aad007f1a..95ca25ba003d 100644 --- a/src/mito2/src/sst/index/applier.rs +++ b/src/mito2/src/sst/index/applier.rs @@ -14,8 +14,6 @@ pub mod builder; -use std::sync::Arc; - use index::inverted_index::search::index_apply::IndexApplier; use object_store::ObjectStore; @@ -30,7 +28,7 @@ pub struct SstIndexApplier { /// Predefined index applier used to apply predicates to index files /// and return the relevant row group ids for further scan. 
- index_applier: Arc, + index_applier: Box, } impl SstIndexApplier { @@ -38,7 +36,7 @@ impl SstIndexApplier { pub fn new( region_dir: String, object_store: ObjectStore, - index_applier: Arc, + index_applier: Box, ) -> Self { Self { region_dir, diff --git a/src/mito2/src/sst/index/applier/builder.rs b/src/mito2/src/sst/index/applier/builder.rs index 6414081ac713..d3cf6dc799b8 100644 --- a/src/mito2/src/sst/index/applier/builder.rs +++ b/src/mito2/src/sst/index/applier/builder.rs @@ -22,10 +22,10 @@ mod between; // mod regex_match; use std::collections::HashMap; -use std::sync::Arc; use api::v1::SemanticType; use common_query::logical_plan::Expr; +use common_telemetry::warn; use datafusion_common::ScalarValue; use datafusion_expr::Expr as DfExpr; use datatypes::data_type::ConcreteDataType; @@ -77,7 +77,7 @@ impl<'a> SstIndexApplierBuilder<'a> { /// the expressions provided. If no predicates match, returns `None`. pub fn build(mut self, exprs: &[Expr]) -> Result> { for expr in exprs { - self.traverse_and_collect(expr.df_expr())?; + self.traverse_and_collect(expr.df_expr()); } if self.output.is_empty() { @@ -89,14 +89,14 @@ impl<'a> SstIndexApplierBuilder<'a> { Ok(Some(SstIndexApplier::new( self.region_dir, self.object_store, - Arc::new(applier.context(BuildIndexApplierSnafu)?), + Box::new(applier.context(BuildIndexApplierSnafu)?), ))) } /// Recursively traverses expressions to collect predicates. /// Results are stored in `self.output`. - fn traverse_and_collect(&mut self, expr: &DfExpr) -> Result<()> { - match expr { + fn traverse_and_collect(&mut self, expr: &DfExpr) { + let res = match expr { DfExpr::Between(between) => self.collect_between(between), // TODO(zhongzc): This PR is too large. The following arms are coming soon. @@ -104,8 +104,9 @@ impl<'a> SstIndexApplierBuilder<'a> { // DfExpr::InList(in_list) => self.collect_inlist(in_list), // DfExpr::BinaryExpr(BinaryExpr { left, op, right }) => match op { // Operator::And => { - // self.traverse_and_collect(left)?; - // self.traverse_and_collect(right) + // self.traverse_and_collect(left); + // self.traverse_and_collect(right); + // Ok(()) // } // Operator::Or => self.collect_or_eq_list(left, right), // Operator::Eq => self.collect_eq(left, right), @@ -118,6 +119,10 @@ impl<'a> SstIndexApplierBuilder<'a> { // TODO(zhongzc): support more expressions, e.g. IsNull, IsNotNull, ... _ => Ok(()), + }; + + if let Err(err) = res { + warn!("Failed to collect predicates, ignore it. error: {err}, expr: {expr}"); } } From a1792d1bb326b9f87832e75e16e6802ff168ca0f Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Fri, 29 Dec 2023 14:54:28 +0800 Subject: [PATCH 07/27] Update src/mito2/src/sst/index/applier/builder.rs Co-authored-by: Yingwen --- src/mito2/src/sst/index/applier/builder.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mito2/src/sst/index/applier/builder.rs b/src/mito2/src/sst/index/applier/builder.rs index d3cf6dc799b8..95c812017cdb 100644 --- a/src/mito2/src/sst/index/applier/builder.rs +++ b/src/mito2/src/sst/index/applier/builder.rs @@ -122,7 +122,7 @@ impl<'a> SstIndexApplierBuilder<'a> { }; if let Err(err) = res { - warn!("Failed to collect predicates, ignore it. error: {err}, expr: {expr}"); + warn!(err; "Failed to collect predicates, ignore it. 
expr: {expr}"); } } From 315c77bf88bd010034836a68bfd863008b03c51a Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Fri, 29 Dec 2023 07:33:19 +0000 Subject: [PATCH 08/27] add some metrics Signed-off-by: Zhenchi --- src/mito2/src/metrics.rs | 30 +++++ src/mito2/src/sst/index/applier.rs | 3 + src/mito2/src/sst/index/creator.rs | 107 ++++------------ src/mito2/src/sst/index/creator/statistics.rs | 118 ++++++++++++++++++ .../src/sst/index/creator/temp_provider.rs | 99 +++++++++++++++ 5 files changed, 276 insertions(+), 81 deletions(-) create mode 100644 src/mito2/src/sst/index/creator/statistics.rs create mode 100644 src/mito2/src/sst/index/creator/temp_provider.rs diff --git a/src/mito2/src/metrics.rs b/src/mito2/src/metrics.rs index d53cbd495dd5..dae565fb3baf 100644 --- a/src/mito2/src/metrics.rs +++ b/src/mito2/src/metrics.rs @@ -143,4 +143,34 @@ lazy_static! { &[TYPE_LABEL] ) .unwrap(); + // ------- End of cache metrics. + + // Index metrics. + /// Timer of index application. + pub static ref INDEX_APPLY_COST_TIME: Histogram = register_histogram!( + "index_apply_cost_time", + "index apply cost time", + ) + .unwrap(); + /// Timer of index creation. + pub static ref INDEX_CREATE_COST_TIME: HistogramVec = register_histogram_vec!( + "index_create_cost_time", + "index create cost time", + &[STAGE_LABEL] + ) + .unwrap(); + /// Counter of rows indexed. + pub static ref INDEX_CREATE_ROWS_TOTAL: IntCounter = register_int_counter!( + "index_rows_total", + "index rows total", + ) + .unwrap(); + /// Counter of created index bytes. + pub static ref INDEX_CREATE_BYTES_TOTAL: IntCounter = register_int_counter!( + "index_bytes_total", + "index bytes total", + ) + .unwrap(); + + // ------- End of index metrics. } diff --git a/src/mito2/src/sst/index/applier.rs b/src/mito2/src/sst/index/applier.rs index 67568248d837..041ad3086adf 100644 --- a/src/mito2/src/sst/index/applier.rs +++ b/src/mito2/src/sst/index/applier.rs @@ -26,6 +26,7 @@ use puffin::file_format::reader::{PuffinAsyncReader, PuffinFileReader}; use snafu::ResultExt; use crate::error::{OpenDalSnafu, Result}; +use crate::metrics::INDEX_APPLY_COST_TIME; use crate::sst::file::FileId; use crate::sst::index::INDEX_BLOB_TYPE; use crate::sst::location; @@ -52,6 +53,8 @@ impl SstIndexApplier { } pub async fn apply(&self, file_id: FileId) -> Result> { + let _timer = INDEX_APPLY_COST_TIME.start_timer(); + let file_path = location::index_file_path(&self.region_dir, &file_id); let file_reader = self diff --git a/src/mito2/src/sst/index/creator.rs b/src/mito2/src/sst/index/creator.rs index 4b374979a95a..5f9c0042f370 100644 --- a/src/mito2/src/sst/index/creator.rs +++ b/src/mito2/src/sst/index/creator.rs @@ -12,20 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+mod statistics; +mod temp_provider; + use std::collections::HashMap; use std::num::NonZeroUsize; use std::sync::Arc; -use async_trait::async_trait; -use common_error::ext::BoxedError; use common_telemetry::warn; -use futures::{AsyncRead, AsyncWrite}; -use index::inverted_index::create::sort::external_provider::ExternalTempFileProvider; use index::inverted_index::create::sort::external_sort::ExternalSorter; use index::inverted_index::create::sort_create::SortIndexCreator; use index::inverted_index::create::InvertedIndexCreator; -use index::inverted_index::error as index_error; -use index::inverted_index::error::Result as IndexResult; use index::inverted_index::format::writer::InvertedIndexBlobWriter; use object_store::ObjectStore; use puffin::file_format::writer::{Blob, PuffinAsyncWriter, PuffinFileWriter}; @@ -38,6 +35,8 @@ use crate::error::{OpenDalSnafu, PushIndexValueSnafu, Result}; use crate::read::Batch; use crate::sst::file::FileId; use crate::sst::index::codec::{IndexValueCodec, IndexValuesCodec}; +use crate::sst::index::creator::statistics::Statistics; +use crate::sst::index::creator::temp_provider::TempFileProvider; use crate::sst::index::{ INDEX_BLOB_TYPE, MIN_MEMORY_USAGE_THRESHOLD, PIPE_BUFFER_SIZE_FOR_SENDING_BLOB, }; @@ -57,7 +56,7 @@ pub struct SstIndexCreator { temp_file_provider: Arc, value_buf: Vec, - row_count: RowCount, + stats: Statistics, } impl SstIndexCreator { @@ -69,10 +68,10 @@ impl SstIndexCreator { memory_usage_threshold: Option, row_group_size: NonZeroUsize, ) -> Self { - let temp_file_provider = Arc::new(TempFileProvider { - location: IntermediateLocation::new(®ion_dir, &sst_file_id), - object_store: object_store.clone(), - }); + let temp_file_provider = Arc::new(TempFileProvider::new( + IntermediateLocation::new(®ion_dir, &sst_file_id), + object_store.clone(), + )); let memory_usage_threshold = memory_usage_threshold.map(|threshold| { (threshold / metadata.primary_key.len()).max(MIN_MEMORY_USAGE_THRESHOLD) }); @@ -89,7 +88,7 @@ impl SstIndexCreator { index_creator, temp_file_provider, value_buf: vec![], - row_count: 0, + stats: Statistics::default(), } } @@ -103,7 +102,7 @@ impl SstIndexCreator { if let Err(err) = self.do_cleanup().await { let region_dir = &self.region_dir; let sst_file_id = &self.sst_file_id; - warn!("Failed to clean up index creator, region_dir: {region_dir}, sst_file_id: {sst_file_id}, error: {err}"); + warn!(err; "Failed to clean up index creator, region_dir: {region_dir}, sst_file_id: {sst_file_id}"); } return Err(err); } @@ -112,8 +111,8 @@ impl SstIndexCreator { } pub async fn finish(&mut self) -> Result<(RowCount, ByteCount)> { - if self.row_count == 0 { - // Everything is clean, no IO is performed. 
+ if self.stats.row_count() == 0 { + // no IO is performed, no garbage to clean up, just return return Ok((0, 0)); } @@ -124,15 +123,17 @@ impl SstIndexCreator { if let Err(err) = cleanup_res { let region_dir = &self.region_dir; let sst_file_id = &self.sst_file_id; - warn!("Failed to clean up index creator, region_dir: {region_dir}, sst_file_id: {sst_file_id}, error: {err}"); + warn!(err; "Failed to clean up index creator, region_dir: {region_dir}, sst_file_id: {sst_file_id}"); } - finish_res.map(|bytes| (self.row_count, bytes)) + finish_res.map(|_| (self.stats.row_count(), self.stats.byte_count())) } async fn do_update(&mut self, batch: &Batch) -> Result<()> { + let mut guard = self.stats.record_update(); + let n = batch.num_rows(); - self.row_count += n; + guard.inc_row_count(n); for (column_name, field, value) in self.codec.decode(batch.primary_key())? { if let Some(value) = value.as_ref() { self.value_buf.clear(); @@ -149,7 +150,9 @@ impl SstIndexCreator { Ok(()) } - async fn do_finish(&mut self) -> Result { + async fn do_finish(&mut self) -> Result<()> { + let mut guard = self.stats.record_finish(); + let file_path = location::index_file_path(&self.region_dir, &self.sst_file_id); let writer = self .object_store @@ -175,71 +178,13 @@ impl SstIndexCreator { source.unwrap(); sink.unwrap(); - Ok(puffin_writer.finish().await.unwrap()) + let byte_count = puffin_writer.finish().await.unwrap(); + guard.inc_byte_count(byte_count); + Ok(()) } async fn do_cleanup(&mut self) -> Result<()> { + let _guard = self.stats.record_cleanup(); self.temp_file_provider.cleanup().await } } - -struct TempFileProvider { - location: IntermediateLocation, - object_store: ObjectStore, -} - -#[async_trait] -impl ExternalTempFileProvider for TempFileProvider { - async fn create( - &self, - column_name: &str, - file_id: &str, - ) -> IndexResult> { - let path = self.location.file_path(column_name, file_id); - let writer = self - .object_store - .writer(&path) - .await - .context(OpenDalSnafu) - .map_err(BoxedError::new) - .context(index_error::ExternalSnafu)?; - Ok(Box::new(writer)) - } - - async fn read_all( - &self, - column_name: &str, - ) -> IndexResult>> { - let dir = self.location.column_dir(column_name); - let entries = self - .object_store - .list(&dir) - .await - .context(OpenDalSnafu) - .map_err(BoxedError::new) - .context(index_error::ExternalSnafu)?; - let mut readers = Vec::with_capacity(entries.len()); - - for entry in entries { - let reader = self - .object_store - .reader(entry.path()) - .await - .context(OpenDalSnafu) - .map_err(BoxedError::new) - .context(index_error::ExternalSnafu)?; - readers.push(Box::new(reader) as _); - } - - Ok(readers) - } -} - -impl TempFileProvider { - async fn cleanup(&self) -> Result<()> { - self.object_store - .remove_all(self.location.root_dir()) - .await - .context(OpenDalSnafu) - } -} diff --git a/src/mito2/src/sst/index/creator/statistics.rs b/src/mito2/src/sst/index/creator/statistics.rs new file mode 100644 index 000000000000..70016290a5b1 --- /dev/null +++ b/src/mito2/src/sst/index/creator/statistics.rs @@ -0,0 +1,118 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::time::{Duration, Instant}; + +use crate::metrics::{INDEX_CREATE_BYTES_TOTAL, INDEX_CREATE_COST_TIME, INDEX_CREATE_ROWS_TOTAL}; + +enum Stage { + Update, + Finish, + Cleanup, +} + +#[derive(Default)] +pub(crate) struct Statistics { + update_cost: Duration, + finish_cost: Duration, + cleanup_cost: Duration, + row_count: usize, + byte_count: usize, +} + +impl Statistics { + pub fn record_update(&mut self) -> TimerGuard<'_> { + TimerGuard::new(self, Stage::Update) + } + + pub fn record_finish(&mut self) -> TimerGuard<'_> { + TimerGuard::new(self, Stage::Finish) + } + + pub fn record_cleanup(&mut self) -> TimerGuard<'_> { + TimerGuard::new(self, Stage::Cleanup) + } + + pub fn row_count(&self) -> usize { + self.row_count + } + + pub fn byte_count(&self) -> usize { + self.byte_count + } + + fn flush(&self) { + INDEX_CREATE_COST_TIME + .with_label_values(&["update"]) + .observe(self.update_cost.as_secs_f64()); + INDEX_CREATE_COST_TIME + .with_label_values(&["finish"]) + .observe(self.finish_cost.as_secs_f64()); + INDEX_CREATE_COST_TIME + .with_label_values(&["cleanup"]) + .observe(self.cleanup_cost.as_secs_f64()); + INDEX_CREATE_COST_TIME + .with_label_values(&["total"]) + .observe((self.update_cost + self.finish_cost + self.cleanup_cost).as_secs_f64()); + + INDEX_CREATE_ROWS_TOTAL.inc_by(self.row_count as _); + INDEX_CREATE_BYTES_TOTAL.inc_by(self.byte_count as _); + } +} + +impl Drop for Statistics { + fn drop(&mut self) { + self.flush(); + } +} + +pub(crate) struct TimerGuard<'a> { + stats: &'a mut Statistics, + stage: Stage, + timer: Instant, +} + +impl<'a> TimerGuard<'a> { + fn new(stats: &'a mut Statistics, stage: Stage) -> Self { + Self { + stats, + stage, + timer: Instant::now(), + } + } + + pub fn inc_row_count(&mut self, n: usize) { + self.stats.row_count += n; + } + + pub fn inc_byte_count(&mut self, n: usize) { + self.stats.byte_count += n; + } +} + +impl Drop for TimerGuard<'_> { + fn drop(&mut self) { + match self.stage { + Stage::Update => { + self.stats.update_cost += self.timer.elapsed(); + } + Stage::Finish => { + self.stats.finish_cost += self.timer.elapsed(); + } + Stage::Cleanup => { + self.stats.cleanup_cost += self.timer.elapsed(); + } + } + } +} diff --git a/src/mito2/src/sst/index/creator/temp_provider.rs b/src/mito2/src/sst/index/creator/temp_provider.rs new file mode 100644 index 000000000000..2148a19af9bd --- /dev/null +++ b/src/mito2/src/sst/index/creator/temp_provider.rs @@ -0,0 +1,99 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
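The provider in this file hands out one writer per (column, file id) so the external sorter can spill sorted runs, and later returns readers over everything spilled for a column so the runs can be merged. A rough sketch of that contract from the consumer side; the trait here is a simplified stand-in with `std::io` errors, not the real `ExternalTempFileProvider` from `index::inverted_index`:

use async_trait::async_trait;
use futures::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};

// Simplified stand-in for the external temp-file contract.
#[async_trait]
trait TempFiles {
    async fn create(
        &self,
        column: &str,
        file_id: &str,
    ) -> std::io::Result<Box<dyn AsyncWrite + Unpin + Send>>;
    async fn read_all(
        &self,
        column: &str,
    ) -> std::io::Result<Vec<Box<dyn AsyncRead + Unpin + Send>>>;
}

// Spill phase: one temp file per sorted run.
async fn spill_runs(files: &dyn TempFiles, column: &str, runs: &[&[u8]]) -> std::io::Result<()> {
    for (i, run) in runs.iter().enumerate() {
        let mut w = files.create(column, &format!("run-{i}")).await?;
        w.write_all(run).await?;
        w.close().await?;
    }
    Ok(())
}

// Merge phase: stream every spilled run back. A real sorter would k-way
// merge the runs instead of concatenating them.
async fn read_back(files: &dyn TempFiles, column: &str) -> std::io::Result<Vec<u8>> {
    let mut all = Vec::new();
    for mut r in files.read_all(column).await? {
        r.read_to_end(&mut all).await?;
    }
    Ok(all)
}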
+ +use async_trait::async_trait; +use common_error::ext::BoxedError; +use common_telemetry::warn; +use futures::{AsyncRead, AsyncWrite}; +use index::inverted_index::create::sort::external_provider::ExternalTempFileProvider; +use index::inverted_index::error as index_error; +use index::inverted_index::error::Result as IndexResult; +use object_store::ObjectStore; +use snafu::ResultExt; + +use crate::error::{OpenDalSnafu, Result}; +use crate::sst::location::IntermediateLocation; + +pub(crate) struct TempFileProvider { + location: IntermediateLocation, + object_store: ObjectStore, +} + +#[async_trait] +impl ExternalTempFileProvider for TempFileProvider { + async fn create( + &self, + column_name: &str, + file_id: &str, + ) -> IndexResult> { + let path = self.location.file_path(column_name, file_id); + let writer = self + .object_store + .writer(&path) + .await + .context(OpenDalSnafu) + .map_err(BoxedError::new) + .context(index_error::ExternalSnafu)?; + Ok(Box::new(writer)) + } + + async fn read_all( + &self, + column_name: &str, + ) -> IndexResult>> { + let dir = self.location.column_dir(column_name); + let entries = self + .object_store + .list(&dir) + .await + .context(OpenDalSnafu) + .map_err(BoxedError::new) + .context(index_error::ExternalSnafu)?; + let mut readers = Vec::with_capacity(entries.len()); + + for entry in entries { + if entry.metadata().is_dir() { + warn!("Unexpected entry in index temp dir: {:?}", entry.path()); + continue; + } + + let reader = self + .object_store + .reader(entry.path()) + .await + .context(OpenDalSnafu) + .map_err(BoxedError::new) + .context(index_error::ExternalSnafu)?; + readers.push(Box::new(reader) as _); + } + + Ok(readers) + } +} + +impl TempFileProvider { + pub fn new(location: IntermediateLocation, object_store: ObjectStore) -> Self { + Self { + location, + object_store, + } + } + + pub async fn cleanup(&self) -> Result<()> { + self.object_store + .remove_all(self.location.root_dir()) + .await + .context(OpenDalSnafu) + } +} From 6f181c4e5dcb4f18aab3414d7e537db1e7067ebe Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Fri, 29 Dec 2023 09:52:27 +0000 Subject: [PATCH 09/27] track io bytes Signed-off-by: Zhenchi --- Cargo.lock | 1 + .../src/inverted_index/search/fst_apply.rs | 3 + .../search/fst_apply/intersection_apply.rs | 34 ++++- .../search/fst_apply/keys_apply.rs | 15 ++- .../src/inverted_index/search/index_apply.rs | 3 + .../search/index_apply/predicates_apply.rs | 8 ++ src/mito2/Cargo.toml | 1 + src/mito2/src/metrics.rs | 28 +++- src/mito2/src/sst/index.rs | 1 + src/mito2/src/sst/index/applier.rs | 15 ++- src/mito2/src/sst/index/creator.rs | 7 +- .../src/sst/index/creator/temp_provider.rs | 4 + src/mito2/src/sst/index/io_stats.rs | 122 ++++++++++++++++++ 13 files changed, 231 insertions(+), 11 deletions(-) create mode 100644 src/mito2/src/sst/index/io_stats.rs diff --git a/Cargo.lock b/Cargo.lock index 3fff439cb99e..3b70b7853da9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4986,6 +4986,7 @@ dependencies = [ "object-store", "parquet", "paste", + "pin-project", "prometheus", "prost 0.12.3", "puffin", diff --git a/src/index/src/inverted_index/search/fst_apply.rs b/src/index/src/inverted_index/search/fst_apply.rs index 9f54d0d88918..b496539d866a 100644 --- a/src/index/src/inverted_index/search/fst_apply.rs +++ b/src/index/src/inverted_index/search/fst_apply.rs @@ -30,4 +30,7 @@ pub trait FstApplier: Send + Sync { /// /// Returns a `Vec`, with each u64 being a value from the FstMap. 
fn apply(&self, fst: &FstMap) -> Vec; + + /// Returns the memory usage of the FstApplier. + fn memory_usage(&self) -> usize; } diff --git a/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs b/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs index a5783fd97526..6589c299a912 100644 --- a/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs +++ b/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs @@ -31,6 +31,9 @@ pub struct IntersectionFstApplier { /// A list of `Dfa` compiled from regular expression patterns. dfas: Vec>>, + + /// The memory usage of the `IntersectionFstApplier`. + memory_usage: usize, } impl FstApplier for IntersectionFstApplier { @@ -68,6 +71,10 @@ impl FstApplier for IntersectionFstApplier { } values } + + fn memory_usage(&self) -> usize { + self.memory_usage + } } impl IntersectionFstApplier { @@ -82,12 +89,17 @@ impl IntersectionFstApplier { let mut dfas = Vec::with_capacity(predicates.len()); let mut ranges = Vec::with_capacity(predicates.len()); + let mut memory_usage = 0; for predicate in predicates { match predicate { - Predicate::Range(range) => ranges.push(range.range), + Predicate::Range(range) => { + memory_usage += Self::range_memory_usage(&range.range); + ranges.push(range.range) + } Predicate::RegexMatch(regex) => { let dfa = DFA::new(®ex.pattern); let dfa = dfa.map_err(Box::new).context(ParseDFASnafu)?; + memory_usage += dfa.memory_usage(); dfas.push(dfa); } // Rejection of `InList` predicates is enforced here. @@ -97,7 +109,25 @@ impl IntersectionFstApplier { } } - Ok(Self { dfas, ranges }) + Ok(Self { + dfas, + ranges, + memory_usage, + }) + } + + fn range_memory_usage(range: &Range) -> usize { + let mut memory_usage = std::mem::size_of::(); + + if let Some(lower) = &range.lower { + memory_usage += lower.value.len(); + } + + if let Some(upper) = &range.upper { + memory_usage += upper.value.len(); + } + + memory_usage } } diff --git a/src/index/src/inverted_index/search/fst_apply/keys_apply.rs b/src/index/src/inverted_index/search/fst_apply/keys_apply.rs index 4ec5710a3435..79b79db1ced0 100644 --- a/src/index/src/inverted_index/search/fst_apply/keys_apply.rs +++ b/src/index/src/inverted_index/search/fst_apply/keys_apply.rs @@ -29,12 +29,19 @@ use crate::inverted_index::{Bytes, FstMap}; pub struct KeysFstApplier { /// A list of keys to be fetched directly from the FstMap. keys: Vec, + + /// The memory usage of the applier. 
+ memory_usage: usize, } impl FstApplier for KeysFstApplier { fn apply(&self, fst: &FstMap) -> Vec { self.keys.iter().filter_map(|k| fst.get(k)).collect() } + + fn memory_usage(&self) -> usize { + self.memory_usage + } } impl KeysFstApplier { @@ -56,6 +63,7 @@ impl KeysFstApplier { let regex_matched_keys = Self::filter_by_regexes(range_matched_keys, regexes)?; Ok(Self { + memory_usage: regex_matched_keys.iter().map(|k| k.len()).sum(), keys: regex_matched_keys, }) } @@ -192,6 +200,7 @@ mod tests { let test_fst = create_fst_map(&[(b"foo", 1), (b"bar", 2), (b"baz", 3)]); let applier = KeysFstApplier { keys: vec![b("foo"), b("baz")], + memory_usage: 6, }; let results = applier.apply(&test_fst); @@ -201,7 +210,10 @@ mod tests { #[test] fn test_keys_fst_applier_with_empty_keys() { let test_fst = create_fst_map(&[(b"foo", 1), (b"bar", 2), (b"baz", 3)]); - let applier = KeysFstApplier { keys: vec![] }; + let applier = KeysFstApplier { + keys: vec![], + memory_usage: 0, + }; let results = applier.apply(&test_fst); assert!(results.is_empty()); @@ -212,6 +224,7 @@ mod tests { let test_fst = create_fst_map(&[(b"foo", 1), (b"bar", 2), (b"baz", 3)]); let applier = KeysFstApplier { keys: vec![b("qux"), b("quux")], + memory_usage: 7, }; let results = applier.apply(&test_fst); diff --git a/src/index/src/inverted_index/search/index_apply.rs b/src/index/src/inverted_index/search/index_apply.rs index 6701f03cac3d..8ad4db10904d 100644 --- a/src/index/src/inverted_index/search/index_apply.rs +++ b/src/index/src/inverted_index/search/index_apply.rs @@ -35,6 +35,9 @@ pub trait IndexApplier: Send + Sync { context: SearchContext, reader: &mut dyn InvertedIndexReader, ) -> Result>; + + /// Returns the memory usage of the applier. + fn memory_usage(&self) -> usize; } /// A context for searching the inverted index. diff --git a/src/index/src/inverted_index/search/index_apply/predicates_apply.rs b/src/index/src/inverted_index/search/index_apply/predicates_apply.rs index 2331d8af6ffd..c8a06965e241 100644 --- a/src/index/src/inverted_index/search/index_apply/predicates_apply.rs +++ b/src/index/src/inverted_index/search/index_apply/predicates_apply.rs @@ -82,6 +82,14 @@ impl IndexApplier for PredicatesIndexApplier { Ok(bitmap.iter_ones().collect()) } + + /// Returns the memory usage of the applier. + fn memory_usage(&self) -> usize { + self.fst_appliers + .iter() + .map(|(n, fst_applier)| n.as_bytes().len() + fst_applier.memory_usage()) + .sum() + } } impl PredicatesIndexApplier { diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index 0fbc469fdd22..b977dc40ec3e 100644 --- a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -29,6 +29,7 @@ common-procedure.workspace = true common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true +pin-project.workspace = true common-telemetry.workspace = true common-test-util = { workspace = true, optional = true } common-time.workspace = true diff --git a/src/mito2/src/metrics.rs b/src/mito2/src/metrics.rs index dae565fb3baf..5f4038d93018 100644 --- a/src/mito2/src/metrics.rs +++ b/src/mito2/src/metrics.rs @@ -21,6 +21,8 @@ pub const STAGE_LABEL: &str = "stage"; pub const TYPE_LABEL: &str = "type"; /// Reason to flush. pub const FLUSH_REASON: &str = "reason"; +/// File type label. +pub const FILE_TYPE_LABEL: &str = "file_type"; lazy_static! { /// Global write buffer size in bytes. @@ -152,6 +154,12 @@ lazy_static! { "index apply cost time", ) .unwrap(); + /// Gauge of index apply memory usage. 
+ pub static ref INDEX_APPLY_MEMORY_USAGE: IntGauge = register_int_gauge!( + "index_apply_memory_usage", + "index apply memory usage", + ) + .unwrap(); /// Timer of index creation. pub static ref INDEX_CREATE_COST_TIME: HistogramVec = register_histogram_vec!( "index_create_cost_time", @@ -161,16 +169,26 @@ lazy_static! { .unwrap(); /// Counter of rows indexed. pub static ref INDEX_CREATE_ROWS_TOTAL: IntCounter = register_int_counter!( - "index_rows_total", - "index rows total", + "index_create_rows_total", + "index create rows total", ) .unwrap(); /// Counter of created index bytes. pub static ref INDEX_CREATE_BYTES_TOTAL: IntCounter = register_int_counter!( - "index_bytes_total", - "index bytes total", + "index_create_bytes_total", + "index create bytes total", ) .unwrap(); - + /// Counter of r/w bytes on index related IO operations. + pub static ref INDEX_IO_BYTES_TOTAL: IntCounterVec = register_int_counter_vec!( + "index_io_bytes_total", + "index io bytes total", + &[TYPE_LABEL, FILE_TYPE_LABEL] + ) + .unwrap(); + pub static ref INDEX_INTERMEDIATE_READ_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL.with_label_values(&["read", "intermediate"]); + pub static ref INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL.with_label_values(&["write", "intermediate"]); + pub static ref INDEX_PUFFIN_READ_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL.with_label_values(&["read", "puffin"]); + pub static ref INDEX_PUFFIN_WRITE_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL.with_label_values(&["write", "puffin"]); // ------- End of index metrics. } diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs index 1a0365309b85..5e3089f7bef1 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -15,6 +15,7 @@ pub mod applier; mod codec; pub mod creator; +mod io_stats; const INDEX_BLOB_TYPE: &str = "greptime-inverted-index-v1"; diff --git a/src/mito2/src/sst/index/applier.rs b/src/mito2/src/sst/index/applier.rs index 041ad3086adf..191763425e4a 100644 --- a/src/mito2/src/sst/index/applier.rs +++ b/src/mito2/src/sst/index/applier.rs @@ -25,8 +25,11 @@ use object_store::ObjectStore; use puffin::file_format::reader::{PuffinAsyncReader, PuffinFileReader}; use snafu::ResultExt; +use super::io_stats::InstrumentedAsyncRead; use crate::error::{OpenDalSnafu, Result}; -use crate::metrics::INDEX_APPLY_COST_TIME; +use crate::metrics::{ + INDEX_APPLY_COST_TIME, INDEX_APPLY_MEMORY_USAGE, INDEX_PUFFIN_READ_BYTES_TOTAL, +}; use crate::sst::file::FileId; use crate::sst::index::INDEX_BLOB_TYPE; use crate::sst::location; @@ -45,6 +48,8 @@ impl SstIndexApplier { object_store: ObjectStore, index_applier: Arc, ) -> Self { + INDEX_APPLY_MEMORY_USAGE.add(index_applier.memory_usage() as i64); + Self { region_dir, object_store, @@ -62,6 +67,8 @@ impl SstIndexApplier { .reader(&file_path) .await .context(OpenDalSnafu)?; + let file_reader = InstrumentedAsyncRead::new(file_reader, &INDEX_PUFFIN_READ_BYTES_TOTAL); + let mut puffin_reader = PuffinFileReader::new(file_reader); let file_meta = puffin_reader.metadata().await.unwrap(); @@ -86,3 +93,9 @@ impl SstIndexApplier { Ok(res) } } + +impl Drop for SstIndexApplier { + fn drop(&mut self) { + INDEX_APPLY_MEMORY_USAGE.sub(self.index_applier.memory_usage() as i64); + } +} diff --git a/src/mito2/src/sst/index/creator.rs b/src/mito2/src/sst/index/creator.rs index 5f9c0042f370..998c635e61ae 100644 --- a/src/mito2/src/sst/index/creator.rs +++ b/src/mito2/src/sst/index/creator.rs @@ -31,7 +31,9 @@ use store_api::metadata::RegionMetadataRef; 
use tokio::io::duplex; use tokio_util::compat::{TokioAsyncReadCompatExt, TokioAsyncWriteCompatExt}; +use super::io_stats::InstrumentedAsyncWrite; use crate::error::{OpenDalSnafu, PushIndexValueSnafu, Result}; +use crate::metrics::INDEX_PUFFIN_WRITE_BYTES_TOTAL; use crate::read::Batch; use crate::sst::file::FileId; use crate::sst::index::codec::{IndexValueCodec, IndexValuesCodec}; @@ -154,12 +156,13 @@ impl SstIndexCreator { let mut guard = self.stats.record_finish(); let file_path = location::index_file_path(&self.region_dir, &self.sst_file_id); - let writer = self + let file_writer = self .object_store .writer(&file_path) .await .context(OpenDalSnafu)?; - let mut puffin_writer = PuffinFileWriter::new(writer); + let file_writer = InstrumentedAsyncWrite::new(file_writer, &INDEX_PUFFIN_WRITE_BYTES_TOTAL); + let mut puffin_writer = PuffinFileWriter::new(file_writer); let (tx, rx) = duplex(PIPE_BUFFER_SIZE_FOR_SENDING_BLOB); diff --git a/src/mito2/src/sst/index/creator/temp_provider.rs b/src/mito2/src/sst/index/creator/temp_provider.rs index 2148a19af9bd..434fd8d4120f 100644 --- a/src/mito2/src/sst/index/creator/temp_provider.rs +++ b/src/mito2/src/sst/index/creator/temp_provider.rs @@ -23,6 +23,8 @@ use object_store::ObjectStore; use snafu::ResultExt; use crate::error::{OpenDalSnafu, Result}; +use crate::metrics::{INDEX_INTERMEDIATE_READ_BYTES_TOTAL, INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL}; +use crate::sst::index::io_stats::{InstrumentedAsyncRead, InstrumentedAsyncWrite}; use crate::sst::location::IntermediateLocation; pub(crate) struct TempFileProvider { @@ -45,6 +47,7 @@ impl ExternalTempFileProvider for TempFileProvider { .context(OpenDalSnafu) .map_err(BoxedError::new) .context(index_error::ExternalSnafu)?; + let writer = InstrumentedAsyncWrite::new(writer, &INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL); Ok(Box::new(writer)) } @@ -75,6 +78,7 @@ impl ExternalTempFileProvider for TempFileProvider { .context(OpenDalSnafu) .map_err(BoxedError::new) .context(index_error::ExternalSnafu)?; + let reader = InstrumentedAsyncRead::new(reader, &INDEX_INTERMEDIATE_READ_BYTES_TOTAL); readers.push(Box::new(reader) as _); } diff --git a/src/mito2/src/sst/index/io_stats.rs b/src/mito2/src/sst/index/io_stats.rs new file mode 100644 index 000000000000..b63166caca70 --- /dev/null +++ b/src/mito2/src/sst/index/io_stats.rs @@ -0,0 +1,122 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
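The wrappers in this new file count bytes as they pass through `poll_read`/`poll_write` and flush the accumulated total to a Prometheus counter when dropped. A small usage sketch, valid only within the crate since the constructors are `pub(crate)`; the counter below is a demo stand-in for the statics in `crate::metrics` such as `INDEX_PUFFIN_READ_BYTES_TOTAL`:

use futures::io::{AsyncReadExt, Cursor};
use lazy_static::lazy_static;
use prometheus::{register_int_counter, IntCounter};

lazy_static! {
    // Demo counter; real call sites pass the statics from `crate::metrics`.
    static ref DEMO_READ_BYTES: IntCounter =
        register_int_counter!("demo_read_bytes_total", "demo read bytes").unwrap();
}

fn main() -> std::io::Result<()> {
    futures::executor::block_on(async {
        let reader = Cursor::new(b"hello".to_vec());
        let mut reader = InstrumentedAsyncRead::new(reader, &DEMO_READ_BYTES);

        let mut buf = Vec::new();
        reader.read_to_end(&mut buf).await?;

        // The accumulated count is flushed by `BytesRecorder::drop`.
        drop(reader);
        assert_eq!(DEMO_READ_BYTES.get(), 5);
        Ok(())
    })
}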
+ +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use futures::{AsyncRead, AsyncSeek, AsyncWrite}; +use pin_project::pin_project; +use prometheus::IntCounter; + +struct BytesRecorder { + bytes: usize, + recorder: &'static IntCounter, +} + +impl BytesRecorder { + fn new(recorder: &'static IntCounter) -> Self { + Self { bytes: 0, recorder } + } + + fn inc_by(&mut self, bytes: usize) { + self.bytes += bytes; + } +} + +impl Drop for BytesRecorder { + fn drop(&mut self) { + if self.bytes > 0 { + self.recorder.inc_by(self.bytes as _); + } + } +} + +#[pin_project] +pub(crate) struct InstrumentedAsyncRead { + #[pin] + inner: R, + recorder: BytesRecorder, +} + +impl InstrumentedAsyncRead { + pub(crate) fn new(inner: R, recorder: &'static IntCounter) -> Self { + Self { + inner, + recorder: BytesRecorder::new(recorder), + } + } +} + +impl AsyncRead for InstrumentedAsyncRead { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut [u8], + ) -> Poll> { + let poll = self.as_mut().project().inner.poll_read(cx, buf); + if let Poll::Ready(Ok(n)) = &poll { + self.recorder.inc_by(*n); + } + poll + } +} + +impl AsyncSeek for InstrumentedAsyncRead { + fn poll_seek( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + pos: io::SeekFrom, + ) -> Poll> { + self.project().inner.poll_seek(cx, pos) + } +} + +#[pin_project] +pub(crate) struct InstrumentedAsyncWrite { + #[pin] + inner: W, + recorder: BytesRecorder, +} + +impl InstrumentedAsyncWrite { + pub(crate) fn new(inner: W, recorder: &'static IntCounter) -> Self { + Self { + inner, + recorder: BytesRecorder::new(recorder), + } + } +} + +impl AsyncWrite for InstrumentedAsyncWrite { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + let poll = self.as_mut().project().inner.poll_write(cx, buf); + if let Poll::Ready(Ok(n)) = &poll { + self.recorder.inc_by(*n); + } + poll + } + + fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_flush(cx) + } + + fn poll_close(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + self.project().inner.poll_close(cx) + } +} From c5b3d6998aae73e4b4522d526ecad47f3a3979a9 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Fri, 29 Dec 2023 09:54:23 +0000 Subject: [PATCH 10/27] add comments Signed-off-by: Zhenchi --- src/mito2/src/metrics.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/mito2/src/metrics.rs b/src/mito2/src/metrics.rs index 5f4038d93018..9c608999e716 100644 --- a/src/mito2/src/metrics.rs +++ b/src/mito2/src/metrics.rs @@ -186,9 +186,13 @@ lazy_static! { &[TYPE_LABEL, FILE_TYPE_LABEL] ) .unwrap(); + /// Counter of read bytes on intermediate files. pub static ref INDEX_INTERMEDIATE_READ_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL.with_label_values(&["read", "intermediate"]); + /// Counter of write bytes on intermediate files. pub static ref INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL.with_label_values(&["write", "intermediate"]); + /// Counter of read bytes on puffin files. pub static ref INDEX_PUFFIN_READ_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL.with_label_values(&["read", "puffin"]); + /// Counter of write bytes on puffin files. pub static ref INDEX_PUFFIN_WRITE_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL.with_label_values(&["write", "puffin"]); // ------- End of index metrics. 
} From 56d1621905cb9496520c426048756a0acee65478 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Fri, 29 Dec 2023 10:16:15 +0000 Subject: [PATCH 11/27] instrument object store to track its usage Signed-off-by: Zhenchi --- src/mito2/src/sst/index.rs | 2 +- src/mito2/src/sst/index/applier.rs | 19 ++--- src/mito2/src/sst/index/applier/builder.rs | 3 +- src/mito2/src/sst/index/creator.rs | 14 ++-- .../src/sst/index/creator/temp_provider.rs | 23 ++---- .../index/{io_stats.rs => object_store.rs} | 77 +++++++++++++++---- 6 files changed, 85 insertions(+), 53 deletions(-) rename src/mito2/src/sst/index/{io_stats.rs => object_store.rs} (67%) diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs index 5e3089f7bef1..a7e2159f2c83 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -15,7 +15,7 @@ pub mod applier; mod codec; pub mod creator; -mod io_stats; +mod object_store; const INDEX_BLOB_TYPE: &str = "greptime-inverted-index-v1"; diff --git a/src/mito2/src/sst/index/applier.rs b/src/mito2/src/sst/index/applier.rs index 191763425e4a..3cc04974defb 100644 --- a/src/mito2/src/sst/index/applier.rs +++ b/src/mito2/src/sst/index/applier.rs @@ -21,31 +21,29 @@ use index::inverted_index::format::reader::InvertedIndexBlobReader; use index::inverted_index::search::index_apply::{ IndexApplier, IndexNotFoundStrategy, SearchContext, }; -use object_store::ObjectStore; use puffin::file_format::reader::{PuffinAsyncReader, PuffinFileReader}; -use snafu::ResultExt; -use super::io_stats::InstrumentedAsyncRead; -use crate::error::{OpenDalSnafu, Result}; +use crate::error::Result; use crate::metrics::{ INDEX_APPLY_COST_TIME, INDEX_APPLY_MEMORY_USAGE, INDEX_PUFFIN_READ_BYTES_TOTAL, }; use crate::sst::file::FileId; +use crate::sst::index::object_store::InstrumentedObjectStore; use crate::sst::index::INDEX_BLOB_TYPE; use crate::sst::location; #[derive(Clone)] pub struct SstIndexApplier { region_dir: String, - object_store: ObjectStore, + object_store: InstrumentedObjectStore, index_applier: Arc, } impl SstIndexApplier { - pub fn new( + pub(crate) fn new( region_dir: String, - object_store: ObjectStore, + object_store: InstrumentedObjectStore, index_applier: Arc, ) -> Self { INDEX_APPLY_MEMORY_USAGE.add(index_applier.memory_usage() as i64); @@ -64,11 +62,8 @@ impl SstIndexApplier { let file_reader = self .object_store - .reader(&file_path) - .await - .context(OpenDalSnafu)?; - let file_reader = InstrumentedAsyncRead::new(file_reader, &INDEX_PUFFIN_READ_BYTES_TOTAL); - + .reader(&file_path, &INDEX_PUFFIN_READ_BYTES_TOTAL) + .await?; let mut puffin_reader = PuffinFileReader::new(file_reader); let file_meta = puffin_reader.metadata().await.unwrap(); diff --git a/src/mito2/src/sst/index/applier/builder.rs b/src/mito2/src/sst/index/applier/builder.rs index 959210d00ce0..1c435b46f38e 100644 --- a/src/mito2/src/sst/index/applier/builder.rs +++ b/src/mito2/src/sst/index/applier/builder.rs @@ -36,6 +36,7 @@ use crate::error::{BuildIndexApplierSnafu, ColumnNotFoundSnafu, Result}; use crate::row_converter::SortField; use crate::sst::index::applier::SstIndexApplier; use crate::sst::index::codec::IndexValueCodec; +use crate::sst::index::object_store::InstrumentedObjectStore; type ColumnName = String; @@ -73,7 +74,7 @@ impl<'a> SstIndexApplierBuilder<'a> { let applier = PredicatesIndexApplier::try_from(predicates); Ok(Some(SstIndexApplier::new( self.region_dir, - self.object_store, + InstrumentedObjectStore::new(self.object_store), Arc::new(applier.context(BuildIndexApplierSnafu)?), ))) } diff --git 
a/src/mito2/src/sst/index/creator.rs b/src/mito2/src/sst/index/creator.rs index 998c635e61ae..3fe0e09ec7ec 100644 --- a/src/mito2/src/sst/index/creator.rs +++ b/src/mito2/src/sst/index/creator.rs @@ -31,14 +31,14 @@ use store_api::metadata::RegionMetadataRef; use tokio::io::duplex; use tokio_util::compat::{TokioAsyncReadCompatExt, TokioAsyncWriteCompatExt}; -use super::io_stats::InstrumentedAsyncWrite; -use crate::error::{OpenDalSnafu, PushIndexValueSnafu, Result}; +use crate::error::{PushIndexValueSnafu, Result}; use crate::metrics::INDEX_PUFFIN_WRITE_BYTES_TOTAL; use crate::read::Batch; use crate::sst::file::FileId; use crate::sst::index::codec::{IndexValueCodec, IndexValuesCodec}; use crate::sst::index::creator::statistics::Statistics; use crate::sst::index::creator::temp_provider::TempFileProvider; +use crate::sst::index::object_store::InstrumentedObjectStore; use crate::sst::index::{ INDEX_BLOB_TYPE, MIN_MEMORY_USAGE_THRESHOLD, PIPE_BUFFER_SIZE_FOR_SENDING_BLOB, }; @@ -50,7 +50,7 @@ type RowCount = usize; pub struct SstIndexCreator { region_dir: String, sst_file_id: FileId, - object_store: ObjectStore, + object_store: InstrumentedObjectStore, codec: IndexValuesCodec, index_creator: Box, @@ -70,6 +70,8 @@ impl SstIndexCreator { memory_usage_threshold: Option, row_group_size: NonZeroUsize, ) -> Self { + let object_store = InstrumentedObjectStore::new(object_store); + let temp_file_provider = Arc::new(TempFileProvider::new( IntermediateLocation::new(®ion_dir, &sst_file_id), object_store.clone(), @@ -158,10 +160,8 @@ impl SstIndexCreator { let file_path = location::index_file_path(&self.region_dir, &self.sst_file_id); let file_writer = self .object_store - .writer(&file_path) - .await - .context(OpenDalSnafu)?; - let file_writer = InstrumentedAsyncWrite::new(file_writer, &INDEX_PUFFIN_WRITE_BYTES_TOTAL); + .writer(&file_path, &INDEX_PUFFIN_WRITE_BYTES_TOTAL) + .await?; let mut puffin_writer = PuffinFileWriter::new(file_writer); let (tx, rx) = duplex(PIPE_BUFFER_SIZE_FOR_SENDING_BLOB); diff --git a/src/mito2/src/sst/index/creator/temp_provider.rs b/src/mito2/src/sst/index/creator/temp_provider.rs index 434fd8d4120f..b2b4d967a7df 100644 --- a/src/mito2/src/sst/index/creator/temp_provider.rs +++ b/src/mito2/src/sst/index/creator/temp_provider.rs @@ -19,17 +19,16 @@ use futures::{AsyncRead, AsyncWrite}; use index::inverted_index::create::sort::external_provider::ExternalTempFileProvider; use index::inverted_index::error as index_error; use index::inverted_index::error::Result as IndexResult; -use object_store::ObjectStore; use snafu::ResultExt; -use crate::error::{OpenDalSnafu, Result}; +use crate::error::Result; use crate::metrics::{INDEX_INTERMEDIATE_READ_BYTES_TOTAL, INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL}; -use crate::sst::index::io_stats::{InstrumentedAsyncRead, InstrumentedAsyncWrite}; +use crate::sst::index::object_store::InstrumentedObjectStore; use crate::sst::location::IntermediateLocation; pub(crate) struct TempFileProvider { location: IntermediateLocation, - object_store: ObjectStore, + object_store: InstrumentedObjectStore, } #[async_trait] @@ -42,12 +41,10 @@ impl ExternalTempFileProvider for TempFileProvider { let path = self.location.file_path(column_name, file_id); let writer = self .object_store - .writer(&path) + .writer(&path, &INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL) .await - .context(OpenDalSnafu) .map_err(BoxedError::new) .context(index_error::ExternalSnafu)?; - let writer = InstrumentedAsyncWrite::new(writer, &INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL); Ok(Box::new(writer)) } @@ 
-60,7 +57,6 @@ impl ExternalTempFileProvider for TempFileProvider { .object_store .list(&dir) .await - .context(OpenDalSnafu) .map_err(BoxedError::new) .context(index_error::ExternalSnafu)?; let mut readers = Vec::with_capacity(entries.len()); @@ -73,12 +69,10 @@ impl ExternalTempFileProvider for TempFileProvider { let reader = self .object_store - .reader(entry.path()) + .reader(entry.path(), &INDEX_INTERMEDIATE_READ_BYTES_TOTAL) .await - .context(OpenDalSnafu) .map_err(BoxedError::new) .context(index_error::ExternalSnafu)?; - let reader = InstrumentedAsyncRead::new(reader, &INDEX_INTERMEDIATE_READ_BYTES_TOTAL); readers.push(Box::new(reader) as _); } @@ -87,7 +81,7 @@ impl ExternalTempFileProvider for TempFileProvider { } impl TempFileProvider { - pub fn new(location: IntermediateLocation, object_store: ObjectStore) -> Self { + pub fn new(location: IntermediateLocation, object_store: InstrumentedObjectStore) -> Self { Self { location, object_store, @@ -95,9 +89,6 @@ impl TempFileProvider { } pub async fn cleanup(&self) -> Result<()> { - self.object_store - .remove_all(self.location.root_dir()) - .await - .context(OpenDalSnafu) + self.object_store.remove_all(self.location.root_dir()).await } } diff --git a/src/mito2/src/sst/index/io_stats.rs b/src/mito2/src/sst/index/object_store.rs similarity index 67% rename from src/mito2/src/sst/index/io_stats.rs rename to src/mito2/src/sst/index/object_store.rs index b63166caca70..2316b51a20f0 100644 --- a/src/mito2/src/sst/index/io_stats.rs +++ b/src/mito2/src/sst/index/object_store.rs @@ -17,29 +17,51 @@ use std::pin::Pin; use std::task::{Context, Poll}; use futures::{AsyncRead, AsyncSeek, AsyncWrite}; +use object_store::ObjectStore; use pin_project::pin_project; use prometheus::IntCounter; +use snafu::ResultExt; -struct BytesRecorder { - bytes: usize, - recorder: &'static IntCounter, +use crate::error::{OpenDalSnafu, Result}; + +#[derive(Clone)] +pub(crate) struct InstrumentedObjectStore { + object_store: ObjectStore, } -impl BytesRecorder { - fn new(recorder: &'static IntCounter) -> Self { - Self { bytes: 0, recorder } +impl InstrumentedObjectStore { + pub(crate) fn new(object_store: ObjectStore) -> Self { + Self { object_store } } - fn inc_by(&mut self, bytes: usize) { - self.bytes += bytes; + pub(crate) async fn reader( + &self, + path: &str, + recoder: &'static IntCounter, + ) -> Result> { + let reader = self.object_store.reader(path).await.context(OpenDalSnafu)?; + Ok(InstrumentedAsyncRead::new(reader, recoder)) } -} -impl Drop for BytesRecorder { - fn drop(&mut self) { - if self.bytes > 0 { - self.recorder.inc_by(self.bytes as _); - } + pub(crate) async fn writer( + &self, + path: &str, + recoder: &'static IntCounter, + ) -> Result> { + let writer = self.object_store.writer(path).await.context(OpenDalSnafu)?; + Ok(InstrumentedAsyncWrite::new(writer, recoder)) + } + + pub(crate) async fn list(&self, path: &str) -> Result> { + let list = self.object_store.list(path).await.context(OpenDalSnafu)?; + Ok(list) + } + + pub(crate) async fn remove_all(&self, path: &str) -> Result<()> { + self.object_store + .remove_all(path) + .await + .context(OpenDalSnafu) } } @@ -51,7 +73,7 @@ pub(crate) struct InstrumentedAsyncRead { } impl InstrumentedAsyncRead { - pub(crate) fn new(inner: R, recorder: &'static IntCounter) -> Self { + fn new(inner: R, recorder: &'static IntCounter) -> Self { Self { inner, recorder: BytesRecorder::new(recorder), @@ -91,7 +113,7 @@ pub(crate) struct InstrumentedAsyncWrite { } impl InstrumentedAsyncWrite { - pub(crate) fn new(inner: 
W, recorder: &'static IntCounter) -> Self { + fn new(inner: W, recorder: &'static IntCounter) -> Self { Self { inner, recorder: BytesRecorder::new(recorder), @@ -120,3 +142,26 @@ impl AsyncWrite for InstrumentedAsyncWrite { self.project().inner.poll_close(cx) } } + +struct BytesRecorder { + bytes: usize, + recorder: &'static IntCounter, +} + +impl BytesRecorder { + fn new(recorder: &'static IntCounter) -> Self { + Self { bytes: 0, recorder } + } + + fn inc_by(&mut self, bytes: usize) { + self.bytes += bytes; + } +} + +impl Drop for BytesRecorder { + fn drop(&mut self) { + if self.bytes > 0 { + self.recorder.inc_by(self.bytes as _); + } + } +} From 2f9319af33c6348c0cdc5ad7e31f8b294b44ebbb Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Fri, 29 Dec 2023 10:21:10 +0000 Subject: [PATCH 12/27] store is nice Signed-off-by: Zhenchi --- src/mito2/src/sst/index.rs | 2 +- src/mito2/src/sst/index/applier.rs | 10 +++++----- src/mito2/src/sst/index/applier/builder.rs | 4 ++-- src/mito2/src/sst/index/creator.rs | 12 ++++++------ .../src/sst/index/creator/temp_provider.rs | 19 ++++++++----------- .../sst/index/{object_store.rs => store.rs} | 14 +++++++------- 6 files changed, 29 insertions(+), 32 deletions(-) rename src/mito2/src/sst/index/{object_store.rs => store.rs} (92%) diff --git a/src/mito2/src/sst/index.rs b/src/mito2/src/sst/index.rs index a7e2159f2c83..1a5e3288bd0d 100644 --- a/src/mito2/src/sst/index.rs +++ b/src/mito2/src/sst/index.rs @@ -15,7 +15,7 @@ pub mod applier; mod codec; pub mod creator; -mod object_store; +mod store; const INDEX_BLOB_TYPE: &str = "greptime-inverted-index-v1"; diff --git a/src/mito2/src/sst/index/applier.rs b/src/mito2/src/sst/index/applier.rs index 3cc04974defb..58bab7152fd8 100644 --- a/src/mito2/src/sst/index/applier.rs +++ b/src/mito2/src/sst/index/applier.rs @@ -28,14 +28,14 @@ use crate::metrics::{ INDEX_APPLY_COST_TIME, INDEX_APPLY_MEMORY_USAGE, INDEX_PUFFIN_READ_BYTES_TOTAL, }; use crate::sst::file::FileId; -use crate::sst::index::object_store::InstrumentedObjectStore; +use crate::sst::index::store::InstrumentedStore; use crate::sst::index::INDEX_BLOB_TYPE; use crate::sst::location; #[derive(Clone)] pub struct SstIndexApplier { region_dir: String, - object_store: InstrumentedObjectStore, + store: InstrumentedStore, index_applier: Arc, } @@ -43,14 +43,14 @@ pub struct SstIndexApplier { impl SstIndexApplier { pub(crate) fn new( region_dir: String, - object_store: InstrumentedObjectStore, + store: InstrumentedStore, index_applier: Arc, ) -> Self { INDEX_APPLY_MEMORY_USAGE.add(index_applier.memory_usage() as i64); Self { region_dir, - object_store, + store, index_applier, } } @@ -61,7 +61,7 @@ impl SstIndexApplier { let file_path = location::index_file_path(&self.region_dir, &file_id); let file_reader = self - .object_store + .store .reader(&file_path, &INDEX_PUFFIN_READ_BYTES_TOTAL) .await?; let mut puffin_reader = PuffinFileReader::new(file_reader); diff --git a/src/mito2/src/sst/index/applier/builder.rs b/src/mito2/src/sst/index/applier/builder.rs index 1c435b46f38e..7868f7f72cba 100644 --- a/src/mito2/src/sst/index/applier/builder.rs +++ b/src/mito2/src/sst/index/applier/builder.rs @@ -36,7 +36,7 @@ use crate::error::{BuildIndexApplierSnafu, ColumnNotFoundSnafu, Result}; use crate::row_converter::SortField; use crate::sst::index::applier::SstIndexApplier; use crate::sst::index::codec::IndexValueCodec; -use crate::sst::index::object_store::InstrumentedObjectStore; +use crate::sst::index::store::InstrumentedStore; type ColumnName = String; @@ -74,7 +74,7 @@ 
impl<'a> SstIndexApplierBuilder<'a> { let applier = PredicatesIndexApplier::try_from(predicates); Ok(Some(SstIndexApplier::new( self.region_dir, - InstrumentedObjectStore::new(self.object_store), + InstrumentedStore::new(self.object_store), Arc::new(applier.context(BuildIndexApplierSnafu)?), ))) } diff --git a/src/mito2/src/sst/index/creator.rs b/src/mito2/src/sst/index/creator.rs index 3fe0e09ec7ec..6f5a911716f3 100644 --- a/src/mito2/src/sst/index/creator.rs +++ b/src/mito2/src/sst/index/creator.rs @@ -38,7 +38,7 @@ use crate::sst::file::FileId; use crate::sst::index::codec::{IndexValueCodec, IndexValuesCodec}; use crate::sst::index::creator::statistics::Statistics; use crate::sst::index::creator::temp_provider::TempFileProvider; -use crate::sst::index::object_store::InstrumentedObjectStore; +use crate::sst::index::store::InstrumentedStore; use crate::sst::index::{ INDEX_BLOB_TYPE, MIN_MEMORY_USAGE_THRESHOLD, PIPE_BUFFER_SIZE_FOR_SENDING_BLOB, }; @@ -50,7 +50,7 @@ type RowCount = usize; pub struct SstIndexCreator { region_dir: String, sst_file_id: FileId, - object_store: InstrumentedObjectStore, + store: InstrumentedStore, codec: IndexValuesCodec, index_creator: Box, @@ -70,11 +70,11 @@ impl SstIndexCreator { memory_usage_threshold: Option, row_group_size: NonZeroUsize, ) -> Self { - let object_store = InstrumentedObjectStore::new(object_store); + let store = InstrumentedStore::new(object_store); let temp_file_provider = Arc::new(TempFileProvider::new( IntermediateLocation::new(®ion_dir, &sst_file_id), - object_store.clone(), + store.clone(), )); let memory_usage_threshold = memory_usage_threshold.map(|threshold| { (threshold / metadata.primary_key.len()).max(MIN_MEMORY_USAGE_THRESHOLD) @@ -87,7 +87,7 @@ impl SstIndexCreator { Self { region_dir, sst_file_id, - object_store, + store, codec, index_creator, temp_file_provider, @@ -159,7 +159,7 @@ impl SstIndexCreator { let file_path = location::index_file_path(&self.region_dir, &self.sst_file_id); let file_writer = self - .object_store + .store .writer(&file_path, &INDEX_PUFFIN_WRITE_BYTES_TOTAL) .await?; let mut puffin_writer = PuffinFileWriter::new(file_writer); diff --git a/src/mito2/src/sst/index/creator/temp_provider.rs b/src/mito2/src/sst/index/creator/temp_provider.rs index b2b4d967a7df..2206dcb34c70 100644 --- a/src/mito2/src/sst/index/creator/temp_provider.rs +++ b/src/mito2/src/sst/index/creator/temp_provider.rs @@ -23,12 +23,12 @@ use snafu::ResultExt; use crate::error::Result; use crate::metrics::{INDEX_INTERMEDIATE_READ_BYTES_TOTAL, INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL}; -use crate::sst::index::object_store::InstrumentedObjectStore; +use crate::sst::index::store::InstrumentedStore; use crate::sst::location::IntermediateLocation; pub(crate) struct TempFileProvider { location: IntermediateLocation, - object_store: InstrumentedObjectStore, + store: InstrumentedStore, } #[async_trait] @@ -40,7 +40,7 @@ impl ExternalTempFileProvider for TempFileProvider { ) -> IndexResult> { let path = self.location.file_path(column_name, file_id); let writer = self - .object_store + .store .writer(&path, &INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL) .await .map_err(BoxedError::new) @@ -54,7 +54,7 @@ impl ExternalTempFileProvider for TempFileProvider { ) -> IndexResult>> { let dir = self.location.column_dir(column_name); let entries = self - .object_store + .store .list(&dir) .await .map_err(BoxedError::new) @@ -68,7 +68,7 @@ impl ExternalTempFileProvider for TempFileProvider { } let reader = self - .object_store + .store .reader(entry.path(), 
&INDEX_INTERMEDIATE_READ_BYTES_TOTAL) .await .map_err(BoxedError::new) @@ -81,14 +81,11 @@ impl ExternalTempFileProvider for TempFileProvider { } impl TempFileProvider { - pub fn new(location: IntermediateLocation, object_store: InstrumentedObjectStore) -> Self { - Self { - location, - object_store, - } + pub fn new(location: IntermediateLocation, store: InstrumentedStore) -> Self { + Self { location, store } } pub async fn cleanup(&self) -> Result<()> { - self.object_store.remove_all(self.location.root_dir()).await + self.store.remove_all(self.location.root_dir()).await } } diff --git a/src/mito2/src/sst/index/object_store.rs b/src/mito2/src/sst/index/store.rs similarity index 92% rename from src/mito2/src/sst/index/object_store.rs rename to src/mito2/src/sst/index/store.rs index 2316b51a20f0..c4f7a5b1cf31 100644 --- a/src/mito2/src/sst/index/object_store.rs +++ b/src/mito2/src/sst/index/store.rs @@ -25,16 +25,16 @@ use snafu::ResultExt; use crate::error::{OpenDalSnafu, Result}; #[derive(Clone)] -pub(crate) struct InstrumentedObjectStore { +pub(crate) struct InstrumentedStore { object_store: ObjectStore, } -impl InstrumentedObjectStore { - pub(crate) fn new(object_store: ObjectStore) -> Self { +impl InstrumentedStore { + pub fn new(object_store: ObjectStore) -> Self { Self { object_store } } - pub(crate) async fn reader( + pub async fn reader( &self, path: &str, recoder: &'static IntCounter, @@ -43,7 +43,7 @@ impl InstrumentedObjectStore { Ok(InstrumentedAsyncRead::new(reader, recoder)) } - pub(crate) async fn writer( + pub async fn writer( &self, path: &str, recoder: &'static IntCounter, @@ -52,12 +52,12 @@ impl InstrumentedObjectStore { Ok(InstrumentedAsyncWrite::new(writer, recoder)) } - pub(crate) async fn list(&self, path: &str) -> Result> { + pub async fn list(&self, path: &str) -> Result> { let list = self.object_store.list(path).await.context(OpenDalSnafu)?; Ok(list) } - pub(crate) async fn remove_all(&self, path: &str) -> Result<()> { + pub async fn remove_all(&self, path: &str) -> Result<()> { self.object_store .remove_all(path) .await From 50d8a1be54b2ef69fc032fe5ae54e6adfafda8c7 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Fri, 29 Dec 2023 18:29:23 +0000 Subject: [PATCH 13/27] remove some unwraps Signed-off-by: Zhenchi --- src/mito2/src/error.rs | 73 ++++++++++++++++++++-- src/mito2/src/sst/index/applier.rs | 21 +++++-- src/mito2/src/sst/index/applier/builder.rs | 4 +- src/mito2/src/sst/index/creator.rs | 33 +++++----- src/mito2/src/sst/index/store.rs | 44 ++++++------- src/mito2/src/sst/parquet/writer.rs | 47 ++++++++++---- 6 files changed, 162 insertions(+), 60 deletions(-) diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index 108763a214ee..4999b0dbfaa5 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -423,14 +423,69 @@ pub enum Error { #[snafu(display("Failed to build index applier"))] BuildIndexApplier { #[snafu(source)] - error: index::inverted_index::error::Error, + source: index::inverted_index::error::Error, location: Location, }, #[snafu(display("Failed to push index value"))] PushIndexValue { #[snafu(source)] - error: index::inverted_index::error::Error, + source: index::inverted_index::error::Error, + location: Location, + }, + + #[snafu(display("Failed to apply index"))] + ApplyIndex { + #[snafu(source)] + source: index::inverted_index::error::Error, + location: Location, + }, + + #[snafu(display("Failed to write index completely"))] + IndexFinish { + #[snafu(source)] + source: index::inverted_index::error::Error, + location: 
Location, + }, + + #[snafu(display("Failed to read puffin metadata"))] + PuffinReadMetadata { + #[snafu(source)] + source: puffin::error::Error, + location: Location, + }, + + #[snafu(display("Failed to read puffin blob"))] + PuffinReadBlob { + #[snafu(source)] + source: puffin::error::Error, + location: Location, + }, + + #[snafu(display("Blob type not found, blob_type: {blob_type}"))] + PuffinBlobTypeNotFound { + blob_type: String, + location: Location, + }, + + #[snafu(display("Failed to write puffin completely"))] + PuffinFinish { + #[snafu(source)] + source: puffin::error::Error, + location: Location, + }, + + #[snafu(display("Failed to add blob to puffin file"))] + PuffinAddBlob { + #[snafu(source)] + source: puffin::error::Error, + location: Location, + }, + + #[snafu(display("Failed to convert value"))] + ConvertValue { + #[snafu(source)] + source: datatypes::error::Error, location: Location, }, @@ -477,6 +532,7 @@ impl ErrorExt for Error { | RegionCorrupted { .. } | CreateDefault { .. } | InvalidParquet { .. } + | PuffinBlobTypeNotFound { .. } | UnexpectedReplay { .. } => StatusCode::Unexpected, RegionNotFound { .. } => StatusCode::RegionNotFound, ObjectStoreNotFound { .. } @@ -486,15 +542,13 @@ impl ErrorExt for Error { | FillDefault { .. } | ConvertColumnDataType { .. } | ColumnNotFound { .. } - | BuildIndexApplier { .. } | InvalidMetadata { .. } => StatusCode::InvalidArguments, RegionMetadataNotFound { .. } | Join { .. } | WorkerStopped { .. } | Recv { .. } | EncodeWal { .. } - | DecodeWal { .. } - | PushIndexValue { .. } => StatusCode::Internal, + | DecodeWal { .. } => StatusCode::Internal, WriteBuffer { source, .. } => source.status_code(), WriteGroup { source, .. } => source.status_code(), FieldTypeMismatch { source, .. } => source.status_code(), @@ -524,6 +578,15 @@ impl ErrorExt for Error { JsonOptions { .. } => StatusCode::InvalidArguments, EmptyRegionDir { .. } | EmptyManifestDir { .. } => StatusCode::RegionNotFound, ArrowReader { .. } => StatusCode::StorageUnavailable, + ConvertValue { source, .. } => source.status_code(), + BuildIndexApplier { source, .. } + | PushIndexValue { source, .. } + | ApplyIndex { source, .. } + | IndexFinish { source, .. } => source.status_code(), + PuffinReadMetadata { source, .. } + | PuffinReadBlob { source, .. } + | PuffinFinish { source, .. } + | PuffinAddBlob { source, .. 
} => source.status_code(), } } diff --git a/src/mito2/src/sst/index/applier.rs b/src/mito2/src/sst/index/applier.rs index 58bab7152fd8..b83d6f130ea6 100644 --- a/src/mito2/src/sst/index/applier.rs +++ b/src/mito2/src/sst/index/applier.rs @@ -22,8 +22,12 @@ use index::inverted_index::search::index_apply::{ IndexApplier, IndexNotFoundStrategy, SearchContext, }; use puffin::file_format::reader::{PuffinAsyncReader, PuffinFileReader}; +use snafu::{OptionExt, ResultExt}; -use crate::error::Result; +use crate::error::{ + ApplyIndexSnafu, PuffinBlobTypeNotFoundSnafu, PuffinReadBlobSnafu, PuffinReadMetadataSnafu, + Result, +}; use crate::metrics::{ INDEX_APPLY_COST_TIME, INDEX_APPLY_MEMORY_USAGE, INDEX_PUFFIN_READ_BYTES_TOTAL, }; @@ -66,14 +70,21 @@ impl SstIndexApplier { .await?; let mut puffin_reader = PuffinFileReader::new(file_reader); - let file_meta = puffin_reader.metadata().await.unwrap(); + let file_meta = puffin_reader + .metadata() + .await + .context(PuffinReadMetadataSnafu)?; let blob_meta = file_meta .blobs .iter() .find(|blob| blob.blob_type == INDEX_BLOB_TYPE) - .unwrap(); + .context(PuffinBlobTypeNotFoundSnafu { + blob_type: INDEX_BLOB_TYPE, + })?; - let blob_reader = puffin_reader.blob_reader(blob_meta).unwrap(); + let blob_reader = puffin_reader + .blob_reader(blob_meta) + .context(PuffinReadBlobSnafu)?; let mut index_reader = InvertedIndexBlobReader::new(blob_reader); let context = SearchContext { @@ -83,7 +94,7 @@ impl SstIndexApplier { .index_applier .apply(context, &mut index_reader) .await - .unwrap(); + .context(ApplyIndexSnafu)?; Ok(res) } diff --git a/src/mito2/src/sst/index/applier/builder.rs b/src/mito2/src/sst/index/applier/builder.rs index 7868f7f72cba..45fefc529ee1 100644 --- a/src/mito2/src/sst/index/applier/builder.rs +++ b/src/mito2/src/sst/index/applier/builder.rs @@ -32,7 +32,7 @@ use object_store::ObjectStore; use snafu::{OptionExt, ResultExt}; use store_api::metadata::RegionMetadata; -use crate::error::{BuildIndexApplierSnafu, ColumnNotFoundSnafu, Result}; +use crate::error::{BuildIndexApplierSnafu, ColumnNotFoundSnafu, ConvertValueSnafu, Result}; use crate::row_converter::SortField; use crate::sst::index::applier::SstIndexApplier; use crate::sst::index::codec::IndexValueCodec; @@ -139,7 +139,7 @@ impl<'a> SstIndexApplierBuilder<'a> { } fn encode_lit(lit: &ScalarValue, data_type: ConcreteDataType) -> Result> { - let value = Value::try_from(lit.clone()).unwrap(); + let value = Value::try_from(lit.clone()).context(ConvertValueSnafu)?; let mut bytes = vec![]; let field = SortField::new(data_type); IndexValueCodec::encode_value(value.as_value_ref(), &field, &mut bytes)?; diff --git a/src/mito2/src/sst/index/creator.rs b/src/mito2/src/sst/index/creator.rs index 6f5a911716f3..080e092fc2ba 100644 --- a/src/mito2/src/sst/index/creator.rs +++ b/src/mito2/src/sst/index/creator.rs @@ -31,7 +31,9 @@ use store_api::metadata::RegionMetadataRef; use tokio::io::duplex; use tokio_util::compat::{TokioAsyncReadCompatExt, TokioAsyncWriteCompatExt}; -use crate::error::{PushIndexValueSnafu, Result}; +use crate::error::{ + IndexFinishSnafu, PuffinAddBlobSnafu, PuffinFinishSnafu, PushIndexValueSnafu, Result, +}; use crate::metrics::INDEX_PUFFIN_WRITE_BYTES_TOTAL; use crate::read::Batch; use crate::sst::file::FileId; @@ -104,9 +106,10 @@ impl SstIndexCreator { if let Err(err) = self.do_update(batch).await { // clean up garbage if failed to update if let Err(err) = self.do_cleanup().await { - let region_dir = &self.region_dir; - let sst_file_id = &self.sst_file_id; - warn!(err; "Failed 
to clean up index creator, region_dir: {region_dir}, sst_file_id: {sst_file_id}"); + warn!( + err; "Failed to clean up index creator, region_dir: {}, sst_file_id: {}", + self.region_dir, self.sst_file_id, + ); } return Err(err); } @@ -121,13 +124,13 @@ impl SstIndexCreator { } let finish_res = self.do_finish().await; - // clean up garbage no matter finish success or not - let cleanup_res = self.do_cleanup().await; - if let Err(err) = cleanup_res { - let region_dir = &self.region_dir; - let sst_file_id = &self.sst_file_id; - warn!(err; "Failed to clean up index creator, region_dir: {region_dir}, sst_file_id: {sst_file_id}"); + // clean up garbage no matter finish success or not + if let Err(err) = self.do_cleanup().await { + warn!( + err; "Failed to clean up index creator, region_dir: {}, sst_file_id: {}", + self.region_dir, self.sst_file_id, + ); } finish_res.map(|_| (self.stats.row_count(), self.stats.byte_count())) @@ -144,6 +147,7 @@ impl SstIndexCreator { IndexValueCodec::encode_value(value.as_value_ref(), field, &mut self.value_buf)?; } + // null value -> None let v = value.is_some().then_some(self.value_buf.as_slice()); self.index_creator .push_with_name_n(column_name, v, n) @@ -173,21 +177,22 @@ impl SstIndexCreator { }; let mut index_writer = InvertedIndexBlobWriter::new(tx.compat_write()); - let (source, sink) = futures::join!( + let (index_finish, puffin_add_blob) = futures::join!( self.index_creator.finish(&mut index_writer), puffin_writer.add_blob(blob) ); - source.unwrap(); - sink.unwrap(); + index_finish.context(IndexFinishSnafu)?; + puffin_add_blob.context(PuffinAddBlobSnafu)?; - let byte_count = puffin_writer.finish().await.unwrap(); + let byte_count = puffin_writer.finish().await.context(PuffinFinishSnafu)?; guard.inc_byte_count(byte_count); Ok(()) } async fn do_cleanup(&mut self) -> Result<()> { let _guard = self.stats.record_cleanup(); + self.temp_file_provider.cleanup().await } } diff --git a/src/mito2/src/sst/index/store.rs b/src/mito2/src/sst/index/store.rs index c4f7a5b1cf31..6a584b0d1781 100644 --- a/src/mito2/src/sst/index/store.rs +++ b/src/mito2/src/sst/index/store.rs @@ -34,20 +34,20 @@ impl InstrumentedStore { Self { object_store } } - pub async fn reader( + pub async fn reader<'a>( &self, path: &str, - recoder: &'static IntCounter, - ) -> Result> { + recoder: &'a IntCounter, + ) -> Result> { let reader = self.object_store.reader(path).await.context(OpenDalSnafu)?; Ok(InstrumentedAsyncRead::new(reader, recoder)) } - pub async fn writer( + pub async fn writer<'a>( &self, path: &str, - recoder: &'static IntCounter, - ) -> Result> { + recoder: &'a IntCounter, + ) -> Result> { let writer = self.object_store.writer(path).await.context(OpenDalSnafu)?; Ok(InstrumentedAsyncWrite::new(writer, recoder)) } @@ -66,14 +66,14 @@ impl InstrumentedStore { } #[pin_project] -pub(crate) struct InstrumentedAsyncRead { +pub(crate) struct InstrumentedAsyncRead<'a, R> { #[pin] inner: R, - recorder: BytesRecorder, + recorder: BytesRecorder<'a>, } -impl InstrumentedAsyncRead { - fn new(inner: R, recorder: &'static IntCounter) -> Self { +impl<'a, R> InstrumentedAsyncRead<'a, R> { + fn new(inner: R, recorder: &'a IntCounter) -> Self { Self { inner, recorder: BytesRecorder::new(recorder), @@ -81,7 +81,7 @@ impl InstrumentedAsyncRead { } } -impl AsyncRead for InstrumentedAsyncRead { +impl<'a, R: AsyncRead + Unpin + Send> AsyncRead for InstrumentedAsyncRead<'a, R> { fn poll_read( mut self: Pin<&mut Self>, cx: &mut Context<'_>, @@ -95,7 +95,7 @@ impl AsyncRead for InstrumentedAsyncRead { } } 
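// A self-contained sketch of the instrumentation pattern the hunk above
// generalizes, assuming the `futures`, `pin-project` and `prometheus`
// crates; `CountingRead` and `LocalTally` are illustrative names, not part
// of the patch. The tally deliberately lives in a separate, un-pinned
// struct: `#[pin_project]` rejects a plain `Drop` impl on the projected
// type, and flushing once on drop touches the shared atomic counter once
// per reader rather than once per poll.

use std::io;
use std::pin::Pin;
use std::task::{Context, Poll};

use futures::io::AsyncRead;
use pin_project::pin_project;
use prometheus::IntCounter;

struct LocalTally<'a> {
    n: usize,
    counter: &'a IntCounter,
}

impl Drop for LocalTally<'_> {
    fn drop(&mut self) {
        if self.n > 0 {
            // One atomic add for the whole lifetime of the reader.
            self.counter.inc_by(self.n as u64);
        }
    }
}

#[pin_project]
struct CountingRead<'a, R> {
    #[pin]
    inner: R,
    tally: LocalTally<'a>,
}

impl<R: AsyncRead> AsyncRead for CountingRead<'_, R> {
    fn poll_read(
        self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &mut [u8],
    ) -> Poll<io::Result<usize>> {
        let this = self.project();
        let poll = this.inner.poll_read(cx, buf);
        if let Poll::Ready(Ok(n)) = &poll {
            this.tally.n += *n; // count only bytes actually read
        }
        poll
    }
}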
-impl AsyncSeek for InstrumentedAsyncRead { +impl<'a, R: AsyncSeek + Unpin + Send> AsyncSeek for InstrumentedAsyncRead<'a, R> { fn poll_seek( self: Pin<&mut Self>, cx: &mut Context<'_>, @@ -106,14 +106,14 @@ impl AsyncSeek for InstrumentedAsyncRead { } #[pin_project] -pub(crate) struct InstrumentedAsyncWrite { +pub(crate) struct InstrumentedAsyncWrite<'a, W> { #[pin] inner: W, - recorder: BytesRecorder, + recorder: BytesRecorder<'a>, } -impl InstrumentedAsyncWrite { - fn new(inner: W, recorder: &'static IntCounter) -> Self { +impl<'a, W> InstrumentedAsyncWrite<'a, W> { + fn new(inner: W, recorder: &'a IntCounter) -> Self { Self { inner, recorder: BytesRecorder::new(recorder), @@ -121,7 +121,7 @@ impl InstrumentedAsyncWrite { } } -impl AsyncWrite for InstrumentedAsyncWrite { +impl<'a, W: AsyncWrite + Unpin + Send> AsyncWrite for InstrumentedAsyncWrite<'a, W> { fn poll_write( mut self: Pin<&mut Self>, cx: &mut Context<'_>, @@ -143,13 +143,13 @@ impl AsyncWrite for InstrumentedAsyncWrite { } } -struct BytesRecorder { +struct BytesRecorder<'a> { bytes: usize, - recorder: &'static IntCounter, + recorder: &'a IntCounter, } -impl BytesRecorder { - fn new(recorder: &'static IntCounter) -> Self { +impl<'a> BytesRecorder<'a> { + fn new(recorder: &'a IntCounter) -> Self { Self { bytes: 0, recorder } } @@ -158,7 +158,7 @@ impl BytesRecorder { } } -impl Drop for BytesRecorder { +impl<'a> Drop for BytesRecorder<'a> { fn drop(&mut self) { if self.bytes > 0 { self.recorder.inc_by(self.bytes as _); diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs index de8b6e07c300..e5d20f23cc0f 100644 --- a/src/mito2/src/sst/parquet/writer.rs +++ b/src/mito2/src/sst/parquet/writer.rs @@ -98,16 +98,34 @@ impl ParquetWriter { .context(WriteBufferSnafu)?; let mut stats = SourceStats::default(); - let mut index_creator = (!self.metadata.primary_key.is_empty()).then(|| { - SstIndexCreator::new( + + let mut index_creator = match ( + self.metadata.primary_key.is_empty(), + NonZeroUsize::new(opts.row_group_size), + ) { + (no_tags, _) if no_tags => { + debug!( + "No tag columns, skip creating index, region_id: {}, file_id: {}", + self.metadata.region_id, self.file_id, + ); + None + } + (_, None) => { + warn!( + "Row group size is 0, skip creating index, region_id: {}, file_id: {}", + self.metadata.region_id, self.file_id, + ); + None + } + (_, Some(row_group_size)) => Some(SstIndexCreator::new( self.region_dir.clone(), self.file_id, &self.metadata, self.object_store.clone(), Some(4 * 1024 * 1024), - NonZeroUsize::new(opts.row_group_size).unwrap(), - ) - }); + row_group_size, + )), + }; while let Some(batch) = self.source.next_batch().await? { stats.update(&batch); @@ -120,9 +138,10 @@ impl ParquetWriter { if let Some(creator) = index_creator.as_mut() { if let Err(err) = creator.update(&batch).await { - let region_id = &self.metadata.region_id; - let file_id = &self.file_id; - warn!("Failed to update index, error: {err}, region_id: {region_id}, file_id: {file_id}"); + warn!( + err; "Failed to update index, skip creating index, region_id: {}, file_id: {}", + self.metadata.region_id, self.file_id, + ); // Skip index creation if error occurs. 
index_creator = None; @@ -131,14 +150,18 @@ impl ParquetWriter { } if let Some(mut creator) = index_creator { - let region_id = &self.metadata.region_id; - let file_id = &self.file_id; match creator.finish().await { Ok((row_count, byte_count)) => { - debug!("Create index successfully, region_id: {region_id}, file_id: {file_id}, bytes: {byte_count}, rows: {row_count}"); + debug!( + "Create index successfully, region_id: {}, file_id: {}, bytes: {byte_count}, rows: {row_count}", + self.metadata.region_id, self.file_id, + ); } Err(err) => { - warn!("Failed to create index, error: {err}, region_id: {region_id}, file_id: {file_id}"); + warn!( + err; "Failed to create index, region_id: {}, file_id: {}", + self.metadata.region_id, self.file_id, + ); return Ok(None); } } From dc2b4332025463de11b8e9392e88928cfd1f38d6 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Fri, 29 Dec 2023 18:30:30 +0000 Subject: [PATCH 14/27] typos Signed-off-by: Zhenchi --- src/mito2/src/sst/index/store.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mito2/src/sst/index/store.rs b/src/mito2/src/sst/index/store.rs index 6a584b0d1781..abacf8c27238 100644 --- a/src/mito2/src/sst/index/store.rs +++ b/src/mito2/src/sst/index/store.rs @@ -37,19 +37,19 @@ impl InstrumentedStore { pub async fn reader<'a>( &self, path: &str, - recoder: &'a IntCounter, + recorder: &'a IntCounter, ) -> Result> { let reader = self.object_store.reader(path).await.context(OpenDalSnafu)?; - Ok(InstrumentedAsyncRead::new(reader, recoder)) + Ok(InstrumentedAsyncRead::new(reader, recorder)) } pub async fn writer<'a>( &self, path: &str, - recoder: &'a IntCounter, + recorder: &'a IntCounter, ) -> Result> { let writer = self.object_store.writer(path).await.context(OpenDalSnafu)?; - Ok(InstrumentedAsyncWrite::new(writer, recoder)) + Ok(InstrumentedAsyncWrite::new(writer, recorder)) } pub async fn list(&self, path: &str) -> Result> { From 9c980340f2f4ed7c56cc26d4996c35fab95f615b Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Fri, 29 Dec 2023 18:37:48 +0000 Subject: [PATCH 15/27] fix: remove unwrap Signed-off-by: Zhenchi --- src/mito2/src/error.rs | 8 ++++++++ src/mito2/src/sst/index/applier/builder.rs | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index a54956442596..610955c3aa0f 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -433,6 +433,13 @@ pub enum Error { error: index::inverted_index::error::Error, location: Location, }, + + #[snafu(display("Failed to convert value"))] + ConvertValue { + #[snafu(source)] + source: datatypes::error::Error, + location: Location, + }, } pub type Result = std::result::Result; @@ -516,6 +523,7 @@ impl ErrorExt for Error { JsonOptions { .. } => StatusCode::InvalidArguments, EmptyRegionDir { .. } | EmptyManifestDir { .. } => StatusCode::RegionNotFound, ArrowReader { .. } => StatusCode::StorageUnavailable, + ConvertValue { source, .. 
} => source.status_code(), } } diff --git a/src/mito2/src/sst/index/applier/builder.rs b/src/mito2/src/sst/index/applier/builder.rs index 95c812017cdb..52af22effb18 100644 --- a/src/mito2/src/sst/index/applier/builder.rs +++ b/src/mito2/src/sst/index/applier/builder.rs @@ -36,7 +36,7 @@ use object_store::ObjectStore; use snafu::{OptionExt, ResultExt}; use store_api::metadata::RegionMetadata; -use crate::error::{BuildIndexApplierSnafu, ColumnNotFoundSnafu, Result}; +use crate::error::{BuildIndexApplierSnafu, ColumnNotFoundSnafu, ConvertValueSnafu, Result}; use crate::row_converter::SortField; use crate::sst::index::applier::SstIndexApplier; use crate::sst::index::codec::IndexValueCodec; @@ -168,7 +168,7 @@ impl<'a> SstIndexApplierBuilder<'a> { /// Helper function to encode a literal into bytes. fn encode_lit(lit: &ScalarValue, data_type: ConcreteDataType) -> Result> { - let value = Value::try_from(lit.clone()).unwrap(); + let value = Value::try_from(lit.clone()).context(ConvertValueSnafu)?; let mut bytes = vec![]; let field = SortField::new(data_type); IndexValueCodec::encode_value(value.as_value_ref(), &field, &mut bytes)?; From d5c38b34a5ebb4f3fe0bf27230c76132da56e70f Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Sat, 30 Dec 2023 04:15:13 +0000 Subject: [PATCH 16/27] fix: toml format Signed-off-by: Zhenchi --- src/mito2/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mito2/Cargo.toml b/src/mito2/Cargo.toml index b977dc40ec3e..8e59c191eadd 100644 --- a/src/mito2/Cargo.toml +++ b/src/mito2/Cargo.toml @@ -29,7 +29,6 @@ common-procedure.workspace = true common-query.workspace = true common-recordbatch.workspace = true common-runtime.workspace = true -pin-project.workspace = true common-telemetry.workspace = true common-test-util = { workspace = true, optional = true } common-time.workspace = true @@ -49,6 +48,7 @@ num_cpus = "1.13" object-store.workspace = true parquet = { workspace = true, features = ["async"] } paste.workspace = true +pin-project.workspace = true prometheus.workspace = true prost.workspace = true puffin.workspace = true From 983c09c40e502403ac8a638f16cbf143363b7f62 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Sat, 30 Dec 2023 06:35:54 +0000 Subject: [PATCH 17/27] feat: add filter metrics Signed-off-by: Zhenchi --- src/mito2/src/metrics.rs | 3 +++ src/mito2/src/sst/parquet/reader.rs | 42 ++++++++++++++++++++--------- 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/src/mito2/src/metrics.rs b/src/mito2/src/metrics.rs index 9c608999e716..a08d191a3d5d 100644 --- a/src/mito2/src/metrics.rs +++ b/src/mito2/src/metrics.rs @@ -121,6 +121,9 @@ lazy_static! { /// Counter of filtered rows during merge. pub static ref MERGE_FILTER_ROWS_TOTAL: IntCounterVec = register_int_counter_vec!("mito_merge_filter_rows_total", "mito merge filter rows total", &[TYPE_LABEL]).unwrap(); + /// Counter of row groups read. + pub static ref READ_ROW_GROUPS_TOTAL: IntCounterVec = + register_int_counter_vec!("mito_read_row_groups_total", "mito read row groups total", &[TYPE_LABEL]).unwrap(); // ------- End of query metrics. // Cache related metrics. 
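// A hedged sketch of how a labeled counter like READ_ROW_GROUPS_TOTAL above
// is declared and consumed with the `prometheus` and `lazy_static` crates;
// the names below are illustrative stand-ins. Resolving a label value once
// into a plain IntCounter (as the index metrics in this series do) avoids
// the label-lookup cost on every increment.

use lazy_static::lazy_static;
use prometheus::{register_int_counter_vec, IntCounter, IntCounterVec};

lazy_static! {
    static ref ROW_GROUPS_TOTAL: IntCounterVec = register_int_counter_vec!(
        "example_read_row_groups_total",
        "row groups surviving each filter stage",
        &["type"]
    )
    .unwrap();
    // Pre-resolved per-label handles, one per filter stage.
    static ref ROW_GROUPS_UNFILTERED: IntCounter =
        ROW_GROUPS_TOTAL.with_label_values(&["unfiltered"]);
    static ref ROW_GROUPS_MIN_MAX: IntCounter =
        ROW_GROUPS_TOTAL.with_label_values(&["min_max_filtered"]);
}

fn record(unfiltered: usize, after_min_max: usize) {
    ROW_GROUPS_UNFILTERED.inc_by(unfiltered as u64);
    ROW_GROUPS_MIN_MAX.inc_by(after_min_max as u64);
}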
diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs index 61a405d74a3e..5cac12217e7c 100644 --- a/src/mito2/src/sst/parquet/reader.rs +++ b/src/mito2/src/sst/parquet/reader.rs @@ -39,7 +39,7 @@ use crate::error::{ ArrowReaderSnafu, InvalidMetadataSnafu, InvalidParquetSnafu, OpenDalSnafu, ReadParquetSnafu, Result, }; -use crate::metrics::{READ_ROWS_TOTAL, READ_STAGE_ELAPSED}; +use crate::metrics::{READ_ROWS_TOTAL, READ_ROW_GROUPS_TOTAL, READ_STAGE_ELAPSED}; use crate::read::{Batch, BatchReader}; use crate::sst::file::FileHandle; use crate::sst::index::applier::SstIndexApplierRef; @@ -156,8 +156,12 @@ impl ParquetReaderBuilder { parquet_to_arrow_field_levels(parquet_schema_desc, projection_mask.clone(), hint) .context(ReadParquetSnafu { path: &file_path })?; + let mut metrics = Metrics::default(); + // Computes row groups to read. - let row_groups = self.row_groups_to_read(&read_format, &parquet_meta).await; + let row_groups = self + .row_groups_to_read(&read_format, &parquet_meta, &mut metrics) + .await; let reader_builder = RowGroupReaderBuilder { file_handle: self.file_handle.clone(), @@ -169,12 +173,7 @@ impl ParquetReaderBuilder { cache_manager: self.cache_manager.clone(), }; - let metrics = Metrics { - read_row_groups: row_groups.len(), - build_cost: start.elapsed(), - ..Default::default() - }; - + metrics.build_cost = start.elapsed(); Ok(ParquetReader { row_groups, read_format, @@ -247,8 +246,10 @@ impl ParquetReaderBuilder { &self, read_format: &ReadFormat, parquet_meta: &ParquetMetaData, + metrics: &mut Metrics, ) -> BTreeSet { - let mut row_group_ids = (0..parquet_meta.num_row_groups()).collect(); + let mut row_group_ids: BTreeSet<_> = (0..parquet_meta.num_row_groups()).collect(); + metrics.num_unfiltered_row_groups += row_group_ids.len(); // Applies index to prune row groups. if let Some(index_applier) = &self.index_applier { @@ -262,6 +263,7 @@ impl ParquetReaderBuilder { } } } + metrics.num_inverted_index_filtered_row_groups += row_group_ids.len(); // Prunes row groups by metadata. if let Some(predicate) = &self.predicate { @@ -287,6 +289,7 @@ impl ParquetReaderBuilder { row_group_ids.remove(&row_group_id); } }; + metrics.num_min_max_filtered_row_groups += row_group_ids.len(); row_group_ids } @@ -295,8 +298,12 @@ impl ParquetReaderBuilder { /// Parquet reader metrics. #[derive(Debug, Default)] struct Metrics { - /// Number of row groups to read. - read_row_groups: usize, + /// Number of unfiltered row groups. + num_unfiltered_row_groups: usize, + /// Number of row groups to read after filtering by inverted index. + num_inverted_index_filtered_row_groups: usize, + /// Number of row groups to read after filtering by min-max index. + num_min_max_filtered_row_groups: usize, /// Duration to build the parquet reader. build_cost: Duration, /// Duration to scan the reader. 
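// The three counters in `Metrics` above are cumulative "survivor" counts,
// recorded after each pruning stage. A minimal sketch of the intended
// reading (field names mirror the patch; the arithmetic is the point):

struct StageCounts {
    unfiltered: usize,
    after_inverted_index: usize,
    after_min_max: usize,
}

impl StageCounts {
    // Row groups the inverted index pruned on its own.
    fn pruned_by_inverted_index(&self) -> usize {
        self.unfiltered - self.after_inverted_index
    }

    // Row groups the min-max statistics pruned after the index ran.
    fn pruned_by_min_max(&self) -> usize {
        self.after_inverted_index - self.after_min_max
    }
}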
@@ -424,8 +431,8 @@ impl Drop for ParquetReader { self.reader_builder.file_handle.region_id(), self.reader_builder.file_handle.file_id(), self.reader_builder.file_handle.time_range(), - self.metrics.read_row_groups, - self.reader_builder.parquet_meta.num_row_groups(), + self.metrics.num_min_max_filtered_row_groups, + self.metrics.num_unfiltered_row_groups, self.metrics ); @@ -439,6 +446,15 @@ impl Drop for ParquetReader { READ_ROWS_TOTAL .with_label_values(&["parquet"]) .inc_by(self.metrics.num_rows as u64); + READ_ROW_GROUPS_TOTAL + .with_label_values(&["unfiltered"]) + .inc_by(self.metrics.num_unfiltered_row_groups as u64); + READ_ROW_GROUPS_TOTAL + .with_label_values(&["inverted_index_filtered"]) + .inc_by(self.metrics.num_inverted_index_filtered_row_groups as u64); + READ_ROW_GROUPS_TOTAL + .with_label_values(&["min_max_filtered"]) + .inc_by(self.metrics.num_min_max_filtered_row_groups as u64); } } From 3ed9b305ab2ad6c0c5f296034598be990f6fad28 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Tue, 2 Jan 2024 09:58:31 +0000 Subject: [PATCH 18/27] add abort Signed-off-by: Zhenchi --- src/mito2/src/compaction.rs | 7 +- src/mito2/src/compaction/output.rs | 21 ++- src/mito2/src/compaction/twcs.rs | 12 +- src/mito2/src/config.rs | 21 +++ src/mito2/src/error.rs | 4 + src/mito2/src/flush.rs | 13 +- src/mito2/src/sst/index/applier.rs | 1 + src/mito2/src/sst/index/creator.rs | 20 ++- src/mito2/src/sst/parquet.rs | 10 ++ src/mito2/src/sst/parquet/writer.rs | 215 +++++++++++++++++++--------- 10 files changed, 242 insertions(+), 82 deletions(-) diff --git a/src/mito2/src/compaction.rs b/src/mito2/src/compaction.rs index 94f04b17aa66..848849368018 100644 --- a/src/mito2/src/compaction.rs +++ b/src/mito2/src/compaction.rs @@ -22,7 +22,6 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::Instant; -use common_base::readable_size::ReadableSize; use common_telemetry::{debug, error}; pub use picker::CompactionPickerRef; use snafu::ResultExt; @@ -53,8 +52,8 @@ pub struct CompactionRequest { pub(crate) file_purger: FilePurgerRef, /// Start time of compaction task. pub(crate) start_time: Instant, - /// Buffering threshold while writing SST files. - pub(crate) sst_write_buffer_size: ReadableSize, + + pub(crate) engine_config: Arc, } impl CompactionRequest { @@ -324,7 +323,7 @@ impl CompactionStatus { waiters: Vec::new(), file_purger: self.file_purger.clone(), start_time, - sst_write_buffer_size: engine_config.sst_write_buffer_size, + engine_config, }; if let Some(pending) = self.pending_compaction.take() { diff --git a/src/mito2/src/compaction/output.rs b/src/mito2/src/compaction/output.rs index 6111e95c40e7..901f527cc1e4 100644 --- a/src/mito2/src/compaction/output.rs +++ b/src/mito2/src/compaction/output.rs @@ -12,17 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use common_base::readable_size::ReadableSize; +use std::sync::Arc; + use store_api::metadata::RegionMetadataRef; use store_api::storage::RegionId; use crate::access_layer::AccessLayerRef; +use crate::config::MitoConfig; use crate::error; use crate::read::projection::ProjectionMapper; use crate::read::seq_scan::SeqScan; use crate::read::{BoxedBatchReader, Source}; use crate::sst::file::{FileHandle, FileId, FileMeta, Level}; -use crate::sst::parquet::{SstInfo, WriteOptions}; +use crate::sst::parquet::{InvertedIndexCreateOptions, SstInfo, WriteOptions}; #[derive(Debug)] pub(crate) struct CompactionOutput { @@ -39,12 +41,23 @@ impl CompactionOutput { region_id: RegionId, schema: RegionMetadataRef, sst_layer: AccessLayerRef, - sst_write_buffer_size: ReadableSize, + engine_config: Arc, ) -> error::Result> { let reader = build_sst_reader(schema.clone(), sst_layer.clone(), &self.inputs).await?; + let inverted_index_config = &engine_config.inverted_index; + let inverted_index_options = + (!inverted_index_config.disable_creation_on_compact).then(|| { + InvertedIndexCreateOptions { + memory_usage_threshold: inverted_index_config + .creation_memory_usage_threshold + .map(|size| size.as_bytes() as _), + } + }); + let opts = WriteOptions { - write_buffer_size: sst_write_buffer_size, + write_buffer_size: engine_config.sst_write_buffer_size, + inverted_index_options, ..Default::default() }; diff --git a/src/mito2/src/compaction/twcs.rs b/src/mito2/src/compaction/twcs.rs index 6b853cc98313..e19b11485fb5 100644 --- a/src/mito2/src/compaction/twcs.rs +++ b/src/mito2/src/compaction/twcs.rs @@ -17,7 +17,6 @@ use std::fmt::{Debug, Formatter}; use std::sync::Arc; use std::time::{Duration, Instant}; -use common_base::readable_size::ReadableSize; use common_telemetry::{debug, error, info}; use common_time::timestamp::TimeUnit; use common_time::timestamp_millis::BucketAligned; @@ -31,6 +30,7 @@ use crate::access_layer::AccessLayerRef; use crate::compaction::output::CompactionOutput; use crate::compaction::picker::{CompactionTask, Picker}; use crate::compaction::CompactionRequest; +use crate::config::MitoConfig; use crate::error; use crate::error::CompactRegionSnafu; use crate::metrics::{COMPACTION_FAILURE_COUNT, COMPACTION_STAGE_ELAPSED}; @@ -125,7 +125,7 @@ impl Picker for TwcsPicker { waiters, file_purger, start_time, - sst_write_buffer_size, + engine_config, } = req; let region_metadata = current_version.metadata.clone(); @@ -173,12 +173,12 @@ impl Picker for TwcsPicker { sst_layer: access_layer, outputs, expired_ssts, - sst_write_buffer_size, compaction_time_window: Some(time_window_size), request_sender, waiters, file_purger, start_time, + engine_config, }; Some(Box::new(task)) } @@ -232,7 +232,6 @@ pub(crate) struct TwcsCompactionTask { pub sst_layer: AccessLayerRef, pub outputs: Vec, pub expired_ssts: Vec, - pub sst_write_buffer_size: ReadableSize, pub compaction_time_window: Option, pub file_purger: FilePurgerRef, /// Request sender to notify the worker. @@ -241,6 +240,7 @@ pub(crate) struct TwcsCompactionTask { pub waiters: Vec, /// Start time of compaction task pub start_time: Instant, + pub engine_config: Arc, } impl Debug for TwcsCompactionTask { @@ -278,7 +278,7 @@ impl TwcsCompactionTask { for output in self.outputs.drain(..) 
{ let schema = self.schema.clone(); let sst_layer = self.sst_layer.clone(); - let sst_write_buffer_size = self.sst_write_buffer_size; + let engine_config = self.engine_config.clone(); compacted_inputs.extend(output.inputs.iter().map(FileHandle::meta)); info!( @@ -296,7 +296,7 @@ impl TwcsCompactionTask { // TODO(hl): Maybe spawn to runtime to exploit in-job parallelism. futs.push(async move { output - .build(region_id, schema, sst_layer, sst_write_buffer_size) + .build(region_id, schema, sst_layer, engine_config) .await }); } diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index 2e779b760260..fefee3abd808 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -78,6 +78,9 @@ pub struct MitoConfig { pub scan_parallelism: usize, /// Capacity of the channel to send data from parallel scan tasks to the main task (default 32). pub parallel_scan_channel_size: usize, + + #[serde(flatten)] + pub inverted_index: InvertedIndexConfig, } impl Default for MitoConfig { @@ -98,6 +101,7 @@ impl Default for MitoConfig { sst_write_buffer_size: ReadableSize::mb(8), scan_parallelism: divide_num_cpus(4), parallel_scan_channel_size: DEFAULT_SCAN_CHANNEL_SIZE, + inverted_index: InvertedIndexConfig::default(), } } } @@ -160,3 +164,20 @@ fn divide_num_cpus(divisor: usize) -> usize { (cores + divisor - 1) / divisor } + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +pub struct InvertedIndexConfig { + pub disable_creation_on_flush: bool, + pub disable_creation_on_compact: bool, + pub creation_memory_usage_threshold: Option, +} + +impl Default for InvertedIndexConfig { + fn default() -> Self { + InvertedIndexConfig { + disable_creation_on_flush: false, + disable_creation_on_compact: false, + creation_memory_usage_threshold: Some(ReadableSize::mb(128)), + } + } +} diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs index b0ea7887e6f5..08d859701767 100644 --- a/src/mito2/src/error.rs +++ b/src/mito2/src/error.rs @@ -462,6 +462,9 @@ pub enum Error { location: Location, }, + #[snafu(display("Operate on aborted index"))] + OperateAbortedIndex { location: Location }, + #[snafu(display("Failed to read puffin metadata"))] PuffinReadMetadata { #[snafu(source)] @@ -533,6 +536,7 @@ impl ErrorExt for Error { | CreateDefault { .. } | InvalidParquet { .. } | PuffinBlobTypeNotFound { .. } + | OperateAbortedIndex { .. } | UnexpectedReplay { .. } => StatusCode::Unexpected, RegionNotFound { .. } => StatusCode::RegionNotFound, ObjectStoreNotFound { .. } diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs index 99b880de7cf2..1eb87e59a3d3 100644 --- a/src/mito2/src/flush.rs +++ b/src/mito2/src/flush.rs @@ -40,7 +40,7 @@ use crate::request::{ use crate::schedule::scheduler::{Job, SchedulerRef}; use crate::sst::file::{FileId, FileMeta}; use crate::sst::file_purger::FilePurgerRef; -use crate::sst::parquet::WriteOptions; +use crate::sst::parquet::{InvertedIndexCreateOptions, WriteOptions}; use crate::worker::WorkerListener; /// Global write buffer (memtable) manager. 
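// A sketch of what `#[serde(flatten)]` on `MitoConfig::inverted_index`
// (added by this patch) means for configuration files, assuming the `toml`
// crate; the struct and key names here are trimmed-down stand-ins. The
// flattened fields sit beside the engine's own keys instead of under a
// nested `[inverted_index]` table.

use serde::Deserialize;

#[derive(Deserialize)]
struct EngineConfig {
    sst_write_buffer_size: String,
    #[serde(flatten)]
    inverted_index: IndexOptions,
}

#[derive(Deserialize, Default)]
#[serde(default)]
struct IndexOptions {
    disable_creation_on_flush: bool,
    disable_creation_on_compact: bool,
}

fn parse() -> EngineConfig {
    // Index keys appear at the top level next to the engine keys.
    toml::from_str(
        "sst_write_buffer_size = \"8MB\"\n\
         disable_creation_on_flush = true\n",
    )
    .unwrap()
}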
@@ -290,8 +290,19 @@ impl RegionFlushTask { .with_label_values(&["flush_memtables"]) .start_timer(); + let inverted_index_config = &self.engine_config.inverted_index; + let inverted_index_options = + (!inverted_index_config.disable_creation_on_flush).then(|| { + InvertedIndexCreateOptions { + memory_usage_threshold: inverted_index_config + .creation_memory_usage_threshold + .map(|size| size.as_bytes() as _), + } + }); + let mut write_opts = WriteOptions { write_buffer_size: self.engine_config.sst_write_buffer_size, + inverted_index_options, ..Default::default() }; if let Some(row_group_size) = self.row_group_size { diff --git a/src/mito2/src/sst/index/applier.rs b/src/mito2/src/sst/index/applier.rs index e16d36f4d303..2157cf1f5527 100644 --- a/src/mito2/src/sst/index/applier.rs +++ b/src/mito2/src/sst/index/applier.rs @@ -98,6 +98,7 @@ impl SstIndexApplier { let mut index_reader = InvertedIndexBlobReader::new(blob_reader); let context = SearchContext { + // Encountering a non-existing column indicates that it doesn't match predicates. index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty, }; let res = self diff --git a/src/mito2/src/sst/index/creator.rs b/src/mito2/src/sst/index/creator.rs index 080e092fc2ba..717310e5699d 100644 --- a/src/mito2/src/sst/index/creator.rs +++ b/src/mito2/src/sst/index/creator.rs @@ -26,13 +26,14 @@ use index::inverted_index::create::InvertedIndexCreator; use index::inverted_index::format::writer::InvertedIndexBlobWriter; use object_store::ObjectStore; use puffin::file_format::writer::{Blob, PuffinAsyncWriter, PuffinFileWriter}; -use snafu::ResultExt; +use snafu::{ensure, ResultExt}; use store_api::metadata::RegionMetadataRef; use tokio::io::duplex; use tokio_util::compat::{TokioAsyncReadCompatExt, TokioAsyncWriteCompatExt}; use crate::error::{ - IndexFinishSnafu, PuffinAddBlobSnafu, PuffinFinishSnafu, PushIndexValueSnafu, Result, + IndexFinishSnafu, OperateAbortedIndexSnafu, PuffinAddBlobSnafu, PuffinFinishSnafu, + PushIndexValueSnafu, Result, }; use crate::metrics::INDEX_PUFFIN_WRITE_BYTES_TOTAL; use crate::read::Batch; @@ -60,6 +61,7 @@ pub struct SstIndexCreator { temp_file_provider: Arc, value_buf: Vec, + aborted: bool, stats: Statistics, } @@ -94,11 +96,14 @@ impl SstIndexCreator { index_creator, temp_file_provider, value_buf: vec![], + aborted: false, stats: Statistics::default(), } } pub async fn update(&mut self, batch: &Batch) -> Result<()> { + ensure!(!self.aborted, OperateAbortedIndexSnafu); + if batch.is_empty() { return Ok(()); } @@ -118,6 +123,8 @@ impl SstIndexCreator { } pub async fn finish(&mut self) -> Result<(RowCount, ByteCount)> { + ensure!(!self.aborted, OperateAbortedIndexSnafu); + if self.stats.row_count() == 0 { // no IO is performed, no garbage to clean up, just return return Ok((0, 0)); @@ -136,6 +143,15 @@ impl SstIndexCreator { finish_res.map(|_| (self.stats.row_count(), self.stats.byte_count())) } + pub async fn abort(&mut self) -> Result<()> { + if self.aborted { + return Ok(()); + } + self.aborted = true; + + self.do_cleanup().await + } + async fn do_update(&mut self, batch: &Batch) -> Result<()> { let mut guard = self.stats.record_update(); diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index 5bcc8b545d23..f4a1a68ad4bc 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -42,6 +42,8 @@ pub struct WriteOptions { pub write_buffer_size: ReadableSize, /// Row group size. pub row_group_size: usize, + /// Inverted index options. 
If it's None, inverted index will not be created. + pub inverted_index_options: Option, } impl Default for WriteOptions { @@ -49,10 +51,18 @@ impl Default for WriteOptions { WriteOptions { write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE, row_group_size: DEFAULT_ROW_GROUP_SIZE, + inverted_index_options: Some(InvertedIndexCreateOptions::default()), } } } +#[derive(Debug, Default)] +pub struct InvertedIndexCreateOptions { + /// The memory usage threshold for inverted index creation. + /// Set to non-none value to enable external sort during inverted index creation + pub memory_usage_threshold: Option, +} + /// Parquet SST info returned by the writer. pub struct SstInfo { /// Time range of the SST. diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs index e5d20f23cc0f..97d4daa4b673 100644 --- a/src/mito2/src/sst/parquet/writer.rs +++ b/src/mito2/src/sst/parquet/writer.rs @@ -19,6 +19,7 @@ use std::num::NonZeroUsize; use common_datasource::file_format::parquet::BufferedWriter; use common_telemetry::{debug, warn}; use common_time::Timestamp; +use futures::TryFutureExt; use object_store::ObjectStore; use parquet::basic::{Compression, Encoding, ZstdLevel}; use parquet::file::metadata::KeyValue; @@ -27,6 +28,7 @@ use parquet::schema::types::ColumnPath; use snafu::ResultExt; use store_api::metadata::RegionMetadataRef; use store_api::storage::consts::SEQUENCE_COLUMN_NAME; +use store_api::storage::RegionId; use super::helper::parse_parquet_metadata; use crate::error::{InvalidMetadataSnafu, Result, WriteBufferSnafu}; @@ -98,74 +100,26 @@ impl ParquetWriter { .context(WriteBufferSnafu)?; let mut stats = SourceStats::default(); + let mut index_creator = IndexCreator::new( + self.file_id.clone(), + self.region_dir.clone(), + &self.metadata, + self.object_store.clone(), + opts, + ); - let mut index_creator = match ( - self.metadata.primary_key.is_empty(), - NonZeroUsize::new(opts.row_group_size), - ) { - (no_tags, _) if no_tags => { - debug!( - "No tag columns, skip creating index, region_id: {}, file_id: {}", - self.metadata.region_id, self.file_id, - ); - None - } - (_, None) => { - warn!( - "Row group size is 0, skip creating index, region_id: {}, file_id: {}", - self.metadata.region_id, self.file_id, - ); - None - } - (_, Some(row_group_size)) => Some(SstIndexCreator::new( - self.region_dir.clone(), - self.file_id, - &self.metadata, - self.object_store.clone(), - Some(4 * 1024 * 1024), - row_group_size, - )), - }; - - while let Some(batch) = self.source.next_batch().await? { + while let Some(batch) = self + .write_next_batch(&write_format, &mut buffered_writer) + .or_else(|err| async { + index_creator.abort().await; + Err(err) + }) + .await? + { stats.update(&batch); - let arrow_batch = write_format.convert_batch(&batch)?; - - buffered_writer - .write(&arrow_batch) - .await - .context(WriteBufferSnafu)?; - - if let Some(creator) = index_creator.as_mut() { - if let Err(err) = creator.update(&batch).await { - warn!( - err; "Failed to update index, skip creating index, region_id: {}, file_id: {}", - self.metadata.region_id, self.file_id, - ); - - // Skip index creation if error occurs. 
- index_creator = None; - } - } - } - - if let Some(mut creator) = index_creator { - match creator.finish().await { - Ok((row_count, byte_count)) => { - debug!( - "Create index successfully, region_id: {}, file_id: {}, bytes: {byte_count}, rows: {row_count}", - self.metadata.region_id, self.file_id, - ); - } - Err(err) => { - warn!( - err; "Failed to create index, region_id: {}, file_id: {}", - self.metadata.region_id, self.file_id, - ); - return Ok(None); - } - } + index_creator.update(&batch).await; } + index_creator.finish().await; if stats.num_rows == 0 { debug!("No data written, try to stop the writer: {file_path}"); @@ -191,6 +145,24 @@ impl ParquetWriter { })) } + async fn write_next_batch( + &mut self, + write_format: &WriteFormat, + buffered_writer: &mut BufferedWriter, + ) -> Result> { + let Some(batch) = self.source.next_batch().await? else { + return Ok(None); + }; + + let arrow_batch = write_format.convert_batch(&batch)?; + buffered_writer + .write(&arrow_batch) + .await + .context(WriteBufferSnafu)?; + + Ok(Some(batch)) + } + /// Customizes per-column config according to schema and maybe column cardinality. fn customize_column_config( builder: WriterPropertiesBuilder, @@ -239,3 +211,116 @@ impl SourceStats { } } } + +struct IndexCreator { + file_id: FileId, + region_id: RegionId, + inner: Option, +} + +impl IndexCreator { + fn new( + file_id: FileId, + region_dir: String, + metadata: &RegionMetadataRef, + object_store: ObjectStore, + opts: &WriteOptions, + ) -> Self { + let Some(option) = &opts.inverted_index_options else { + debug!( + "Skip creating index due to config, region_id: {}, file_id: {}", + metadata.region_id, file_id, + ); + return Self { + file_id, + region_id: metadata.region_id, + inner: None, + }; + }; + + if metadata.primary_key.is_empty() { + debug!( + "No tag columns, skip creating index, region_id: {}, file_id: {}", + metadata.region_id, file_id, + ); + return Self { + file_id, + region_id: metadata.region_id, + inner: None, + }; + } + + let Some(row_group_size) = NonZeroUsize::new(opts.row_group_size) else { + warn!( + "Row group size is 0, skip creating index, region_id: {}, file_id: {}", + metadata.region_id, file_id, + ); + return Self { + file_id, + region_id: metadata.region_id, + inner: None, + }; + }; + + let creator = SstIndexCreator::new( + region_dir.clone(), + file_id.clone(), + metadata, + object_store, + option.memory_usage_threshold, + row_group_size, + ); + + Self { + file_id, + region_id: metadata.region_id, + inner: Some(creator), + } + } + + async fn update(&mut self, batch: &Batch) { + if let Some(creator) = self.inner.as_mut() { + if let Err(err) = creator.update(&batch).await { + warn!( + err; "Failed to update index, skip creating index, region_id: {}, file_id: {}", + self.region_id, self.file_id, + ); + + // Skip index creation if error occurs. 
+ self.inner = None; + } + } + } + + async fn finish(mut self) { + if let Some(creator) = self.inner.as_mut() { + match creator.finish().await { + Ok((row_count, byte_count)) => { + debug!( + "Create index successfully, region_id: {}, file_id: {}, bytes: {byte_count}, rows: {row_count}", + self.region_id, self.file_id, + ); + } + Err(err) => { + warn!( + err; "Failed to create index, region_id: {}, file_id: {}", + self.region_id, self.file_id, + ); + } + } + } + } + + async fn abort(&mut self) { + if let Some(creator) = self.inner.as_mut() { + if let Err(err) = creator.abort().await { + warn!( + err; "Failed to abort index, region_id: {}, file_id: {}", + self.region_id, self.file_id, + ); + } + + self.inner = None; + } + } +} From 6115d8fb98ba9cc99a3837fa5af6467c93e4bc57 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Wed, 3 Jan 2024 06:58:13 +0000 Subject: [PATCH 19/27] chore: polish Signed-off-by: Zhenchi --- src/mito2/src/sst/index/applier.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/mito2/src/sst/index/applier.rs b/src/mito2/src/sst/index/applier.rs index 2157cf1f5527..5dd928cf382d 100644 --- a/src/mito2/src/sst/index/applier.rs +++ b/src/mito2/src/sst/index/applier.rs @@ -101,13 +101,10 @@ impl SstIndexApplier { // Encountering a non-existing column indicates that it doesn't match predicates. index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty, }; - let res = self - .index_applier + self.index_applier .apply(context, &mut index_reader) .await - .context(ApplyIndexSnafu)?; - - Ok(res) + .context(ApplyIndexSnafu) } } From 915f222bce95735815c9d00d3881849bcf51cfd5 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Wed, 3 Jan 2024 07:35:55 +0000 Subject: [PATCH 20/27] feat: add seek count Signed-off-by: Zhenchi --- src/mito2/src/metrics.rs | 26 +++++++-- src/mito2/src/sst/index/applier.rs | 48 ++++++++++------ .../src/sst/index/creator/temp_provider.rs | 11 +++- src/mito2/src/sst/index/store.rs | 57 +++++++++++-------- src/mito2/src/sst/parquet/writer.rs | 6 +- 5 files changed, 100 insertions(+), 48 deletions(-) diff --git a/src/mito2/src/metrics.rs b/src/mito2/src/metrics.rs index a08d191a3d5d..e0054b9817a7 100644 --- a/src/mito2/src/metrics.rs +++ b/src/mito2/src/metrics.rs @@ -182,6 +182,7 @@ lazy_static! { "index create bytes total", ) .unwrap(); + /// Counter of r/w bytes on index related IO operations. pub static ref INDEX_IO_BYTES_TOTAL: IntCounterVec = register_int_counter_vec!( "index_io_bytes_total", @@ -190,12 +191,29 @@ lazy_static! { ) .unwrap(); /// Counter of read bytes on intermediate files. - pub static ref INDEX_INTERMEDIATE_READ_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL.with_label_values(&["read", "intermediate"]); + pub static ref INDEX_INTERMEDIATE_READ_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL + .with_label_values(&["read", "intermediate"]); /// Counter of write bytes on intermediate files. - pub static ref INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL.with_label_values(&["write", "intermediate"]); + pub static ref INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL + .with_label_values(&["write", "intermediate"]); /// Counter of read bytes on puffin files. - pub static ref INDEX_PUFFIN_READ_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL.with_label_values(&["read", "puffin"]); + pub static ref INDEX_PUFFIN_READ_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL + .with_label_values(&["read", "puffin"]); /// Counter of write bytes on puffin files. 
- pub static ref INDEX_PUFFIN_WRITE_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL.with_label_values(&["write", "puffin"]); + pub static ref INDEX_PUFFIN_WRITE_BYTES_TOTAL: IntCounter = INDEX_IO_BYTES_TOTAL + .with_label_values(&["write", "puffin"]); + + /// Counter of seek operations on index related files. + pub static ref INDEX_IO_SEEK_TOTAL: IntCounterVec = register_int_counter_vec!( + "index_io_seek_total", + "index io seek total", + &[FILE_TYPE_LABEL] + ).unwrap(); + /// Counter of seek operations on intermediate files. + pub static ref INDEX_INTERMEDIATE_SEEK_TOTAL: IntCounter = INDEX_IO_SEEK_TOTAL + .with_label_values(&["intermediate"]); + /// Counter of seek operations on puffin files. + pub static ref INDEX_PUFFIN_SEEK_TOTAL: IntCounter = INDEX_IO_SEEK_TOTAL + .with_label_values(&["puffin"]); // ------- End of index metrics. } diff --git a/src/mito2/src/sst/index/applier.rs b/src/mito2/src/sst/index/applier.rs index 5dd928cf382d..3dd75c44cdca 100644 --- a/src/mito2/src/sst/index/applier.rs +++ b/src/mito2/src/sst/index/applier.rs @@ -17,6 +17,7 @@ pub mod builder; use std::collections::BTreeSet; use std::sync::Arc; +use futures::{AsyncRead, AsyncSeek}; use index::inverted_index::format::reader::InvertedIndexBlobReader; use index::inverted_index::search::index_apply::{ IndexApplier, IndexNotFoundStrategy, SearchContext, @@ -31,6 +32,7 @@ use crate::error::{ }; use crate::metrics::{ INDEX_APPLY_COST_TIME, INDEX_APPLY_MEMORY_USAGE, INDEX_PUFFIN_READ_BYTES_TOTAL, + INDEX_PUFFIN_SEEK_TOTAL, }; use crate::sst::file::FileId; use crate::sst::index::store::InstrumentedStore; @@ -72,14 +74,39 @@ impl SstIndexApplier { pub async fn apply(&self, file_id: FileId) -> Result> { let _timer = INDEX_APPLY_COST_TIME.start_timer(); - let file_path = location::index_file_path(&self.region_dir, &file_id); + let mut puffin_reader = self.puffin_reader(file_id).await?; + let blob_reader = Self::index_blob_reader(&mut puffin_reader).await?; + let mut index_reader = InvertedIndexBlobReader::new(blob_reader); + let context = SearchContext { + // Encountering a non-existing column indicates that it doesn't match predicates. + index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty, + }; + self.index_applier + .apply(context, &mut index_reader) + .await + .context(ApplyIndexSnafu) + } + + async fn puffin_reader( + &self, + file_id: FileId, + ) -> Result> { + let file_path = location::index_file_path(&self.region_dir, &file_id); let file_reader = self .store - .reader(&file_path, &INDEX_PUFFIN_READ_BYTES_TOTAL) + .reader( + &file_path, + &INDEX_PUFFIN_READ_BYTES_TOTAL, + &INDEX_PUFFIN_SEEK_TOTAL, + ) .await?; - let mut puffin_reader = PuffinFileReader::new(file_reader); + Ok(PuffinFileReader::new(file_reader)) + } + async fn index_blob_reader( + puffin_reader: &mut PuffinFileReader, + ) -> Result { let file_meta = puffin_reader .metadata() .await @@ -91,20 +118,9 @@ impl SstIndexApplier { .context(PuffinBlobTypeNotFoundSnafu { blob_type: INDEX_BLOB_TYPE, })?; - - let blob_reader = puffin_reader + puffin_reader .blob_reader(blob_meta) - .context(PuffinReadBlobSnafu)?; - let mut index_reader = InvertedIndexBlobReader::new(blob_reader); - - let context = SearchContext { - // Encountering a non-existing column indicates that it doesn't match predicates. 
- index_not_found_strategy: IndexNotFoundStrategy::ReturnEmpty, - }; - self.index_applier - .apply(context, &mut index_reader) - .await - .context(ApplyIndexSnafu) + .context(PuffinReadBlobSnafu) } } diff --git a/src/mito2/src/sst/index/creator/temp_provider.rs b/src/mito2/src/sst/index/creator/temp_provider.rs index 2206dcb34c70..8055a0f6d156 100644 --- a/src/mito2/src/sst/index/creator/temp_provider.rs +++ b/src/mito2/src/sst/index/creator/temp_provider.rs @@ -22,7 +22,10 @@ use index::inverted_index::error::Result as IndexResult; use snafu::ResultExt; use crate::error::Result; -use crate::metrics::{INDEX_INTERMEDIATE_READ_BYTES_TOTAL, INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL}; +use crate::metrics::{ + INDEX_INTERMEDIATE_READ_BYTES_TOTAL, INDEX_INTERMEDIATE_SEEK_TOTAL, + INDEX_INTERMEDIATE_WRITE_BYTES_TOTAL, +}; use crate::sst::index::store::InstrumentedStore; use crate::sst::location::IntermediateLocation; @@ -69,7 +72,11 @@ impl ExternalTempFileProvider for TempFileProvider { let reader = self .store - .reader(entry.path(), &INDEX_INTERMEDIATE_READ_BYTES_TOTAL) + .reader( + entry.path(), + &INDEX_INTERMEDIATE_READ_BYTES_TOTAL, + &INDEX_INTERMEDIATE_SEEK_TOTAL, + ) .await .map_err(BoxedError::new) .context(index_error::ExternalSnafu)?; diff --git a/src/mito2/src/sst/index/store.rs b/src/mito2/src/sst/index/store.rs index abacf8c27238..9919d2d8e1b2 100644 --- a/src/mito2/src/sst/index/store.rs +++ b/src/mito2/src/sst/index/store.rs @@ -37,10 +37,15 @@ impl InstrumentedStore { pub async fn reader<'a>( &self, path: &str, - recorder: &'a IntCounter, + read_byte_count: &'a IntCounter, + seek_count: &'a IntCounter, ) -> Result> { let reader = self.object_store.reader(path).await.context(OpenDalSnafu)?; - Ok(InstrumentedAsyncRead::new(reader, recorder)) + Ok(InstrumentedAsyncRead::new( + reader, + read_byte_count, + seek_count, + )) } pub async fn writer<'a>( @@ -69,14 +74,16 @@ impl InstrumentedStore { pub(crate) struct InstrumentedAsyncRead<'a, R> { #[pin] inner: R, - recorder: BytesRecorder<'a>, + read_byte_count: Counter<'a>, + seek_count: Counter<'a>, } impl<'a, R> InstrumentedAsyncRead<'a, R> { - fn new(inner: R, recorder: &'a IntCounter) -> Self { + fn new(inner: R, read_byte_count: &'a IntCounter, seek_count: &'a IntCounter) -> Self { Self { inner, - recorder: BytesRecorder::new(recorder), + read_byte_count: Counter::new(read_byte_count), + seek_count: Counter::new(seek_count), } } } @@ -89,7 +96,7 @@ impl<'a, R: AsyncRead + Unpin + Send> AsyncRead for InstrumentedAsyncRead<'a, R> ) -> Poll> { let poll = self.as_mut().project().inner.poll_read(cx, buf); if let Poll::Ready(Ok(n)) = &poll { - self.recorder.inc_by(*n); + self.read_byte_count.inc_by(*n); } poll } @@ -97,11 +104,15 @@ impl<'a, R: AsyncRead + Unpin + Send> AsyncRead for InstrumentedAsyncRead<'a, R> impl<'a, R: AsyncSeek + Unpin + Send> AsyncSeek for InstrumentedAsyncRead<'a, R> { fn poll_seek( - self: Pin<&mut Self>, + mut self: Pin<&mut Self>, cx: &mut Context<'_>, pos: io::SeekFrom, ) -> Poll> { - self.project().inner.poll_seek(cx, pos) + let poll = self.as_mut().project().inner.poll_seek(cx, pos); + if let Poll::Ready(Ok(_)) = &poll { + self.seek_count.inc_by(1); + } + poll } } @@ -109,14 +120,14 @@ impl<'a, R: AsyncSeek + Unpin + Send> AsyncSeek for InstrumentedAsyncRead<'a, R> pub(crate) struct InstrumentedAsyncWrite<'a, W> { #[pin] inner: W, - recorder: BytesRecorder<'a>, + write_byte_count: Counter<'a>, } impl<'a, W> InstrumentedAsyncWrite<'a, W> { - fn new(inner: W, recorder: &'a IntCounter) -> Self { + fn new(inner: W, 
write_byte_count: &'a IntCounter) -> Self {
         Self {
             inner,
-            recorder: BytesRecorder::new(recorder),
+            write_byte_count: Counter::new(write_byte_count),
         }
     }
 }
@@ -129,7 +140,7 @@ impl<'a, W: AsyncWrite + Unpin + Send> AsyncWrite for InstrumentedAsyncWrite<'a, W>
     ) -> Poll<io::Result<usize>> {
         let poll = self.as_mut().project().inner.poll_write(cx, buf);
         if let Poll::Ready(Ok(n)) = &poll {
-            self.recorder.inc_by(*n);
+            self.write_byte_count.inc_by(*n);
         }
         poll
     }
@@ -143,25 +154,25 @@ impl<'a, W: AsyncWrite + Unpin + Send> AsyncWrite for InstrumentedAsyncWrite<'a, W>
     }
 }

-struct BytesRecorder<'a> {
-    bytes: usize,
-    recorder: &'a IntCounter,
+struct Counter<'a> {
+    count: usize,
+    counter: &'a IntCounter,
 }

-impl<'a> BytesRecorder<'a> {
-    fn new(recorder: &'a IntCounter) -> Self {
-        Self { bytes: 0, recorder }
+impl<'a> Counter<'a> {
+    fn new(counter: &'a IntCounter) -> Self {
+        Self { count: 0, counter }
     }

-    fn inc_by(&mut self, bytes: usize) {
-        self.bytes += bytes;
+    fn inc_by(&mut self, n: usize) {
+        self.count += n;
     }
 }

-impl<'a> Drop for BytesRecorder<'a> {
+impl<'a> Drop for Counter<'a> {
     fn drop(&mut self) {
-        if self.bytes > 0 {
-            self.recorder.inc_by(self.bytes as _);
+        if self.count > 0 {
+            self.counter.inc_by(self.count as _);
         }
     }
 }
diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs
index 97d4daa4b673..65637e00357f 100644
--- a/src/mito2/src/sst/parquet/writer.rs
+++ b/src/mito2/src/sst/parquet/writer.rs
@@ -101,7 +101,7 @@ impl ParquetWriter {
         let mut stats = SourceStats::default();
         let mut index_creator = IndexCreator::new(
-            self.file_id.clone(),
+            self.file_id,
             self.region_dir.clone(),
             &self.metadata,
             self.object_store.clone(),
@@ -264,7 +264,7 @@ impl IndexCreator {
         let creator = SstIndexCreator::new(
             region_dir.clone(),
-            file_id.clone(),
+            file_id,
             metadata,
             object_store,
             option.memory_usage_threshold,
@@ -280,7 +280,7 @@ impl IndexCreator {
     async fn update(&mut self, batch: &Batch) {
         if let Some(creator) = self.inner.as_mut() {
-            if let Err(err) = creator.update(&batch).await {
+            if let Err(err) = creator.update(batch).await {
                 warn!(
                     err; "Failed to update index, skip creating index, region_id: {}, file_id: {}",
                     self.region_id, self.file_id,

From 73eaacaae258efc666999d0b276f7fb887a2ab80 Mon Sep 17 00:00:00 2001
From: Zhenchi
Date: Wed, 3 Jan 2024 07:44:18 +0000
Subject: [PATCH 21/27] remove redundant code

Signed-off-by: Zhenchi
---
 .../search/fst_apply/intersection_apply.rs    | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs b/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs
index 2f5e6c31f781..d76b44fe9d4b 100644
--- a/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs
+++ b/src/index/src/inverted_index/search/fst_apply/intersection_apply.rs
@@ -121,20 +121,6 @@ impl IntersectionFstApplier {

         Ok(Self { dfas, ranges })
     }
-
-    fn range_memory_usage(range: &Range) -> usize {
-        let mut memory_usage = std::mem::size_of::<Range>();
-
-        if let Some(lower) = &range.lower {
-            memory_usage += lower.value.len();
-        }
-
-        if let Some(upper) = &range.upper {
-            memory_usage += upper.value.len();
-        }
-
-        memory_usage
-    }
 }

 impl TryFrom<Vec<Predicate>> for IntersectionFstApplier {

From ac2367c270de259ec430a00f8ab82dda15db5b8b Mon Sep 17 00:00:00 2001
From: Zhenchi
Date: Wed, 3 Jan 2024 13:59:14 +0000
Subject: [PATCH 22/27] feat: add inverted_index_available to file meta

Signed-off-by: Zhenchi
---
 src/mito2/src/compaction/output.rs            |  2 ++
 src/mito2/src/compaction/test_util.rs         |  1 +
src/mito2/src/flush.rs | 1 + src/mito2/src/manifest/tests/checkpoint.rs | 1 + src/mito2/src/sst/file.rs | 3 ++ src/mito2/src/sst/file_purger.rs | 1 + src/mito2/src/sst/parquet.rs | 3 ++ src/mito2/src/sst/parquet/reader.rs | 35 ++++++++++------------ src/mito2/src/sst/parquet/writer.rs | 32 ++++++++------------ src/mito2/src/test_util/sst_util.rs | 1 + src/mito2/src/test_util/version_util.rs | 2 ++ 11 files changed, 44 insertions(+), 38 deletions(-) diff --git a/src/mito2/src/compaction/output.rs b/src/mito2/src/compaction/output.rs index 901f527cc1e4..daa9bf785cd5 100644 --- a/src/mito2/src/compaction/output.rs +++ b/src/mito2/src/compaction/output.rs @@ -68,6 +68,7 @@ impl CompactionOutput { |SstInfo { time_range, file_size, + inverted_index_available, .. }| { FileMeta { @@ -76,6 +77,7 @@ impl CompactionOutput { time_range, level: self.output_level, file_size, + inverted_index_available, } }, ); diff --git a/src/mito2/src/compaction/test_util.rs b/src/mito2/src/compaction/test_util.rs index fefae906ba69..1373049882de 100644 --- a/src/mito2/src/compaction/test_util.rs +++ b/src/mito2/src/compaction/test_util.rs @@ -35,6 +35,7 @@ pub fn new_file_handle( ), level, file_size: 0, + inverted_index_available: false, }, file_purger, ) diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs index 1eb87e59a3d3..c668ba132490 100644 --- a/src/mito2/src/flush.rs +++ b/src/mito2/src/flush.rs @@ -336,6 +336,7 @@ impl RegionFlushTask { time_range: sst_info.time_range, level: 0, file_size: sst_info.file_size, + inverted_index_available: sst_info.inverted_index_available, }); } diff --git a/src/mito2/src/manifest/tests/checkpoint.rs b/src/mito2/src/manifest/tests/checkpoint.rs index c28f6cd6d598..805bb1c9914a 100644 --- a/src/mito2/src/manifest/tests/checkpoint.rs +++ b/src/mito2/src/manifest/tests/checkpoint.rs @@ -171,6 +171,7 @@ async fn checkpoint_with_different_compression_types() { time_range: (0.into(), 10000000.into()), level: 0, file_size: 1024000, + inverted_index_available: false, }; let action = RegionMetaActionList::new(vec![RegionMetaAction::Edit(RegionEdit { files_to_add: vec![file_meta], diff --git a/src/mito2/src/sst/file.rs b/src/mito2/src/sst/file.rs index a68a5d81c87d..e318cface86c 100644 --- a/src/mito2/src/sst/file.rs +++ b/src/mito2/src/sst/file.rs @@ -95,6 +95,8 @@ pub struct FileMeta { pub level: Level, /// Size of the file. pub file_size: u64, + /// Whether inverted index is available. + pub inverted_index_available: bool, } /// Handle to a SST file. @@ -236,6 +238,7 @@ mod tests { time_range: FileTimeRange::default(), level, file_size: 0, + inverted_index_available: false, } } diff --git a/src/mito2/src/sst/file_purger.rs b/src/mito2/src/sst/file_purger.rs index 15c3df6cc703..b43b5e595e67 100644 --- a/src/mito2/src/sst/file_purger.rs +++ b/src/mito2/src/sst/file_purger.rs @@ -136,6 +136,7 @@ mod tests { time_range: FileTimeRange::default(), level: 0, file_size: 4096, + inverted_index_available: false, }, file_purger, ); diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index f4a1a68ad4bc..2b2473d31b87 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -73,6 +73,8 @@ pub struct SstInfo { pub num_rows: usize, /// File Meta Data pub file_metadata: Option, + /// Whether inverted index is available. 
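+    /// It is `false` when index creation was skipped (disabled by config, no tag
+    /// columns, or a zero row group size) or when the index creator failed.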
+    pub inverted_index_available: bool,
 }
 
 #[cfg(test)]
@@ -250,6 +252,7 @@ mod tests {
             .await
             .unwrap()
             .expect("write_all should return sst info");
+        assert!(sst_info.inverted_index_available);
         let writer_metadata = sst_info.file_metadata.unwrap();
 
         // read the sst file metadata
diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs
index 5cac12217e7c..15592a2626ee 100644
--- a/src/mito2/src/sst/parquet/reader.rs
+++ b/src/mito2/src/sst/parquet/reader.rs
@@ -19,7 +19,7 @@ use std::sync::Arc;
 use std::time::{Duration, Instant};
 
 use async_trait::async_trait;
-use common_telemetry::debug;
+use common_telemetry::{debug, warn};
 use common_time::range::TimestampRange;
 use datatypes::arrow::record_batch::RecordBatch;
 use object_store::ObjectStore;
@@ -249,21 +249,18 @@ impl ParquetReaderBuilder {
         metrics: &mut Metrics,
     ) -> BTreeSet<usize> {
         let mut row_group_ids: BTreeSet<_> = (0..parquet_meta.num_row_groups()).collect();
-        metrics.num_unfiltered_row_groups += row_group_ids.len();
+        metrics.num_row_groups_unfiltered += row_group_ids.len();
 
         // Applies index to prune row groups.
         if let Some(index_applier) = &self.index_applier {
-            match index_applier.apply(self.file_handle.file_id()).await {
-                Ok(row_groups) => row_group_ids = row_groups,
-                Err(err) => {
-                    if !err.is_object_not_found() {
-                        debug!("Failed to apply index: {err}");
-                    }
-                    // Ignores the error since it won't affect correctness.
+            if self.file_handle.meta().inverted_index_available {
+                match index_applier.apply(self.file_handle.file_id()).await {
+                    Ok(row_groups) => row_group_ids = row_groups,
+                    Err(err) => warn!(err; "Failed to apply index"),
                 }
             }
         }
-        metrics.num_inverted_index_filtered_row_groups += row_group_ids.len();
+        metrics.num_row_groups_inverted_index_filtered += row_group_ids.len();
 
         // Prunes row groups by metadata.
         if let Some(predicate) = &self.predicate {
@@ -289,7 +286,7 @@ impl ParquetReaderBuilder {
                     row_group_ids.remove(&row_group_id);
                 }
             };
-        metrics.num_min_max_filtered_row_groups += row_group_ids.len();
+        metrics.num_row_groups_min_max_filtered += row_group_ids.len();
 
         row_group_ids
     }
@@ -299,11 +296,11 @@ impl ParquetReaderBuilder {
 #[derive(Debug, Default)]
 struct Metrics {
     /// Number of unfiltered row groups.
-    num_unfiltered_row_groups: usize,
+    num_row_groups_unfiltered: usize,
     /// Number of row groups to read after filtering by inverted index.
-    num_inverted_index_filtered_row_groups: usize,
+    num_row_groups_inverted_index_filtered: usize,
     /// Number of row groups to read after filtering by min-max index.
-    num_min_max_filtered_row_groups: usize,
+    num_row_groups_min_max_filtered: usize,
     /// Duration to build the parquet reader.
     build_cost: Duration,
     /// Duration to scan the reader.
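
The three `num_row_groups_*` counters above form a pruning funnel: each stage can
only shrink the candidate set, so `unfiltered >= inverted_index_filtered >=
min_max_filtered` always holds. A minimal sketch of that flow, with closures
standing in for the real pruning stages (names here are illustrative, not this
crate's API):

    use std::collections::BTreeSet;

    /// Returns the row group ids that survive both pruning stages.
    fn prune_row_groups(
        num_row_groups: usize,
        inverted_index: impl Fn(&BTreeSet<usize>) -> BTreeSet<usize>,
        min_max: impl Fn(&BTreeSet<usize>) -> BTreeSet<usize>,
    ) -> BTreeSet<usize> {
        // "unfiltered": every row group in the file starts as a candidate.
        let unfiltered: BTreeSet<usize> = (0..num_row_groups).collect();
        // "inverted_index_filtered": tag predicates applied through the
        // inverted index; a no-op when the index is unavailable.
        let after_index = inverted_index(&unfiltered);
        // "min_max_filtered": drop row groups whose min-max statistics cannot
        // satisfy the predicate.
        min_max(&after_index)
    }
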
@@ -431,8 +428,8 @@ impl Drop for ParquetReader { self.reader_builder.file_handle.region_id(), self.reader_builder.file_handle.file_id(), self.reader_builder.file_handle.time_range(), - self.metrics.num_min_max_filtered_row_groups, - self.metrics.num_unfiltered_row_groups, + self.metrics.num_row_groups_min_max_filtered, + self.metrics.num_row_groups_unfiltered, self.metrics ); @@ -448,13 +445,13 @@ impl Drop for ParquetReader { .inc_by(self.metrics.num_rows as u64); READ_ROW_GROUPS_TOTAL .with_label_values(&["unfiltered"]) - .inc_by(self.metrics.num_unfiltered_row_groups as u64); + .inc_by(self.metrics.num_row_groups_unfiltered as u64); READ_ROW_GROUPS_TOTAL .with_label_values(&["inverted_index_filtered"]) - .inc_by(self.metrics.num_inverted_index_filtered_row_groups as u64); + .inc_by(self.metrics.num_row_groups_inverted_index_filtered as u64); READ_ROW_GROUPS_TOTAL .with_label_values(&["min_max_filtered"]) - .inc_by(self.metrics.num_min_max_filtered_row_groups as u64); + .inc_by(self.metrics.num_row_groups_min_max_filtered as u64); } } diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs index 65637e00357f..36b54331a7ba 100644 --- a/src/mito2/src/sst/parquet/writer.rs +++ b/src/mito2/src/sst/parquet/writer.rs @@ -119,7 +119,8 @@ impl ParquetWriter { stats.update(&batch); index_creator.update(&batch).await; } - index_creator.finish().await; + + let inverted_index_available = index_creator.finish().await; if stats.num_rows == 0 { debug!("No data written, try to stop the writer: {file_path}"); @@ -142,6 +143,7 @@ impl ParquetWriter { file_size, num_rows: stats.num_rows, file_metadata: Some(parquet_metadata), + inverted_index_available, })) } @@ -212,6 +214,7 @@ impl SourceStats { } } +#[derive(Default)] struct IndexCreator { file_id: FileId, region_id: RegionId, @@ -231,11 +234,7 @@ impl IndexCreator { "Skip creating index due to config, region_id: {}, file_id: {}", metadata.region_id, file_id, ); - return Self { - file_id, - region_id: metadata.region_id, - inner: None, - }; + return Self::default(); }; if metadata.primary_key.is_empty() { @@ -243,11 +242,7 @@ impl IndexCreator { "No tag columns, skip creating index, region_id: {}, file_id: {}", metadata.region_id, file_id, ); - return Self { - file_id, - region_id: metadata.region_id, - inner: None, - }; + return Self::default(); } let Some(row_group_size) = NonZeroUsize::new(opts.row_group_size) else { @@ -255,11 +250,7 @@ impl IndexCreator { "Row group size is 0, skip creating index, region_id: {}, file_id: {}", metadata.region_id, file_id, ); - return Self { - file_id, - region_id: metadata.region_id, - inner: None, - }; + return Self::default(); }; let creator = SstIndexCreator::new( @@ -292,14 +283,15 @@ impl IndexCreator { } } - async fn finish(mut self) { + async fn finish(mut self) -> bool { if let Some(creator) = self.inner.as_mut() { match creator.finish().await { Ok((row_count, byte_count)) => { debug!( - "Create index successfully, region_id: {}, file_id: {}, bytes: {byte_count}, rows: {row_count}", - self.region_id, self.file_id, + "Create index successfully, region_id: {}, file_id: {}, bytes: {}, rows: {}", + self.region_id, self.file_id, byte_count, row_count ); + return true; } Err(err) => { warn!( @@ -309,6 +301,8 @@ impl IndexCreator { } } } + + false } async fn abort(&mut self) { diff --git a/src/mito2/src/test_util/sst_util.rs b/src/mito2/src/test_util/sst_util.rs index 3638d119faa1..677eb00a3fc5 100644 --- a/src/mito2/src/test_util/sst_util.rs +++ b/src/mito2/src/test_util/sst_util.rs 
@@ -106,6 +106,7 @@ pub fn sst_file_handle(start_ms: i64, end_ms: i64) -> FileHandle { ), level: 0, file_size: 0, + inverted_index_available: false, }, file_purger, ) diff --git a/src/mito2/src/test_util/version_util.rs b/src/mito2/src/test_util/version_util.rs index e480b1f146df..be0dbd049b7c 100644 --- a/src/mito2/src/test_util/version_util.rs +++ b/src/mito2/src/test_util/version_util.rs @@ -96,6 +96,7 @@ impl VersionControlBuilder { ), level: 0, file_size: 0, // We don't care file size. + inverted_index_available: false, }, ); self @@ -136,6 +137,7 @@ pub(crate) fn apply_edit( ), level: 0, file_size: 0, // We don't care file size. + inverted_index_available: false, } }) .collect(); From e38d6cc69b5b1d92fc9858caea2f2c4a4869dc0c Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Thu, 4 Jan 2024 08:41:05 +0000 Subject: [PATCH 23/27] fix test Signed-off-by: Zhenchi --- src/mito2/src/config.rs | 1 - src/mito2/src/engine/basic_test.rs | 2 +- tests-integration/tests/http.rs | 5 +++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs index fefee3abd808..b64bbc2cda64 100644 --- a/src/mito2/src/config.rs +++ b/src/mito2/src/config.rs @@ -79,7 +79,6 @@ pub struct MitoConfig { /// Capacity of the channel to send data from parallel scan tasks to the main task (default 32). pub parallel_scan_channel_size: usize, - #[serde(flatten)] pub inverted_index: InvertedIndexConfig, } diff --git a/src/mito2/src/engine/basic_test.rs b/src/mito2/src/engine/basic_test.rs index 2e3dc4de0d53..ed8cd99e95da 100644 --- a/src/mito2/src/engine/basic_test.rs +++ b/src/mito2/src/engine/basic_test.rs @@ -553,5 +553,5 @@ async fn test_region_usage() { assert_eq!(region_stat.sst_usage, 2742); // region total usage - assert_eq!(region_stat.disk_usage(), 3748); + assert_eq!(region_stat.disk_usage(), 3780); } diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 42843ac22e81..7b70af80622a 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -807,6 +807,11 @@ page_cache_size = "512MiB" sst_write_buffer_size = "8MiB" parallel_scan_channel_size = 32 +[datanode.region_engine.mito.inverted_index] +disable_creation_on_flush = false +disable_creation_on_compact = false +creation_memory_usage_threshold = "128MiB" + [[datanode.region_engine]] [datanode.region_engine.file] From 34bc2e06b03067d93112b661c7a48c38a470f0a1 Mon Sep 17 00:00:00 2001 From: Zhenchi Date: Fri, 5 Jan 2024 06:54:51 +0000 Subject: [PATCH 24/27] rename config Signed-off-by: Zhenchi --- src/mito2/src/compaction/twcs.rs | 10 +++++----- src/mito2/src/flush.rs | 14 ++++++-------- src/mito2/src/read/scan_region.rs | 4 ++-- src/mito2/src/sst/parquet.rs | 29 ++++++++++++++++++++++------- src/mito2/src/sst/parquet/writer.rs | 4 ++-- 5 files changed, 37 insertions(+), 24 deletions(-) diff --git a/src/mito2/src/compaction/twcs.rs b/src/mito2/src/compaction/twcs.rs index 6d3368a8690f..11cfc954be54 100644 --- a/src/mito2/src/compaction/twcs.rs +++ b/src/mito2/src/compaction/twcs.rs @@ -42,7 +42,7 @@ use crate::request::{ }; use crate::sst::file::{FileHandle, FileId, FileMeta, Level}; use crate::sst::file_purger::FilePurgerRef; -use crate::sst::parquet::{InvertedIndexCreateOptions, WriteOptions}; +use crate::sst::parquet::{InvertedIndexOptions, WriteOptions}; use crate::sst::version::LevelMeta; const MAX_PARALLEL_COMPACTION: usize = 8; @@ -301,16 +301,16 @@ impl TwcsCompactionTask { ); let index_config = &self.engine_config.inverted_index; - let 
inverted_index_options = - (!index_config.disable_creation_on_compact).then(|| InvertedIndexCreateOptions { - memory_usage_threshold: index_config + let inverted_index = + (!index_config.disable_creation_on_compact).then(|| InvertedIndexOptions { + creation_memory_usage_threshold: index_config .creation_memory_usage_threshold .map(|size| size.as_bytes() as _), }); let write_opts = WriteOptions { write_buffer_size: self.engine_config.sst_write_buffer_size, - inverted_index_options, + inverted_index, ..Default::default() }; let metadata = self.metadata.clone(); diff --git a/src/mito2/src/flush.rs b/src/mito2/src/flush.rs index 5de52784e838..d9ba69ae449f 100644 --- a/src/mito2/src/flush.rs +++ b/src/mito2/src/flush.rs @@ -41,7 +41,7 @@ use crate::request::{ use crate::schedule::scheduler::{Job, SchedulerRef}; use crate::sst::file::{FileId, FileMeta}; use crate::sst::file_purger::FilePurgerRef; -use crate::sst::parquet::{InvertedIndexCreateOptions, WriteOptions}; +use crate::sst::parquet::{InvertedIndexOptions, WriteOptions}; use crate::worker::WorkerListener; /// Global write buffer (memtable) manager. @@ -296,17 +296,15 @@ impl RegionFlushTask { let inverted_index_config = &self.engine_config.inverted_index; let inverted_index_options = - (!inverted_index_config.disable_creation_on_flush).then(|| { - InvertedIndexCreateOptions { - memory_usage_threshold: inverted_index_config - .creation_memory_usage_threshold - .map(|size| size.as_bytes() as _), - } + (!inverted_index_config.disable_creation_on_flush).then(|| InvertedIndexOptions { + creation_memory_usage_threshold: inverted_index_config + .creation_memory_usage_threshold + .map(|size| size.as_bytes() as _), }); let mut write_opts = WriteOptions { write_buffer_size: self.engine_config.sst_write_buffer_size, - inverted_index_options, + inverted_index: inverted_index_options, ..Default::default() }; if let Some(row_group_size) = self.row_group_size { diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index 9557f81ef578..f3de2c64e3ae 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -17,7 +17,7 @@ use std::sync::Arc; use common_recordbatch::SendableRecordBatchStream; -use common_telemetry::{debug, logging}; +use common_telemetry::{debug, warn}; use common_time::range::TimestampRange; use store_api::storage::ScanRequest; use table::predicate::{Predicate, TimeRangePredicateBuilder}; @@ -243,7 +243,7 @@ impl ScanRegion { self.version.metadata.as_ref(), ) .build(&self.request.filters) - .inspect_err(|e| logging::warn!("Failed to build index applier: {}", e)) + .inspect_err(|err| warn!(err; "Failed to build index applier")) .ok() .flatten() .map(Arc::new) diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs index bf97050efe09..2832a5c5f5a9 100644 --- a/src/mito2/src/sst/parquet.rs +++ b/src/mito2/src/sst/parquet.rs @@ -43,7 +43,7 @@ pub struct WriteOptions { /// Row group size. pub row_group_size: usize, /// Inverted index options. If it's None, inverted index will not be created. 
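+    /// Creation can also be disabled per flush or compaction through the
+    /// engine's `InvertedIndexConfig`.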
-    pub inverted_index_options: Option<InvertedIndexCreateOptions>,
+    pub inverted_index: Option<InvertedIndexOptions>,
 }
 
 impl Default for WriteOptions {
@@ -51,16 +51,16 @@ impl Default for WriteOptions {
         WriteOptions {
             write_buffer_size: DEFAULT_WRITE_BUFFER_SIZE,
             row_group_size: DEFAULT_ROW_GROUP_SIZE,
-            inverted_index_options: Some(InvertedIndexCreateOptions::default()),
+            inverted_index: Some(InvertedIndexOptions::default()),
         }
     }
 }
 
 #[derive(Debug, Default)]
-pub struct InvertedIndexCreateOptions {
+pub struct InvertedIndexOptions {
     /// The memory usage threshold for inverted index creation.
     /// Set to non-none value to enable external sort during inverted index creation
-    pub memory_usage_threshold: Option<usize>,
+    pub creation_memory_usage_threshold: Option<usize>,
 }
 
 /// Parquet SST info returned by the writer.
@@ -125,7 +125,12 @@ mod tests {
             ..Default::default()
         };
 
-        let mut writer = ParquetWriter::new(FILE_DIR.to_string(), handle.file_id(), metadata, object_store.clone());
+        let mut writer = ParquetWriter::new(
+            FILE_DIR.to_string(),
+            handle.file_id(),
+            metadata,
+            object_store.clone(),
+        );
         let info = writer
             .write_all(source, &write_opts)
             .await
@@ -173,7 +178,12 @@ mod tests {
             ..Default::default()
         };
         // Prepare data.
-        let mut writer = ParquetWriter::new(FILE_DIR.to_string(), handle.file_id(), metadata.clone(), object_store.clone());
+        let mut writer = ParquetWriter::new(
+            FILE_DIR.to_string(),
+            handle.file_id(),
+            metadata.clone(),
+            object_store.clone(),
+        );
         writer
             .write_all(source, &write_opts)
             .await
@@ -236,7 +246,12 @@ mod tests {
 
         // write the sst file and get sst info
         // sst info contains the parquet metadata, which is converted from FileMetaData
-        let mut writer = ParquetWriter::new(FILE_DIR.to_string(), handle.file_id(), metadata.clone(), object_store.clone());
+        let mut writer = ParquetWriter::new(
+            FILE_DIR.to_string(),
+            handle.file_id(),
+            metadata.clone(),
+            object_store.clone(),
+        );
         let sst_info = writer
             .write_all(source, &write_opts)
             .await
diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs
index 99d4456c4a98..efc1a61c03fd 100644
--- a/src/mito2/src/sst/parquet/writer.rs
+++ b/src/mito2/src/sst/parquet/writer.rs
@@ -229,7 +229,7 @@ impl IndexCreator {
         object_store: ObjectStore,
         opts: &WriteOptions,
     ) -> Self {
-        let Some(option) = &opts.inverted_index_options else {
+        let Some(option) = &opts.inverted_index else {
             debug!(
                 "Skip creating index due to config, region_id: {}, file_id: {}",
                 metadata.region_id, file_id,
             );
@@ -258,7 +258,7 @@ impl IndexCreator {
             file_id,
             metadata,
             object_store,
-            option.memory_usage_threshold,
+            option.creation_memory_usage_threshold,
             row_group_size,
         );

From 0351d2853e897cfc29216814bbdadccf0dc7dd0a Mon Sep 17 00:00:00 2001
From: Zhenchi
Date: Fri, 5 Jan 2024 07:06:06 +0000
Subject: [PATCH 25/27] style

Signed-off-by: Zhenchi
---
 src/mito2/src/sst/parquet/writer.rs | 60 ++++++++++++++---------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/src/mito2/src/sst/parquet/writer.rs b/src/mito2/src/sst/parquet/writer.rs
index efc1a61c03fd..547bfc46202c 100644
--- a/src/mito2/src/sst/parquet/writer.rs
+++ b/src/mito2/src/sst/parquet/writer.rs
@@ -100,7 +100,7 @@ impl ParquetWriter {
             .context(WriteBufferSnafu)?;
 
         let mut stats = SourceStats::default();
-        let mut index_creator = IndexCreator::new(
+        let mut index = Indexer::new(
             self.file_id,
             self.region_dir.clone(),
             &self.metadata,
@@ -108,19 +108,19 @@ impl ParquetWriter {
             opts,
         );
 
-        while let Some(batch) =
-            Self::write_next_batch(&mut source, &write_format, &mut buffered_writer)
-            .or_else(|err| async {
-                index_creator.abort().await;
-                Err(err)
-            })
-            .await?
+        while let Some(batch) = write_next_batch(&mut source, &write_format, &mut buffered_writer)
+            .or_else(|err| async {
+                // Abort index creation if an error occurs.
+                index.abort().await;
+                Err(err)
+            })
+            .await?
         {
             stats.update(&batch);
-            index_creator.update(&batch).await;
+            index.update(&batch).await;
         }
 
-        let inverted_index_available = index_creator.finish().await;
+        let inverted_index_available = index.finish().await;
 
         if stats.num_rows == 0 {
             debug!("No data written, try to stop the writer: {file_path}");
@@ -147,24 +147,6 @@ impl ParquetWriter {
         }))
     }
 
-    async fn write_next_batch(
-        source: &mut Source,
-        write_format: &WriteFormat,
-        buffered_writer: &mut BufferedWriter,
-    ) -> Result<Option<Batch>> {
-        let Some(batch) = source.next_batch().await? else {
-            return Ok(None);
-        };
-
-        let arrow_batch = write_format.convert_batch(&batch)?;
-        buffered_writer
-            .write(&arrow_batch)
-            .await
-            .context(WriteBufferSnafu)?;
-
-        Ok(Some(batch))
-    }
-
     /// Customizes per-column config according to schema and maybe column cardinality.
     fn customize_column_config(
         builder: WriterPropertiesBuilder,
@@ -185,6 +167,24 @@
     }
 }
 
+async fn write_next_batch(
+    source: &mut Source,
+    write_format: &WriteFormat,
+    buffered_writer: &mut BufferedWriter,
+) -> Result<Option<Batch>> {
+    let Some(batch) = source.next_batch().await? else {
+        return Ok(None);
+    };
+
+    let arrow_batch = write_format.convert_batch(&batch)?;
+    buffered_writer
+        .write(&arrow_batch)
+        .await
+        .context(WriteBufferSnafu)?;
+
+    Ok(Some(batch))
+}
+
 #[derive(Default)]
 struct SourceStats {
     /// Number of rows fetched.
@@ -215,13 +215,13 @@ impl SourceStats {
 }
 
 #[derive(Default)]
-struct IndexCreator {
+struct Indexer {
     file_id: FileId,
     region_id: RegionId,
     inner: Option<SstIndexCreator>,
 }
 
-impl IndexCreator {
+impl Indexer {
     fn new(
         file_id: FileId,
         region_dir: String,

From 8633027ee4a1db27f081bdbebbd5b30b1d2a93c4 Mon Sep 17 00:00:00 2001
From: Zhenchi
Date: Fri, 5 Jan 2024 09:00:38 +0000
Subject: [PATCH 26/27] typos

Signed-off-by: Zhenchi
---
 src/mito2/src/sst/index/creator.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mito2/src/sst/index/creator.rs b/src/mito2/src/sst/index/creator.rs
index daeeb049d744..a7632d6473a7 100644
--- a/src/mito2/src/sst/index/creator.rs
+++ b/src/mito2/src/sst/index/creator.rs
@@ -165,7 +165,7 @@ impl SstIndexCreator {
             IndexValueCodec::encode_value(value.as_value_ref(), field, &mut self.value_buf)?;
         }
 
-        // non-null value -> Some(enocded_bytes), null value -> None
+        // non-null value -> Some(encoded_bytes), null value -> None
         let v = value.is_some().then_some(self.value_buf.as_slice());
         self.index_creator
             .push_with_name_n(column_name, v, n)

From 384632f434386d89a3cca3b59ba25ecca832eb47 Mon Sep 17 00:00:00 2001
From: Zhenchi
Date: Wed, 10 Jan 2024 09:17:57 +0000
Subject: [PATCH 27/27] tiny refinement

Signed-off-by: Zhenchi
---
 src/mito2/src/access_layer.rs      | 3 +--
 src/mito2/src/cache/write_cache.rs | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mito2/src/access_layer.rs b/src/mito2/src/access_layer.rs
index bb025d2638d7..1699c6880462 100644
--- a/src/mito2/src/access_layer.rs
+++ b/src/mito2/src/access_layer.rs
@@ -96,7 +96,6 @@ impl AccessLayer {
         request: SstWriteRequest,
         write_opts: &WriteOptions,
     ) -> Result<Option<SstInfo>> {
-        let file_id = request.file_id;
         let region_id = request.metadata.region_id;
 
         let sst_info = if let Some(write_cache) = request.cache_manager.write_cache() {
@@ -118,7 +117,7 @@
             // 
Write cache is disabled. let mut writer = ParquetWriter::new( self.region_dir.clone(), - file_id, + request.file_id, request.metadata, self.object_store.clone(), ); diff --git a/src/mito2/src/cache/write_cache.rs b/src/mito2/src/cache/write_cache.rs index 83068f727be4..6d70afbe7b14 100644 --- a/src/mito2/src/cache/write_cache.rs +++ b/src/mito2/src/cache/write_cache.rs @@ -80,7 +80,7 @@ impl WriteCache { // TODO(yingwen): Write to the local store and then upload. // Now we write to the remote and ignore local cache. let mut writer = ParquetWriter::new( - request.region_dir.clone(), + request.region_dir, request.file_id, request.metadata, request.remote_store,