feat: Parquet reader builder supports building multiple ranges to read (#3841)

* chore: change `&mut self` to `&self`
* feat: define partition and partition context
* refactor: move precise_filter to PartitionContext
* feat: filter wip
* feat: compute projection and fields in format
* feat: use RowGroupReader to implement ParquetReader
* fix: use expected meta to get column id for filters
* feat: partition returns row group reader
* style: fix clippy
* feat: add build partitions method
* docs: comment
* refactor: rename Partition to FileRange
* chore: address CR comments
* feat: avoid allocating column ids while constructing ReadFormat
GreptimeTeam · May 10, 2024 · 5a0629e · 5a0629e
1 parent 89dbf6d
commit 5a0629e
Show file tree

Hide file tree

Showing 4 changed files with 530 additions and 264 deletions.
diff --git a/src/mito2/src/sst/parquet.rs b/src/mito2/src/sst/parquet.rs
@@ -14,6 +14,7 @@
 
 //! SST in parquet format.
 
+pub(crate) mod file_range;
 mod format;
 pub(crate) mod helper;
 pub(crate) mod metadata;

diff --git a/src/mito2/src/sst/parquet/file_range.rs b/src/mito2/src/sst/parquet/file_range.rs
@@ -0,0 +1,186 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Structs and functions for reading ranges from a parquet file. A file range
+//! is usually a row group in a parquet file.
+
+use std::ops::BitAnd;
+use std::sync::Arc;
+
+use api::v1::SemanticType;
+use datatypes::arrow::array::BooleanArray;
+use datatypes::arrow::buffer::BooleanBuffer;
+use parquet::arrow::arrow_reader::RowSelection;
+use snafu::ResultExt;
+
+use crate::error::{FieldTypeMismatchSnafu, FilterRecordBatchSnafu, Result};
+use crate::read::Batch;
+use crate::row_converter::{McmpRowCodec, RowCodec};
+use crate::sst::parquet::format::ReadFormat;
+use crate::sst::parquet::reader::{RowGroupReader, RowGroupReaderBuilder, SimpleFilterContext};
+
+/// A range of a parquet SST. Currently, a range is a single row group.
+/// Different file ranges can be read in parallel.
+pub struct FileRange {
+    /// Context shared with the other ranges of the same file.
+    context: FileRangeContextRef,
+    /// Index of the row group in the SST.
+    row_group_idx: usize,
+    /// Row selection for the row group. `None` means all rows.
+    row_selection: Option<RowSelection>,
+}
+
+impl FileRange {
+    /// Creates a new [FileRange] for one row group, with an optional row selection.
+    pub(crate) fn new(
+        context: FileRangeContextRef,
+        row_group_idx: usize,
+        row_selection: Option<RowSelection>,
+    ) -> Self {
+        Self {
+            context,
+            row_group_idx,
+            row_selection,
+        }
+    }
+
+    /// Builds a parquet reader for this row group and wraps it in a [RowGroupReader].
+    #[allow(dead_code)]
+    pub(crate) async fn reader(&self) -> Result<RowGroupReader> {
+        let parquet_reader = self
+            .context
+            .reader_builder
+            .build(self.row_group_idx, self.row_selection.clone())
+            .await?;
+
+        Ok(RowGroupReader::new(self.context.clone(), parquet_reader))
+    }
+}
+
+/// Context shared by ranges of the same parquet SST.
+pub(crate) struct FileRangeContext {
+    /// Row group reader builder for the file.
+    reader_builder: RowGroupReaderBuilder,
+    /// Filters pushed down.
+    filters: Vec<SimpleFilterContext>,
+    /// Helper to read the SST.
+    read_format: ReadFormat,
+    /// Decoder for primary keys.
+    codec: McmpRowCodec,
+}
+
+pub(crate) type FileRangeContextRef = Arc<FileRangeContext>;
+
+impl FileRangeContext {
+    /// Creates a new [FileRangeContext].
+    pub(crate) fn new(
+        reader_builder: RowGroupReaderBuilder,
+        filters: Vec<SimpleFilterContext>,
+        read_format: ReadFormat,
+        codec: McmpRowCodec,
+    ) -> Self {
+        Self {
+            reader_builder,
+            filters,
+            read_format,
+            codec,
+        }
+    }
+
+    /// Returns the path of the file to read.
+    pub(crate) fn file_path(&self) -> &str {
+        self.reader_builder.file_path()
+    }
+
+    /// Returns filters pushed down.
+    pub(crate) fn filters(&self) -> &[SimpleFilterContext] {
+        &self.filters
+    }
+
+    /// Returns the format helper.
+    pub(crate) fn read_format(&self) -> &ReadFormat {
+        &self.read_format
+    }
+
+    /// Returns the reader builder.
+    pub(crate) fn reader_builder(&self) -> &RowGroupReaderBuilder {
+        &self.reader_builder
+    }
+
+    /// Tries its best to apply the pushed down filters precisely to the input batch.
+    /// Returns `None` once a tag filter rejects the batch's primary key. Otherwise returns
+    /// the batch filtered by the field and timestamp filters; a field filter is skipped
+    /// when the read format has no index for its column.
+    ///
+    /// Supported filter expr type is defined in [SimpleFilterEvaluator]. A filter on a primary
+    /// key column makes this method decode the primary key into the batch if not yet decoded.
+    pub(crate) fn precise_filter(&self, mut input: Batch) -> Result<Option<Batch>> {
+        let mut mask = BooleanBuffer::new_set(input.num_rows());
+
+        // Run the filters one by one and AND their results into `mask`.
+        // TODO(ruihang): run primary key filter first. It may short circuit other filters
+        for filter in &self.filters {
+            let result = match filter.semantic_type() {
+                SemanticType::Tag => {
+                    let pk_values = if let Some(pk_values) = input.pk_values() {
+                        pk_values
+                    } else {
+                        input.set_pk_values(self.codec.decode(input.primary_key())?);
+                        input.pk_values().unwrap()
+                    };
+                    // Safety: the filter is on a tag, i.e. a primary key column.
+                    let pk_index = self
+                        .read_format
+                        .metadata()
+                        .primary_key_index(filter.column_id())
+                        .unwrap();
+                    let pk_value = pk_values[pk_index]
+                        .try_to_scalar_value(filter.data_type())
+                        .context(FieldTypeMismatchSnafu)?;
+                    if filter
+                        .filter()
+                        .evaluate_scalar(&pk_value)
+                        .context(FilterRecordBatchSnafu)?
+                    {
+                        continue;
+                    } else {
+                        // A primary key mismatch means the entire batch is filtered out.
+                        return Ok(None);
+                    }
+                }
+                SemanticType::Field => {
+                    let Some(field_index) = self.read_format.field_index_by_id(filter.column_id())
+                    else {
+                        continue;
+                    };
+                    let field_col = &input.fields()[field_index].data;
+                    filter
+                        .filter()
+                        .evaluate_vector(field_col)
+                        .context(FilterRecordBatchSnafu)?
+                }
+                SemanticType::Timestamp => filter
+                    .filter()
+                    .evaluate_vector(input.timestamps())
+                    .context(FilterRecordBatchSnafu)?,
+            };
+
+            mask = mask.bitand(&result);
+        }
+
+        input.filter(&BooleanArray::from(mask).into())?;
+
+        Ok(Some(input))
+    }
+}