Skip to content

Commit

Permalink
feat(inverted_index.create): add external sorter (#2950)
Browse files Browse the repository at this point in the history
* feat(inverted_index.create): add read/write for external intermediate files

Signed-off-by: Zhenchi <[email protected]>

* chore: MAGIC_CODEC_V1 -> CODEC_V1_MAGIC

Signed-off-by: Zhenchi <[email protected]>

* chore: polish comments

Signed-off-by: Zhenchi <[email protected]>

* chore: fix typos intermedia -> intermediate

Signed-off-by: Zhenchi <[email protected]>

* fix: typos

Signed-off-by: Zhenchi <[email protected]>

* feat(inverted_index.create): add external sorter

Signed-off-by: Zhenchi <[email protected]>

* chore: fix typos intermedia -> intermediate

Signed-off-by: Zhenchi <[email protected]>

* chore: polish comments

Signed-off-by: Zhenchi <[email protected]>

* chore: polish comments

Signed-off-by: Zhenchi <[email protected]>

* refactor: drop the stream as early as possible to avoid recursive calls to poll

Signed-off-by: Zhenchi <[email protected]>

* refactor: project merge sorted stream

Signed-off-by: Zhenchi <[email protected]>

* feat: add total_row_count to SortOutput

Signed-off-by: Zhenchi <[email protected]>

* feat: remove change of format

Signed-off-by: Zhenchi <[email protected]>

* refactor: rename segment null bitmap

Signed-off-by: Zhenchi <[email protected]>

* refactor: test type alias

Signed-off-by: Zhenchi <[email protected]>

* feat: allow `memory_usage_threshold` to be None to turn off dumping

Signed-off-by: Zhenchi <[email protected]>

* feat: change segment_row_count type to NonZeroUsize

Signed-off-by: Zhenchi <[email protected]>

* refactor: accept BytesRef instead

Signed-off-by: Zhenchi <[email protected]>

* feat: add `push_n` to adapt mito2

Signed-off-by: Zhenchi <[email protected]>

* chore: add k-way merge TODO

Signed-off-by: Zhenchi <[email protected]>

* refactor: more sorter cases

Signed-off-by: Zhenchi <[email protected]>

* refactor: make the merge tree balance

Signed-off-by: Zhenchi <[email protected]>

* Update src/index/src/inverted_index/create/sort/external_sort.rs

Co-authored-by: Yingwen <[email protected]>

* chore: address comments

Signed-off-by: Zhenchi <[email protected]>

* chore: stable feature

Signed-off-by: Zhenchi <[email protected]>

---------

Signed-off-by: Zhenchi <[email protected]>
Co-authored-by: Yingwen <[email protected]>
  • Loading branch information
zhongzc and evenyag authored Dec 19, 2023
1 parent 6b8dbcf commit 83de399
Show file tree
Hide file tree
Showing 7 changed files with 693 additions and 3 deletions.
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions src/index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,18 @@ bytes.workspace = true
common-base.workspace = true
common-error.workspace = true
common-macro.workspace = true
common-telemetry.workspace = true
fst.workspace = true
futures.workspace = true
greptime-proto.workspace = true
mockall.workspace = true
pin-project.workspace = true
prost.workspace = true
regex-automata.workspace = true
regex.workspace = true
snafu.workspace = true

[dev-dependencies]
rand.workspace = true
tokio-util.workspace = true
tokio.workspace = true
1 change: 1 addition & 0 deletions src/index/src/inverted_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ pub mod search;

pub type FstMap = fst::Map<Vec<u8>>;
pub type Bytes = Vec<u8>;
pub type BytesRef<'a> = &'a [u8];
39 changes: 36 additions & 3 deletions src/index/src/inverted_index/create/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,46 @@
// See the License for the specific language governing permissions and
// limitations under the License.

mod external_provider;
mod external_sort;
mod intermediate_rw;
mod merge_stream;

use async_trait::async_trait;
use common_base::BitVec;
use futures::Stream;

use crate::inverted_index::error::Result;
use crate::inverted_index::Bytes;

mod intermediate_rw;
use crate::inverted_index::{Bytes, BytesRef};

/// A stream of sorted values along with their associated bitmap.
///
/// Each item yielded by the stream is a `Result` wrapping a `(Bytes, BitVec)` pair:
/// the value itself and the bitmap associated with that value. The stream is boxed as
/// a trait object and is required to be `Send + Unpin`.
///
/// NOTE(review): the bitmap presumably marks the segments in which the value occurs;
/// confirm against the sorter implementations.
pub type SortedStream = Box<dyn Stream<Item = Result<(Bytes, BitVec)>> + Send + Unpin>;

/// Output of a sorting operation.
///
/// Bundles three things: a bitmap describing where null values occurred, a stream of
/// the sorted items, and the total number of rows in the sorted data.
pub struct SortOutput {
/// Bitmap indicating which segments have null values.
pub segment_null_bitmap: BitVec,

/// Stream of sorted items; see [`SortedStream`] for the item type.
pub sorted_stream: SortedStream,

/// Total number of rows in the sorted data.
///
/// NOTE(review): whether rows holding null values are included in this count is not
/// visible from this definition; confirm against the sorter implementations.
pub total_row_count: usize,
}

/// Handles data sorting, supporting incremental input and retrieval of sorted output.
///
/// Values are fed in through [`Sorter::push`] / [`Sorter::push_n`] and the sorted result
/// is obtained from [`Sorter::output`]. Implementors only have to provide `push_n` and
/// `output`; `push` has a default implementation.
#[async_trait]
pub trait Sorter: Send {
/// Inputs a single non-null or null value into the sorter.
///
/// - `value`: `Some(bytes)` for a non-null value, `None` for a null value
///
/// The default implementation forwards to `push_n` with `n = 1`; overriding
/// implementations should stay equivalent to that.
async fn push(&mut self, value: Option<BytesRef<'_>>) -> Result<()> {
self.push_n(value, 1).await
}

/// Pushes `n` identical non-null or null values into the sorter.
/// Should be equivalent to calling `push` `n` times.
///
/// - `value`: `Some(bytes)` for a non-null value, `None` for a null value
/// - `n`: how many times the value is repeated
///
/// Error conditions are defined by the implementation.
async fn push_n(&mut self, value: Option<BytesRef<'_>>, n: usize) -> Result<()>;

/// Completes the sorting process and returns the sorted data as a [`SortOutput`].
///
/// Error conditions are defined by the implementation.
async fn output(&mut self) -> Result<SortOutput>;
}
39 changes: 39 additions & 0 deletions src/index/src/inverted_index/create/sort/external_provider.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use async_trait::async_trait;
use futures::{AsyncRead, AsyncWrite};

use crate::inverted_index::error::Result;

/// Trait for managing intermediate files during external sorting for a particular index.
///
/// Implementations must be `Send + Sync`. Files are handed out as boxed `AsyncWrite`
/// writers when created and as boxed `AsyncRead` readers when read back.
// `automock` generates a mock implementation of this trait (named
// `MockExternalTempFileProvider` per mockall's naming convention) for use in tests.
#[mockall::automock]
#[async_trait]
pub trait ExternalTempFileProvider: Send + Sync {
/// Creates and opens a new intermediate file associated with a specific index for writing.
/// The implementation should ensure that the file does not already exist.
///
/// - `index_name`: the name of the index for which the file will be associated
/// - `file_id`: a unique identifier for the new file
///
/// Returns a writer for the newly created file. Error conditions are defined by the
/// implementation.
async fn create(
&self,
index_name: &str,
file_id: &str,
) -> Result<Box<dyn AsyncWrite + Unpin + Send>>;

/// Retrieves all intermediate files associated with a specific index for an external sorting operation.
///
/// - `index_name`: the name of the index to retrieve intermediate files for
///
/// Returns one reader per intermediate file. Error conditions are defined by the
/// implementation.
///
/// NOTE(review): no ordering of the returned readers is specified here; confirm with
/// the implementations before relying on one.
async fn read_all(&self, index_name: &str) -> Result<Vec<Box<dyn AsyncRead + Unpin + Send>>>;
}
Loading

0 comments on commit 83de399

Please sign in to comment.