Skip to content

Commit

Permalink
feat(inverted_index): add index reader (#2803)
Browse files Browse the repository at this point in the history
* feat(inverted_index): add reader

Signed-off-by: Zhenchi <[email protected]>

* fix: toml format

Signed-off-by: Zhenchi <[email protected]>

* chore: add prefix relative_ to the offset parameter

Signed-off-by: Zhenchi <[email protected]>

* docs: add doc comment

Signed-off-by: Zhenchi <[email protected]>

* chore: update proto

Signed-off-by: Zhenchi <[email protected]>

* fix: outdated docs

Signed-off-by: Zhenchi <[email protected]>

---------

Signed-off-by: Zhenchi <[email protected]>
  • Loading branch information
zhongzc authored Nov 27, 2023
1 parent 9e58bba commit b3edbef
Show file tree
Hide file tree
Showing 11 changed files with 699 additions and 2 deletions.
25 changes: 24 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ members = [
"src/sql",
"src/store-api",
"src/table",
"src/index",
"tests-integration",
"tests/runner",
]
Expand Down Expand Up @@ -83,9 +84,10 @@ datafusion-sql = { git = "https://github.com/apache/arrow-datafusion.git", rev =
datafusion-substrait = { git = "https://github.com/apache/arrow-datafusion.git", rev = "26e43acac3a96cec8dd4c8365f22dfb1a84306e9" }
derive_builder = "0.12"
etcd-client = "0.12"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "a11efce55d8ce20257e08842e4f4c1c8fce2b3a8" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "2b3ae45740a49ec6a0830d71fc09c3093aeb5fe7" }
humantime-serde = "1.1"
itertools = "0.10"
lazy_static = "1.4"
Expand Down
20 changes: 20 additions & 0 deletions src/index/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[package]
name = "index"
version.workspace = true
edition.workspace = true
license.workspace = true

[dependencies]
async-trait.workspace = true
common-base.workspace = true
common-error.workspace = true
common-macro.workspace = true
fst.workspace = true
futures.workspace = true
greptime-proto.workspace = true
prost.workspace = true
snafu.workspace = true

[dev-dependencies]
tokio-util.workspace = true
tokio.workspace = true
16 changes: 16 additions & 0 deletions src/index/src/inverted_index.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

pub mod error;
pub mod format;
99 changes: 99 additions & 0 deletions src/index/src/inverted_index/error.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use std::any::Any;
use std::io::Error as IoError;

use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use snafu::{Location, Snafu};

/// Errors produced while reading an inverted index blob.
///
/// Every variant carries a `location` field so that the place where the error
/// was raised is recorded alongside the error itself.
#[derive(Snafu)]
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
    /// A seek operation failed with an I/O error.
    #[snafu(display("Failed to seek"))]
    Seek {
        #[snafu(source)]
        error: IoError,
        location: Location,
    },

    /// A read operation failed with an I/O error.
    #[snafu(display("Failed to read"))]
    Read {
        #[snafu(source)]
        error: IoError,
        location: Location,
    },

    /// The blob size is not acceptable; reports the minimum and the actual size.
    #[snafu(display(
        "Unexpected inverted index blob size, min: {min_blob_size}, actual: {actual_blob_size}"
    ))]
    UnexpectedBlobSize {
        min_blob_size: u64,
        actual_blob_size: u64,
        location: Location,
    },

    /// The footer payload size is not acceptable; reports the maximum and the actual size.
    #[snafu(display("Unexpected inverted index footer payload size, max: {max_payload_size}, actual: {actual_payload_size}"))]
    UnexpectedFooterPayloadSize {
        max_payload_size: u64,
        actual_payload_size: u64,
        location: Location,
    },

    /// An offset/size pair is not acceptable for the given blob and payload sizes.
    ///
    /// NOTE(review): presumably raised when the described range does not fit inside
    /// the blob — confirm against the reader implementation.
    #[snafu(display("Unexpected inverted index offset size, offset: {offset}, size: {size}, blob_size: {blob_size}, payload_size: {payload_size}"))]
    UnexpectedOffsetSize {
        offset: u64,
        size: u64,
        blob_size: u64,
        payload_size: u64,
        // Added for consistency with the other variants, which all record where
        // the error was raised.
        location: Location,
    },

    /// Decoding an FST failed.
    #[snafu(display("Failed to decode fst"))]
    DecodeFst {
        #[snafu(source)]
        error: fst::Error,
        location: Location,
    },

    /// Decoding a protobuf message failed.
    #[snafu(display("Failed to decode protobuf"))]
    DecodeProto {
        #[snafu(source)]
        error: prost::DecodeError,
        location: Location,
    },
}

impl ErrorExt for Error {
fn status_code(&self) -> StatusCode {
use Error::*;
match self {
Seek { .. }
| Read { .. }
| UnexpectedFooterPayloadSize { .. }
| UnexpectedOffsetSize { .. }
| UnexpectedBlobSize { .. }
| DecodeProto { .. }
| DecodeFst { .. } => StatusCode::Unexpected,
}
}

fn as_any(&self) -> &dyn Any {
self
}
}

/// Result alias whose error type is this module's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
56 changes: 56 additions & 0 deletions src/index/src/inverted_index/format.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! # SST Files with Inverted Index Format Specification
//!
//! ## File Structure
//!
//! Each SST file includes a series of inverted indices followed by a footer.
//!
//! `inverted_index₀ inverted_index₁ ... inverted_indexₙ footer`
//!
//! - Each `inverted_indexᵢ` represents an index entry corresponding to tag values and their locations within the file.
//! - `footer`: Contains metadata about the inverted indices, encoded as a protobuf message.
//!
//! ## Inverted Index Internals
//!
//! An inverted index comprises a collection of bitmaps, a null bitmap, and a finite state transducer (FST) indicating tag values' positions:
//!
//! `bitmap₀ bitmap₁ bitmap₂ ... bitmapₙ null_bitmap fst`
//!
//! - `bitmapᵢ`: Bitset indicating the presence of tag values within a row group.
//! - `null_bitmap`: Bitset tracking the presence of null values within the tag column.
//! - `fst`: Finite State Transducer providing an ordered map of bytes, representing the tag values.
//!
//! ## Footer Details
//!
//! The footer encapsulates the metadata of the inverted indices:
//!
//! `footer_payload footer_payload_size`
//!
//! - `footer_payload`: Protobuf-encoded [`InvertedIndexMetas`] describing the metadata of each inverted index.
//! - `footer_payload_size`: Size in bytes of the `footer_payload`, stored as a `u32` integer.
//! - The footer aids in the interpretation of the inverted indices, providing necessary offset and count information.
//!
//! ## Reference
//!
//! More detailed information regarding the encoding of the inverted indices can be found in the [RFC].
//!
//! [`InvertedIndexMetas`]: https://github.com/GreptimeTeam/greptime-proto/blob/2aaee38de81047537dfa42af9df63bcfb866e06c/proto/greptime/v1/index/inverted_index.proto#L32-L64
//! [RFC]: https://github.com/GreptimeTeam/greptimedb/blob/develop/docs/rfcs/2023-11-03-inverted-index.md
pub mod reader;

/// Size in bytes of the `footer_payload_size` field, which the format
/// specification above describes as a `u32`.
const FOOTER_PAYLOAD_SIZE_SIZE: u64 = 4;
/// Minimum blob size: the size of the `footer_payload_size` field alone.
// NOTE(review): presumably blobs smaller than this are rejected by the reader — confirm.
const MIN_BLOB_SIZE: u64 = FOOTER_PAYLOAD_SIZE_SIZE;
43 changes: 43 additions & 0 deletions src/index/src/inverted_index/format/reader.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

mod blob;
mod footer;

use async_trait::async_trait;
use common_base::BitVec;
use fst::Map;
use greptime_proto::v1::index::{InvertedIndexMeta, InvertedIndexMetas};

use crate::inverted_index::error::Result;

/// An [`fst::Map`] whose underlying data is an owned `Vec<u8>`.
pub type FstMap = Map<Vec<u8>>;

/// Asynchronous reader of inverted index data.
///
/// Every method takes `&mut self` and returns the inverted index [`Result`],
/// so failures surface as the module's error type.
#[async_trait]
pub trait InvertedIndexReader {
/// Retrieves the metadata of all inverted indices stored within the blob.
async fn metadata(&mut self) -> Result<InvertedIndexMetas>;

/// Retrieves the finite state transducer (FST) map for the inverted index
/// described by `meta`.
async fn fst(&mut self, meta: &InvertedIndexMeta) -> Result<FstMap>;

/// Retrieves one bitmap of the inverted index described by `meta`.
///
/// - `relative_offset`: offset of the bitmap.
///   NOTE(review): presumably relative to the start of the inverted index
///   that `meta` describes — confirm against the implementation.
/// - `size`: size of the bitmap (presumably in bytes — confirm).
async fn bitmap(
&mut self,
meta: &InvertedIndexMeta,
relative_offset: u32,
size: u32,
) -> Result<BitVec>;
}
Loading

0 comments on commit b3edbef

Please sign in to comment.