Skip to content

Commit

Permalink
feat(inverted_index): inverted index cache (#4309)
Browse files Browse the repository at this point in the history
* feat/inverted-index-cache:
 Update dependencies and add caching for inverted index reader

 - Updated `atomic` to 0.6.0 and `uuid` to 1.9.1 in `Cargo.lock`.
 - Added `moka` and `uuid` dependencies in `Cargo.toml`.
 - Introduced `seek_read` method in `InvertedIndexBlobReader` for common seek and read operations.
 - Added `cache.rs` module to implement caching for inverted index reader using `moka`.
 - Updated `async-compression` to 0.4.11 in `puffin/Cargo.toml`.

* feat/inverted-index-cache:
 Refactor InvertedIndexReader and Add Index Cache Support

 - Refactored `InvertedIndexReader` to include `seek_read` method and default implementations for `fst` and `bitmap`.
 - Implemented `seek_read` in `InvertedIndexBlobReader` and `CachedInvertedIndexBlobReader`.
 - Introduced `InvertedIndexCache` in `CacheManager` and `SstIndexApplier`.
 - Updated `SstIndexApplierBuilder` to accept and utilize `InvertedIndexCache`.
 - Added `From<FileId> for Uuid` implementation.

* feat/inverted-index-cache:
 Update Cargo.toml and refactor SstIndexApplier

 - Moved `uuid.workspace` entry in Cargo.toml for better organization.

* feat/inverted-index-cache:
 Refactor InvertedIndexCache to use type alias for Arc

 - Replaced `Arc<InvertedIndexCache>` with `InvertedIndexCacheRef` type alias.

* feat/inverted-index-cache:
 Add Prometheus metrics and caching improvements for inverted index

 - Introduced `prometheus` and `puffin` dependencies for metrics.

* feat/inverted-index-cache:
 Refactor InvertedIndexReader and Cache handling

 - Simplified `InvertedIndexReader` trait by removing seek-related comments.

* feat/inverted-index-cache:
 Add configurable cache sizes for inverted index metadata and content
 - Introduced `index_metadata_size` and `index_content_size` in `CacheManagerBuilder`.

* feat/inverted-index-cache:
 Refactor and optimize inverted index caching

 - Removed `metrics.rs` and integrated cache metrics into `index.rs`.

* feat/inverted-index-cache:
 Remove unused dependencies from Cargo.lock and Cargo.toml

 - Removed `moka`, `prometheus`, and `puffin` dependencies from both Cargo.lock and Cargo.toml.

* feat/inverted-index-cache:
 Replace Uuid with FileId in CachedInvertedIndexBlobReader

 - Updated `file_id` type from `Uuid` to `FileId` in `CachedInvertedIndexBlobReader` and related methods.

* feat/inverted-index-cache:
 Refactor cache configuration for inverted index

 - Moved `inverted_index_metadata_cache_size` and `inverted_index_cache_size` from `MitoConfig` to `InvertedIndexConfig`.

* feat/inverted-index-cache:
 Remove unnecessary conversion of `file_id` in `SstIndexApplier`

 - Simplified the initialization of `CachedInvertedIndexBlobReader` by removing the redundant `into()` conversion for `file_id`.
  • Loading branch information
v0y4g3r authored Jul 8, 2024
1 parent 4811fe8 commit aa4d10e
Show file tree
Hide file tree
Showing 20 changed files with 385 additions and 50 deletions.
16 changes: 10 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ snafu.workspace = true
tantivy = { version = "0.22", features = ["zstd-compression"] }
tantivy-jieba = "0.11.0"
tokio.workspace = true
uuid.workspace = true

[dev-dependencies]
common-test-util.workspace = true
Expand Down
32 changes: 23 additions & 9 deletions src/index/src/inverted_index/format/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,27 +12,41 @@
// See the License for the specific language governing permissions and
// limitations under the License.

mod blob;
mod footer;
use std::sync::Arc;

use async_trait::async_trait;
use common_base::BitVec;
use greptime_proto::v1::index::InvertedIndexMetas;
use snafu::ResultExt;

use crate::inverted_index::error::Result;
use crate::inverted_index::error::{DecodeFstSnafu, Result};
pub use crate::inverted_index::format::reader::blob::InvertedIndexBlobReader;
use crate::inverted_index::FstMap;

mod blob;
mod footer;

/// InvertedIndexReader defines an asynchronous reader of inverted index data
#[mockall::automock]
#[async_trait]
pub trait InvertedIndexReader: Send {
/// Retrieve metadata of all inverted indices stored within the blob.
async fn metadata(&mut self) -> Result<InvertedIndexMetas>;
/// Reads all data to dest.
async fn read_all(&mut self, dest: &mut Vec<u8>) -> Result<usize>;

/// Seeks to given offset and reads data with exact size as provided.
async fn seek_read(&mut self, offset: u64, size: u32) -> Result<Vec<u8>>;

/// Retrieves metadata of all inverted indices stored within the blob.
async fn metadata(&mut self) -> Result<Arc<InvertedIndexMetas>>;

/// Retrieve the finite state transducer (FST) map from the given offset and size.
async fn fst(&mut self, offset: u64, size: u32) -> Result<FstMap>;
/// Loads the finite state transducer (FST) map stored in the `size` bytes that start at `offset`.
///
/// The raw bytes are obtained through `seek_read`; a failure to decode them is
/// reported with the `DecodeFstSnafu` context.
async fn fst(&mut self, offset: u64, size: u32) -> Result<FstMap> {
    self.seek_read(offset, size)
        .await
        .and_then(|raw| FstMap::new(raw).context(DecodeFstSnafu))
}

/// Retrieve the bitmap from the given offset and size.
async fn bitmap(&mut self, offset: u64, size: u32) -> Result<BitVec>;
/// Loads the bitmap stored in the `size` bytes that start at `offset`.
///
/// The raw bytes are obtained through `seek_read` and wrapped into a `BitVec` as-is.
async fn bitmap(&mut self, offset: u64, size: u32) -> Result<BitVec> {
    let raw = self.seek_read(offset, size).await?;
    Ok(BitVec::from_vec(raw))
}
}
40 changes: 17 additions & 23 deletions src/index/src/inverted_index/format/reader/blob.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,16 @@
// limitations under the License.

use std::io::SeekFrom;
use std::sync::Arc;

use async_trait::async_trait;
use common_base::BitVec;
use futures::{AsyncRead, AsyncReadExt, AsyncSeek, AsyncSeekExt};
use greptime_proto::v1::index::InvertedIndexMetas;
use snafu::{ensure, ResultExt};

use crate::inverted_index::error::{
DecodeFstSnafu, ReadSnafu, Result, SeekSnafu, UnexpectedBlobSizeSnafu,
};
use crate::inverted_index::error::{ReadSnafu, Result, SeekSnafu, UnexpectedBlobSizeSnafu};
use crate::inverted_index::format::reader::footer::InvertedIndeFooterReader;
use crate::inverted_index::format::reader::{FstMap, InvertedIndexReader};
use crate::inverted_index::format::reader::InvertedIndexReader;
use crate::inverted_index::format::MIN_BLOB_SIZE;

/// Inverted index blob reader, implements [`InvertedIndexReader`]
Expand Down Expand Up @@ -52,35 +50,31 @@ impl<R> InvertedIndexBlobReader<R> {

#[async_trait]
impl<R: AsyncRead + AsyncSeek + Unpin + Send> InvertedIndexReader for InvertedIndexBlobReader<R> {
async fn metadata(&mut self) -> Result<InvertedIndexMetas> {
let end = SeekFrom::End(0);
let blob_size = self.source.seek(end).await.context(SeekSnafu)?;
Self::validate_blob_size(blob_size)?;

let mut footer_reader = InvertedIndeFooterReader::new(&mut self.source, blob_size);
footer_reader.metadata().await
}

async fn fst(&mut self, offset: u64, size: u32) -> Result<FstMap> {
async fn read_all(&mut self, dest: &mut Vec<u8>) -> Result<usize> {
self.source
.seek(SeekFrom::Start(offset))
.seek(SeekFrom::Start(0))
.await
.context(SeekSnafu)?;
let mut buf = vec![0u8; size as usize];
self.source.read_exact(&mut buf).await.context(ReadSnafu)?;

FstMap::new(buf).context(DecodeFstSnafu)
self.source.read_to_end(dest).await.context(ReadSnafu)
}

async fn bitmap(&mut self, offset: u64, size: u32) -> Result<BitVec> {
/// Seeks to `offset` (from the start of the source) and reads exactly `size` bytes.
///
/// # Errors
///
/// Fails with the `SeekSnafu` context if the seek fails, and with the `ReadSnafu`
/// context if the source cannot supply `size` bytes from that position.
async fn seek_read(&mut self, offset: u64, size: u32) -> Result<Vec<u8>> {
    self.source
        .seek(SeekFrom::Start(offset))
        .await
        .context(SeekSnafu)?;
    let mut buf = vec![0u8; size as usize];
    // Use `read_exact`, not `read`: a single `read` may return fewer bytes than
    // requested, which would hand callers a buffer with a zero-filled tail.
    self.source.read_exact(&mut buf).await.context(ReadSnafu)?;
    Ok(buf)
}

async fn metadata(&mut self) -> Result<Arc<InvertedIndexMetas>> {
let end = SeekFrom::End(0);
let blob_size = self.source.seek(end).await.context(SeekSnafu)?;
Self::validate_blob_size(blob_size)?;

Ok(BitVec::from_vec(buf))
let mut footer_reader = InvertedIndeFooterReader::new(&mut self.source, blob_size);
footer_reader.metadata().await.map(Arc::new)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ impl TryFrom<Vec<(String, Vec<Predicate>)>> for PredicatesIndexApplier {

#[cfg(test)]
mod tests {
use std::sync::Arc;

use common_base::bit_vec::prelude::*;
use greptime_proto::v1::index::InvertedIndexMeta;

Expand All @@ -161,7 +163,7 @@ mod tests {
s.to_owned()
}

fn mock_metas(tags: impl IntoIterator<Item = (&'static str, u32)>) -> InvertedIndexMetas {
fn mock_metas(tags: impl IntoIterator<Item = (&'static str, u32)>) -> Arc<InvertedIndexMetas> {
let mut metas = InvertedIndexMetas {
total_row_count: 8,
segment_row_count: 1,
Expand All @@ -175,7 +177,7 @@ mod tests {
};
metas.metas.insert(s(tag), meta);
}
metas
Arc::new(metas)
}

fn key_fst_applier(value: &'static str) -> Box<dyn FstApplier> {
Expand Down Expand Up @@ -300,11 +302,11 @@ mod tests {
async fn test_index_applier_with_empty_index() {
let mut mock_reader = MockInvertedIndexReader::new();
mock_reader.expect_metadata().returning(move || {
Ok(InvertedIndexMetas {
Ok(Arc::new(InvertedIndexMetas {
total_row_count: 0, // No rows
segment_row_count: 1,
..Default::default()
})
}))
});

let mut mock_fst_applier = MockFstApplier::new();
Expand Down
25 changes: 25 additions & 0 deletions src/mito2/src/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
mod cache_size;

pub(crate) mod file_cache;
pub(crate) mod index;
#[cfg(test)]
pub(crate) mod test_util;
pub(crate) mod write_cache;
Expand All @@ -33,6 +34,7 @@ use store_api::storage::{ConcreteDataType, RegionId};

use crate::cache::cache_size::parquet_meta_size;
use crate::cache::file_cache::{FileType, IndexKey};
use crate::cache::index::{InvertedIndexCache, InvertedIndexCacheRef};
use crate::cache::write_cache::WriteCacheRef;
use crate::metrics::{CACHE_BYTES, CACHE_HIT, CACHE_MISS};
use crate::sst::file::FileId;
Expand All @@ -59,6 +61,8 @@ pub struct CacheManager {
page_cache: Option<PageCache>,
/// A Cache for writing files to object stores.
write_cache: Option<WriteCacheRef>,
/// Cache for inverted index.
index_cache: Option<InvertedIndexCacheRef>,
}

pub type CacheManagerRef = Arc<CacheManager>;
Expand Down Expand Up @@ -167,6 +171,10 @@ impl CacheManager {
pub(crate) fn write_cache(&self) -> Option<&WriteCacheRef> {
self.write_cache.as_ref()
}

/// Returns a reference to the inverted index cache, or `None` if no cache is set.
pub(crate) fn index_cache(&self) -> Option<&InvertedIndexCacheRef> {
    Option::as_ref(&self.index_cache)
}
}

/// Builder to construct a [CacheManager].
Expand All @@ -175,6 +183,8 @@ pub struct CacheManagerBuilder {
sst_meta_cache_size: u64,
vector_cache_size: u64,
page_cache_size: u64,
index_metadata_size: u64,
index_content_size: u64,
write_cache: Option<WriteCacheRef>,
}

Expand Down Expand Up @@ -203,6 +213,18 @@ impl CacheManagerBuilder {
self
}

/// Sets the cache size, in bytes, used for inverted index metadata.
///
/// Consumes the builder and returns it so calls can be chained.
pub fn index_metadata_size(self, bytes: u64) -> Self {
    let mut builder = self;
    builder.index_metadata_size = bytes;
    builder
}

/// Sets the cache size, in bytes, used for inverted index content.
///
/// Consumes the builder and returns it so calls can be chained.
pub fn index_content_size(self, bytes: u64) -> Self {
    let mut builder = self;
    builder.index_content_size = bytes;
    builder
}

/// Builds the [CacheManager].
pub fn build(self) -> CacheManager {
let sst_meta_cache = (self.sst_meta_cache_size != 0).then(|| {
Expand Down Expand Up @@ -240,11 +262,14 @@ impl CacheManagerBuilder {
.build()
});

let inverted_index_cache =
InvertedIndexCache::new(self.index_metadata_size, self.index_content_size);
CacheManager {
sst_meta_cache,
vector_cache,
page_cache,
write_cache: self.write_cache,
index_cache: Some(Arc::new(inverted_index_cache)),
}
}
}
Expand Down
Loading

0 comments on commit aa4d10e

Please sign in to comment.