Skip to content

Commit

Permalink
feat(mito): Cache repeated vector for tags (GreptimeTeam#2523)
Browse files Browse the repository at this point in the history
* feat: add vector_cache to CacheManager

* feat: cache repeated vectors

* feat: skip decoding pk if output doesn't contain tags

* test: add TestRegionMetadataBuilder

* test: test ProjectionMapper

* test: test vector cache

* test: test projection mapper convert

* style: fix clippy

* feat: do not cache vector if it is too large

* docs: update comment
  • Loading branch information
evenyag authored and paomian committed Oct 19, 2023
1 parent a2e09bb commit cfc9689
Show file tree
Hide file tree
Showing 9 changed files with 379 additions and 69 deletions.
68 changes: 62 additions & 6 deletions src/mito2/src/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ pub(crate) mod test_util;
use std::mem;
use std::sync::Arc;

use datatypes::value::Value;
use datatypes::vectors::VectorRef;
use moka::sync::Cache;
use parquet::file::metadata::ParquetMetaData;
use store_api::storage::RegionId;
Expand All @@ -32,13 +34,15 @@ use crate::sst::file::FileId;
pub struct CacheManager {
/// Cache for SST metadata.
sst_meta_cache: Option<SstMetaCache>,
/// Cache for vectors.
vector_cache: Option<VectorCache>,
}

pub type CacheManagerRef = Arc<CacheManager>;

impl CacheManager {
/// Creates a new manager with specific cache size in bytes.
pub fn new(sst_meta_cache_size: u64) -> CacheManager {
pub fn new(sst_meta_cache_size: u64, vector_cache_size: u64) -> CacheManager {
let sst_meta_cache = if sst_meta_cache_size == 0 {
None
} else {
Expand All @@ -51,8 +55,23 @@ impl CacheManager {
.build();
Some(cache)
};
let vector_cache = if vector_cache_size == 0 {
None
} else {
let cache = Cache::builder()
.max_capacity(vector_cache_size)
.weigher(|_k, v: &VectorRef| {
// We ignore the heap size of `Value`.
(mem::size_of::<Value>() + v.memory_size()) as u32
})
.build();
Some(cache)
};

CacheManager { sst_meta_cache }
CacheManager {
sst_meta_cache,
vector_cache,
}
}

/// Gets cached [ParquetMetaData].
Expand Down Expand Up @@ -84,9 +103,23 @@ impl CacheManager {
cache.remove(&SstMetaKey(region_id, file_id));
}
}

/// Gets a vector with repeated value for specific `key`.
pub fn get_repeated_vector(&self, key: &Value) -> Option<VectorRef> {
self.vector_cache
.as_ref()
.and_then(|vector_cache| vector_cache.get(key))
}

/// Puts a vector with repeated value into the cache.
pub fn put_repeated_vector(&self, key: Value, vector: VectorRef) {
if let Some(cache) = &self.vector_cache {
cache.insert(key, vector);
}
}
}

/// Cache key for SST meta.
/// Cache key (region id, file id) for SST meta.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct SstMetaKey(RegionId, FileId);

Expand All @@ -97,28 +130,40 @@ impl SstMetaKey {
}
}

/// Maps (region id, file id) to [ParquetMetaData].
type SstMetaCache = Cache<SstMetaKey, Arc<ParquetMetaData>>;
/// Maps [Value] to a vector that holds this value repeatedly.
///
/// e.g. `"hello" => ["hello", "hello", "hello"]`
type VectorCache = Cache<Value, VectorRef>;

#[cfg(test)]
mod tests {
use datatypes::vectors::Int64Vector;

use super::*;
use crate::cache::test_util::parquet_meta;

#[test]
fn test_disable_meta_cache() {
let cache = CacheManager::new(0);
fn test_disable_cache() {
let cache = CacheManager::new(0, 0);
assert!(cache.sst_meta_cache.is_none());

let region_id = RegionId::new(1, 1);
let file_id = FileId::random();
let metadata = parquet_meta();
cache.put_parquet_meta_data(region_id, file_id, metadata);
assert!(cache.get_parquet_meta_data(region_id, file_id).is_none());

let value = Value::Int64(10);
let vector: VectorRef = Arc::new(Int64Vector::from_slice([10, 10, 10, 10]));
cache.put_repeated_vector(value.clone(), vector.clone());
assert!(cache.get_repeated_vector(&value).is_none());
}

#[test]
fn test_parquet_meta_cache() {
let cache = CacheManager::new(2000);
let cache = CacheManager::new(2000, 0);
let region_id = RegionId::new(1, 1);
let file_id = FileId::random();
assert!(cache.get_parquet_meta_data(region_id, file_id).is_none());
Expand All @@ -128,4 +173,15 @@ mod tests {
cache.remove_parquet_meta_data(region_id, file_id);
assert!(cache.get_parquet_meta_data(region_id, file_id).is_none());
}

#[test]
fn test_repeated_vector_cache() {
let cache = CacheManager::new(0, 4096);
let value = Value::Int64(10);
assert!(cache.get_repeated_vector(&value).is_none());
let vector: VectorRef = Arc::new(Int64Vector::from_slice([10, 10, 10, 10]));
cache.put_repeated_vector(value.clone(), vector.clone());
let cached = cache.get_repeated_vector(&value).unwrap();
assert_eq!(vector, cached);
}
}
5 changes: 4 additions & 1 deletion src/mito2/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,10 @@ pub struct MitoConfig {
pub global_write_buffer_reject_size: ReadableSize,

// Cache configs:
/// Cache size for SST metadata (default 128MB). Setting it to 0 to disable cache.
/// Cache size for SST metadata (default 128MB). Setting it to 0 to disable the cache.
pub sst_meta_cache_size: ReadableSize,
/// Cache size for vectors and arrow arrays (default 512MB). Setting it to 0 to disable the cache.
pub vector_cache_size: ReadableSize,
}

impl Default for MitoConfig {
Expand All @@ -75,6 +77,7 @@ impl Default for MitoConfig {
global_write_buffer_size: ReadableSize::gb(1),
global_write_buffer_reject_size: ReadableSize::gb(2),
sst_meta_cache_size: ReadableSize::mb(128),
vector_cache_size: ReadableSize::mb(512),
}
}
}
Expand Down
52 changes: 7 additions & 45 deletions src/mito2/src/memtable/key_values.rs
Original file line number Diff line number Diff line change
Expand Up @@ -188,59 +188,21 @@ impl ReadRowHelper {
mod tests {
use api::v1;
use api::v1::ColumnDataType;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
use store_api::storage::RegionId;

use super::*;
use crate::test_util::i64_value;
use crate::test_util::meta_util::TestRegionMetadataBuilder;

const TS_NAME: &str = "ts";
const START_SEQ: SequenceNumber = 100;

/// Creates a region: `ts, k0, k1, ..., v0, v1, ...`
fn new_region_metadata(num_tag: usize, num_field: usize) -> RegionMetadata {
let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
let mut column_id = 0;
builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(
TS_NAME,
ConcreteDataType::timestamp_millisecond_datatype(),
false,
),
semantic_type: SemanticType::Timestamp,
column_id,
});
// For simplicity, we use the same data type for tag/field columns.
let mut primary_key = Vec::with_capacity(num_tag);
for i in 0..num_tag {
column_id += 1;
builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(
format!("k{i}"),
ConcreteDataType::int64_datatype(),
true,
),
semantic_type: SemanticType::Tag,
column_id,
});
primary_key.push(i as u32 + 1);
}
for i in 0..num_field {
column_id += 1;
builder.push_column_metadata(ColumnMetadata {
column_schema: ColumnSchema::new(
format!("v{i}"),
ConcreteDataType::int64_datatype(),
true,
),
semantic_type: SemanticType::Field,
column_id,
});
}
builder.primary_key(primary_key);
builder.build().unwrap()
fn new_region_metadata(num_tags: usize, num_fields: usize) -> RegionMetadata {
TestRegionMetadataBuilder::default()
.ts_name(TS_NAME)
.num_tags(num_tags)
.num_fields(num_fields)
.build()
}

/// Creates rows `[ 0, 1, ..., n ] x num_rows`
Expand Down
Loading

0 comments on commit cfc9689

Please sign in to comment.