-
Notifications
You must be signed in to change notification settings - Fork 52
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
20 changed files
with
574 additions
and
145 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
[package] | ||
edition = "2021" | ||
name = "parquet-lru" | ||
version = "0.1.0" | ||
|
||
[dependencies] | ||
bytes = { version = "1.8.0", features = ["serde"] } | ||
foyer = "0.12.2" | ||
futures-core = "0.3.31" | ||
futures-util = "0.3.31" | ||
parquet = { version = "53.2.0", features = ["async"] } | ||
serde = "1.0.214" | ||
thiserror = "2.0.3" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,208 @@ | ||
use std::{future::Future, hash::Hash, marker::PhantomData, ops::Range, sync::Arc}; | ||
|
||
use bytes::Bytes; | ||
use futures_core::future::BoxFuture; | ||
use futures_util::future::FutureExt; | ||
use parquet::{ | ||
arrow::async_reader::AsyncFileReader, | ||
errors::{ParquetError, Result}, | ||
file::metadata::ParquetMetaData, | ||
}; | ||
use serde::{Deserialize, Serialize}; | ||
use thiserror::Error; | ||
|
||
/// Capacity settings used when constructing a cache.
///
/// Built fluently starting from [`Options::default`], where both capacities
/// are `0`:
///
/// ```ignore
/// let options = Options::default().meta_capacity(32).data_capacity(1024);
/// ```
#[derive(Debug, Clone, Default)]
pub struct Options {
    // Capacity for the metadata cache.
    // NOTE(review): the unit (entries vs. bytes) is defined by the cache
    // backend's weighting, not by this struct — confirm before tuning.
    meta_capacity: usize,
    // Capacity for the in-memory part of the data cache (same caveat on units).
    data_capacity: usize,
}

impl Options {
    /// Returns the options with the metadata-cache capacity replaced.
    ///
    /// Consumes `self`; the returned value must be used or the setting is lost.
    #[must_use]
    pub fn meta_capacity(self, meta_capacity: usize) -> Self {
        Self {
            meta_capacity,
            ..self
        }
    }

    /// Returns the options with the data-cache capacity replaced.
    ///
    /// Consumes `self`; the returned value must be used or the setting is lost.
    #[must_use]
    pub fn data_capacity(self, data_capacity: usize) -> Self {
        Self {
            data_capacity,
            ..self
        }
    }
}
|
||
pub trait LruCache<K>: Clone + Send + Sync + 'static { | ||
type LruReader<R: AsyncFileReader + 'static>: AsyncFileReader + 'static; | ||
|
||
fn new(options: Options) -> impl Future<Output = Result<Self, Error>> + Send; | ||
|
||
fn get_reader<R>(&self, key: K, reader: R) -> impl Future<Output = Self::LruReader<R>> + Send | ||
where | ||
R: AsyncFileReader + 'static; | ||
} | ||
|
||
#[derive(Clone)] | ||
pub struct FoyerCache<K> | ||
where | ||
for<'a> K: Send + Sync + Hash + Eq + Serialize + Deserialize<'a> + 'static, | ||
{ | ||
inner: Arc<FoyerCacheInner<K>>, | ||
} | ||
|
||
pub struct FoyerCacheInner<K> | ||
where | ||
for<'a> K: Send + Sync + Hash + Eq + Serialize + Deserialize<'a> + 'static, | ||
{ | ||
meta: foyer::Cache<K, Arc<ParquetMetaData>>, | ||
data: foyer::HybridCache<(K, Range<usize>), Bytes>, | ||
} | ||
|
||
impl<K> LruCache<K> for FoyerCache<K> | ||
where | ||
for<'a> K: Send + Sync + Hash + Eq + Serialize + Deserialize<'a> + Clone + 'static, | ||
{ | ||
type LruReader<R: AsyncFileReader + 'static> = ParquetLru<K, R>; | ||
|
||
async fn new(options: Options) -> Result<Self, Error> { | ||
Ok(Self { | ||
inner: Arc::new(FoyerCacheInner { | ||
meta: foyer::CacheBuilder::new(options.meta_capacity).build(), | ||
data: foyer::HybridCacheBuilder::new() | ||
.memory(options.data_capacity) | ||
.storage(foyer::Engine::Large) | ||
.build() | ||
.await | ||
.map_err(|e| Error::Foyer(e.into()))?, | ||
}), | ||
}) | ||
} | ||
|
||
async fn get_reader<R: AsyncFileReader>(&self, key: K, reader: R) -> ParquetLru<K, R> { | ||
ParquetLru::new(self.clone(), key, reader) | ||
} | ||
} | ||
|
||
pub struct ParquetLru<K, R> | ||
where | ||
for<'a> K: Send + Sync + Hash + Eq + Serialize + Deserialize<'a> + 'static, | ||
{ | ||
cache: FoyerCache<K>, | ||
key: K, | ||
reader: R, | ||
} | ||
|
||
impl<K, R> ParquetLru<K, R> | ||
where | ||
for<'a> K: Send + Sync + Hash + Eq + Serialize + Deserialize<'a> + 'static, | ||
R: AsyncFileReader, | ||
{ | ||
fn new(cache: FoyerCache<K>, key: K, reader: R) -> Self { | ||
Self { cache, key, reader } | ||
} | ||
} | ||
|
||
impl<K, R> AsyncFileReader for ParquetLru<K, R> | ||
where | ||
for<'a> K: Send + Sync + Hash + Eq + Serialize + Deserialize<'a> + Clone + 'static, | ||
R: AsyncFileReader, | ||
{ | ||
fn get_bytes(&mut self, range: Range<usize>) -> BoxFuture<'_, Result<Bytes>> { | ||
async move { | ||
if let Some(data) = self | ||
.cache | ||
.inner | ||
.data | ||
.get(&(self.key.clone(), range.clone())) | ||
.await | ||
.map_err(|e| ParquetError::External(e.into()))? | ||
{ | ||
Ok(data.value().clone()) | ||
} else { | ||
let data = self.reader.get_bytes(range.clone()).await?; | ||
self.cache | ||
.inner | ||
.data | ||
.insert((self.key.clone(), range), data.clone()); | ||
Ok(data) | ||
} | ||
} | ||
.boxed() | ||
} | ||
|
||
fn get_metadata(&mut self) -> BoxFuture<'_, Result<Arc<ParquetMetaData>>> { | ||
async move { | ||
if let Some(meta) = self.cache.inner.meta.get(&self.key) { | ||
Ok(meta.value().clone()) | ||
} else { | ||
let meta = self.reader.get_metadata().await?; | ||
self.cache.inner.meta.insert(self.key.clone(), meta.clone()); | ||
Ok(meta) | ||
} | ||
} | ||
.boxed() | ||
} | ||
|
||
fn get_byte_ranges(&mut self, ranges: Vec<Range<usize>>) -> BoxFuture<'_, Result<Vec<Bytes>>> { | ||
async move { | ||
let mut missed = Vec::with_capacity(ranges.len()); | ||
let mut results = Vec::with_capacity(ranges.len()); | ||
for (id, range) in ranges.iter().enumerate() { | ||
if let Some(data) = self | ||
.cache | ||
.inner | ||
.data | ||
.get(&(self.key.clone(), range.clone())) | ||
.await | ||
.map_err(|e| ParquetError::External(e.into()))? | ||
{ | ||
results.push((id, data.value().clone())); | ||
} else { | ||
missed.push((id, range)); | ||
} | ||
} | ||
if !missed.is_empty() { | ||
let data = self | ||
.reader | ||
.get_byte_ranges(missed.iter().map(|&(_, r)| r.clone()).collect()) | ||
.await?; | ||
for (id, range) in missed { | ||
let data = data[id].clone(); | ||
self.cache | ||
.inner | ||
.data | ||
.insert((self.key.clone(), range.clone()), data.clone()); | ||
results.push((id, data)); | ||
} | ||
} | ||
results.sort_by_key(|(id, _)| *id); | ||
Ok(results.into_iter().map(|(_, data)| data).collect()) | ||
} | ||
.boxed() | ||
} | ||
} | ||
|
||
#[derive(Clone, Default)] | ||
pub struct NoopCache<K> { | ||
_phantom: PhantomData<K>, | ||
} | ||
|
||
impl<K> LruCache<K> for NoopCache<K> | ||
where | ||
for<'a> K: Send + Sync + Hash + Eq + Serialize + Deserialize<'a> + Clone + 'static, | ||
{ | ||
type LruReader<R: AsyncFileReader + 'static> = R; | ||
|
||
async fn new(_options: Options) -> Result<Self, Error> { | ||
Ok(Self { | ||
_phantom: PhantomData, | ||
}) | ||
} | ||
|
||
async fn get_reader<R: AsyncFileReader>(&self, _key: K, reader: R) -> R { | ||
reader | ||
} | ||
} | ||
|
||
#[derive(Debug, Error)] | ||
pub enum Error { | ||
#[error("Foyer error: {0}")] | ||
Foyer(#[from] Box<dyn std::error::Error + Send + Sync + 'static>), | ||
} |
Oops, something went wrong.