From 8959b0750bf234a824093bd38c472ae875070c5d Mon Sep 17 00:00:00 2001 From: Gwo Tzu-Hsing Date: Mon, 25 Nov 2024 22:54:50 +0800 Subject: [PATCH 1/9] refactor: add Schema trait --- src/compaction/mod.rs | 10 +- src/inmem/immutable.rs | 112 +++++++++++++------ src/inmem/mutable.rs | 70 +++++++----- src/lib.rs | 22 ++-- src/ondisk/arrows.rs | 11 +- src/ondisk/sstable.rs | 26 +++-- src/record/internal.rs | 6 +- src/record/mod.rs | 120 +++++++++++++++------ src/record/runtime/array.rs | 44 ++++---- src/record/runtime/mod.rs | 6 +- src/record/runtime/record.rs | 109 ++++++------------- src/record/runtime/record_ref.rs | 20 ++-- src/record/runtime/{column.rs => value.rs} | 62 +++++++---- src/record/test.rs | 64 ++++++----- src/stream/level.rs | 11 +- src/transaction.rs | 18 ++-- src/version/mod.rs | 22 ++-- src/version/set.rs | 4 +- src/wal/mod.rs | 6 +- src/wal/record_entry.rs | 15 ++- tests/wasm.rs | 2 +- 21 files changed, 445 insertions(+), 315 deletions(-) rename src/record/runtime/{column.rs => value.rs} (91%) diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs index a358f606..51b41713 100644 --- a/src/compaction/mod.rs +++ b/src/compaction/mod.rs @@ -527,7 +527,7 @@ pub(crate) mod tests { executor::tokio::TokioExecutor, fs::{manager::StoreManager, FileId, FileType}, inmem::{immutable::Immutable, mutable::Mutable}, - record::{Column, ColumnDesc, Datatype, DynRecord, Record, RecordInstance}, + record::{Datatype, DynRecord, Record, RecordInstance, Value, ValueDesc}, scope::Scope, tests::Test, timestamp::Timestamp, @@ -712,7 +712,7 @@ pub(crate) mod tests { .unwrap(); let empty_record = DynRecord::empty_record( - vec![ColumnDesc::new("id".to_owned(), Datatype::Int32, false)], + vec![ValueDesc::new("id".to_owned(), Datatype::Int32, false)], 0, ); let instance = RecordInstance::Runtime(empty_record); @@ -720,7 +720,7 @@ pub(crate) mod tests { let mut batch1_data = vec![]; let mut batch2_data = vec![]; for i in 0..40 { - let col = Column::new(Datatype::Int32, "id".to_owned(), Arc::new(i), false); + let col = Value::new(Datatype::Int32, "id".to_owned(), Arc::new(i), false); if i % 4 == 0 { continue; } @@ -758,11 +758,11 @@ pub(crate) mod tests { .unwrap(); assert_eq!( scope.min, - Column::new(Datatype::Int32, "id".to_owned(), Arc::new(2), false) + Value::new(Datatype::Int32, "id".to_owned(), Arc::new(2), false) ); assert_eq!( scope.max, - Column::new(Datatype::Int32, "id".to_owned(), Arc::new(39), false) + Value::new(Datatype::Int32, "id".to_owned(), Arc::new(39), false) ); } diff --git a/src/inmem/immutable.rs b/src/inmem/immutable.rs index 4077e282..c9af2021 100644 --- a/src/inmem/immutable.rs +++ b/src/inmem/immutable.rs @@ -5,12 +5,12 @@ use std::{ sync::Arc, }; -use arrow::{array::RecordBatch, datatypes::Schema}; +use arrow::{array::RecordBatch, datatypes::Schema as ArrowSchema}; use crossbeam_skiplist::SkipMap; use parquet::arrow::ProjectionMask; use crate::{ - record::{internal::InternalRecordRef, Key, Record, RecordInstance, RecordRef}, + record::{internal::InternalRecordRef, Key, Record, RecordRef, Schema}, stream::record_batch::RecordBatchEntry, timestamp::{Timestamp, Timestamped, TimestampedRef, EPOCH}, }; @@ -20,7 +20,7 @@ pub trait ArrowArrays: Sized + Sync { type Builder: Builder; - fn builder(schema: &Arc, capacity: usize) -> Self::Builder; + fn builder(schema: Arc, capacity: usize) -> Self::Builder; fn get( &self, @@ -37,7 +37,7 @@ where { fn push( &mut self, - key: Timestamped<<::Key as Key>::Ref<'_>>, + key: Timestamped<<<::Schema as Schema>::Key as Key>::Ref<'_>>, row: 
Option<<S::Record as Record>::Ref<'_>>,
     );
 
@@ -51,26 +51,23 @@ where
     A: ArrowArrays,
 {
     data: A,
-    index: BTreeMap<Timestamped<<A::Record as Record>::Key>, u32>,
+    index: BTreeMap<Timestamped<<<A::Record as Record>::Schema as Schema>::Key>, u32>,
 }
 
-impl<A>
-    From<(
-        SkipMap<Timestamped<<A::Record as Record>::Key>, Option<A::Record>>,
-        &RecordInstance,
-    )> for Immutable<A>
+impl<A> Immutable<A>
 where
     A: ArrowArrays,
     A::Record: Send,
 {
-    fn from(
-        (mutable, instance): (
-            SkipMap<Timestamped<<A::Record as Record>::Key>, Option<A::Record>>,
-            &RecordInstance,
-        ),
+    pub(crate) fn new(
+        mutable: SkipMap<
+            Timestamped<<<A::Record as Record>::Schema as Schema>::Key>,
+            Option<A::Record>,
+        >,
+        schema: Arc<ArrowSchema>,
     ) -> Self {
         let mut index = BTreeMap::new();
-        let mut builder = A::builder(&instance.arrow_schema::<A::Record>(), mutable.len());
+        let mut builder = A::builder(schema, mutable.len());
 
         for (offset, (key, value)) in mutable.into_iter().enumerate() {
             builder.push(
@@ -93,8 +90,8 @@ where
     pub(crate) fn scope(
         &self,
     ) -> (
-        Option<&<A::Record as Record>::Key>,
-        Option<&<A::Record as Record>::Key>,
+        Option<&<<A::Record as Record>::Schema as Schema>::Key>,
+        Option<&<<A::Record as Record>::Schema as Schema>::Key>,
     ) {
         (
             self.index.first_key_value().map(|(key, _)| key.value()),
@@ -109,8 +106,8 @@ where
     pub(crate) fn scan<'scan>(
         &'scan self,
         range: (
-            Bound<&'scan <A::Record as Record>::Key>,
-            Bound<&'scan <A::Record as Record>::Key>,
+            Bound<&'scan <<A::Record as Record>::Schema as Schema>::Key>,
+            Bound<&'scan <<A::Record as Record>::Schema as Schema>::Key>,
         ),
         ts: Timestamp,
         projection_mask: ProjectionMask,
@@ -128,14 +125,16 @@ where
 
         let range = self
             .index
-            .range::<TimestampedRef<<A::Record as Record>::Key>, _>((lower, upper));
+            .range::<TimestampedRef<<<A::Record as Record>::Schema as Schema>::Key>, _>((
+                lower, upper,
+            ));
 
         ImmutableScan::<A::Record>::new(range, self.data.as_record_batch(), projection_mask)
     }
 
     pub(crate) fn get(
         &self,
-        key: &<A::Record as Record>::Key,
+        key: &<<A::Record as Record>::Schema as Schema>::Key,
         ts: Timestamp,
         projection_mask: ProjectionMask,
     ) -> Option<RecordBatchEntry<A::Record>> {
@@ -147,9 +146,13 @@ where
             .next()
     }
 
-    pub(crate) fn check_conflict(&self, key: &<A::Record as Record>::Key, ts: Timestamp) -> bool {
+    pub(crate) fn check_conflict(
+        &self,
+        key: &<<A::Record as Record>::Schema as Schema>::Key,
+        ts: Timestamp,
+    ) -> bool {
         self.index
-            .range::<TimestampedRef<<A::Record as Record>::Key>, _>((
+            .range::<TimestampedRef<<<A::Record as Record>::Schema as Schema>::Key>, _>((
                 Bound::Excluded(TimestampedRef::new(key, u32::MAX.into())),
                 Bound::Excluded(TimestampedRef::new(key, ts)),
             ))
@@ -162,7 +165,7 @@ pub struct ImmutableScan<'iter, R>
 where
     R: Record,
 {
-    range: Range<'iter, Timestamped<R::Key>, u32>,
+    range: Range<'iter, Timestamped<<R::Schema as Schema>::Key>, u32>,
     record_batch: &'iter RecordBatch,
     projection_mask: ProjectionMask,
 }
@@ -172,7 +175,7 @@ where
     R: Record,
 {
     fn new(
-        range: Range<'iter, Timestamped<R::Key>, u32>,
+        range: Range<'iter, Timestamped<<R::Schema as Schema>::Key>, u32>,
         record_batch: &'iter RecordBatch,
         projection_mask: ProjectionMask,
     ) -> Self {
@@ -221,17 +224,61 @@ pub(crate) mod tests {
         Array, BooleanArray, BooleanBufferBuilder, BooleanBuilder, PrimitiveBuilder, RecordBatch,
         StringArray, StringBuilder, UInt32Array, UInt32Builder,
     },
-        datatypes::{ArrowPrimitiveType, Schema, UInt32Type},
+        datatypes::{ArrowPrimitiveType, DataType, Field, Schema as ArrowSchema, UInt32Type},
     };
-    use parquet::arrow::ProjectionMask;
+    use once_cell::sync::Lazy;
+    use parquet::{arrow::ProjectionMask, format::SortingColumn, schema::types::ColumnPath};
 
     use super::{ArrowArrays, Builder};
     use crate::{
-        record::Record,
+        record::{Record, Schema},
         tests::{Test, TestRef},
         timestamp::timestamped::Timestamped,
     };
 
+    pub struct TestSchema;
+
+    impl Schema for TestSchema {
+        type Record = Test;
+
+        type Columns = TestImmutableArrays;
+
+        type Key = String;
+
+        fn arrow_schema(&self) -> &Arc<ArrowSchema> {
+            static SCHEMA: Lazy<Arc<ArrowSchema>> = Lazy::new(|| {
+                Arc::new(ArrowSchema::new(vec![
+                    Field::new("_null", DataType::Boolean, false),
+                    Field::new("_ts", DataType::UInt32, false),
+                    Field::new("vstring", DataType::Utf8, false),
+                    Field::new("vu32", DataType::UInt32, false),
+                    Field::new("vbool", DataType::Boolean, true),
+                ]))
+            });
+
+            &SCHEMA
+        }
+
+        fn primary_key_index(&self) -> usize {
+            2
+        }
+
+        fn primary_key_path(
+            &self,
+        ) -> (
+            parquet::schema::types::ColumnPath,
+            Vec<parquet::format::SortingColumn>,
+        ) {
+            (
+                ColumnPath::new(vec!["_ts".to_string(), "vstring".to_string()]),
+                vec![
+                    SortingColumn::new(1, true, true),
+                    SortingColumn::new(2, false, true),
+                ],
+            )
+        }
+    }
+
     #[derive(Debug)]
     pub struct TestImmutableArrays {
         _null: Arc<BooleanArray>,
@@ -248,7 +295,7 @@ pub(crate) mod tests {
 
         type Builder = TestBuilder;
 
-        fn builder(_schema: &Arc<Schema>, capacity: usize) -> Self::Builder {
+        fn builder(_schema: Arc<ArrowSchema>, capacity: usize) -> Self::Builder {
             TestBuilder {
                 vstring: StringBuilder::with_capacity(capacity, 0),
                 vu32: PrimitiveBuilder::<UInt32Type>::with_capacity(capacity),
@@ -336,10 +383,9 @@ pub(crate) mod tests {
             let vbool = Arc::new(self.vobool.finish());
             let _null = Arc::new(BooleanArray::new(self._null.finish(), None));
             let _ts = Arc::new(self._ts.finish());
+            let schema = TestSchema;
             let mut record_batch = RecordBatch::try_new(
-                Arc::clone(
-                    <<TestImmutableArrays as ArrowArrays>::Record as Record>::arrow_schema(),
-                ),
+                Arc::clone(schema.arrow_schema()),
                 vec![
                     Arc::clone(&_null) as Arc<dyn Array>,
                     Arc::clone(&_ts) as Arc<dyn Array>,
diff --git a/src/inmem/mutable.rs b/src/inmem/mutable.rs
index e5ff8d33..49a78177 100644
--- a/src/inmem/mutable.rs
+++ b/src/inmem/mutable.rs
@@ -10,7 +10,7 @@ use fusio::{buffered::BufWriter, DynFs, DynWrite};
 use crate::{
     fs::{FileId, FileType},
     inmem::immutable::Immutable,
-    record::{Key, KeyRef, Record, RecordInstance},
+    record::{Key, KeyRef, Record, Schema},
     timestamp::{
         timestamped::{Timestamped, TimestampedRef},
         Timestamp, EPOCH,
@@ -22,12 +22,12 @@ use crate::{
 
 pub(crate) type MutableScan<'scan, R> = Range<
     'scan,
-    TimestampedRef<<R as Record>::Key>,
+    TimestampedRef<<<R as Record>::Schema as Schema>::Key>,
     (
-        Bound<&'scan TimestampedRef<<R as Record>::Key>>,
-        Bound<&'scan TimestampedRef<<R as Record>::Key>>,
+        Bound<&'scan TimestampedRef<<<R as Record>::Schema as Schema>::Key>>,
+        Bound<&'scan TimestampedRef<<<R as Record>::Schema as Schema>::Key>>,
     ),
-    Timestamped<<R as Record>::Key>,
+    Timestamped<<<R as Record>::Schema as Schema>::Key>,
     Option<R>,
 >;
 
@@ -35,9 +35,11 @@ pub struct Mutable<R>
 where
     R: Record,
 {
-    pub(crate) data: SkipMap<Timestamped<R::Key>, Option<R>>,
+    pub(crate) data: SkipMap<Timestamped<<R::Schema as Schema>::Key>, Option<R>>,
     wal: Option<Mutex<WalFile<Box<dyn DynWrite>, R>>>,
     pub(crate) trigger: Arc<Box<dyn Trigger<R> + Send + Sync>>,
+
+    pub(super) schema: Arc<R::Schema>,
 }
 
 impl<R> Mutable<R>
 where
     R: Record,
 {
     pub async fn new(
         option: &DbOption<R>,
         trigger: Arc<Box<dyn Trigger<R> + Send + Sync>>,
         fs: &Arc<dyn DynFs>,
+        schema: Arc<R::Schema>,
     ) -> Result<Self, fusio::Error> {
         let mut wal = None;
         if option.use_wal {
@@ -66,6 +69,7 @@ where
             data: Default::default(),
             wal,
             trigger,
+            schema,
         })
     }
 }
@@ -87,7 +91,7 @@ where
     pub(crate) async fn remove(
         &self,
         log_ty: LogType,
-        key: R::Key,
+        key: <R::Schema as Schema>::Key,
         ts: Timestamp,
     ) -> Result<bool, DbError<R>> {
         self.append(Some(log_ty), key, ts, None).await
@@ -96,7 +100,7 @@ where
     pub(crate) async fn append(
         &self,
         log_ty: Option<LogType>,
-        key: R::Key,
+        key: <R::Schema as Schema>::Key,
         ts: Timestamp,
         value: Option<R>,
     ) -> Result<bool, DbError<R>> {
@@ -123,11 +127,11 @@ where
 
     pub(crate) fn get(
         &self,
-        key: &R::Key,
+        key: &<R::Schema as Schema>::Key,
         ts: Timestamp,
-    ) -> Option<Entry<'_, Timestamped<R::Key>, Option<R>>> {
+    ) -> Option<Entry<'_, Timestamped<<R::Schema as Schema>::Key>, Option<R>>> {
         self.data
-            .range::<TimestampedRef<R::Key>, _>((
+            .range::<TimestampedRef<<R::Schema as Schema>::Key>, _>((
                 Bound::Included(TimestampedRef::new(key, ts)),
                 Bound::Included(TimestampedRef::new(key, EPOCH)),
             ))
@@ -136,7 +140,10 @@ where
 
     pub(crate) fn scan<'scan>(
         &'scan self,
-        range: (Bound<&'scan R::Key>, Bound<&'scan R::Key>),
+        range: (
+            Bound<&'scan <R::Schema as Schema>::Key>,
+            Bound<&'scan <R::Schema as Schema>::Key>,
+        ),
         ts: Timestamp,
     ) -> MutableScan<'scan, R> {
         let lower = match range.0 {
@@ -157,9 +164,9 @@ where
         self.data.is_empty()
     }
 
-    pub(crate) fn check_conflict(&self, key: &R::Key, ts: 
Timestamp) -> bool { + pub(crate) fn check_conflict(&self, key: &::Key, ts: Timestamp) -> bool { self.data - .range::::Key>, _>(( + .range::::Key>, _>(( Bound::Excluded(TimestampedRef::new(key, u32::MAX.into())), Bound::Excluded(TimestampedRef::new(key, ts)), )) @@ -169,8 +176,7 @@ where pub(crate) async fn into_immutable( self, - instance: &RecordInstance, - ) -> Result<(Option, Immutable), fusio::Error> { + ) -> Result<(Option, Immutable<::Columns>), fusio::Error> { let mut file_id = None; if let Some(wal) = self.wal { @@ -179,7 +185,10 @@ where file_id = Some(wal_guard.file_id()); } - Ok((file_id, Immutable::from((self.data, instance)))) + Ok(( + file_id, + Immutable::new(self.data, self.schema.arrow_schema().clone()), + )) } pub(crate) async fn flush_wal(&self) -> Result<(), DbError> { @@ -209,7 +218,8 @@ mod tests { use super::Mutable; use crate::{ - record::{Column, Datatype, DynRecord, Record}, + inmem::immutable::tests::TestSchema, + record::{test::StringSchema, Datatype, DynRecord, DynSchema, Record, Value, ValueDesc}, tests::{Test, TestRef}, timestamp::Timestamped, trigger::TriggerFactory, @@ -228,7 +238,9 @@ mod tests { fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mem_table = Mutable::::new(&option, trigger, &fs).await.unwrap(); + let mem_table = Mutable::::new(&option, trigger, &fs, TestSchema) + .await + .unwrap(); mem_table .insert( @@ -277,7 +289,9 @@ mod tests { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mutable = Mutable::::new(&option, trigger, &fs).await.unwrap(); + let mutable = Mutable::::new(&option, trigger, &fs, Arc::new(StringSchema)) + .await + .unwrap(); mutable .insert(LogType::Full, "1".into(), 0_u32.into()) @@ -365,7 +379,15 @@ mod tests { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mutable = Mutable::::new(&option, trigger, &fs) + let schema = Arc::new(DynSchema::new( + vec![ + ValueDesc::new("age".to_string(), Datatype::Int8, false), + ValueDesc::new("height".to_string(), Datatype::Int16, true), + ], + 0, + )); + + let mutable = Mutable::::new(&option, trigger, &fs, schema) .await .unwrap(); @@ -374,8 +396,8 @@ mod tests { LogType::Full, DynRecord::new( vec![ - Column::new(Datatype::Int8, "age".to_string(), Arc::new(1_i8), false), - Column::new( + Value::new(Datatype::Int8, "age".to_string(), Arc::new(1_i8), false), + Value::new( Datatype::Int16, "height".to_string(), Arc::new(1236_i16), @@ -395,7 +417,7 @@ mod tests { assert_eq!( entry.key(), &Timestamped::new( - Column::new(Datatype::Int8, "age".to_string(), Arc::new(1_i8), false), + Value::new(Datatype::Int8, "age".to_string(), Arc::new(1_i8), false), 0_u32.into() ) ); diff --git a/src/lib.rs b/src/lib.rs index 6636de7d..1ac0f054 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -149,7 +149,7 @@ use parquet::{ errors::ParquetError, }; use parquet_lru::{DynLruCache, NoCache}; -use record::{ColumnDesc, DynRecord, Record, RecordInstance}; +use record::{DynRecord, Record, ValueDesc}; use thiserror::Error; use timestamp::{Timestamp, TimestampedRef}; use tokio::sync::oneshot; @@ -181,7 +181,7 @@ where { schema: Arc>>, version_set: VersionSet, - lock_map: LockMap, + lock_map: LockMap<::Key>, manager: Arc, parquet_lru: ParquetLru, _p: PhantomData, @@ -195,7 +195,7 @@ where pub async fn with_schema( option: DbOption, executor: E, - column_descs: Vec, + column_descs: Vec, primary_index: usize, ) -> Result> { let option = Arc::new(option); @@ -912,8 +912,8 @@ pub(crate) mod 
tests { record::{ internal::InternalRecordRef, runtime::test::{test_dyn_item_schema, test_dyn_items}, - Column, Datatype, DynRecord, RecordDecodeError, RecordEncodeError, RecordInstance, - RecordRef, + Datatype, DynRecord, RecordDecodeError, RecordEncodeError, RecordInstance, RecordRef, + Value, }, serdes::{Decode, Encode}, trigger::{TriggerFactory, TriggerType}, @@ -1841,7 +1841,7 @@ pub(crate) mod tests { let tx = db.transaction().await; for i in 0..50 { - let key = Column::new(Datatype::Int64, "id".to_string(), Arc::new(i as i64), false); + let key = Value::new(Datatype::Int64, "id".to_string(), Arc::new(i as i64), false); let option1 = tx.get(&key, Projection::All).await.unwrap(); if i == 28 { assert!(option1.is_none()); @@ -1935,8 +1935,8 @@ pub(crate) mod tests { // test scan { let tx = db.transaction().await; - let lower = Column::new(Datatype::Int64, "id".to_owned(), Arc::new(0_i64), false); - let upper = Column::new(Datatype::Int64, "id".to_owned(), Arc::new(49_i64), false); + let lower = Value::new(Datatype::Int64, "id".to_owned(), Arc::new(0_i64), false); + let upper = Value::new(Datatype::Int64, "id".to_owned(), Arc::new(49_i64), false); let mut scan = tx .scan((Bound::Included(&lower), Bound::Included(&upper))) .projection(vec![0, 2, 7]) @@ -2090,7 +2090,7 @@ pub(crate) mod tests { let tx3 = db3.transaction().await; for i in 0..50 { - let key = Column::new(Datatype::Int64, "id".to_string(), Arc::new(i as i64), false); + let key = Value::new(Datatype::Int64, "id".to_string(), Arc::new(i as i64), false); let option1 = tx1.get(&key, Projection::All).await.unwrap(); let option2 = tx2.get(&key, Projection::All).await.unwrap(); let option3 = tx3.get(&key, Projection::All).await.unwrap(); @@ -2148,8 +2148,8 @@ pub(crate) mod tests { // test scan { let tx1 = db1.transaction().await; - let lower = Column::new(Datatype::Int64, "id".to_owned(), Arc::new(8_i64), false); - let upper = Column::new(Datatype::Int64, "id".to_owned(), Arc::new(43_i64), false); + let lower = Value::new(Datatype::Int64, "id".to_owned(), Arc::new(8_i64), false); + let upper = Value::new(Datatype::Int64, "id".to_owned(), Arc::new(43_i64), false); let mut scan = tx1 .scan((Bound::Included(&lower), Bound::Included(&upper))) .projection(vec![0, 1]) diff --git a/src/ondisk/arrows.rs b/src/ondisk/arrows.rs index fdca1b10..12875a8b 100644 --- a/src/ondisk/arrows.rs +++ b/src/ondisk/arrows.rs @@ -15,14 +15,14 @@ use parquet::{ }; use crate::{ - record::{Key, Record}, + record::{Key, Record, Schema}, timestamp::Timestamp, }; unsafe fn get_range_bound_fn( - range: Bound<&R::Key>, + range: Bound<&::Key>, ) -> ( - Option<&'static R::Key>, + Option<&'static ::Key>, &'static (dyn Fn(&dyn Datum, &dyn Datum) -> Result + Sync), ) where @@ -54,7 +54,10 @@ where pub(crate) unsafe fn get_range_filter( schema_descriptor: &SchemaDescriptor, - range: (Bound<&R::Key>, Bound<&R::Key>), + range: ( + Bound<&::Key>, + Bound<&::Key>, + ), ts: Timestamp, ) -> RowFilter where diff --git a/src/ondisk/sstable.rs b/src/ondisk/sstable.rs index 60e05af3..32b32f71 100644 --- a/src/ondisk/sstable.rs +++ b/src/ondisk/sstable.rs @@ -16,7 +16,7 @@ use ulid::Ulid; use super::{arrows::get_range_filter, scan::SsTableScan}; use crate::{ - record::Record, + record::{Record, Schema}, stream::record_batch::RecordBatchEntry, timestamp::{Timestamp, TimestampedRef}, }; @@ -70,7 +70,7 @@ where pub(crate) async fn get( self, - key: &TimestampedRef, + key: &TimestampedRef<::Key>, projection_mask: ProjectionMask, ) -> ParquetResult>> { self.scan( @@ -87,7 +87,10 @@ 
where
 
     pub(crate) async fn scan<'scan>(
         self,
-        range: (Bound<&'scan R::Key>, Bound<&'scan R::Key>),
+        range: (
+            Bound<&'scan <R::Schema as Schema>::Key>,
+            Bound<&'scan <R::Schema as Schema>::Key>,
+        ),
         ts: Timestamp,
         limit: Option<usize>,
         projection_mask: ProjectionMask,
@@ -134,7 +137,8 @@ pub(crate) mod tests {
     use crate::{
         executor::tokio::TokioExecutor,
         fs::{manager::StoreManager, FileType},
-        record::Record,
+        inmem::immutable::tests::TestSchema,
+        record::{Record, Schema},
         tests::{get_test_record_batch, Test},
         timestamp::Timestamped,
         DbOption,
@@ -153,7 +157,7 @@ pub(crate) mod tests {
         );
         let mut writer = AsyncArrowWriter::try_new_with_options(
             AsyncWriter::new(file),
-            Test::arrow_schema().clone(),
+            TestSchema {}.arrow_schema().clone(),
             options,
         )
         .expect("Failed to create writer");
@@ -211,7 +215,7 @@ pub(crate) mod tests {
             .get(
                 key.borrow(),
                 ProjectionMask::roots(
-                    &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(),
+                    &arrow_to_parquet_schema(TestSchema {}.arrow_schema()).unwrap(),
                     [0, 1, 2, 3],
                 ),
             )
@@ -228,7 +232,7 @@ pub(crate) mod tests {
             .get(
                 key.borrow(),
                 ProjectionMask::roots(
-                    &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(),
+                    &arrow_to_parquet_schema(TestSchema {}.arrow_schema()).unwrap(),
                     [0, 1, 2, 4],
                 ),
             )
@@ -245,7 +249,7 @@ pub(crate) mod tests {
             .get(
                 key.borrow(),
                 ProjectionMask::roots(
-                    &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(),
+                    &arrow_to_parquet_schema(TestSchema {}.arrow_schema()).unwrap(),
                     [0, 1, 2],
                 ),
             )
@@ -286,7 +290,7 @@ pub(crate) mod tests {
                 1_u32.into(),
                 None,
                 ProjectionMask::roots(
-                    &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(),
+                    &arrow_to_parquet_schema(TestSchema {}.arrow_schema()).unwrap(),
                     [0, 1, 2, 3],
                 ),
             )
@@ -311,7 +315,7 @@ pub(crate) mod tests {
                 1_u32.into(),
                 None,
                 ProjectionMask::roots(
-                    &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(),
+                    &arrow_to_parquet_schema(TestSchema {}.arrow_schema()).unwrap(),
                     [0, 1, 2, 4],
                 ),
             )
@@ -336,7 +340,7 @@ pub(crate) mod tests {
                 1_u32.into(),
                 None,
                 ProjectionMask::roots(
-                    &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(),
+                    &arrow_to_parquet_schema(TestSchema {}.arrow_schema()).unwrap(),
                     [0, 1, 2],
                 ),
             )
diff --git a/src/record/internal.rs b/src/record/internal.rs
index c2192e5a..b2212e32 100644
--- a/src/record/internal.rs
+++ b/src/record/internal.rs
@@ -1,6 +1,6 @@
 use std::{marker::PhantomData, mem::transmute};
 
-use super::{Key, Record, RecordRef};
+use super::{Key, Record, RecordRef, Schema};
 use crate::timestamp::{Timestamp, Timestamped};
 
 #[derive(Debug)]
@@ -32,7 +32,9 @@ impl<'r, R> InternalRecordRef<'r, R>
 where
     R: RecordRef<'r>,
 {
-    pub fn value(&self) -> Timestamped<<<R::Record as Record>::Key as Key>::Ref<'_>> {
+    pub fn value(
+        &self,
+    ) -> Timestamped<<<<R::Record as Record>::Schema as Schema>::Key as Key>::Ref<'_>> {
         // Safety: shorter lifetime of the value must be safe
         unsafe { transmute(Timestamped::new(self.record.clone().key(), self.ts)) }
     }
diff --git a/src/record/mod.rs b/src/record/mod.rs
index 4712fa5d..127180ee 100644
--- a/src/record/mod.rs
+++ b/src/record/mod.rs
@@ -2,11 +2,12 @@ pub mod internal;
 mod key;
 pub mod runtime;
 #[cfg(test)]
-mod test;
+pub(crate) mod test;
 
 use std::{error::Error, fmt::Debug, io, sync::Arc};
 
-use arrow::{array::RecordBatch, datatypes::Schema};
+use array::DynRecordImmutableArrays;
+use arrow::{array::RecordBatch, datatypes::Schema as ArrowSchema};
 use internal::InternalRecordRef;
 pub use key::{Key, KeyRef};
 use parquet::{arrow::ProjectionMask, format::SortingColumn, schema::types::ColumnPath};
@@ -18,63 +19,112 @@ use crate::{
     serdes::{Decode, Encode},
 };
 
-#[allow(unused)]
-pub(crate) enum RecordInstance {
-    Normal,
-    Runtime(DynRecord),
+// #[allow(unused)]
+// pub(crate) enum RecordInstance {
+//     Normal,
+//     Runtime(DynRecord),
+// }
+
+// #[allow(unused)]
+// impl RecordInstance {
+//     pub(crate) fn primary_key_index<R>(&self) -> usize
+//     where
+//         R: Record,
+//     {
+//         match self {
+//             RecordInstance::Normal => R::primary_key_index(),
+//             RecordInstance::Runtime(record) => record.primary_key_index(),
+//         }
+//     }
+
+//     pub(crate) fn arrow_schema<R>(&self) -> Arc<Schema>
+//     where
+//         R: Record,
+//     {
+//         match self {
+//             RecordInstance::Normal => R::arrow_schema().clone(),
+//             RecordInstance::Runtime(record) => record.arrow_schema(),
+//         }
+//     }
+// }
+
+pub trait Schema {
+    type Record: Record;
+
+    type Columns: ArrowArrays;
+
+    type Key: Key;
+
+    fn arrow_schema(&self) -> &Arc<ArrowSchema>;
+
+    fn primary_key_index(&self) -> usize;
+
+    fn primary_key_path(&self) -> (ColumnPath, Vec<SortingColumn>);
 }
 
-#[allow(unused)]
-impl RecordInstance {
-    pub(crate) fn primary_key_index<R>(&self) -> usize
-    where
-        R: Record,
-    {
-        match self {
-            RecordInstance::Normal => R::primary_key_index(),
-            RecordInstance::Runtime(record) => record.primary_key_index(),
+#[derive(Debug)]
+pub struct DynSchema {
+    schema: Vec<ValueDesc>,
+    primary_index: usize,
+    arrow_schema: Arc<ArrowSchema>,
+}
+
+impl DynSchema {
+    pub fn new(schema: Vec<ValueDesc>, primary_index: usize) -> Self {
+        let arrow_schema = Arc::new(ArrowSchema::new(
+            schema
+                .iter()
+                .map(|desc| desc.arrow_field())
+                .collect::<Vec<_>>(),
+        ));
+        Self {
+            schema,
+            primary_index,
+            arrow_schema,
         }
     }
+}
 
-    pub(crate) fn arrow_schema<R>(&self) -> Arc<Schema>
-    where
-        R: Record,
-    {
-        match self {
-            RecordInstance::Normal => R::arrow_schema().clone(),
-            RecordInstance::Runtime(record) => record.arrow_schema(),
-        }
+impl Schema for DynSchema {
+    type Record = DynRecord;
+
+    type Columns = DynRecordImmutableArrays;
+
+    type Key = Value;
+
+    fn arrow_schema(&self) -> &Arc<ArrowSchema> {
+        &self.arrow_schema
+    }
+
+    fn primary_key_index(&self) -> usize {
+        self.primary_index
+    }
+
+    fn primary_key_path(&self) -> (ColumnPath, Vec<SortingColumn>) {
+        unimplemented!()
     }
 }
 
 pub trait Record: 'static + Sized + Decode + Debug + Send + Sync {
-    type Columns: ArrowArrays;
-
-    type Key: Key;
+    type Schema: Schema;
 
     type Ref<'r>: RecordRef<'r, Record = Self>
     where
         Self: 'r;
 
-    fn key(&self) -> <<Self as Record>::Key as Key>::Ref<'_> {
+    fn key(&self) -> <<<Self as Record>::Schema as Schema>::Key as Key>::Ref<'_> {
         self.as_record_ref().key()
     }
 
-    fn primary_key_index() -> usize;
-
-    fn primary_key_path() -> (ColumnPath, Vec<SortingColumn>);
-
     fn as_record_ref(&self) -> Self::Ref<'_>;
 
-    fn arrow_schema() -> &'static Arc<Schema>;
-
     fn size(&self) -> usize;
 }
 
 pub trait RecordRef<'r>: Clone + Sized + Encode + Send + Sync {
     type Record: Record;
 
-    fn key(self) -> <<Self::Record as Record>::Key as Key>::Ref<'r>;
+    fn key(self) -> <<<Self::Record as Record>::Schema as Schema>::Key as Key>::Ref<'r>;
 
     fn projection(&mut self, projection_mask: &ProjectionMask);
 
@@ -82,7 +132,7 @@ pub trait RecordRef<'r>: Clone + Sized + Encode + Send + Sync {
         record_batch: &'r RecordBatch,
         offset: usize,
         projection_mask: &'r ProjectionMask,
-        full_schema: &'r Arc<Schema>,
+        full_schema: &'r Arc<ArrowSchema>,
     ) -> InternalRecordRef<'r, Self>;
 }
 
diff --git a/src/record/runtime/array.rs b/src/record/runtime/array.rs
index 9cddeeb5..22a31cd5 100644
--- a/src/record/runtime/array.rs
+++ b/src/record/runtime/array.rs
@@ -7,15 +7,15 @@ use arrow::{
         StringArray, StringBuilder, UInt32Builder,
     },
     datatypes::{
-        Int16Type, Int32Type, Int64Type, Int8Type, Schema, UInt16Type, UInt32Type, UInt64Type,
-        UInt8Type,
+        Int16Type, Int32Type, Int64Type, Int8Type, Schema as ArrowSchema, UInt16Type, UInt32Type,
+        UInt64Type, 
UInt8Type, }, }; -use super::{column::Column, record::DynRecord, record_ref::DynRecordRef, Datatype}; +use super::{record::DynRecord, record_ref::DynRecordRef, value::Value, Datatype}; use crate::{ inmem::immutable::{ArrowArrays, Builder}, - record::{Key, Record}, + record::{Key, Record, Schema}, timestamp::Timestamped, }; @@ -23,7 +23,7 @@ use crate::{ pub struct DynRecordImmutableArrays { _null: Arc, _ts: Arc, - columns: Vec, + columns: Vec, record_batch: arrow::record_batch::RecordBatch, } @@ -32,7 +32,7 @@ impl ArrowArrays for DynRecordImmutableArrays { type Builder = DynRecordBuilder; - fn builder(schema: &Arc, capacity: usize) -> Self::Builder { + fn builder(schema: Arc, capacity: usize) -> Self::Builder { let mut builders: Vec> = vec![]; let mut datatypes = vec![]; for field in schema.fields().iter().skip(2) { @@ -153,7 +153,7 @@ impl ArrowArrays for DynRecordImmutableArrays { .to_owned(), ), }; - columns.push(Column { + columns.push(Value { datatype, name, value, @@ -171,7 +171,7 @@ impl ArrowArrays for DynRecordImmutableArrays { } } impl DynRecordImmutableArrays { - fn primitive_value(col: &Column, offset: usize) -> T::Native + fn primitive_value(col: &Value, offset: usize) -> T::Native where T: ArrowPrimitiveType, { @@ -188,13 +188,13 @@ pub struct DynRecordBuilder { datatypes: Vec, _null: BooleanBufferBuilder, _ts: UInt32Builder, - schema: Arc, + schema: Arc, } impl Builder for DynRecordBuilder { fn push( &mut self, - key: Timestamped<<::Key as Key>::Ref<'_>>, + key: Timestamped<<<::Schema as Schema>::Key as Key>::Ref<'_>>, row: Option, ) { self._null.append(row.is_none()); @@ -466,7 +466,7 @@ impl Builder for DynRecordBuilder { Self::as_builder_mut::>(builder.as_mut()) .finish(), ); - columns.push(Column { + columns.push(Value { datatype: Datatype::UInt8, name: field.name().to_owned(), value: value.clone(), @@ -479,7 +479,7 @@ impl Builder for DynRecordBuilder { Self::as_builder_mut::>(builder.as_mut()) .finish(), ); - columns.push(Column { + columns.push(Value { datatype: Datatype::UInt16, name: field.name().to_owned(), value: value.clone(), @@ -492,7 +492,7 @@ impl Builder for DynRecordBuilder { Self::as_builder_mut::>(builder.as_mut()) .finish(), ); - columns.push(Column { + columns.push(Value { datatype: Datatype::UInt32, name: field.name().to_owned(), value: value.clone(), @@ -505,7 +505,7 @@ impl Builder for DynRecordBuilder { Self::as_builder_mut::>(builder.as_mut()) .finish(), ); - columns.push(Column { + columns.push(Value { datatype: Datatype::UInt64, name: field.name().to_owned(), value: value.clone(), @@ -518,7 +518,7 @@ impl Builder for DynRecordBuilder { Self::as_builder_mut::>(builder.as_mut()) .finish(), ); - columns.push(Column { + columns.push(Value { datatype: Datatype::Int8, name: field.name().to_owned(), value: value.clone(), @@ -531,7 +531,7 @@ impl Builder for DynRecordBuilder { Self::as_builder_mut::>(builder.as_mut()) .finish(), ); - columns.push(Column { + columns.push(Value { datatype: Datatype::Int16, name: field.name().to_owned(), value: value.clone(), @@ -544,7 +544,7 @@ impl Builder for DynRecordBuilder { Self::as_builder_mut::>(builder.as_mut()) .finish(), ); - columns.push(Column { + columns.push(Value { datatype: Datatype::Int32, name: field.name().to_owned(), value: value.clone(), @@ -557,7 +557,7 @@ impl Builder for DynRecordBuilder { Self::as_builder_mut::>(builder.as_mut()) .finish(), ); - columns.push(Column { + columns.push(Value { datatype: Datatype::Int64, name: field.name().to_owned(), value: value.clone(), @@ -568,7 +568,7 @@ impl 
Builder for DynRecordBuilder { Datatype::String => { let value = Arc::new(Self::as_builder_mut::(builder.as_mut()).finish()); - columns.push(Column { + columns.push(Value { datatype: Datatype::String, name: field.name().to_owned(), value: value.clone(), @@ -579,7 +579,7 @@ impl Builder for DynRecordBuilder { Datatype::Boolean => { let value = Arc::new(Self::as_builder_mut::(builder.as_mut()).finish()); - columns.push(Column { + columns.push(Value { datatype: Datatype::Boolean, name: field.name().to_owned(), value: value.clone(), @@ -592,7 +592,7 @@ impl Builder for DynRecordBuilder { Self::as_builder_mut::>(builder.as_mut()) .finish(), ); - columns.push(Column { + columns.push(Value { datatype: Datatype::Bytes, name: field.name().to_owned(), value: value.clone(), @@ -624,7 +624,7 @@ impl Builder for DynRecordBuilder { impl DynRecordBuilder { fn push_primary_key( &mut self, - key: Timestamped<<::Key as Key>::Ref<'_>>, + key: Timestamped<<<::Schema as Schema>::Key as Key>::Ref<'_>>, primary_key_index: usize, ) { let builder = self.builders.get_mut(primary_key_index).unwrap(); diff --git a/src/record/runtime/mod.rs b/src/record/runtime/mod.rs index 90e8b304..8bba1c85 100644 --- a/src/record/runtime/mod.rs +++ b/src/record/runtime/mod.rs @@ -1,12 +1,12 @@ -mod array; -mod column; +pub(crate) mod array; mod record; mod record_ref; +mod value; use arrow::datatypes::DataType; -pub use column::*; pub use record::*; pub use record_ref::*; +pub use value::*; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] pub enum Datatype { diff --git a/src/record/runtime/record.rs b/src/record/runtime/record.rs index 3c11eb6c..4902e4a7 100644 --- a/src/record/runtime/record.rs +++ b/src/record/runtime/record.rs @@ -1,58 +1,31 @@ -use std::{any::Any, collections::HashMap, sync::Arc}; +use std::{any::Any, sync::Arc}; -use arrow::datatypes::{DataType, Field, Schema}; use fusio::SeqRead; -use parquet::{format::SortingColumn, schema::types::ColumnPath}; -use super::{array::DynRecordImmutableArrays, Column, ColumnDesc, Datatype, DynRecordRef}; +use super::{Datatype, DynRecordRef, Value, ValueDesc}; use crate::{ - record::{Record, RecordDecodeError}, + record::{DynSchema, Record, RecordDecodeError}, serdes::{Decode, Encode}, }; #[derive(Debug)] pub struct DynRecord { - columns: Vec, + values: Vec, primary_index: usize, } #[allow(unused)] impl DynRecord { - pub fn new(columns: Vec, primary_index: usize) -> Self { + pub fn new(values: Vec, primary_index: usize) -> Self { Self { - columns, + values, primary_index, } } - - pub(crate) fn primary_key_index(&self) -> usize { - self.primary_index + 2 - } - - pub(crate) fn arrow_schema(&self) -> Arc { - let mut fields = vec![ - Field::new("_null", DataType::Boolean, false), - Field::new("_ts", DataType::UInt32, false), - ]; - - for (idx, col) in self.columns.iter().enumerate() { - if idx == self.primary_index && col.is_nullable { - panic!("Primary key must not be nullable") - } - let mut field = Field::from(col); - fields.push(field); - } - let mut metadata = HashMap::new(); - metadata.insert( - "primary_key_index".to_string(), - self.primary_index.to_string(), - ); - Arc::new(Schema::new_with_metadata(fields, metadata)) - } } impl DynRecord { - pub(crate) fn empty_record(column_descs: Vec, primary_index: usize) -> DynRecord { + pub(crate) fn empty_record(column_descs: Vec, primary_index: usize) -> DynRecord { let mut columns = vec![]; for desc in column_descs.iter() { let value: Arc = match desc.datatype { @@ -101,7 +74,7 @@ impl DynRecord { false => 
Arc::new(Vec::::default()), }, }; - columns.push(Column::new( + columns.push(Value::new( desc.datatype, desc.name.to_owned(), value, @@ -122,10 +95,10 @@ impl Decode for DynRecord { { let len = u32::decode(reader).await? as usize; let primary_index = u32::decode(reader).await? as usize; - let mut columns = vec![]; + let mut values = vec![]; // keep invariant for record: nullable --> Some(v); non-nullable --> v for i in 0..len { - let mut col = Column::decode(reader).await?; + let mut col = Value::decode(reader).await?; if i != primary_index && !col.is_nullable { match col.datatype { Datatype::UInt8 => { @@ -178,34 +151,24 @@ impl Decode for DynRecord { } } } - columns.push(col); + values.push(col); } Ok(DynRecord { - columns, + values, primary_index, }) } } impl Record for DynRecord { - type Columns = DynRecordImmutableArrays; - - type Key = Column; + type Schema = DynSchema; type Ref<'r> = DynRecordRef<'r>; - fn primary_key_index() -> usize { - unreachable!("This method is not used.") - } - - fn primary_key_path() -> (ColumnPath, Vec) { - unreachable!("This method is not used.") - } - fn as_record_ref(&self) -> Self::Ref<'_> { let mut columns = vec![]; - for (idx, col) in self.columns.iter().enumerate() { + for (idx, col) in self.values.iter().enumerate() { let datatype = col.datatype; let is_nullable = col.is_nullable; let mut value = col.value.clone(); @@ -255,7 +218,7 @@ impl Record for DynRecord { }; } - columns.push(Column::new( + columns.push(Value::new( datatype, col.name.to_owned(), value, @@ -265,12 +228,8 @@ impl Record for DynRecord { DynRecordRef::new(columns, self.primary_index) } - fn arrow_schema() -> &'static std::sync::Arc { - unreachable!("This method is not used.") - } - fn size(&self) -> usize { - self.columns.iter().fold(0, |acc, col| acc + col.size()) + self.values.iter().fold(0, |acc, col| acc + col.size()) } } @@ -279,19 +238,19 @@ pub(crate) mod test { use std::sync::Arc; use super::DynRecord; - use crate::record::{Column, ColumnDesc, Datatype}; + use crate::record::{Datatype, Value, ValueDesc}; #[allow(unused)] - pub(crate) fn test_dyn_item_schema() -> (Vec, usize) { + pub(crate) fn test_dyn_item_schema() -> (Vec, usize) { let descs = vec![ - ColumnDesc::new("id".to_string(), Datatype::Int64, false), - ColumnDesc::new("age".to_string(), Datatype::Int8, true), - ColumnDesc::new("height".to_string(), Datatype::Int16, true), - ColumnDesc::new("weight".to_string(), Datatype::Int32, false), - ColumnDesc::new("name".to_string(), Datatype::String, false), - ColumnDesc::new("email".to_string(), Datatype::String, true), - ColumnDesc::new("enabled".to_string(), Datatype::Boolean, false), - ColumnDesc::new("bytes".to_string(), Datatype::Bytes, true), + ValueDesc::new("id".to_string(), Datatype::Int64, false), + ValueDesc::new("age".to_string(), Datatype::Int8, true), + ValueDesc::new("height".to_string(), Datatype::Int16, true), + ValueDesc::new("weight".to_string(), Datatype::Int32, false), + ValueDesc::new("name".to_string(), Datatype::String, false), + ValueDesc::new("email".to_string(), Datatype::String, true), + ValueDesc::new("enabled".to_string(), Datatype::Boolean, false), + ValueDesc::new("bytes".to_string(), Datatype::Bytes, true), ]; (descs, 0) } @@ -301,44 +260,44 @@ pub(crate) mod test { let mut items = vec![]; for i in 0..50 { let mut columns = vec![ - Column::new(Datatype::Int64, "id".to_string(), Arc::new(i as i64), false), - Column::new( + Value::new(Datatype::Int64, "id".to_string(), Arc::new(i as i64), false), + Value::new( Datatype::Int8, 
"age".to_string(), Arc::new(Some(i as i8)), true, ), - Column::new( + Value::new( Datatype::Int16, "height".to_string(), Arc::new(Some(i as i16 * 20)), true, ), - Column::new( + Value::new( Datatype::Int32, "weight".to_string(), Arc::new(i * 200_i32), false, ), - Column::new( + Value::new( Datatype::String, "name".to_string(), Arc::new(i.to_string()), false, ), - Column::new( + Value::new( Datatype::String, "email".to_string(), Arc::new(Some(format!("{}@tonbo.io", i))), true, ), - Column::new( + Value::new( Datatype::Boolean, "enabled".to_string(), Arc::new(i % 2 == 0), false, ), - Column::new( + Value::new( Datatype::Bytes, "bytes".to_string(), Arc::new(Some(i.to_le_bytes().to_vec())), diff --git a/src/record/runtime/record_ref.rs b/src/record/runtime/record_ref.rs index a9525e8d..aa5f8bbf 100644 --- a/src/record/runtime/record_ref.rs +++ b/src/record/runtime/record_ref.rs @@ -3,28 +3,28 @@ use std::{any::Any, marker::PhantomData, mem, sync::Arc}; use arrow::{ array::{Array, ArrayRef, ArrowPrimitiveType, AsArray}, datatypes::{ - Int16Type, Int32Type, Int64Type, Int8Type, Schema, UInt16Type, UInt32Type, UInt64Type, - UInt8Type, + Int16Type, Int32Type, Int64Type, Int8Type, Schema as ArrowSchema, UInt16Type, UInt32Type, + UInt64Type, UInt8Type, }, }; use fusio::Write; -use super::{Column, Datatype, DynRecord}; +use super::{Datatype, DynRecord, Value}; use crate::{ - record::{internal::InternalRecordRef, Key, Record, RecordEncodeError, RecordRef}, + record::{internal::InternalRecordRef, Key, Record, RecordEncodeError, RecordRef, Schema}, serdes::Encode, }; #[derive(Clone)] pub struct DynRecordRef<'r> { - pub columns: Vec, + pub columns: Vec, // XXX: log encode should keep the same behavior pub primary_index: usize, _marker: PhantomData<&'r ()>, } impl<'r> DynRecordRef<'r> { - pub(crate) fn new(columns: Vec, primary_index: usize) -> Self { + pub(crate) fn new(columns: Vec, primary_index: usize) -> Self { Self { columns, primary_index, @@ -60,7 +60,7 @@ impl<'r> Encode for DynRecordRef<'r> { impl<'r> RecordRef<'r> for DynRecordRef<'r> { type Record = DynRecord; - fn key(self) -> <::Key as Key>::Ref<'r> { + fn key(self) -> <<::Schema as Schema>::Key as Key>::Ref<'r> { self.columns .get(self.primary_index) .cloned() @@ -71,7 +71,7 @@ impl<'r> RecordRef<'r> for DynRecordRef<'r> { record_batch: &'r arrow::array::RecordBatch, offset: usize, projection_mask: &'r parquet::arrow::ProjectionMask, - full_schema: &'r Arc, + full_schema: &'r Arc, ) -> InternalRecordRef<'r, Self> { let null = record_batch.column(0).as_boolean().value(offset); let metadata = full_schema.metadata(); @@ -98,7 +98,7 @@ impl<'r> RecordRef<'r> for DynRecordRef<'r> { .enumerate() .find(|(_idx, f)| field.contains(f)); if batch_field.is_none() { - columns.push(Column::with_none_value( + columns.push(Value::with_none_value( datatype, field.name().to_owned(), field.is_nullable(), @@ -197,7 +197,7 @@ impl<'r> RecordRef<'r> for DynRecordRef<'r> { } } }; - columns.push(Column::new( + columns.push(Value::new( datatype, field.name().to_owned(), value, diff --git a/src/record/runtime/column.rs b/src/record/runtime/value.rs similarity index 91% rename from src/record/runtime/column.rs rename to src/record/runtime/value.rs index 0b356a62..f1216b55 100644 --- a/src/record/runtime/column.rs +++ b/src/record/runtime/value.rs @@ -16,13 +16,13 @@ use crate::{ }; #[derive(Debug, Clone)] -pub struct ColumnDesc { +pub struct ValueDesc { pub datatype: Datatype, pub is_nullable: bool, pub name: String, } -impl ColumnDesc { +impl ValueDesc { pub fn 
new(name: String, datatype: Datatype, is_nullable: bool) -> Self { Self { name, @@ -30,17 +30,35 @@ impl ColumnDesc { is_nullable, } } + + pub(crate) fn arrow_field(&self) -> Field { + let arrow_type = match self.datatype { + Datatype::UInt8 => DataType::UInt8, + Datatype::UInt16 => DataType::UInt16, + Datatype::UInt32 => DataType::UInt32, + Datatype::UInt64 => DataType::UInt64, + Datatype::Int8 => DataType::Int8, + Datatype::Int16 => DataType::Int16, + Datatype::Int32 => DataType::Int32, + Datatype::Int64 => DataType::Int64, + Datatype::String => DataType::Utf8, + Datatype::Boolean => DataType::Boolean, + Datatype::Bytes => DataType::Binary, + }; + Field::new(&self.name, arrow_type, self.is_nullable) + } } #[derive(Clone)] -pub struct Column { +pub struct Value { pub datatype: Datatype, - pub value: Arc, pub is_nullable: bool, pub name: String, + + pub value: Arc, } -impl Column { +impl Value { pub fn new( datatype: Datatype, name: String, @@ -96,9 +114,9 @@ impl Column { } } -impl Eq for Column {} +impl Eq for Value {} -impl PartialOrd for Column { +impl PartialOrd for Value { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } @@ -106,7 +124,7 @@ impl PartialOrd for Column { macro_rules! implement_col { ([], $({$Type:ty, $Datatype:ident}), *) => { - impl Ord for Column { + impl Ord for Value { fn cmp(&self, other: &Self) -> std::cmp::Ordering { match self.datatype { $( @@ -119,7 +137,7 @@ macro_rules! implement_col { } } - impl PartialEq for Column { + impl PartialEq for Value { fn eq(&self, other: &Self) -> bool { self.datatype == other.datatype && self.is_nullable == other.is_nullable @@ -134,7 +152,7 @@ macro_rules! implement_col { } } - impl Hash for Column { + impl Hash for Value { fn hash(&self, state: &mut H) { match self.datatype { $( @@ -144,9 +162,9 @@ macro_rules! implement_col { } } - impl Debug for Column { + impl Debug for Value { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let mut debug_struct = f.debug_struct("Column"); + let mut debug_struct = f.debug_struct("Value"); match self.datatype { $( Datatype::$Datatype => { @@ -171,8 +189,8 @@ macro_rules! implement_col { macro_rules! implement_key_col { ($({$Type:ident, $Datatype:ident, $Array:ident}), *) => { - impl Key for Column { - type Ref<'a> = Column; + impl Key for Value { + type Ref<'a> = Value; fn as_key_ref(&self) -> Self::Ref<'_> { self.clone() @@ -216,8 +234,8 @@ macro_rules! implement_key_col { } } -impl<'r> KeyRef<'r> for Column { - type Key = Column; +impl<'r> KeyRef<'r> for Value { + type Key = Value; fn to_key(self) -> Self::Key { self @@ -226,7 +244,7 @@ impl<'r> KeyRef<'r> for Column { macro_rules! implement_decode_col { ([], $({$Type:ty, $Datatype:ident}), *) => { - impl Decode for Column { + impl Decode for Value { type Error = fusio::Error; async fn decode(reader: &mut R) -> Result @@ -253,7 +271,7 @@ macro_rules! implement_decode_col { )* }; let name = String::decode(reader).await?; - Ok(Column { + Ok(Value { datatype, is_nullable, name, @@ -266,7 +284,7 @@ macro_rules! implement_decode_col { macro_rules! implement_encode_col { ([], $({$Type:ty, $Datatype:ident}), *) => { - impl Encode for Column { + impl Encode for Value { type Error = fusio::Error; async fn encode(&self, writer: &mut W) -> Result<(), Self::Error> @@ -319,7 +337,7 @@ macro_rules! 
implement_encode_col { } } -impl Column { +impl Value { fn tag(datatype: Datatype) -> u8 { match datatype { Datatype::UInt8 => 0, @@ -354,8 +372,8 @@ impl Column { } } -impl From<&Column> for Field { - fn from(col: &Column) -> Self { +impl From<&ValueDesc> for Field { + fn from(col: &ValueDesc) -> Self { match col.datatype { Datatype::UInt8 => Field::new(&col.name, DataType::UInt8, col.is_nullable), Datatype::UInt16 => Field::new(&col.name, DataType::UInt16, col.is_nullable), diff --git a/src/record/test.rs b/src/record/test.rs index c9816956..effc185e 100644 --- a/src/record/test.rs +++ b/src/record/test.rs @@ -5,12 +5,12 @@ use arrow::{ Array, AsArray, BooleanArray, BooleanBufferBuilder, RecordBatch, StringArray, StringBuilder, UInt32Array, UInt32Builder, }, - datatypes::{DataType, Field, Schema, UInt32Type}, + datatypes::{DataType, Field, Schema as ArrowSchema, UInt32Type}, }; use once_cell::sync::Lazy; use parquet::{arrow::ProjectionMask, format::SortingColumn, schema::types::ColumnPath}; -use super::{internal::InternalRecordRef, Key, Record, RecordRef}; +use super::{internal::InternalRecordRef, Key, Record, RecordRef, Schema}; use crate::{ inmem::immutable::{ArrowArrays, Builder}, timestamp::Timestamped, @@ -18,25 +18,33 @@ use crate::{ const PRIMARY_FIELD_NAME: &str = "vstring"; -impl Record for String { +#[derive(Debug)] +pub struct StringSchema; + +impl Schema for StringSchema { + type Record = String; + type Columns = StringColumns; - type Key = Self; + type Key = String; - type Ref<'r> - = &'r str - where - Self: 'r; + fn arrow_schema(&self) -> &Arc { + static SCHEMA: Lazy> = Lazy::new(|| { + Arc::new(ArrowSchema::new(vec![ + Field::new("_null", DataType::Boolean, false), + Field::new("_ts", DataType::UInt32, false), + Field::new(PRIMARY_FIELD_NAME, DataType::Utf8, false), + ])) + }); - fn key(&self) -> &str { - self + &SCHEMA } - fn primary_key_index() -> usize { + fn primary_key_index(&self) -> usize { 2 } - fn primary_key_path() -> (ColumnPath, Vec) { + fn primary_key_path(&self) -> (ColumnPath, Vec) { ( ColumnPath::new(vec!["_ts".to_string(), PRIMARY_FIELD_NAME.to_string()]), vec![ @@ -45,21 +53,22 @@ impl Record for String { ], ) } +} - fn as_record_ref(&self) -> Self::Ref<'_> { +impl Record for String { + type Schema = StringSchema; + + type Ref<'r> + = &'r str + where + Self: 'r; + + fn key(&self) -> &str { self } - fn arrow_schema() -> &'static Arc { - static SCHEMA: Lazy> = Lazy::new(|| { - Arc::new(Schema::new(vec![ - Field::new("_null", DataType::Boolean, false), - Field::new("_ts", DataType::UInt32, false), - Field::new(PRIMARY_FIELD_NAME, DataType::Utf8, false), - ])) - }); - - &SCHEMA + fn as_record_ref(&self) -> Self::Ref<'_> { + self } fn size(&self) -> usize { @@ -70,7 +79,7 @@ impl Record for String { impl<'r> RecordRef<'r> for &'r str { type Record = String; - fn key(self) -> <::Key as Key>::Ref<'r> { + fn key(self) -> <<::Schema as Schema>::Key as Key>::Ref<'r> { self } @@ -80,7 +89,7 @@ impl<'r> RecordRef<'r> for &'r str { record_batch: &'r RecordBatch, offset: usize, _: &'r ProjectionMask, - _: &'r Arc, + _: &'r Arc, ) -> InternalRecordRef<'r, Self> { let ts = record_batch .column(1) @@ -108,7 +117,7 @@ impl ArrowArrays for StringColumns { type Builder = StringColumnsBuilder; - fn builder(_schema: &Arc, capacity: usize) -> Self::Builder { + fn builder(_schema: Arc, capacity: usize) -> Self::Builder { StringColumnsBuilder { _null: BooleanBufferBuilder::new(capacity), _ts: UInt32Builder::with_capacity(capacity), @@ -166,8 +175,9 @@ impl Builder for 
StringColumnsBuilder { let _ts = Arc::new(self._ts.finish()); let string = Arc::new(self.string.finish()); + let schema = StringSchema; let record_batch = RecordBatch::try_new( - ::Record::arrow_schema().clone(), + schema.arrow_schema().clone(), vec![ Arc::clone(&_null) as Arc, Arc::clone(&_ts) as Arc, diff --git a/src/stream/level.rs b/src/stream/level.rs index 81c42b2f..61b2e71e 100644 --- a/src/stream/level.rs +++ b/src/stream/level.rs @@ -19,7 +19,7 @@ use ulid::Ulid; use crate::{ fs::{FileId, FileType}, ondisk::{scan::SsTableScan, sstable::SsTable}, - record::Record, + record::{Record, Schema}, scope::Scope, stream::record_batch::RecordBatchEntry, timestamp::Timestamp, @@ -47,8 +47,8 @@ pub(crate) struct LevelStream<'level, R> where R: Record, { - lower: Bound<&'level R::Key>, - upper: Bound<&'level R::Key>, + lower: Bound<&'level ::Key>, + upper: Bound<&'level ::Key>, ts: Timestamp, level: usize, option: Arc>, @@ -72,7 +72,10 @@ where level: usize, start: usize, end: usize, - range: (Bound<&'level R::Key>, Bound<&'level R::Key>), + range: ( + Bound<&'level ::Key>, + Bound<&'level ::Key>, + ), ts: Timestamp, limit: Option, projection_mask: ProjectionMask, diff --git a/src/transaction.rs b/src/transaction.rs index 0293d76f..aa2df175 100644 --- a/src/transaction.rs +++ b/src/transaction.rs @@ -246,8 +246,8 @@ mod tests { executor::tokio::TokioExecutor, fs::manager::StoreManager, record::{ - runtime::{Column, Datatype, DynRecord}, - ColumnDesc, + runtime::{Datatype, DynRecord, Value}, + ValueDesc, }, tests::{build_db, build_schema, Test}, transaction::CommitError, @@ -789,9 +789,9 @@ mod tests { #[tokio::test] async fn test_dyn_record() { let descs = vec![ - ColumnDesc::new("age".to_string(), Datatype::Int8, false), - ColumnDesc::new("height".to_string(), Datatype::Int16, true), - ColumnDesc::new("weight".to_string(), Datatype::Int32, false), + ValueDesc::new("age".to_string(), Datatype::Int8, false), + ValueDesc::new("height".to_string(), Datatype::Int16, true), + ValueDesc::new("weight".to_string(), Datatype::Int32, false), ]; let temp_dir = TempDir::new().unwrap(); @@ -806,14 +806,14 @@ mod tests { db.insert(DynRecord::new( vec![ - Column::new(Datatype::Int8, "age".to_string(), Arc::new(1_i8), false), - Column::new( + Value::new(Datatype::Int8, "age".to_string(), Arc::new(1_i8), false), + Value::new( Datatype::Int16, "height".to_string(), Arc::new(Some(180_i16)), true, ), - Column::new( + Value::new( Datatype::Int32, "weight".to_string(), Arc::new(56_i32), @@ -827,7 +827,7 @@ mod tests { let txn = db.transaction().await; { - let key = Column::new(Datatype::Int8, "age".to_string(), Arc::new(1_i8), false); + let key = Value::new(Datatype::Int8, "age".to_string(), Arc::new(1_i8), false); let record_ref = txn.get(&key, Projection::All).await.unwrap(); assert!(record_ref.is_some()); diff --git a/src/version/mod.rs b/src/version/mod.rs index d1dda197..28d77cf5 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -19,7 +19,7 @@ use tracing::error; use crate::{ fs::{manager::StoreManager, FileId, FileType}, ondisk::sstable::SsTable, - record::Record, + record::{Record, Schema}, scope::Scope, serdes::Encode, stream::{level::LevelStream, record_batch::RecordBatchEntry, ScanStream}, @@ -44,7 +44,7 @@ where R: Record, { ts: Timestamp, - pub(crate) level_slice: [Vec>; MAX_LEVEL], + pub(crate) level_slice: [Vec::Key>>; MAX_LEVEL], clean_sender: Sender, option: Arc>, timestamp: Arc, @@ -119,7 +119,7 @@ where pub(crate) async fn query( &self, manager: &StoreManager, - key: &TimestampedRef, + 
key: &TimestampedRef<::Key>, projection_mask: ProjectionMask, parquet_lru: ParquetLru, ) -> Result>, VersionError> { @@ -181,7 +181,7 @@ where async fn table_query( &self, store: &Arc, - key: &TimestampedRef<::Key>, + key: &TimestampedRef<::Key>, level: usize, gen: FileId, projection_mask: ProjectionMask, @@ -201,7 +201,10 @@ where .map_err(VersionError::Parquet) } - pub(crate) fn scope_search(key: &R::Key, level: &[Scope]) -> usize { + pub(crate) fn scope_search( + key: &::Key, + level: &[Scope<::Key>], + ) -> usize { level .binary_search_by(|scope| scope.min.cmp(key)) .unwrap_or_else(|index| index.saturating_sub(1)) @@ -216,7 +219,10 @@ where &self, manager: &StoreManager, streams: &mut Vec>, - range: (Bound<&'streams R::Key>, Bound<&'streams R::Key>), + range: ( + Bound<&'streams ::Key>, + Bound<&'streams ::Key>, + ), ts: Timestamp, limit: Option, projection_mask: ProjectionMask, @@ -291,7 +297,7 @@ where Ok(()) } - pub(crate) fn to_edits(&self) -> Vec> { + pub(crate) fn to_edits(&self) -> Vec::Key>> { let mut edits = Vec::new(); for (level, scopes) in self.level_slice.iter().enumerate() { @@ -325,7 +331,7 @@ where R: Record, { #[error("version encode error: {0}")] - Encode(#[source] ::Error), + Encode(#[source] <::Key as Encode>::Error), #[error("version io error: {0}")] Io(#[from] std::io::Error), #[error("version parquet error: {0}")] diff --git a/src/version/set.rs b/src/version/set.rs index 14fbe048..e4050d47 100644 --- a/src/version/set.rs +++ b/src/version/set.rs @@ -16,7 +16,7 @@ use futures_util::StreamExt; use super::{TransactionTs, MAX_LEVEL}; use crate::{ fs::{manager::StoreManager, parse_file_id, FileId, FileType}, - record::Record, + record::{Record, Schema}, serdes::Encode, timestamp::Timestamp, version::{cleaner::CleanTag, edit::VersionEdit, Version, VersionError, VersionRef}, @@ -176,7 +176,7 @@ where pub(crate) async fn apply_edits( &self, - mut version_edits: Vec>, + mut version_edits: Vec::Key>>, delete_gens: Option>, is_recover: bool, ) -> Result<(), VersionError> { diff --git a/src/wal/mod.rs b/src/wal/mod.rs index 3fdb9d60..83db336c 100644 --- a/src/wal/mod.rs +++ b/src/wal/mod.rs @@ -13,7 +13,7 @@ use thiserror::Error; use crate::{ fs::FileId, - record::{Key, Record}, + record::{Key, Record, Schema}, serdes::{Decode, Encode}, timestamp::Timestamped, wal::{log::LogType, record_entry::RecordEntry}, @@ -48,7 +48,7 @@ where pub(crate) async fn write<'r>( &mut self, log_ty: LogType, - key: Timestamped<::Ref<'r>>, + key: Timestamped<<::Key as Key>::Ref<'r>>, value: Option>, ) -> Result<(), as Encode>::Error> { let mut writer = HashWriter::new(&mut self.file); @@ -73,7 +73,7 @@ where &mut self, ) -> impl Stream< Item = Result< - (LogType, Timestamped, Option), + (LogType, Timestamped<::Key>, Option), RecoverError<::Error>, >, > + '_ { diff --git a/src/wal/record_entry.rs b/src/wal/record_entry.rs index 015ed317..07929553 100644 --- a/src/wal/record_entry.rs +++ b/src/wal/record_entry.rs @@ -1,7 +1,7 @@ use fusio::{SeqRead, Write}; use crate::{ - record::{Key, Record}, + record::{Key, Record, Schema}, serdes::{Decode, Encode}, timestamp::Timestamped, }; @@ -10,8 +10,13 @@ pub(crate) enum RecordEntry<'r, R> where R: Record, { - Encode((Timestamped<::Ref<'r>>, Option>)), - Decode((Timestamped, Option)), + Encode( + ( + Timestamped<<::Key as Key>::Ref<'r>>, + Option>, + ), + ), + Decode((Timestamped<::Key>, Option)), } impl Encode for RecordEntry<'_, R> @@ -51,7 +56,9 @@ where where R: SeqRead, { - let key = Timestamped::::decode(reader).await.unwrap(); + let key = 
Timestamped::<::Key>::decode(reader) + .await + .unwrap(); let record = Option::::decode(reader).await.unwrap(); Ok(RecordEntry::Decode((key, record))) diff --git a/tests/wasm.rs b/tests/wasm.rs index 2d0007f6..b700334f 100644 --- a/tests/wasm.rs +++ b/tests/wasm.rs @@ -7,7 +7,7 @@ mod tests { use futures::StreamExt; use tonbo::{ executor::opfs::OpfsExecutor, - record::{Column, ColumnDesc, Datatype, DynRecord, Record}, + record::{Datatype, DynRecord, Record, Value, ValueDesc}, DbOption, Projection, DB, }; use wasm_bindgen_test::wasm_bindgen_test; From bd68babbeaa8560a5c728f2cf665e290a028c4fe Mon Sep 17 00:00:00 2001 From: Kould Date: Wed, 27 Nov 2024 15:31:05 +0800 Subject: [PATCH 2/9] chore: fix Schema --- examples/datafusion.rs | 25 +- examples/declare.rs | 9 +- src/compaction/mod.rs | 209 ++++++++++------ src/inmem/immutable.rs | 1 + src/inmem/mutable.rs | 12 +- src/lib.rs | 289 ++++++++++++---------- src/ondisk/sstable.rs | 10 +- src/option.rs | 8 +- src/record/mod.rs | 25 +- src/record/runtime/record.rs | 6 +- src/snapshot.rs | 23 +- src/stream/level.rs | 22 +- src/stream/mem_projection.rs | 20 +- src/stream/merge.rs | 41 ++- src/stream/mod.rs | 21 +- src/stream/package.rs | 37 ++- src/stream/record_batch.rs | 8 +- src/transaction.rs | 91 ++++--- src/version/cleaner.rs | 6 +- src/version/set.rs | 17 +- tests/data_integrity.rs | 9 +- tests/macros_correctness.rs | 59 +++-- tonbo_macros/src/record.rs | 134 ++++++---- tonbo_macros/src/utils/ident_generator.rs | 6 + 24 files changed, 691 insertions(+), 397 deletions(-) diff --git a/examples/datafusion.rs b/examples/datafusion.rs index 2b7dd167..d7a9a940 100644 --- a/examples/datafusion.rs +++ b/examples/datafusion.rs @@ -28,7 +28,10 @@ use futures_core::Stream; use futures_util::StreamExt; use tokio::fs; use tonbo::{ - executor::tokio::TokioExecutor, inmem::immutable::ArrowArrays, record::Record, DbOption, DB, + executor::tokio::TokioExecutor, + inmem::immutable::ArrowArrays, + record::{Record, Schema}, + DbOption, DB, }; use tonbo_macros::Record; @@ -49,7 +52,10 @@ struct MusicExec { db: Arc>, projection: Option>, limit: Option, - range: (Bound<::Key>, Bound<::Key>), + range: ( + Bound<::Key>, + Bound<::Key>, + ), } struct MusicStream { @@ -63,7 +69,7 @@ impl TableProvider for MusicProvider { } fn schema(&self) -> SchemaRef { - Music::arrow_schema().clone() + MusicSchema {}.arrow_schema().clone() } fn table_type(&self) -> TableType { @@ -96,7 +102,7 @@ impl TableProvider for MusicProvider { impl MusicExec { fn new(db: Arc>, projection: Option<&Vec>) -> Self { - let schema = Music::arrow_schema(); + let schema = MusicSchema {}.arrow_schema(); let schema = if let Some(projection) = &projection { Arc::new(schema.project(projection).unwrap()) } else { @@ -127,7 +133,7 @@ impl Stream for MusicStream { impl RecordBatchStream for MusicStream { fn schema(&self) -> SchemaRef { - Music::arrow_schema().clone() + MusicSchema {}.arrow_schema().clone() } } @@ -215,9 +221,14 @@ async fn main() -> Result<()> { // make sure the path exists let _ = fs::create_dir_all("./db_path/music").await; - let options = DbOption::from(Path::from_filesystem_path("./db_path/music").unwrap()); + let options = DbOption::from(( + Path::from_filesystem_path("./db_path/music").unwrap(), + &MusicSchema, + )); - let db = DB::new(options, TokioExecutor::default()).await.unwrap(); + let db = DB::new(options, TokioExecutor::default(), MusicSchema) + .await + .unwrap(); for (id, name, like) in [ (0, "welcome".to_string(), 0), (1, "tonbo".to_string(), 999), diff --git 
a/examples/declare.rs b/examples/declare.rs index 6e6edcd3..310adcfc 100644 --- a/examples/declare.rs +++ b/examples/declare.rs @@ -22,9 +22,14 @@ async fn main() { // make sure the path exists let _ = fs::create_dir_all("./db_path/users").await; - let options = DbOption::from(Path::from_filesystem_path("./db_path/users").unwrap()); + let options = DbOption::from(( + Path::from_filesystem_path("./db_path/users").unwrap(), + &UserSchema, + )); // pluggable async runtime and I/O - let db = DB::new(options, TokioExecutor::default()).await.unwrap(); + let db = DB::new(options, TokioExecutor::default(), UserSchema) + .await + .unwrap(); // insert with owned value db.insert(User { diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs index 51b41713..cd720469 100644 --- a/src/compaction/mod.rs +++ b/src/compaction/mod.rs @@ -15,7 +15,7 @@ use crate::{ mutable::Mutable, }, ondisk::sstable::SsTable, - record::{KeyRef, Record, RecordInstance}, + record::{KeyRef, Record, Schema as RecordSchema}, scope::Scope, stream::{level::LevelStream, merge::MergeStream, ScanStream}, transaction::CommitError, @@ -39,6 +39,7 @@ where pub(crate) schema: Arc>>, pub(crate) version_set: VersionSet, pub(crate) manager: Arc, + pub(crate) record_schema: Arc, } impl Compactor @@ -47,6 +48,7 @@ where { pub(crate) fn new( schema: Arc>>, + record_schema: Arc, option: Arc>, version_set: VersionSet, manager: Arc, @@ -56,6 +58,7 @@ where schema, version_set, manager, + record_schema, } } @@ -74,9 +77,15 @@ where let trigger_clone = guard.trigger.clone(); let mutable = mem::replace( &mut guard.mutable, - Mutable::new(&self.option, trigger_clone, self.manager.base_fs()).await?, + Mutable::new( + &self.option, + trigger_clone, + self.manager.base_fs(), + self.record_schema.clone(), + ) + .await?, ); - let (file_id, immutable) = mutable.into_immutable(&guard.record_instance).await?; + let (file_id, immutable) = mutable.into_immutable().await?; guard.immutables.push((file_id, immutable)); if guard.immutables.len() > self.option.immutable_chunk_max_num { @@ -91,7 +100,7 @@ where &self.option, recover_wal_ids, excess, - &guard.record_instance, + &guard.record_schema, &self.manager, ) .await? 
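For orientation while reading these compaction hunks: the `RecordInstance` that used to travel with the memtable is replaced by a user-supplied schema value (`Arc<R::Schema>`), and key types are now spelled `<R::Schema as Schema>::Key`. Below is a minimal hand-written `Schema` implementation, mirroring the `TestSchema`/`StringSchema` test impls added in PATCH 1; the `User`, `UserSchema`, and `UserImmutableArrays` names are illustrative assumptions, not part of the patch.

    use std::sync::Arc;

    use arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
    use once_cell::sync::Lazy;
    use parquet::{format::SortingColumn, schema::types::ColumnPath};

    pub struct UserSchema;

    impl Schema for UserSchema {
        type Record = User;                 // the record type this schema describes
        type Columns = UserImmutableArrays; // its immutable Arrow representation
        type Key = String;                  // type of the primary key column

        fn arrow_schema(&self) -> &Arc<ArrowSchema> {
            // _null and _ts are tonbo's two bookkeeping columns; user columns follow.
            static SCHEMA: Lazy<Arc<ArrowSchema>> = Lazy::new(|| {
                Arc::new(ArrowSchema::new(vec![
                    Field::new("_null", DataType::Boolean, false),
                    Field::new("_ts", DataType::UInt32, false),
                    Field::new("name", DataType::Utf8, false),
                ]))
            });
            &SCHEMA
        }

        fn primary_key_index(&self) -> usize {
            2 // offset past _null and _ts
        }

        fn primary_key_path(&self) -> (ColumnPath, Vec<SortingColumn>) {
            (
                ColumnPath::new(vec!["_ts".to_string(), "name".to_string()]),
                vec![
                    SortingColumn::new(1, true, true),  // _ts, descending
                    SortingColumn::new(2, false, true), // primary key, ascending
                ],
            )
        }
    }
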
@@ -108,7 +117,7 @@ where &scope.max, &mut version_edits, &mut delete_gens, - &guard.record_instance, + &guard.record_schema, &self.manager, parquet_lru, ) @@ -133,10 +142,13 @@ where pub(crate) async fn minor_compaction( option: &DbOption, recover_wal_ids: Option>, - batches: &[(Option, Immutable)], - instance: &RecordInstance, + batches: &[( + Option, + Immutable<::Columns>, + )], + schema: &R::Schema, manager: &StoreManager, - ) -> Result>, CompactionError> { + ) -> Result::Key>>, CompactionError> { if !batches.is_empty() { let level_0_path = option.level_fs_path(0).unwrap_or(&option.base_path); let level_0_fs = manager.get_fs(level_0_path); @@ -156,7 +168,7 @@ where ) .await?, ), - instance.arrow_schema::().clone(), + schema.arrow_schema().clone(), Some(option.write_parquet_properties.clone()), )?; @@ -192,11 +204,11 @@ where pub(crate) async fn major_compaction( version: &Version, option: &DbOption, - mut min: &R::Key, - mut max: &R::Key, - version_edits: &mut Vec>, + mut min: &::Key, + mut max: &::Key, + version_edits: &mut Vec::Key>>, delete_gens: &mut Vec<(FileId, usize)>, - instance: &RecordInstance, + instance: &R::Schema, manager: &StoreManager, parquet_lru: ParquetLru, ) -> Result<(), CompactionError> { @@ -308,11 +320,18 @@ where fn next_level_scopes<'a>( version: &'a Version, - min: &mut &'a ::Key, - max: &mut &'a ::Key, + min: &mut &'a ::Key, + max: &mut &'a ::Key, level: usize, - meet_scopes_l: &[&'a Scope<::Key>], - ) -> Result<(Vec<&'a Scope<::Key>>, usize, usize), CompactionError> { + meet_scopes_l: &[&'a Scope<::Key>], + ) -> Result< + ( + Vec<&'a Scope<::Key>>, + usize, + usize, + ), + CompactionError, + > { let mut meet_scopes_ll = Vec::new(); let mut start_ll = 0; let mut end_ll = 0; @@ -348,10 +367,14 @@ where fn this_level_scopes<'a>( version: &'a Version, - min: &::Key, - max: &::Key, + min: &::Key, + max: &::Key, level: usize, - ) -> (Vec<&'a Scope<::Key>>, usize, usize) { + ) -> ( + Vec<&'a Scope<::Key>>, + usize, + usize, + ) { let mut meet_scopes_l = Vec::new(); let mut start_l = Version::::scope_search(min, &version.level_slice[level]); let mut end_l = start_l; @@ -386,16 +409,17 @@ where async fn build_tables<'scan>( option: &DbOption, - version_edits: &mut Vec::Key>>, + version_edits: &mut Vec::Key>>, level: usize, streams: Vec>, - instance: &RecordInstance, + schema: &R::Schema, fs: &Arc, ) -> Result<(), CompactionError> { let mut stream = MergeStream::::from_vec(streams, u32::MAX.into()).await?; // Kould: is the capacity parameter necessary? 
- let mut builder = R::Columns::builder(&instance.arrow_schema::(), 8192); + let mut builder = + ::Columns::builder(schema.arrow_schema().clone(), 8192); let mut min = None; let mut max = None; @@ -417,7 +441,7 @@ where &mut builder, &mut min, &mut max, - instance, + schema, fs, ) .await?; @@ -431,7 +455,7 @@ where &mut builder, &mut min, &mut max, - instance, + schema, fs, ) .await?; @@ -440,8 +464,14 @@ where } fn full_scope<'a>( - meet_scopes: &[&'a Scope<::Key>], - ) -> Result<(&'a ::Key, &'a ::Key), CompactionError> { + meet_scopes: &[&'a Scope<::Key>], + ) -> Result< + ( + &'a ::Key, + &'a ::Key, + ), + CompactionError, + > { let lower = &meet_scopes.first().ok_or(CompactionError::EmptyLevel)?.min; let upper = &meet_scopes.last().ok_or(CompactionError::EmptyLevel)?.max; Ok((lower, upper)) @@ -450,12 +480,12 @@ where #[allow(clippy::too_many_arguments)] async fn build_table( option: &DbOption, - version_edits: &mut Vec>, + version_edits: &mut Vec::Key>>, level: usize, - builder: &mut ::Builder, - min: &mut Option, - max: &mut Option, - instance: &RecordInstance, + builder: &mut <::Columns as ArrowArrays>::Builder, + min: &mut Option<::Key>, + max: &mut Option<::Key>, + schema: &R::Schema, fs: &Arc, ) -> Result<(), CompactionError> { debug_assert!(min.is_some()); @@ -471,7 +501,7 @@ where ) .await?, ), - instance.arrow_schema::().clone(), + schema.arrow_schema().clone(), Some(option.write_parquet_properties.clone()), )?; writer.write(columns.as_record_batch()).await?; @@ -526,8 +556,11 @@ pub(crate) mod tests { compaction::Compactor, executor::tokio::TokioExecutor, fs::{manager::StoreManager, FileId, FileType}, - inmem::{immutable::Immutable, mutable::Mutable}, - record::{Datatype, DynRecord, Record, RecordInstance, Value, ValueDesc}, + inmem::{ + immutable::{tests::TestSchema, Immutable}, + mutable::Mutable, + }, + record::{Datatype, DynRecord, DynSchema, Record, Schema, Value, ValueDesc}, scope::Scope, tests::Test, timestamp::Timestamp, @@ -540,34 +573,34 @@ pub(crate) mod tests { async fn build_immutable( option: &DbOption, records: Vec<(LogType, R, Timestamp)>, - instance: &RecordInstance, + schema: &Arc, fs: &Arc, - ) -> Result, DbError> + ) -> Result::Columns>, DbError> where R: Record + Send, { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mutable: Mutable = Mutable::new(option, trigger, fs).await?; + let mutable: Mutable = Mutable::new(option, trigger, fs, schema.clone()).await?; for (log_ty, record, ts) in records { let _ = mutable.insert(log_ty, record, ts).await?; } - Ok(Immutable::from((mutable.data, instance))) + Ok(Immutable::new(mutable.data, schema.arrow_schema().clone())) } pub(crate) async fn build_parquet_table( option: &DbOption, gen: FileId, records: Vec<(LogType, R, Timestamp)>, - instance: &RecordInstance, + schema: &Arc, level: usize, fs: &Arc, ) -> Result<(), DbError> where R: Record + Send, { - let immutable = build_immutable::(option, records, instance, fs).await?; + let immutable = build_immutable::(option, records, schema, fs).await?; let mut writer = AsyncArrowWriter::try_new( AsyncWriter::new( fs.open_options( @@ -576,7 +609,7 @@ pub(crate) mod tests { ) .await?, ), - R::arrow_schema().clone(), + schema.arrow_schema().clone(), None, )?; writer.write(immutable.as_record_batch()).await?; @@ -590,13 +623,16 @@ pub(crate) mod tests { let temp_dir = tempfile::tempdir().unwrap(); let temp_dir_l0 = tempfile::tempdir().unwrap(); - let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()) - .level_path( - 0, - 
Path::from_filesystem_path(temp_dir_l0.path()).unwrap(), - FsOptions::Local, - ) - .unwrap(); + let option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &TestSchema, + )) + .level_path( + 0, + Path::from_filesystem_path(temp_dir_l0.path()).unwrap(), + FsOptions::Local, + ) + .unwrap(); let manager = StoreManager::new(option.base_fs.clone(), option.level_paths.clone()).unwrap(); manager @@ -636,7 +672,7 @@ pub(crate) mod tests { 0.into(), ), ], - &RecordInstance::Normal, + &Arc::new(TestSchema), manager.base_fs(), ) .await @@ -673,7 +709,7 @@ pub(crate) mod tests { 0.into(), ), ], - &RecordInstance::Normal, + &Arc::new(TestSchema), manager.base_fs(), ) .await @@ -686,7 +722,7 @@ pub(crate) mod tests { (Some(FileId::new()), batch_1), (Some(FileId::new()), batch_2), ], - &RecordInstance::Normal, + &TestSchema, &manager, ) .await @@ -711,11 +747,10 @@ pub(crate) mod tests { .await .unwrap(); - let empty_record = DynRecord::empty_record( + let instance = Arc::new(DynSchema::new( vec![ValueDesc::new("id".to_owned(), Datatype::Int32, false)], 0, - ); - let instance = RecordInstance::Runtime(empty_record); + )); let mut batch1_data = vec![]; let mut batch2_data = vec![]; @@ -772,19 +807,22 @@ pub(crate) mod tests { let temp_dir_l0 = TempDir::new().unwrap(); let temp_dir_l1 = TempDir::new().unwrap(); - let mut option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()) - .level_path( - 0, - Path::from_filesystem_path(temp_dir_l0.path()).unwrap(), - FsOptions::Local, - ) - .unwrap() - .level_path( - 1, - Path::from_filesystem_path(temp_dir_l1.path()).unwrap(), - FsOptions::Local, - ) - .unwrap(); + let mut option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &TestSchema, + )) + .level_path( + 0, + Path::from_filesystem_path(temp_dir_l0.path()).unwrap(), + FsOptions::Local, + ) + .unwrap() + .level_path( + 1, + Path::from_filesystem_path(temp_dir_l1.path()).unwrap(), + FsOptions::Local, + ) + .unwrap(); option.major_threshold_with_sst_size = 2; let option = Arc::new(option); let manager = @@ -802,7 +840,7 @@ pub(crate) mod tests { .unwrap(); let ((table_gen_1, table_gen_2, table_gen_3, table_gen_4, _), version) = - build_version(&option, &manager).await; + build_version(&option, &manager, &Arc::new(TestSchema)).await; let min = 2.to_string(); let max = 5.to_string(); @@ -815,7 +853,7 @@ pub(crate) mod tests { &max, &mut version_edits, &mut vec![], - &RecordInstance::Normal, + &TestSchema, &manager, Arc::new(NoCache::default()), ) @@ -853,6 +891,7 @@ pub(crate) mod tests { pub(crate) async fn build_version( option: &Arc>, manager: &StoreManager, + schema: &Arc, ) -> ((FileId, FileId, FileId, FileId, FileId), Version) { let level_0_fs = option .level_fs_path(0) @@ -898,7 +937,7 @@ pub(crate) mod tests { 0.into(), ), ], - &RecordInstance::Normal, + schema, 0, level_0_fs, ) @@ -936,7 +975,7 @@ pub(crate) mod tests { 0.into(), ), ], - &RecordInstance::Normal, + schema, 0, level_0_fs, ) @@ -979,7 +1018,7 @@ pub(crate) mod tests { 0.into(), ), ], - &RecordInstance::Normal, + schema, 1, level_1_fs, ) @@ -1017,7 +1056,7 @@ pub(crate) mod tests { 0.into(), ), ], - &RecordInstance::Normal, + schema, 1, level_1_fs, ) @@ -1055,7 +1094,7 @@ pub(crate) mod tests { 0.into(), ), ], - &RecordInstance::Normal, + schema, 1, level_1_fs, ) @@ -1112,7 +1151,10 @@ pub(crate) mod tests { pub(crate) async fn major_panic() { let temp_dir = TempDir::new().unwrap(); - let mut option = 
DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let mut option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &TestSchema, + )); option.major_threshold_with_sst_size = 1; option.level_sst_magnification = 1; let manager = @@ -1162,7 +1204,7 @@ pub(crate) mod tests { &option, table_gen0, records0, - &RecordInstance::Normal, + &Arc::new(TestSchema), 0, level_0_fs, ) @@ -1172,7 +1214,7 @@ pub(crate) mod tests { &option, table_gen1, records1, - &RecordInstance::Normal, + &Arc::new(TestSchema), 1, level_1_fs, ) @@ -1207,7 +1249,7 @@ pub(crate) mod tests { &max, &mut version_edits, &mut vec![], - &RecordInstance::Normal, + &TestSchema, &manager, Arc::new(NoCache::default()), ) @@ -1220,7 +1262,10 @@ pub(crate) mod tests { async fn test_flush_major_level_sort() { let temp_dir = TempDir::new().unwrap(); - let mut option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let mut option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &TestSchema, + )); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 0; option.major_threshold_with_sst_size = 2; @@ -1230,7 +1275,9 @@ pub(crate) mod tests { option.major_default_oldest_table_num = 1; option.trigger_type = TriggerType::Length(5); - let db: DB = DB::new(option, TokioExecutor::new()).await.unwrap(); + let db: DB = DB::new(option, TokioExecutor::new(), TestSchema) + .await + .unwrap(); for i in 5..9 { let item = Test { diff --git a/src/inmem/immutable.rs b/src/inmem/immutable.rs index c9af2021..077f9a05 100644 --- a/src/inmem/immutable.rs +++ b/src/inmem/immutable.rs @@ -236,6 +236,7 @@ pub(crate) mod tests { timestamp::timestamped::Timestamped, }; + #[derive(Debug)] pub struct TestSchema; impl Schema for TestSchema { diff --git a/src/inmem/mutable.rs b/src/inmem/mutable.rs index 49a78177..0ddfac83 100644 --- a/src/inmem/mutable.rs +++ b/src/inmem/mutable.rs @@ -234,11 +234,14 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &TestSchema, + )); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mem_table = Mutable::::new(&option, trigger, &fs, TestSchema) + let mem_table = Mutable::::new(&option, trigger, &fs, Arc::new(TestSchema {})) .await .unwrap(); @@ -284,7 +287,10 @@ mod tests { async fn range() { let temp_dir = tempfile::tempdir().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &StringSchema, + )); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); diff --git a/src/lib.rs b/src/lib.rs index 1ac0f054..bff78dcf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,9 +54,14 @@ //! // make sure the path exists //! let _ = fs::create_dir_all("./db_path/users").await; //! -//! let options = DbOption::from(Path::from_filesystem_path("./db_path/users").unwrap()); +//! let options = DbOption::from(( +//! Path::from_filesystem_path("./db_path/users").unwrap(), +//! &UserSchema, +//! )); //! // pluggable async runtime and I/O -//! let db = DB::new(options, TokioExecutor::default()).await.unwrap(); +//! 
let db = DB::new(options, TokioExecutor::default(), UserSchema)
+//!     .await
+//!     .unwrap();
 //! // insert with owned value
 //! db.insert(User {
 //!     name: "Alice".into(),
@@ -149,7 +154,7 @@ use parquet::{
     errors::ParquetError,
 };
 use parquet_lru::{DynLruCache, NoCache};
-use record::{DynRecord, Record, ValueDesc};
+use record::{DynRecord, Record};
 use thiserror::Error;
 use timestamp::{Timestamp, TimestampedRef};
 use tokio::sync::oneshot;
@@ -162,6 +167,7 @@ use crate::{
     compaction::{CompactTask, CompactionError, Compactor},
     executor::Executor,
     fs::{manager::StoreManager, parse_file_id, FileType},
+    record::{DynSchema, Schema as RecordSchema},
     serdes::Decode,
     snapshot::Snapshot,
     stream::{
@@ -195,22 +201,18 @@ where
     pub async fn with_schema(
         option: DbOption<DynRecord>,
         executor: E,
-        column_descs: Vec<ValueDesc>,
-        primary_index: usize,
+        schema: DynSchema,
     ) -> Result<Self, DbError<DynRecord>> {
         let option = Arc::new(option);
-        let instance =
-            RecordInstance::Runtime(DynRecord::empty_record(column_descs, primary_index));
-
-        Self::build(option, executor, instance, Arc::new(NoCache::default())).await
+        Self::build(option, executor, schema, Arc::new(NoCache::default())).await
     }
 }

 impl<R, E> DB<R, E>
 where
     R: Record + Send + Sync,
-    R::Columns: Send + Sync,
+    <R::Schema as RecordSchema>::Columns: Send + Sync,
     E: Executor + Send + Sync + 'static,
 {
     /// Open [`DB`] with a [`DbOption`]. This will create a new directory at the
     /// according to the configuration of [`DbOption`].
     ///
     /// For more configurable options, please refer to [`DbOption`].
-    pub async fn new(option: DbOption<R>, executor: E) -> Result<Self, DbError<R>> {
+    pub async fn new(
+        option: DbOption<R>,
+        executor: E,
+        schema: R::Schema,
+    ) -> Result<Self, DbError<R>> {
         Self::build(
             Arc::new(option),
             executor,
-            RecordInstance::Normal,
+            schema,
             Arc::new(NoCache::default()),
         )
         .await
@@ -232,15 +238,16 @@
 impl<R, E> DB<R, E>
 where
     R: Record + Send + Sync,
-    R::Columns: Send + Sync,
+    <R::Schema as RecordSchema>::Columns: Send + Sync,
     E: Executor + Send + Sync + 'static,
 {
     async fn build(
         option: Arc<DbOption<R>>,
         executor: E,
-        instance: RecordInstance,
+        schema: R::Schema,
         lru_cache: ParquetLru,
     ) -> Result<Self, DbError<R>> {
+        let record_schema = Arc::new(schema);
         let manager = Arc::new(StoreManager::new(
             option.base_fs.clone(),
             option.level_paths.clone(),
@@ -263,10 +270,18 @@ where
         let version_set = VersionSet::new(clean_sender, option.clone(), manager.clone()).await?;

         let schema = Arc::new(RwLock::new(
-            Schema::new(option.clone(), task_tx, &version_set, instance, &manager).await?,
+            Schema::new(
+                option.clone(),
+                task_tx,
+                &version_set,
+                record_schema.clone(),
+                &manager,
+            )
+            .await?,
         ));
         let mut compactor = Compactor::<R>::new(
             schema.clone(),
+            record_schema,
             option.clone(),
             version_set.clone(),
             manager.clone(),
@@ -344,7 +359,10 @@ where
     }

     /// delete the record with the primary key as the `key`
-    pub async fn remove(&self, key: R::Key) -> Result<bool, CommitError<R>> {
+    pub async fn remove(
+        &self,
+        key: <R::Schema as RecordSchema>::Key,
+    ) -> Result<bool, CommitError<R>> {
         Ok(self
             .schema
             .read()
@@ -368,7 +386,7 @@ where
     /// get the record with `key` as the primary key and process it using closure `f`
     pub async fn get<T>(
         &self,
-        key: &R::Key,
+        key: &<R::Schema as RecordSchema>::Key,
         mut f: impl FnMut(TransactionEntry<'_, R>) -> Option<T>,
     ) -> Result<Option<T>, CommitError<R>> {
         Ok(self
@@ -396,7 +414,10 @@ where
     /// scan records with primary keys in the `range` and process them using closure `f`
     pub async fn scan<'scan, T: 'scan>(
         &'scan self,
-        range: (Bound<&'scan R::Key>, Bound<&'scan R::Key>),
+        range: (
+            Bound<&'scan <R::Schema as RecordSchema>::Key>,
+            Bound<&'scan <R::Schema as RecordSchema>::Key>,
+        ),
         mut f: impl FnMut(TransactionEntry<'_, R>) -> T + 'scan,
     ) -> impl Stream<Item = Result<T, CommitError<R>>> + 'scan {
         stream!
{ @@ -469,11 +490,14 @@ where R: Record, { pub mutable: Mutable, - pub immutables: Vec<(Option, Immutable)>, + pub immutables: Vec<( + Option, + Immutable<::Columns>, + )>, compaction_tx: Sender, recover_wal_ids: Option>, trigger: Arc + Send + Sync>>, - record_instance: RecordInstance, + record_schema: Arc, } impl Schema @@ -484,17 +508,23 @@ where option: Arc>, compaction_tx: Sender, version_set: &VersionSet, - record_instance: RecordInstance, + record_schema: Arc, manager: &StoreManager, ) -> Result> { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); let mut schema = Schema { - mutable: Mutable::new(&option, trigger.clone(), manager.base_fs()).await?, + mutable: Mutable::new( + &option, + trigger.clone(), + manager.base_fs(), + record_schema.clone(), + ) + .await?, immutables: Default::default(), compaction_tx, recover_wal_ids: None, trigger, - record_instance, + record_schema, }; let base_fs = manager.base_fs(); @@ -574,7 +604,7 @@ where async fn remove( &self, log_ty: LogType, - key: R::Key, + key: ::Key, ts: Timestamp, ) -> Result> { self.mutable.remove(log_ty, key, ts).await @@ -582,7 +612,7 @@ where async fn recover_append( &self, - key: R::Key, + key: ::Key, ts: Timestamp, value: Option, ) -> Result> { @@ -593,12 +623,12 @@ where &'get self, version: &'get Version, manager: &StoreManager, - key: &'get R::Key, + key: &'get ::Key, ts: Timestamp, projection: Projection, parquet_lru: ParquetLru, ) -> Result>, DbError> { - let primary_key_index = self.record_instance.primary_key_index::(); + let primary_key_index = self.record_schema.primary_key_index(); let projection = match projection { Projection::All => ProjectionMask::all(), @@ -610,7 +640,7 @@ where fixed_projection.dedup(); ProjectionMask::roots( - &arrow_to_parquet_schema(&self.record_instance.arrow_schema::()).unwrap(), + &arrow_to_parquet_schema(self.record_schema.arrow_schema()).unwrap(), fixed_projection, ) } @@ -640,7 +670,7 @@ where .map(|entry| Entry::RecordBatch(entry))) } - fn check_conflict(&self, key: &R::Key, ts: Timestamp) -> bool { + fn check_conflict(&self, key: &::Key, ts: Timestamp) -> bool { self.mutable.check_conflict(key, ts) || self .immutables @@ -653,6 +683,10 @@ where self.mutable.flush_wal().await?; Ok(()) } + + pub(crate) fn record_schema(&self) -> &Arc { + &self.record_schema + } } /// scan configuration intermediate structure @@ -663,8 +697,8 @@ where { schema: &'scan Schema, manager: &'scan StoreManager, - lower: Bound<&'range R::Key>, - upper: Bound<&'range R::Key>, + lower: Bound<&'range ::Key>, + upper: Bound<&'range ::Key>, ts: Timestamp, version: &'scan Version, @@ -685,7 +719,10 @@ where fn new( schema: &'scan Schema, manager: &'scan StoreManager, - (lower, upper): (Bound<&'range R::Key>, Bound<&'range R::Key>), + (lower, upper): ( + Bound<&'range ::Key>, + Bound<&'range ::Key>, + ), ts: Timestamp, version: &'scan Version, fn_pre_stream: Box< @@ -722,13 +759,13 @@ where for p in &mut projection { *p += 2; } - let primary_key_index = self.schema.record_instance.primary_key_index::(); + let primary_key_index = self.schema.record_schema.primary_key_index(); let mut fixed_projection = vec![0, 1, primary_key_index]; fixed_projection.append(&mut projection); fixed_projection.dedup(); let mask = ProjectionMask::roots( - &arrow_to_parquet_schema(&self.schema.record_instance.arrow_schema::()).unwrap(), + &arrow_to_parquet_schema(self.schema.record_schema.arrow_schema()).unwrap(), fixed_projection.clone(), ); @@ -795,7 +832,10 @@ where pub async fn package( self, batch_size: usize, - 
) -> Result> + 'scan, DbError> { + ) -> Result< + impl Stream::Columns, ParquetError>> + 'scan, + DbError, + > { let mut streams = Vec::new(); let is_projection = self.projection_indices.is_some(); @@ -842,7 +882,7 @@ where batch_size, merge_stream, self.projection_indices, - &self.schema.record_instance, + self.schema.record_schema.arrow_schema().clone(), )) } } @@ -908,12 +948,15 @@ pub(crate) mod tests { compaction::{CompactTask, CompactionError, Compactor}, executor::{tokio::TokioExecutor, Executor}, fs::{manager::StoreManager, FileId}, - inmem::{immutable::tests::TestImmutableArrays, mutable::Mutable}, + inmem::{ + immutable::tests::{TestImmutableArrays, TestSchema}, + mutable::Mutable, + }, record::{ internal::InternalRecordRef, runtime::test::{test_dyn_item_schema, test_dyn_items}, - Datatype, DynRecord, RecordDecodeError, RecordEncodeError, RecordInstance, RecordRef, - Value, + Datatype, DynRecord, DynSchema, Key, RecordDecodeError, RecordEncodeError, RecordRef, + Schema as RecordSchema, Value, }, serdes::{Decode, Encode}, trigger::{TriggerFactory, TriggerType}, @@ -967,9 +1010,7 @@ pub(crate) mod tests { } impl Record for Test { - type Columns = TestImmutableArrays; - - type Key = String; + type Schema = TestSchema; type Ref<'r> = TestRef<'r> @@ -980,20 +1021,6 @@ pub(crate) mod tests { &self.vstring } - fn primary_key_index() -> usize { - 2 - } - - fn primary_key_path() -> (ColumnPath, Vec) { - ( - ColumnPath::new(vec!["_ts".to_string(), "vstring".to_string()]), - vec![ - SortingColumn::new(1, true, true), - SortingColumn::new(2, false, true), - ], - ) - } - fn as_record_ref(&self) -> Self::Ref<'_> { TestRef { vstring: &self.vstring, @@ -1002,20 +1029,6 @@ pub(crate) mod tests { } } - fn arrow_schema() -> &'static Arc { - static SCHEMA: Lazy> = Lazy::new(|| { - Arc::new(Schema::new(vec![ - Field::new("_null", DataType::Boolean, false), - Field::new("_ts", DataType::UInt32, false), - Field::new("vstring", DataType::Utf8, false), - Field::new("vu32", DataType::UInt32, false), - Field::new("vbool", DataType::Boolean, true), - ])) - }); - - &SCHEMA - } - fn size(&self) -> usize { let string_size = self.vstring.len(); let u32_size = mem::size_of::(); @@ -1071,7 +1084,7 @@ pub(crate) mod tests { impl<'r> RecordRef<'r> for TestRef<'r> { type Record = Test; - fn key(self) -> <::Key as crate::record::Key>::Ref<'r> { + fn key(self) -> <<::Schema as RecordSchema>::Key as Key>::Ref<'r> { self.vstring } @@ -1140,7 +1153,9 @@ pub(crate) mod tests { option: DbOption, executor: E, ) -> RecordBatch { - let db: DB = DB::new(option.clone(), executor).await.unwrap(); + let db: DB = DB::new(option.clone(), executor, TestSchema {}) + .await + .unwrap(); let base_fs = db.manager.base_fs(); db.write( @@ -1169,12 +1184,17 @@ pub(crate) mod tests { let trigger = schema.trigger.clone(); let mutable = mem::replace( &mut schema.mutable, - Mutable::new(&option, trigger, base_fs).await.unwrap(), + Mutable::new(&option, trigger, base_fs, Arc::new(TestSchema {})) + .await + .unwrap(), ); - Immutable::<::Columns>::from((mutable.data, &RecordInstance::Normal)) - .as_record_batch() - .clone() + Immutable::<::Columns>::new( + mutable.data, + TestSchema {}.arrow_schema().clone(), + ) + .as_record_batch() + .clone() } pub(crate) async fn build_schema( @@ -1183,7 +1203,7 @@ pub(crate) mod tests { ) -> Result<(crate::Schema, Receiver), fusio::Error> { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mutable = Mutable::new(&option, trigger.clone(), fs).await?; + let mutable = 
Mutable::new(&option, trigger.clone(), fs, Arc::new(TestSchema {})).await?; mutable .insert( @@ -1225,7 +1245,8 @@ pub(crate) mod tests { let immutables = { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mutable: Mutable = Mutable::new(&option, trigger.clone(), fs).await?; + let mutable: Mutable = + Mutable::new(&option, trigger.clone(), fs, Arc::new(TestSchema)).await?; mutable .insert( @@ -1266,7 +1287,7 @@ pub(crate) mod tests { vec![( Some(FileId::new()), - Immutable::from((mutable.data, &RecordInstance::Normal)), + Immutable::new(mutable.data, TestSchema {}.arrow_schema().clone()), )] }; @@ -1279,7 +1300,7 @@ pub(crate) mod tests { compaction_tx, recover_wal_ids: None, trigger, - record_instance: RecordInstance::Normal, + record_schema: Arc::new(TestSchema {}), }, compaction_rx, )) @@ -1290,12 +1311,13 @@ pub(crate) mod tests { compaction_rx: Receiver, executor: E, schema: crate::Schema, + record_schema: Arc, version: Version, manager: Arc, ) -> Result, DbError> where R: Record + Send + Sync, - R::Columns: Send + Sync, + ::Columns: Send + Sync, E: Executor + Send + Sync + 'static, { { @@ -1312,6 +1334,7 @@ pub(crate) mod tests { build_version_set(version, clean_sender, option.clone(), manager.clone()).await?; let mut compactor = Compactor::::new( schema.clone(), + record_schema, option.clone(), version_set.clone(), manager.clone(), @@ -1572,7 +1595,7 @@ pub(crate) mod tests { let path = Path::from_filesystem_path(temp_dir.path()).unwrap(); let path_l0 = Path::from_filesystem_path(temp_dir_l0.path()).unwrap(); - let mut option = DbOption::from(path) + let mut option = DbOption::from((path, &TestSchema)) .level_path(0, path_l0, FsOptions::Local) .unwrap(); option.immutable_chunk_num = 1; @@ -1583,7 +1606,9 @@ pub(crate) mod tests { option.major_default_oldest_table_num = 1; option.trigger_type = TriggerType::Length(/* max_mutable_len */ 5); - let db: DB = DB::new(option, TokioExecutor::new()).await.unwrap(); + let db: DB = DB::new(option, TokioExecutor::new(), TestSchema) + .await + .unwrap(); for (i, item) in test_items().into_iter().enumerate() { db.write(item, 0.into()).await.unwrap(); @@ -1610,7 +1635,10 @@ pub(crate) mod tests { async fn test_flush() { let temp_dir = TempDir::new().unwrap(); - let mut option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let mut option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &TestSchema, + )); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 1; option.major_threshold_with_sst_size = 3; @@ -1619,7 +1647,9 @@ pub(crate) mod tests { option.major_default_oldest_table_num = 1; option.trigger_type = TriggerType::Length(/* max_mutable_len */ 50); - let db: DB = DB::new(option, TokioExecutor::new()).await.unwrap(); + let db: DB = DB::new(option, TokioExecutor::new(), TestSchema) + .await + .unwrap(); for item in &test_items()[0..10] { db.write(item.clone(), 0.into()).await.unwrap(); @@ -1642,21 +1672,24 @@ pub(crate) mod tests { let temp_dir = TempDir::new().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = Arc::new(DbOption::from( + let option = Arc::new(DbOption::from(( Path::from_filesystem_path(temp_dir.path()).unwrap(), - )); + &TestSchema, + ))); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let (task_tx, _task_rx) = bounded(1); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); let schema: crate::Schema = crate::Schema { - mutable: Mutable::new(&option, trigger.clone(), &fs).await.unwrap(), + mutable: 
Mutable::new(&option, trigger.clone(), &fs, Arc::new(TestSchema)) + .await + .unwrap(), immutables: Default::default(), compaction_tx: task_tx.clone(), recover_wal_ids: None, trigger, - record_instance: RecordInstance::Normal, + record_schema: Arc::new(TestSchema), }; for (i, item) in test_items().into_iter().enumerate() { @@ -1668,9 +1701,10 @@ pub(crate) mod tests { schema.flush_wal().await.unwrap(); drop(schema); - let db: DB = DB::new(option.as_ref().to_owned(), TokioExecutor::new()) - .await - .unwrap(); + let db: DB = + DB::new(option.as_ref().to_owned(), TokioExecutor::new(), TestSchema) + .await + .unwrap(); let mut sort_items = BTreeMap::new(); for item in test_items() { @@ -1700,11 +1734,11 @@ pub(crate) mod tests { let temp_dir = TempDir::new().unwrap(); let manager = StoreManager::new(FsOptions::Local, vec![]).unwrap(); - let (desc, primary_key_index) = test_dyn_item_schema(); + let dyn_schema = Arc::new(test_dyn_item_schema()); let option = Arc::new(DbOption::with_path( Path::from_filesystem_path(temp_dir.path()).unwrap(), "id".to_owned(), - primary_key_index, + dyn_schema.primary_key_index(), )); manager .base_fs() @@ -1716,14 +1750,19 @@ pub(crate) mod tests { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); let schema: crate::Schema = crate::Schema { - mutable: Mutable::new(&option, trigger.clone(), manager.base_fs()) - .await - .unwrap(), + mutable: Mutable::new( + &option, + trigger.clone(), + manager.base_fs(), + dyn_schema.clone(), + ) + .await + .unwrap(), immutables: Default::default(), compaction_tx: task_tx.clone(), recover_wal_ids: None, trigger, - record_instance: RecordInstance::Normal, + record_schema: dyn_schema.clone(), }; for item in test_dyn_items().into_iter() { @@ -1738,10 +1777,11 @@ pub(crate) mod tests { let option = DbOption::with_path( Path::from_filesystem_path(temp_dir.path()).unwrap(), "id".to_owned(), - primary_key_index, + dyn_schema.primary_key_index(), ); + let dyn_schema = test_dyn_item_schema(); let db: DB = - DB::with_schema(option, TokioExecutor::new(), desc, primary_key_index) + DB::with_schema(option, TokioExecutor::new(), dyn_schema) .await .unwrap(); @@ -1774,13 +1814,18 @@ pub(crate) mod tests { async fn test_get_removed() { let temp_dir = TempDir::new().unwrap(); - let mut option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let mut option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &TestSchema, + )); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 1; option.major_threshold_with_sst_size = 3; option.major_default_oldest_table_num = 1; option.trigger_type = TriggerType::Length(5); - let db: DB = DB::new(option, TokioExecutor::new()).await.unwrap(); + let db: DB = DB::new(option, TokioExecutor::new(), TestSchema) + .await + .unwrap(); for (idx, item) in test_items().into_iter().enumerate() { if idx % 2 == 0 { @@ -1808,11 +1853,11 @@ pub(crate) mod tests { async fn test_read_write_dyn() { let temp_dir = TempDir::new().unwrap(); - let (cols_desc, primary_key_index) = test_dyn_item_schema(); + let dyn_schema = test_dyn_item_schema(); let mut option = DbOption::with_path( Path::from_filesystem_path(temp_dir.path()).unwrap(), "id".to_string(), - primary_key_index, + dyn_schema.primary_key_index(), ); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 1; @@ -1823,7 +1868,7 @@ pub(crate) mod tests { option.trigger_type = TriggerType::Length(5); let db: DB = - DB::with_schema(option, TokioExecutor::new(), cols_desc, primary_key_index) 
+ DB::with_schema(option, TokioExecutor::new(), dyn_schema) .await .unwrap(); @@ -2016,11 +2061,11 @@ pub(crate) mod tests { async fn test_dyn_multiple_db() { let temp_dir1 = TempDir::with_prefix("db1").unwrap(); - let (cols_desc, primary_key_index) = test_dyn_item_schema(); + let dyn_schema = test_dyn_item_schema(); let mut option = DbOption::with_path( Path::from_filesystem_path(temp_dir1.path()).unwrap(), "id".to_string(), - primary_key_index, + dyn_schema.primary_key_index(), ); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 1; @@ -2032,7 +2077,7 @@ pub(crate) mod tests { let mut option2 = DbOption::with_path( Path::from_filesystem_path(temp_dir2.path()).unwrap(), "id".to_string(), - primary_key_index, + dyn_schema.primary_key_index(), ); option2.immutable_chunk_num = 1; option2.immutable_chunk_max_num = 1; @@ -2044,7 +2089,7 @@ pub(crate) mod tests { let mut option3 = DbOption::with_path( Path::from_filesystem_path(temp_dir3.path()).unwrap(), "id".to_string(), - primary_key_index, + dyn_schema.primary_key_index(), ); option3.immutable_chunk_num = 1; option3.immutable_chunk_max_num = 1; @@ -2052,24 +2097,16 @@ pub(crate) mod tests { option3.major_default_oldest_table_num = 1; option3.trigger_type = TriggerType::Length(5); - let db1: DB = DB::with_schema( - option, - TokioExecutor::new(), - cols_desc.clone(), - primary_key_index, - ) - .await - .unwrap(); - let db2: DB = DB::with_schema( - option2, - TokioExecutor::new(), - cols_desc.clone(), - primary_key_index, - ) - .await - .unwrap(); + let db1: DB = + DB::with_schema(option, TokioExecutor::new(), test_dyn_item_schema()) + .await + .unwrap(); + let db2: DB = + DB::with_schema(option2, TokioExecutor::new(), test_dyn_item_schema()) + .await + .unwrap(); let db3: DB = - DB::with_schema(option3, TokioExecutor::new(), cols_desc, primary_key_index) + DB::with_schema(option3, TokioExecutor::new(), test_dyn_item_schema()) .await .unwrap(); diff --git a/src/ondisk/sstable.rs b/src/ondisk/sstable.rs index 32b32f71..c3c94cf9 100644 --- a/src/ondisk/sstable.rs +++ b/src/ondisk/sstable.rs @@ -193,7 +193,10 @@ pub(crate) mod tests { let manager = StoreManager::new(FsOptions::Local, vec![]).unwrap(); let base_fs = manager.base_fs(); let record_batch = get_test_record_batch::( - DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()), + DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &TestSchema, + )), TokioExecutor::new(), ) .await; @@ -268,7 +271,10 @@ pub(crate) mod tests { let manager = StoreManager::new(FsOptions::Local, vec![]).unwrap(); let base_fs = manager.base_fs(); let record_batch = get_test_record_batch::( - DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()), + DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &TestSchema, + )), TokioExecutor::new(), ) .await; diff --git a/src/option.rs b/src/option.rs index 78cb85e9..ef7fde43 100644 --- a/src/option.rs +++ b/src/option.rs @@ -14,7 +14,7 @@ use parquet::{ use crate::{ fs::{FileId, FileType}, - record::Record, + record::{Record, Schema}, trigger::TriggerType, version::{Version, MAX_LEVEL}, DbError, @@ -96,13 +96,13 @@ where } } -impl From for DbOption +impl From<(Path, &R::Schema)> for DbOption where R: Record, { /// build the default configured [`DbOption`] based on the passed path - fn from(base_path: Path) -> Self { - let (column_paths, sorting_columns) = R::primary_key_path(); + fn from((base_path, schema): (Path, &R::Schema)) -> Self { + let (column_paths, sorting_columns) = 
schema.primary_key_path();
         DbOption {
             immutable_chunk_num: 3,
             immutable_chunk_max_num: 5,
diff --git a/src/record/mod.rs b/src/record/mod.rs
index 127180ee..497d1dd4 100644
--- a/src/record/mod.rs
+++ b/src/record/mod.rs
@@ -4,10 +4,13 @@ pub mod runtime;
 #[cfg(test)]
 pub(crate) mod test;

-use std::{error::Error, fmt::Debug, io, sync::Arc};
+use std::{collections::HashMap, error::Error, fmt::Debug, io, sync::Arc};

 use array::DynRecordImmutableArrays;
-use arrow::{array::RecordBatch, datatypes::Schema as ArrowSchema};
+use arrow::{
+    array::RecordBatch,
+    datatypes::{DataType, Field, Schema as ArrowSchema},
+};
 use internal::InternalRecordRef;
 pub use key::{Key, KeyRef};
 use parquet::{arrow::ProjectionMask, format::SortingColumn, schema::types::ColumnPath};
@@ -48,7 +51,7 @@ use crate::{
 // }
 // }

-pub trait Schema {
+pub trait Schema: Debug + Send + Sync {
     type Record: Record<Schema = Self>;

     type Columns: ArrowArrays<Record = Self::Record>;
@@ -71,11 +74,17 @@ impl DynSchema {
     pub fn new(schema: Vec<ValueDesc>, primary_index: usize) -> Self {
-        let arrow_schema = Arc::new(ArrowSchema::new(
-            schema
-                .iter()
-                .map(|desc| desc.arrow_field())
-                .collect::<Vec<_>>(),
+        let mut metadata = HashMap::new();
+        metadata.insert("primary_key_index".to_string(), primary_index.to_string());
+        let arrow_schema = Arc::new(ArrowSchema::new_with_metadata(
+            [
+                Field::new("_null", DataType::Boolean, false),
+                Field::new("_ts", DataType::UInt32, false),
+            ]
+            .into_iter()
+            .chain(schema.iter().map(|desc| desc.arrow_field()))
+            .collect::<Vec<_>>(),
+            metadata,
         ));
         Self {
             schema,
diff --git a/src/record/runtime/record.rs b/src/record/runtime/record.rs
index 4902e4a7..6b921619 100644
--- a/src/record/runtime/record.rs
+++ b/src/record/runtime/record.rs
@@ -238,10 +238,10 @@ pub(crate) mod test {
     use std::sync::Arc;

     use super::DynRecord;
-    use crate::record::{Datatype, Value, ValueDesc};
+    use crate::record::{Datatype, DynSchema, Value, ValueDesc};

     #[allow(unused)]
-    pub(crate) fn test_dyn_item_schema() -> (Vec<ValueDesc>, usize) {
+    pub(crate) fn test_dyn_item_schema() -> DynSchema {
         let descs = vec![
             ValueDesc::new("id".to_string(), Datatype::Int64, false),
@@ -252,7 +252,7 @@ pub(crate) mod test {
             ValueDesc::new("enabled".to_string(), Datatype::Boolean, false),
             ValueDesc::new("bytes".to_string(), Datatype::Bytes, true),
         ];
-        (descs, 0)
+        DynSchema::new(descs, 0)
     }
diff --git a/src/snapshot.rs b/src/snapshot.rs
index c548d010..1782dd8c 100644
--- a/src/snapshot.rs
+++ b/src/snapshot.rs
@@ -5,7 +5,7 @@ use parquet::arrow::ProjectionMask;

 use crate::{
     fs::manager::StoreManager,
-    record::Record,
+    record::{Record, Schema as RecordSchema},
     stream,
     stream::ScanStream,
     timestamp::Timestamp,
@@ -30,7 +30,7 @@ where
 {
     pub async fn get<'get>(
         &'get self,
-        key: &'get R::Key,
+        key: &'get <R::Schema as RecordSchema>::Key,
         projection: Projection,
     ) -> Result<Option<Entry<'get, R>>, DbError<R>> {
         Ok(self
@@ -55,7 +55,10 @@ where

     pub fn scan<'scan, 'range>(
         &'scan self,
-        range: (Bound<&'range R::Key>, Bound<&'range R::Key>),
+        range: (
+            Bound<&'range <R::Schema as RecordSchema>::Key>,
+            Bound<&'range <R::Schema as RecordSchema>::Key>,
+        ),
     ) -> Scan<'scan, 'range, R> {
         Scan::new(
             &self.share,
@@ -97,7 +100,10 @@ where

     pub(crate) fn _scan<'scan, 'range>(
         &'scan self,
-        range: (Bound<&'range R::Key>, Bound<&'range R::Key>),
+        range: (
+            Bound<&'range <R::Schema as RecordSchema>::Key>,
+            Bound<&'range <R::Schema as RecordSchema>::Key>,
+        ),
         fn_pre_stream: Box<
             dyn FnOnce(Option<ProjectionMask>) -> Option<ScanStream<'scan, R>> + Send + 'scan,
         >,
@@ -127,6 +133,7 @@ mod tests {
     use crate::{
         compaction::tests::build_version,
         executor::tokio::TokioExecutor,
         fs::manager::StoreManager,
inmem::immutable::tests::TestSchema, tests::{build_db, build_schema}, version::TransactionTs, DbOption, @@ -136,9 +143,10 @@ mod tests { async fn snapshot_scan() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from( + let option = Arc::new(DbOption::from(( Path::from_filesystem_path(temp_dir.path()).unwrap(), - )); + &TestSchema, + ))); manager .base_fs() @@ -151,7 +159,7 @@ mod tests { .await .unwrap(); - let (_, version) = build_version(&option, &manager).await; + let (_, version) = build_version(&option, &manager, &Arc::new(TestSchema)).await; let (schema, compaction_rx) = build_schema(option.clone(), manager.base_fs()) .await .unwrap(); @@ -160,6 +168,7 @@ mod tests { compaction_rx, TokioExecutor::new(), schema, + Arc::new(TestSchema), version, manager, ) diff --git a/src/stream/level.rs b/src/stream/level.rs index 61b2e71e..1fb580de 100644 --- a/src/stream/level.rs +++ b/src/stream/level.rs @@ -229,17 +229,23 @@ mod tests { use tempfile::TempDir; use crate::{ - compaction::tests::build_version, fs::manager::StoreManager, record::Record, - stream::level::LevelStream, tests::Test, DbOption, + compaction::tests::build_version, + fs::manager::StoreManager, + inmem::immutable::tests::TestSchema, + record::{Record, Schema}, + stream::level::LevelStream, + tests::Test, + DbOption, }; #[tokio::test] async fn projection_scan() { let temp_dir = TempDir::new().unwrap(); let manager = StoreManager::new(FsOptions::Local, vec![]).unwrap(); - let option = Arc::new(DbOption::from( + let option = Arc::new(DbOption::from(( Path::from_filesystem_path(temp_dir.path()).unwrap(), - )); + &TestSchema {}, + ))); manager .base_fs() @@ -252,7 +258,7 @@ mod tests { .await .unwrap(); - let (_, version) = build_version(&option, &manager).await; + let (_, version) = build_version(&option, &manager, &Arc::new(TestSchema)).await; { let mut level_stream_1 = LevelStream::new( @@ -264,7 +270,7 @@ mod tests { 1_u32.into(), None, ProjectionMask::roots( - &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(), + &arrow_to_parquet_schema(TestSchema {}.arrow_schema()).unwrap(), [0, 1, 2, 3], ), manager.base_fs().clone(), @@ -301,7 +307,7 @@ mod tests { 1_u32.into(), None, ProjectionMask::roots( - &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(), + &arrow_to_parquet_schema(TestSchema {}.arrow_schema()).unwrap(), [0, 1, 2, 4], ), manager.base_fs().clone(), @@ -338,7 +344,7 @@ mod tests { 1_u32.into(), None, ProjectionMask::roots( - &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(), + &arrow_to_parquet_schema(TestSchema {}.arrow_schema()).unwrap(), [0, 1, 2], ), manager.base_fs().clone(), diff --git a/src/stream/mem_projection.rs b/src/stream/mem_projection.rs index 0397e022..334e3569 100644 --- a/src/stream/mem_projection.rs +++ b/src/stream/mem_projection.rs @@ -63,21 +63,31 @@ mod tests { use parquet::arrow::{arrow_to_parquet_schema, ProjectionMask}; use crate::{ - inmem::mutable::Mutable, record::Record, stream::mem_projection::MemProjectionStream, - tests::Test, trigger::TriggerFactory, wal::log::LogType, DbOption, + inmem::{immutable::tests::TestSchema, mutable::Mutable}, + record::{Record, Schema}, + stream::mem_projection::MemProjectionStream, + tests::Test, + trigger::TriggerFactory, + wal::log::LogType, + DbOption, }; #[tokio::test] async fn merge_mutable() { let temp_dir = tempfile::tempdir().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = 
DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &TestSchema, + )); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mutable = Mutable::::new(&option, trigger, &fs).await.unwrap(); + let mutable = Mutable::::new(&option, trigger, &fs, Arc::new(TestSchema {})) + .await + .unwrap(); mutable .insert( @@ -117,7 +127,7 @@ mod tests { .unwrap(); let mask = ProjectionMask::roots( - &arrow_to_parquet_schema(Test::arrow_schema()).unwrap(), + &arrow_to_parquet_schema(TestSchema.arrow_schema()).unwrap(), vec![0, 1, 2, 4], ); diff --git a/src/stream/merge.rs b/src/stream/merge.rs index 5750fb3a..42ea60dd 100644 --- a/src/stream/merge.rs +++ b/src/stream/merge.rs @@ -163,7 +163,11 @@ mod tests { use super::MergeStream; use crate::{ - inmem::mutable::Mutable, stream::Entry, trigger::TriggerFactory, wal::log::LogType, + inmem::{immutable::tests::TestSchema, mutable::Mutable}, + record::test::StringSchema, + stream::Entry, + trigger::TriggerFactory, + wal::log::LogType, DbOption, }; @@ -171,13 +175,18 @@ mod tests { async fn merge_mutable() { let temp_dir = tempfile::tempdir().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &StringSchema, + )); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let m1 = Mutable::::new(&option, trigger, &fs).await.unwrap(); + let m1 = Mutable::::new(&option, trigger, &fs, Arc::new(StringSchema)) + .await + .unwrap(); m1.remove(LogType::Full, "b".into(), 3.into()) .await @@ -191,7 +200,9 @@ mod tests { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let m2 = Mutable::::new(&option, trigger, &fs).await.unwrap(); + let m2 = Mutable::::new(&option, trigger, &fs, Arc::new(StringSchema)) + .await + .unwrap(); m2.insert(LogType::Full, "a".into(), 1.into()) .await .unwrap(); @@ -204,7 +215,9 @@ mod tests { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let m3 = Mutable::::new(&option, trigger, &fs).await.unwrap(); + let m3 = Mutable::::new(&option, trigger, &fs, Arc::new(StringSchema)) + .await + .unwrap(); m3.insert(LogType::Full, "e".into(), 4.into()) .await .unwrap(); @@ -265,13 +278,18 @@ mod tests { async fn merge_mutable_remove_duplicates() { let temp_dir = tempfile::tempdir().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &StringSchema, + )); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let m1 = Mutable::::new(&option, trigger, &fs).await.unwrap(); + let m1 = Mutable::::new(&option, trigger, &fs, Arc::new(StringSchema)) + .await + .unwrap(); m1.insert(LogType::Full, "1".into(), 0_u32.into()) .await .unwrap(); @@ -351,13 +369,18 @@ mod tests { async fn merge_mutable_limit() { let temp_dir = tempfile::tempdir().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &StringSchema, 
+ )); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let m1 = Mutable::::new(&option, trigger, &fs).await.unwrap(); + let m1 = Mutable::::new(&option, trigger, &fs, Arc::new(StringSchema)) + .await + .unwrap(); m1.insert(LogType::Full, "1".into(), 0_u32.into()) .await .unwrap(); diff --git a/src/stream/mod.rs b/src/stream/mod.rs index fa0b5afe..375f7919 100644 --- a/src/stream/mod.rs +++ b/src/stream/mod.rs @@ -21,7 +21,7 @@ use record_batch::RecordBatchEntry; use crate::{ inmem::{immutable::ImmutableScan, mutable::MutableScan}, ondisk::scan::SsTableScan, - record::{Key, Record, RecordRef}, + record::{Key, Record, RecordRef, Schema}, stream::{level::LevelStream, mem_projection::MemProjectionStream}, timestamp::Timestamped, transaction::TransactionScan, @@ -31,8 +31,15 @@ pub enum Entry<'entry, R> where R: Record, { - Transaction((Timestamped<::Ref<'entry>>, &'entry Option)), - Mutable(crossbeam_skiplist::map::Entry<'entry, Timestamped, Option>), + Transaction( + ( + Timestamped<<::Key as Key>::Ref<'entry>>, + &'entry Option, + ), + ), + Mutable( + crossbeam_skiplist::map::Entry<'entry, Timestamped<::Key>, Option>, + ), Projection((Box>, Arc)), RecordBatch(RecordBatchEntry), } @@ -41,14 +48,14 @@ impl Entry<'_, R> where R: Record, { - pub(crate) fn key(&self) -> Timestamped<::Ref<'_>> { + pub(crate) fn key(&self) -> Timestamped<<::Key as Key>::Ref<'_>> { match self { Entry::Transaction((key, _)) => { // Safety: shorter lifetime must be safe unsafe { transmute::< - Timestamped<<::Key as Key>::Ref<'_>>, - Timestamped<<::Key as Key>::Ref<'_>>, + Timestamped<<::Key as Key>::Ref<'_>>, + Timestamped<<::Key as Key>::Ref<'_>>, >(key.clone()) } } @@ -77,7 +84,7 @@ where impl fmt::Debug for Entry<'_, R> where R: Record + Debug, - R::Key: Debug, + ::Key: Debug, { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match self { diff --git a/src/stream/package.rs b/src/stream/package.rs index 7e23b1b0..2dec0e0e 100644 --- a/src/stream/package.rs +++ b/src/stream/package.rs @@ -1,14 +1,16 @@ use std::{ pin::Pin, + sync::Arc, task::{Context, Poll}, }; +use arrow::datatypes::Schema as ArrowSchema; use futures_core::Stream; use pin_project_lite::pin_project; use crate::{ inmem::immutable::{ArrowArrays, Builder}, - record::{Record, RecordInstance}, + record::{Record, Schema}, stream::merge::MergeStream, }; @@ -20,7 +22,7 @@ pin_project! 
{ row_count: usize, batch_size: usize, inner: MergeStream<'package, R>, - builder: ::Builder, + builder: <::Columns as ArrowArrays>::Builder, projection_indices: Option>, } } @@ -33,13 +35,13 @@ where batch_size: usize, merge: MergeStream<'package, R>, projection_indices: Option>, - instance: &RecordInstance, + schema: Arc, ) -> Self { Self { row_count: 0, batch_size, inner: merge, - builder: R::Columns::builder(&instance.arrow_schema::(), batch_size), + builder: ::Columns::builder(schema, batch_size), projection_indices, } } @@ -49,7 +51,7 @@ impl<'package, R> Stream for PackageStream<'package, R> where R: Record, { - type Item = Result; + type Item = Result<::Columns, parquet::errors::ParquetError>; fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let mut project = self.project(); @@ -88,10 +90,13 @@ mod tests { use crate::{ inmem::{ - immutable::{tests::TestImmutableArrays, ArrowArrays}, + immutable::{ + tests::{TestImmutableArrays, TestSchema}, + ArrowArrays, + }, mutable::Mutable, }, - record::Record, + record::{Record, Schema}, stream::{merge::MergeStream, package::PackageStream}, tests::Test, trigger::TriggerFactory, @@ -103,13 +108,18 @@ mod tests { async fn iter() { let temp_dir = TempDir::new().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &TestSchema, + )); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let m1 = Mutable::::new(&option, trigger, &fs).await.unwrap(); + let m1 = Mutable::::new(&option, trigger, &fs, Arc::new(TestSchema {})) + .await + .unwrap(); m1.insert( LogType::Full, Test { @@ -191,7 +201,7 @@ mod tests { row_count: 0, batch_size: 8192, inner: merge, - builder: TestImmutableArrays::builder(Test::arrow_schema(), 8192), + builder: TestImmutableArrays::builder(TestSchema {}.arrow_schema().clone(), 8192), projection_indices: Some(projection_indices.clone()), }; @@ -199,7 +209,12 @@ mod tests { assert_eq!( arrays.as_record_batch(), &RecordBatch::try_new( - Arc::new(Test::arrow_schema().project(&projection_indices).unwrap(),), + Arc::new( + TestSchema {} + .arrow_schema() + .project(&projection_indices) + .unwrap(), + ), vec![ Arc::new(BooleanArray::from(vec![ false, false, false, false, false, false diff --git a/src/stream/record_batch.rs b/src/stream/record_batch.rs index c974f8ee..edbeec84 100644 --- a/src/stream/record_batch.rs +++ b/src/stream/record_batch.rs @@ -9,7 +9,7 @@ use arrow::{array::RecordBatch, datatypes::Schema}; use parquet::arrow::ProjectionMask; use crate::{ - record::{internal::InternalRecordRef, Key, Record, RecordRef}, + record::{internal::InternalRecordRef, Key, Record, RecordRef, Schema as RecordSchema}, timestamp::Timestamped, }; @@ -35,11 +35,13 @@ where } } - pub(crate) fn internal_key(&self) -> Timestamped<::Ref<'_>> { + pub(crate) fn internal_key( + &self, + ) -> Timestamped<<::Key as Key>::Ref<'_>> { self.record_ref.value() } - pub fn key(&self) -> ::Ref<'_> { + pub fn key(&self) -> <::Key as Key>::Ref<'_> { self.record_ref.value().value().clone() } diff --git a/src/transaction.rs b/src/transaction.rs index aa2df175..997c6443 100644 --- a/src/transaction.rs +++ b/src/transaction.rs @@ -14,7 +14,7 @@ use thiserror::Error; use crate::{ compaction::CompactTask, - record::{Key, KeyRef}, + record::{Key, KeyRef, Schema as RecordSchema}, snapshot::Snapshot, stream, 
stream::mem_projection::MemProjectionStream, @@ -24,7 +24,7 @@ use crate::{ }; pub(crate) struct TransactionScan<'scan, R: Record> { - inner: Range<'scan, R::Key, Option>, + inner: Range<'scan, ::Key, Option>, ts: Timestamp, } @@ -32,7 +32,10 @@ impl<'scan, R> Iterator for TransactionScan<'scan, R> where R: Record, { - type Item = (Timestamped<::Ref<'scan>>, &'scan Option); + type Item = ( + Timestamped<<::Key as Key>::Ref<'scan>>, + &'scan Option, + ); fn next(&mut self) -> Option { self.inner @@ -46,16 +49,19 @@ pub struct Transaction<'txn, R> where R: Record, { - local: BTreeMap>, + local: BTreeMap<::Key, Option>, snapshot: Snapshot<'txn, R>, - lock_map: LockMap, + lock_map: LockMap<::Key>, } impl<'txn, R> Transaction<'txn, R> where R: Record + Send, { - pub(crate) fn new(snapshot: Snapshot<'txn, R>, lock_map: LockMap) -> Self { + pub(crate) fn new( + snapshot: Snapshot<'txn, R>, + lock_map: LockMap<::Key>, + ) -> Self { Self { local: BTreeMap::new(), snapshot, @@ -67,7 +73,7 @@ where /// [`Projection`] pub async fn get<'get>( &'get self, - key: &'get R::Key, + key: &'get ::Key, projection: Projection, ) -> Result>, DbError> { Ok(match self.local.get(key).and_then(|v| v.as_ref()) { @@ -83,7 +89,10 @@ where /// scan records with primary keys in the `range` pub fn scan<'scan, 'range>( &'scan self, - range: (Bound<&'range R::Key>, Bound<&'range R::Key>), + range: ( + Bound<&'range ::Key>, + Bound<&'range ::Key>, + ), ) -> Scan<'scan, 'range, R> { let ts = self.snapshot.ts(); let inner = self.local.range(range); @@ -105,11 +114,11 @@ where } /// delete the record with the primary key as the `key` on this transaction - pub fn remove(&mut self, key: R::Key) { + pub fn remove(&mut self, key: ::Key) { self.entry(key, None) } - fn entry(&mut self, key: R::Key, value: Option) { + fn entry(&mut self, key: ::Key, value: Option) { match self.local.entry(key) { Entry::Vacant(v) => { v.insert(value); @@ -179,7 +188,7 @@ where async fn append( schema: &Schema, log_ty: LogType, - key: ::Key, + key: ::Key, record: Option, new_ts: Timestamp, ) -> Result> { @@ -225,7 +234,7 @@ where #[error("transaction database error {:?}", .0)] Database(#[from] DbError), #[error("transaction write conflict: {:?}", .0)] - WriteConflict(R::Key), + WriteConflict(::Key), #[error("Failed to send compact task")] SendCompactTaskError(#[from] SendError), #[error("Channel is closed")] @@ -245,8 +254,10 @@ mod tests { compaction::tests::build_version, executor::tokio::TokioExecutor, fs::manager::StoreManager, + inmem::immutable::tests::TestSchema, record::{ - runtime::{Datatype, DynRecord, Value}, + runtime::{test::test_dyn_item_schema, Datatype, DynRecord, Value}, + test::StringSchema, ValueDesc, }, tests::{build_db, build_schema, Test}, @@ -260,8 +271,12 @@ mod tests { let temp_dir = TempDir::new().unwrap(); let db = DB::::new( - DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()), + DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &StringSchema, + )), TokioExecutor::new(), + StringSchema, ) .await .unwrap(); @@ -295,9 +310,10 @@ mod tests { async fn transaction_get() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from( + let option = Arc::new(DbOption::from(( Path::from_filesystem_path(temp_dir.path()).unwrap(), - )); + &TestSchema, + ))); manager .base_fs() @@ -310,7 +326,7 @@ mod tests { .await .unwrap(); - let (_, version) = build_version(&option, &manager).await; + let (_, 
version) = build_version(&option, &manager, &Arc::new(TestSchema)).await; let (schema, compaction_rx) = build_schema(option.clone(), manager.base_fs()) .await .unwrap(); @@ -319,6 +335,7 @@ mod tests { compaction_rx, TokioExecutor::new(), schema, + Arc::new(TestSchema), version, manager, ) @@ -385,9 +402,12 @@ mod tests { #[tokio::test] async fn write_conflicts() { let temp_dir = TempDir::new().unwrap(); - let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &StringSchema, + )); - let db = DB::::new(option, TokioExecutor::new()) + let db = DB::::new(option, TokioExecutor::new(), StringSchema) .await .unwrap(); @@ -418,9 +438,12 @@ mod tests { #[tokio::test] async fn transaction_projection() { let temp_dir = TempDir::new().unwrap(); - let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &TestSchema, + )); - let db = DB::::new(option, TokioExecutor::new()) + let db = DB::::new(option, TokioExecutor::new(), TestSchema) .await .unwrap(); @@ -456,9 +479,10 @@ mod tests { async fn transaction_scan() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from( + let option = Arc::new(DbOption::from(( Path::from_filesystem_path(temp_dir.path()).unwrap(), - )); + &TestSchema, + ))); manager .base_fs() @@ -471,7 +495,7 @@ mod tests { .await .unwrap(); - let (_, version) = build_version(&option, &manager).await; + let (_, version) = build_version(&option, &manager, &Arc::new(TestSchema)).await; let (schema, compaction_rx) = build_schema(option.clone(), manager.base_fs()) .await .unwrap(); @@ -480,6 +504,7 @@ mod tests { compaction_rx, TokioExecutor::new(), schema, + Arc::new(TestSchema), version, manager, ) @@ -551,9 +576,10 @@ mod tests { async fn test_transaction_scan_bound() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from( + let option = Arc::new(DbOption::from(( Path::from_filesystem_path(temp_dir.path()).unwrap(), - )); + &TestSchema, + ))); manager .base_fs() @@ -566,7 +592,7 @@ mod tests { .await .unwrap(); - let (_, version) = build_version(&option, &manager).await; + let (_, version) = build_version(&option, &manager, &Arc::new(TestSchema)).await; let (schema, compaction_rx) = build_schema(option.clone(), manager.base_fs()) .await .unwrap(); @@ -575,6 +601,7 @@ mod tests { compaction_rx, TokioExecutor::new(), schema, + Arc::new(TestSchema), version, manager, ) @@ -727,9 +754,10 @@ mod tests { async fn test_transaction_scan_limit() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from( + let option = Arc::new(DbOption::from(( Path::from_filesystem_path(temp_dir.path()).unwrap(), - )); + &TestSchema, + ))); manager .base_fs() @@ -742,7 +770,7 @@ mod tests { .await .unwrap(); - let (_, version) = build_version(&option, &manager).await; + let (_, version) = build_version(&option, &manager, &Arc::new(TestSchema)).await; let (schema, compaction_rx) = build_schema(option.clone(), manager.base_fs()) .await .unwrap(); @@ -751,6 +779,7 @@ mod tests { compaction_rx, TokioExecutor::new(), schema, + Arc::new(TestSchema), version, manager, ) @@ -800,7 +829,7 @@ 
mod tests { "age".to_string(), 0, ); - let db = DB::with_schema(option, TokioExecutor::default(), descs, 0) + let db = DB::with_schema(option, TokioExecutor::default(), test_dyn_item_schema()) .await .unwrap(); diff --git a/src/version/cleaner.rs b/src/version/cleaner.rs index 19a00a7f..a0579d8a 100644 --- a/src/version/cleaner.rs +++ b/src/version/cleaner.rs @@ -107,6 +107,7 @@ pub(crate) mod tests { use crate::{ executor::{tokio::TokioExecutor, Executor}, fs::{manager::StoreManager, FileId, FileType}, + inmem::immutable::tests::TestSchema, tests::Test, version::cleaner::{CleanTag, Cleaner}, DbOption, @@ -116,9 +117,10 @@ pub(crate) mod tests { async fn test_cleaner() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from( + let option = Arc::new(DbOption::from(( Path::from_filesystem_path(temp_dir.path()).unwrap(), - )); + &TestSchema, + ))); let gen_0 = FileId::new(); let gen_1 = FileId::new(); diff --git a/src/version/set.rs b/src/version/set.rs index e4050d47..050ebfac 100644 --- a/src/version/set.rs +++ b/src/version/set.rs @@ -297,7 +297,7 @@ pub(crate) mod tests { use crate::{ fs::{manager::StoreManager, FileId, FileType}, - record::Record, + record::{test::StringSchema, Record}, scope::Scope, version::{ cleaner::CleanTag, @@ -344,9 +344,10 @@ pub(crate) mod tests { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); let (sender, _) = bounded(1); - let option = Arc::new(DbOption::from( + let option = Arc::new(DbOption::from(( Path::from_filesystem_path(temp_dir.path()).unwrap(), - )); + &StringSchema, + ))); manager .base_fs() .create_dir_all(&option.version_log_dir_path()) @@ -381,7 +382,10 @@ pub(crate) mod tests { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); let (sender, _) = bounded(1); - let mut option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let mut option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &StringSchema, + )); option.version_log_snapshot_threshold = 4; let option = Arc::new(option); @@ -508,9 +512,10 @@ pub(crate) mod tests { async fn version_level_sort() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from( + let option = Arc::new(DbOption::from(( Path::from_filesystem_path(temp_dir.path()).unwrap(), - )); + &StringSchema, + ))); let (sender, _) = bounded(1); manager diff --git a/tests/data_integrity.rs b/tests/data_integrity.rs index b4a1b13a..87d8b291 100644 --- a/tests/data_integrity.rs +++ b/tests/data_integrity.rs @@ -70,9 +70,14 @@ mod tests { let mut write_hasher = crc32fast::Hasher::new(); let temp_dir = TempDir::new().unwrap(); - let option = DbOption::from(Path::from_filesystem_path(temp_dir.path()).unwrap()); + let option = DbOption::from(( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &CustomerSchema, + )); - let db: DB = DB::new(option, TokioExecutor::new()).await.unwrap(); + let db: DB = DB::new(option, TokioExecutor::new(), CustomerSchema) + .await + .unwrap(); for _ in 0..WRITE_TIMES { let customer = gen_record(&mut rng, &mut primary_key_count); diff --git a/tests/macros_correctness.rs b/tests/macros_correctness.rs index 73360ed3..42b408f7 100644 --- a/tests/macros_correctness.rs +++ b/tests/macros_correctness.rs @@ -1,3 +1,4 @@ 
+use tonbo::record::Schema; use tonbo_macros::Record; #[derive(Record, Debug, PartialEq)] @@ -21,12 +22,12 @@ mod tests { use tokio::io::AsyncSeekExt; use tonbo::{ inmem::immutable::{ArrowArrays, Builder}, - record::{Record, RecordRef}, + record::{Record, RecordRef, Schema}, serdes::{Decode, Encode}, timestamp::timestamped::Timestamped, }; - use crate::{User, UserImmutableArrays, UserRef}; + use crate::{User, UserImmutableArrays, UserRef, UserSchema}; #[tokio::test] async fn test_record_info() { @@ -38,9 +39,9 @@ mod tests { assert_eq!(user.key(), "cat"); assert_eq!(user.size(), 20); - assert_eq!(User::primary_key_index(), 4); + assert_eq!(UserSchema {}.primary_key_index(), 4); assert_eq!( - User::primary_key_path(), + UserSchema {}.primary_key_path(), ( ColumnPath::new(vec!["_ts".to_string(), "name".to_string()]), vec![ @@ -62,7 +63,7 @@ mod tests { let mut user_ref = user.as_record_ref(); user_ref.projection(&ProjectionMask::roots( - &arrow_to_parquet_schema(User::arrow_schema()).unwrap(), + &arrow_to_parquet_schema(UserSchema {}.arrow_schema()).unwrap(), vec![2, 3], )); @@ -74,7 +75,7 @@ mod tests { let mut user_ref = user.as_record_ref(); user_ref.projection(&ProjectionMask::roots( - &arrow_to_parquet_schema(User::arrow_schema()).unwrap(), + &arrow_to_parquet_schema(UserSchema {}.arrow_schema()).unwrap(), vec![], )); @@ -86,7 +87,7 @@ mod tests { let mut user_ref = user.as_record_ref(); user_ref.projection(&ProjectionMask::roots( - &arrow_to_parquet_schema(User::arrow_schema()).unwrap(), + &arrow_to_parquet_schema(UserSchema {}.arrow_schema()).unwrap(), vec![2], )); @@ -98,7 +99,7 @@ mod tests { let mut user_ref = user.as_record_ref(); user_ref.projection(&ProjectionMask::roots( - &arrow_to_parquet_schema(User::arrow_schema()).unwrap(), + &arrow_to_parquet_schema(UserSchema {}.arrow_schema()).unwrap(), vec![3], )); @@ -112,7 +113,12 @@ mod tests { async fn test_record_from_record_batch() { { let record_batch = RecordBatch::try_new( - Arc::new(User::arrow_schema().project(&[0, 1, 2, 3, 4]).unwrap()), + Arc::new( + UserSchema {} + .arrow_schema() + .project(&[0, 1, 2, 3, 4]) + .unwrap(), + ), vec![ Arc::new(BooleanArray::from(vec![false])), Arc::new(UInt32Array::from(vec![9])), @@ -124,11 +130,15 @@ mod tests { .unwrap(); let project_mask = ProjectionMask::roots( - &arrow_to_parquet_schema(User::arrow_schema()).unwrap(), + &arrow_to_parquet_schema(UserSchema {}.arrow_schema()).unwrap(), vec![0, 1, 2, 3, 4], ); - let record_ref = - UserRef::from_record_batch(&record_batch, 0, &project_mask, User::arrow_schema()); + let record_ref = UserRef::from_record_batch( + &record_batch, + 0, + &project_mask, + UserSchema {}.arrow_schema(), + ); assert_eq!( record_ref.value(), Timestamped { @@ -146,7 +156,7 @@ mod tests { } { let record_batch = RecordBatch::try_new( - Arc::new(User::arrow_schema().project(&[0, 1, 3, 4]).unwrap()), + Arc::new(UserSchema {}.arrow_schema().project(&[0, 1, 3, 4]).unwrap()), vec![ Arc::new(BooleanArray::from(vec![false])), Arc::new(UInt32Array::from(vec![9])), @@ -157,11 +167,15 @@ mod tests { .unwrap(); let project_mask = ProjectionMask::roots( - &arrow_to_parquet_schema(User::arrow_schema()).unwrap(), + &arrow_to_parquet_schema(UserSchema {}.arrow_schema()).unwrap(), vec![0, 1, 3, 4], ); - let record_ref = - UserRef::from_record_batch(&record_batch, 0, &project_mask, User::arrow_schema()); + let record_ref = UserRef::from_record_batch( + &record_batch, + 0, + &project_mask, + UserSchema {}.arrow_schema(), + ); assert_eq!( record_ref.value(), Timestamped { @@ -200,7 
+214,7 @@ mod tests { #[tokio::test] async fn test_record_arrays() { - let mut builder = UserImmutableArrays::builder(User::arrow_schema(), 10); + let mut builder = UserImmutableArrays::builder(UserSchema {}.arrow_schema().clone(), 10); let cat = User { email: Some("cat@example.com".to_string()), @@ -242,7 +256,12 @@ mod tests { assert_eq!( arrays.as_record_batch(), &RecordBatch::try_new( - Arc::new(User::arrow_schema().project(&[0, 1, 2, 3, 4]).unwrap(),), + Arc::new( + UserSchema {} + .arrow_schema() + .project(&[0, 1, 2, 3, 4]) + .unwrap(), + ), vec![ Arc::new(BooleanArray::from(vec![false, false, true])), Arc::new(UInt32Array::from(vec![0, 1, 2])), @@ -261,7 +280,7 @@ mod tests { #[tokio::test] async fn test_record_arrays_projection() { - let mut builder = UserImmutableArrays::builder(User::arrow_schema(), 10); + let mut builder = UserImmutableArrays::builder(UserSchema {}.arrow_schema().clone(), 10); let cat = User { email: Some("cat@example.com".to_string()), @@ -303,7 +322,7 @@ mod tests { assert_eq!( arrays.as_record_batch(), &RecordBatch::try_new( - Arc::new(User::arrow_schema().project(&[0, 1, 3, 4]).unwrap(),), + Arc::new(UserSchema {}.arrow_schema().project(&[0, 1, 3, 4]).unwrap(),), vec![ Arc::new(BooleanArray::from(vec![false, false, true])), Arc::new(UInt32Array::from(vec![0, 1, 2])), diff --git a/tonbo_macros/src/record.rs b/tonbo_macros/src/record.rs index 233d6adc..66937650 100644 --- a/tonbo_macros/src/record.rs +++ b/tonbo_macros/src/record.rs @@ -107,16 +107,16 @@ pub(crate) fn handle(ast: DeriveInput) -> Result { let builder_append_primary_key = &primary_key_definitions.builder_append_value; - let record_codegen = trait_record_codegen( - &data_struct.fields, - struct_name, - primary_key_definitions.clone(), - ); + let record_codegen = + trait_record_codegen(&data_struct.fields, struct_name, &primary_key_definitions); let decode_codegen = trait_decode_codegen(struct_name, &data_struct.fields); let struct_ref_codegen = struct_ref_codegen(struct_name, &data_struct.fields); + let struct_schema_codegen = + struct_schema_codegen(struct_name, &data_struct.fields, &primary_key_definitions); + let decode_ref_codegen = trait_decode_ref_codegen(&struct_name, primary_key_ident, &data_struct.fields); @@ -138,6 +138,8 @@ pub(crate) fn handle(ast: DeriveInput) -> Result { #struct_ref_codegen + #struct_schema_codegen + #decode_ref_codegen #encode_codegen @@ -156,12 +158,11 @@ pub(crate) fn handle(ast: DeriveInput) -> Result { fn trait_record_codegen( fields: &[RecordStructFieldOpt], struct_name: &Ident, - primary_key: PrimaryKey, + primary_key: &PrimaryKey, ) -> TokenStream { let mut size_fields: Vec = Vec::new(); let mut to_ref_init_fields: Vec = Vec::new(); - let mut schema_fields: Vec = Vec::new(); for field in fields.iter() { let field_name = field.ident.as_ref().unwrap(); @@ -170,13 +171,8 @@ fn trait_record_codegen( let is_string = matches!(data_type, DataType::String); let is_bytes = matches!(data_type, DataType::Bytes); - let mapped_type = data_type.to_mapped_type(); let size_field = data_type.to_size_field(field_name, is_nullable); - schema_fields.push(quote! { - ::tonbo::arrow::datatypes::Field::new(stringify!(#field_name), #mapped_type, #is_nullable), - }); - size_fields.push(quote! 
{ + #size_field }); @@ -205,60 +201,32 @@ fn trait_record_codegen( } } - let struct_arrays_name = struct_name.to_immutable_array_ident(); let struct_ref_name = struct_name.to_ref_ident(); + let struct_schema_name = struct_name.to_schema_ident(); let PrimaryKey { - name: primary_key_name, - base_ty: primary_key_ty, fn_key: fn_primary_key, - builder_append_value: _builder_append_primary_key, - index: primary_key_index, + .. } = primary_key; quote! { impl ::tonbo::record::Record for #struct_name { - type Columns = #struct_arrays_name; - - type Key = #primary_key_ty; + type Schema = #struct_schema_name; type Ref<'r> = #struct_ref_name<'r> where Self: 'r; - fn key(&self) -> <::Key as ::tonbo::record::Key>::Ref<'_> { + fn key(&self) -> <::Key as ::tonbo::record::Key>::Ref<'_> { #fn_primary_key } - fn primary_key_index() -> usize { - #primary_key_index - } - - fn primary_key_path() -> (::tonbo::parquet::schema::types::ColumnPath, Vec<::tonbo::parquet::format::SortingColumn>) { - ( - ::tonbo::parquet::schema::types::ColumnPath::new(vec!["_ts".to_string(), stringify!(#primary_key_name).to_string()]), - vec![::tonbo::parquet::format::SortingColumn::new(1_i32, true, true), ::tonbo::parquet::format::SortingColumn::new(#primary_key_index as i32, false, true)] - ) - } - fn as_record_ref(&self) -> Self::Ref<'_> { #struct_ref_name { #(#to_ref_init_fields)* } } - fn arrow_schema() -> &'static ::std::sync::Arc<::tonbo::arrow::datatypes::Schema> { - static SCHEMA: ::tonbo::once_cell::sync::Lazy<::std::sync::Arc<::tonbo::arrow::datatypes::Schema>> = ::tonbo::once_cell::sync::Lazy::new(|| { - ::std::sync::Arc::new(::tonbo::arrow::datatypes::Schema::new(vec![ - ::tonbo::arrow::datatypes::Field::new("_null", ::tonbo::arrow::datatypes::DataType::Boolean, false), - ::tonbo::arrow::datatypes::Field::new("_ts", ::tonbo::arrow::datatypes::DataType::UInt32, false), - #(#schema_fields)* - ])) - }); - - &SCHEMA - } - fn size(&self) -> usize { 0 #(#size_fields)* } @@ -361,6 +329,71 @@ fn struct_ref_codegen(struct_name: &Ident, fields: &[RecordStructFieldOpt]) -> T } } +fn struct_schema_codegen( + struct_name: &Ident, + fields: &[RecordStructFieldOpt], + primary_key: &PrimaryKey, +) -> TokenStream { + let struct_schema_name = struct_name.to_schema_ident(); + let struct_arrays_name = struct_name.to_immutable_array_ident(); + let mut schema_fields: Vec = Vec::new(); + + let PrimaryKey { + name: primary_key_name, + base_ty: primary_key_ty, + builder_append_value: _builder_append_primary_key, + index: primary_key_index, + .. + } = primary_key; + + for field in fields.iter() { + let field_name = field.ident.as_ref().unwrap(); + + let (data_type, is_nullable) = field.to_data_type().expect("unreachable code"); + let mapped_type = data_type.to_mapped_type(); + + schema_fields.push(quote! { + ::tonbo::arrow::datatypes::Field::new(stringify!(#field_name), #mapped_type, #is_nullable), + }); + } + + quote! 
{ + #[derive(Debug, PartialEq, Eq, Clone, Copy)] + pub struct #struct_schema_name; + + impl ::tonbo::record::Schema for #struct_schema_name { + type Record = #struct_name; + + type Columns = #struct_arrays_name; + + type Key = #primary_key_ty; + + fn primary_key_index(&self) -> usize { + #primary_key_index + } + + fn primary_key_path(&self) -> (::tonbo::parquet::schema::types::ColumnPath, Vec<::tonbo::parquet::format::SortingColumn>) { + ( + ::tonbo::parquet::schema::types::ColumnPath::new(vec!["_ts".to_string(), stringify!(#primary_key_name).to_string()]), + vec![::tonbo::parquet::format::SortingColumn::new(1_i32, true, true), ::tonbo::parquet::format::SortingColumn::new(#primary_key_index as i32, false, true)] + ) + } + + fn arrow_schema(&self) -> &'static ::std::sync::Arc<::tonbo::arrow::datatypes::Schema> { + static SCHEMA: ::tonbo::once_cell::sync::Lazy<::std::sync::Arc<::tonbo::arrow::datatypes::Schema>> = ::tonbo::once_cell::sync::Lazy::new(|| { + ::std::sync::Arc::new(::tonbo::arrow::datatypes::Schema::new(vec![ + ::tonbo::arrow::datatypes::Field::new("_null", ::tonbo::arrow::datatypes::DataType::Boolean, false), + ::tonbo::arrow::datatypes::Field::new("_ts", ::tonbo::arrow::datatypes::DataType::UInt32, false), + #(#schema_fields)* + ])) + }); + + &SCHEMA + } + } + } +} + fn trait_decode_ref_codegen( struct_name: &&Ident, primary_key_name: &Ident, @@ -437,7 +470,7 @@ fn trait_decode_ref_codegen( impl<'r> ::tonbo::record::RecordRef<'r> for #struct_ref_name<'r> { type Record = #struct_name; - fn key(self) -> <::Key as ::tonbo::record::Key>::Ref<'r> { + fn key(self) -> <<<<#struct_ref_name<'r> as ::tonbo::record::RecordRef<'r>>::Record as ::tonbo::record::Record>::Schema as ::tonbo::record::Schema>::Key as ::tonbo::record::Key>::Ref<'r> { self.#primary_key_name } @@ -599,7 +632,7 @@ fn trait_arrow_array_codegen( type Builder = #struct_builder_name; - fn builder(schema: &::std::sync::Arc<::tonbo::arrow::datatypes::Schema>, capacity: usize) -> Self::Builder { + fn builder(schema: ::std::sync::Arc<::tonbo::arrow::datatypes::Schema>, capacity: usize) -> Self::Builder { #struct_builder_name { #(#builder_init_fields)* @@ -642,6 +675,7 @@ fn struct_builder_codegen( fields: &[RecordStructFieldOpt], ) -> TokenStream { + let struct_schema_name = struct_name.to_schema_ident(); let struct_builder_name = struct_name.to_builder_ident(); let mut field_names: Vec = Vec::new(); @@ -723,7 +757,7 @@ fn struct_builder_codegen( } impl ::tonbo::inmem::immutable::Builder<#struct_arrays_name> for #struct_builder_name { - fn push(&mut self, key: ::tonbo::timestamp::timestamped::Timestamped<<<#struct_name as ::tonbo::record::Record>::Key as ::tonbo::record::Key>::Ref<'_>>, row: Option<#struct_ref_name>) { + fn push(&mut self, key: ::tonbo::timestamp::timestamped::Timestamped<<<<#struct_name as ::tonbo::record::Record>::Schema as ::tonbo::record::Schema>::Key as ::tonbo::record::Key>::Ref<'_>>, row: Option<#struct_ref_name>) { #builder_append_primary_key match row { Some(row) => { @@ -750,10 +784,10 @@ fn struct_builder_codegen( let _null = ::std::sync::Arc::new(::tonbo::arrow::array::BooleanArray::new(self._null.finish(), None)); let _ts = ::std::sync::Arc::new(self._ts.finish()); + let schema = #struct_schema_name {}; + let mut record_batch = ::tonbo::arrow::record_batch::RecordBatch::try_new( - ::std::sync::Arc::clone( - <<#struct_arrays_name as ::tonbo::inmem::immutable::ArrowArrays>::Record as ::tonbo::record::Record>::arrow_schema(), - ), + 
::std::sync::Arc::clone(::tonbo::record::Schema::arrow_schema(&schema)), vec![ ::std::sync::Arc::clone(&_null) as ::std::sync::Arc, ::std::sync::Arc::clone(&_ts) as ::std::sync::Arc, diff --git a/tonbo_macros/src/utils/ident_generator.rs b/tonbo_macros/src/utils/ident_generator.rs index 8c9743fc..6b24b186 100644 --- a/tonbo_macros/src/utils/ident_generator.rs +++ b/tonbo_macros/src/utils/ident_generator.rs @@ -3,6 +3,8 @@ use syn::Ident; pub(crate) trait IdentGenerator { fn to_ref_ident(&self) -> Ident; + fn to_schema_ident(&self) -> Ident; + fn to_builder_ident(&self) -> Ident; fn to_array_ident(&self) -> Ident; @@ -15,6 +17,10 @@ impl IdentGenerator for proc_macro2::Ident { Ident::new(&format!("{}Ref", self), self.span()) } + fn to_schema_ident(&self) -> Ident { + Ident::new(&format!("{}Schema", self), self.span()) + } + fn to_builder_ident(&self) -> Ident { Ident::new(&format!("{}Builder", self), self.span()) } From da9865157be1bac1b601c3f335a8d326030dac6c Mon Sep 17 00:00:00 2001 From: Kould Date: Wed, 27 Nov 2024 18:19:48 +0800 Subject: [PATCH 3/9] chore: fix wasm & python ci --- bindings/python/src/column.rs | 10 ++-- bindings/python/src/db.rs | 12 ++--- bindings/python/src/range.rs | 4 +- bindings/python/src/record_batch.rs | 8 ++-- bindings/python/src/transaction.rs | 12 ++--- bindings/python/src/utils.rs | 12 ++--- tests/wasm.rs | 74 ++++++++++++++--------------- 7 files changed, 64 insertions(+), 68 deletions(-) diff --git a/bindings/python/src/column.rs b/bindings/python/src/column.rs index 2f853bef..7ff2f245 100644 --- a/bindings/python/src/column.rs +++ b/bindings/python/src/column.rs @@ -5,7 +5,7 @@ use std::{ }; use pyo3::{pyclass, pymethods}; -use tonbo::record::{ColumnDesc, Datatype}; +use tonbo::record::{Datatype, Value, ValueDesc}; use crate::datatype::DataType; @@ -58,15 +58,15 @@ impl Display for Column { } } -impl From for ColumnDesc { +impl From for ValueDesc { fn from(col: Column) -> Self { let datatype = Datatype::from(col.datatype); - ColumnDesc::new(col.name, datatype, col.nullable) + ValueDesc::new(col.name, datatype, col.nullable) } } -impl From for tonbo::record::Column { +impl From for Value { fn from(col: Column) -> Self { let datatype = Datatype::from(col.datatype); - tonbo::record::Column::new(datatype, col.name, col.value, col.nullable) + Value::new(datatype, col.name, col.value, col.nullable) } } diff --git a/bindings/python/src/db.rs b/bindings/python/src/db.rs index 840ff728..da4dabe0 100644 --- a/bindings/python/src/db.rs +++ b/bindings/python/src/db.rs @@ -9,10 +9,10 @@ use pyo3::{ use pyo3_asyncio::tokio::{future_into_py, get_runtime}; use tonbo::{ executor::tokio::TokioExecutor, - record::{ColumnDesc, DynRecord}, + record::DynRecord, DB, }; - +use tonbo::record::{DynSchema, Value, ValueDesc}; use crate::{ column::Column, error::{CommitError, DbError}, @@ -54,17 +54,17 @@ impl TonboDB { primary_key_name = Some(col.name.clone()); } cols.push(col.clone()); - desc.push(ColumnDesc::from(col)); + desc.push(ValueDesc::from(col)); } } + let schema = DynSchema::new(desc, primary_key_index.unwrap()); let option = option.into_option(primary_key_index.unwrap(), primary_key_name.unwrap()); let db = get_runtime() .block_on(async { DB::with_schema( option, TokioExecutor::new(), - desc, - primary_key_index.unwrap(), + schema, ) .await }) @@ -87,7 +87,7 @@ impl TonboDB { for i in 0..values.len()? 
{ let value = values.get_item(i)?; if let Ok(bound_col) = value.downcast::() { - let col = tonbo::record::Column::from(bound_col.extract::()?); + let col = Value::from(bound_col.extract::()?); cols.push(col); } } diff --git a/bindings/python/src/range.rs b/bindings/python/src/range.rs index 6ac0904b..84dcd589 100644 --- a/bindings/python/src/range.rs +++ b/bindings/python/src/range.rs @@ -1,7 +1,7 @@ use std::ops; use pyo3::{pyclass, FromPyObject, Py, PyAny, Python}; - +use tonbo::record::Value; use crate::{utils::to_col, Column}; #[pyclass] @@ -12,7 +12,7 @@ pub enum Bound { } impl Bound { - pub(crate) fn to_bound(&self, py: Python, col: &Column) -> ops::Bound { + pub(crate) fn to_bound(&self, py: Python, col: &Column) -> ops::Bound { match self { Bound::Included { key } => ops::Bound::Included(to_col(py, col, key.clone_ref(py))), Bound::Excluded { key } => ops::Bound::Excluded(to_col(py, col, key.clone_ref(py))), diff --git a/bindings/python/src/record_batch.rs b/bindings/python/src/record_batch.rs index 0064e445..7777ab07 100644 --- a/bindings/python/src/record_batch.rs +++ b/bindings/python/src/record_batch.rs @@ -3,18 +3,18 @@ use pyo3::{ types::{PyAnyMethods, PyMapping, PyMappingMethods}, Py, PyAny, PyResult, Python, }; -use tonbo::record::DynRecord; +use tonbo::record::{DynRecord, Value}; use crate::Column; #[derive(Clone)] struct Record { - columns: Vec, + columns: Vec, primary_key_index: usize, } impl Record { - fn new(columns: Vec, primary_key_index: usize) -> Self { + fn new(columns: Vec, primary_key_index: usize) -> Self { Self { columns, primary_key_index, @@ -58,7 +58,7 @@ impl RecordBatch { if col.primary_key { primary_key_index = col_idx; } - let col = tonbo::record::Column::from(col); + let col = Value::from(col); cols.push(col); col_idx += 1; } diff --git a/bindings/python/src/transaction.rs b/bindings/python/src/transaction.rs index 70d535d7..cfc816f4 100644 --- a/bindings/python/src/transaction.rs +++ b/bindings/python/src/transaction.rs @@ -7,7 +7,7 @@ use pyo3::{ }; use pyo3_asyncio::tokio::future_into_py; use tonbo::{record::DynRecord, transaction, Projection}; - +use tonbo::record::Value; use crate::{ column::Column, error::{repeated_commit_err, CommitError, DbError}, @@ -123,7 +123,7 @@ impl Transaction { let tuple = x.downcast::()?; let col = tuple.get_item(1)?; if let Ok(bound_col) = col.downcast::() { - let col = tonbo::record::Column::from(bound_col.extract::()?); + let col = Value::from(bound_col.extract::()?); cols.push(col); } } @@ -182,14 +182,14 @@ impl Transaction { let mut scan = txn.scan(( unsafe { transmute::< - std::ops::Bound<&tonbo::record::Column>, - std::ops::Bound<&'static tonbo::record::Column>, + std::ops::Bound<&Value>, + std::ops::Bound<&'static Value>, >(lower.as_ref()) }, unsafe { transmute::< - std::ops::Bound<&tonbo::record::Column>, - std::ops::Bound<&'static tonbo::record::Column>, + std::ops::Bound<&Value>, + std::ops::Bound<&'static Value>, >(high.as_ref()) }, )); diff --git a/bindings/python/src/utils.rs b/bindings/python/src/utils.rs index c6a190d8..1d231c83 100644 --- a/bindings/python/src/utils.rs +++ b/bindings/python/src/utils.rs @@ -4,14 +4,14 @@ use pyo3::{ types::{PyBytes, PyDict, PyDictMethods}, Bound, Py, PyAny, Python, }; -use tonbo::record::Datatype; +use tonbo::record::{Datatype, Value}; use crate::{column::Column, datatype::DataType, range}; pub(crate) fn to_dict( py: Python, primary_key_index: usize, - record: Vec, + record: Vec, ) -> Bound { let dict = PyDict::new_bound(py); for (idx, col) in record.iter().enumerate() 
{ @@ -185,8 +185,8 @@ pub(crate) fn to_key( } } -pub(crate) fn to_col(py: Python, col: &Column, key: Py) -> tonbo::record::Column { - tonbo::record::Column::new( +pub(crate) fn to_col(py: Python, col: &Column, key: Py) -> Value { + Value::new( Datatype::from(&col.datatype), col.name.to_owned(), to_key(py, &col.datatype, key), @@ -200,8 +200,8 @@ pub(crate) fn to_bound( lower: Option>, high: Option>, ) -> ( - std::ops::Bound, - std::ops::Bound, + std::ops::Bound, + std::ops::Bound, ) { let lower = match lower { Some(bound) => bound.get().to_bound(py, col), diff --git a/tests/wasm.rs b/tests/wasm.rs index b700334f..b206371a 100644 --- a/tests/wasm.rs +++ b/tests/wasm.rs @@ -7,46 +7,46 @@ mod tests { use futures::StreamExt; use tonbo::{ executor::opfs::OpfsExecutor, - record::{Datatype, DynRecord, Record, Value, ValueDesc}, + record::{Datatype, DynRecord, DynSchema, Record, Value, ValueDesc}, DbOption, Projection, DB, }; use wasm_bindgen_test::wasm_bindgen_test; - fn test_dyn_item_schema() -> (Vec, usize) { + fn test_dyn_item_schema() -> DynSchema { let descs = vec![ - ColumnDesc::new("id".to_string(), Datatype::Int64, false), - ColumnDesc::new("age".to_string(), Datatype::Int8, true), - ColumnDesc::new("name".to_string(), Datatype::String, false), - ColumnDesc::new("email".to_string(), Datatype::String, true), - ColumnDesc::new("bytes".to_string(), Datatype::Bytes, true), + ValueDesc::new("id".to_string(), Datatype::Int64, false), + ValueDesc::new("age".to_string(), Datatype::Int8, true), + ValueDesc::new("name".to_string(), Datatype::String, false), + ValueDesc::new("email".to_string(), Datatype::String, true), + ValueDesc::new("bytes".to_string(), Datatype::Bytes, true), ]; - (descs, 0) + DynSchema::new(descs, 0) } fn test_dyn_items() -> Vec { let mut items = vec![]; for i in 0..50 { let columns = vec![ - Column::new(Datatype::Int64, "id".to_string(), Arc::new(i as i64), false), - Column::new( + Value::new(Datatype::Int64, "id".to_string(), Arc::new(i as i64), false), + Value::new( Datatype::Int8, "age".to_string(), Arc::new(Some(i as i8)), true, ), - Column::new( + Value::new( Datatype::String, "name".to_string(), Arc::new(i.to_string()), false, ), - Column::new( + Value::new( Datatype::String, "email".to_string(), Arc::new(Some(format!("{}@tonbo.io", i))), true, ), - Column::new( + Value::new( Datatype::Bytes, "bytes".to_string(), Arc::new(Some((i as i32).to_le_bytes().to_vec())), @@ -54,8 +54,7 @@ mod tests { ), ]; - let record = DynRecord::new(columns, 0); - items.push(record); + items.push(DynRecord::new(columns, 0)); } items } @@ -69,7 +68,7 @@ mod tests { #[wasm_bindgen_test] async fn test_wasm_read_write() { - let (cols_desc, primary_key_index) = test_dyn_item_schema(); + let schema = test_dyn_item_schema(); let path = Path::from_opfs_path("opfs_dir_rw").unwrap(); let fs = fusio::disk::LocalFs {}; fs.create_dir_all(&path).await.unwrap(); @@ -77,13 +76,12 @@ mod tests { let option = DbOption::with_path( Path::from_opfs_path("opfs_dir_rw").unwrap(), "id".to_string(), - primary_key_index, + 0, ); - let db: DB = - DB::with_schema(option, OpfsExecutor::new(), cols_desc, primary_key_index) - .await - .unwrap(); + let db: DB = DB::with_schema(option, OpfsExecutor::new(), schema) + .await + .unwrap(); for item in test_dyn_items().into_iter() { db.insert(item).await.unwrap(); @@ -94,7 +92,7 @@ mod tests { let tx = db.transaction().await; for i in 0..50 { - let key = Column::new(Datatype::Int64, "id".to_string(), Arc::new(i as i64), false); + let key = Value::new(Datatype::Int64, 
"id".to_string(), Arc::new(i as i64), false); let option1 = tx.get(&key, Projection::All).await.unwrap(); let entry = option1.unwrap(); let record_ref = entry.get(); @@ -154,7 +152,7 @@ mod tests { #[wasm_bindgen_test] async fn test_wasm_transaction() { - let (cols_desc, primary_key_index) = test_dyn_item_schema(); + let schema = test_dyn_item_schema(); let fs = fusio::disk::LocalFs {}; let path = Path::from_opfs_path("opfs_dir_txn").unwrap(); @@ -163,13 +161,12 @@ mod tests { let option = DbOption::with_path( Path::from_opfs_path("opfs_dir_txn").unwrap(), "id".to_string(), - primary_key_index, + 0, ); - let db: DB = - DB::with_schema(option, OpfsExecutor::new(), cols_desc, primary_key_index) - .await - .unwrap(); + let db: DB = DB::with_schema(option, OpfsExecutor::new(), schema) + .await + .unwrap(); { let mut txn = db.transaction().await; @@ -182,8 +179,8 @@ mod tests { // test scan { let txn = db.transaction().await; - let lower = Column::new(Datatype::Int64, "id".to_owned(), Arc::new(5_i64), false); - let upper = Column::new(Datatype::Int64, "id".to_owned(), Arc::new(47_i64), false); + let lower = Value::new(Datatype::Int64, "id".to_owned(), Arc::new(5_i64), false); + let upper = Value::new(Datatype::Int64, "id".to_owned(), Arc::new(47_i64), false); let mut scan = txn .scan((Bound::Included(&lower), Bound::Included(&upper))) .projection(vec![0, 2, 4]) @@ -239,7 +236,7 @@ mod tests { #[wasm_bindgen_test] async fn test_wasm_schema_recover() { - let (cols_desc, primary_key_index) = test_dyn_item_schema(); + let schema = test_dyn_item_schema(); let path = Path::from_opfs_path("opfs_dir").unwrap(); let fs = fusio::disk::LocalFs {}; fs.create_dir_all(&path).await.unwrap(); @@ -247,12 +244,12 @@ mod tests { let option = DbOption::with_path( Path::from_opfs_path("opfs_dir").unwrap(), "id".to_string(), - primary_key_index, + 0, ); { let db: DB = - DB::with_schema(option, OpfsExecutor::new(), cols_desc, primary_key_index) + DB::with_schema(option, OpfsExecutor::new(), schema) .await .unwrap(); @@ -263,16 +260,15 @@ mod tests { db.flush_wal().await.unwrap(); } - let (cols_desc, primary_key_index) = test_dyn_item_schema(); + let schema = test_dyn_item_schema(); let option = DbOption::with_path( Path::from_opfs_path("opfs_dir").unwrap(), "id".to_string(), - primary_key_index, + 0, ); - let db: DB = - DB::with_schema(option, OpfsExecutor::new(), cols_desc, primary_key_index) - .await - .unwrap(); + let db: DB = DB::with_schema(option, OpfsExecutor::new(), schema) + .await + .unwrap(); let mut sort_items = BTreeMap::new(); for item in test_dyn_items() { From 9d05f1c8fbf677568932ac179eefdcc30f63f9dd Mon Sep 17 00:00:00 2001 From: Gwo Tzu-Hsing Date: Mon, 16 Dec 2024 18:18:36 +0800 Subject: [PATCH 4/9] Fix refactoring bug --- src/compaction/mod.rs | 10 ++++++++-- src/option.rs | 1 - src/record/mod.rs | 11 ++++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs index 65b648e5..3e263c0a 100644 --- a/src/compaction/mod.rs +++ b/src/compaction/mod.rs @@ -76,9 +76,15 @@ where let mutable = mem::replace( &mut guard.mutable, - Mutable::new(&self.option, trigger_clone, self.manager.base_fs()).await?, + Mutable::new( + &self.option, + trigger_clone, + self.manager.base_fs(), + self.record_schema.clone(), + ) + .await?, ); - let (file_id, immutable) = mutable.into_immutable(&guard.record_instance).await?; + let (file_id, immutable) = mutable.into_immutable().await?; guard.immutables.push((file_id, immutable)); } else if !is_manual { return Ok(()); 
diff --git a/src/option.rs b/src/option.rs index ef7fde43..50d23940 100644 --- a/src/option.rs +++ b/src/option.rs @@ -28,7 +28,6 @@ pub struct DbOption { pub(crate) clean_channel_buffer: usize, pub(crate) base_path: Path, pub(crate) base_fs: FsOptions, - // TODO: DEBUG pub(crate) level_paths: Vec>, pub(crate) immutable_chunk_num: usize, pub(crate) immutable_chunk_max_num: usize, diff --git a/src/record/mod.rs b/src/record/mod.rs index 497d1dd4..664f134e 100644 --- a/src/record/mod.rs +++ b/src/record/mod.rs @@ -110,7 +110,16 @@ impl Schema for DynSchema { } fn primary_key_path(&self) -> (ColumnPath, Vec) { - unimplemented!() + ( + ColumnPath::new(vec![ + "_ts".to_string(), + self.schema[self.primary_index].name.clone(), + ]), + vec![ + SortingColumn::new(1_i32, true, true), + SortingColumn::new(self.primary_key_index() as i32, false, true), + ], + ) } } From ca90c0ee59f502f64f80691b263dda4ec91967b5 Mon Sep 17 00:00:00 2001 From: Gwo Tzu-Hsing Date: Mon, 16 Dec 2024 20:02:09 +0800 Subject: [PATCH 5/9] cache --- examples/dynamic.rs | 62 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 examples/dynamic.rs diff --git a/examples/dynamic.rs b/examples/dynamic.rs new file mode 100644 index 00000000..fba6071d --- /dev/null +++ b/examples/dynamic.rs @@ -0,0 +1,62 @@ +use std::fs; +use std::sync::Arc; + +use fusio::path::Path; +use tonbo::executor::tokio::TokioExecutor; +use tonbo::record::{Datatype, DynRecord, DynSchema, Value, ValueDesc}; +use tonbo::{DbOption, DB}; + +#[tokio::main] +async fn main() { + fs::create_dir_all("./db_path/users").unwrap(); + + let schema = DynSchema::new( + vec![ + ValueDesc::new("foo".into(), Datatype::String, false), + ValueDesc::new("bar".into(), Datatype::Int32, true), + ], + 0, + ); + + let options = DbOption::from(( + Path::from_filesystem_path("./db_path/users").unwrap(), + &schema, + )); + let db = DB::with_schema(options, TokioExecutor::new(), schema) + .await + .unwrap(); + + { + let mut txn = db.transaction().await; + txn.insert(DynRecord::new( + vec![ + Value::new( + Datatype::String, + "foo".into(), + Arc::new("hello".to_owned()), + false, + ), + Value::new(Datatype::Int32, "bar".into(), Arc::new(1), true), + ], + 0, + )); + + txn.commit().await.unwrap(); + } + + db.get( + &Value::new( + Datatype::String, + "foo".into(), + Arc::new("hello".to_owned()), + false, + ), + |v| { + let v = v.get(); + println!("{:?}", v.columns[0].value.downcast_ref::()); + Some(()) + }, + ) + .await + .unwrap(); +} From 9e80dba241c3f52fee3a860fbb4e1725dd051dd6 Mon Sep 17 00:00:00 2001 From: Gwo Tzu-Hsing Date: Mon, 16 Dec 2024 22:26:25 +0800 Subject: [PATCH 6/9] refactor: remove R type arg on `DbOption` --- benches/common.rs | 4 +- bindings/python/src/db.rs | 17 +---- bindings/python/src/options.rs | 27 +++---- bindings/python/src/range.rs | 1 + bindings/python/src/transaction.rs | 21 +++--- bindings/python/src/utils.rs | 11 +-- examples/datafusion.rs | 4 +- examples/declare.rs | 4 +- examples/dynamic.rs | 17 +++-- src/compaction/mod.rs | 48 ++++++------ src/inmem/mutable.rs | 30 ++++---- src/lib.rs | 116 ++++++++++------------------- src/ondisk/sstable.rs | 8 +- src/option.rs | 98 +++++------------------- src/record/runtime/record.rs | 66 +--------------- src/snapshot.rs | 4 +- src/stream/level.rs | 15 ++-- src/stream/mem_projection.rs | 6 +- src/stream/merge.rs | 20 ++--- src/stream/package.rs | 6 +- src/transaction.rs | 43 +++++------ src/version/cleaner.rs | 28 +++---- src/version/mod.rs | 6 +- src/version/set.rs | 18 ++--- 
tests/data_integrity.rs | 11 +-- tests/macros_correctness.rs | 1 - tests/wasm.rs | 89 ++++++++-------------- 27 files changed, 248 insertions(+), 471 deletions(-) diff --git a/benches/common.rs b/benches/common.rs index c0249dc7..52480eaf 100644 --- a/benches/common.rs +++ b/benches/common.rs @@ -273,7 +273,7 @@ impl BenchDatabase for TonboS3BenchDataBase { .disable_wal(); TonboS3BenchDataBase::new( - tonbo::DB::new(option, TokioExecutor::current()) + tonbo::DB::new(option, TokioExecutor::current(), &CustomerSchema) .await .unwrap(), ) @@ -324,7 +324,7 @@ impl BenchDatabase for TonboBenchDataBase { DbOption::from(fusio::path::Path::from_filesystem_path(path.as_ref()).unwrap()) .disable_wal(); - let db = tonbo::DB::new(option, TokioExecutor::current()) + let db = tonbo::DB::new(option, TokioExecutor::current(), &CustomerSchema) .await .unwrap(); TonboBenchDataBase::new(db) diff --git a/bindings/python/src/db.rs b/bindings/python/src/db.rs index 3b69d9f1..db587e70 100644 --- a/bindings/python/src/db.rs +++ b/bindings/python/src/db.rs @@ -9,10 +9,10 @@ use pyo3::{ use pyo3_asyncio::tokio::{future_into_py, get_runtime}; use tonbo::{ executor::tokio::TokioExecutor, - record::DynRecord, + record::{DynRecord, DynSchema, Value, ValueDesc}, DB, }; -use tonbo::record::{DynSchema, Value, ValueDesc}; + use crate::{ column::Column, error::{CommitError, DbError}, @@ -40,7 +40,6 @@ impl TonboDB { let mut desc = vec![]; let mut cols = vec![]; let mut primary_key_index = None; - let mut primary_key_name = None; for i in 0..values.len()? { let value = values.get_item(i)?; @@ -51,23 +50,15 @@ impl TonboDB { panic!("Multiple primary keys is not allowed!") } primary_key_index = Some(desc.len()); - primary_key_name = Some(col.name.clone()); } cols.push(col.clone()); desc.push(ValueDesc::from(col)); } } let schema = DynSchema::new(desc, primary_key_index.unwrap()); - let option = option.into_option(primary_key_index.unwrap(), primary_key_name.unwrap()); + let option = option.into_option(&schema); let db = get_runtime() - .block_on(async { - DB::with_schema( - option, - TokioExecutor::current(), - schema, - ) - .await - }) + .block_on(async { DB::new(option, TokioExecutor::current(), schema).await }) .unwrap(); Ok(Self { db: Arc::new(db), diff --git a/bindings/python/src/options.rs b/bindings/python/src/options.rs index 3e994c62..5fa98ca2 100644 --- a/bindings/python/src/options.rs +++ b/bindings/python/src/options.rs @@ -1,6 +1,6 @@ use fusio::path::Path; use pyo3::{pyclass, pymethods, PyResult}; -use tonbo::record::DynRecord; +use tonbo::record::Schema; use crate::{ExceedsMaxLevelError, FsOptions}; @@ -72,21 +72,16 @@ impl DbOption { } impl DbOption { - pub(crate) fn into_option( - self, - primary_key_index: usize, - primary_key_name: String, - ) -> tonbo::DbOption { - let mut opt = - tonbo::DbOption::with_path(Path::from(self.path), primary_key_name, primary_key_index) - .clean_channel_buffer(self.clean_channel_buffer) - .immutable_chunk_num(self.immutable_chunk_num) - .level_sst_magnification(self.level_sst_magnification) - .major_default_oldest_table_num(self.major_default_oldest_table_num) - .major_threshold_with_sst_size(self.major_threshold_with_sst_size) - .max_sst_file_size(self.max_sst_file_size) - .version_log_snapshot_threshold(self.version_log_snapshot_threshold) - .base_fs(fusio_dispatch::FsOptions::from(self.base_fs)); + pub(crate) fn into_option(self, schema: &S) -> tonbo::DbOption { + let mut opt = tonbo::DbOption::new(Path::from(self.path), schema) + 
.clean_channel_buffer(self.clean_channel_buffer) + .immutable_chunk_num(self.immutable_chunk_num) + .level_sst_magnification(self.level_sst_magnification) + .major_default_oldest_table_num(self.major_default_oldest_table_num) + .major_threshold_with_sst_size(self.major_threshold_with_sst_size) + .max_sst_file_size(self.max_sst_file_size) + .version_log_snapshot_threshold(self.version_log_snapshot_threshold) + .base_fs(fusio_dispatch::FsOptions::from(self.base_fs)); for (level, path) in self.level_paths.into_iter().enumerate() { if let Some((path, fs_options)) = path { opt = opt diff --git a/bindings/python/src/range.rs b/bindings/python/src/range.rs index 84dcd589..297c1aca 100644 --- a/bindings/python/src/range.rs +++ b/bindings/python/src/range.rs @@ -2,6 +2,7 @@ use std::ops; use pyo3::{pyclass, FromPyObject, Py, PyAny, Python}; use tonbo::record::Value; + use crate::{utils::to_col, Column}; #[pyclass] diff --git a/bindings/python/src/transaction.rs b/bindings/python/src/transaction.rs index cfc816f4..f9f5212c 100644 --- a/bindings/python/src/transaction.rs +++ b/bindings/python/src/transaction.rs @@ -6,8 +6,11 @@ use pyo3::{ Bound, IntoPy, Py, PyAny, PyResult, Python, }; use pyo3_asyncio::tokio::future_into_py; -use tonbo::{record::DynRecord, transaction, Projection}; -use tonbo::record::Value; +use tonbo::{ + record::{DynRecord, Value}, + transaction, Projection, +}; + use crate::{ column::Column, error::{repeated_commit_err, CommitError, DbError}, @@ -181,16 +184,14 @@ impl Transaction { future_into_py(py, async move { let mut scan = txn.scan(( unsafe { - transmute::< - std::ops::Bound<&Value>, - std::ops::Bound<&'static Value>, - >(lower.as_ref()) + transmute::, std::ops::Bound<&'static Value>>( + lower.as_ref(), + ) }, unsafe { - transmute::< - std::ops::Bound<&Value>, - std::ops::Bound<&'static Value>, - >(high.as_ref()) + transmute::, std::ops::Bound<&'static Value>>( + high.as_ref(), + ) }, )); diff --git a/bindings/python/src/utils.rs b/bindings/python/src/utils.rs index 1d231c83..0dfb64fb 100644 --- a/bindings/python/src/utils.rs +++ b/bindings/python/src/utils.rs @@ -8,11 +8,7 @@ use tonbo::record::{Datatype, Value}; use crate::{column::Column, datatype::DataType, range}; -pub(crate) fn to_dict( - py: Python, - primary_key_index: usize, - record: Vec, -) -> Bound { +pub(crate) fn to_dict(py: Python, primary_key_index: usize, record: Vec) -> Bound { let dict = PyDict::new_bound(py); for (idx, col) in record.iter().enumerate() { match &col.datatype { @@ -199,10 +195,7 @@ pub(crate) fn to_bound( col: &Column, lower: Option>, high: Option>, -) -> ( - std::ops::Bound, - std::ops::Bound, -) { +) -> (std::ops::Bound, std::ops::Bound) { let lower = match lower { Some(bound) => bound.get().to_bound(py, col), None => std::ops::Bound::Unbounded, diff --git a/examples/datafusion.rs b/examples/datafusion.rs index b9564c93..34a9c452 100644 --- a/examples/datafusion.rs +++ b/examples/datafusion.rs @@ -221,10 +221,10 @@ async fn main() -> Result<()> { // make sure the path exists let _ = fs::create_dir_all("./db_path/music").await; - let options = DbOption::from(( + let options = DbOption::new( Path::from_filesystem_path("./db_path/music").unwrap(), &MusicSchema, - )); + ); let db = DB::new(options, TokioExecutor::current(), MusicSchema) .await diff --git a/examples/declare.rs b/examples/declare.rs index 685255a6..a4c7d781 100644 --- a/examples/declare.rs +++ b/examples/declare.rs @@ -22,10 +22,10 @@ async fn main() { // make sure the path exists let _ = 
fs::create_dir_all("./db_path/users").await; - let options = DbOption::from(( + let options = DbOption::new( Path::from_filesystem_path("./db_path/users").unwrap(), &UserSchema, - )); + ); // pluggable async runtime and I/O let db = DB::new(options, TokioExecutor::current(), UserSchema) .await diff --git a/examples/dynamic.rs b/examples/dynamic.rs index cc5c8dad..e982bf2b 100644 --- a/examples/dynamic.rs +++ b/examples/dynamic.rs @@ -1,10 +1,11 @@ -use std::fs; -use std::sync::Arc; +use std::{fs, sync::Arc}; use fusio::path::Path; -use tonbo::executor::tokio::TokioExecutor; -use tonbo::record::{Datatype, DynRecord, DynSchema, Value, ValueDesc}; -use tonbo::{DbOption, DB}; +use tonbo::{ + executor::tokio::TokioExecutor, + record::{Datatype, DynRecord, DynSchema, Value, ValueDesc}, + DbOption, DB, +}; #[tokio::main] async fn main() { @@ -18,11 +19,11 @@ async fn main() { 0, ); - let options = DbOption::from(( + let options = DbOption::new( Path::from_filesystem_path("./db_path/users").unwrap(), &schema, - )); - let db = DB::with_schema(options, TokioExecutor::current(), schema) + ); + let db = DB::new(options, TokioExecutor::current(), schema) .await .unwrap(); diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs index f4d7da2b..cc9262c8 100644 --- a/src/compaction/mod.rs +++ b/src/compaction/mod.rs @@ -35,7 +35,7 @@ pub(crate) struct Compactor where R: Record, { - pub(crate) option: Arc>, + pub(crate) option: Arc, pub(crate) schema: Arc>>, pub(crate) version_set: VersionSet, pub(crate) manager: Arc, @@ -49,7 +49,7 @@ where pub(crate) fn new( schema: Arc>>, record_schema: Arc, - option: Arc>, + option: Arc, version_set: VersionSet, manager: Arc, ) -> Self { @@ -148,7 +148,7 @@ where } pub(crate) async fn minor_compaction( - option: &DbOption, + option: &DbOption, recover_wal_ids: Option>, batches: &[( Option, @@ -211,7 +211,7 @@ where #[allow(clippy::too_many_arguments)] pub(crate) async fn major_compaction( version: &Version, - option: &DbOption, + option: &DbOption, mut min: &::Key, mut max: &::Key, version_edits: &mut Vec::Key>>, @@ -419,7 +419,7 @@ where } async fn build_tables<'scan>( - option: &DbOption, + option: &DbOption, version_edits: &mut Vec::Key>>, level: usize, streams: Vec>, @@ -490,7 +490,7 @@ where #[allow(clippy::too_many_arguments)] async fn build_table( - option: &DbOption, + option: &DbOption, version_edits: &mut Vec::Key>>, level: usize, builder: &mut <::Columns as ArrowArrays>::Builder, @@ -582,7 +582,7 @@ pub(crate) mod tests { }; async fn build_immutable( - option: &DbOption, + option: &DbOption, records: Vec<(LogType, R, Timestamp)>, schema: &Arc, fs: &Arc, @@ -601,7 +601,7 @@ pub(crate) mod tests { } pub(crate) async fn build_parquet_table( - option: &DbOption, + option: &DbOption, gen: FileId, records: Vec<(LogType, R, Timestamp)>, schema: &Arc, @@ -634,10 +634,10 @@ pub(crate) mod tests { let temp_dir = tempfile::tempdir().unwrap(); let temp_dir_l0 = tempfile::tempdir().unwrap(); - let option = DbOption::from(( + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - )) + ) .level_path( 0, Path::from_filesystem_path(temp_dir_l0.path()).unwrap(), @@ -747,21 +747,21 @@ pub(crate) mod tests { async fn dyn_minor_compaction() { let temp_dir = tempfile::tempdir().unwrap(); let manager = StoreManager::new(FsOptions::Local, vec![]).unwrap(); - let option = DbOption::with_path( - Path::from_filesystem_path(temp_dir.path()).unwrap(), - "id".to_string(), + let schema = DynSchema::new( + vec![ValueDesc::new("id".to_owned(), 
Datatype::Int32, false)], 0, ); + let option = DbOption::new( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &schema, + ); manager .base_fs() .create_dir_all(&option.wal_dir_path()) .await .unwrap(); - let instance = Arc::new(DynSchema::new( - vec![ValueDesc::new("id".to_owned(), Datatype::Int32, false)], - 0, - )); + let instance = Arc::new(schema); let mut batch1_data = vec![]; let mut batch2_data = vec![]; @@ -818,10 +818,10 @@ pub(crate) mod tests { let temp_dir_l0 = TempDir::new().unwrap(); let temp_dir_l1 = TempDir::new().unwrap(); - let mut option = DbOption::from(( + let mut option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - )) + ) .level_path( 0, Path::from_filesystem_path(temp_dir_l0.path()).unwrap(), @@ -900,7 +900,7 @@ pub(crate) mod tests { } pub(crate) async fn build_version( - option: &Arc>, + option: &Arc, manager: &StoreManager, schema: &Arc, ) -> ((FileId, FileId, FileId, FileId, FileId), Version) { @@ -1162,10 +1162,10 @@ pub(crate) mod tests { pub(crate) async fn major_panic() { let temp_dir = TempDir::new().unwrap(); - let mut option = DbOption::from(( + let mut option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - )); + ); option.major_threshold_with_sst_size = 1; option.level_sst_magnification = 1; let manager = @@ -1273,10 +1273,10 @@ pub(crate) mod tests { async fn test_flush_major_level_sort() { let temp_dir = TempDir::new().unwrap(); - let mut option = DbOption::from(( + let mut option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - )); + ); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 0; option.major_threshold_with_sst_size = 2; diff --git a/src/inmem/mutable.rs b/src/inmem/mutable.rs index 79b29b00..0212bf35 100644 --- a/src/inmem/mutable.rs +++ b/src/inmem/mutable.rs @@ -47,7 +47,7 @@ where R: Record, { pub async fn new( - option: &DbOption, + option: &DbOption, trigger: Arc + Send + Sync>>, fs: &Arc, schema: Arc, @@ -234,10 +234,10 @@ mod tests { let temp_dir = tempfile::tempdir().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(( + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - )); + ); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); @@ -287,10 +287,10 @@ mod tests { async fn range() { let temp_dir = tempfile::tempdir().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(( + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &StringSchema, - )); + ); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); @@ -375,23 +375,23 @@ mod tests { #[tokio::test] async fn test_dyn_read() { let temp_dir = tempfile::tempdir().unwrap(); - let option = DbOption::with_path( - Path::from_filesystem_path(temp_dir.path()).unwrap(), - "age".to_string(), + let schema = DynSchema::new( + vec![ + ValueDesc::new("age".to_string(), Datatype::Int8, false), + ValueDesc::new("height".to_string(), Datatype::Int16, true), + ], 0, ); + let option = DbOption::new( + Path::from_filesystem_path(temp_dir.path()).unwrap(), + &schema, + ); let fs = Arc::new(TokioFs) as Arc; fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let schema = Arc::new(DynSchema::new( - vec![ - 
ValueDesc::new("age".to_string(), Datatype::Int8, false), - ValueDesc::new("height".to_string(), Datatype::Int16, true), - ], - 0, - )); + let schema = Arc::new(schema); let mutable = Mutable::::new(&option, trigger, &fs, schema) .await diff --git a/src/lib.rs b/src/lib.rs index 43c3c6a1..e58db942 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -54,10 +54,10 @@ //! // make sure the path exists //! let _ = fs::create_dir_all("./db_path/users").await; //! -//! let options = DbOption::from(( +//! let options = DbOption::new( //! Path::from_filesystem_path("./db_path/users").unwrap(), //! &UserSchema, -//! )); +//! ); //! // pluggable async runtime and I/O //! let db = DB::new(options, TokioExecutor::current(), UserSchema) //! .await @@ -154,7 +154,7 @@ use parquet::{ errors::ParquetError, }; use parquet_lru::{DynLruCache, NoCache}; -use record::{DynRecord, Record}; +use record::Record; use thiserror::Error; use timestamp::{Timestamp, TimestampedRef}; use tokio::sync::oneshot; @@ -167,7 +167,7 @@ use crate::{ compaction::{CompactTask, CompactionError, Compactor}, executor::Executor, fs::{manager::StoreManager, parse_file_id, FileType}, - record::{DynSchema, Schema as RecordSchema}, + record::Schema as RecordSchema, serdes::Decode, snapshot::Snapshot, stream::{ @@ -193,22 +193,6 @@ where _p: PhantomData, } -impl DB -where - E: Executor + Send + Sync + 'static, -{ - /// Open [`DB`] with schema which determined by [`ColumnDesc`]. - pub async fn with_schema( - option: DbOption, - executor: E, - schema: DynSchema, - ) -> Result> { - let option = Arc::new(option); - - Self::build(option, executor, schema, Arc::new(NoCache::default())).await - } -} - impl DB where R: Record + Send + Sync, @@ -220,11 +204,7 @@ where /// according to the configuration of [`DbOption`]. /// /// For more configurable options, please refer to [`DbOption`]. 
- pub async fn new( - option: DbOption, - executor: E, - schema: R::Schema, - ) -> Result> { + pub async fn new(option: DbOption, executor: E, schema: R::Schema) -> Result> { Self::build( Arc::new(option), executor, @@ -242,7 +222,7 @@ where E: Executor + Send + Sync + 'static, { async fn build( - option: Arc>, + option: Arc, executor: E, schema: R::Schema, lru_cache: ParquetLru, @@ -266,7 +246,7 @@ where } let (task_tx, task_rx) = bounded(1); - let (mut cleaner, clean_sender) = Cleaner::::new(option.clone(), manager.clone()); + let (mut cleaner, clean_sender) = Cleaner::new(option.clone(), manager.clone()); let version_set = VersionSet::new(clean_sender, option.clone(), manager.clone()).await?; let schema = Arc::new(RwLock::new( @@ -505,7 +485,7 @@ where R: Record + Send, { async fn new( - option: Arc>, + option: Arc, compaction_tx: Sender, version_set: &VersionSet, record_schema: Arc, @@ -683,10 +663,6 @@ where self.mutable.flush_wal().await?; Ok(()) } - - pub(crate) fn record_schema(&self) -> &Arc { - &self.record_schema - } } /// scan configuration intermediate structure @@ -931,15 +907,14 @@ pub(crate) mod tests { use arrow::{ array::{Array, AsArray, RecordBatch}, - datatypes::{DataType, Field, Schema, UInt32Type}, + datatypes::{Schema, UInt32Type}, }; use async_lock::RwLock; use flume::{bounded, Receiver}; use fusio::{disk::TokioFs, path::Path, DynFs, SeqRead, Write}; use fusio_dispatch::FsOptions; use futures::StreamExt; - use once_cell::sync::Lazy; - use parquet::{arrow::ProjectionMask, format::SortingColumn, schema::types::ColumnPath}; + use parquet::arrow::ProjectionMask; use parquet_lru::NoCache; use tempfile::TempDir; use tracing::error; @@ -947,15 +922,12 @@ pub(crate) mod tests { use crate::{ compaction::{CompactTask, CompactionError, Compactor}, executor::{tokio::TokioExecutor, Executor}, - fs::{generate_file_id, manager::StoreManager, FileId}, - inmem::{ - immutable::tests::{TestImmutableArrays, TestSchema}, - mutable::Mutable, - }, + fs::{generate_file_id, manager::StoreManager}, + inmem::{immutable::tests::TestSchema, mutable::Mutable}, record::{ internal::InternalRecordRef, runtime::test::{test_dyn_item_schema, test_dyn_items}, - Datatype, DynRecord, DynSchema, Key, RecordDecodeError, RecordEncodeError, RecordRef, + Datatype, DynRecord, Key, RecordDecodeError, RecordEncodeError, RecordRef, Schema as RecordSchema, Value, }, serdes::{Decode, Encode}, @@ -1150,7 +1122,7 @@ pub(crate) mod tests { } pub(crate) async fn get_test_record_batch( - option: DbOption, + option: DbOption, executor: E, ) -> RecordBatch { let db: DB = DB::new(option.clone(), executor, TestSchema {}) @@ -1198,7 +1170,7 @@ pub(crate) mod tests { } pub(crate) async fn build_schema( - option: Arc>, + option: Arc, fs: &Arc, ) -> Result<(crate::Schema, Receiver), fusio::Error> { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); @@ -1307,7 +1279,7 @@ pub(crate) mod tests { } pub(crate) async fn build_db( - option: Arc>, + option: Arc, compaction_rx: Receiver, executor: E, schema: crate::Schema, @@ -1329,7 +1301,7 @@ pub(crate) mod tests { let schema = Arc::new(RwLock::new(schema)); - let (mut cleaner, clean_sender) = Cleaner::::new(option.clone(), manager.clone()); + let (mut cleaner, clean_sender) = Cleaner::new(option.clone(), manager.clone()); let version_set = build_version_set(version, clean_sender, option.clone(), manager.clone()).await?; let mut compactor = Compactor::::new( @@ -1595,7 +1567,7 @@ pub(crate) mod tests { let path = Path::from_filesystem_path(temp_dir.path()).unwrap(); 
let path_l0 = Path::from_filesystem_path(temp_dir_l0.path()).unwrap(); - let mut option = DbOption::from((path, &TestSchema)) + let mut option = DbOption::new(path, &TestSchema) .level_path(0, path_l0, FsOptions::Local) .unwrap(); option.immutable_chunk_num = 1; @@ -1635,10 +1607,10 @@ pub(crate) mod tests { async fn test_flush() { let temp_dir = TempDir::new().unwrap(); - let mut option = DbOption::from(( + let mut option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - )); + ); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 1; option.major_threshold_with_sst_size = 3; @@ -1672,10 +1644,10 @@ pub(crate) mod tests { let temp_dir = TempDir::new().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = Arc::new(DbOption::from(( + let option = Arc::new(DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - ))); + )); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); let (task_tx, _task_rx) = bounded(1); @@ -1738,10 +1710,9 @@ pub(crate) mod tests { let manager = StoreManager::new(FsOptions::Local, vec![]).unwrap(); let dyn_schema = Arc::new(test_dyn_item_schema()); - let option = Arc::new(DbOption::with_path( + let option = Arc::new(DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), - "id".to_owned(), - dyn_schema.primary_key_index(), + dyn_schema.as_ref(), )); manager .base_fs() @@ -1777,14 +1748,13 @@ pub(crate) mod tests { schema.flush_wal().await.unwrap(); drop(schema); - let option = DbOption::with_path( + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), - "id".to_owned(), - dyn_schema.primary_key_index(), + dyn_schema.as_ref(), ); let dyn_schema = test_dyn_item_schema(); let db: DB = - DB::with_schema(option, TokioExecutor::current(), dyn_schema) + DB::new(option, TokioExecutor::current(), dyn_schema) .await .unwrap(); @@ -1817,10 +1787,10 @@ pub(crate) mod tests { async fn test_get_removed() { let temp_dir = TempDir::new().unwrap(); - let mut option = DbOption::from(( + let mut option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - )); + ); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 1; option.major_threshold_with_sst_size = 3; @@ -1857,10 +1827,9 @@ pub(crate) mod tests { let temp_dir = TempDir::new().unwrap(); let dyn_schema = test_dyn_item_schema(); - let mut option = DbOption::with_path( + let mut option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), - "id".to_string(), - dyn_schema.primary_key_index(), + &dyn_schema, ); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 1; @@ -1871,7 +1840,7 @@ pub(crate) mod tests { option.trigger_type = TriggerType::Length(5); let db: DB = - DB::with_schema(option, TokioExecutor::current(), dyn_schema) + DB::new(option, TokioExecutor::current(), dyn_schema) .await .unwrap(); @@ -2065,10 +2034,9 @@ pub(crate) mod tests { let temp_dir1 = TempDir::with_prefix("db1").unwrap(); let dyn_schema = test_dyn_item_schema(); - let mut option = DbOption::with_path( + let mut option = DbOption::new( Path::from_filesystem_path(temp_dir1.path()).unwrap(), - "id".to_string(), - dyn_schema.primary_key_index(), + &dyn_schema, ); option.immutable_chunk_num = 1; option.immutable_chunk_max_num = 1; @@ -2077,10 +2045,9 @@ pub(crate) mod tests { option.trigger_type = TriggerType::Length(5); let temp_dir2 = TempDir::with_prefix("db2").unwrap(); - let mut option2 = DbOption::with_path( + let mut option2 = DbOption::new( 
Path::from_filesystem_path(temp_dir2.path()).unwrap(), - "id".to_string(), - dyn_schema.primary_key_index(), + &dyn_schema, ); option2.immutable_chunk_num = 1; option2.immutable_chunk_max_num = 1; @@ -2089,10 +2056,9 @@ pub(crate) mod tests { option2.trigger_type = TriggerType::Length(5); let temp_dir3 = TempDir::with_prefix("db3").unwrap(); - let mut option3 = DbOption::with_path( + let mut option3 = DbOption::new( Path::from_filesystem_path(temp_dir3.path()).unwrap(), - "id".to_string(), - dyn_schema.primary_key_index(), + &dyn_schema, ); option3.immutable_chunk_num = 1; option3.immutable_chunk_max_num = 1; @@ -2101,15 +2067,15 @@ pub(crate) mod tests { option3.trigger_type = TriggerType::Length(5); let db1: DB = - DB::with_schema(option, TokioExecutor::current(), test_dyn_item_schema()) + DB::new(option, TokioExecutor::current(), test_dyn_item_schema()) .await .unwrap(); let db2: DB = - DB::with_schema(option2, TokioExecutor::current(), test_dyn_item_schema()) + DB::new(option2, TokioExecutor::current(), test_dyn_item_schema()) .await .unwrap(); let db3: DB = - DB::with_schema(option3, TokioExecutor::current(), test_dyn_item_schema()) + DB::new(option3, TokioExecutor::current(), test_dyn_item_schema()) .await .unwrap(); diff --git a/src/ondisk/sstable.rs b/src/ondisk/sstable.rs index 8d5501c8..b92acd63 100644 --- a/src/ondisk/sstable.rs +++ b/src/ondisk/sstable.rs @@ -193,10 +193,10 @@ pub(crate) mod tests { let manager = StoreManager::new(FsOptions::Local, vec![]).unwrap(); let base_fs = manager.base_fs(); let record_batch = get_test_record_batch::( - DbOption::from(( + DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - )), + ), TokioExecutor::current(), ) .await; @@ -271,10 +271,10 @@ pub(crate) mod tests { let manager = StoreManager::new(FsOptions::Local, vec![]).unwrap(); let base_fs = manager.base_fs(); let record_batch = get_test_record_batch::( - DbOption::from(( + DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - )), + ), TokioExecutor::current(), ) .await; diff --git a/src/option.rs b/src/option.rs index 50d23940..3d69e76b 100644 --- a/src/option.rs +++ b/src/option.rs @@ -1,30 +1,25 @@ -use std::{ - fmt::{Debug, Formatter}, - marker::PhantomData, -}; +use std::fmt::{Debug, Formatter}; use fusio::path::Path; use fusio_dispatch::FsOptions; use parquet::{ basic::Compression, file::properties::{EnabledStatistics, WriterProperties}, - format::SortingColumn, - schema::types::ColumnPath, }; +use thiserror::Error; use crate::{ fs::{FileId, FileType}, record::{Record, Schema}, trigger::TriggerType, version::{Version, MAX_LEVEL}, - DbError, }; const DEFAULT_WAL_BUFFER_SIZE: usize = 4 * 1024; /// configure the operating parameters of each component in the [`DB`](crate::DB) #[derive(Clone)] -pub struct DbOption { +pub struct DbOption { pub(crate) clean_channel_buffer: usize, pub(crate) base_path: Path, pub(crate) base_fs: FsOptions, @@ -41,17 +36,12 @@ pub struct DbOption { pub(crate) use_wal: bool, pub(crate) wal_buffer_size: usize, pub(crate) write_parquet_properties: WriterProperties, - _p: PhantomData, } -impl DbOption -where - R: Record, -{ +impl DbOption { /// build the default configured [`DbOption`] with base path and primary key - pub fn with_path(base_path: Path, primary_key_name: String, primary_key_index: usize) -> Self { - let (column_paths, sorting_columns) = - Self::primary_key_path(primary_key_name, primary_key_index); + pub fn new(base_path: Path, schema: &S) -> Self { + let (column_paths, sorting_columns) 
= schema.primary_key_path(); DbOption { immutable_chunk_num: 3, @@ -74,67 +64,14 @@ where major_default_oldest_table_num: 3, major_l_selection_table_max_num: 4, trigger_type: TriggerType::SizeOfMem(64 * 1024 * 1024), - _p: Default::default(), version_log_snapshot_threshold: 200, level_paths: vec![None; MAX_LEVEL], base_fs: FsOptions::Local, } } - - fn primary_key_path( - primary_key_name: String, - primary_key_index: usize, - ) -> (ColumnPath, Vec) { - ( - ColumnPath::new(vec!["_ts".to_string(), primary_key_name]), - vec![ - SortingColumn::new(1_i32, true, true), - SortingColumn::new(primary_key_index as i32, false, true), - ], - ) - } } -impl From<(Path, &R::Schema)> for DbOption -where - R: Record, -{ - /// build the default configured [`DbOption`] based on the passed path - fn from((base_path, schema): (Path, &R::Schema)) -> Self { - let (column_paths, sorting_columns) = schema.primary_key_path(); - DbOption { - immutable_chunk_num: 3, - immutable_chunk_max_num: 5, - major_threshold_with_sst_size: 4, - level_sst_magnification: 10, - max_sst_file_size: 256 * 1024 * 1024, - clean_channel_buffer: 10, - base_path, - base_fs: FsOptions::Local, - write_parquet_properties: WriterProperties::builder() - .set_compression(Compression::LZ4) - .set_column_statistics_enabled(column_paths.clone(), EnabledStatistics::Page) - .set_column_bloom_filter_enabled(column_paths.clone(), true) - .set_sorting_columns(Some(sorting_columns)) - .set_created_by(concat!("tonbo version ", env!("CARGO_PKG_VERSION")).to_owned()) - .build(), - - use_wal: true, - wal_buffer_size: DEFAULT_WAL_BUFFER_SIZE, - major_default_oldest_table_num: 3, - major_l_selection_table_max_num: 4, - trigger_type: TriggerType::SizeOfMem(64 * 1024 * 1024), - _p: Default::default(), - version_log_snapshot_threshold: 200, - level_paths: vec![None; MAX_LEVEL], - } - } -} - -impl DbOption -where - R: Record, -{ +impl DbOption { /// build the [`DB`](crate::DB) storage directory based on the passed path pub fn path(self, path: impl Into) -> Self { DbOption { @@ -232,9 +169,9 @@ where level: usize, path: Path, fs_options: FsOptions, - ) -> Result> { + ) -> Result { if level >= MAX_LEVEL { - Err(DbError::ExceedsMaxLevel)?; + return Err(ExceedsMaxLevel); } self.level_paths[level] = Some((path, fs_options)); Ok(self) @@ -246,10 +183,11 @@ where } } -impl DbOption -where - R: Record, -{ +#[derive(Debug, Error)] +#[error("exceeds max level, max level is {}", MAX_LEVEL)] +pub struct ExceedsMaxLevel; + +impl DbOption { pub(crate) fn table_path(&self, gen: FileId, level: usize) -> Path { self.level_paths[level] .as_ref() @@ -280,13 +218,17 @@ where self.level_paths[level].as_ref().map(|(path, _)| path) } - pub(crate) fn is_threshold_exceeded_major(&self, version: &Version, level: usize) -> bool { + pub(crate) fn is_threshold_exceeded_major( + &self, + version: &Version, + level: usize, + ) -> bool { Version::::tables_len(version, level) >= (self.major_threshold_with_sst_size * self.level_sst_magnification.pow(level as u32)) } } -impl Debug for DbOption { +impl Debug for DbOption { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.debug_struct("DbOption") .field("clean_channel_buffer", &self.clean_channel_buffer) diff --git a/src/record/runtime/record.rs b/src/record/runtime/record.rs index 6b921619..5d6df689 100644 --- a/src/record/runtime/record.rs +++ b/src/record/runtime/record.rs @@ -1,8 +1,8 @@ -use std::{any::Any, sync::Arc}; +use std::sync::Arc; use fusio::SeqRead; -use super::{Datatype, DynRecordRef, Value, ValueDesc}; +use 
super::{Datatype, DynRecordRef, Value}; use crate::{ record::{DynSchema, Record, RecordDecodeError}, serdes::{Decode, Encode}, @@ -24,68 +24,6 @@ impl DynRecord { } } -impl DynRecord { - pub(crate) fn empty_record(column_descs: Vec, primary_index: usize) -> DynRecord { - let mut columns = vec![]; - for desc in column_descs.iter() { - let value: Arc = match desc.datatype { - Datatype::UInt8 => match desc.is_nullable { - true => Arc::>::new(None), - false => Arc::new(u8::default()), - }, - Datatype::UInt16 => match desc.is_nullable { - true => Arc::>::new(None), - false => Arc::new(u16::default()), - }, - Datatype::UInt32 => match desc.is_nullable { - true => Arc::>::new(None), - false => Arc::new(u32::default()), - }, - Datatype::UInt64 => match desc.is_nullable { - true => Arc::>::new(None), - false => Arc::new(u64::default()), - }, - Datatype::Int8 => match desc.is_nullable { - true => Arc::>::new(None), - false => Arc::new(i8::default()), - }, - Datatype::Int16 => match desc.is_nullable { - true => Arc::>::new(None), - false => Arc::new(i16::default()), - }, - Datatype::Int32 => match desc.is_nullable { - true => Arc::>::new(None), - false => Arc::new(i32::default()), - }, - Datatype::Int64 => match desc.is_nullable { - true => Arc::>::new(None), - false => Arc::new(i64::default()), - }, - Datatype::String => match desc.is_nullable { - true => Arc::>::new(None), - false => Arc::new(String::default()), - }, - Datatype::Boolean => match desc.is_nullable { - true => Arc::>::new(None), - false => Arc::new(bool::default()), - }, - Datatype::Bytes => match desc.is_nullable { - true => Arc::>>::new(None), - false => Arc::new(Vec::::default()), - }, - }; - columns.push(Value::new( - desc.datatype, - desc.name.to_owned(), - value, - desc.is_nullable, - )); - } - - DynRecord::new(columns, primary_index) - } -} - impl Decode for DynRecord { type Error = RecordDecodeError; diff --git a/src/snapshot.rs b/src/snapshot.rs index 24ff2a0e..c90f06e2 100644 --- a/src/snapshot.rs +++ b/src/snapshot.rs @@ -143,10 +143,10 @@ mod tests { async fn snapshot_scan() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from(( + let option = Arc::new(DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - ))); + )); manager .base_fs() diff --git a/src/stream/level.rs b/src/stream/level.rs index 1fb580de..24f1c908 100644 --- a/src/stream/level.rs +++ b/src/stream/level.rs @@ -51,7 +51,7 @@ where upper: Bound<&'level ::Key>, ts: Timestamp, level: usize, - option: Arc>, + option: Arc, gens: VecDeque, limit: Option, projection_mask: ProjectionMask, @@ -229,23 +229,18 @@ mod tests { use tempfile::TempDir; use crate::{ - compaction::tests::build_version, - fs::manager::StoreManager, - inmem::immutable::tests::TestSchema, - record::{Record, Schema}, - stream::level::LevelStream, - tests::Test, - DbOption, + compaction::tests::build_version, fs::manager::StoreManager, + inmem::immutable::tests::TestSchema, record::Schema, stream::level::LevelStream, DbOption, }; #[tokio::test] async fn projection_scan() { let temp_dir = TempDir::new().unwrap(); let manager = StoreManager::new(FsOptions::Local, vec![]).unwrap(); - let option = Arc::new(DbOption::from(( + let option = Arc::new(DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema {}, - ))); + )); manager .base_fs() diff --git a/src/stream/mem_projection.rs b/src/stream/mem_projection.rs index 334e3569..c22de630 100644 --- 
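One consequence of the option.rs rewrite above: with the `R: Record` parameter gone, `level_path` can no longer return the record-generic `DbError<R>`, so it gets the dedicated `ExceedsMaxLevel` unit error. A hedged sketch of a call site (the `tonbo::option::ExceedsMaxLevel` re-export path is an assumption):

```rust
use fusio::path::Path;
use fusio_dispatch::FsOptions;
use tonbo::{option::ExceedsMaxLevel, DbOption};

// Route level-0 SSTables to a dedicated directory; a level >= MAX_LEVEL is
// rejected with `ExceedsMaxLevel` instead of a record-generic `DbError<R>`.
fn with_l0_path(option: DbOption) -> Result<DbOption, ExceedsMaxLevel> {
    option.level_path(
        0,
        Path::from_filesystem_path("/tmp/tonbo-l0").unwrap(),
        FsOptions::Local,
    )
}
```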
a/src/stream/mem_projection.rs +++ b/src/stream/mem_projection.rs @@ -64,7 +64,7 @@ mod tests { use crate::{ inmem::{immutable::tests::TestSchema, mutable::Mutable}, - record::{Record, Schema}, + record::Schema, stream::mem_projection::MemProjectionStream, tests::Test, trigger::TriggerFactory, @@ -76,10 +76,10 @@ mod tests { async fn merge_mutable() { let temp_dir = tempfile::tempdir().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(( + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - )); + ); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); diff --git a/src/stream/merge.rs b/src/stream/merge.rs index 42ea60dd..d4b3f5a4 100644 --- a/src/stream/merge.rs +++ b/src/stream/merge.rs @@ -163,22 +163,18 @@ mod tests { use super::MergeStream; use crate::{ - inmem::{immutable::tests::TestSchema, mutable::Mutable}, - record::test::StringSchema, - stream::Entry, - trigger::TriggerFactory, - wal::log::LogType, - DbOption, + inmem::mutable::Mutable, record::test::StringSchema, stream::Entry, + trigger::TriggerFactory, wal::log::LogType, DbOption, }; #[tokio::test] async fn merge_mutable() { let temp_dir = tempfile::tempdir().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(( + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &StringSchema, - )); + ); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); @@ -278,10 +274,10 @@ mod tests { async fn merge_mutable_remove_duplicates() { let temp_dir = tempfile::tempdir().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(( + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &StringSchema, - )); + ); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); @@ -369,10 +365,10 @@ mod tests { async fn merge_mutable_limit() { let temp_dir = tempfile::tempdir().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(( + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &StringSchema, - )); + ); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); diff --git a/src/stream/package.rs b/src/stream/package.rs index 2dec0e0e..39ba16f3 100644 --- a/src/stream/package.rs +++ b/src/stream/package.rs @@ -96,7 +96,7 @@ mod tests { }, mutable::Mutable, }, - record::{Record, Schema}, + record::Schema, stream::{merge::MergeStream, package::PackageStream}, tests::Test, trigger::TriggerFactory, @@ -108,10 +108,10 @@ mod tests { async fn iter() { let temp_dir = TempDir::new().unwrap(); let fs = Arc::new(TokioFs) as Arc; - let option = DbOption::from(( + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - )); + ); fs.create_dir_all(&option.wal_dir_path()).await.unwrap(); diff --git a/src/transaction.rs b/src/transaction.rs index d6d219a1..59257ed8 100644 --- a/src/transaction.rs +++ b/src/transaction.rs @@ -258,7 +258,6 @@ mod tests { record::{ runtime::{test::test_dyn_item_schema, Datatype, DynRecord, Value}, test::StringSchema, - ValueDesc, }, tests::{build_db, build_schema, Test}, transaction::CommitError, @@ -271,10 +270,10 @@ mod tests { let temp_dir = TempDir::new().unwrap(); let db = DB::::new( - DbOption::from(( + DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &StringSchema, - )), + ), TokioExecutor::current(), StringSchema, ) @@ -310,10 +309,10 @@ mod tests { async fn transaction_get() { let temp_dir = TempDir::new().unwrap(); let manager = 
Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from(( + let option = Arc::new(DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - ))); + )); manager .base_fs() @@ -402,10 +401,10 @@ mod tests { #[tokio::test] async fn write_conflicts() { let temp_dir = TempDir::new().unwrap(); - let option = DbOption::from(( + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &StringSchema, - )); + ); let db = DB::::new(option, TokioExecutor::current(), StringSchema) .await @@ -438,10 +437,10 @@ mod tests { #[tokio::test] async fn transaction_projection() { let temp_dir = TempDir::new().unwrap(); - let option = DbOption::from(( + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - )); + ); let db = DB::::new(option, TokioExecutor::current(), TestSchema) .await @@ -479,10 +478,10 @@ mod tests { async fn transaction_scan() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from(( + let option = Arc::new(DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - ))); + )); manager .base_fs() @@ -576,10 +575,10 @@ mod tests { async fn test_transaction_scan_bound() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from(( + let option = Arc::new(DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - ))); + )); manager .base_fs() @@ -754,10 +753,10 @@ mod tests { async fn test_transaction_scan_limit() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from(( + let option = Arc::new(DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - ))); + )); manager .base_fs() @@ -817,19 +816,13 @@ mod tests { #[tokio::test] async fn test_dyn_record() { - let descs = vec![ - ValueDesc::new("age".to_string(), Datatype::Int8, false), - ValueDesc::new("height".to_string(), Datatype::Int16, true), - ValueDesc::new("weight".to_string(), Datatype::Int32, false), - ]; - let temp_dir = TempDir::new().unwrap(); - let option = DbOption::with_path( + let schema = test_dyn_item_schema(); + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), - "age".to_string(), - 0, + &schema, ); - let db = DB::with_schema(option, TokioExecutor::current(), test_dyn_item_schema()) + let db = DB::new(option, TokioExecutor::current(), schema) .await .unwrap(); diff --git a/src/version/cleaner.rs b/src/version/cleaner.rs index 118f0e5a..9b1e9c63 100644 --- a/src/version/cleaner.rs +++ b/src/version/cleaner.rs @@ -4,9 +4,8 @@ use flume::{Receiver, Sender}; use crate::{ fs::{manager::StoreManager, FileId}, - record::Record, timestamp::Timestamp, - DbError, DbOption, + DbOption, }; pub enum CleanTag { @@ -23,22 +22,16 @@ pub enum CleanTag { }, } -pub(crate) struct Cleaner -where - R: Record, -{ +pub(crate) struct Cleaner { tag_recv: Receiver, gens_map: BTreeMap, bool)>, - option: Arc>, + option: Arc, manager: Arc, } -impl Cleaner -where - R: Record, -{ +impl Cleaner { pub(crate) fn new( - option: Arc>, + option: Arc, manager: Arc, ) -> (Self, Sender) { let (tag_send, tag_recv) = flume::bounded(option.clean_channel_buffer); @@ -54,7 +47,7 @@ where ) } - pub(crate) async fn listen(&mut 
self) -> Result<(), DbError> { + pub(crate) async fn listen(&mut self) -> Result<(), fusio::Error> { while let Ok(tag) = self.tag_recv.recv_async().await { match tag { CleanTag::Add { ts, gens } => { @@ -106,9 +99,8 @@ pub(crate) mod tests { use crate::{ executor::{tokio::TokioExecutor, Executor}, - fs::{generate_file_id, manager::StoreManager, FileId, FileType}, + fs::{generate_file_id, manager::StoreManager, FileType}, inmem::immutable::tests::TestSchema, - tests::Test, version::cleaner::{CleanTag, Cleaner}, DbOption, }; @@ -117,10 +109,10 @@ pub(crate) mod tests { async fn test_cleaner() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from(( + let option = Arc::new(DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &TestSchema, - ))); + )); let gen_0 = generate_file_id(); let gen_1 = generate_file_id(); @@ -157,7 +149,7 @@ pub(crate) mod tests { .unwrap(); } - let (mut cleaner, tx) = Cleaner::::new(option.clone(), manager.clone()); + let (mut cleaner, tx) = Cleaner::new(option.clone(), manager.clone()); let executor = TokioExecutor::current(); diff --git a/src/version/mod.rs b/src/version/mod.rs index 28d77cf5..49478035 100644 --- a/src/version/mod.rs +++ b/src/version/mod.rs @@ -46,7 +46,7 @@ where ts: Timestamp, pub(crate) level_slice: [Vec::Key>>; MAX_LEVEL], clean_sender: Sender, - option: Arc>, + option: Arc, timestamp: Arc, log_length: u32, } @@ -58,7 +58,7 @@ where #[cfg(test)] #[allow(unused)] pub(crate) fn new( - option: Arc>, + option: Arc, clean_sender: Sender, timestamp: Arc, ) -> Self { @@ -72,7 +72,7 @@ where } } - pub(crate) fn option(&self) -> &Arc> { + pub(crate) fn option(&self) -> &Arc { &self.option } } diff --git a/src/version/set.rs b/src/version/set.rs index 72943aa6..b7e9b25d 100644 --- a/src/version/set.rs +++ b/src/version/set.rs @@ -60,7 +60,7 @@ where inner: Arc>>, clean_sender: Sender, timestamp: Arc, - option: Arc>, + option: Arc, manager: Arc, } @@ -98,7 +98,7 @@ where { pub(crate) async fn new( clean_sender: Sender, - option: Arc>, + option: Arc, manager: Arc, ) -> Result> { let fs = manager.base_fs(); @@ -311,7 +311,7 @@ pub(crate) mod tests { pub(crate) async fn build_version_set( version: Version, clean_sender: Sender, - option: Arc>, + option: Arc, manager: Arc, ) -> Result, VersionError> where @@ -344,10 +344,10 @@ pub(crate) mod tests { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); let (sender, _) = bounded(1); - let option = Arc::new(DbOption::from(( + let option = Arc::new(DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &StringSchema, - ))); + )); manager .base_fs() .create_dir_all(&option.version_log_dir_path()) @@ -382,10 +382,10 @@ pub(crate) mod tests { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); let (sender, _) = bounded(1); - let mut option = DbOption::from(( + let mut option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &StringSchema, - )); + ); option.version_log_snapshot_threshold = 4; let option = Arc::new(option); @@ -512,10 +512,10 @@ pub(crate) mod tests { async fn version_level_sort() { let temp_dir = TempDir::new().unwrap(); let manager = Arc::new(StoreManager::new(FsOptions::Local, vec![]).unwrap()); - let option = Arc::new(DbOption::from(( + let option = Arc::new(DbOption::new( 
Path::from_filesystem_path(temp_dir.path()).unwrap(), &StringSchema, - ))); + )); let (sender, _) = bounded(1); manager diff --git a/tests/data_integrity.rs b/tests/data_integrity.rs index f109dd4b..1b3e6263 100644 --- a/tests/data_integrity.rs +++ b/tests/data_integrity.rs @@ -70,14 +70,15 @@ mod tests { let mut write_hasher = crc32fast::Hasher::new(); let temp_dir = TempDir::new().unwrap(); - let option = DbOption::from(( + let option = DbOption::new( Path::from_filesystem_path(temp_dir.path()).unwrap(), &CustomerSchema, - )); + ); - let db: DB = DB::new(option, TokioExecutor::current(), CustomerSchema) - .await - .unwrap(); + let db: DB = + DB::new(option, TokioExecutor::current(), CustomerSchema) + .await + .unwrap(); for _ in 0..WRITE_TIMES { let customer = gen_record(&mut rng, &mut primary_key_count); diff --git a/tests/macros_correctness.rs b/tests/macros_correctness.rs index 42b408f7..a536343f 100644 --- a/tests/macros_correctness.rs +++ b/tests/macros_correctness.rs @@ -1,4 +1,3 @@ -use tonbo::record::Schema; use tonbo_macros::Record; #[derive(Record, Debug, PartialEq)] diff --git a/tests/wasm.rs b/tests/wasm.rs index b2cbb8d5..b931765a 100644 --- a/tests/wasm.rs +++ b/tests/wasm.rs @@ -73,15 +73,10 @@ mod tests { let fs = fusio::disk::LocalFs {}; fs.create_dir_all(&path).await.unwrap(); - let option = DbOption::with_path( - Path::from_opfs_path("opfs_dir_rw").unwrap(), - "id".to_string(), - 0, - ); + let option = DbOption::new(Path::from_opfs_path("opfs_dir_rw").unwrap(), &schema); - let db: DB = DB::with_schema(option, OpfsExecutor::new(), schema) - .await - .unwrap(); + let db: DB = + DB::new(option, OpfsExecutor::new(), schema).await.unwrap(); for item in test_dyn_items().into_iter() { db.insert(item).await.unwrap(); @@ -158,15 +153,10 @@ mod tests { let path = Path::from_opfs_path("opfs_dir_txn").unwrap(); fs.create_dir_all(&path).await.unwrap(); - let option = DbOption::with_path( - Path::from_opfs_path("opfs_dir_txn").unwrap(), - "id".to_string(), - 0, - ); + let option = DbOption::new(Path::from_opfs_path("opfs_dir_txn").unwrap(), &schema); - let db: DB = DB::with_schema(option, OpfsExecutor::new(), schema) - .await - .unwrap(); + let db: DB = + DB::new(option, OpfsExecutor::new(), schema).await.unwrap(); { let mut txn = db.transaction().await; @@ -241,17 +231,11 @@ mod tests { let fs = fusio::disk::LocalFs {}; fs.create_dir_all(&path).await.unwrap(); - let option = DbOption::with_path( - Path::from_opfs_path("opfs_dir").unwrap(), - "id".to_string(), - 0, - ); + let option = DbOption::new(Path::from_opfs_path("opfs_dir").unwrap(), &schema); { let db: DB = - DB::with_schema(option, OpfsExecutor::new(), schema) - .await - .unwrap(); + DB::new(option, OpfsExecutor::new(), schema).await.unwrap(); for item in test_dyn_items().into_iter() { db.insert(item).await.unwrap(); @@ -261,14 +245,9 @@ mod tests { } let schema = test_dyn_item_schema(); - let option = DbOption::with_path( - Path::from_opfs_path("opfs_dir").unwrap(), - "id".to_string(), - 0, - ); - let db: DB = DB::with_schema(option, OpfsExecutor::new(), schema) - .await - .unwrap(); + let option = DbOption::new(Path::from_opfs_path("opfs_dir").unwrap(), &schema); + let db: DB = + DB::new(option, OpfsExecutor::new(), schema).await.unwrap(); let mut sort_items = BTreeMap::new(); for item in test_dyn_items() { @@ -311,7 +290,7 @@ mod tests { let key_id = option_env!("AWS_ACCESS_KEY_ID").unwrap().to_string(); let secret_key = option_env!("AWS_SECRET_ACCESS_KEY").unwrap().to_string(); - let (cols_desc, primary_key_index) = 
test_dyn_item_schema(); + let schema = test_dyn_item_schema(); let fs_option = FsOptions::S3 { bucket: "wasm-data".to_string(), @@ -326,33 +305,27 @@ mod tests { region: Some("ap-southeast-2".to_string()), }; - let option = DbOption::with_path( - Path::from_opfs_path("s3_rw").unwrap(), - "id".to_string(), - primary_key_index, - ) - .level_path( - 0, - Path::from_url_path("tonbo/l0").unwrap(), - fs_option.clone(), - ) - .unwrap() - .level_path( - 1, - Path::from_url_path("tonbo/l1").unwrap(), - fs_option.clone(), - ) - .unwrap() - .level_path(2, Path::from_url_path("tonbo/l2").unwrap(), fs_option) - .unwrap() - .major_threshold_with_sst_size(3) - .level_sst_magnification(1) - .max_sst_file_size(1 * 1024); + let option = DbOption::new(Path::from_opfs_path("s3_rw").unwrap(), &schema) + .level_path( + 0, + Path::from_url_path("tonbo/l0").unwrap(), + fs_option.clone(), + ) + .unwrap() + .level_path( + 1, + Path::from_url_path("tonbo/l1").unwrap(), + fs_option.clone(), + ) + .unwrap() + .level_path(2, Path::from_url_path("tonbo/l2").unwrap(), fs_option) + .unwrap() + .major_threshold_with_sst_size(3) + .level_sst_magnification(1) + .max_sst_file_size(1 * 1024); let db: DB = - DB::with_schema(option, OpfsExecutor::new(), cols_desc, primary_key_index) - .await - .unwrap(); + DB::new(option, OpfsExecutor::new(), schema).await.unwrap(); for (i, item) in test_dyn_items().into_iter().enumerate() { db.insert(item).await.unwrap(); From 69f639bbe67b9a25cb1aea7849f92045b6c76852 Mon Sep 17 00:00:00 2001 From: Gwo Tzu-Hsing Date: Tue, 17 Dec 2024 01:05:51 +0800 Subject: [PATCH 7/9] refactor: move dyn schema to runtime mod --- src/inmem/immutable.rs | 5 +- src/lib.rs | 6 +- src/magic.rs | 2 + src/record/mod.rs | 95 +------------------------------- src/record/runtime/array.rs | 5 +- src/record/runtime/mod.rs | 2 + src/record/runtime/record.rs | 8 +-- src/record/runtime/record_ref.rs | 4 +- src/record/runtime/schema.rs | 65 ++++++++++++++++++++++ src/record/test.rs | 5 +- tests/macros_correctness.rs | 3 +- tonbo_macros/src/record.rs | 4 +- 12 files changed, 95 insertions(+), 109 deletions(-) create mode 100644 src/magic.rs create mode 100644 src/record/runtime/schema.rs diff --git a/src/inmem/immutable.rs b/src/inmem/immutable.rs index 077f9a05..fe7e7463 100644 --- a/src/inmem/immutable.rs +++ b/src/inmem/immutable.rs @@ -231,6 +231,7 @@ pub(crate) mod tests { use super::{ArrowArrays, Builder}; use crate::{ + magic, record::{Record, Schema}, tests::{Test, TestRef}, timestamp::timestamped::Timestamped, @@ -250,7 +251,7 @@ pub(crate) mod tests { static SCHEMA: Lazy> = Lazy::new(|| { Arc::new(ArrowSchema::new(vec![ Field::new("_null", DataType::Boolean, false), - Field::new("_ts", DataType::UInt32, false), + Field::new(magic::TS, DataType::UInt32, false), Field::new("vstring", DataType::Utf8, false), Field::new("vu32", DataType::UInt32, false), Field::new("vbool", DataType::Boolean, true), @@ -271,7 +272,7 @@ pub(crate) mod tests { Vec, ) { ( - ColumnPath::new(vec!["_ts".to_string(), "vstring".to_string()]), + ColumnPath::new(vec![magic::TS.to_string(), "vstring".to_string()]), vec![ SortingColumn::new(1, true, true), SortingColumn::new(2, false, true), diff --git a/src/lib.rs b/src/lib.rs index e58db942..c21d5b6c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -121,6 +121,7 @@ mod compaction; pub mod executor; pub mod fs; pub mod inmem; +pub mod magic; mod ondisk; pub mod option; pub mod record; @@ -147,6 +148,7 @@ use futures_core::Stream; use futures_util::StreamExt; use inmem::{immutable::Immutable, 
mutable::Mutable}; use lockable::LockableHashMap; +use magic::USER_COLUMN_OFFSET; pub use once_cell; pub use parquet; use parquet::{ @@ -615,7 +617,7 @@ where Projection::Parts(projection) => { let mut fixed_projection: Vec = [0, 1, primary_key_index] .into_iter() - .chain(projection.into_iter().map(|p| p + 2)) + .chain(projection.into_iter().map(|p| p + USER_COLUMN_OFFSET)) .collect(); fixed_projection.dedup(); @@ -733,7 +735,7 @@ where pub fn projection(self, mut projection: Vec) -> Self { // skip two columns: _null and _ts for p in &mut projection { - *p += 2; + *p += USER_COLUMN_OFFSET; } let primary_key_index = self.schema.record_schema.primary_key_index(); let mut fixed_projection = vec![0, 1, primary_key_index]; diff --git a/src/magic.rs b/src/magic.rs new file mode 100644 index 00000000..705025ef --- /dev/null +++ b/src/magic.rs @@ -0,0 +1,2 @@ +pub const TS: &str = "_ts"; +pub const USER_COLUMN_OFFSET: usize = 2; diff --git a/src/record/mod.rs b/src/record/mod.rs index 664f134e..416ea976 100644 --- a/src/record/mod.rs +++ b/src/record/mod.rs @@ -4,13 +4,9 @@ pub mod runtime; #[cfg(test)] pub(crate) mod test; -use std::{collections::HashMap, error::Error, fmt::Debug, io, sync::Arc}; +use std::{error::Error, fmt::Debug, io, sync::Arc}; -use array::DynRecordImmutableArrays; -use arrow::{ - array::RecordBatch, - datatypes::{DataType, Field, Schema as ArrowSchema}, -}; +use arrow::{array::RecordBatch, datatypes::Schema as ArrowSchema}; use internal::InternalRecordRef; pub use key::{Key, KeyRef}; use parquet::{arrow::ProjectionMask, format::SortingColumn, schema::types::ColumnPath}; @@ -22,35 +18,6 @@ use crate::{ serdes::{Decode, Encode}, }; -// #[allow(unused)] -// pub(crate) enum RecordInstance { -// Normal, -// Runtime(DynRecord), -// } - -// #[allow(unused)] -// impl RecordInstance { -// pub(crate) fn primary_key_index(&self) -> usize -// where -// R: Record, -// { -// match self { -// RecordInstance::Normal => R::primary_key_index(), -// RecordInstance::Runtime(record) => record.primary_key_index(), -// } -// } - -// pub(crate) fn arrow_schema(&self) -> Arc -// where -// R: Record, -// { -// match self { -// RecordInstance::Normal => R::arrow_schema().clone(), -// RecordInstance::Runtime(record) => record.arrow_schema(), -// } -// } -// } - pub trait Schema: Debug + Send + Sync { type Record: Record; @@ -65,64 +32,6 @@ pub trait Schema: Debug + Send + Sync { fn primary_key_path(&self) -> (ColumnPath, Vec); } -#[derive(Debug)] -pub struct DynSchema { - schema: Vec, - primary_index: usize, - arrow_schema: Arc, -} - -impl DynSchema { - pub fn new(schema: Vec, primary_index: usize) -> Self { - let mut metadata = HashMap::new(); - metadata.insert("primary_key_index".to_string(), primary_index.to_string()); - let arrow_schema = Arc::new(ArrowSchema::new_with_metadata( - [ - Field::new("_null", DataType::Boolean, false), - Field::new("_ts", DataType::UInt32, false), - ] - .into_iter() - .chain(schema.iter().map(|desc| desc.arrow_field())) - .collect::>(), - metadata, - )); - Self { - schema, - primary_index, - arrow_schema, - } - } -} - -impl Schema for DynSchema { - type Record = DynRecord; - - type Columns = DynRecordImmutableArrays; - - type Key = Value; - - fn arrow_schema(&self) -> &Arc { - &self.arrow_schema - } - - fn primary_key_index(&self) -> usize { - self.primary_index - } - - fn primary_key_path(&self) -> (ColumnPath, Vec) { - ( - ColumnPath::new(vec![ - "_ts".to_string(), - self.schema[self.primary_index].name.clone(), - ]), - vec![ - SortingColumn::new(1_i32, true, true), 
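The lib.rs hunks above replace the magic `+ 2` with the named `magic::USER_COLUMN_OFFSET`. A standalone sketch of the fix-up those hunks perform: user-facing column indices are shifted past the two reserved columns (`_null` at leaf 0, `_ts` at leaf 1), and the primary key leaf is always retained. The helper name is illustrative only:

```rust
// Same value as `tonbo::magic::USER_COLUMN_OFFSET`.
const USER_COLUMN_OFFSET: usize = 2;

fn fixed_projection(primary_key_index: usize, projection: Vec<usize>) -> Vec<usize> {
    // Always keep `_null`, `_ts`, and the primary key, then shift every
    // user-supplied index past the reserved columns.
    let mut fixed: Vec<usize> = [0, 1, primary_key_index]
        .into_iter()
        .chain(projection.into_iter().map(|p| p + USER_COLUMN_OFFSET))
        .collect();
    // As in lib.rs, `dedup` suffices because duplicates are adjacent here.
    fixed.dedup();
    fixed
}

fn main() {
    // With the primary key at leaf 2, projecting user columns 0 and 1
    // selects the parquet leaves [0, 1, 2, 3].
    assert_eq!(fixed_projection(2, vec![0, 1]), vec![0, 1, 2, 3]);
}
```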
- SortingColumn::new(self.primary_key_index() as i32, false, true), - ], - ) - } -} - pub trait Record: 'static + Sized + Decode + Debug + Send + Sync { type Schema: Schema; diff --git a/src/record/runtime/array.rs b/src/record/runtime/array.rs index 22a31cd5..b8180c02 100644 --- a/src/record/runtime/array.rs +++ b/src/record/runtime/array.rs @@ -15,6 +15,7 @@ use arrow::{ use super::{record::DynRecord, record_ref::DynRecordRef, value::Value, Datatype}; use crate::{ inmem::immutable::{ArrowArrays, Builder}, + magic::USER_COLUMN_OFFSET, record::{Key, Record, Schema}, timestamp::Timestamped, }; @@ -117,7 +118,7 @@ impl ArrowArrays for DynRecordImmutableArrays { let mut columns = vec![]; for (idx, col) in self.columns.iter().enumerate() { - if projection_mask.leaf_included(idx + 2) && !col.is_nullable { + if projection_mask.leaf_included(idx + USER_COLUMN_OFFSET) && !col.is_nullable { let datatype = col.datatype; let name = col.name.to_string(); let value: Arc = match datatype { @@ -458,7 +459,7 @@ impl Builder for DynRecordBuilder { .zip(self.datatypes.iter()) .enumerate() { - let field = self.schema.field(idx + 2); + let field = self.schema.field(idx + USER_COLUMN_OFFSET); let is_nullable = field.is_nullable(); match datatype { Datatype::UInt8 => { diff --git a/src/record/runtime/mod.rs b/src/record/runtime/mod.rs index 8bba1c85..f7489a69 100644 --- a/src/record/runtime/mod.rs +++ b/src/record/runtime/mod.rs @@ -1,11 +1,13 @@ pub(crate) mod array; mod record; mod record_ref; +mod schema; mod value; use arrow::datatypes::DataType; pub use record::*; pub use record_ref::*; +pub use schema::*; pub use value::*; #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] diff --git a/src/record/runtime/record.rs b/src/record/runtime/record.rs index 5d6df689..81b5e900 100644 --- a/src/record/runtime/record.rs +++ b/src/record/runtime/record.rs @@ -2,9 +2,9 @@ use std::sync::Arc; use fusio::SeqRead; -use super::{Datatype, DynRecordRef, Value}; +use super::{schema::DynSchema, Datatype, DynRecordRef, Value}; use crate::{ - record::{DynSchema, Record, RecordDecodeError}, + record::{Record, RecordDecodeError}, serdes::{Decode, Encode}, }; @@ -175,8 +175,8 @@ impl Record for DynRecord { pub(crate) mod test { use std::sync::Arc; - use super::DynRecord; - use crate::record::{Datatype, DynSchema, Value, ValueDesc}; + use super::{DynRecord, DynSchema}; + use crate::record::{Datatype, Value, ValueDesc}; #[allow(unused)] pub(crate) fn test_dyn_item_schema() -> DynSchema { diff --git a/src/record/runtime/record_ref.rs b/src/record/runtime/record_ref.rs index aa5f8bbf..b129212f 100644 --- a/src/record/runtime/record_ref.rs +++ b/src/record/runtime/record_ref.rs @@ -11,6 +11,7 @@ use fusio::Write; use super::{Datatype, DynRecord, Value}; use crate::{ + magic::USER_COLUMN_OFFSET, record::{internal::InternalRecordRef, Key, Record, RecordEncodeError, RecordRef, Schema}, serdes::Encode, }; @@ -215,7 +216,8 @@ impl<'r> RecordRef<'r> for DynRecordRef<'r> { fn projection(&mut self, projection_mask: &parquet::arrow::ProjectionMask) { for (idx, col) in self.columns.iter_mut().enumerate() { - if idx != self.primary_index && !projection_mask.leaf_included(idx + 2) { + if idx != self.primary_index && !projection_mask.leaf_included(idx + USER_COLUMN_OFFSET) + { match col.datatype { Datatype::UInt8 => col.value = Arc::>::new(None), Datatype::UInt16 => col.value = Arc::>::new(None), diff --git a/src/record/runtime/schema.rs b/src/record/runtime/schema.rs new file mode 100644 index 00000000..37fef4de --- /dev/null +++ 
b/src/record/runtime/schema.rs @@ -0,0 +1,65 @@ +use std::{collections::HashMap, sync::Arc}; + +use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; +use parquet::{format::SortingColumn, schema::types::ColumnPath}; + +use super::{array::DynRecordImmutableArrays, DynRecord, Value, ValueDesc}; +use crate::{magic, record::Schema}; + +#[derive(Debug)] +pub struct DynSchema { + schema: Vec, + primary_index: usize, + arrow_schema: Arc, +} + +impl DynSchema { + pub fn new(schema: Vec, primary_index: usize) -> Self { + let mut metadata = HashMap::new(); + metadata.insert("primary_key_index".to_string(), primary_index.to_string()); + let arrow_schema = Arc::new(ArrowSchema::new_with_metadata( + [ + Field::new("_null", DataType::Boolean, false), + Field::new(magic::TS, DataType::UInt32, false), + ] + .into_iter() + .chain(schema.iter().map(|desc| desc.arrow_field())) + .collect::>(), + metadata, + )); + Self { + schema, + primary_index, + arrow_schema, + } + } +} + +impl Schema for DynSchema { + type Record = DynRecord; + + type Columns = DynRecordImmutableArrays; + + type Key = Value; + + fn arrow_schema(&self) -> &Arc { + &self.arrow_schema + } + + fn primary_key_index(&self) -> usize { + self.primary_index + } + + fn primary_key_path(&self) -> (ColumnPath, Vec) { + ( + ColumnPath::new(vec![ + magic::TS.to_string(), + self.schema[self.primary_index].name.clone(), + ]), + vec![ + SortingColumn::new(1_i32, true, true), + SortingColumn::new(self.primary_key_index() as i32, false, true), + ], + ) + } +} diff --git a/src/record/test.rs b/src/record/test.rs index effc185e..b94cd5c9 100644 --- a/src/record/test.rs +++ b/src/record/test.rs @@ -13,6 +13,7 @@ use parquet::{arrow::ProjectionMask, format::SortingColumn, schema::types::Colum use super::{internal::InternalRecordRef, Key, Record, RecordRef, Schema}; use crate::{ inmem::immutable::{ArrowArrays, Builder}, + magic, timestamp::Timestamped, }; @@ -32,7 +33,7 @@ impl Schema for StringSchema { static SCHEMA: Lazy> = Lazy::new(|| { Arc::new(ArrowSchema::new(vec![ Field::new("_null", DataType::Boolean, false), - Field::new("_ts", DataType::UInt32, false), + Field::new(magic::TS, DataType::UInt32, false), Field::new(PRIMARY_FIELD_NAME, DataType::Utf8, false), ])) }); @@ -46,7 +47,7 @@ impl Schema for StringSchema { fn primary_key_path(&self) -> (ColumnPath, Vec) { ( - ColumnPath::new(vec!["_ts".to_string(), PRIMARY_FIELD_NAME.to_string()]), + ColumnPath::new(vec![magic::TS.to_string(), PRIMARY_FIELD_NAME.to_string()]), vec![ SortingColumn::new(1, true, true), SortingColumn::new(2, false, true), diff --git a/tests/macros_correctness.rs b/tests/macros_correctness.rs index a536343f..cd7b0fea 100644 --- a/tests/macros_correctness.rs +++ b/tests/macros_correctness.rs @@ -21,6 +21,7 @@ mod tests { use tokio::io::AsyncSeekExt; use tonbo::{ inmem::immutable::{ArrowArrays, Builder}, + magic, record::{Record, RecordRef, Schema}, serdes::{Decode, Encode}, timestamp::timestamped::Timestamped, @@ -42,7 +43,7 @@ mod tests { assert_eq!( UserSchema {}.primary_key_path(), ( - ColumnPath::new(vec!["_ts".to_string(), "name".to_string()]), + ColumnPath::new(vec![magic::TS.to_string(), "name".to_string()]), vec![ SortingColumn::new(1, true, true), SortingColumn::new(4, false, true), diff --git a/tonbo_macros/src/record.rs b/tonbo_macros/src/record.rs index 66937650..8b2703e1 100644 --- a/tonbo_macros/src/record.rs +++ b/tonbo_macros/src/record.rs @@ -374,7 +374,7 @@ fn struct_schema_codegen( fn primary_key_path(&self) -> 
(::tonbo::parquet::schema::types::ColumnPath, Vec<::tonbo::parquet::format::SortingColumn>) { ( - ::tonbo::parquet::schema::types::ColumnPath::new(vec!["_ts".to_string(), stringify!(#primary_key_name).to_string()]), + ::tonbo::parquet::schema::types::ColumnPath::new(vec![::tonbo::magic::TS.to_string(), stringify!(#primary_key_name).to_string()]), vec![::tonbo::parquet::format::SortingColumn::new(1_i32, true, true), ::tonbo::parquet::format::SortingColumn::new(#primary_key_index as i32, false, true)] ) } @@ -383,7 +383,7 @@ fn struct_schema_codegen( static SCHEMA: ::tonbo::once_cell::sync::Lazy<::std::sync::Arc<::tonbo::arrow::datatypes::Schema>> = ::tonbo::once_cell::sync::Lazy::new(|| { ::std::sync::Arc::new(::tonbo::arrow::datatypes::Schema::new(vec![ ::tonbo::arrow::datatypes::Field::new("_null", ::tonbo::arrow::datatypes::DataType::Boolean, false), - ::tonbo::arrow::datatypes::Field::new("_ts", ::tonbo::arrow::datatypes::DataType::UInt32, false), + ::tonbo::arrow::datatypes::Field::new(::tonbo::magic::TS, ::tonbo::arrow::datatypes::DataType::UInt32, false), #(#schema_fields)* ])) }); From b450e8e67533c217e94ac3676aa69662ceaf20a6 Mon Sep 17 00:00:00 2001 From: Gwo Tzu-Hsing Date: Tue, 17 Dec 2024 01:23:56 +0800 Subject: [PATCH 8/9] refactor: rename Schema to DbStorage --- src/compaction/mod.rs | 6 ++-- src/lib.rs | 66 ++++++++++++++++++++----------------------- src/magic.rs | 2 +- src/snapshot.rs | 8 +++--- src/transaction.rs | 4 +-- 5 files changed, 40 insertions(+), 46 deletions(-) diff --git a/src/compaction/mod.rs b/src/compaction/mod.rs index cc9262c8..c6e1c0c4 100644 --- a/src/compaction/mod.rs +++ b/src/compaction/mod.rs @@ -22,7 +22,7 @@ use crate::{ version::{ edit::VersionEdit, set::VersionSet, TransactionTs, Version, VersionError, MAX_LEVEL, }, - DbOption, ParquetLru, Schema, + DbOption, DbStorage, ParquetLru, }; #[derive(Debug)] @@ -36,7 +36,7 @@ where R: Record, { pub(crate) option: Arc, - pub(crate) schema: Arc>>, + pub(crate) schema: Arc>>, pub(crate) version_set: VersionSet, pub(crate) manager: Arc, pub(crate) record_schema: Arc, @@ -47,7 +47,7 @@ where R: Record, { pub(crate) fn new( - schema: Arc>>, + schema: Arc>>, record_schema: Arc, option: Arc, version_set: VersionSet, diff --git a/src/lib.rs b/src/lib.rs index c21d5b6c..a89799a3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -169,7 +169,7 @@ use crate::{ compaction::{CompactTask, CompactionError, Compactor}, executor::Executor, fs::{manager::StoreManager, parse_file_id, FileType}, - record::Schema as RecordSchema, + record::Schema, serdes::Decode, snapshot::Snapshot, stream::{ @@ -187,9 +187,9 @@ where R: Record, E: Executor, { - schema: Arc>>, + schema: Arc>>, version_set: VersionSet, - lock_map: LockMap<::Key>, + lock_map: LockMap<::Key>, manager: Arc, parquet_lru: ParquetLru, _p: PhantomData, @@ -198,7 +198,7 @@ where impl DB where R: Record + Send + Sync, - ::Columns: Send + Sync, + ::Columns: Send + Sync, E: Executor + Send + Sync + 'static, { /// Open [`DB`] with a [`DbOption`]. 
This will create a new directory at the @@ -220,7 +220,7 @@ where impl DB where R: Record + Send + Sync, - ::Columns: Send + Sync, + ::Columns: Send + Sync, E: Executor + Send + Sync + 'static, { async fn build( @@ -252,7 +252,7 @@ where let version_set = VersionSet::new(clean_sender, option.clone(), manager.clone()).await?; let schema = Arc::new(RwLock::new( - Schema::new( + DbStorage::new( option.clone(), task_tx, &version_set, @@ -341,10 +341,7 @@ where } /// delete the record with the primary key as the `key` - pub async fn remove( - &self, - key: ::Key, - ) -> Result> { + pub async fn remove(&self, key: ::Key) -> Result> { Ok(self .schema .read() @@ -368,7 +365,7 @@ where /// get the record with `key` as the primary key and process it using closure `f` pub async fn get( &self, - key: &::Key, + key: &::Key, mut f: impl FnMut(TransactionEntry<'_, R>) -> Option, ) -> Result, CommitError> { Ok(self @@ -397,8 +394,8 @@ where pub async fn scan<'scan, T: 'scan>( &'scan self, range: ( - Bound<&'scan ::Key>, - Bound<&'scan ::Key>, + Bound<&'scan ::Key>, + Bound<&'scan ::Key>, ), mut f: impl FnMut(TransactionEntry<'_, R>) -> T + 'scan, ) -> impl Stream>> + 'scan { @@ -467,22 +464,19 @@ where } } -pub(crate) struct Schema +pub(crate) struct DbStorage where R: Record, { pub mutable: Mutable, - pub immutables: Vec<( - Option, - Immutable<::Columns>, - )>, + pub immutables: Vec<(Option, Immutable<::Columns>)>, compaction_tx: Sender, recover_wal_ids: Option>, trigger: Arc + Send + Sync>>, record_schema: Arc, } -impl Schema +impl DbStorage where R: Record + Send, { @@ -494,7 +488,7 @@ where manager: &StoreManager, ) -> Result> { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let mut schema = Schema { + let mut schema = DbStorage { mutable: Mutable::new( &option, trigger.clone(), @@ -586,7 +580,7 @@ where async fn remove( &self, log_ty: LogType, - key: ::Key, + key: ::Key, ts: Timestamp, ) -> Result> { self.mutable.remove(log_ty, key, ts).await @@ -594,7 +588,7 @@ where async fn recover_append( &self, - key: ::Key, + key: ::Key, ts: Timestamp, value: Option, ) -> Result> { @@ -605,7 +599,7 @@ where &'get self, version: &'get Version, manager: &StoreManager, - key: &'get ::Key, + key: &'get ::Key, ts: Timestamp, projection: Projection, parquet_lru: ParquetLru, @@ -652,7 +646,7 @@ where .map(|entry| Entry::RecordBatch(entry))) } - fn check_conflict(&self, key: &::Key, ts: Timestamp) -> bool { + fn check_conflict(&self, key: &::Key, ts: Timestamp) -> bool { self.mutable.check_conflict(key, ts) || self .immutables @@ -673,10 +667,10 @@ where R: Record, 'range: 'scan, { - schema: &'scan Schema, + schema: &'scan DbStorage, manager: &'scan StoreManager, - lower: Bound<&'range ::Key>, - upper: Bound<&'range ::Key>, + lower: Bound<&'range ::Key>, + upper: Bound<&'range ::Key>, ts: Timestamp, version: &'scan Version, @@ -695,11 +689,11 @@ where R: Record + Send, { fn new( - schema: &'scan Schema, + schema: &'scan DbStorage, manager: &'scan StoreManager, (lower, upper): ( - Bound<&'range ::Key>, - Bound<&'range ::Key>, + Bound<&'range ::Key>, + Bound<&'range ::Key>, ), ts: Timestamp, version: &'scan Version, @@ -811,7 +805,7 @@ where self, batch_size: usize, ) -> Result< - impl Stream::Columns, ParquetError>> + 'scan, + impl Stream::Columns, ParquetError>> + 'scan, DbError, > { let mut streams = Vec::new(); @@ -1174,7 +1168,7 @@ pub(crate) mod tests { pub(crate) async fn build_schema( option: Arc, fs: &Arc, - ) -> Result<(crate::Schema, Receiver), fusio::Error> { + ) -> 
Result<(crate::DbStorage, Receiver), fusio::Error> { let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); let mutable = Mutable::new(&option, trigger.clone(), fs, Arc::new(TestSchema {})).await?; @@ -1268,7 +1262,7 @@ pub(crate) mod tests { let (compaction_tx, compaction_rx) = bounded(1); Ok(( - crate::Schema { + crate::DbStorage { mutable, immutables, compaction_tx, @@ -1284,7 +1278,7 @@ pub(crate) mod tests { option: Arc, compaction_rx: Receiver, executor: E, - schema: crate::Schema, + schema: crate::DbStorage, record_schema: Arc, version: Version, manager: Arc, @@ -1655,7 +1649,7 @@ pub(crate) mod tests { let (task_tx, _task_rx) = bounded(1); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let schema: crate::Schema = crate::Schema { + let schema: crate::DbStorage = crate::DbStorage { mutable: Mutable::new(&option, trigger.clone(), &fs, Arc::new(TestSchema)) .await .unwrap(), @@ -1725,7 +1719,7 @@ pub(crate) mod tests { let (task_tx, _task_rx) = bounded(1); let trigger = Arc::new(TriggerFactory::create(option.trigger_type)); - let schema: crate::Schema = crate::Schema { + let schema: crate::DbStorage = crate::DbStorage { mutable: Mutable::new( &option, trigger.clone(), diff --git a/src/magic.rs b/src/magic.rs index 705025ef..dedbd42a 100644 --- a/src/magic.rs +++ b/src/magic.rs @@ -1,2 +1,2 @@ pub const TS: &str = "_ts"; -pub const USER_COLUMN_OFFSET: usize = 2; +pub(crate) const USER_COLUMN_OFFSET: usize = 2; diff --git a/src/snapshot.rs b/src/snapshot.rs index c90f06e2..708c0d78 100644 --- a/src/snapshot.rs +++ b/src/snapshot.rs @@ -10,7 +10,7 @@ use crate::{ stream::ScanStream, timestamp::Timestamp, version::{TransactionTs, VersionRef}, - DbError, ParquetLru, Projection, Scan, Schema, + DbError, DbStorage, ParquetLru, Projection, Scan, }; pub struct Snapshot<'s, R> @@ -18,7 +18,7 @@ where R: Record, { ts: Timestamp, - share: RwLockReadGuard<'s, Schema>, + share: RwLockReadGuard<'s, DbStorage>, version: VersionRef, manager: Arc, parquet_lru: ParquetLru, @@ -72,7 +72,7 @@ where } pub(crate) fn new( - share: RwLockReadGuard<'s, Schema>, + share: RwLockReadGuard<'s, DbStorage>, version: VersionRef, manager: Arc, parquet_lru: ParquetLru, @@ -94,7 +94,7 @@ where self.version.increase_ts() } - pub(crate) fn schema(&self) -> &Schema { + pub(crate) fn schema(&self) -> &DbStorage { &self.share } diff --git a/src/transaction.rs b/src/transaction.rs index 59257ed8..533f2198 100644 --- a/src/transaction.rs +++ b/src/transaction.rs @@ -20,7 +20,7 @@ use crate::{ stream::mem_projection::MemProjectionStream, timestamp::{Timestamp, Timestamped}, wal::log::LogType, - DbError, LockMap, Projection, Record, Scan, Schema, + DbError, LockMap, Projection, Record, Scan, DbStorage, }; pub(crate) struct TransactionScan<'scan, R: Record> { @@ -186,7 +186,7 @@ where } async fn append( - schema: &Schema, + schema: &DbStorage, log_ty: LogType, key: ::Key, record: Option, From 73c8bf0404e89c5dc1e68f3b8317cbd303033f46 Mon Sep 17 00:00:00 2001 From: Gwo Tzu-Hsing Date: Tue, 17 Dec 2024 14:49:08 +0800 Subject: [PATCH 9/9] fmt --- src/transaction.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transaction.rs b/src/transaction.rs index 533f2198..48bc2524 100644 --- a/src/transaction.rs +++ b/src/transaction.rs @@ -20,7 +20,7 @@ use crate::{ stream::mem_projection::MemProjectionStream, timestamp::{Timestamp, Timestamped}, wal::log::LogType, - DbError, LockMap, Projection, Record, Scan, DbStorage, + DbError, DbStorage, LockMap, Projection, Record, Scan, }; 
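Patch 8 frees the `Schema` name for the public record trait by renaming the internal storage struct to `DbStorage`; callers that previously had to write `record::Schema as RecordSchema` can now use the trait unaliased. A small illustrative helper showing the trait in user code (the `ColumnPath::parts` accessor from the parquet crate is assumed):

```rust
use tonbo::record::Schema;

// Recover the primary-key column name from any schema: `primary_key_path`
// pairs `_ts` with the key column, so the key name is the second path part.
fn primary_key_column<S: Schema>(schema: &S) -> String {
    let (column_path, _sorting_columns) = schema.primary_key_path();
    column_path.parts()[1].clone()
}
```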
pub(crate) struct TransactionScan<'scan, R: Record> {
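Since the series ends inside transaction.rs, it is worth noting that the user-visible transactional API is untouched by the rename; only the internal storage handle changed its name. A closing usage sketch under the same assumptions as the first example:

```rust
use std::sync::Arc;

use tonbo::{
    executor::tokio::TokioExecutor,
    record::{Datatype, DynRecord, Value},
    DB,
};

async fn insert_one(db: &DB<DynRecord, TokioExecutor>) {
    let mut txn = db.transaction().await;
    // `insert` stages the write in the transaction's local buffer; conflicts
    // with concurrent writers are reported when `commit` is awaited.
    txn.insert(DynRecord::new(
        vec![Value::new(Datatype::Int64, "id".to_owned(), Arc::new(2_i64), false)],
        0,
    ));
    txn.commit().await.unwrap();
}
```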