Skip to content

Commit

Permalink
refactor: Decouple dedup and merge (GreptimeTeam#4139)
Browse files Browse the repository at this point in the history
* feat: remove dedup/filter deleted from merge reader

* feat: impl dedup reader

* feat: support filter deleted flag

* test: test dedup reader

* feat: remove put_only field

* chore: fix clippy

* feat: metrics

* test: test empty batch

* perf: optimize dedup strategy

Avoid iterating all timestamps.

* test: fix test

* feat: generic
  • Loading branch information
evenyag authored Jun 17, 2024
1 parent f4a5a44 commit 558272d
Show file tree
Hide file tree
Showing 4 changed files with 430 additions and 227 deletions.
34 changes: 1 addition & 33 deletions src/mito2/src/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
//! Common structs and utilities for reading data.
pub mod compat;
pub mod dedup;
pub mod merge;
pub mod projection;
pub(crate) mod scan_region;
Expand Down Expand Up @@ -74,8 +75,6 @@ pub struct Batch {
///
/// UInt8 type, not null.
op_types: Arc<UInt8Vector>,
/// True if op types only contains put operations.
put_only: bool,
/// Fields organized in columnar format.
fields: Vec<BatchColumn>,
}
Expand Down Expand Up @@ -225,7 +224,6 @@ impl Batch {
sequences: Arc::new(self.sequences.get_slice(offset, length)),
op_types: Arc::new(self.op_types.get_slice(offset, length)),
fields,
put_only: self.put_only,
}
}

Expand Down Expand Up @@ -292,11 +290,6 @@ impl Batch {

/// Removes rows whose op type is delete.
pub fn filter_deleted(&mut self) -> Result<()> {
if self.put_only {
// If there are only put operations, we can skip comparison and filtering.
return Ok(());
}

// Safety: op type column is not null.
let array = self.op_types.as_arrow();
// Find rows with non-delete op type.
Expand Down Expand Up @@ -327,10 +320,6 @@ impl Batch {
)
.unwrap(),
);
// Also updates put_only field if it contains other ops.
if !self.put_only {
self.put_only = is_put_only(&self.op_types);
}
for batch_column in &mut self.fields {
batch_column.data = batch_column
.data
Expand Down Expand Up @@ -454,10 +443,6 @@ impl Batch {
let array = arrow::compute::take(self.op_types.as_arrow(), indices.as_arrow(), None)
.context(ComputeArrowSnafu)?;
self.op_types = Arc::new(UInt8Vector::try_from_arrow_array(array).unwrap());
// Also updates put_only field if it contains other ops.
if !self.put_only {
self.put_only = is_put_only(&self.op_types);
}
for batch_column in &mut self.fields {
batch_column.data = batch_column
.data
Expand Down Expand Up @@ -491,16 +476,6 @@ impl Batch {
}
}

/// Checks that every entry in `op_types` is a put operation.
///
/// Returns `true` when no value differs from `OpType::Put`; an empty vector
/// therefore also yields `true`.
fn is_put_only(op_types: &UInt8Vector) -> bool {
    // Safety: the op type column is not null, so the raw values can be scanned directly.
    let put_code = OpType::Put as u8;
    let raw_ops = op_types.as_arrow().values();
    !raw_ops.iter().any(|op| *op != put_code)
}

/// Len of timestamp in arrow row format.
const TIMESTAMP_KEY_LEN: usize = 9;

Expand Down Expand Up @@ -676,18 +651,13 @@ impl BatchBuilder {
);
}

// Checks whether op types are put only. In the future, we may get this from statistics
// in memtables and SSTs.
let put_only = is_put_only(&op_types);

Ok(Batch {
primary_key: self.primary_key,
pk_values: None,
timestamps,
sequences,
op_types,
fields: self.fields,
put_only,
})
}
}
Expand Down Expand Up @@ -994,7 +964,6 @@ mod tests {
&[OpType::Delete, OpType::Put, OpType::Delete, OpType::Put],
&[21, 22, 23, 24],
);
assert!(!batch.put_only);
batch.filter_deleted().unwrap();
let expect = new_batch(&[2, 4], &[12, 14], &[OpType::Put, OpType::Put], &[22, 24]);
assert_eq!(expect, batch);
Expand All @@ -1005,7 +974,6 @@ mod tests {
&[OpType::Put, OpType::Put, OpType::Put, OpType::Put],
&[21, 22, 23, 24],
);
assert!(batch.put_only);
let expect = batch.clone();
batch.filter_deleted().unwrap();
assert_eq!(expect, batch);
Expand Down
Loading

0 comments on commit 558272d

Please sign in to comment.