Skip to content

Commit

Permalink
feat(mito): Reads SST's row groups one by one (#2668)
Browse files Browse the repository at this point in the history
* feat: read parquet metadata

* feat: add create method for row group

* feat: read parquet by row group

* refactor: use VecDeque to collect batches

* style: fix row group clippy warnings

* chore: update comments

* style: fix clippy

* refactor: simplify row group reader builder

* docs: fix grammar issue

Co-authored-by: Lei, HUANG <[email protected]>

* chore: format code

---------

Co-authored-by: Lei, HUANG <[email protected]>
  • Loading branch information
evenyag and v0y4g3r authored Nov 1, 2023
1 parent 7bd137f commit 5f3bbdc
Show file tree
Hide file tree
Showing 5 changed files with 239 additions and 182 deletions.
9 changes: 9 additions & 0 deletions src/mito2/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,14 @@ pub enum Error {
region_dir: String,
location: Location,
},

#[snafu(display("Failed to read arrow record batch from parquet file {}", path))]
ArrowReader {
path: String,
#[snafu(source)]
error: ArrowError,
location: Location,
},
}

/// Shorthand for [`std::result::Result`] whose error type defaults to [`Error`].
pub type Result<T, E = Error> = std::result::Result<T, E>;
Expand Down Expand Up @@ -458,6 +466,7 @@ impl ErrorExt for Error {
RegionReadonly { .. } => StatusCode::RegionReadonly,
JsonOptions { .. } => StatusCode::InvalidArguments,
EmptyRegionDir { .. } => StatusCode::RegionNotFound,
ArrowReader { .. } => StatusCode::StorageUnavailable,
}
}

Expand Down
1 change: 1 addition & 0 deletions src/mito2/src/read/scan_region.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ impl ScanRegion {
);

let predicate = Predicate::new(self.request.filters.clone());
// The mapper always computes projected column ids as the schema of SSTs may change.
let mapper = match &self.request.projection {
Some(p) => ProjectionMapper::new(&self.version.metadata, p.iter().copied())?,
None => ProjectionMapper::all(&self.version.metadata)?,
Expand Down
17 changes: 10 additions & 7 deletions src/mito2/src/sst/parquet/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
//!
//! We store fields in the same order as [RegionMetadata::field_columns()](store_api::metadata::RegionMetadata::field_columns()).
use std::collections::HashMap;
use std::collections::{HashMap, VecDeque};
use std::sync::Arc;

use api::v1::SemanticType;
Expand Down Expand Up @@ -138,7 +138,10 @@ impl ReadFormat {
}
}

/// Gets the converted arrow schema.
/// Gets the arrow schema of the SST file.
///
/// This schema is computed from the region metadata but should be the same
/// as the arrow schema decoded from the file metadata.
///
/// Returns a reference to the schema stored in `self`; nothing is cloned or
/// recomputed by this call.
pub(crate) fn arrow_schema(&self) -> &SchemaRef {
&self.arrow_schema
}
Expand Down Expand Up @@ -178,7 +181,7 @@ impl ReadFormat {
pub(crate) fn convert_record_batch(
&self,
record_batch: &RecordBatch,
batches: &mut Vec<Batch>,
batches: &mut VecDeque<Batch>,
) -> Result<()> {
debug_assert!(batches.is_empty());

Expand Down Expand Up @@ -249,7 +252,7 @@ impl ReadFormat {
}

let batch = builder.build()?;
batches.push(batch);
batches.push_back(batch);
}

Ok(())
Expand Down Expand Up @@ -768,7 +771,7 @@ mod tests {
assert_eq!(arrow_schema, *read_format.arrow_schema());

let record_batch = RecordBatch::new_empty(arrow_schema);
let mut batches = vec![];
let mut batches = VecDeque::new();
read_format
.convert_record_batch(&record_batch, &mut batches)
.unwrap();
Expand All @@ -790,14 +793,14 @@ mod tests {
];
let arrow_schema = build_test_arrow_schema();
let record_batch = RecordBatch::try_new(arrow_schema, columns).unwrap();
let mut batches = vec![];
let mut batches = VecDeque::new();
read_format
.convert_record_batch(&record_batch, &mut batches)
.unwrap();

assert_eq!(
vec![new_batch(b"one", 1, 1, 2), new_batch(b"two", 11, 10, 2)],
batches
batches.into_iter().collect::<Vec<_>>(),
);
}
}
Loading

0 comments on commit 5f3bbdc

Please sign in to comment.