Skip to content

Commit

Permalink
Add ArrowToParquetSchemaConverter, deprecate `arrow_to_parquet_schema…
Browse files Browse the repository at this point in the history
…` et al
  • Loading branch information
alamb committed Dec 5, 2024
1 parent 93ce75c commit 4f8fa7a
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 9 deletions.
61 changes: 61 additions & 0 deletions parquet/src/arrow/schema/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,67 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
}
}

/// Converter for arrow schema to parquet schema
///
/// Create one with [`Self::new`], adjust it with the `with_*` builder methods
/// and finish with [`Self::convert`].
///
/// # Notes
///
/// The root schema element is named `"arrow_schema"` and type coercion is
/// disabled unless configured otherwise.
#[derive(Debug)]
pub struct ArrowToParquetSchemaConverter<'a> {
    /// The schema to convert
    schema: &'a Schema,
    /// Name of the root schema in Parquet
    // Shares `'a` with `schema`: struct fields cannot elide lifetimes, and the
    // converter is covariant in `'a`, so a shorter-lived name is still accepted.
    root_schema_name: &'a str,
    /// Should we coerce arrow types to compatible Parquet types?
    ///
    /// See docs on [`Self::with_coerce_types`]
    coerce_types: bool,
}

impl<'a> ArrowToParquetSchemaConverter<'a> {
    /// Create a new converter for `schema`.
    ///
    /// The root schema name starts as `"arrow_schema"` and type coercion
    /// starts disabled.
    pub fn new(schema: &'a Schema) -> Self {
        Self {
            schema,
            root_schema_name: "arrow_schema",
            coerce_types: false,
        }
    }

    /// Should arrow types be coerced into parquet native types (default false).
    ///
    /// Setting this option to `true` will result in parquet files that can be
    /// read by more readers, but may lose precision for arrow types such as
    /// [`DataType::Date64`] which have no direct corresponding Parquet type.
    ///
    /// # Discussion
    ///
    /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no
    /// corresponding Parquet logical type. Thus, they can not be losslessly
    /// round-tripped when stored using the appropriate Parquet logical type.
    ///
    /// For example, some Date64 values may be truncated when stored with
    /// parquet's native 32 bit date type.
    ///
    /// By default, the arrow writer does not coerce to native parquet types. It
    /// writes data in such a way that it can be losslessly round-tripped.
    /// However, this means downstream readers must be aware of and correctly
    /// interpret the embedded Arrow schema.
    pub fn with_coerce_types(mut self, coerce_types: bool) -> Self {
        self.coerce_types = coerce_types;
        self
    }

    /// Set the root schema element name (defaults to `"arrow_schema"`).
    ///
    /// The name is borrowed, not copied, so it must live at least as long as
    /// the converter.
    pub fn with_root_schema_name(mut self, root_schema_name: &'a str) -> Self {
        self.root_schema_name = root_schema_name;
        self
    }

    /// Build the parquet schema according to the options set on this converter.
    ///
    /// # Panics
    ///
    /// Currently always panics: the conversion is not implemented yet.
    pub fn convert(self) -> Result<SchemaDescriptor> {
        // NOTE(review): the original body was empty, which does not type-check
        // against the declared return type. The conversion routine this should
        // call is not visible here, so the stub is made explicit instead of
        // guessing at it. It should presumably use `schema`,
        // `root_schema_name` and `coerce_types` -- confirm before merging.
        todo!("ArrowToParquetSchemaConverter::convert is not implemented yet")
    }
}

/// Convert arrow schema to parquet schema
///
/// The name of the root schema element defaults to `"arrow_schema"`, this can be
Expand Down
16 changes: 7 additions & 9 deletions parquet/src/file/properties.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
use std::str::FromStr;
use std::{collections::HashMap, sync::Arc};

use arrow_schema::DataType;
use crate::basic::{Compression, Encoding};
use crate::compression::{CodecOptions, CodecOptionsBuilder};
use crate::file::metadata::KeyValue;
Expand Down Expand Up @@ -287,15 +287,13 @@ impl WriterProperties {
self.statistics_truncate_length
}

/// Returns `coerce_types` boolean
/// Should the writer coerce types to parquet native types.
///
/// Setting this option to `true` will result in parquet files that can be
/// read by more readers, but may lose precision for arrow types such as
/// [`DataType::Date64`] which have no direct corresponding Parquet type.
///
/// Some Arrow types do not have a corresponding Parquet logical type.
/// Affected Arrow data types include `Date64`, `Timestamp` and `Interval`.
/// Writers have the option to coerce these into native Parquet types. Type
/// coercion allows for meaningful representations that do not require
/// downstream readers to consider the embedded Arrow schema. However, type
/// coercion also prevents the data from being losslessly round-tripped. This method
/// returns `true` if type coercion enabled.
/// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
pub fn coerce_types(&self) -> bool {
self.coerce_types
}
Expand Down

0 comments on commit 4f8fa7a

Please sign in to comment.