From 4f8fa7a595a22ea14eb94fa6b5abaab181e7ac5d Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Thu, 5 Dec 2024 15:22:06 -0500
Subject: [PATCH] Add ArrowToParquetSchemaConverter, deprecate `arrow_to_parquet_schema` et al

---
 parquet/src/arrow/schema/mod.rs | 61 +++++++++++++++++++++++++++++++++
 parquet/src/file/properties.rs  | 16 ++++-----
 2 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index ec34840d858f..a1ea23728bc6 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -225,6 +225,67 @@ pub(crate) fn add_encoded_arrow_schema_to_metadata(schema: &Schema, props: &mut
     }
 }
 
+/// Converter for arrow schema to parquet schema
+///
+/// Configure it with the `with_*` methods, then call [`Self::convert`].
+#[derive(Debug)]
+pub struct ArrowToParquetSchemaConverter<'a> {
+    /// The schema to convert
+    schema: &'a Schema,
+    /// Name of the root schema in Parquet
+    root_schema_name: &'a str,
+    /// Should we coerce arrow types to compatible Parquet types?
+    ///
+    /// See docs on [`Self::with_coerce_types`]
+    coerce_types: bool,
+}
+
+impl<'a> ArrowToParquetSchemaConverter<'a> {
+    /// Create a new converter (root name `"arrow_schema"`, no type coercion)
+    pub fn new(schema: &'a Schema) -> Self {
+        Self {
+            schema,
+            root_schema_name: "arrow_schema",
+            coerce_types: false,
+        }
+    }
+
+    /// Should arrow types be coerced into parquet native types (default false).
+    ///
+    /// Setting this option to `true` will result in parquet files that can be
+    /// read by more readers, but may lose precision for arrow types such as
+    /// [`DataType::Date64`] which have no direct corresponding Parquet type.
+    ///
+    /// # Discussion
+    ///
+    /// Some Arrow types such as `Date64`, `Timestamp` and `Interval` have no
+    /// corresponding Parquet logical type. Thus, they can not be losslessly
+    /// round-tripped when stored using the appropriate Parquet logical type.
+    ///
+    /// For example, some Date64 values may be truncated when stored with
+    /// parquet's native 32 bit date type.
+    ///
+    /// By default, the arrow writer does not coerce to native parquet types. It
+    /// writes data in such a way that it can be losslessly round-tripped.
+    /// However, this means downstream readers must be aware of and correctly
+    /// interpret the embedded Arrow schema.
+    pub fn with_coerce_types(mut self, coerce_types: bool) -> Self {
+        self.coerce_types = coerce_types;
+        self
+    }
+
+    /// Set the root schema element name (defaults to `"arrow_schema"`).
+    pub fn with_root_schema_name(mut self, root_schema_name: &'a str) -> Self {
+        self.root_schema_name = root_schema_name;
+        self
+    }
+
+    /// Build the parquet schema according to the configured options
+    pub fn convert(self) -> Result<SchemaDescriptor> {
+        arrow_to_parquet_schema_with_root(self.schema, self.root_schema_name, self.coerce_types)
+    }
+}
+
 /// Convert arrow schema to parquet schema
 ///
 /// The name of the root schema element defaults to `"arrow_schema"`, this can be
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 1e8a4868dfc3..b6dbb2ff552b 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -18,7 +18,7 @@
 //! Configuration via [`WriterProperties`] and [`ReaderProperties`]
 use std::str::FromStr;
 use std::{collections::HashMap, sync::Arc};
-
+use arrow_schema::DataType;
 use crate::basic::{Compression, Encoding};
 use crate::compression::{CodecOptions, CodecOptionsBuilder};
 use crate::file::metadata::KeyValue;
@@ -287,15 +287,13 @@ impl WriterProperties {
         self.statistics_truncate_length
     }
 
-    /// Returns `coerce_types` boolean
+    /// Should the writer coerce types to parquet native types.
+    ///
+    /// Setting this option to `true` will result in parquet files that can be
+    /// read by more readers, but may lose precision for arrow types such as
+    /// [`DataType::Date64`] which have no direct corresponding Parquet type.
     ///
-    /// Some Arrow types do not have a corresponding Parquet logical type.
-    /// Affected Arrow data types include `Date64`, `Timestamp` and `Interval`.
-    /// Writers have the option to coerce these into native Parquet types. Type
-    /// coercion allows for meaningful representations that do not require
-    /// downstream readers to consider the embedded Arrow schema. However, type
-    /// coercion also prevents the data from being losslessly round-tripped. This method
-    /// returns `true` if type coercion enabled.
+    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
     pub fn coerce_types(&self) -> bool {
         self.coerce_types
     }