diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 222d86131e0..1533a64bdfc 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -30,12 +30,10 @@ use arrow_array::types::*;
 use arrow_array::{ArrayRef, RecordBatch, RecordBatchWriter};
 use arrow_schema::{ArrowError, DataType as ArrowDataType, Field, IntervalUnit, SchemaRef};
 
-use super::schema::{
-    add_encoded_arrow_schema_to_metadata, arrow_to_parquet_schema,
-    arrow_to_parquet_schema_with_root, decimal_length_from_precision,
-};
+use super::schema::{add_encoded_arrow_schema_to_metadata, decimal_length_from_precision};
 
 use crate::arrow::arrow_writer::byte_array::ByteArrayEncoder;
+use crate::arrow::ArrowToParquetSchemaConverter;
 use crate::column::page::{CompressedPage, PageWriteSpec, PageWriter};
 use crate::column::writer::encoder::ColumnValueEncoder;
 use crate::column::writer::{
@@ -181,10 +179,12 @@ impl ArrowWriter {
         options: ArrowWriterOptions,
     ) -> Result<Self> {
         let mut props = options.properties;
-        let schema = match options.schema_root {
-            Some(s) => arrow_to_parquet_schema_with_root(&arrow_schema, &s, props.coerce_types())?,
-            None => arrow_to_parquet_schema(&arrow_schema, props.coerce_types())?,
-        };
+        let mut converter = ArrowToParquetSchemaConverter::new(&arrow_schema)
+            .with_coerce_types(props.coerce_types());
+        if let Some(s) = &options.schema_root {
+            converter = converter.schema_root(s);
+        }
+        let schema = converter.build()?;
         if !options.skip_arrow_metadata {
             // add serialized arrow schema
             add_encoded_arrow_schema_to_metadata(&arrow_schema, &mut props);
diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs
index 2d09cd19203..dbb6a4cf044 100644
--- a/parquet/src/arrow/mod.rs
+++ b/parquet/src/arrow/mod.rs
@@ -117,8 +117,8 @@ use crate::schema::types::SchemaDescriptor;
 use arrow_schema::{FieldRef, Schema};
 
 pub use self::schema::{
-    arrow_to_parquet_schema, parquet_to_arrow_field_levels, parquet_to_arrow_schema,
-    parquet_to_arrow_schema_by_columns, FieldLevels,
+    parquet_to_arrow_field_levels, parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns,
+    ArrowToParquetSchemaConverter, FieldLevels,
 };
 
 /// Schema metadata key used to store serialized Arrow IPC schema
diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs
index 4b5e0dadc11..c91e9a55f8b 100644
--- a/parquet/src/arrow/schema/mod.rs
+++ b/parquet/src/arrow/schema/mod.rs
@@ -236,7 +236,7 @@ pub struct ArrowToParquetSchemaConverter<'a> {
     /// The schema to convert
     schema: &'a Schema,
     /// Name of the root schema in Parquet
-    root_schema_name: &'a str,
+    schema_root: &'a str,
     /// Should we coerce Arrow types to compatible Parquet types?
     ///
     /// See docs on [`Self::with_coerce_types`]
@@ -248,7 +248,7 @@ impl <'a> ArrowToParquetSchemaConverter<'a> {
     pub fn new(schema: &'a Schema) -> Self {
         Self {
             schema,
-            root_schema_name: "arrow_schema",
+            schema_root: "arrow_schema",
             coerce_types: false,
         }
     }
@@ -278,14 +278,14 @@ impl <'a> ArrowToParquetSchemaConverter<'a> {
     }
 
     /// Set the root schema element name (defaults to `"arrow_schema"`).
-    pub fn with_root_schema_name(mut self, root_schema_name: &'a str) -> Self {
-        self.root_schema_name = root_schema_name;
+    pub fn schema_root(mut self, schema_root: &'a str) -> Self {
+        self.schema_root = schema_root;
         self
     }
 
     /// Build the desired parquet [`SchemaDescriptor`]
     pub fn build(self) -> Result<SchemaDescriptor> {
-        let Self { schema, root_schema_name, coerce_types } = self;
+        let Self { schema, schema_root: root_schema_name, coerce_types } = self;
         let fields = schema
             .fields()
             .iter()
@@ -300,19 +300,19 @@ impl <'a> ArrowToParquetSchemaConverter<'a> {
 ///
 /// The name of the root schema element defaults to `"arrow_schema"`; this can be
 /// overridden with [`arrow_to_parquet_schema_with_root`]
-pub fn arrow_to_parquet_schema(schema: &Schema, coerce_types: bool) -> Result<SchemaDescriptor> {
-    arrow_to_parquet_schema_with_root(schema, "arrow_schema", coerce_types)
+#[deprecated(since = "54.0.0", note = "Use `ArrowToParquetSchemaConverter` instead")]
+pub fn arrow_to_parquet_schema(schema: &Schema) -> Result<SchemaDescriptor> {
+    ArrowToParquetSchemaConverter::new(schema).build()
 }
 
 /// Convert arrow schema to parquet schema specifying the name of the root schema element
+#[deprecated(since = "54.0.0", note = "Use `ArrowToParquetSchemaConverter` instead")]
 pub fn arrow_to_parquet_schema_with_root(
     schema: &Schema,
     root: &str,
-    coerce_types: bool,
 ) -> Result<SchemaDescriptor> {
     ArrowToParquetSchemaConverter::new(schema)
-        .with_root_schema_name(root)
-        .with_coerce_types(coerce_types)
+        .schema_root(root)
         .build()
 }
@@ -1637,7 +1637,7 @@ mod tests {
             Field::new("decimal256", DataType::Decimal256(39, 2), false),
         ];
         let arrow_schema = Schema::new(arrow_fields);
-        let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, false).unwrap();
+        let converted_arrow_schema = ArrowToParquetSchemaConverter::new(&arrow_schema).build().unwrap();
 
         assert_eq!(
             parquet_schema.columns().len(),
@@ -1674,9 +1674,10 @@ mod tests {
             false,
         )];
         let arrow_schema = Schema::new(arrow_fields);
-        let converted_arrow_schema = arrow_to_parquet_schema(&arrow_schema, true);
+        let converted_arrow_schema = ArrowToParquetSchemaConverter::new(&arrow_schema)
+            .with_coerce_types(true)
+            .build();
 
-        assert!(converted_arrow_schema.is_err());
         converted_arrow_schema.unwrap();
     }
 
@@ -1946,7 +1947,9 @@ mod tests {
 
         // don't pass metadata so field ids are read from Parquet and not from serialized Arrow schema
         let arrow_schema = crate::arrow::parquet_to_arrow_schema(&schema_descriptor, None)?;
-        let parq_schema_descr = crate::arrow::arrow_to_parquet_schema(&arrow_schema, true)?;
+        let parq_schema_descr = crate::arrow::ArrowToParquetSchemaConverter::new(&arrow_schema)
+            .with_coerce_types(true)
+            .build()?;
         let parq_fields = parq_schema_descr.root_schema().get_fields();
         assert_eq!(parq_fields.len(), 2);
         assert_eq!(parq_fields[0].get_basic_info().id(), 1);
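A minimal usage sketch of the builder API introduced in this diff (editor's illustration, not part of the patch; the `example` function, field, and root name are hypothetical, while `ArrowToParquetSchemaConverter` and its methods come from the change above):

// Sketch: convert an Arrow schema to a Parquet SchemaDescriptor using the
// new builder. The schema contents and root name here are made up.
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::ArrowToParquetSchemaConverter;
use parquet::errors::Result;

fn example() -> Result<()> {
    let schema = Schema::new(vec![Field::new("id", DataType::Int64, false)]);
    let descriptor = ArrowToParquetSchemaConverter::new(&schema)
        .schema_root("my_root")     // optional; defaults to "arrow_schema"
        .with_coerce_types(true)    // optional; defaults to false
        .build()?;
    // The root schema element carries the name passed to schema_root
    assert_eq!(descriptor.root_schema().name(), "my_root");
    Ok(())
}

Compared with the two deprecated free functions, the builder exposes each option as a chainable setter, so further conversion options can be added later without another signature break.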