From 0966a0fdfcf496819bceeb2a27941282d95a0cb3 Mon Sep 17 00:00:00 2001 From: Matthijs Brobbel Date: Sat, 18 Jan 2025 00:17:47 +0100 Subject: [PATCH] Add `Json` support to parquet, schema roundtrip not working yet --- parquet/src/arrow/schema/mod.rs | 74 +++++++++++++++++++++++++++++++-- 1 file changed, 70 insertions(+), 4 deletions(-) diff --git a/parquet/src/arrow/schema/mod.rs b/parquet/src/arrow/schema/mod.rs index 82bcc8db6a8..b796271887c 100644 --- a/parquet/src/arrow/schema/mod.rs +++ b/parquet/src/arrow/schema/mod.rs @@ -30,7 +30,7 @@ use std::sync::Arc; use arrow_ipc::writer; #[cfg(feature = "arrow-canonical-extension-types")] -use arrow_schema::extension::Uuid; +use arrow_schema::extension::{Json, Uuid}; use arrow_schema::{DataType, Field, Fields, Schema, TimeUnit}; use crate::basic::{ @@ -276,12 +276,26 @@ pub fn parquet_to_arrow_field(parquet_column: &ColumnDescriptor) -> Result ret.try_with_extension_type(Uuid)?, + LogicalType::Json => ret.try_with_extension_type(Json::default())?, + _ => {} + } + } + if !meta.is_empty() { ret.set_metadata(meta); } @@ -516,13 +530,35 @@ fn arrow_to_parquet_type(field: &Field) -> Result { } DataType::Utf8 | DataType::LargeUtf8 => { Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) - .with_logical_type(Some(LogicalType::String)) + .with_logical_type({ + #[cfg(feature = "arrow-canonical-extension-types")] + { + // Use the Json logical type if the canonical Json + // extension type is set on this field. + field + .try_extension_type::() + .map_or(Some(LogicalType::String), |_| Some(LogicalType::Json)) + } + #[cfg(not(feature = "arrow-canonical-extension-types"))] + Some(LogicalType::String) + }) .with_repetition(repetition) .with_id(id) .build() } DataType::Utf8View => Type::primitive_type_builder(name, PhysicalType::BYTE_ARRAY) - .with_logical_type(Some(LogicalType::String)) + .with_logical_type({ + #[cfg(feature = "arrow-canonical-extension-types")] + { + // Use the Json logical type if the canonical Json + // extension type is set on this field. + field + .try_extension_type::() + .map_or(Some(LogicalType::String), |_| Some(LogicalType::Json)) + } + #[cfg(not(feature = "arrow-canonical-extension-types"))] + Some(LogicalType::String) + }) .with_repetition(repetition) .with_id(id) .build(), @@ -1962,6 +1998,36 @@ mod tests { Some(LogicalType::Uuid) ); + let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?; + dbg!(&arrow_schema); + + assert_eq!(arrow_schema.field(0).try_extension_type::()?, Uuid); + + Ok(()) + } + + #[test] + #[cfg(feature = "arrow-canonical-extension-types")] + fn arrow_json_to_parquet_json() -> Result<()> { + let arrow_schema = Schema::new(vec![ + Field::new("json", DataType::Utf8, false).with_extension_type(Json::default()) + ]); + + let parquet_schema = arrow_to_parquet_schema(&arrow_schema)?; + + assert_eq!( + parquet_schema.column(0).logical_type(), + Some(LogicalType::Json) + ); + + let arrow_schema = parquet_to_arrow_schema(&parquet_schema, None)?; + dbg!(&arrow_schema); + + assert_eq!( + arrow_schema.field(0).try_extension_type::()?, + Json::default() + ); + Ok(()) } }