Skip to content

Commit

Permalink
Allow applying a schema hint to read Binary columns as Utf8 types
Browse files Browse the repository at this point in the history
  • Loading branch information
goldmedal authored and alamb committed Oct 10, 2024
1 parent 065c7b8 commit c052147
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 0 deletions.
113 changes: 113 additions & 0 deletions parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3077,6 +3077,119 @@ mod tests {
);
}

#[test]
fn test_read_binary_as_utf8() {
let file = write_parquet_from_iter(vec![
(
"binary_to_utf8",
Arc::new(BinaryArray::from(vec![
b"one".as_ref(),
b"two".as_ref(),
b"three".as_ref(),
])) as ArrayRef,
),
(
"large_binary_to_large_utf8",
Arc::new(LargeBinaryArray::from(vec![
b"one".as_ref(),
b"two".as_ref(),
b"three".as_ref(),
])) as ArrayRef,
),
(
"binary_view_to_utf8_view",
Arc::new(BinaryViewArray::from(vec![
b"one".as_ref(),
b"two".as_ref(),
b"three".as_ref(),
])) as ArrayRef,
),
]);
let supplied_fields = Fields::from(vec![
Field::new("binary_to_utf8", ArrowDataType::Utf8, false),
Field::new(
"large_binary_to_large_utf8",
ArrowDataType::LargeUtf8,
false,
),
Field::new("binary_view_to_utf8_view", ArrowDataType::Utf8View, false),
]);

let options = ArrowReaderOptions::new().with_schema(Arc::new(Schema::new(supplied_fields)));
let mut arrow_reader = ParquetRecordBatchReaderBuilder::try_new_with_options(
file.try_clone().unwrap(),
options,
)
.expect("reader builder with schema")
.build()
.expect("reader with schema");

let batch = arrow_reader.next().unwrap().unwrap();
assert_eq!(batch.num_columns(), 3);
assert_eq!(batch.num_rows(), 3);
assert_eq!(
batch
.column(0)
.as_any()
.downcast_ref::<StringArray>()
.expect("downcast to string")
.iter()
.collect::<Vec<_>>(),
vec![Some("one"), Some("two"), Some("three")]
);

assert_eq!(
batch
.column(1)
.as_any()
.downcast_ref::<LargeStringArray>()
.expect("downcast to large string")
.iter()
.collect::<Vec<_>>(),
vec![Some("one"), Some("two"), Some("three")]
);

assert_eq!(
batch
.column(2)
.as_any()
.downcast_ref::<StringViewArray>()
.expect("downcast to string view")
.iter()
.collect::<Vec<_>>(),
vec![Some("one"), Some("two"), Some("three")]
);
}

#[test]
#[should_panic(expected = "Invalid UTF8 sequence at")]
fn test_read_non_utf8_binary_as_utf8() {
let file = write_parquet_from_iter(vec![(
"non_utf8_binary",
Arc::new(BinaryArray::from(vec![
b"\xDE\x00\xFF".as_ref(),
b"\xDE\x01\xAA".as_ref(),
b"\xDE\x02\xFF".as_ref(),
])) as ArrayRef,
)]);
let supplied_fields = Fields::from(vec![Field::new(
"non_utf8_binary",
ArrowDataType::Utf8,
false,
)]);

let options = ArrowReaderOptions::new().with_schema(Arc::new(Schema::new(supplied_fields)));
let mut arrow_reader = ParquetRecordBatchReaderBuilder::try_new_with_options(
file.try_clone().unwrap(),
options,
)
.expect("reader builder with schema")
.build()
.expect("reader with schema");

arrow_reader.next();
}

#[test]
fn test_with_schema() {
let nested_fields = Fields::from(vec![
Expand Down
5 changes: 5 additions & 0 deletions parquet/src/arrow/schema/primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ fn apply_hint(parquet: DataType, hint: DataType) -> DataType {
(DataType::Utf8, DataType::LargeUtf8) => hint,
(DataType::Binary, DataType::LargeBinary) => hint,

// Read as Utf8
(DataType::Binary, DataType::Utf8) => hint,
(DataType::Binary, DataType::LargeUtf8) => hint,
(DataType::Binary, DataType::Utf8View) => hint,

// Determine view type
(DataType::Utf8, DataType::Utf8View) => hint,
(DataType::Binary, DataType::BinaryView) => hint,
Expand Down

0 comments on commit c052147

Please sign in to comment.