diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs
index 2d23ad8510f9..2348dea526d3 100644
--- a/parquet/src/arrow/arrow_writer/byte_array.rs
+++ b/parquet/src/arrow/arrow_writer/byte_array.rs
@@ -326,6 +326,13 @@ impl Storage for ByteArrayStorage {
     }
 }
 
+impl crate::util::interner::Intern for [u8] {
+    #[inline]
+    fn eq(&self, other: &Self) -> bool {
+        self == other
+    }
+}
+
 /// A dictionary encoder for byte array data
 #[derive(Debug, Default)]
 struct DictEncoder {
diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs
index c4c03727f44a..5dd60074813d 100644
--- a/parquet/src/data_type.rs
+++ b/parquet/src/data_type.rs
@@ -611,6 +611,7 @@ pub(crate) mod private {
     use super::{ParquetError, Result, SliceAsBytes};
     use crate::basic::Type;
     use crate::file::metadata::HeapSize;
+    use crate::util::interner::Intern;
 
     /// Sealed trait to start to remove specialisation from implementations
     ///
@@ -631,6 +632,7 @@ pub(crate) mod private {
         + HeapSize
         + crate::encodings::decoding::private::GetDecoder
         + crate::file::statistics::private::MakeStatistics
+        + Intern
     {
         const PHYSICAL_TYPE: Type;
 
@@ -1125,6 +1127,66 @@ pub(crate) mod private {
             self.0.heap_size()
         }
     }
+
+    impl Intern for bool {
+        #[inline]
+        fn eq(&self, other: &Self) -> bool {
+            self == other
+        }
+    }
+
+    impl Intern for i32 {
+        #[inline]
+        fn eq(&self, other: &Self) -> bool {
+            self == other
+        }
+    }
+
+    impl Intern for i64 {
+        #[inline]
+        fn eq(&self, other: &Self) -> bool {
+            self == other
+        }
+    }
+
+    impl Intern for super::Int96 {
+        #[inline]
+        fn eq(&self, other: &Self) -> bool {
+            self == other
+        }
+    }
+
+    impl Intern for f32 {
+        #[inline]
+        fn eq(&self, other: &Self) -> bool {
+            // Unlike `==`, `total_cmp` treats identical NaNs as equal, so a NaN is
+            // interned only once; it also keeps 0.0 and -0.0 as distinct values
+            self.total_cmp(other) == std::cmp::Ordering::Equal
+        }
+    }
+
+    impl Intern for f64 {
+        #[inline]
+        fn eq(&self, other: &Self) -> bool {
+            // Unlike `==`, `total_cmp` treats identical NaNs as equal, so a NaN is
+            // interned only once; it also keeps 0.0 and -0.0 as distinct values
+            self.total_cmp(other) == std::cmp::Ordering::Equal
+        }
+    }
+
+    impl Intern for super::ByteArray {
+        #[inline]
+        fn eq(&self, other: &Self) -> bool {
+            self == other
+        }
+    }
+
+    impl Intern for super::FixedLenByteArray {
+        #[inline]
+        fn eq(&self, other: &Self) -> bool {
+            self == other
+        }
+    }
 }
 
 /// Contains the Parquet physical type information as well as the Rust primitive type
diff --git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs
index 489d4d58122c..e040467d794c 100644
--- a/parquet/src/util/interner.rs
+++ b/parquet/src/util/interner.rs
@@ -24,7 +24,7 @@ const DEFAULT_DEDUP_CAPACITY: usize = 4096;
 pub trait Storage {
     type Key: Copy;
 
-    type Value: AsBytes + PartialEq + ?Sized;
+    type Value: Intern + ?Sized;
 
     /// Gets an element by its key
     fn get(&self, idx: Self::Key) -> &Self::Value;
@@ -37,6 +37,16 @@ pub trait Storage {
     fn estimated_memory_size(&self) -> usize;
 }
 
+/// Equality check used by [`Interner`] to detect already-interned values
+///
+/// Kept separate from [`PartialEq`] so that a type can opt into
+/// interning-specific semantics, e.g. the float implementations treat
+/// identical NaNs as equal
+pub trait Intern: AsBytes {
+    /// Returns `true` if `self` and `other` should be interned as the same value
+    fn eq(&self, other: &Self) -> bool;
+}
+
 /// A generic value interner supporting various different [`Storage`]
 #[derive(Debug, Default)]
 pub struct Interner {
@@ -66,7 +76,7 @@ impl Interner {
             .dedup
             .entry(
                 hash,
-                |index| value == self.storage.get(*index),
+                |index| value.eq(self.storage.get(*index)),
                 |key| self.state.hash_one(self.storage.get(*key).as_bytes()),
             )
             .or_insert_with(|| self.storage.push(value))