From ef980a7bd2c0dbefeee19cc03c283ad0bd4527bb Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Wed, 8 Jan 2025 17:09:37 +1300 Subject: [PATCH] Compare all values by bytes rather than adding Intern trait --- parquet/src/arrow/arrow_writer/byte_array.rs | 7 --- parquet/src/data_type.rs | 60 -------------------- parquet/src/util/interner.rs | 9 +-- 3 files changed, 3 insertions(+), 73 deletions(-) diff --git a/parquet/src/arrow/arrow_writer/byte_array.rs b/parquet/src/arrow/arrow_writer/byte_array.rs index 2348dea526d3..2d23ad8510f9 100644 --- a/parquet/src/arrow/arrow_writer/byte_array.rs +++ b/parquet/src/arrow/arrow_writer/byte_array.rs @@ -326,13 +326,6 @@ impl Storage for ByteArrayStorage { } } -impl crate::util::interner::Intern for [u8] { - #[inline] - fn eq(&self, other: &Self) -> bool { - self == other - } -} - /// A dictionary encoder for byte array data #[derive(Debug, Default)] struct DictEncoder { diff --git a/parquet/src/data_type.rs b/parquet/src/data_type.rs index 5dd60074813d..c4c03727f44a 100644 --- a/parquet/src/data_type.rs +++ b/parquet/src/data_type.rs @@ -611,7 +611,6 @@ pub(crate) mod private { use super::{ParquetError, Result, SliceAsBytes}; use crate::basic::Type; use crate::file::metadata::HeapSize; - use crate::util::interner::Intern; /// Sealed trait to start to remove specialisation from implementations /// @@ -632,7 +631,6 @@ pub(crate) mod private { + HeapSize + crate::encodings::decoding::private::GetDecoder + crate::file::statistics::private::MakeStatistics - + Intern { const PHYSICAL_TYPE: Type; @@ -1127,64 +1125,6 @@ pub(crate) mod private { self.0.heap_size() } } - - impl Intern for bool { - #[inline] - fn eq(&self, other: &Self) -> bool { - self == other - } - } - - impl Intern for i32 { - #[inline] - fn eq(&self, other: &Self) -> bool { - self == other - } - } - - impl Intern for i64 { - #[inline] - fn eq(&self, other: &Self) -> bool { - self == other - } - } - - impl Intern for super::Int96 { - #[inline] - fn eq(&self, other: &Self) -> bool { - self == other - } - } - - impl Intern for f32 { - #[inline] - fn eq(&self, other: &Self) -> bool { - // Treat NaN == NaN when interning values - self.total_cmp(other) == std::cmp::Ordering::Equal - } - } - - impl Intern for f64 { - #[inline] - fn eq(&self, other: &Self) -> bool { - // Treat NaN == NaN when interning values - self.total_cmp(other) == std::cmp::Ordering::Equal - } - } - - impl Intern for super::ByteArray { - #[inline] - fn eq(&self, other: &Self) -> bool { - self == other - } - } - - impl Intern for super::FixedLenByteArray { - #[inline] - fn eq(&self, other: &Self) -> bool { - self == other - } - } } /// Contains the Parquet physical type information as well as the Rust primitive type diff --git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs index e040467d794c..34c7d1390f7a 100644 --- a/parquet/src/util/interner.rs +++ b/parquet/src/util/interner.rs @@ -24,7 +24,7 @@ const DEFAULT_DEDUP_CAPACITY: usize = 4096; pub trait Storage { type Key: Copy; - type Value: Intern + ?Sized; + type Value: AsBytes + ?Sized; /// Gets an element by its key fn get(&self, idx: Self::Key) -> &Self::Value; @@ -37,10 +37,6 @@ pub trait Storage { fn estimated_memory_size(&self) -> usize; } -pub trait Intern: AsBytes { - fn eq(&self, other: &Self) -> bool; -} - /// A generic value interner supporting various different [`Storage`] #[derive(Debug, Default)] pub struct Interner { @@ -70,7 +66,8 @@ impl Interner { .dedup .entry( hash, - |index| value.eq(self.storage.get(*index)), + // Compare bytes rather than directly comparing values so NaNs can be interned + |index| value.as_bytes() == self.storage.get(*index).as_bytes(), |key| self.state.hash_one(self.storage.get(*key).as_bytes()), ) .or_insert_with(|| self.storage.push(value))