Skip to content

Commit

Permalink
Treat NaNs equal to NaN when interning for dictionary encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
adamreeve committed Jan 8, 2025
1 parent a0eaaec commit 8cca284
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 2 deletions.
7 changes: 7 additions & 0 deletions parquet/src/arrow/arrow_writer/byte_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,13 @@ impl Storage for ByteArrayStorage {
}
}

impl crate::util::interner::Intern for [u8] {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}

/// A dictionary encoder for byte array data
#[derive(Debug, Default)]
struct DictEncoder {
Expand Down
60 changes: 60 additions & 0 deletions parquet/src/data_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,7 @@ pub(crate) mod private {
use super::{ParquetError, Result, SliceAsBytes};
use crate::basic::Type;
use crate::file::metadata::HeapSize;
use crate::util::interner::Intern;

/// Sealed trait to start to remove specialisation from implementations
///
Expand All @@ -631,6 +632,7 @@ pub(crate) mod private {
+ HeapSize
+ crate::encodings::decoding::private::GetDecoder
+ crate::file::statistics::private::MakeStatistics
+ Intern
{
const PHYSICAL_TYPE: Type;

Expand Down Expand Up @@ -1125,6 +1127,64 @@ pub(crate) mod private {
self.0.heap_size()
}
}

impl Intern for bool {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}

impl Intern for i32 {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}

impl Intern for i64 {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}

impl Intern for super::Int96 {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}

impl Intern for f32 {
#[inline]
fn eq(&self, other: &Self) -> bool {
// Treat NaN == NaN when interning values
self.total_cmp(other) == std::cmp::Ordering::Equal
}
}

impl Intern for f64 {
#[inline]
fn eq(&self, other: &Self) -> bool {
// Treat NaN == NaN when interning values
self.total_cmp(other) == std::cmp::Ordering::Equal
}
}

impl Intern for super::ByteArray {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}

impl Intern for super::FixedLenByteArray {
#[inline]
fn eq(&self, other: &Self) -> bool {
self == other
}
}
}

/// Contains the Parquet physical type information as well as the Rust primitive type
Expand Down
8 changes: 6 additions & 2 deletions parquet/src/util/interner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ const DEFAULT_DEDUP_CAPACITY: usize = 4096;
pub trait Storage {
type Key: Copy;

type Value: AsBytes + PartialEq + ?Sized;
type Value: Intern + ?Sized;

/// Gets an element by its key
fn get(&self, idx: Self::Key) -> &Self::Value;
Expand All @@ -37,6 +37,10 @@ pub trait Storage {
fn estimated_memory_size(&self) -> usize;
}

pub trait Intern: AsBytes {
fn eq(&self, other: &Self) -> bool;
}

/// A generic value interner supporting various different [`Storage`]
#[derive(Debug, Default)]
pub struct Interner<S: Storage> {
Expand Down Expand Up @@ -66,7 +70,7 @@ impl<S: Storage> Interner<S> {
.dedup
.entry(
hash,
|index| value == self.storage.get(*index),
|index| value.eq(self.storage.get(*index)),
|key| self.state.hash_one(self.storage.get(*key).as_bytes()),
)
.or_insert_with(|| self.storage.push(value))
Expand Down

0 comments on commit 8cca284

Please sign in to comment.