// Patch overview (four files touched):
//
//   * arrow/Cargo.toml
//       Declares `half` 2.1 as an optional dependency and adds `dep:half` to the
//       `test_utils` feature alongside the existing `dep:rand`.
//
//   * arrow/src/util/bench_util.rs
//       Imports `half::f16` and appends `create_f16_array`, `create_f32_array` and
//       `create_f64_array`. Each one takes `size` and `nan_density`, obtains its generator
//       from `seedable_rng()`, and for every index compares a random draw against
//       `nan_density`: below it the element is NaN, otherwise a second random value.
//       Every element is wrapped in `Some`, so no `None` entries are produced.
//
//   * parquet/benches/arrow_writer.rs
//       Adds `create_float_bench_batch_with_nans(size, nan_density)`, which builds a batch
//       with three non-nullable fields `_1` / `_2` / `_3` (Float16 / Float32 / Float64)
//       through `RecordBatch::try_new_with_options` with `with_match_field_names(false)`.
//       `bench_primitive_writer` gains two 4096-row cases, one with nan_density 0.5
//       ("float with NaNs") and one with 0.0 ("float with no NaNs"); throughput is the sum
//       of `get_array_memory_size()` over the batch columns.
//
//   * parquet/src/arrow/arrow_writer/mod.rs (test module)
//       Imports `half::f16` and adds the test `arrow_writer_float_nans`: three non-nullable
//       columns `a` / `b` / `c` of MEDIUM_SIZE rows, NaN at every even index and the index
//       value itself at odd indices, handed to `roundtrip(batch, None)`.
//
// NOTE(review): this copy of the patch looks damaged. Hunk lines are run together on a
// few very long lines, and several generic arguments appear to be missing: `rng.gen::()`,
// `.collect::()`, and the bare `Result` / `Vec` type annotations. Presumably the
// angle-bracketed parts were stripped during extraction. TODO: regenerate the patch from
// the original commits before trying to apply or compile it.
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index a1c9c0ab2113..76119ec4abb4 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -55,6 +55,7 @@ arrow-string = { workspace = true } rand = { version = "0.8", default-features = false, features = ["std", "std_rng"], optional = true } pyo3 = { version = "0.23", default-features = false, optional = true } +half = { version = "2.1", default-features = false, optional = true } [package.metadata.docs.rs] features = ["prettyprint", "ipc_compression", "ffi", "pyarrow"] @@ -70,7 +71,7 @@ prettyprint = ["arrow-cast/prettyprint"] # not the core arrow code itself. Be aware that `rand` must be kept as # an optional dependency for supporting compile to wasm32-unknown-unknown # target without assuming an environment containing JavaScript. -test_utils = ["dep:rand"] +test_utils = ["dep:rand", "dep:half"] pyarrow = ["pyo3", "ffi"] # force_validate runs full data validation for all arrays that are created # this is not enabled by default as it is too computationally expensive diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 8eaae36dbe56..53e01034122b 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -21,6 +21,7 @@ use crate::array::*; use crate::datatypes::*; use crate::util::test_util::seedable_rng; use arrow_buffer::{Buffer, IntervalMonthDayNano}; +use half::f16; use rand::distributions::uniform::SampleUniform; use rand::thread_rng; use rand::Rng; @@ -416,3 +417,48 @@ where DictionaryArray::from(data) } + +/// Creates a random (but fixed-seeded) f16 array of a given size and nan-value density +pub fn create_f16_array(size: usize, nan_density: f32) -> Float16Array { + let mut rng = seedable_rng(); + + (0..size) + .map(|_| { + if rng.gen::() < nan_density { + Some(f16::NAN) + } else { + Some(f16::from_f32(rng.gen())) + } + }) + .collect() +} + +/// Creates a random (but fixed-seeded) f32 array of a given size and nan-value density +pub fn create_f32_array(size: usize, 
nan_density: f32) -> Float32Array { + let mut rng = seedable_rng(); + + (0..size) + .map(|_| { + if rng.gen::() < nan_density { + Some(f32::NAN) + } else { + Some(rng.gen()) + } + }) + .collect() +} + +/// Creates a random (but fixed-seeded) f64 array of a given size and nan-value density +pub fn create_f64_array(size: usize, nan_density: f32) -> Float64Array { + let mut rng = seedable_rng(); + + (0..size) + .map(|_| { + if rng.gen::() < nan_density { + Some(f64::NAN) + } else { + Some(rng.gen()) + } + }) + .collect() +} diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs index bfa333db722c..fdcff0ef5efb 100644 --- a/parquet/benches/arrow_writer.rs +++ b/parquet/benches/arrow_writer.rs @@ -28,7 +28,9 @@ extern crate parquet; use std::sync::Arc; use arrow::datatypes::*; +use arrow::util::bench_util::{create_f16_array, create_f32_array, create_f64_array}; use arrow::{record_batch::RecordBatch, util::data_gen::*}; +use arrow_array::RecordBatchOptions; use parquet::file::properties::WriterProperties; use parquet::{arrow::ArrowWriter, errors::Result}; @@ -181,6 +183,25 @@ fn create_bool_bench_batch_non_null( )?) } +fn create_float_bench_batch_with_nans(size: usize, nan_density: f32) -> Result { + let fields = vec![ + Field::new("_1", DataType::Float16, false), + Field::new("_2", DataType::Float32, false), + Field::new("_3", DataType::Float64, false), + ]; + let schema = Schema::new(fields); + let columns: Vec = vec![ + Arc::new(create_f16_array(size, nan_density)), + Arc::new(create_f32_array(size, nan_density)), + Arc::new(create_f64_array(size, nan_density)), + ]; + Ok(RecordBatch::try_new_with_options( + Arc::new(schema), + columns, + &RecordBatchOptions::new().with_match_field_names(false), + )?) 
+} + fn create_list_primitive_bench_batch( size: usize, null_density: f32, @@ -459,6 +480,30 @@ fn bench_primitive_writer(c: &mut Criterion) { b.iter(|| write_batch_enable_bloom_filter(&batch).unwrap()) }); + let batch = create_float_bench_batch_with_nans(4096, 0.5).unwrap(); + group.throughput(Throughput::Bytes( + batch + .columns() + .iter() + .map(|f| f.get_array_memory_size() as u64) + .sum(), + )); + group.bench_function("4096 values float with NaNs", |b| { + b.iter(|| write_batch(&batch).unwrap()) + }); + + let batch = create_float_bench_batch_with_nans(4096, 0.0).unwrap(); + group.throughput(Throughput::Bytes( + batch + .columns() + .iter() + .map(|f| f.get_array_memory_size() as u64) + .sum(), + )); + group.bench_function("4096 values float with no NaNs", |b| { + b.iter(|| write_batch(&batch).unwrap()) + }); + group.finish(); } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 871b140768cb..41f15569fda0 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1095,6 +1095,7 @@ mod tests { use arrow::{array::*, buffer::Buffer}; use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer}; use arrow_schema::Fields; + use half::f16; use crate::basic::Encoding; use crate::data_type::AsBytes; @@ -1763,6 +1764,44 @@ mod tests { ); } + #[test] + fn arrow_writer_float_nans() { + let f16_field = Field::new("a", DataType::Float16, false); + let f32_field = Field::new("b", DataType::Float32, false); + let f64_field = Field::new("c", DataType::Float64, false); + let schema = Schema::new(vec![f16_field, f32_field, f64_field]); + + let f16_values = (0..MEDIUM_SIZE) + .map(|i| { + Some(if i % 2 == 0 { + f16::NAN + } else { + f16::from_f32(i as f32) + }) + }) + .collect::(); + + let f32_values = (0..MEDIUM_SIZE) + .map(|i| Some(if i % 2 == 0 { f32::NAN } else { i as f32 })) + .collect::(); + + let f64_values = (0..MEDIUM_SIZE) + .map(|i| Some(if i % 2 == 0 { f64::NAN } else { 
i as f64 })) + .collect::(); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(f16_values), + Arc::new(f32_values), + Arc::new(f64_values), + ], + ) + .unwrap(); + + roundtrip(batch, None); + } + const SMALL_SIZE: usize = 7; const MEDIUM_SIZE: usize = 63;