Skip to content

Commit

Permalink
fix: partially fix consistency issue of hash functions with decimal input (#1295)
Browse files Browse the repository at this point in the history
  • Loading branch information
wForget authored Jan 19, 2025
1 parent c4ef5ed commit 2588e13
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 0 deletions.
25 changes: 25 additions & 0 deletions native/spark-expr/src/hash_funcs/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,26 @@ macro_rules! hash_array_primitive_float {
};
}

#[macro_export]
macro_rules! hash_array_small_decimal {
    ($array_type:ident, $column: ident, $hashes: ident, $hash_method: ident) => {
        // Downcast the dynamically-typed column to the concrete decimal array type.
        let decimal_array = $column.as_any().downcast_ref::<$array_type>().unwrap();

        // Spark-compatible hashing for small decimals (precision <= 18, per the
        // call site's dispatch): the value is narrowed to an i64 — which cannot
        // fail for precision <= 18, hence the unwrap — and its little-endian
        // bytes are fed to the hash function. Null slots keep their current
        // hash value untouched.
        let has_nulls = decimal_array.null_count() > 0;
        for (row, hash) in $hashes.iter_mut().enumerate() {
            if has_nulls && decimal_array.is_null(row) {
                continue;
            }
            let narrowed = i64::try_from(decimal_array.value(row)).unwrap();
            *hash = $hash_method(narrowed.to_le_bytes(), *hash);
        }
    };
}

#[macro_export]
macro_rules! hash_array_decimal {
($array_type:ident, $column: ident, $hashes: ident, $hash_method: ident) => {
Expand Down Expand Up @@ -274,6 +294,11 @@ macro_rules! create_hashes_internal {
DataType::FixedSizeBinary(_) => {
$crate::hash_array!(FixedSizeBinaryArray, col, $hashes_buffer, $hash_method);
}
// Apache Spark: if it's a small decimal, i.e. precision <= 18, turn it into long and hash it.
// Else, turn it into bytes and hash it.
DataType::Decimal128(precision, _) if *precision <= 18 => {
$crate::hash_array_small_decimal!(Decimal128Array, col, $hashes_buffer, $hash_method);
}
DataType::Decimal128(_, _) => {
$crate::hash_array_decimal!(Decimal128Array, col, $hashes_buffer, $hash_method);
}
Expand Down
17 changes: 17 additions & 0 deletions spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -1928,6 +1928,23 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
}
}
}

// Verifies that Comet's hash and xxhash64 produce the same results as Spark
// for decimal input columns (issue #1295). Only small decimals (precision <= 18)
// are covered; larger precisions are still incompatible (issue #1294, below).
test("hash functions with decimal input") {
withTable("t1", "t2") {
// Apache Spark: if it's a small decimal, i.e. precision <= 18, turn it into long and hash it.
// Else, turn it into bytes and hash it.
sql("create table t1(c1 decimal(18, 2)) using parquet")
// Cover positive, negative, zero, and null inputs.
sql("insert into t1 values(1.23), (-1.23), (0.0), (null)")
checkSparkAnswerAndOperator("select c1, hash(c1), xxhash64(c1) from t1 order by c1")

// TODO: comet hash function is not compatible with spark for decimal with precision greater than 18.
// https://github.com/apache/datafusion-comet/issues/1294
// sql("create table t2(c1 decimal(20, 2)) using parquet")
// sql("insert into t2 values(1.23), (-1.23), (0.0), (null)")
// checkSparkAnswerAndOperator("select c1, hash(c1), xxhash64(c1) from t2 order by c1")
}
}

test("unary negative integer overflow test") {
def withAnsiMode(enabled: Boolean)(f: => Unit): Unit = {
withSQLConf(
Expand Down

0 comments on commit 2588e13

Please sign in to comment.