
Commit 7d7d57c

Merge remote-tracking branch 'origin/branch-25.02' into hivehash-nested-support
ustcfy committed Dec 11, 2024
2 parents 67125c0 + e22a7ca commit 7d7d57c
Showing 50 changed files with 783 additions and 341 deletions.
58 changes: 58 additions & 0 deletions .github/workflows/license-header-check.yml
@@ -0,0 +1,58 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# A workflow to check copyright/license header
name: license header check

on:
  pull_request:
    types: [opened, synchronize, reopened]

jobs:
  license-header-check:
    runs-on: ubuntu-latest
    if: "!contains(github.event.pull_request.title, '[bot]')"
    steps:
      - name: Get checkout depth
        run: |
          echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: ${{ env.PR_FETCH_DEPTH }}

      - name: license-header-check
        uses: NVIDIA/spark-rapids-common/license-header-check@main
        with:
          included_file_patterns: |
            *.yml,
            *.yaml,
            *.sh,
            *.xml,
            *.properties,
            *.scala,
            *.py,
            build/*,
            *.cpp,
            *Dockerfile*,
            *Jenkinsfile*,
            *.ini,
            *.java,
            *.fbs
          excluded_file_patterns: |
            *target/*,
            thirdparty/*,
            sql-plugin/src/main/java/com/nvidia/spark/rapids/format/*
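The check itself delegates to the shared NVIDIA/spark-rapids-common action. As a rough mental model — this is a hypothetical sketch, not the action's real implementation, and the pattern lists are abbreviated — the logic amounts to:

# Hypothetical sketch of a license-header check, NOT the actual
# NVIDIA/spark-rapids-common action. It scans files matching the include
# patterns (minus the exclude patterns) and fails if the first 20 lines
# contain no Apache-2.0 header marker.
import fnmatch
import pathlib
import sys

INCLUDED = ["*.yml", "*.yaml", "*.sh", "*.py", "*.scala", "*.java"]  # abbreviated
EXCLUDED = ["*target/*", "thirdparty/*"]
HEADER_MARK = "Licensed under the Apache License, Version 2.0"

def matches(path: str, patterns: list[str]) -> bool:
    return any(fnmatch.fnmatch(path, p) for p in patterns)

def main() -> int:
    bad = []
    for f in pathlib.Path(".").rglob("*"):
        p = str(f)
        if not f.is_file() or not matches(p, INCLUDED) or matches(p, EXCLUDED):
            continue
        head = "".join(f.open(errors="ignore").readlines()[:20])
        if HEADER_MARK not in head:
            bad.append(p)
    for p in bad:
        print(f"missing license header: {p}")
    return 1 if bad else 0

if __name__ == "__main__":
    sys.exit(main())

The fetch-depth arithmetic in the workflow (PR commit count plus a margin of 10) presumably keeps the checkout deep enough for the action to inspect every commit in the pull request.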
6 changes: 4 additions & 2 deletions .github/workflows/mvn-verify-check.yml
@@ -53,7 +53,8 @@ jobs:
id: generateCacheKey
run: |
set -x
cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')"
depsSHA1=$(. .github/workflows/mvn-verify-check/get-deps-sha1.sh 2.12)
cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-${depsSHA1}"
echo "dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT
- name: Cache local Maven repository
id: cache
@@ -165,7 +166,8 @@ jobs:
id: generateCacheKey
run: |
set -x
cacheKey="${{ runner.os }}-maven-scala213-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')"
depsSHA1=$(. .github/workflows/mvn-verify-check/get-deps-sha1.sh 2.13)
cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-${depsSHA1}"
echo "scala213dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT
- name: Cache local Maven repository
id: cache
36 changes: 36 additions & 0 deletions .github/workflows/mvn-verify-check/get-deps-sha1.sh
@@ -0,0 +1,36 @@
#!/bin/bash

# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e

scala_ver=${1:-"2.12"}
base_URL="https://oss.sonatype.org/service/local/artifact/maven/resolve"
project_jni="spark-rapids-jni"
project_private="rapids-4-spark-private_${scala_ver}"

jni_ver=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-jni.version -DforceStdout)
private_ver=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-private.version -DforceStdout)

jni_sha1=$(curl -s -H "Accept: application/json" \
"${base_URL}?r=snapshots&g=com.nvidia&a=${project_jni}&v=${jni_ver}&c=&e=jar&wt=json" \
| jq .data.sha1) || $(date +'%Y-%m-%d')
private_sha1=$(curl -s -H "Accept: application/json" \
"${base_URL}?r=snapshots&g=com.nvidia&a=${project_private}&v=${private_ver}&c=&e=jar&wt=json" \
| jq .data.sha1) || $(date +'%Y-%m-%d')

sha1md5=$(echo -n "${jni_sha1}_${private_sha1}" | md5sum | awk '{print $1}')

echo $sha1md5
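Replacing the date suffix in the Maven cache keys above with this fingerprint means the cache only rotates when the spark-rapids-jni or rapids-4-spark-private snapshot jars actually change; if the Sonatype lookup fails, it degrades to the current date. A hedged Python equivalent of the fingerprint (assuming the resolve endpoint returns JSON with a data.sha1 field, as the script's jq query implies):

# Hedged Python equivalent of get-deps-sha1.sh: fingerprint the snapshot
# jars of spark-rapids-jni and rapids-4-spark-private so the Maven cache
# key only rotates when those dependencies actually change.
import datetime
import hashlib
import json
import urllib.request

BASE_URL = "https://oss.sonatype.org/service/local/artifact/maven/resolve"

def artifact_sha1(artifact: str, version: str) -> str:
    query = f"r=snapshots&g=com.nvidia&a={artifact}&v={version}&c=&e=jar&wt=json"
    try:
        with urllib.request.urlopen(f"{BASE_URL}?{query}", timeout=30) as resp:
            return json.load(resp)["data"]["sha1"]
    except Exception:
        # Fall back to today's date so the cache still rotates daily.
        return datetime.date.today().isoformat()

def deps_fingerprint(jni_ver: str, private_ver: str) -> str:
    jni = artifact_sha1("spark-rapids-jni", jni_ver)
    private = artifact_sha1("rapids-4-spark-private_2.12", private_ver)
    return hashlib.md5(f"{jni}_{private}".encode()).hexdigest()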
21 changes: 12 additions & 9 deletions .github/workflows/mvn-verify-check/populate-daily-cache.sh
@@ -14,27 +14,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-set -x
-max_retry=3; delay=30; i=1
+set -e
+set -o pipefail

 if [[ $SCALA_VER == '2.12' ]]; then
     pom='pom.xml'
 elif [[ $SCALA_VER == '2.13' ]]; then
     pom='scala2.13/pom.xml'
 fi

+max_retry=3; delay=30; i=1
 while true; do
+    buildvers=($(python build/get_buildvers.py no_snapshots $pom | tr -d ',')) &&
     {
-        python build/get_buildvers.py "no_snapshots.buildvers" $pom | tr -d ',' | \
-        xargs -n 1 -I {} bash -c \
-            "mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver={} de.qaware.maven:go-offline-maven-plugin:resolve-dependencies"
-
+        for buildver in "${buildvers[@]}"; do
+            mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver=$buildver de.qaware.maven:go-offline-maven-plugin:resolve-dependencies
+        done
     } && {
         # compile base versions to cache scala compiler and compiler bridge
-        mvn $COMMON_MVN_FLAGS --file $pom \
-            process-test-resources -pl sql-plugin-api -am
+        mvn $COMMON_MVN_FLAGS --file $pom process-test-resources -pl sql-plugin-api -am
     } && break || {
         if [[ $i -le $max_retry ]]; then
             echo "mvn command failed. Retry $i/$max_retry."; ((i++)); sleep $delay; ((delay=delay*2))
         else
             echo "mvn command failed. Exit 1"; exit 1
         fi
     }
-done
+done
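The loop around the mvn invocations is a bounded retry with exponential backoff (up to 3 retries, 30-second initial delay, doubling after each failure). The same pattern rendered in Python, for illustration only:

# Illustrative version of the script's retry loop: one initial attempt
# plus max_retry retries, doubling the delay after each failure.
import subprocess
import time

def run_with_retries(cmd: list[str], max_retry: int = 3, delay: int = 30) -> None:
    for attempt in range(1, max_retry + 2):  # first try + max_retry retries
        if subprocess.run(cmd).returncode == 0:
            return
        if attempt > max_retry:
            raise RuntimeError(f"{cmd[0]} failed after {max_retry} retries")
        print(f"{cmd[0]} failed. Retry {attempt}/{max_retry}.")
        time.sleep(delay)
        delay *= 2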
@@ -24,6 +24,6 @@ import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.internal.ExpressionUtils.{column, expression}

object DataGenExprShims {
-  def columnToExpr(c: Column): Expression = c
-  def exprToColumn(e: Expression): Column = e
+  def columnToExpr(c: Column): Expression = expression(c)
+  def exprToColumn(e: Expression): Column = column(e)
}
@@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.{CoalescedPartitionSpec, ShufflePartitionS
import org.apache.spark.sql.execution.exchange.Exchange
import org.apache.spark.sql.execution.metric.{SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter}
import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBase, ShuffledBatchRDD}
+import org.apache.spark.sql.rapids.execution.GpuShuffleExchangeExecBase.createAdditionalExchangeMetrics
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.util.ThreadUtils

@@ -71,22 +72,11 @@ case class GpuOptimizeWriteExchangeExec(
private[sql] lazy val readMetrics =
SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext)

-  override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
-    "dataSize" -> createSizeMetric(ESSENTIAL_LEVEL, "data size"),
-    "dataReadSize" -> createSizeMetric(MODERATE_LEVEL, "data read size"),
-    "rapidsShuffleSerializationTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. serialization time"),
-    "rapidsShuffleDeserializationTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. deserialization time"),
-    "rapidsShuffleWriteTime" ->
-      createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle write time"),
-    "rapidsShuffleCombineTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle combine time"),
-    "rapidsShuffleWriteIoTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle write io time"),
-    "rapidsShuffleReadTime" ->
-      createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle read time")
-  ) ++ GpuMetric.wrap(readMetrics) ++ GpuMetric.wrap(writeMetrics)
+  override lazy val additionalMetrics : Map[String, GpuMetric] = {
+    createAdditionalExchangeMetrics(this) ++
+      GpuMetric.wrap(readMetrics) ++
+      GpuMetric.wrap(writeMetrics)
+  }

override lazy val allMetrics: Map[String, GpuMetric] = {
Map(
@@ -98,7 +88,7 @@ case class GpuOptimizeWriteExchangeExec(
}

private lazy val serializer: Serializer =
-    new GpuColumnarBatchSerializer(gpuLongMetric("dataSize"),
+    new GpuColumnarBatchSerializer(allMetrics,
child.output.map(_.dataType).toArray,
RapidsConf.SHUFFLE_KUDO_SERIALIZER_ENABLED.get(child.conf))

16 changes: 16 additions & 0 deletions docs/dev/idea-code-style-settings.xml
@@ -1,3 +1,19 @@
+<!--
+  Copyright (c) 2024, NVIDIA CORPORATION.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->

<code_scheme name="Default" version="173">
<option name="SOFT_MARGINS" value="100" />
<JavaCodeStyleSettings>
8 changes: 5 additions & 3 deletions integration_tests/src/main/python/datasourcev2_write_test.py
@@ -18,7 +18,7 @@
from data_gen import gen_df, decimal_gens, non_utc_allow, StructGen, ArrayGen, string_gen
from marks import *
from spark_session import is_hive_available, is_spark_330_or_later, with_cpu_session, with_gpu_session
-from hive_parquet_write_test import _hive_bucket_gens
+from hive_parquet_write_test import _hive_bucket_gens_sans_bools
from hive_parquet_write_test import read_single_bucket

_hive_write_conf = {
@@ -33,9 +33,11 @@
@allow_non_gpu(*non_utc_allow)
def test_write_hive_bucketed_table(spark_tmp_table_factory, file_format):
num_rows = 2048

+    # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
+    # https://github.com/rapidsai/cudf/issues/6763 .
+    # Once the first issue is fixed, add back boolean_gen
def gen_table(spark):
-        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens)]
+        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens_sans_bools)]
types_sql_str = ','.join('{} {}'.format(
name, gen.data_type.simpleString()) for name, gen in gen_list)
col_names_str = ','.join(name for name, gen in gen_list)
12 changes: 6 additions & 6 deletions integration_tests/src/main/python/date_time_test.py
@@ -139,34 +139,34 @@ def test_datediff(data_gen):

hms_fallback = ['ProjectExec'] if not is_supported_time_zone() else []

-@allow_non_gpu(*hms_fallback)
+@allow_non_gpu(*non_utc_tz_allow)
def test_months_between():
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, timestamp_gen).selectExpr('months_between(a, b, false)'))

-@allow_non_gpu(*hms_fallback)
+@allow_non_gpu(*non_utc_tz_allow)
def test_months_between_first_day():
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2024-01-01", false)'))

-@allow_non_gpu(*hms_fallback)
+@allow_non_gpu(*non_utc_tz_allow)
def test_months_between_last_day():
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2023-12-31", false)'))

-@allow_non_gpu(*hms_fallback)
+@allow_non_gpu(*non_utc_tz_allow)
@approximate_float()
def test_months_between_round():
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, timestamp_gen).selectExpr('months_between(a, b, true)'))

-@allow_non_gpu(*hms_fallback)
+@allow_non_gpu(*non_utc_tz_allow)
@approximate_float()
def test_months_between_first_day_round():
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2024-01-01", true)'))

-@allow_non_gpu(*hms_fallback)
+@allow_non_gpu(*non_utc_tz_allow)
@approximate_float()
def test_months_between_last_day_round():
assert_gpu_and_cpu_are_equal_collect(
13 changes: 7 additions & 6 deletions integration_tests/src/main/python/hive_parquet_write_test.py
@@ -25,24 +25,25 @@
# "GpuInsertIntoHiveTable" for Parquet write.
_write_to_hive_conf = {"spark.sql.hive.convertMetastoreParquet": False}

-_hive_bucket_basic_gens = [
-    boolean_gen, byte_gen, short_gen, int_gen, long_gen, string_gen, float_gen, double_gen,
+_hive_bucket_basic_gens_sans_bools = [
+    byte_gen, short_gen, int_gen, long_gen, string_gen, float_gen, double_gen,
     DateGen(start=date(1590, 1, 1)), _restricted_timestamp()]
+_hive_bucket_basic_gens = [boolean_gen] + _hive_bucket_basic_gens_sans_bools

_hive_bucket_basic_struct_gen = StructGen(
-    [['c'+str(ind), c_gen] for ind, c_gen in enumerate(_hive_bucket_basic_gens)])
+    [['c'+str(ind), c_gen] for ind, c_gen in enumerate(_hive_bucket_basic_gens_sans_bools)])

_hive_bucket_struct_gens = [
_hive_bucket_basic_struct_gen,
StructGen([['child0', byte_gen], ['child1', _hive_bucket_basic_struct_gen]]),
StructGen([['child0', ArrayGen(short_gen)], ['child1', double_gen]])]

-_hive_bucket_array_gens = [ArrayGen(sub_gen) for sub_gen in _hive_bucket_basic_gens] + [
+_hive_bucket_array_gens = [ArrayGen(sub_gen) for sub_gen in _hive_bucket_basic_gens_sans_bools] + [
ArrayGen(ArrayGen(short_gen, max_length=10), max_length=10),
ArrayGen(ArrayGen(string_gen, max_length=10), max_length=10),
ArrayGen(StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]]))]

-_hive_bucket_gens = _hive_bucket_basic_gens + _hive_bucket_struct_gens + _hive_bucket_array_gens
+_hive_bucket_gens_sans_bools = _hive_bucket_basic_gens_sans_bools + _hive_bucket_struct_gens + _hive_bucket_array_gens

_hive_basic_gens = _hive_bucket_basic_gens + [
DecimalGen(precision=19, scale=1, nullable=True),
@@ -211,7 +212,7 @@ def test_insert_hive_bucketed_table(spark_tmp_table_factory):
num_rows = 2048

def gen_table(spark):
-        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens)]
+        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens_sans_bools)]
types_sql_str = ','.join('{} {}'.format(
name, gen.data_type.simpleString()) for name, gen in gen_list)
col_names_str = ','.join(name for name, gen in gen_list)
10 changes: 8 additions & 2 deletions integration_tests/src/main/python/hive_write_test.py
@@ -29,8 +29,11 @@ def _restricted_timestamp(nullable=True):
end=datetime(2262, 4, 11, tzinfo=timezone.utc),
nullable=nullable)

+# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
+# https://github.com/rapidsai/cudf/issues/6763 .
+# Once the first issue is fixed, add back boolean_gen
 _basic_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
-               string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)),
+               string_gen, DateGen(start=date(1590, 1, 1)),
_restricted_timestamp()
] + decimal_gens

@@ -45,8 +48,11 @@ def _restricted_timestamp(nullable=True):
ArrayGen(ArrayGen(string_gen, max_length=10), max_length=10),
ArrayGen(StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]]))]

+# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
+# https://github.com/rapidsai/cudf/issues/6763 .
+# Once the first issue is fixed, add back boolean_gen
 _map_gens = [simple_string_to_string_map_gen] + [MapGen(f(nullable=False), f()) for f in [
-    BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen,
+    ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen,
lambda nullable=True: _restricted_timestamp(nullable=nullable),
lambda nullable=True: DateGen(start=date(1590, 1, 1), nullable=nullable),
lambda nullable=True: DecimalGen(precision=15, scale=1, nullable=nullable),
3 changes: 2 additions & 1 deletion integration_tests/src/main/python/join_test.py
@@ -22,7 +22,8 @@
from marks import ignore_order, allow_non_gpu, incompat, validate_execs_in_gpu_plan
from spark_session import with_cpu_session, is_before_spark_330, is_databricks_runtime

-pytestmark = [pytest.mark.nightly_resource_consuming_test]
+# mark this test as ci_1 for mvn verify sanity check in pre-merge CI
+pytestmark = [pytest.mark.nightly_resource_consuming_test, pytest.mark.premerge_ci_1]

all_non_sized_join_types = ['LeftSemi', 'LeftAnti', 'Cross']
all_symmetric_sized_join_types = ['Inner', 'FullOuter']
3 changes: 3 additions & 0 deletions integration_tests/src/main/python/json_test.py
@@ -23,6 +23,9 @@
from marks import approximate_float, allow_non_gpu, ignore_order, datagen_overrides
from spark_session import *

+# mark this test as ci_1 for mvn verify sanity check in pre-merge CI
+pytestmark = [pytest.mark.premerge_ci_1]

TEXT_INPUT_EXEC='FileSourceScanExec'

# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653'
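The premerge_ci_1 marker added to join_test.py and json_test.py lets the pre-merge pipeline carve these long-running suites into a dedicated shard. Assuming the pipeline selects on the marker name (a hypothetical invocation; the real job wiring lives in the CI scripts), the selection is equivalent to:

# Hypothetical shard selection: run only suites marked premerge_ci_1,
# equivalent to `pytest -m premerge_ci_1 ...` on the command line.
import sys
import pytest

if __name__ == "__main__":
    sys.exit(pytest.main(["-m", "premerge_ci_1", "integration_tests/src/main/python"]))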
