Merge branch 'branch-25.02' into db-script-optimize
NvTimLiu authored Dec 6, 2024
2 parents 2d3ca0a + 89984d0 commit a002c5f
Showing 27 changed files with 380 additions and 120 deletions.
58 changes: 58 additions & 0 deletions .github/workflows/license-header-check.yml
@@ -0,0 +1,58 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# A workflow to check copyright/license header
name: license header check

on:
pull_request:
types: [opened, synchronize, reopened]

jobs:
license-header-check:
runs-on: ubuntu-latest
if: "!contains(github.event.pull_request.title, '[bot]')"
steps:
- name: Get checkout depth
run: |
echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: ${{ env.PR_FETCH_DEPTH }}

- name: license-header-check
uses: NVIDIA/spark-rapids-common/license-header-check@main
with:
included_file_patterns: |
*.yml,
*.yaml,
*.sh,
*.xml,
*.properties,
*.scala,
*.py,
build/*,
*.cpp,
*Dockerfile*,
*Jenkinsfile*,
*.ini,
*.java,
*.fbs
excluded_file_patterns: |
*target/*,
thirdparty/*,
sql-plugin/src/main/java/com/nvidia/spark/rapids/format/*
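As context for the workflow above: the actual pattern matching happens inside the NVIDIA/spark-rapids-common action, which is not part of this diff. Below is a minimal, hypothetical sketch of how comma-separated include/exclude globs like these could be applied to a list of changed files; the function name `select_files` and the sample paths are illustrative assumptions, not the action's implementation.

```python
# Hypothetical sketch: applying include/exclude glob patterns, similar to the
# workflow inputs above, to a list of changed files. NOT the implementation of
# the NVIDIA/spark-rapids-common license-header-check action.
from fnmatch import fnmatch

def select_files(changed_files, included_patterns, excluded_patterns):
    """Return files matching at least one include pattern and no exclude pattern."""
    selected = []
    for path in changed_files:
        if not any(fnmatch(path, pat) for pat in included_patterns):
            continue  # not covered by any include pattern
        if any(fnmatch(path, pat) for pat in excluded_patterns):
            continue  # explicitly excluded
        selected.append(path)
    return selected

if __name__ == "__main__":
    included = ["*.yml", "*.py", "*.scala", "build/*"]
    excluded = ["*target/*", "thirdparty/*"]
    changed = ["jenkins/databricks/params.py",   # made-up example paths
               "thirdparty/foo/bar.py",
               "docs/readme.md"]
    print(select_files(changed, included, excluded))
    # -> ['jenkins/databricks/params.py']
```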
@@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.{CoalescedPartitionSpec, ShufflePartitionS
import org.apache.spark.sql.execution.exchange.Exchange
import org.apache.spark.sql.execution.metric.{SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter}
import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBase, ShuffledBatchRDD}
import org.apache.spark.sql.rapids.execution.GpuShuffleExchangeExecBase.createAdditionalExchangeMetrics
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.util.ThreadUtils

@@ -71,22 +72,11 @@ case class GpuOptimizeWriteExchangeExec(
private[sql] lazy val readMetrics =
SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext)

override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
"dataSize" -> createSizeMetric(ESSENTIAL_LEVEL, "data size"),
"dataReadSize" -> createSizeMetric(MODERATE_LEVEL, "data read size"),
"rapidsShuffleSerializationTime" ->
createNanoTimingMetric(DEBUG_LEVEL, "rs. serialization time"),
"rapidsShuffleDeserializationTime" ->
createNanoTimingMetric(DEBUG_LEVEL, "rs. deserialization time"),
"rapidsShuffleWriteTime" ->
createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle write time"),
"rapidsShuffleCombineTime" ->
createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle combine time"),
"rapidsShuffleWriteIoTime" ->
createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle write io time"),
"rapidsShuffleReadTime" ->
createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle read time")
) ++ GpuMetric.wrap(readMetrics) ++ GpuMetric.wrap(writeMetrics)
override lazy val additionalMetrics : Map[String, GpuMetric] = {
createAdditionalExchangeMetrics(this) ++
GpuMetric.wrap(readMetrics) ++
GpuMetric.wrap(writeMetrics)
}

override lazy val allMetrics: Map[String, GpuMetric] = {
Map(
@@ -98,7 +88,7 @@
}

private lazy val serializer: Serializer =
new GpuColumnarBatchSerializer(gpuLongMetric("dataSize"),
new GpuColumnarBatchSerializer(allMetrics,
child.output.map(_.dataType).toArray,
RapidsConf.SHUFFLE_KUDO_SERIALIZER_ENABLED.get(child.conf))

16 changes: 16 additions & 0 deletions docs/dev/idea-code-style-settings.xml
@@ -1,3 +1,19 @@
<!--
Copyright (c) 2024, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<code_scheme name="Default" version="173">
<option name="SOFT_MARGINS" value="100" />
<JavaCodeStyleSettings>
3 changes: 2 additions & 1 deletion integration_tests/src/main/python/join_test.py
@@ -22,7 +22,8 @@
from marks import ignore_order, allow_non_gpu, incompat, validate_execs_in_gpu_plan
from spark_session import with_cpu_session, is_before_spark_330, is_databricks_runtime

pytestmark = [pytest.mark.nightly_resource_consuming_test]
# mark this test as ci_1 for mvn verify sanity check in pre-merge CI
pytestmark = [pytest.mark.nightly_resource_consuming_test, pytest.mark.premerge_ci_1]

all_non_sized_join_types = ['LeftSemi', 'LeftAnti', 'Cross']
all_symmetric_sized_join_types = ['Inner', 'FullOuter']
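The `premerge_ci_1` mark added above (and to the json, parquet, and window-function suites below) lets the pre-merge pipeline run this subset as a separate stage. A brief, illustrative sketch of how such a custom mark is usually registered and selected follows; the `pytest.ini` contents and the command lines are assumptions, not taken from this repository.

```python
# Illustrative only: registering and selecting a custom pytest mark such as
# 'premerge_ci_1'. Not the actual spark-rapids CI configuration.
#
# pytest.ini (assumed):
#   [pytest]
#   markers =
#       premerge_ci_1: subset run in the first pre-merge CI stage
#
# Selecting the subset on the command line (assumed invocations):
#   pytest -m premerge_ci_1          # only tests carrying the mark
#   pytest -m "not premerge_ci_1"    # everything else, e.g. for a second stage
import pytest

# A module-level pytestmark list applies the mark to every test in the file,
# which is what the added lines in these test modules do.
pytestmark = [pytest.mark.premerge_ci_1]
```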
3 changes: 3 additions & 0 deletions integration_tests/src/main/python/json_test.py
@@ -23,6 +23,9 @@
from marks import approximate_float, allow_non_gpu, ignore_order, datagen_overrides
from spark_session import *

# mark this test as ci_1 for mvn verify sanity check in pre-merge CI
pytestmark = [pytest.mark.premerge_ci_1]

TEXT_INPUT_EXEC='FileSourceScanExec'

# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653'
2 changes: 2 additions & 0 deletions integration_tests/src/main/python/parquet_test.py
@@ -28,6 +28,8 @@
from spark_session import *
from conftest import is_databricks_runtime, is_dataproc_runtime

# mark this test as ci_1 for mvn verify sanity check in pre-merge CI
pytestmark = [pytest.mark.premerge_ci_1]

def read_parquet_df(data_path):
return lambda spark : spark.read.parquet(data_path)
10 changes: 7 additions & 3 deletions integration_tests/src/main/python/regexp_test.py
@@ -1012,14 +1012,16 @@ def test_regexp_replace_simple(regexp_enabled):
'REGEXP_REPLACE(a, "ab", "PROD")',
'REGEXP_REPLACE(a, "ae", "PROD")',
'REGEXP_REPLACE(a, "bc", "PROD")',
'REGEXP_REPLACE(a, "fa", "PROD")'
'REGEXP_REPLACE(a, "fa", "PROD")',
'REGEXP_REPLACE(a, "a\n", "PROD")',
'REGEXP_REPLACE(a, "\n", "PROD")'
),
conf=conf
)

@pytest.mark.parametrize("regexp_enabled", ['true', 'false'])
def test_regexp_replace_multi_optimization(regexp_enabled):
gen = mk_str_gen('[abcdef]{0,2}')
gen = mk_str_gen('[abcdef\t\n\a]{0,3}')

conf = { 'spark.rapids.sql.regexp.enabled': regexp_enabled }

@@ -1032,7 +1034,9 @@ def test_regexp_replace_multi_optimization(regexp_enabled):
'REGEXP_REPLACE(a, "aa|bb|cc|dd", "PROD")',
'REGEXP_REPLACE(a, "(aa|bb)|(cc|dd)", "PROD")',
'REGEXP_REPLACE(a, "aa|bb|cc|dd|ee", "PROD")',
'REGEXP_REPLACE(a, "aa|bb|cc|dd|ee|ff", "PROD")'
'REGEXP_REPLACE(a, "aa|bb|cc|dd|ee|ff", "PROD")',
'REGEXP_REPLACE(a, "a\n|b\a|c\t", "PROD")',
'REGEXP_REPLACE(a, "a\ta|b\nb", "PROD")'
),
conf=conf
)
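The new cases extend the multi-pattern optimization coverage to alternations whose branches contain control characters (newline, tab, bell). A small CPU-only illustration of the regex semantics being exercised, using plain Python `re` rather than the plugin; the sample strings are made up.

```python
# CPU-side illustration (plain `re`, not the RAPIDS plugin) of the semantics the
# new test cases exercise: alternations whose branches contain control characters.
import re

pattern = "a\n|b\a|c\t"                    # alternation like the one added above
samples = ["a\nx", "b\ay", "c\tz", "abc"]  # made-up inputs

for s in samples:
    print(repr(s), "->", repr(re.sub(pattern, "PROD", s)))
# 'a\nx'    -> 'PRODx'
# 'b\x07y'  -> 'PRODy'
# 'c\tz'    -> 'PRODz'
# 'abc'     -> 'abc'
```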
3 changes: 3 additions & 0 deletions integration_tests/src/main/python/window_function_test.py
@@ -24,6 +24,9 @@
from spark_session import is_before_spark_320, is_databricks113_or_later, is_databricks133_or_later, is_spark_350_or_later, spark_version, with_cpu_session
import warnings

# mark this test as ci_1 for mvn verify sanity check in pre-merge CI
pytestmark = [pytest.mark.premerge_ci_1]

_grpkey_longs_with_no_nulls = [
('a', RepeatSeqGen(LongGen(nullable=False), length=20)),
('b', IntegerGen()),
2 changes: 2 additions & 0 deletions jenkins/Jenkinsfile-blossom.premerge-databricks
@@ -68,6 +68,8 @@ pipeline {
DATABRICKS_PUBKEY = credentials("SPARK_DATABRICKS_PUBKEY")
DATABRICKS_DRIVER = DbUtils.getDriver("$DB_TYPE")
DATABRICKS_WORKER = DbUtils.getWorker("$DB_TYPE")
INIT_SCRIPTS_DIR = "/databricks/init_scripts/${BUILD_TAG}"
TEST_TYPE = 'pre-commit'
}

stages {
15 changes: 11 additions & 4 deletions jenkins/databricks/params.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -33,6 +33,8 @@
spark_conf = ''
# can take comma separated environments, e.g., foo=abc,bar=123,...'
extra_envs = ''
# 'nightly' is for nightly CI, 'pre-commit' is for the pre-merge CI
test_type = 'nightly'


def usage():
@@ -51,11 +53,12 @@ def usage():
' -n <skipstartingcluster>'
' -f <sparkconf>'
' -i <sparkinstallver>'
' -e <extraenvs>')
' -e <extraenvs>'
' -m <testtype>')


try:
opts, script_args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:i:e:',
opts, script_args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:i:e:m:',
['workspace=',
'token=',
'clusterid=',
@@ -68,7 +71,8 @@ def usage():
'jarpath=',
'sparkconf=',
'sparkinstallver=',
'extraenvs='])
'extraenvs=',
'testtype='])
except getopt.GetoptError:
usage()
sys.exit(2)
@@ -103,6 +107,8 @@ def usage():
base_spark_version_to_install_databricks_jars = arg
elif opt in ('-e', '--extraenvs'):
extra_envs = arg
elif opt in ('-m', '--testtype'):
test_type = arg

print('-w is ' + workspace)
print('-c is ' + clusterid)
@@ -116,3 +122,4 @@ def usage():
print('-f is ' + spark_conf)
print('-i is ' + base_spark_version_to_install_databricks_jars)
print('-e is ' + extra_envs)
print('-m is ' + test_type)
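A minimal, self-contained sketch of the option-parsing pattern above, reduced to just the new `-m/--testtype` flag; the script name, usage text, and defaults are illustrative, not the actual jenkins/databricks/params.py.

```python
# Sketch of parsing a -m/--testtype flag with getopt, mirroring the pattern in
# the diff above. Illustrative only.
import getopt
import sys

test_type = 'nightly'  # 'nightly' for nightly CI, 'pre-commit' for pre-merge CI

try:
    opts, script_args = getopt.getopt(sys.argv[1:], 'hm:', ['help', 'testtype='])
except getopt.GetoptError:
    print('usage: demo.py [-m <testtype>]')
    sys.exit(2)

for opt, arg in opts:
    if opt in ('-h', '--help'):
        print('usage: demo.py [-m <testtype>]')
        sys.exit(0)
    elif opt in ('-m', '--testtype'):
        test_type = arg

print('-m is ' + test_type)
# e.g. `python demo.py -m pre-commit` or `python demo.py --testtype pre-commit`
# both print: -m is pre-commit
```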
12 changes: 7 additions & 5 deletions jenkins/databricks/run-build.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -46,10 +46,12 @@ def main():
print("ssh command: %s" % ssh_command)
subprocess.check_call(ssh_command, shell = True)

print("Copying built tarball back")
rsync_command = "rsync -I -Pave \"ssh %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (ssh_args, master_addr)
print("rsync command to get built tarball: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)
# Only the nightly build needs to copy the spark-rapids-built.tgz back
if params.test_type == 'nightly':
print("Copying built tarball back")
rsync_command = "rsync -I -Pave \"ssh %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (ssh_args, master_addr)
print("rsync command to get built tarball: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

if __name__ == '__main__':
main()
6 changes: 3 additions & 3 deletions jenkins/databricks/run-tests.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -37,9 +37,9 @@ def main():
subprocess.check_call(rsync_command, shell=True)

ssh_command = "ssh %s ubuntu@%s " % (ssh_args, master_addr) + \
"'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s EXTRA_ENVS=%s bash %s %s 2>&1 | tee testout; " \
"'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s EXTRA_ENVS=%s TEST_TYPE=%s bash %s %s 2>&1 | tee testout; " \
"if [ ${PIPESTATUS[0]} -ne 0 ]; then false; else true; fi'" % \
(params.jar_path, params.spark_conf, params.base_spark_pom_version, params.extra_envs,
(params.jar_path, params.spark_conf, params.base_spark_pom_version, params.extra_envs, params.test_type,
params.script_dest, ' '.join(params.script_args))
print("ssh command: %s" % ssh_command)
try:
7 changes: 4 additions & 3 deletions pom.xml
@@ -936,13 +936,14 @@
Build and run unit tests on one specific version for each sub-version (e.g. 320, 330)
Base shim version (320 currently) should be covered in default mvn verify command of premerge script,
so base shim version is removed from the premergeUT list.
Separate the versions to two parts (premergeUT1, premergeUT2) for balancing the duration
Separate the versions to two parts: premergeUT1(2 shims' UT + 1/3 of the integration tests)
and premergeUT2(1 shim's UT + 2/3 of the integration tests), for balancing the duration
-->
<premergeUT1.buildvers>
320
320,
330
</premergeUT1.buildvers>
<premergeUT2.buildvers>
330,
340
</premergeUT2.buildvers>
<premergeUTF8.buildvers>
1 change: 1 addition & 0 deletions python/rapids/daemon.py
@@ -1,3 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
1 change: 1 addition & 0 deletions python/rapids/daemon_databricks.py
@@ -1,3 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
7 changes: 4 additions & 3 deletions scala2.13/pom.xml
@@ -936,13 +936,14 @@
Build and run unit tests on one specific version for each sub-version (e.g. 320, 330)
Base shim version (320 currently) should be covered in default mvn verify command of premerge script,
so base shim version is removed from the premergeUT list.
Separate the versions to two parts (premergeUT1, premergeUT2) for balancing the duration
Separate the versions to two parts: premergeUT1(2 shims' UT + 1/3 of the integration tests)
and premergeUT2(1 shim's UT + 2/3 of the integration tests), for balancing the duration
-->
<premergeUT1.buildvers>
320
320,
330
</premergeUT1.buildvers>
<premergeUT2.buildvers>
330,
340
</premergeUT2.buildvers>
<premergeUTF8.buildvers>
(Diffs for the remaining changed files did not load on this page.)
