diff --git a/README.md b/README.md index 46d846aa9c2..d669be64ca9 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ as a `provided` dependency. com.nvidia rapids-4-spark_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT provided ``` diff --git a/aggregator/pom.xml b/aggregator/pom.xml index 4fa4827ac52..f3bdf9d256b 100644 --- a/aggregator/pom.xml +++ b/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.12 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT aggregator diff --git a/api_validation/pom.xml b/api_validation/pom.xml index 34c2404c3c0..d85b6a68146 100644 --- a/api_validation/pom.xml +++ b/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-api-validation_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT api_validation diff --git a/datagen/README.md b/datagen/README.md index f374e4da9f2..5fc3aa06de3 100644 --- a/datagen/README.md +++ b/datagen/README.md @@ -24,12 +24,12 @@ Where `$SPARK_VERSION` is a compressed version number, like 330 for Spark 3.3.0. After this the jar should be at `target/datagen_2.12-$PLUGIN_VERSION-spark$SPARK_VERSION.jar` -for example a Spark 3.3.0 jar for the 23.12.0 release would be -`target/datagen_2.12-23.12.0-spark330.jar` +for example a Spark 3.3.0 jar for the 24.02.0 release would be +`target/datagen_2.12-24.02.0-spark330.jar` To get a spark shell with this you can run ```shell -spark-shell --jars target/datagen_2.12-23.12.0-spark330.jar +spark-shell --jars target/datagen_2.12-24.02.0-spark330.jar ``` After that you should be good to go. 
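As context for the dependency and jar-name changes above, here is a minimal sketch of attaching the renamed 24.02.0-SNAPSHOT plugin artifact to a PySpark session. It mirrors the `spark-shell` example that `docs/configs.md` updates later in this diff; the jar path and the `concurrentGpuTasks` value are illustrative assumptions, not requirements.

```python
# Minimal sketch: start a PySpark session with the 24.02.0-SNAPSHOT plugin jar.
# The jar location is an assumption; point it at wherever the artifact was built or
# downloaded. The spark.plugins and spark.rapids.* settings come from docs/configs.md.
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("rapids-24.02-smoke-test")
         .config("spark.jars", "rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar")
         .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
         .config("spark.rapids.sql.concurrentGpuTasks", "2")
         .getOrCreate())

# A trivial query to confirm the session starts with the plugin configured.
spark.range(10).selectExpr("sum(id)").show()
```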
diff --git a/datagen/ScaleTest.md b/datagen/ScaleTest.md index dc55ffd393a..fd2af1decbb 100644 --- a/datagen/ScaleTest.md +++ b/datagen/ScaleTest.md @@ -44,7 +44,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --class com.nvidia.rapids.tests.scaletest.ScaleTestDataGen \ # the main class --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ # one dependency jar just shipped with Spark under $SPARK_HOME -./target/datagen_2.12-23.12.0-SNAPSHOT-spark332.jar \ +./target/datagen_2.12-24.02.0-SNAPSHOT-spark332.jar \ 1 \ 10 \ parquet \ diff --git a/datagen/pom.xml b/datagen/pom.xml index d22f874bb04..6a6129ac603 100644 --- a/datagen/pom.xml +++ b/datagen/pom.xml @@ -21,13 +21,13 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../shim-deps/pom.xml datagen_2.12 Data Generator Tools for generating large amounts of data - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT datagen diff --git a/delta-lake/delta-20x/pom.xml b/delta-lake/delta-20x/pom.xml index 5cb0e2e2e4e..8a06a26a69c 100644 --- a/delta-lake/delta-20x/pom.xml +++ b/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-20x diff --git a/delta-lake/delta-21x/pom.xml b/delta-lake/delta-21x/pom.xml index 5b4e1225722..547cb52a9f8 100644 --- a/delta-lake/delta-21x/pom.xml +++ b/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-21x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-21x diff --git a/delta-lake/delta-22x/pom.xml b/delta-lake/delta-22x/pom.xml index 0b6d2175f2f..f52c3ab0f7c 100644 --- a/delta-lake/delta-22x/pom.xml +++ b/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-22x diff --git a/delta-lake/delta-23x/pom.xml b/delta-lake/delta-23x/pom.xml index 9b8cb489cb6..02372462348 100644 --- a/delta-lake/delta-23x/pom.xml +++ b/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../pom.xml rapids-4-spark-delta-23x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-23x diff --git a/delta-lake/delta-24x/pom.xml b/delta-lake/delta-24x/pom.xml index 93f625397bf..b793ec7c393 100644 --- a/delta-lake/delta-24x/pom.xml +++ b/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 
23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-24x diff --git a/delta-lake/delta-spark321db/pom.xml b/delta-lake/delta-spark321db/pom.xml index 95f9146f51a..1514904d03f 100644 --- a/delta-lake/delta-spark321db/pom.xml +++ b/delta-lake/delta-spark321db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-spark321db_2.12 RAPIDS Accelerator for Apache Spark Databricks 10.4 Delta Lake Support Databricks 10.4 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-spark321db diff --git a/delta-lake/delta-spark330db/pom.xml b/delta-lake/delta-spark330db/pom.xml index c8ed34bd539..eef0b56b0ae 100644 --- a/delta-lake/delta-spark330db/pom.xml +++ b/delta-lake/delta-spark330db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-spark330db_2.12 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-spark330db diff --git a/delta-lake/delta-spark332db/pom.xml b/delta-lake/delta-spark332db/pom.xml index 1d81d63aa94..34335400b43 100644 --- a/delta-lake/delta-spark332db/pom.xml +++ b/delta-lake/delta-spark332db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-spark332db_2.12 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-spark332db diff --git a/delta-lake/delta-spark341db/pom.xml b/delta-lake/delta-spark341db/pom.xml index 64e920eb8f1..11291f9ad21 100644 --- a/delta-lake/delta-spark341db/pom.xml +++ b/delta-lake/delta-spark341db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-spark341db_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT false diff --git a/delta-lake/delta-stub/pom.xml b/delta-lake/delta-stub/pom.xml index c58eb185cfc..ac77668efda 100644 --- a/delta-lake/delta-stub/pom.xml +++ b/delta-lake/delta-stub/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.12 RAPIDS Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../delta-lake/delta-stub diff --git a/dist/pom.xml b/dist/pom.xml index 6fbc047ac47..01a60af8096 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../jdk-profiles/pom.xml rapids-4-spark_2.12 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT com.nvidia diff --git a/docs/compatibility.md b/docs/compatibility.md index 9d411f56d50..8043aa12d38 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -83,6 +83,17 @@ after Spark 3.1.0. 
We do not disable operations that produce different results due to `-0.0` in the data because it is considered to be a rare occurrence. +### `NaN` vs `NaN` + +Apache Spark does not handle `NaN` comparisons consistently. Sometimes all `NaN` values are +treated as a single value, while at other times they are treated as distinct values. The outcome of +a `NaN` comparison can differ between operations and has also changed between Spark versions. +Our plugin tries to match Apache Spark's output, except for the operations listed below: + - `IN` SQL expression: Spark 3.1.2 can treat `NaN` values as distinct, while Spark 3.1.3 and later +do not (see [SPARK-36792](https://issues.apache.org/jira/browse/SPARK-36792)). +Our plugin always treats all `NaN` values as equal for this operation. + + ## Decimal Support Apache Spark supports decimal values with a precision up to 38. This equates to 128-bits. diff --git a/docs/configs.md b/docs/configs.md index 5a467ea9fa0..58d4e28d79d 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports. On startup use: `--conf [conf key]=[conf value]`. For example: ``` -${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar \ +${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ --conf spark.rapids.sql.concurrentGpuTasks=2 ``` diff --git a/docs/dev/shims.md b/docs/dev/shims.md index cca778382b8..e214d07862d 100644 --- a/docs/dev/shims.md +++ b/docs/dev/shims.md @@ -68,17 +68,17 @@ Using JarURLConnection URLs we create a Parallel World of the current version wi Spark 3.0.2's URLs: ```text -jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/ -jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark3xx-common/ -jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark302/ +jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/ +jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark3xx-common/ +jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark302/ ``` Spark 3.2.0's URLs : ```text -jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/ -jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark3xx-common/ -jar:file:/home/spark/rapids-4-spark_2.12-23.12.0.jar!/spark320/ +jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/ +jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark3xx-common/ +jar:file:/home/spark/rapids-4-spark_2.12-24.02.0.jar!/spark320/ ``` ### Late Inheritance in Public Classes diff --git a/integration_tests/README.md b/integration_tests/README.md index af203f44ad9..11687baa2d8 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -254,7 +254,7 @@ individually, so you don't risk running unit tests along with the integration te http://www.scalatest.org/user_guide/using_the_scalatest_shell ```shell -spark-shell --jars rapids-4-spark-tests_2.12-23.12.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-23.12.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar +spark-shell --jars rapids-4-spark-tests_2.12-24.02.0-SNAPSHOT-tests.jar,rapids-4-spark-integration-tests_2.12-24.02.0-SNAPSHOT-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar ``` First you import the `scalatest_shell` and tell the tests where they can find the test files you @@ -277,7 +277,7 @@ If you just want to verify the SQL replacement is working you will need to add t assumes CUDA 11.0
is being used and the Spark distribution is built with Scala 2.12. ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar" ./runtests.py +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar" ./runtests.py ``` You don't have to enable the plugin for this to work, the test framework will do that for you. @@ -376,7 +376,7 @@ To run cudf_udf tests, need following configuration changes: As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 11.0: ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-23.12.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-23.12.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar,rapids-4-spark-tests_2.12-24.02.0-SNAPSHOT.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-24.02.0-SNAPSHOT-cuda11.jar" ./runtests.py --cudf_udf ``` ### Enabling fuzz tests diff --git a/integration_tests/ScaleTest.md b/integration_tests/ScaleTest.md index 774fb906cf6..3cf4b3a25d9 100644 --- a/integration_tests/ScaleTest.md +++ b/integration_tests/ScaleTest.md @@ -97,7 +97,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ --class com.nvidia.spark.rapids.tests.scaletest.ScaleTest \ -./target/rapids-4-spark-integration-tests_2.12-23.12.0-SNAPSHOT-spark332.jar \ +./target/rapids-4-spark-integration-tests_2.12-24.02.0-SNAPSHOT-spark332.jar \ 10 \ 100 \ parquet \ diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index e5484e0fd49..21432f5161b 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT ../shim-deps/pom.xml rapids-4-spark-integration-tests_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT integration_tests diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index 61dad6412e1..beb3111383b 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -146,10 +146,9 @@ def test_cast_string_date_non_ansi(): lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), conf={'spark.rapids.sql.hasExtendedYearValues': 'false'}) -@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9708') @pytest.mark.parametrize('data_gen', [StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}'), StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), - StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9].[0-9]{0,6}Z?')], + StringGen('[0-9]{1,4}-[0-3][0-9]-[0-5][0-9][ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?')], ids=idfn) @pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of 
https://github.com/NVIDIA/spark-rapids/issues/9653') def test_cast_string_ts_valid_format(data_gen): diff --git a/integration_tests/src/main/python/cmp_test.py b/integration_tests/src/main/python/cmp_test.py index a891b667016..0217bf0530f 100644 --- a/integration_tests/src/main/python/cmp_test.py +++ b/integration_tests/src/main/python/cmp_test.py @@ -17,7 +17,7 @@ from asserts import assert_gpu_and_cpu_are_equal_collect from conftest import is_not_utc from data_gen import * -from spark_session import with_cpu_session, is_before_spark_330 +from spark_session import with_cpu_session, is_before_spark_320, is_before_spark_330 from pyspark.sql.types import * from marks import datagen_overrides import pyspark.sql.functions as f @@ -346,11 +346,16 @@ def test_in(data_gen): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').isin(scalars))) +# We avoid testing inset with NaN in Spark < 3.2.0 since it has issue with NaN comparisons. +# See https://github.com/NVIDIA/spark-rapids/issues/9687. +test_inset_data_gen = [gen for gen in eq_gens_with_decimal_gen if gen != float_gen if gen != double_gen] + \ + [FloatGen(no_nans=True), DoubleGen(no_nans=True)] \ + if is_before_spark_320() else eq_gens_with_decimal_gen + # Spark supports two different versions of 'IN', and it depends on the spark.sql.optimizer.inSetConversionThreshold conf # This is to test entries over that value. -@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9687') -@pytest.mark.parametrize('data_gen', eq_gens_with_decimal_gen, ids=idfn) @pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') +@pytest.mark.parametrize('data_gen', test_inset_data_gen, ids=idfn) def test_in_set(data_gen): # nulls are not supported for in on the GPU yet num_entries = int(with_cpu_session(lambda spark: spark.conf.get('spark.sql.optimizer.inSetConversionThreshold'))) + 1 diff --git a/integration_tests/src/main/python/spark_session.py b/integration_tests/src/main/python/spark_session.py index aa27503c8eb..50eaa7c49a9 100644 --- a/integration_tests/src/main/python/spark_session.py +++ b/integration_tests/src/main/python/spark_session.py @@ -158,6 +158,9 @@ def is_spark_330_or_later(): def is_spark_340_or_later(): return spark_version() >= "3.4.0" +def is_spark_341(): + return spark_version() == "3.4.1" + def is_spark_350_or_later(): return spark_version() >= "3.5.0" diff --git a/integration_tests/src/main/python/udf_test.py b/integration_tests/src/main/python/udf_test.py index 88281279162..9e3f5d05bcc 100644 --- a/integration_tests/src/main/python/udf_test.py +++ b/integration_tests/src/main/python/udf_test.py @@ -15,7 +15,7 @@ import pytest from conftest import is_at_least_precommit_run, is_not_utc -from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_340_or_later +from spark_session import is_databricks_runtime, is_before_spark_330, is_before_spark_350, is_spark_341 from pyspark.sql.pandas.utils import require_minimum_pyarrow_version, require_minimum_pandas_version @@ -43,12 +43,6 @@ import pyarrow from typing import Iterator, Tuple - -if is_databricks_runtime() and is_spark_340_or_later(): - # Databricks 13.3 does not use separate reader/writer threads for Python UDFs - # which can lead to hangs. Skipping these tests until the Python UDF handling is updated. 
- pytestmark = pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/9493") - arrow_udf_conf = { 'spark.sql.execution.arrow.pyspark.enabled': 'true', 'spark.rapids.sql.exec.WindowInPandasExec': 'true', @@ -182,7 +176,10 @@ def group_size_udf(to_process: pd.Series) -> int: low_upper_win = Window.partitionBy('a').orderBy('b').rowsBetween(-3, 3) -udf_windows = [no_part_win, unbounded_win, cur_follow_win, pre_cur_win, low_upper_win] +running_win_param = pytest.param(pre_cur_win, marks=pytest.mark.xfail( + condition=is_databricks_runtime() and is_spark_341(), + reason='DB13.3 wrongly uses RunningWindowFunctionExec to evaluate a PythonUDAF and it will fail even on CPU')) +udf_windows = [no_part_win, unbounded_win, cur_follow_win, running_win_param, low_upper_win] window_ids = ['No_Partition', 'Unbounded', 'Unbounded_Following', 'Unbounded_Preceding', 'Lower_Upper'] @@ -338,8 +335,8 @@ def create_df(spark, data_gen, left_length, right_length): @ignore_order @pytest.mark.parametrize('data_gen', [ShortGen(nullable=False)], ids=idfn) def test_cogroup_apply_udf(data_gen): - def asof_join(l, r): - return pd.merge_asof(l, r, on='a', by='b') + def asof_join(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame: + return pd.merge_ordered(left, right) def do_it(spark): left, right = create_df(spark, data_gen, 500, 500) diff --git a/jdk-profiles/pom.xml b/jdk-profiles/pom.xml index d9488a8259d..e9d913eb6c6 100644 --- a/jdk-profiles/pom.xml +++ b/jdk-profiles/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.12 - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT com.nvidia rapids-4-spark-jdk-profiles_2.12 pom Shim JDK Profiles - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT jdk9plus diff --git a/jenkins/Jenkinsfile-blossom.premerge-databricks b/jenkins/Jenkinsfile-blossom.premerge-databricks index 86fff7f23be..2fd2df7a8b0 100644 --- a/jenkins/Jenkinsfile-blossom.premerge-databricks +++ b/jenkins/Jenkinsfile-blossom.premerge-databricks @@ -88,7 +88,7 @@ pipeline { // 'name' and 'value' only supprt literal string in the declarative Jenkins // Refer to Jenkins issue https://issues.jenkins.io/browse/JENKINS-62127 name 'DB_RUNTIME' - values '10.4', '11.3', '12.2' + values '10.4', '11.3', '12.2', '13.3' } } stages { diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh index a68b272257b..8a0b25a0c95 100755 --- a/jenkins/databricks/build.sh +++ b/jenkins/databricks/build.sh @@ -144,12 +144,6 @@ if [[ "$WITH_BLOOP" == "1" ]]; then MVN_OPT="ch.epfl.scala:bloop-maven-plugin:bloopInstall $MVN_OPT" fi -# Disabling build for 341db until 24.02 -if [[ "$BUILDVER" == "341db" ]]; then - echo "Databricks 341 is not supported as of release 23.12\n" - exit 1 -fi - # Build the RAPIDS plugin by running package command for databricks $MVN_CMD -B -Ddatabricks -Dbuildver=$BUILDVER clean package -DskipTests $MVN_OPT diff --git a/jenkins/databricks/create.py b/jenkins/databricks/create.py index b0305f92112..289a114d230 100644 --- a/jenkins/databricks/create.py +++ b/jenkins/databricks/create.py @@ -27,7 +27,7 @@ def main(): workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com' token = '' sshkey = '' - cluster_name = 'CI-GPU-databricks-23.12.0-SNAPSHOT' + cluster_name = 'CI-GPU-databricks-24.02.0-SNAPSHOT' idletime = 240 runtime = '7.0.x-gpu-ml-scala2.12' num_workers = 1 diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh index 4af278e3a97..a0c7de590d7 100755 --- a/jenkins/databricks/init_cudf_udf.sh +++ b/jenkins/databricks/init_cudf_udf.sh @@ -20,7 +20,7 @@ set -ex 
-CUDF_VER=${CUDF_VER:-23.12} +CUDF_VER=${CUDF_VER:-24.02} CUDA_VER=${CUDA_VER:-11.8} # Need to explicitly add conda into PATH environment, to activate conda environment. diff --git a/jenkins/deploy.sh b/jenkins/deploy.sh index a85c8618a97..16428e121dc 100755 --- a/jenkins/deploy.sh +++ b/jenkins/deploy.sh @@ -30,6 +30,7 @@ # POM_FILE: Project pom file to be deployed # OUT_PATH: The path where jar files are # CUDA_CLASSIFIERS: Comma separated classifiers, e.g., "cuda11,cuda12" +# CLASSIFIERS: Comma separated classifiers, e.g., "cuda11,cuda12,cuda11-arm64,cuda12-arm64" ### set -ex @@ -48,6 +49,7 @@ ART_GROUP_ID=$(mvnEval $DIST_PL project.groupId) ART_VER=$(mvnEval $DIST_PL project.version) DEFAULT_CUDA_CLASSIFIER=$(mvnEval $DIST_PL cuda.version) CUDA_CLASSIFIERS=${CUDA_CLASSIFIERS:-"$DEFAULT_CUDA_CLASSIFIER"} +CLASSIFIERS=${CLASSIFIERS:-"$CUDA_CLASSIFIERS"} # default as CUDA_CLASSIFIERS for compatibility SQL_PL=${SQL_PL:-"sql-plugin"} POM_FILE=${POM_FILE:-"$DIST_PL/target/parallel-world/META-INF/maven/${ART_GROUP_ID}/${ART_ID}/pom.xml"} @@ -57,9 +59,8 @@ SIGN_TOOL=${SIGN_TOOL:-"gpg"} FPATH="$OUT_PATH/$ART_ID-$ART_VER" DEPLOY_TYPES='' DEPLOY_FILES='' -IFS=',' read -a CUDA_CLASSIFIERS_ARR <<< "$CUDA_CLASSIFIERS" -DEPLOY_TYPES=$(echo $CUDA_CLASSIFIERS | sed -e 's;[^,]*;jar;g') -DEPLOY_FILES=$(echo $CUDA_CLASSIFIERS | sed -e "s;\([^,]*\);${FPATH}-\1.jar;g") +DEPLOY_TYPES=$(echo $CLASSIFIERS | sed -e 's;[^,]*;jar;g') +DEPLOY_FILES=$(echo $CLASSIFIERS | sed -e "s;\([^,]*\);${FPATH}-\1.jar;g") # dist does not have javadoc and sources jars, use 'sql-plugin' instead source jenkins/version-def.sh >/dev/null 2&>1 @@ -103,4 +104,4 @@ $DEPLOY_CMD -DpomFile=$POM_FILE \ -Djavadoc=$FPATH-javadoc.jar \ -Dfiles=$DEPLOY_FILES \ -Dtypes=$DEPLOY_TYPES \ - -Dclassifiers=$CUDA_CLASSIFIERS + -Dclassifiers=$CLASSIFIERS diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh index 969837ee397..5d331686659 100755 --- a/jenkins/spark-nightly-build.sh +++ b/jenkins/spark-nightly-build.sh @@ -42,7 +42,8 @@ ART_GROUP_ID=$(mvnEval project.groupId) ART_VER=$(mvnEval project.version) DEFAULT_CUDA_CLASSIFIER=${DEFAULT_CUDA_CLASSIFIER:-$(mvnEval cuda.version)} # default cuda version CUDA_CLASSIFIERS=${CUDA_CLASSIFIERS:-"$DEFAULT_CUDA_CLASSIFIER"} # e.g. cuda11,cuda12 -IFS=',' read -a CUDA_CLASSIFIERS_ARR <<< "$CUDA_CLASSIFIERS" +CLASSIFIERS=${CLASSIFIERS:-"$CUDA_CLASSIFIERS"} # default as CUDA_CLASSIFIERS for compatibility +IFS=',' read -a CLASSIFIERS_ARR <<< "$CLASSIFIERS" TMP_PATH="/tmp/$(date '+%Y-%m-%d')-$$" DIST_FPATH="$DIST_PL/target/$ART_ID-$ART_VER-$DEFAULT_CUDA_CLASSIFIER" @@ -72,7 +73,7 @@ function distWithReducedPom { deploy) mvnCmd="deploy:deploy-file" - if (( ${#CUDA_CLASSIFIERS_ARR[@]} > 1 )); then + if (( ${#CLASSIFIERS_ARR[@]} > 1 )); then # try move tmp artifacts back to target folder for simplifying separate release process mv ${TMP_PATH}/${ART_ID}-${ART_VER}-*.jar ${DIST_PL}/target/ fi @@ -102,6 +103,13 @@ function distWithReducedPom { # option to skip unit tests. 
Used in our CI to separate test runs in parallel stages SKIP_TESTS=${SKIP_TESTS:-"false"} +if [[ "${SKIP_TESTS}" == "true" ]]; then + # if skip test, we could try speed up build with multiple-threads + MVN="${MVN} -T1C" +fi + +set +H # turn off history expansion +DEPLOY_SUBMODULES=${DEPLOY_SUBMODULES:-"!${DIST_PL}"} # TODO: deploy only required submodules to save time for buildver in "${SPARK_SHIM_VERSIONS[@]:1}"; do $MVN -U -B clean install $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ -Dcuda.version=$DEFAULT_CUDA_CLASSIFIER \ @@ -117,33 +125,44 @@ for buildver in "${SPARK_SHIM_VERSIONS[@]:1}"; do fi distWithReducedPom "install" [[ $SKIP_DEPLOY != 'true' ]] && \ - $MVN -B deploy -pl '!dist' $MVN_URM_MIRROR \ + # this deploys selected submodules + $MVN -B deploy -pl $DEPLOY_SUBMODULES $MVN_URM_MIRROR \ -Dmaven.repo.local=$M2DIR \ -Dcuda.version=$DEFAULT_CUDA_CLASSIFIER \ -DskipTests \ + -Dmaven.scaladoc.skip -Dmaven.scalastyle.skip=true \ -Dbuildver="${buildver}" done installDistArtifact() { - local cuda_classifier="$1" + local cuda_version="$1" + local opt="$2" $MVN -B clean install \ + $opt \ $DIST_PROFILE_OPT \ -Dbuildver=$SPARK_BASE_SHIM_VERSION \ $MVN_URM_MIRROR \ -Dmaven.repo.local=$M2DIR \ - -Dcuda.version=$cuda_classifier \ + -Dcuda.version=$cuda_version \ -DskipTests=$SKIP_TESTS } # build extra cuda classifiers -if (( ${#CUDA_CLASSIFIERS_ARR[@]} > 1 )); then +if (( ${#CLASSIFIERS_ARR[@]} > 1 )); then mkdir -p ${TMP_PATH} - for classifier in "${CUDA_CLASSIFIERS_ARR[@]}"; do + for classifier in "${CLASSIFIERS_ARR[@]}"; do if [ "${classifier}" == "${DEFAULT_CUDA_CLASSIFIER}" ]; then echo "skip default: ${DEFAULT_CUDA_CLASSIFIER} in build extra cuda classifiers step..." continue fi - installDistArtifact ${classifier} + + opt="" + if [[ "${classifier}" == *"-arm64" ]]; then + opt="-Parm64" + fi + # pass cuda version and extra opt + installDistArtifact ${classifier%%-*} ${opt} + # move artifacts to temp for deployment later artifactFile="${ART_ID}-${ART_VER}-${classifier}.jar" mv ${DIST_PL}/target/${artifactFile} ${TMP_PATH}/ @@ -161,10 +180,11 @@ distWithReducedPom "install" if [[ $SKIP_DEPLOY != 'true' ]]; then distWithReducedPom "deploy" - # this deploys submodules except dist that is unconditionally built with Spark 3.1.1 - $MVN -B deploy -pl '!dist' \ + # this deploys selected submodules that is unconditionally built with Spark 3.1.1 + $MVN -B deploy -pl $DEPLOY_SUBMODULES \ -Dbuildver=$SPARK_BASE_SHIM_VERSION \ - -DskipTests=$SKIP_TESTS \ + -DskipTests \ + -Dmaven.scaladoc.skip -Dmaven.scalastyle.skip=true \ $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ -Dcuda.version=$DEFAULT_CUDA_CLASSIFIER fi diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh index ebf75617d99..98894c0d548 100755 --- a/jenkins/version-def.sh +++ b/jenkins/version-def.sh @@ -26,10 +26,10 @@ for VAR in $OVERWRITE_PARAMS; do done IFS=$PRE_IFS -CUDF_VER=${CUDF_VER:-"23.12.0-SNAPSHOT"} +CUDF_VER=${CUDF_VER:-"24.02.0-SNAPSHOT"} CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"} -PROJECT_VER=${PROJECT_VER:-"23.12.0-SNAPSHOT"} -PROJECT_TEST_VER=${PROJECT_TEST_VER:-"23.12.0-SNAPSHOT"} +PROJECT_VER=${PROJECT_VER:-"24.02.0-SNAPSHOT"} +PROJECT_TEST_VER=${PROJECT_TEST_VER:-"24.02.0-SNAPSHOT"} SPARK_VER=${SPARK_VER:-"3.1.1"} SPARK_VER_213=${SPARK_VER_213:-"3.3.0"} # Make a best attempt to set the default value for the shuffle shim. 
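The nightly build and deploy scripts above now accept a `CLASSIFIERS` list that can mix plain CUDA classifiers with `-arm64` variants; for each extra classifier the script derives the CUDA version with `${classifier%%-*}` and adds the `-Parm64` Maven profile when the classifier ends in `-arm64`. The snippet below is a small Python illustration of that mapping, written only to make the shell parameter expansion explicit; it is not part of the CI scripts.

```python
# Illustrates how jenkins/spark-nightly-build.sh maps a classifier such as
# "cuda12-arm64" onto the options it builds with: the CUDA version comes from
# ${classifier%%-*} and -Parm64 is added for arm64 variants. Not used by CI.
def build_options(classifier: str):
    cuda_version = classifier.split("-", 1)[0]   # "cuda12-arm64" -> "cuda12"
    profile = "-Parm64" if classifier.endswith("-arm64") else ""
    return cuda_version, profile

for c in ["cuda11", "cuda12", "cuda11-arm64", "cuda12-arm64"]:
    print(c, "->", build_options(c))
```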
diff --git a/pom.xml b/pom.xml index e8086a35d06..744e9dd2985 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,7 @@ rapids-4-spark-parent_2.12 RAPIDS Accelerator for Apache Spark Root Project The root project of the RAPIDS Accelerator for Apache Spark - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT pom https://nvidia.github.io/spark-rapids/ @@ -660,8 +660,8 @@ spark${buildver} cuda11 ${cuda.version} - 23.12.0-SNAPSHOT - 23.12.0-SNAPSHOT + 24.02.0-SNAPSHOT + 24.02.0-SNAPSHOT 2.12 2.8.0 incremental @@ -771,7 +771,8 @@ 321db, 330db, - 332db + 332db, + 341db
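With `341db` added to the Databricks shim list above and `13.3` added to the premerge `DB_RUNTIME` matrix earlier in this diff, tests can gate Databricks 13.3-specific behavior by combining the new `is_spark_341()` helper with `is_databricks_runtime()`, as the `udf_test.py` change does for the running-window case. Below is a minimal, hypothetical sketch of such a gate; the test name and body are placeholders, and only the two helper functions come from this change.

```python
# Hypothetical example of gating a test on Databricks 13.3 (the 341db shim) using
# the helpers touched in this change; the test body is a placeholder.
import pytest
from spark_session import is_databricks_runtime, is_spark_341

on_databricks_13_3 = is_databricks_runtime() and is_spark_341()

@pytest.mark.xfail(condition=on_databricks_13_3,
                   reason="placeholder: behavior known to differ on the 341db shim")
def test_placeholder_databricks_13_3_gate():
    assert 1 + 1 == 2  # stand-in body; a real test would exercise plugin behavior
```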