diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index 93557017b08..1d7b0ab8e0b 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -77,7 +77,8 @@ jobs: github.actor == 'Feng-Jiang28' || github.actor == 'SurajAralihalli' || github.actor == 'jihoonson' || - github.actor == 'ustcfy' + github.actor == 'ustcfy' || + github.actor == 'knoguchi22' ) steps: - name: Check if comment is issued by authorized person diff --git a/.github/workflows/mvn-verify-check.yml b/.github/workflows/mvn-verify-check.yml index 86e12a4a32b..0aca7bc3655 100644 --- a/.github/workflows/mvn-verify-check.yml +++ b/.github/workflows/mvn-verify-check.yml @@ -246,12 +246,10 @@ jobs: echo "Generated Scala 2.13 build files don't match what's in repository" exit 1 fi - # change to Scala 2.13 Directory - cd scala2.13 # test command, will retry for 3 times if failed. max_retry=3; delay=30; i=1 while true; do - mvn package \ + mvn package -f scala2.13/ \ -pl integration_tests,tests,tools -am -P 'individual,pre-merge' \ -Dbuildver=${{ matrix.spark-version }} -Dmaven.scalastyle.skip=true \ -Drat.skip=true ${{ env.COMMON_MVN_FLAGS }} && break || { @@ -303,12 +301,10 @@ jobs: echo "Generated Scala 2.13 build files don't match what's in repository" exit 1 fi - # change to Scala 2.13 Directory - cd scala2.13 # test command, will retry for 3 times if failed. max_retry=3; delay=30; i=1 while true; do - mvn verify \ + mvn verify -f scala2.13/ \ -P "individual,pre-merge,source-javadoc" -Dbuildver=${{ matrix.spark-version }} \ ${{ env.COMMON_MVN_FLAGS }} && break || { if [[ $i -le $max_retry ]]; then diff --git a/CHANGELOG.md b/CHANGELOG.md index 2510eba5dfe..c7786d8586a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,162 @@ # Change log -Generated on 2024-10-31 +Generated on 2024-12-08 + +## Release 24.12 + +### Features +||| +|:---|:---| +|[#11630](https://github.com/NVIDIA/spark-rapids/issues/11630)|[FEA] enable from_json and json scan by default| +|[#11709](https://github.com/NVIDIA/spark-rapids/issues/11709)|[FEA] Add support for `MonthsBetween`| +|[#11666](https://github.com/NVIDIA/spark-rapids/issues/11666)|[FEA] support task limit profiling for specified stages| +|[#11662](https://github.com/NVIDIA/spark-rapids/issues/11662)|[FEA] Support Apache Spark 3.4.4| +|[#11657](https://github.com/NVIDIA/spark-rapids/issues/11657)|[FEA] Support format 'yyyyMMdd HH:mm:ss' for legacy mode| +|[#11419](https://github.com/NVIDIA/spark-rapids/issues/11419)|[FEA] Support Spark 3.5.3 release| +|[#11492](https://github.com/NVIDIA/spark-rapids/issues/11492)|[FEA] Update rapids JNI and private dependency version to 24.12.0-SNAPSHOT| +|[#11505](https://github.com/NVIDIA/spark-rapids/issues/11505)|[FEA] Support yyyymmdd format for GetTimestamp for LEGACY mode.| + +### Performance +||| +|:---|:---| +|[#8391](https://github.com/NVIDIA/spark-rapids/issues/8391)|[FEA] Do a hash based re-partition instead of a sort based fallback for hash aggregate| +|[#11560](https://github.com/NVIDIA/spark-rapids/issues/11560)|[FEA] Improve `GpuJsonToStructs` performance| +|[#11458](https://github.com/NVIDIA/spark-rapids/issues/11458)|[FEA] enable prune_columns for from_json| + +### Bugs Fixed +||| +|:---|:---| +|[#11798](https://github.com/NVIDIA/spark-rapids/issues/11798)|[BUG] mismatch CPU and GPU result in test_months_between_first_day[DATAGEN_SEED=1733006411, TZ=Africa/Casablanca]| +|[#11790](https://github.com/NVIDIA/spark-rapids/issues/11790)|[BUG] test_hash_* failed 
"java.util.NoSuchElementException: head of empty list" or "Too many times of repartition, may hit a bug?"| +|[#11643](https://github.com/NVIDIA/spark-rapids/issues/11643)|[BUG] Support AQE with Broadcast Hash Join and DPP on Databricks 14.3| +|[#10910](https://github.com/NVIDIA/spark-rapids/issues/10910)|from_json, when input = empty object, rapids throws an exception.| +|[#10891](https://github.com/NVIDIA/spark-rapids/issues/10891)|Parsing a column containing invalid json into StructureType with schema throws an Exception.| +|[#11741](https://github.com/NVIDIA/spark-rapids/issues/11741)|[BUG] Fix spark400 build due to writeWithV1 return value change| +|[#11533](https://github.com/NVIDIA/spark-rapids/issues/11533)|Fix JSON Matrix tests on Databricks 14.3| +|[#11722](https://github.com/NVIDIA/spark-rapids/issues/11722)|[BUG] Spark 4.0.0 has moved `NullIntolerant` and builds are breaking because they are unable to find it.| +|[#11726](https://github.com/NVIDIA/spark-rapids/issues/11726)|[BUG] Databricks 14.3 nightly deploy fails due to incorrect DB_SHIM_NAME| +|[#11293](https://github.com/NVIDIA/spark-rapids/issues/11293)|[BUG] A user query with from_json failed with "JSON Parser encountered an invalid format at location"| +|[#9592](https://github.com/NVIDIA/spark-rapids/issues/9592)|[BUG][JSON] `from_json` to Map type should produce null for invalid entries| +|[#11715](https://github.com/NVIDIA/spark-rapids/issues/11715)|[BUG] parquet_testing_test.py failed on "AssertionError: GPU and CPU boolean values are different"| +|[#11716](https://github.com/NVIDIA/spark-rapids/issues/11716)|[BUG] delta_lake_write_test.py failed on "AssertionError: GPU and CPU boolean values are different"| +|[#11684](https://github.com/NVIDIA/spark-rapids/issues/11684)|[BUG] 24.12 Precommit fails with wrong number of arguments in `GpuDataSource`| +|[#11168](https://github.com/NVIDIA/spark-rapids/issues/11168)|[BUG] reserve allocation should be displayed when erroring due to lack of memory on startup| +|[#7585](https://github.com/NVIDIA/spark-rapids/issues/7585)|[BUG] [Regexp] Line anchor '$' incorrect matching of unicode line terminators| +|[#11622](https://github.com/NVIDIA/spark-rapids/issues/11622)|[BUG] GPU Parquet scan filter pushdown fails with timestamp/INT96 column| +|[#11646](https://github.com/NVIDIA/spark-rapids/issues/11646)|[BUG] NullPointerException in GpuRand| +|[#10498](https://github.com/NVIDIA/spark-rapids/issues/10498)|[BUG] Unit tests failed: [INTERVAL_ARITHMETIC_OVERFLOW] integer overflow. 
Use 'try_add' to tolerate overflow and return NULL instead| +|[#11659](https://github.com/NVIDIA/spark-rapids/issues/11659)|[BUG] parse_url throws exception if partToExtract is invalid while Spark returns null| +|[#10894](https://github.com/NVIDIA/spark-rapids/issues/10894)|Parsing a column containing a nested structure to json thows an exception| +|[#10895](https://github.com/NVIDIA/spark-rapids/issues/10895)|Converting a column containing a map into json throws an exception| +|[#10896](https://github.com/NVIDIA/spark-rapids/issues/10896)|Converting an column containing an array into json throws an exception| +|[#10915](https://github.com/NVIDIA/spark-rapids/issues/10915)|to_json when converts an array will throw an exception:| +|[#10916](https://github.com/NVIDIA/spark-rapids/issues/10916)|to_json function doesn't support map[string, struct] to json conversion.| +|[#10919](https://github.com/NVIDIA/spark-rapids/issues/10919)|to_json converting map[string, integer] to json, throws an exception| +|[#10920](https://github.com/NVIDIA/spark-rapids/issues/10920)|to_json converting an array with maps throws an exception.| +|[#10921](https://github.com/NVIDIA/spark-rapids/issues/10921)|to_json - array with single map| +|[#10923](https://github.com/NVIDIA/spark-rapids/issues/10923)|[BUG] Spark UT framework: to_json function to convert the array with a single empty row to a JSON string throws an exception.| +|[#10924](https://github.com/NVIDIA/spark-rapids/issues/10924)|[BUG] Spark UT framework: to_json when converts an empty array into json throws an exception. | +|[#11024](https://github.com/NVIDIA/spark-rapids/issues/11024)|Fix tests failures in parquet_write_test.py| +|[#11174](https://github.com/NVIDIA/spark-rapids/issues/11174)|Opcode Suite fails for Scala 2.13.8+ | +|[#10483](https://github.com/NVIDIA/spark-rapids/issues/10483)|[BUG] JsonToStructs fails to parse all empty dicts and invalid lines| +|[#10489](https://github.com/NVIDIA/spark-rapids/issues/10489)|[BUG] from_json does not support input with \n in it.| +|[#10347](https://github.com/NVIDIA/spark-rapids/issues/10347)|[BUG] Failures in Integration Tests on Dataproc Serverless| +|[#11021](https://github.com/NVIDIA/spark-rapids/issues/11021)|Fix tests failures in orc_cast_test.py| +|[#11609](https://github.com/NVIDIA/spark-rapids/issues/11609)|[BUG] test_hash_repartition_long_overflow_ansi_exception failed on 341DB| +|[#11600](https://github.com/NVIDIA/spark-rapids/issues/11600)|[BUG] regex_test failed mismatched cpu and gpu values in UT and IT| +|[#11611](https://github.com/NVIDIA/spark-rapids/issues/11611)|[BUG] Spark 4.0 build failure - value cannotSaveIntervalIntoExternalStorageError is not a member of object org.apache.spark.sql.errors.QueryCompilationErrors| +|[#10922](https://github.com/NVIDIA/spark-rapids/issues/10922)|from_json cannot support line separator in the input string.| +|[#11009](https://github.com/NVIDIA/spark-rapids/issues/11009)|Fix tests failures in cast_test.py| +|[#11572](https://github.com/NVIDIA/spark-rapids/issues/11572)|[BUG] MultiFileReaderThreadPool may flood the console with log messages| + +### PRs +||| +|:---|:---| +|[#11763](https://github.com/NVIDIA/spark-rapids/pull/11763)|Orc writes don't fully support Booleans with nulls | +|[#11802](https://github.com/NVIDIA/spark-rapids/pull/11802)|Fall back to CPU for non-UTC months_between| +|[#11792](https://github.com/NVIDIA/spark-rapids/pull/11792)|[BUG] Fix issue 11790| +|[#11712](https://github.com/NVIDIA/spark-rapids/pull/11712)|repartition-based 
fallback for hash aggregate v3| +|[#11730](https://github.com/NVIDIA/spark-rapids/pull/11730)|Add support for asynchronous writing for parquet| +|[#11750](https://github.com/NVIDIA/spark-rapids/pull/11750)|Fix aqe_test failures on 14.3.| +|[#11753](https://github.com/NVIDIA/spark-rapids/pull/11753)|Enable JSON Scan and from_json by default| +|[#11733](https://github.com/NVIDIA/spark-rapids/pull/11733)|Print out the current attempt object when OOM inside a retry block| +|[#11618](https://github.com/NVIDIA/spark-rapids/pull/11618)|Execute `from_json` with struct schema using `JSONUtils.fromJSONToStructs`| +|[#11725](https://github.com/NVIDIA/spark-rapids/pull/11725)|host watermark metric| +|[#11746](https://github.com/NVIDIA/spark-rapids/pull/11746)|Remove batch size bytes limits| +|[#11723](https://github.com/NVIDIA/spark-rapids/pull/11723)|Add NVIDIA Copyright| +|[#11721](https://github.com/NVIDIA/spark-rapids/pull/11721)|Add a few more JSON tests for MAP| +|[#11744](https://github.com/NVIDIA/spark-rapids/pull/11744)|Do not package the Databricks 14.3 shim into the dist jar [skip ci]| +|[#11724](https://github.com/NVIDIA/spark-rapids/pull/11724)|Integrate with kudo| +|[#11739](https://github.com/NVIDIA/spark-rapids/pull/11739)|Update to Spark 4.0 changing signature of SupportsV1Write.writeWithV1| +|[#11737](https://github.com/NVIDIA/spark-rapids/pull/11737)|Add in support for months_between| +|[#11700](https://github.com/NVIDIA/spark-rapids/pull/11700)|Fix leak with RapidsHostColumnBuilder in GpuUserDefinedFunction| +|[#11727](https://github.com/NVIDIA/spark-rapids/pull/11727)|Widen type promotion for decimals with larger scale in Parquet Read| +|[#11719](https://github.com/NVIDIA/spark-rapids/pull/11719)|Skip `from_json` overflow tests for 14.3| +|[#11708](https://github.com/NVIDIA/spark-rapids/pull/11708)|Support profiling for specific stages on a limited number of tasks| +|[#11731](https://github.com/NVIDIA/spark-rapids/pull/11731)|Add NullIntolerantShim to adapt to Spark 4.0 removing NullIntolerant| +|[#11413](https://github.com/NVIDIA/spark-rapids/pull/11413)|Support multi string contains| +|[#11728](https://github.com/NVIDIA/spark-rapids/pull/11728)|Change Databricks 14.3 shim name to spark350db143 [skip ci]| +|[#11702](https://github.com/NVIDIA/spark-rapids/pull/11702)|Improve JSON scan and `from_json`| +|[#11635](https://github.com/NVIDIA/spark-rapids/pull/11635)|Added Shims for adding Databricks 14.3 Support| +|[#11714](https://github.com/NVIDIA/spark-rapids/pull/11714)|Let AWS Databricks automatically choose an Availability Zone| +|[#11703](https://github.com/NVIDIA/spark-rapids/pull/11703)|Simplify $ transpiling and fix newline character bug| +|[#11707](https://github.com/NVIDIA/spark-rapids/pull/11707)|impalaFile cannot be found by UT framework. 
| +|[#11697](https://github.com/NVIDIA/spark-rapids/pull/11697)|Make delta-lake shim dependencies parametrizable| +|[#11710](https://github.com/NVIDIA/spark-rapids/pull/11710)|Add shim version 344 to LogicalPlanShims.scala| +|[#11706](https://github.com/NVIDIA/spark-rapids/pull/11706)|Add retry support in sub hash join| +|[#11673](https://github.com/NVIDIA/spark-rapids/pull/11673)|Fix Parquet Writer tests on 14.3| +|[#11669](https://github.com/NVIDIA/spark-rapids/pull/11669)|Fix `string_test` for 14.3| +|[#11692](https://github.com/NVIDIA/spark-rapids/pull/11692)|Add Spark 3.4.4 Shim | +|[#11695](https://github.com/NVIDIA/spark-rapids/pull/11695)|Fix spark400 build due to LogicalRelation signature changes| +|[#11689](https://github.com/NVIDIA/spark-rapids/pull/11689)|Update the Maven repository to download Spark JAR files [skip ci]| +|[#11670](https://github.com/NVIDIA/spark-rapids/pull/11670)|Fix `misc_expr_test` for 14.3| +|[#11652](https://github.com/NVIDIA/spark-rapids/pull/11652)|Fix skipping fixed_length_char ORC tests on > 13.3| +|[#11644](https://github.com/NVIDIA/spark-rapids/pull/11644)|Skip AQE-join-DPP tests for 14.3| +|[#11667](https://github.com/NVIDIA/spark-rapids/pull/11667)|Preparation for the coming Kudo support| +|[#11685](https://github.com/NVIDIA/spark-rapids/pull/11685)|Exclude shimplify-generated files from scalastyle| +|[#11282](https://github.com/NVIDIA/spark-rapids/pull/11282)|Reserve allocation should be displayed when erroring due to lack of memory on startup| +|[#11671](https://github.com/NVIDIA/spark-rapids/pull/11671)|Use the new host memory allocation API| +|[#11682](https://github.com/NVIDIA/spark-rapids/pull/11682)|Fix auto merge conflict 11679 [skip ci]| +|[#11663](https://github.com/NVIDIA/spark-rapids/pull/11663)|Simplify Transpilation of $ with Extended Line Separator Support in cuDF Regex| +|[#11672](https://github.com/NVIDIA/spark-rapids/pull/11672)|Fix race condition with Parquet filter pushdown modifying shared hadoop Configuration| +|[#11596](https://github.com/NVIDIA/spark-rapids/pull/11596)|Add a new NVTX range for task GPU ownership| +|[#11664](https://github.com/NVIDIA/spark-rapids/pull/11664)|Fix `orc_write_test.py` for 14.3| +|[#11656](https://github.com/NVIDIA/spark-rapids/pull/11656)|[DOC] update the supported OS in download page [skip ci]| +|[#11665](https://github.com/NVIDIA/spark-rapids/pull/11665)|Generate classes identical up to the shim package name| +|[#11647](https://github.com/NVIDIA/spark-rapids/pull/11647)|Fix a NPE issue in GpuRand| +|[#11658](https://github.com/NVIDIA/spark-rapids/pull/11658)|Support format 'yyyyMMdd HH:mm:ss' for legacy mode| +|[#11661](https://github.com/NVIDIA/spark-rapids/pull/11661)|Support invalid partToExtract for parse_url| +|[#11520](https://github.com/NVIDIA/spark-rapids/pull/11520)|UT adjust override checkScanSchemata & enabling ut of exclude_by_suffix fea.| +|[#11634](https://github.com/NVIDIA/spark-rapids/pull/11634)|Put DF_UDF plugin code into the main uber jar.| +|[#11522](https://github.com/NVIDIA/spark-rapids/pull/11522)|UT adjust test SPARK-26677: negated null-safe equality comparison| +|[#11521](https://github.com/NVIDIA/spark-rapids/pull/11521)|Datetime rebasing issue fixed| +|[#11642](https://github.com/NVIDIA/spark-rapids/pull/11642)|Update to_json to be more generic and fix some bugs| +|[#11615](https://github.com/NVIDIA/spark-rapids/pull/11615)|Spark 4 parquet_writer_test.py fixes| +|[#11623](https://github.com/NVIDIA/spark-rapids/pull/11623)|Fix `collection_ops_test` for 14.3| 
+|[#11553](https://github.com/NVIDIA/spark-rapids/pull/11553)|Fix udf-compiler scala2.13 internal return statements| +|[#11640](https://github.com/NVIDIA/spark-rapids/pull/11640)|Disable date/timestamp types by default when parsing JSON| +|[#11570](https://github.com/NVIDIA/spark-rapids/pull/11570)|Add support for Spark 3.5.3| +|[#11591](https://github.com/NVIDIA/spark-rapids/pull/11591)|Spark UT framework: Read Parquet file generated by parquet-thrift Rapids, UT case adjust.| +|[#11631](https://github.com/NVIDIA/spark-rapids/pull/11631)|Update JSON tests based on a closed/fixed issues| +|[#11617](https://github.com/NVIDIA/spark-rapids/pull/11617)|Quick fix for the build script failure of Scala 2.13 jars [skip ci]| +|[#11614](https://github.com/NVIDIA/spark-rapids/pull/11614)|Ensure repartition overflow test always overflows| +|[#11612](https://github.com/NVIDIA/spark-rapids/pull/11612)|Revert "Disable regex tests to unblock CI (#11606)"| +|[#11597](https://github.com/NVIDIA/spark-rapids/pull/11597)|`install_deps` changes for Databricks 14.3| +|[#11608](https://github.com/NVIDIA/spark-rapids/pull/11608)|Use mvn -f scala2.13/ in the build scripts to build the 2.13 jars| +|[#11610](https://github.com/NVIDIA/spark-rapids/pull/11610)|Change DataSource calendar interval error to fix spark400 build| +|[#11549](https://github.com/NVIDIA/spark-rapids/pull/11549)|Adopt `JSONUtils.concatenateJsonStrings` for concatenating JSON strings| +|[#11595](https://github.com/NVIDIA/spark-rapids/pull/11595)|Remove an unused config shuffle.spillThreads| +|[#11606](https://github.com/NVIDIA/spark-rapids/pull/11606)|Disable regex tests to unblock CI| +|[#11605](https://github.com/NVIDIA/spark-rapids/pull/11605)|Fix auto merge conflict 11604 [skip ci]| +|[#11587](https://github.com/NVIDIA/spark-rapids/pull/11587)|avoid long tail tasks due to PrioritySemaphore, remaing part| +|[#11574](https://github.com/NVIDIA/spark-rapids/pull/11574)|avoid long tail tasks due to PrioritySemaphore| +|[#11559](https://github.com/NVIDIA/spark-rapids/pull/11559)|[Spark 4.0] Address test failures in cast_test.py| +|[#11579](https://github.com/NVIDIA/spark-rapids/pull/11579)|Fix merge conflict with branch-24.10| +|[#11571](https://github.com/NVIDIA/spark-rapids/pull/11571)|Log reconfigure multi-file thread pool only once| +|[#11564](https://github.com/NVIDIA/spark-rapids/pull/11564)|Disk spill metric| +|[#11561](https://github.com/NVIDIA/spark-rapids/pull/11561)|Add in a basic plugin for dataframe UDF support in Apache Spark| +|[#11563](https://github.com/NVIDIA/spark-rapids/pull/11563)|Fix the latest merge conflict in integration tests| +|[#11542](https://github.com/NVIDIA/spark-rapids/pull/11542)|Update rapids JNI and private dependency to 24.12.0-SNAPSHOT [skip ci]| +|[#11493](https://github.com/NVIDIA/spark-rapids/pull/11493)|Support legacy mode for yyyymmdd format| ## Release 24.10 @@ -69,15 +226,21 @@ Generated on 2024-10-31 ### PRs ||| |:---|:---| +|[#11683](https://github.com/NVIDIA/spark-rapids/pull/11683)|[DOC] update download page for 2410 hot fix release [skip ci]| +|[#11680](https://github.com/NVIDIA/spark-rapids/pull/11680)|Update latest changelog [skip ci]| +|[#11678](https://github.com/NVIDIA/spark-rapids/pull/11678)|Update version to 24.10.1-SNAPSHOT [skip ci]| |[#11676](https://github.com/NVIDIA/spark-rapids/pull/11676)| Fix race condition with Parquet filter pushdown modifying shared hadoop Configuration| |[#11626](https://github.com/NVIDIA/spark-rapids/pull/11626)|Update latest changelog [skip ci]| 
|[#11624](https://github.com/NVIDIA/spark-rapids/pull/11624)|Update the download link [skip ci]| |[#11577](https://github.com/NVIDIA/spark-rapids/pull/11577)|Update latest changelog [skip ci]| |[#11576](https://github.com/NVIDIA/spark-rapids/pull/11576)|Update rapids JNI and private dependency to 24.10.0| |[#11582](https://github.com/NVIDIA/spark-rapids/pull/11582)|[DOC] update doc for 24.10 release [skip ci]| +|[#11414](https://github.com/NVIDIA/spark-rapids/pull/11414)|Fix `collection_ops_tests` for Spark 4.0| |[#11588](https://github.com/NVIDIA/spark-rapids/pull/11588)|backport fixes of #11573 to branch 24.10| |[#11569](https://github.com/NVIDIA/spark-rapids/pull/11569)|Have "dump always" dump input files before trying to decode them| +|[#11544](https://github.com/NVIDIA/spark-rapids/pull/11544)|Update test case related to LEACY datetime format to unblock nightly CI| |[#11567](https://github.com/NVIDIA/spark-rapids/pull/11567)|Fix test case unix_timestamp(col, 'yyyyMMdd') failed for Africa/Casablanca timezone and LEGACY mode| +|[#11519](https://github.com/NVIDIA/spark-rapids/pull/11519)|Spark 4: Fix parquet_test.py| |[#11496](https://github.com/NVIDIA/spark-rapids/pull/11496)|Update test now that code is fixed| |[#11548](https://github.com/NVIDIA/spark-rapids/pull/11548)|Fix negative rs. shuffle write time| |[#11545](https://github.com/NVIDIA/spark-rapids/pull/11545)|Update test case related to LEACY datetime format to unblock nightly CI| @@ -157,215 +320,6 @@ Generated on 2024-10-31 |[#11280](https://github.com/NVIDIA/spark-rapids/pull/11280)|Asynchronously copy table data to the host during shuffle| |[#11258](https://github.com/NVIDIA/spark-rapids/pull/11258)|Explicitly disable ANSI mode for ast_test.py| |[#11267](https://github.com/NVIDIA/spark-rapids/pull/11267)|Update the rapids JNI and private dependency version to 24.10.0-SNAPSHOT| -|[#11241](https://github.com/NVIDIA/spark-rapids/pull/11241)|Auto merge PRs to branch-24.10 from branch-24.08 [skip ci]| -|[#11231](https://github.com/NVIDIA/spark-rapids/pull/11231)|Cache dependencies for scala 2.13 [skip ci]| - -## Release 24.08 - -### Features -||| -|:---|:---| -|[#9259](https://github.com/NVIDIA/spark-rapids/issues/9259)|[FEA] Create Spark 4.0.0 shim and build env| -|[#10366](https://github.com/NVIDIA/spark-rapids/issues/10366)|[FEA] It would be nice if we could support Hive-style write bucketing table| -|[#10987](https://github.com/NVIDIA/spark-rapids/issues/10987)|[FEA] Implement lore framework to support all operators.| -|[#11087](https://github.com/NVIDIA/spark-rapids/issues/11087)|[FEA] Support regex pattern with brackets when rewrite to PrefixRange patten in rlike| -|[#22](https://github.com/NVIDIA/spark-rapids/issues/22)|[FEA] Add support for bucketed writes| -|[#9939](https://github.com/NVIDIA/spark-rapids/issues/9939)|[FEA] `GpuInsertIntoHiveTable` supports parquet format| - -### Performance -||| -|:---|:---| -|[#8750](https://github.com/NVIDIA/spark-rapids/issues/8750)|[FEA] Rework GpuSubstringIndex to use cudf::slice_strings| -|[#7404](https://github.com/NVIDIA/spark-rapids/issues/7404)|[FEA] explore a hash agg passthrough on partial aggregates| -|[#10976](https://github.com/NVIDIA/spark-rapids/issues/10976)|Rewrite `pattern1|pattern2|pattern3` to multiple contains in `rlike`| - -### Bugs Fixed -||| -|:---|:---| -|[#11287](https://github.com/NVIDIA/spark-rapids/issues/11287)|[BUG] String split APIs on empty string produce incorrect result| -|[#11270](https://github.com/NVIDIA/spark-rapids/issues/11270)|[BUG] 
test_regexp_replace[DATAGEN_SEED=1722297411, TZ=UTC] hanging there forever in pre-merge CI intermittently| -|[#9682](https://github.com/NVIDIA/spark-rapids/issues/9682)|[BUG] Casting FLOAT64 to DECIMAL(12,7) produces different rows from Apache Spark CPU| -|[#10809](https://github.com/NVIDIA/spark-rapids/issues/10809)|[BUG] cast(9.95 as decimal(3,1)), actual: 9.9, expected: 10.0| -|[#11266](https://github.com/NVIDIA/spark-rapids/issues/11266)|[BUG] test_broadcast_hash_join_constant_keys failed in databricks runtimes| -|[#11243](https://github.com/NVIDIA/spark-rapids/issues/11243)|[BUG] ArrayIndexOutOfBoundsException on a left outer join| -|[#11030](https://github.com/NVIDIA/spark-rapids/issues/11030)|Fix tests failures in string_test.py| -|[#11245](https://github.com/NVIDIA/spark-rapids/issues/11245)|[BUG] mvn verify for the source-javadoc fails and no pre-merge check catches it| -|[#11223](https://github.com/NVIDIA/spark-rapids/issues/11223)|[BUG] Remove unreferenced `CUDF_VER=xxx` in the CI script| -|[#11114](https://github.com/NVIDIA/spark-rapids/issues/11114)|[BUG] Update nightly tests for Scala 2.13 to use JDK 17 only| -|[#11229](https://github.com/NVIDIA/spark-rapids/issues/11229)|[BUG] test_delta_name_column_mapping_no_field_ids fails on Spark | -|[#11031](https://github.com/NVIDIA/spark-rapids/issues/11031)|Fix tests failures in multiple files | -|[#10948](https://github.com/NVIDIA/spark-rapids/issues/10948)|Figure out why `MapFromArrays ` appears in the tests for hive parquet write| -|[#11018](https://github.com/NVIDIA/spark-rapids/issues/11018)|Fix tests failures in hash_aggregate_test.py| -|[#11173](https://github.com/NVIDIA/spark-rapids/issues/11173)|[BUG] The `rs. serialization time` metric is misleading| -|[#11017](https://github.com/NVIDIA/spark-rapids/issues/11017)|Fix tests failures in url_test.py| -|[#11201](https://github.com/NVIDIA/spark-rapids/issues/11201)|[BUG] Delta Lake tables with name mapping can throw exceptions on read| -|[#11175](https://github.com/NVIDIA/spark-rapids/issues/11175)|[BUG] Clean up unused and duplicated 'org/roaringbitmap' folder in the spark3xx shims| -|[#11196](https://github.com/NVIDIA/spark-rapids/issues/11196)|[BUG] pipeline failed due to class not found exception: NoClassDefFoundError: com/nvidia/spark/rapids/GpuScalar| -|[#11189](https://github.com/NVIDIA/spark-rapids/issues/11189)|[BUG] regression in NDS after PR #11170| -|[#11167](https://github.com/NVIDIA/spark-rapids/issues/11167)|[BUG] UnsupportedOperationException during delta write with `optimize()`| -|[#11172](https://github.com/NVIDIA/spark-rapids/issues/11172)|[BUG] `get_json_object` returns wrong output with wildcard path| -|[#11148](https://github.com/NVIDIA/spark-rapids/issues/11148)|[BUG] Integration test `test_write_hive_bucketed_table` fails| -|[#11155](https://github.com/NVIDIA/spark-rapids/issues/11155)|[BUG] ArrayIndexOutOfBoundsException in BatchWithPartitionData.splitColumnarBatch| -|[#11152](https://github.com/NVIDIA/spark-rapids/issues/11152)|[BUG] LORE dumping consumes too much memory.| -|[#11029](https://github.com/NVIDIA/spark-rapids/issues/11029)|Fix tests failures in subquery_test.py| -|[#11150](https://github.com/NVIDIA/spark-rapids/issues/11150)|[BUG] hive_parquet_write_test.py::test_insert_hive_bucketed_table failure| -|[#11070](https://github.com/NVIDIA/spark-rapids/issues/11070)|[BUG] numpy2 fail fastparquet cases: numpy.dtype size changed| -|[#11136](https://github.com/NVIDIA/spark-rapids/issues/11136)|UnaryPositive expression doesn't extend 
UnaryExpression| -|[#11122](https://github.com/NVIDIA/spark-rapids/issues/11122)|[BUG] UT MetricRange failed 651070526 was not less than 1.5E8 in spark313| -|[#11119](https://github.com/NVIDIA/spark-rapids/issues/11119)|[BUG] window_function_test.py::test_window_group_limits_fallback_for_row_number fails in a distributed environment| -|[#11023](https://github.com/NVIDIA/spark-rapids/issues/11023)|Fix tests failures in dpp_test.py| -|[#11026](https://github.com/NVIDIA/spark-rapids/issues/11026)|Fix tests failures in map_test.py| -|[#11020](https://github.com/NVIDIA/spark-rapids/issues/11020)|Fix tests failures in grouping_sets_test.py| -|[#11113](https://github.com/NVIDIA/spark-rapids/issues/11113)|[BUG] Update premerge tests for Scala 2.13 to use JDK 17 only| -|[#11027](https://github.com/NVIDIA/spark-rapids/issues/11027)|Fix tests failures in sort_test.py| -|[#10775](https://github.com/NVIDIA/spark-rapids/issues/10775)|[BUG] Issues found by Spark UT Framework on RapidsStringExpressionsSuite| -|[#11033](https://github.com/NVIDIA/spark-rapids/issues/11033)|[BUG] CICD failed a case: cmp_test.py::test_empty_filter[>]| -|[#11103](https://github.com/NVIDIA/spark-rapids/issues/11103)|[BUG] UCX Shuffle With scala.MatchError | -|[#11007](https://github.com/NVIDIA/spark-rapids/issues/11007)|Fix tests failures in array_test.py| -|[#10801](https://github.com/NVIDIA/spark-rapids/issues/10801)|[BUG] JDK17 nightly build after Spark UT Framework is merged| -|[#11019](https://github.com/NVIDIA/spark-rapids/issues/11019)|Fix tests failures in window_function_test.py| -|[#11063](https://github.com/NVIDIA/spark-rapids/issues/11063)|[BUG] op time for GpuCoalesceBatches is more than actual| -|[#11006](https://github.com/NVIDIA/spark-rapids/issues/11006)|Fix test failures in arithmetic_ops_test.py| -|[#10995](https://github.com/NVIDIA/spark-rapids/issues/10995)|Fallback TimeZoneAwareExpression that only support UTC with zoneId instead of timeZone config| -|[#8652](https://github.com/NVIDIA/spark-rapids/issues/8652)|[BUG] array_item test failures on Spark 3.3.x| -|[#11053](https://github.com/NVIDIA/spark-rapids/issues/11053)|[BUG] Build on Databricks 330 fails| -|[#10925](https://github.com/NVIDIA/spark-rapids/issues/10925)| Concat cannot accept no parameter| -|[#10975](https://github.com/NVIDIA/spark-rapids/issues/10975)|[BUG] regex `^.*literal` cannot be rewritten as `contains(literal)` for multiline strings| -|[#10956](https://github.com/NVIDIA/spark-rapids/issues/10956)|[BUG] hive_parquet_write_test.py: test_write_compressed_parquet_into_hive_table integration test failures| -|[#10772](https://github.com/NVIDIA/spark-rapids/issues/10772)|[BUG] Issues found by Spark UT Framework on RapidsDataFrameAggregateSuite| -|[#10986](https://github.com/NVIDIA/spark-rapids/issues/10986)|[BUG]Cast from string to float using hand-picked values failed in CastOpSuite| -|[#10972](https://github.com/NVIDIA/spark-rapids/issues/10972)|Spark 4.0 compile errors | -|[#10794](https://github.com/NVIDIA/spark-rapids/issues/10794)|[BUG] Incorrect cast of string columns containing various infinity notations with trailing spaces | -|[#10964](https://github.com/NVIDIA/spark-rapids/issues/10964)|[BUG] Improve stability of pre-merge jenkinsfile| -|[#10714](https://github.com/NVIDIA/spark-rapids/issues/10714)|Signature changed for `PythonUDFRunner.writeUDFs` | -|[#10712](https://github.com/NVIDIA/spark-rapids/issues/10712)|[AUDIT] BatchScanExec/DataSourceV2Relation to group splits by join keys if they differ from partition keys| 
-|[#10673](https://github.com/NVIDIA/spark-rapids/issues/10673)|[AUDIT] Rename plan nodes for PythonMapInArrowExec| -|[#10710](https://github.com/NVIDIA/spark-rapids/issues/10710)|[AUDIT] `uncacheTableOrView` changed in CommandUtils | -|[#10711](https://github.com/NVIDIA/spark-rapids/issues/10711)|[AUDIT] Match DataSourceV2ScanExecBase changes to groupPartitions method | -|[#10669](https://github.com/NVIDIA/spark-rapids/issues/10669)|Supporting broadcast of multiple filtering keys in DynamicPruning | - -### PRs -||| -|:---|:---| -|[#11400](https://github.com/NVIDIA/spark-rapids/pull/11400)|[DOC] update notes in download page for the decompressing gzip issue [skip ci]| -|[#11355](https://github.com/NVIDIA/spark-rapids/pull/11355)|Update changelog for the v24.08 release [skip ci]| -|[#11353](https://github.com/NVIDIA/spark-rapids/pull/11353)|Update download doc for v24.08.1 [skip ci]| -|[#11352](https://github.com/NVIDIA/spark-rapids/pull/11352)|Update version to 24.08.1-SNAPSHOT [skip ci]| -|[#11337](https://github.com/NVIDIA/spark-rapids/pull/11337)|Update changelog for the v24.08 release [skip ci]| -|[#11335](https://github.com/NVIDIA/spark-rapids/pull/11335)|Fix Delta Lake truncation of min/max string values| -|[#11304](https://github.com/NVIDIA/spark-rapids/pull/11304)|Update changelog for v24.08.0 release [skip ci]| -|[#11303](https://github.com/NVIDIA/spark-rapids/pull/11303)|Update rapids JNI and private dependency to 24.08.0| -|[#11296](https://github.com/NVIDIA/spark-rapids/pull/11296)|[DOC] update doc for 2408 release [skip CI]| -|[#11309](https://github.com/NVIDIA/spark-rapids/pull/11309)|[Doc ]Update lore doc about the range [skip ci]| -|[#11292](https://github.com/NVIDIA/spark-rapids/pull/11292)|Add work around for string split with empty input.| -|[#11278](https://github.com/NVIDIA/spark-rapids/pull/11278)|Fix formatting of advanced configs doc| -|[#10917](https://github.com/NVIDIA/spark-rapids/pull/10917)|Adopt changes from JNI for casting from float to decimal| -|[#11269](https://github.com/NVIDIA/spark-rapids/pull/11269)|Revert "upgrade ucx to 1.17.0"| -|[#11260](https://github.com/NVIDIA/spark-rapids/pull/11260)|Mitigate intermittent test_buckets and shuffle_smoke_test OOM issue| -|[#11268](https://github.com/NVIDIA/spark-rapids/pull/11268)|Fix degenerate conditional nested loop join detection| -|[#11244](https://github.com/NVIDIA/spark-rapids/pull/11244)|Fix ArrayIndexOutOfBoundsException on join counts with constant join keys| -|[#11259](https://github.com/NVIDIA/spark-rapids/pull/11259)|CI Docker to support integration tests with Rocky OS + jdk17 [skip ci]| -|[#11247](https://github.com/NVIDIA/spark-rapids/pull/11247)|Fix `string_test.py` errors on Spark 4.0| -|[#11246](https://github.com/NVIDIA/spark-rapids/pull/11246)|Rework Maven Source Plugin Skip| -|[#11149](https://github.com/NVIDIA/spark-rapids/pull/11149)|Rework on substring index| -|[#11236](https://github.com/NVIDIA/spark-rapids/pull/11236)|Remove the unused vars from the version-def CI script| -|[#11237](https://github.com/NVIDIA/spark-rapids/pull/11237)|Fork jvm for maven-source-plugin| -|[#11200](https://github.com/NVIDIA/spark-rapids/pull/11200)|Multi-get_json_object| -|[#11230](https://github.com/NVIDIA/spark-rapids/pull/11230)|Skip test where Delta Lake may not be fully compatible with Spark| -|[#11220](https://github.com/NVIDIA/spark-rapids/pull/11220)|Avoid failing spark bug SPARK-44242 while generate run_dir| -|[#11226](https://github.com/NVIDIA/spark-rapids/pull/11226)|Fix auto merge conflict 11212| 
-|[#11129](https://github.com/NVIDIA/spark-rapids/pull/11129)|Spark 4: Fix miscellaneous tests including logic, repart, hive_delimited.| -|[#11163](https://github.com/NVIDIA/spark-rapids/pull/11163)|Support `MapFromArrays` on GPU| -|[#11219](https://github.com/NVIDIA/spark-rapids/pull/11219)|Fix hash_aggregate_test.py to run with ANSI enabled| -|[#11186](https://github.com/NVIDIA/spark-rapids/pull/11186)|from_json Json to Struct Exception Logging| -|[#11180](https://github.com/NVIDIA/spark-rapids/pull/11180)|More accurate estimation for the result serialization time in RapidsShuffleThreadedWriterBase| -|[#11194](https://github.com/NVIDIA/spark-rapids/pull/11194)|Fix ANSI mode test failures in url_test.py| -|[#11202](https://github.com/NVIDIA/spark-rapids/pull/11202)|Fix read from Delta Lake table with name column mapping and missing Parquet IDs| -|[#11185](https://github.com/NVIDIA/spark-rapids/pull/11185)|Fix multi-release jar problem| -|[#11144](https://github.com/NVIDIA/spark-rapids/pull/11144)|Build the Scala2.13 dist jar with JDK17| -|[#11197](https://github.com/NVIDIA/spark-rapids/pull/11197)|Fix class not found error: com/nvidia/spark/rapids/GpuScalar| -|[#11191](https://github.com/NVIDIA/spark-rapids/pull/11191)|Fix dynamic pruning regression in GpuFileSourceScanExec| -|[#10994](https://github.com/NVIDIA/spark-rapids/pull/10994)|Add Spark 4.0.0 Build Profile and Other Supporting Changes| -|[#11192](https://github.com/NVIDIA/spark-rapids/pull/11192)|Append new authorized user to blossom-ci whitelist [skip ci]| -|[#11179](https://github.com/NVIDIA/spark-rapids/pull/11179)|Allow more expressions to be tiered| -|[#11141](https://github.com/NVIDIA/spark-rapids/pull/11141)|Enable some Rapids config in RapidsSQLTestsBaseTrait for Spark UT| -|[#11170](https://github.com/NVIDIA/spark-rapids/pull/11170)|Avoid listFiles or inputFiles on relations with static partitioning| -|[#11159](https://github.com/NVIDIA/spark-rapids/pull/11159)|Drop spark31x shims| -|[#10951](https://github.com/NVIDIA/spark-rapids/pull/10951)|Case when performance improvement: reduce the `copy_if_else`| -|[#11165](https://github.com/NVIDIA/spark-rapids/pull/11165)|Fix some GpuBroadcastToRowExec by not dropping columns| -|[#11126](https://github.com/NVIDIA/spark-rapids/pull/11126)|Coalesce batches after a logical coalesce operation| -|[#11164](https://github.com/NVIDIA/spark-rapids/pull/11164)|fix the bucketed write error for non-utc cases| -|[#11132](https://github.com/NVIDIA/spark-rapids/pull/11132)|Add deletion vector metrics for low shuffle merge.| -|[#11156](https://github.com/NVIDIA/spark-rapids/pull/11156)|Fix batch splitting for partition column size on row-count-only batches| -|[#11153](https://github.com/NVIDIA/spark-rapids/pull/11153)|Fix LORE dump oom.| -|[#11102](https://github.com/NVIDIA/spark-rapids/pull/11102)|Fix ANSI mode failures in subquery_test.py| -|[#11151](https://github.com/NVIDIA/spark-rapids/pull/11151)|Fix the test error of the bucketed write for the non-utc case| -|[#11147](https://github.com/NVIDIA/spark-rapids/pull/11147)|upgrade ucx to 1.17.0| -|[#11138](https://github.com/NVIDIA/spark-rapids/pull/11138)|Update fastparquet to 2024.5.0 for numpy2 compatibility| -|[#11137](https://github.com/NVIDIA/spark-rapids/pull/11137)|Handle the change for UnaryPositive now extending RuntimeReplaceable| -|[#11094](https://github.com/NVIDIA/spark-rapids/pull/11094)|Add `HiveHash` support on GPU| -|[#11139](https://github.com/NVIDIA/spark-rapids/pull/11139)|Improve MetricsSuite to allow more gc jitter| 
-|[#11133](https://github.com/NVIDIA/spark-rapids/pull/11133)|Fix `test_window_group_limits_fallback`| -|[#11097](https://github.com/NVIDIA/spark-rapids/pull/11097)|Fix miscellaneous integ tests for Spark 4| -|[#11118](https://github.com/NVIDIA/spark-rapids/pull/11118)|Fix issue with DPP and AQE on reused broadcast exchanges| -|[#11043](https://github.com/NVIDIA/spark-rapids/pull/11043)|Dataproc serverless test fixes| -|[#10965](https://github.com/NVIDIA/spark-rapids/pull/10965)|Profiler: Disable collecting async allocation events by default| -|[#11117](https://github.com/NVIDIA/spark-rapids/pull/11117)|Update Scala2.13 premerge CI against JDK17| -|[#11084](https://github.com/NVIDIA/spark-rapids/pull/11084)|Introduce LORE framework.| -|[#11099](https://github.com/NVIDIA/spark-rapids/pull/11099)|Spark 4: Handle ANSI mode in sort_test.py| -|[#11115](https://github.com/NVIDIA/spark-rapids/pull/11115)|Fix match error in RapidsShuffleIterator.scala [scala2.13]| -|[#11088](https://github.com/NVIDIA/spark-rapids/pull/11088)|Support regex patterns with brackets when rewriting to PrefixRange pattern in rlike.| -|[#10950](https://github.com/NVIDIA/spark-rapids/pull/10950)|Add a heuristic to skip second or third agg pass| -|[#11048](https://github.com/NVIDIA/spark-rapids/pull/11048)|Fixed array_tests for Spark 4.0.0| -|[#11049](https://github.com/NVIDIA/spark-rapids/pull/11049)|Fix some cast_tests for Spark 4.0.0| -|[#11066](https://github.com/NVIDIA/spark-rapids/pull/11066)|Replaced spark3xx-common references to spark-shared| -|[#11083](https://github.com/NVIDIA/spark-rapids/pull/11083)|Exclude a case based on JDK version in Spark UT| -|[#10997](https://github.com/NVIDIA/spark-rapids/pull/10997)|Fix some test issues in Spark UT and keep RapidsTestSettings update-to-date| -|[#11073](https://github.com/NVIDIA/spark-rapids/pull/11073)|Disable ANSI mode for window function tests| -|[#11076](https://github.com/NVIDIA/spark-rapids/pull/11076)|Improve the diagnostics for 'conv' fallback explain| -|[#11092](https://github.com/NVIDIA/spark-rapids/pull/11092)|Add GpuBucketingUtils shim to Spark 4.0.0| -|[#11062](https://github.com/NVIDIA/spark-rapids/pull/11062)|fix duplicate counted metrics like op time for GpuCoalesceBatches| -|[#11044](https://github.com/NVIDIA/spark-rapids/pull/11044)|Fixed Failing tests in arithmetic_ops_tests for Spark 4.0.0| -|[#11086](https://github.com/NVIDIA/spark-rapids/pull/11086)|upgrade blossom-ci actions version [skip ci]| -|[#10957](https://github.com/NVIDIA/spark-rapids/pull/10957)|Support bucketing write for GPU| -|[#10979](https://github.com/NVIDIA/spark-rapids/pull/10979)|[FEA] Introduce low shuffle merge.| -|[#10996](https://github.com/NVIDIA/spark-rapids/pull/10996)|Fallback non-UTC TimeZoneAwareExpression with zoneId| -|[#11072](https://github.com/NVIDIA/spark-rapids/pull/11072)|Workaround numpy2 failed fastparquet compatibility tests| -|[#11046](https://github.com/NVIDIA/spark-rapids/pull/11046)|Calculate parallelism to speed up pre-merge CI| -|[#11054](https://github.com/NVIDIA/spark-rapids/pull/11054)|fix flaky array_item test failures| -|[#11051](https://github.com/NVIDIA/spark-rapids/pull/11051)|[FEA] Increase parallelism of deltalake test on databricks| -|[#10993](https://github.com/NVIDIA/spark-rapids/pull/10993)|`binary-dedupe` changes for Spark 4.0.0| -|[#11060](https://github.com/NVIDIA/spark-rapids/pull/11060)|Add in the ability to fingerprint JSON columns| -|[#11059](https://github.com/NVIDIA/spark-rapids/pull/11059)|Revert "Add in the ability to fingerprint 
JSON columns (#11002)" [skip ci]| -|[#11039](https://github.com/NVIDIA/spark-rapids/pull/11039)|Concat() Exception bug fix| -|[#11002](https://github.com/NVIDIA/spark-rapids/pull/11002)|Add in the ability to fingerprint JSON columns| -|[#10977](https://github.com/NVIDIA/spark-rapids/pull/10977)|Rewrite multiple literal choice regex to multiple contains in rlike| -|[#11035](https://github.com/NVIDIA/spark-rapids/pull/11035)|Fix auto merge conflict 11034 [skip ci]| -|[#11040](https://github.com/NVIDIA/spark-rapids/pull/11040)|Append new authorized user to blossom-ci whitelist [skip ci]| -|[#11036](https://github.com/NVIDIA/spark-rapids/pull/11036)|Update blossom-ci ACL to secure format [skip ci]| -|[#11032](https://github.com/NVIDIA/spark-rapids/pull/11032)|Fix a hive write test failure for Spark 350| -|[#10998](https://github.com/NVIDIA/spark-rapids/pull/10998)|Improve log to print more lines in build [skip ci]| -|[#10992](https://github.com/NVIDIA/spark-rapids/pull/10992)|Addressing the Named Parameter change in Spark 4.0.0| -|[#10943](https://github.com/NVIDIA/spark-rapids/pull/10943)|Fix Spark UT issues in RapidsDataFrameAggregateSuite| -|[#10963](https://github.com/NVIDIA/spark-rapids/pull/10963)|Add rapids configs to enable GPU running in Spark UT| -|[#10978](https://github.com/NVIDIA/spark-rapids/pull/10978)|More compilation fixes for Spark 4.0.0| -|[#10953](https://github.com/NVIDIA/spark-rapids/pull/10953)|Speed up the integration tests by running them in parallel on the Databricks cluster| -|[#10958](https://github.com/NVIDIA/spark-rapids/pull/10958)|Fix a hive write test failure| -|[#10970](https://github.com/NVIDIA/spark-rapids/pull/10970)|Move Support for `RaiseError` to a Shim Excluding Spark 4.0.0| -|[#10966](https://github.com/NVIDIA/spark-rapids/pull/10966)|Add default value for REF of premerge jenkinsfile to avoid bad overwritten [skip ci]| -|[#10959](https://github.com/NVIDIA/spark-rapids/pull/10959)|Add new ID to blossom-ci allow list [skip ci]| -|[#10952](https://github.com/NVIDIA/spark-rapids/pull/10952)|Add shims to take care of the signature change for writeUDFs in PythonUDFRunner| -|[#10931](https://github.com/NVIDIA/spark-rapids/pull/10931)|Add Support for Renaming of PythonMapInArrow| -|[#10949](https://github.com/NVIDIA/spark-rapids/pull/10949)|Change dependency version to 24.08.0-SNAPSHOT| -|[#10857](https://github.com/NVIDIA/spark-rapids/pull/10857)|[Spark 4.0] Account for `PartitionedFileUtil.splitFiles` signature change.| -|[#10912](https://github.com/NVIDIA/spark-rapids/pull/10912)|GpuInsertIntoHiveTable supports parquet format| -|[#10863](https://github.com/NVIDIA/spark-rapids/pull/10863)|[Spark 4.0] Account for `CommandUtils.uncacheTableOrView` signature change.| -|[#10944](https://github.com/NVIDIA/spark-rapids/pull/10944)|Added Shim for BatchScanExec to Support Spark 4.0| -|[#10946](https://github.com/NVIDIA/spark-rapids/pull/10946)|Unarchive Spark test jar for spark.read(ability)| -|[#10945](https://github.com/NVIDIA/spark-rapids/pull/10945)|Add Support for Multiple Filtering Keys for Subquery Broadcast| -|[#10871](https://github.com/NVIDIA/spark-rapids/pull/10871)|Add classloader diagnostics to initShuffleManager error message| -|[#10933](https://github.com/NVIDIA/spark-rapids/pull/10933)|Fixed Databricks build| -|[#10929](https://github.com/NVIDIA/spark-rapids/pull/10929)|Append new authorized user to blossom-ci whitelist [skip ci]| ## Older Releases Changelog of older releases can be found at [docs/archives](/docs/archives) diff --git 
a/CONTRIBUTING.md b/CONTRIBUTING.md
index f18fa2ba0cb..21f31ba1498 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -127,15 +127,15 @@ mvn -pl dist -PnoSnapshots package -DskipTests
 Verify that shim-specific classes are hidden from a conventional classloader.
 
 ```bash
-$ javap -cp dist/target/rapids-4-spark_2.12-24.10.1-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl
+$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl
 Error: class not found: com.nvidia.spark.rapids.shims.SparkShimImpl
 ```
 
 However, its bytecode can be loaded if prefixed with `spark3XY` not contained in the package name
 
 ```bash
-$ javap -cp dist/target/rapids-4-spark_2.12-24.10.1-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2
-Warning: File dist/target/rapids-4-spark_2.12-24.10.1-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl
+$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-cuda11.jar spark320.com.nvidia.spark.rapids.shims.SparkShimImpl | head -2
+Warning: File dist/target/rapids-4-spark_2.12-24.12.0-cuda11.jar(/spark320/com/nvidia/spark/rapids/shims/SparkShimImpl.class) does not contain class spark320.com.nvidia.spark.rapids.shims.SparkShimImpl
 Compiled from "SparkShims.scala"
 public final class com.nvidia.spark.rapids.shims.SparkShimImpl {
 ```
@@ -178,7 +178,7 @@ mvn package -pl dist -am -Dbuildver=340 -DallowConventionalDistJar=true
 Verify `com.nvidia.spark.rapids.shims.SparkShimImpl` is conventionally loadable:
 
 ```bash
-$ javap -cp dist/target/rapids-4-spark_2.12-24.10.1-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2
+$ javap -cp dist/target/rapids-4-spark_2.12-24.12.0-cuda11.jar com.nvidia.spark.rapids.shims.SparkShimImpl | head -2
 Compiled from "SparkShims.scala"
 public final class com.nvidia.spark.rapids.shims.SparkShimImpl {
 ```
diff --git a/DF_UDF_README.md b/DF_UDF_README.md
new file mode 100644
index 00000000000..a669c87f258
--- /dev/null
+++ b/DF_UDF_README.md
@@ -0,0 +1,117 @@
+# Scala / Java UDFs implemented using data frames
+
+User Defined Functions (UDFs) are used for a number of reasons in Apache Spark. Much of the time it is to implement
+logic that is either very difficult or impossible to implement using existing SQL/Dataframe APIs directly. But they
+are also used as a way to standardize processing logic across an organization or for code reuse.
+
+But UDFs come with some downsides. The biggest one is visibility into the processing being done. SQL is a language that
+can be highly optimized. But a UDF in most cases is a black box that the SQL optimizer cannot do anything about.
+This can result in less than ideal query planning. Additionally, accelerated execution environments, like the
+RAPIDS Accelerator for Apache Spark, have no easy way to replace UDFs with accelerated versions, which can result in
+slow performance.
+
+This feature attempts to add visibility to the code reuse use case by providing a way to implement a UDF in terms of
+dataframe commands.
+
+## Setup
+
+The dataframe UDF plugin is packaged in the same jar as the RAPIDS Accelerator for Apache Spark. This jar will need to
+be added as a compile time dependency for code that wants to use this feature, as well as added to your Spark
+classpath just like you would do for GPU acceleration.
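+
+For example, with Maven the compile-time dependency might look like the sketch below. This mirrors the artifact
+coordinates shown in this repository's README; the Scala binary version (`2.12`), the release version (`24.12.0`),
+and the `provided` scope are assumptions to adjust for your own build.
+
+```xml
+<dependency>
+    <groupId>com.nvidia</groupId>
+    <artifactId>rapids-4-spark_2.12</artifactId>
+    <version>24.12.0</version>
+    <scope>provided</scope>
+</dependency>
+```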
+
+If you do not plan to use the GPU accelerated processing, but still want dataframe UDF support in CPU-only
+applications, then add `com.nvidia.spark.DFUDFPlugin` to the `spark.sql.extensions` config. If you do use GPU
+accelerated processing, the RAPIDS Plugin will enable this automatically. You don't need to set the
+`spark.sql.extensions` config in that case, but it won't hurt anything if you do add it. Now you can implement a UDF
+in terms of Dataframe operations.
+
+## Usage
+
+```scala
+import com.nvidia.spark.functions._
+
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.functions._
+
+val sum_array = df_udf((longArray: Column) =>
+  aggregate(longArray,
+    lit(0L),
+    (a, b) => coalesce(a, lit(0L)) + coalesce(b, lit(0L)),
+    a => a))
+spark.udf.register("sum_array", sum_array)
+```
+
+You can then use `sum_array` however you would have used any other UDF. This allows you to provide a drop-in
+replacement implementation of an existing UDF.
+
+```scala
+Seq(Array(1L, 2L, 3L)).toDF("data").selectExpr("sum_array(data) as result").show()
+
++------+
+|result|
++------+
+|     6|
++------+
+```
+
+Java APIs are also supported and should work the same way as Spark's own UDFs.
+
+```java
+import static com.nvidia.spark.functions.df_udf;
+
+import org.apache.spark.sql.*;
+import org.apache.spark.sql.api.java.UDF2;
+import org.apache.spark.sql.expressions.UserDefinedFunction;
+
+
+UserDefinedFunction myAdd = df_udf((Column lhs, Column rhs) -> lhs.plus(rhs));
+spark.udf().register("myadd", myAdd);
+
+spark.sql("SELECT myadd(1, 1) as r").show();
+// +--+
+// | r|
+// +--+
+// | 2|
+// +--+
+
+```
+
+## Type Checks
+
+DataFrame APIs do not provide type safety when writing the code, and that is the same here. There are no built-in type
+checks for inputs yet. Also, because of how types are resolved in Spark, there is no way to adjust the query based on
+the types passed in. Type checks are handled by the SQL planner/optimizer after the UDF has been replaced. This means
+that the final SQL will not violate any type safety, but it also means that the errors might be confusing. For example,
+if I passed in an `ARRAY<DOUBLE>` to `sum_array` instead of an `ARRAY<LONG>` I would get an error like
+
+```scala
+Seq(Array(1.0, 2.0, 3.0)).toDF("data").selectExpr("sum_array(data) as result").show()
+org.apache.spark.sql.AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "aggregate(data, 0, lambdafunction((coalesce(namedlambdavariable(), 0) + coalesce(namedlambdavariable(), 0)), namedlambdavariable(), namedlambdavariable()), lambdafunction(namedlambdavariable(), namedlambdavariable()))" due to data type mismatch: Parameter 3 requires the "BIGINT" type, however "lambdafunction((coalesce(namedlambdavariable(), 0) + coalesce(namedlambdavariable(), 0)), namedlambdavariable(), namedlambdavariable())" has the type "DOUBLE".; line 1 pos 0;
+Project [aggregate(data#46, 0, lambdafunction((cast(coalesce(lambda x_9#49L, 0) as double) + coalesce(lambda y_10#50, cast(0 as double))), lambda x_9#49L, lambda y_10#50, false), lambdafunction(lambda x_11#51L, lambda x_11#51L, false)) AS result#48L]
++- Project [value#43 AS data#46]
+   +- LocalRelation [value#43]
+
+  at org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.dataTypeMismatch(package.scala:73)
+  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$5(CheckAnalysis.scala:269)
+  at org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis0$5$adapted(CheckAnalysis.scala:256)
+```
+
+Which is not as simple to understand as a normal UDF.
+
+ +```scala +val sum_array = udf((a: Array[Long]) => a.sum) + +spark.udf.register("sum_array", sum_array) + +Seq(Array(1.0, 2.0, 3.0)).toDF("data").selectExpr("sum_array(data) as result").show() +org.apache.spark.sql.AnalysisException: [CANNOT_UP_CAST_DATATYPE] Cannot up cast array element from "DOUBLE" to "BIGINT". + The type path of the target object is: +- array element class: "long" +- root class: "[J" +You can either add an explicit cast to the input data or choose a higher precision type of the field in the target object +at org.apache.spark.sql.errors.QueryCompilationErrors$.upCastFailureError(QueryCompilationErrors.scala:285) +at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveUpCast$.org$apache$spark$sql$catalyst$analysis$Analyzer$ResolveUpCast$$fail(Analyzer.scala:3646) +at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveUpCast$$anonfun$apply$57$$anonfun$applyOrElse$234.applyOrElse(Analyzer.scala:3677) +at org.apache.spark.sql.catalyst.analysis.Analyzer$ResolveUpCast$$anonfun$apply$57$$anonfun$applyOrElse$234.applyOrElse(Analyzer.scala:3654) +``` + +We hope to add optional type checks in the future. diff --git a/README.md b/README.md index 01e2076bdf8..65e194de3c2 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ as a `provided` dependency. com.nvidia rapids-4-spark_2.12 - 24.10.1 + 24.12.0 provided ``` diff --git a/aggregator/pom.xml b/aggregator/pom.xml index a5b47a827d5..1ba28e86568 100644 --- a/aggregator/pom.xml +++ b/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.1 + 24.12.0 ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.12 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.10.1 + 24.12.0 aggregator @@ -71,6 +71,28 @@ ${spark-rapids-private.version} ${spark.version.classifier} + + + com.nvidia + ${rapids.delta.artifactId1}_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + + + com.nvidia + ${rapids.delta.artifactId2}_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + + + com.nvidia + ${rapids.delta.artifactId3}_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + @@ -262,507 +284,4 @@ - - - - release320 - - - true - - - buildver - 320 - - - - - com.nvidia - rapids-4-spark-delta-20x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release321 - - - buildver - 321 - - - - - com.nvidia - rapids-4-spark-delta-20x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release321cdh - - - buildver - 321cdh - - - - - com.nvidia - rapids-4-spark-delta-20x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release322 - - - buildver - 322 - - - - - com.nvidia - rapids-4-spark-delta-20x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release323 - - - buildver - 323 - - - - - com.nvidia - rapids-4-spark-delta-20x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release324 - - - buildver - 324 - - - - - com.nvidia - rapids-4-spark-delta-20x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release330 - - - - buildver - 330 - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - 
com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release330cdh - - - buildver - 330cdh - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release332cdh - - - buildver - 332cdh - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release330db - - - buildver - 330db - - - - - com.nvidia - rapids-4-spark-delta-spark330db_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release331 - - - buildver - 331 - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release332 - - - buildver - 332 - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release332db - - - buildver - 332db - - - - - com.nvidia - rapids-4-spark-delta-spark332db_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release341db - - - buildver - 341db - - - - - com.nvidia - rapids-4-spark-delta-spark341db_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release333 - - - buildver - 333 - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release334 - - - buildver - 334 - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release340 - - - buildver - 340 - - - - - com.nvidia - rapids-4-spark-delta-24x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release341 - - - buildver - 341 - - - - - com.nvidia - rapids-4-spark-delta-24x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release342 - - - buildver - 342 - - - - - com.nvidia - rapids-4-spark-delta-24x_${scala.binary.version} - ${project.version} - 
${spark.version.classifier} - - - - - release343 - - - buildver - 343 - - - - - com.nvidia - rapids-4-spark-delta-24x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release350 - - - buildver - 350 - - - - - com.nvidia - rapids-4-spark-delta-stub_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release351 - - - buildver - 351 - - - - - com.nvidia - rapids-4-spark-delta-stub_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release352 - - - buildver - 352 - - - - - com.nvidia - rapids-4-spark-delta-stub_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - diff --git a/api_validation/pom.xml b/api_validation/pom.xml index 653f2def0bc..697fb4c7759 100644 --- a/api_validation/pom.xml +++ b/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.1 + 24.12.0 ../shim-deps/pom.xml rapids-4-spark-api-validation_2.12 - 24.10.1 + 24.12.0 api_validation diff --git a/build/buildall b/build/buildall index 0599d080054..2da61fe6451 100755 --- a/build/buildall +++ b/build/buildall @@ -86,7 +86,7 @@ function bloopInstall() { function versionsFromDistProfile() { [[ "$BUILD_ALL_DEBUG" == "1" ]] && set -x - versionRawStr=$(mvn -B help:evaluate -q -pl dist -P"$1" -Dexpression=included_buildvers -DforceStdout) + versionRawStr=$($MVN -B help:evaluate -q -pl dist -P"$1" -Dexpression=included_buildvers -DforceStdout) versionStr=${versionRawStr//[$'\n',]/} echo -n $versionStr } @@ -171,6 +171,7 @@ fi export MVN="mvn -Dmaven.wagon.http.retryHandler.count=3 ${MVN_OPT}" if [[ "$SCALA213" == "1" ]]; then + MVN="$MVN -f scala2.13/" DIST_PROFILE=${DIST_PROFILE:-"noSnapshotsScala213"} $(dirname $0)/make-scala-version-build-files.sh 2.13 else @@ -234,10 +235,6 @@ if [[ "$SKIP_CLEAN" != "1" ]]; then $MVN -q clean fi -if [[ "$SCALA213" == "1" ]]; then - cd scala2.13 -fi - echo "Building a combined dist jar with Shims for ${SPARK_SHIM_VERSIONS[@]} ..." function build_single_shim() { diff --git a/build/get_buildvers.py b/build/get_buildvers.py index bfce9656054..5fe864670b5 100644 --- a/build/get_buildvers.py +++ b/build/get_buildvers.py @@ -34,7 +34,7 @@ def _get_buildvers(buildvers, pom_file, logger=None): else: no_snapshots.append(release) excluded_shims = pom.find(".//pom:dyn.shim.excluded.releases", ns) - if excluded_shims: + if excluded_shims is not None: for removed_shim in [x.strip() for x in excluded_shims.text.split(",")]: if removed_shim in snapshots: snapshots.remove(removed_shim) @@ -48,8 +48,8 @@ def _get_buildvers(buildvers, pom_file, logger=None): if "scala2.13" in pom_file: no_snapshots = list(filter(lambda x: not x.endswith("cdh"), no_snapshots)) - db_release = list(filter(lambda x: x.endswith("db"), no_snapshots)) - no_snapshots = list(filter(lambda x: not x.endswith("db"), no_snapshots)) + db_release = list(filter(lambda x: "db" in x, no_snapshots)) + no_snapshots = list(filter(lambda x: "db" not in x, no_snapshots)) snap_and_no_snap = no_snapshots + snapshots snap_and_no_snap_with_db = snap_and_no_snap + db_release no_snap_with_db = no_snapshots + db_release diff --git a/build/make-scala-version-build-files.sh b/build/make-scala-version-build-files.sh index ad3482ee979..ae6ae016e4e 100755 --- a/build/make-scala-version-build-files.sh +++ b/build/make-scala-version-build-files.sh @@ -18,6 +18,20 @@ set -e +trap_func() { + rv=$? 
+ if [[ $rv == 0 ]]; then + echo DONE scala2.13 poms generated: exit code = $rv + else + echo ERROR generating scala2.13 poms, re-execute with: + echo " bash -x $*" + echo to inspect the error output + exit $rv + fi +} + +trap "trap_func" EXIT + VALID_VERSIONS=( 2.13 ) declare -A DEFAULT_SPARK DEFAULT_SPARK[2.12]="spark320" diff --git a/build/shimplify.py b/build/shimplify.py index a942f9a05b9..d551d2f3bbd 100644 --- a/build/shimplify.py +++ b/build/shimplify.py @@ -84,6 +84,7 @@ import os import re import subprocess +from functools import partial def __project(): @@ -199,7 +200,9 @@ def __csv_as_arr(str_val): __shim_comment_pattern = re.compile(re.escape(__opening_shim_tag) + r'\n(.*)\n' + re.escape(__closing_shim_tag), re.DOTALL) - +__spark_version_classifier = '$_spark.version.classifier_' +__spark_version_placeholder = re.escape(__spark_version_classifier) +__package_pattern = re.compile('package .*' + '(' + __spark_version_placeholder + ')') def __upsert_shim_json(filename, bv_list): with open(filename, 'r') as file: contents = file.readlines() @@ -365,10 +368,7 @@ def __generate_symlinks(): __log.info("# generating symlinks for shim %s %s files", buildver, src_type) __traverse_source_tree_of_all_shims( src_type, - lambda src_type, path, build_ver_arr: __generate_symlink_to_file(buildver, - src_type, - path, - build_ver_arr)) + partial(__generate_symlink_to_file, buildver=buildver, src_type=src_type)) def __traverse_source_tree_of_all_shims(src_type, func): """Walks src//sparkXYZ""" @@ -392,11 +392,10 @@ def __traverse_source_tree_of_all_shims(src_type, func): build_ver_arr = map(lambda x: str(json.loads(x).get('spark')), shim_arr) __log.debug("extracted shims %s", build_ver_arr) assert build_ver_arr == sorted(build_ver_arr),\ - "%s shim list is not properly sorted" % shim_file_path - func(src_type, shim_file_path, build_ver_arr) - + "%s shim list is not properly sorted: %s" % (shim_file_path, build_ver_arr) + func(shim_file_path=shim_file_path, build_ver_arr=build_ver_arr, shim_file_txt=shim_file_txt) -def __generate_symlink_to_file(buildver, src_type, shim_file_path, build_ver_arr): +def __generate_symlink_to_file(buildver, src_type, shim_file_path, build_ver_arr, shim_file_txt): if buildver in build_ver_arr: project_base_dir = str(__project().getBaseDir()) base_dir = __src_basedir @@ -416,9 +415,32 @@ def __generate_symlink_to_file(buildver, src_type, shim_file_path, build_ver_arr target_shim_file_path = os.path.join(target_root, target_rel_path) __log.debug("creating symlink %s -> %s", target_shim_file_path, shim_file_path) __makedirs(os.path.dirname(target_shim_file_path)) - if __should_overwrite: + package_match = __package_pattern.search(shim_file_txt) + if __should_overwrite or package_match: __remove_file(target_shim_file_path) - __symlink(shim_file_path, target_shim_file_path) + if package_match: + with open(target_shim_file_path, mode='w') as f: + f.write(shim_file_txt[0:package_match.start(1)]) + f.write("spark") + f.write(buildver) + f.write('\n') + f.write(''' +/* +!!! DO NOT EDIT THIS FILE !!! 
+ +This file has been generated from the original + +%s + +by interpolating $_spark.version.classifier_=%s + +Be sure to edit the original file if required + +*/ + ''' % (shim_file_path, 'spark' + buildver)) + f.write(shim_file_txt[package_match.end(1):]) + else: + __symlink(shim_file_path, target_shim_file_path) def __symlink(src, target): @@ -456,8 +478,7 @@ def __shimplify_layout(): for src_type in ['main', 'test']: __traverse_source_tree_of_all_shims( src_type, - lambda unused_src_type, shim_file_path, build_ver_arr: - __update_files2bv(files2bv, shim_file_path, build_ver_arr)) + partial(__update_files2bv, files2bv=files2bv)) # adding a new shim? if __add_shim_buildver is not None: @@ -486,11 +507,17 @@ def __shimplify_layout(): __git_rename_or_copy(shim_file, owner_shim) -def __update_files2bv(files2bv, path, buildver_arr): - assert path not in files2bv.keys(), "new path %s %s should be "\ - "encountered only once, current map\n%s" % (path, buildver_arr, files2bv) - __log.debug("Adding %s %s to files to shim map", path, buildver_arr) - files2bv[path] = buildver_arr +def __update_files2bv(files2bv, + # TODO an anachronism requirement: that the following two params + # have the same name along generate_symlink_file + shim_file_path, + build_ver_arr, + # + **kwargs): + assert shim_file_path not in files2bv.keys(), "new path %s %s should be "\ + "encountered only once, current map\n%s" % (shim_file_path, build_ver_arr, files2bv) + __log.debug("Adding %s %s to files to shim map", shim_file_path, build_ver_arr) + files2bv[shim_file_path] = build_ver_arr def __add_new_shim_to_file_map(files2bv): diff --git a/datagen/README.md b/datagen/README.md index 2855bbdd8b4..022cc2f1eba 100644 --- a/datagen/README.md +++ b/datagen/README.md @@ -24,12 +24,12 @@ Where `$SPARK_VERSION` is a compressed version number, like 330 for Spark 3.3.0. After this the jar should be at `target/datagen_2.12-$PLUGIN_VERSION-spark$SPARK_VERSION.jar` -for example a Spark 3.3.0 jar for the 24.10.1 release would be -`target/datagen_2.12-24.10.1-spark330.jar` +for example a Spark 3.3.0 jar for the 24.12.0 release would be +`target/datagen_2.12-24.12.0-spark330.jar` To get a spark shell with this you can run ```shell -spark-shell --jars target/datagen_2.12-24.10.1-spark330.jar +spark-shell --jars target/datagen_2.12-24.12.0-spark330.jar ``` After that you should be good to go. 
diff --git a/datagen/ScaleTest.md b/datagen/ScaleTest.md index 63d2bc3b82a..bb5c4a1c988 100644 --- a/datagen/ScaleTest.md +++ b/datagen/ScaleTest.md @@ -44,7 +44,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --class com.nvidia.rapids.tests.scaletest.ScaleTestDataGen \ # the main class --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ # one dependency jar just shipped with Spark under $SPARK_HOME -./target/datagen_2.12-24.10.1-spark332.jar \ +./target/datagen_2.12-24.12.0-spark332.jar \ 1 \ 10 \ parquet \ diff --git a/datagen/pom.xml b/datagen/pom.xml index b22db7444c6..64e48ff9c85 100644 --- a/datagen/pom.xml +++ b/datagen/pom.xml @@ -21,18 +21,19 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.1 + 24.12.0 ../shim-deps/pom.xml datagen_2.12 Data Generator Tools for generating large amounts of data - 24.10.1 + 24.12.0 datagen **/* package + ${project.build.outputDirectory}/datagen-version-info.properties diff --git a/datagen/src/main/spark320/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala b/datagen/src/main/spark320/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala index 9134505c2f2..5ae88606cfb 100644 --- a/datagen/src/main/spark320/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala +++ b/datagen/src/main/spark320/scala/org/apache/spark/sql/tests/datagen/DataGenExprShims.scala @@ -35,9 +35,12 @@ {"spark": "341db"} {"spark": "342"} {"spark": "343"} +{"spark": "344"} {"spark": "350"} +{"spark": "350db143"} {"spark": "351"} {"spark": "352"} +{"spark": "353"} spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.tests.datagen diff --git a/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeltaLog.scala b/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeltaLog.scala index 2927b8607ad..58bcde1a855 100644 --- a/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeltaLog.scala +++ b/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeltaLog.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,7 +68,7 @@ object GpuDeltaLog { dataPath: String, options: Map[String, String], rapidsConf: RapidsConf): GpuDeltaLog = { - val deltaLog = DeltaLog.forTable(spark, dataPath, options) + val deltaLog = DeltaLog.forTable(spark, new Path(dataPath), options) new GpuDeltaLog(deltaLog, rapidsConf) } diff --git a/delta-lake/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/DatabricksDeltaProviderBase.scala b/delta-lake/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/DatabricksDeltaProviderBase.scala index 55f9cc2ae49..508b641d890 100644 --- a/delta-lake/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/DatabricksDeltaProviderBase.scala +++ b/delta-lake/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/DatabricksDeltaProviderBase.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -337,7 +337,7 @@ class DeltaCreatableRelationProviderMeta( } val path = saveCmd.options.get("path") if (path.isDefined) { - val deltaLog = DeltaLog.forTable(SparkSession.active, path.get, saveCmd.options) + val deltaLog = DeltaLog.forTable(SparkSession.active, new Path(path.get), saveCmd.options) RapidsDeltaUtils.tagForDeltaWrite(this, saveCmd.query.schema, Some(deltaLog), saveCmd.options, SparkSession.active) } else { @@ -346,4 +346,4 @@ class DeltaCreatableRelationProviderMeta( } override def convertToGpu(): GpuCreatableRelationProvider = new GpuDeltaDataSource(conf) -} \ No newline at end of file +} diff --git a/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala b/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala index 1a9936ea808..0c212d6842a 100644 --- a/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala +++ b/delta-lake/common/src/main/databricks/scala/org/apache/spark/sql/rapids/delta/GpuOptimizeWriteExchangeExec.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * This file was derived from OptimizeWriteExchange.scala * in the Delta Lake project at https://github.com/delta-io/delta @@ -26,7 +26,7 @@ import scala.concurrent.Future import scala.concurrent.duration.Duration import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf -import com.nvidia.spark.rapids.{GpuColumnarBatchSerializer, GpuExec, GpuMetric, GpuPartitioning, GpuRoundRobinPartitioning} +import com.nvidia.spark.rapids.{GpuColumnarBatchSerializer, GpuExec, GpuMetric, GpuPartitioning, GpuRoundRobinPartitioning, RapidsConf} import com.nvidia.spark.rapids.delta.RapidsDeltaSQLConf import org.apache.spark.{MapOutputStatistics, ShuffleDependency} @@ -98,7 +98,9 @@ case class GpuOptimizeWriteExchangeExec( } private lazy val serializer: Serializer = - new GpuColumnarBatchSerializer(gpuLongMetric("dataSize")) + new GpuColumnarBatchSerializer(gpuLongMetric("dataSize"), + child.output.map(_.dataType).toArray, + RapidsConf.SHUFFLE_KUDO_SERIALIZER_ENABLED.get(child.conf)) @transient lazy val inputRDD: RDD[ColumnarBatch] = child.executeColumnar() diff --git a/delta-lake/delta-20x/pom.xml b/delta-lake/delta-20x/pom.xml index ce78cd3747a..a07d81e0f25 100644 --- a/delta-lake/delta-20x/pom.xml +++ b/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.1 + 24.12.0 ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-20x diff --git a/delta-lake/delta-21x/pom.xml b/delta-lake/delta-21x/pom.xml index b87f8e3107a..3ad3e3c83fc 100644 --- a/delta-lake/delta-21x/pom.xml +++ b/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.1 + 24.12.0 ../../jdk-profiles/pom.xml rapids-4-spark-delta-21x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-21x diff --git a/delta-lake/delta-22x/pom.xml b/delta-lake/delta-22x/pom.xml index dbf2d0316ca..5d4d389b097 100644 --- a/delta-lake/delta-22x/pom.xml +++ b/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.1 + 
24.12.0 ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-22x diff --git a/delta-lake/delta-23x/pom.xml b/delta-lake/delta-23x/pom.xml index 05d18e8d74c..ea394bd26b2 100644 --- a/delta-lake/delta-23x/pom.xml +++ b/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.10.1 + 24.12.0 ../../pom.xml rapids-4-spark-delta-23x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-23x diff --git a/delta-lake/delta-24x/pom.xml b/delta-lake/delta-24x/pom.xml index 6f03bd3af8c..ee1c7926245 100644 --- a/delta-lake/delta-24x/pom.xml +++ b/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.1 + 24.12.0 ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.12 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-24x diff --git a/delta-lake/delta-spark330db/pom.xml b/delta-lake/delta-spark330db/pom.xml index 54ea4dcda29..e5f60afb125 100644 --- a/delta-lake/delta-spark330db/pom.xml +++ b/delta-lake/delta-spark330db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.1 + 24.12.0 ../../shim-deps/pom.xml rapids-4-spark-delta-spark330db_2.12 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-spark330db diff --git a/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala b/delta-lake/delta-spark330db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala similarity index 100% rename from delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala rename to delta-lake/delta-spark330db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala diff --git a/delta-lake/delta-spark332db/pom.xml b/delta-lake/delta-spark332db/pom.xml index 66fd0ec856b..102c91daf82 100644 --- a/delta-lake/delta-spark332db/pom.xml +++ b/delta-lake/delta-spark332db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.1 + 24.12.0 ../../shim-deps/pom.xml rapids-4-spark-delta-spark332db_2.12 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-spark332db diff --git a/delta-lake/delta-spark332db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala b/delta-lake/delta-spark332db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala new file mode 100644 index 00000000000..b6e9e11946d --- /dev/null +++ b/delta-lake/delta-spark332db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+ * + * This file was derived from OptimisticTransaction.scala and TransactionalWrite.scala + * in the Delta Lake project at https://github.com/delta-io/delta. + * + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.databricks.sql.transaction.tahoe.rapids + +import com.databricks.sql.transaction.tahoe._ +import com.databricks.sql.transaction.tahoe.actions.FileAction +import com.databricks.sql.transaction.tahoe.constraints.{Constraint, DeltaInvariantCheckerExec} +import com.databricks.sql.transaction.tahoe.files.TahoeBatchFileIndex +import com.databricks.sql.transaction.tahoe.metering.DeltaLogging +import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf +import com.nvidia.spark.rapids._ + +import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, NamedExpression} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.rapids.GpuShuffleEnv +import org.apache.spark.sql.rapids.GpuV1WriteUtils.GpuEmpty2Null +import org.apache.spark.sql.rapids.delta.{DeltaShufflePartitionsUtil, GpuOptimizeWriteExchangeExec, OptimizeWriteExchangeExec} +import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.util.Clock + +/** + * Used to perform a set of reads in a transaction and then commit a set of updates to the + * state of the log. All reads from the DeltaLog, MUST go through this instance rather + * than directly to the DeltaLog otherwise they will not be check for logical conflicts + * with concurrent updates. + * + * This class is not thread-safe. + * + * @param deltaLog The Delta Log for the table this transaction is modifying. + * @param snapshot The snapshot that this transaction is reading at. + * @param rapidsConf RAPIDS Accelerator config settings. 
+ */ +abstract class GpuOptimisticTransactionBase + (deltaLog: DeltaLog, snapshot: Snapshot, val rapidsConf: RapidsConf) + (implicit clock: Clock) + extends OptimisticTransaction(deltaLog, snapshot)(clock) + with DeltaLogging { + + /** + * Adds checking of constraints on the table + * @param plan Plan to generate the table to check against constraints + * @param constraints Constraints to check on the table + * @return GPU columnar plan to execute + */ + protected def addInvariantChecks(plan: SparkPlan, constraints: Seq[Constraint]): SparkPlan = { + val cpuInvariants = + DeltaInvariantCheckerExec.buildInvariantChecks(plan.output, constraints, plan.session) + GpuCheckDeltaInvariant.maybeConvertToGpu(cpuInvariants, rapidsConf) match { + case Some(gpuInvariants) => + val gpuPlan = convertToGpu(plan) + GpuDeltaInvariantCheckerExec(gpuPlan, gpuInvariants) + case None => + val cpuPlan = convertToCpu(plan) + DeltaInvariantCheckerExec(cpuPlan, constraints) + } + } + + /** GPU version of convertEmptyToNullIfNeeded */ + private def gpuConvertEmptyToNullIfNeeded( + plan: GpuExec, + partCols: Seq[Attribute], + constraints: Seq[Constraint]): SparkPlan = { + if (!spark.conf.get(DeltaSQLConf.CONVERT_EMPTY_TO_NULL_FOR_STRING_PARTITION_COL)) { + return plan + } + // No need to convert if there are no constraints. The empty strings will be converted later by + // FileFormatWriter and FileFormatDataWriter. Note that we might still do unnecessary convert + // here as the constraints might not be related to the string partition columns. A precise + // check will need to walk the constraints to see if such columns are really involved. It + // doesn't seem to worth the effort. + if (constraints.isEmpty) return plan + + val partSet = AttributeSet(partCols) + var needConvert = false + val projectList: Seq[NamedExpression] = plan.output.map { + case p if partSet.contains(p) && p.dataType == StringType => + needConvert = true + GpuAlias(GpuEmpty2Null(p), p.name)() + case attr => attr + } + if (needConvert) GpuProjectExec(projectList.toList, plan) else plan + } + + /** + * If there is any string partition column and there are constraints defined, add a projection to + * convert empty string to null for that column. The empty strings will be converted to null + * eventually even without this convert, but we want to do this earlier before check constraints + * so that empty strings are correctly rejected. Note that this should not cause the downstream + * logic in `FileFormatWriter` to add duplicate conversions because the logic there checks the + * partition column using the original plan's output. When the plan is modified with additional + * projections, the partition column check won't match and will not add more conversion. + * + * @param plan The original SparkPlan. + * @param partCols The partition columns. + * @param constraints The defined constraints. + * @return A SparkPlan potentially modified with an additional projection on top of `plan` + */ + override def convertEmptyToNullIfNeeded( + plan: SparkPlan, + partCols: Seq[Attribute], + constraints: Seq[Constraint]): SparkPlan = { + // Reuse the CPU implementation if the plan ends up on the CPU, otherwise do the + // equivalent on the GPU. 
+ plan match { + case g: GpuExec => gpuConvertEmptyToNullIfNeeded(g, partCols, constraints) + case _ => super.convertEmptyToNullIfNeeded(plan, partCols, constraints) + } + } + + override def writeFiles( + inputData: Dataset[_], + additionalConstraints: Seq[Constraint]): Seq[FileAction] = { + writeFiles(inputData, None, additionalConstraints) + } + + protected def applyOptimizeWriteIfNeeded( + spark: SparkSession, + physicalPlan: SparkPlan, + partitionSchema: StructType, + isOptimize: Boolean): SparkPlan = { + val optimizeWriteEnabled = !isOptimize && + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_WRITE_ENABLED) + .orElse(DeltaConfigs.OPTIMIZE_WRITE.fromMetaData(metadata)).getOrElse(false) + if (optimizeWriteEnabled) { + val planWithoutTopRepartition = + DeltaShufflePartitionsUtil.removeTopRepartition(physicalPlan) + val partitioning = DeltaShufflePartitionsUtil.partitioningForRebalance( + physicalPlan.output, partitionSchema, spark.sessionState.conf.numShufflePartitions) + planWithoutTopRepartition match { + case p: GpuExec => + val partMeta = GpuOverrides.wrapPart(partitioning, rapidsConf, None) + partMeta.tagForGpu() + if (partMeta.canThisBeReplaced) { + val plan = GpuOptimizeWriteExchangeExec(partMeta.convertToGpu(), p) + if (GpuShuffleEnv.useGPUShuffle(rapidsConf)) { + GpuCoalesceBatches(plan, TargetSize(rapidsConf.gpuTargetBatchSizeBytes)) + } else { + GpuShuffleCoalesceExec(plan, rapidsConf.gpuTargetBatchSizeBytes) + } + } else { + GpuColumnarToRowExec(OptimizeWriteExchangeExec(partitioning, p)) + } + case p => + OptimizeWriteExchangeExec(partitioning, p) + } + } else { + physicalPlan + } + } + + protected def isOptimizeCommand(plan: LogicalPlan): Boolean = { + val leaves = plan.collectLeaves() + leaves.size == 1 && leaves.head.collect { + case LogicalRelation(HadoopFsRelation( + index: TahoeBatchFileIndex, _, _, _, _, _), _, _, _) => + index.actionType.equals("Optimize") + }.headOption.getOrElse(false) + } + + protected def convertToCpu(plan: SparkPlan): SparkPlan = plan match { + case GpuRowToColumnarExec(p, _) => p + case p: GpuExec => GpuColumnarToRowExec(p) + case p => p + } + + protected def convertToGpu(plan: SparkPlan): SparkPlan = plan match { + case GpuColumnarToRowExec(p, _) => p + case p: GpuExec => p + case p => GpuRowToColumnarExec(p, TargetSize(rapidsConf.gpuTargetBatchSizeBytes)) + } +} diff --git a/delta-lake/delta-spark341db/pom.xml b/delta-lake/delta-spark341db/pom.xml index eff6346fe5f..39e7c0b2dd4 100644 --- a/delta-lake/delta-spark341db/pom.xml +++ b/delta-lake/delta-spark341db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.1 + 24.12.0 ../../shim-deps/pom.xml rapids-4-spark-delta-spark341db_2.12 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 false diff --git a/delta-lake/delta-spark341db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala b/delta-lake/delta-spark341db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala new file mode 100644 index 00000000000..b6e9e11946d --- /dev/null +++ b/delta-lake/delta-spark341db/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+ * + * This file was derived from OptimisticTransaction.scala and TransactionalWrite.scala + * in the Delta Lake project at https://github.com/delta-io/delta. + * + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.databricks.sql.transaction.tahoe.rapids + +import com.databricks.sql.transaction.tahoe._ +import com.databricks.sql.transaction.tahoe.actions.FileAction +import com.databricks.sql.transaction.tahoe.constraints.{Constraint, DeltaInvariantCheckerExec} +import com.databricks.sql.transaction.tahoe.files.TahoeBatchFileIndex +import com.databricks.sql.transaction.tahoe.metering.DeltaLogging +import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf +import com.nvidia.spark.rapids._ + +import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, NamedExpression} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.rapids.GpuShuffleEnv +import org.apache.spark.sql.rapids.GpuV1WriteUtils.GpuEmpty2Null +import org.apache.spark.sql.rapids.delta.{DeltaShufflePartitionsUtil, GpuOptimizeWriteExchangeExec, OptimizeWriteExchangeExec} +import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.util.Clock + +/** + * Used to perform a set of reads in a transaction and then commit a set of updates to the + * state of the log. All reads from the DeltaLog, MUST go through this instance rather + * than directly to the DeltaLog otherwise they will not be check for logical conflicts + * with concurrent updates. + * + * This class is not thread-safe. + * + * @param deltaLog The Delta Log for the table this transaction is modifying. + * @param snapshot The snapshot that this transaction is reading at. + * @param rapidsConf RAPIDS Accelerator config settings. 
+ */ +abstract class GpuOptimisticTransactionBase + (deltaLog: DeltaLog, snapshot: Snapshot, val rapidsConf: RapidsConf) + (implicit clock: Clock) + extends OptimisticTransaction(deltaLog, snapshot)(clock) + with DeltaLogging { + + /** + * Adds checking of constraints on the table + * @param plan Plan to generate the table to check against constraints + * @param constraints Constraints to check on the table + * @return GPU columnar plan to execute + */ + protected def addInvariantChecks(plan: SparkPlan, constraints: Seq[Constraint]): SparkPlan = { + val cpuInvariants = + DeltaInvariantCheckerExec.buildInvariantChecks(plan.output, constraints, plan.session) + GpuCheckDeltaInvariant.maybeConvertToGpu(cpuInvariants, rapidsConf) match { + case Some(gpuInvariants) => + val gpuPlan = convertToGpu(plan) + GpuDeltaInvariantCheckerExec(gpuPlan, gpuInvariants) + case None => + val cpuPlan = convertToCpu(plan) + DeltaInvariantCheckerExec(cpuPlan, constraints) + } + } + + /** GPU version of convertEmptyToNullIfNeeded */ + private def gpuConvertEmptyToNullIfNeeded( + plan: GpuExec, + partCols: Seq[Attribute], + constraints: Seq[Constraint]): SparkPlan = { + if (!spark.conf.get(DeltaSQLConf.CONVERT_EMPTY_TO_NULL_FOR_STRING_PARTITION_COL)) { + return plan + } + // No need to convert if there are no constraints. The empty strings will be converted later by + // FileFormatWriter and FileFormatDataWriter. Note that we might still do unnecessary convert + // here as the constraints might not be related to the string partition columns. A precise + // check will need to walk the constraints to see if such columns are really involved. It + // doesn't seem to worth the effort. + if (constraints.isEmpty) return plan + + val partSet = AttributeSet(partCols) + var needConvert = false + val projectList: Seq[NamedExpression] = plan.output.map { + case p if partSet.contains(p) && p.dataType == StringType => + needConvert = true + GpuAlias(GpuEmpty2Null(p), p.name)() + case attr => attr + } + if (needConvert) GpuProjectExec(projectList.toList, plan) else plan + } + + /** + * If there is any string partition column and there are constraints defined, add a projection to + * convert empty string to null for that column. The empty strings will be converted to null + * eventually even without this convert, but we want to do this earlier before check constraints + * so that empty strings are correctly rejected. Note that this should not cause the downstream + * logic in `FileFormatWriter` to add duplicate conversions because the logic there checks the + * partition column using the original plan's output. When the plan is modified with additional + * projections, the partition column check won't match and will not add more conversion. + * + * @param plan The original SparkPlan. + * @param partCols The partition columns. + * @param constraints The defined constraints. + * @return A SparkPlan potentially modified with an additional projection on top of `plan` + */ + override def convertEmptyToNullIfNeeded( + plan: SparkPlan, + partCols: Seq[Attribute], + constraints: Seq[Constraint]): SparkPlan = { + // Reuse the CPU implementation if the plan ends up on the CPU, otherwise do the + // equivalent on the GPU. 
+ plan match { + case g: GpuExec => gpuConvertEmptyToNullIfNeeded(g, partCols, constraints) + case _ => super.convertEmptyToNullIfNeeded(plan, partCols, constraints) + } + } + + override def writeFiles( + inputData: Dataset[_], + additionalConstraints: Seq[Constraint]): Seq[FileAction] = { + writeFiles(inputData, None, additionalConstraints) + } + + protected def applyOptimizeWriteIfNeeded( + spark: SparkSession, + physicalPlan: SparkPlan, + partitionSchema: StructType, + isOptimize: Boolean): SparkPlan = { + val optimizeWriteEnabled = !isOptimize && + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_WRITE_ENABLED) + .orElse(DeltaConfigs.OPTIMIZE_WRITE.fromMetaData(metadata)).getOrElse(false) + if (optimizeWriteEnabled) { + val planWithoutTopRepartition = + DeltaShufflePartitionsUtil.removeTopRepartition(physicalPlan) + val partitioning = DeltaShufflePartitionsUtil.partitioningForRebalance( + physicalPlan.output, partitionSchema, spark.sessionState.conf.numShufflePartitions) + planWithoutTopRepartition match { + case p: GpuExec => + val partMeta = GpuOverrides.wrapPart(partitioning, rapidsConf, None) + partMeta.tagForGpu() + if (partMeta.canThisBeReplaced) { + val plan = GpuOptimizeWriteExchangeExec(partMeta.convertToGpu(), p) + if (GpuShuffleEnv.useGPUShuffle(rapidsConf)) { + GpuCoalesceBatches(plan, TargetSize(rapidsConf.gpuTargetBatchSizeBytes)) + } else { + GpuShuffleCoalesceExec(plan, rapidsConf.gpuTargetBatchSizeBytes) + } + } else { + GpuColumnarToRowExec(OptimizeWriteExchangeExec(partitioning, p)) + } + case p => + OptimizeWriteExchangeExec(partitioning, p) + } + } else { + physicalPlan + } + } + + protected def isOptimizeCommand(plan: LogicalPlan): Boolean = { + val leaves = plan.collectLeaves() + leaves.size == 1 && leaves.head.collect { + case LogicalRelation(HadoopFsRelation( + index: TahoeBatchFileIndex, _, _, _, _, _), _, _, _) => + index.actionType.equals("Optimize") + }.headOption.getOrElse(false) + } + + protected def convertToCpu(plan: SparkPlan): SparkPlan = plan match { + case GpuRowToColumnarExec(p, _) => p + case p: GpuExec => GpuColumnarToRowExec(p) + case p => p + } + + protected def convertToGpu(plan: SparkPlan): SparkPlan = plan match { + case GpuColumnarToRowExec(p, _) => p + case p: GpuExec => p + case p => GpuRowToColumnarExec(p, TargetSize(rapidsConf.gpuTargetBatchSizeBytes)) + } +} diff --git a/delta-lake/delta-spark341db/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala b/delta-lake/delta-spark341db/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala index e109b81f1e5..088a2a788da 100644 --- a/delta-lake/delta-spark341db/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala +++ b/delta-lake/delta-spark341db/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala @@ -113,7 +113,7 @@ object GpuDeltaParquetFileFormat { meta.willNotWorkOnGpu( s"reading metadata column $IS_ROW_DELETED_COLUMN_NAME is not supported") } - if (format.hasDeletionVectorMap()) { + if (format.hasDeletionVectorMap) { meta.willNotWorkOnGpu("deletion vectors are not supported") } } diff --git a/delta-lake/delta-spark350db143/pom.xml b/delta-lake/delta-spark350db143/pom.xml new file mode 100644 index 00000000000..333a035a680 --- /dev/null +++ b/delta-lake/delta-spark350db143/pom.xml @@ -0,0 +1,85 @@ + + + + 4.0.0 + + + com.nvidia + rapids-4-spark-shim-deps-parent_2.12 + 24.12.0 + ../../shim-deps/pom.xml + + + rapids-4-spark-delta-spark350db143_2.12 + RAPIDS Accelerator for Apache Spark 
Databricks 13.3 Delta Lake Support + Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark + 24.12.0 + + + false + **/* + package + + + + + org.roaringbitmap + RoaringBitmap + + + com.nvidia + rapids-4-spark-sql_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + provided + + + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-common-sources + generate-sources + + add-source + + + + ${project.basedir}/../common/src/main/scala + ${project.basedir}/../common/src/main/databricks/scala + + + + + + + net.alchim31.maven + scala-maven-plugin + + + org.apache.rat + apache-rat-plugin + + + + diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuCreateDeltaTableCommand.scala b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuCreateDeltaTableCommand.scala new file mode 100644 index 00000000000..ace7a13966c --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuCreateDeltaTableCommand.scala @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * This file was derived from CreateDeltaTableCommand.scala in the + * Delta Lake project at https://github.com/delta-io/delta. + * + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.databricks.sql.transaction.tahoe.rapids + +import com.databricks.sql.transaction.tahoe._ +import com.databricks.sql.transaction.tahoe.actions.Metadata +import com.databricks.sql.transaction.tahoe.commands.{TableCreationModes, WriteIntoDelta} +import com.databricks.sql.transaction.tahoe.metering.DeltaLogging +import com.databricks.sql.transaction.tahoe.schema.SchemaUtils +import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf +import com.nvidia.spark.rapids.RapidsConf +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} + +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog.Identifier +import org.apache.spark.sql.execution.command.{LeafRunnableCommand, RunnableCommand} +import org.apache.spark.sql.types.StructType + +/** + * Single entry point for all write or declaration operations for Delta tables accessed through + * the table name. + * + * @param table The table identifier for the Delta table + * @param existingTableOpt The existing table for the same identifier if exists + * @param mode The save mode when writing data. Relevant when the query is empty or set to Ignore + * with `CREATE TABLE IF NOT EXISTS`. + * @param query The query to commit into the Delta table if it exist. 
This can come from + * - CTAS + * - saveAsTable + */ +case class GpuCreateDeltaTableCommand( + table: CatalogTable, + existingTableOpt: Option[CatalogTable], + mode: SaveMode, + query: Option[LogicalPlan], + operation: TableCreationModes.CreationMode = TableCreationModes.Create, + tableByPath: Boolean = false, + override val output: Seq[Attribute] = Nil)(@transient rapidsConf: RapidsConf) + extends LeafRunnableCommand + with DeltaLogging { + + override def otherCopyArgs: Seq[AnyRef] = Seq(rapidsConf) + + override def run(sparkSession: SparkSession): Seq[Row] = { + val table = this.table + + assert(table.tableType != CatalogTableType.VIEW) + assert(table.identifier.database.isDefined, "Database should've been fixed at analysis") + // There is a subtle race condition here, where the table can be created by someone else + // while this command is running. Nothing we can do about that though :( + val tableExists = existingTableOpt.isDefined + if (mode == SaveMode.Ignore && tableExists) { + // Early exit on ignore + return Nil + } else if (mode == SaveMode.ErrorIfExists && tableExists) { + throw DeltaErrors.tableAlreadyExists(table) + } + + val tableWithLocation = if (tableExists) { + val existingTable = existingTableOpt.get + table.storage.locationUri match { + case Some(location) if location.getPath != existingTable.location.getPath => + throw DeltaErrors.tableLocationMismatch(table, existingTable) + case _ => + } + table.copy( + storage = existingTable.storage, + tableType = existingTable.tableType) + } else if (table.storage.locationUri.isEmpty) { + // We are defining a new managed table + assert(table.tableType == CatalogTableType.MANAGED) + val loc = sparkSession.sessionState.catalog.defaultTablePath(table.identifier) + table.copy(storage = table.storage.copy(locationUri = Some(loc))) + } else { + // 1. We are defining a new external table + // 2. It's a managed table which already has the location populated. This can happen in DSV2 + // CTAS flow. + table + } + + val isManagedTable = tableWithLocation.tableType == CatalogTableType.MANAGED + val tableLocation = new Path(tableWithLocation.location) + val gpuDeltaLog = GpuDeltaLog.forTable(sparkSession, tableLocation, rapidsConf) + val hadoopConf = gpuDeltaLog.deltaLog.newDeltaHadoopConf() + val fs = tableLocation.getFileSystem(hadoopConf) + val options = new DeltaOptions(table.storage.properties, sparkSession.sessionState.conf) + var result: Seq[Row] = Nil + + recordDeltaOperation(gpuDeltaLog.deltaLog, "delta.ddl.createTable") { + val txn = gpuDeltaLog.startTransaction() + val opStartTs = System.currentTimeMillis() + if (query.isDefined) { + // If the mode is Ignore or ErrorIfExists, the table must not exist, or we would return + // earlier. And the data should not exist either, to match the behavior of + // Ignore/ErrorIfExists mode. This means the table path should not exist or is empty. + if (mode == SaveMode.Ignore || mode == SaveMode.ErrorIfExists) { + assert(!tableExists) + // We may have failed a previous write. The retry should still succeed even if we have + // garbage data + if (txn.readVersion > -1 || !fs.exists(gpuDeltaLog.deltaLog.logPath)) { + assertPathEmpty(hadoopConf, tableWithLocation) + } + } + // We are either appending/overwriting with saveAsTable or creating a new table with CTAS or + // we are creating a table as part of a RunnableCommand + query.get match { + case writer: WriteIntoDelta => + // In the V2 Writer, methods like "replace" and "createOrReplace" implicitly mean that + // the metadata should be changed. 
This wasn't the behavior for DataFrameWriterV1. + if (!isV1Writer) { + replaceMetadataIfNecessary( + txn, tableWithLocation, options, writer.data.schema.asNullable) + } + val actions = writer.write(txn, sparkSession) + val op = getOperation(txn.metadata, isManagedTable, Some(options)) + txn.commit(actions, op) + case cmd: RunnableCommand => + result = cmd.run(sparkSession) + case other => + // When using V1 APIs, the `other` plan is not yet optimized, therefore, it is safe + // to once again go through analysis + val data = Dataset.ofRows(sparkSession, other) + + // In the V2 Writer, methods like "replace" and "createOrReplace" implicitly mean that + // the metadata should be changed. This wasn't the behavior for DataFrameWriterV1. + if (!isV1Writer) { + replaceMetadataIfNecessary( + txn, tableWithLocation, options, other.schema.asNullable) + } + + val actions = WriteIntoDelta( + deltaLog = gpuDeltaLog.deltaLog, + mode = mode, + options, + partitionColumns = table.partitionColumnNames, + configuration = tableWithLocation.properties + ("comment" -> table.comment.orNull), + data = data).write(txn, sparkSession) + + val op = getOperation(txn.metadata, isManagedTable, Some(options)) + txn.commit(actions, op) + } + } else { + def createTransactionLogOrVerify(): Unit = { + if (isManagedTable) { + // When creating a managed table, the table path should not exist or is empty, or + // users would be surprised to see the data, or see the data directory being dropped + // after the table is dropped. + assertPathEmpty(hadoopConf, tableWithLocation) + } + + // This is either a new table, or, we never defined the schema of the table. While it is + // unexpected that `txn.metadata.schema` to be empty when txn.readVersion >= 0, we still + // guard against it, in case of checkpoint corruption bugs. + val noExistingMetadata = txn.readVersion == -1 || txn.metadata.schema.isEmpty + if (noExistingMetadata) { + assertTableSchemaDefined(fs, tableLocation, tableWithLocation, txn, sparkSession) + assertPathEmpty(hadoopConf, tableWithLocation) + // This is a user provided schema. + // Doesn't come from a query, Follow nullability invariants. + val newMetadata = getProvidedMetadata(tableWithLocation, table.schema.json) + txn.updateMetadataForNewTable(newMetadata) + + val op = getOperation(newMetadata, isManagedTable, None) + txn.commit(Nil, op) + } else { + verifyTableMetadata(txn, tableWithLocation) + } + } + // We are defining a table using the Create or Replace Table statements. 
+ operation match { + case TableCreationModes.Create => + require(!tableExists, "Can't recreate a table when it exists") + createTransactionLogOrVerify() + + case TableCreationModes.CreateOrReplace if !tableExists => + // If the table doesn't exist, CREATE OR REPLACE must provide a schema + if (tableWithLocation.schema.isEmpty) { + throw DeltaErrors.schemaNotProvidedException + } + createTransactionLogOrVerify() + case _ => + // When the operation is a REPLACE or CREATE OR REPLACE, then the schema shouldn't be + // empty, since we'll use the entry to replace the schema + if (tableWithLocation.schema.isEmpty) { + throw DeltaErrors.schemaNotProvidedException + } + // We need to replace + replaceMetadataIfNecessary(txn, tableWithLocation, options, tableWithLocation.schema) + // Truncate the table + val operationTimestamp = System.currentTimeMillis() + val removes = txn.filterFiles().map(_.removeWithTimestamp(operationTimestamp)) + val op = getOperation(txn.metadata, isManagedTable, None) + txn.commit(removes, op) + } + } + + // We would have failed earlier on if we couldn't ignore the existence of the table + // In addition, we just might using saveAsTable to append to the table, so ignore the creation + // if it already exists. + // Note that someone may have dropped and recreated the table in a separate location in the + // meantime... Unfortunately we can't do anything there at the moment, because Hive sucks. + logInfo(s"Table is path-based table: $tableByPath. Update catalog with mode: $operation") + updateCatalog( + sparkSession, + tableWithLocation, + gpuDeltaLog.deltaLog.update(checkIfUpdatedSinceTs = Some(opStartTs)), + txn) + + result + } + } + + private def getProvidedMetadata(table: CatalogTable, schemaString: String): Metadata = { + Metadata( + description = table.comment.orNull, + schemaString = schemaString, + partitionColumns = table.partitionColumnNames, + configuration = table.properties, + createdTime = Some(System.currentTimeMillis())) + } + + private def assertPathEmpty( + hadoopConf: Configuration, + tableWithLocation: CatalogTable): Unit = { + val path = new Path(tableWithLocation.location) + val fs = path.getFileSystem(hadoopConf) + // Verify that the table location associated with CREATE TABLE doesn't have any data. Note that + // we intentionally diverge from this behavior w.r.t regular datasource tables (that silently + // overwrite any previous data) + if (fs.exists(path) && fs.listStatus(path).nonEmpty) { + throw DeltaErrors.createTableWithNonEmptyLocation( + tableWithLocation.identifier.toString, + tableWithLocation.location.toString) + } + } + + private def assertTableSchemaDefined( + fs: FileSystem, + path: Path, + table: CatalogTable, + txn: OptimisticTransaction, + sparkSession: SparkSession): Unit = { + // If we allow creating an empty schema table and indeed the table is new, we just need to + // make sure: + // 1. txn.readVersion == -1 to read a new table + // 2. for external tables: path must either doesn't exist or is completely empty + val allowCreatingTableWithEmptySchema = sparkSession.sessionState + .conf.getConf(DeltaSQLConf.DELTA_ALLOW_CREATE_EMPTY_SCHEMA_TABLE) && txn.readVersion == -1 + + // Users did not specify the schema. We expect the schema exists in Delta. 
+ if (table.schema.isEmpty) { + if (table.tableType == CatalogTableType.EXTERNAL) { + if (fs.exists(path) && fs.listStatus(path).nonEmpty) { + throw DeltaErrors.createExternalTableWithoutLogException( + path, table.identifier.quotedString, sparkSession) + } else { + if (allowCreatingTableWithEmptySchema) return + throw DeltaErrors.createExternalTableWithoutSchemaException( + path, table.identifier.quotedString, sparkSession) + } + } else { + if (allowCreatingTableWithEmptySchema) return + throw DeltaErrors.createManagedTableWithoutSchemaException( + table.identifier.quotedString, sparkSession) + } + } + } + + /** + * Verify against our transaction metadata that the user specified the right metadata for the + * table. + */ + private def verifyTableMetadata( + txn: OptimisticTransaction, + tableDesc: CatalogTable): Unit = { + val existingMetadata = txn.metadata + val path = new Path(tableDesc.location) + + // The delta log already exists. If they give any configuration, we'll make sure it all matches. + // Otherwise we'll just go with the metadata already present in the log. + // The schema compatibility checks will be made in `WriteIntoDelta` for CreateTable + // with a query + if (txn.readVersion > -1) { + if (tableDesc.schema.nonEmpty) { + // We check exact alignment on create table if everything is provided + // However, if in column mapping mode, we can safely ignore the related metadata fields in + // existing metadata because new table desc will not have related metadata assigned yet + val differences = SchemaUtils.reportDifferences( + DeltaColumnMapping.dropColumnMappingMetadata(existingMetadata.schema), + tableDesc.schema) + if (differences.nonEmpty) { + throw DeltaErrors.createTableWithDifferentSchemaException( + path, tableDesc.schema, existingMetadata.schema, differences) + } + } + + // If schema is specified, we must make sure the partitioning matches, even the partitioning + // is not specified. + if (tableDesc.schema.nonEmpty && + tableDesc.partitionColumnNames != existingMetadata.partitionColumns) { + throw DeltaErrors.createTableWithDifferentPartitioningException( + path, tableDesc.partitionColumnNames, existingMetadata.partitionColumns) + } + + if (tableDesc.properties.nonEmpty && tableDesc.properties != existingMetadata.configuration) { + throw DeltaErrors.createTableWithDifferentPropertiesException( + path, tableDesc.properties, existingMetadata.configuration) + } + } + } + + /** + * Based on the table creation operation, and parameters, we can resolve to different operations. + * A lot of this is needed for legacy reasons in Databricks Runtime. 
+ * @param metadata The table metadata, which we are creating or replacing + * @param isManagedTable Whether we are creating or replacing a managed table + * @param options Write options, if this was a CTAS/RTAS + */ + private def getOperation( + metadata: Metadata, + isManagedTable: Boolean, + options: Option[DeltaOptions]): DeltaOperations.Operation = operation match { + // This is legacy saveAsTable behavior in Databricks Runtime + case TableCreationModes.Create if existingTableOpt.isDefined && query.isDefined => + DeltaOperations.Write(mode, Option(table.partitionColumnNames), options.get.replaceWhere, + options.flatMap(_.userMetadata)) + + // DataSourceV2 table creation + // CREATE TABLE (non-DataFrameWriter API) doesn't have options syntax + // (userMetadata uses SQLConf in this case) + case TableCreationModes.Create => + DeltaOperations.CreateTable(metadata, isManagedTable, query.isDefined) + + // DataSourceV2 table replace + // REPLACE TABLE (non-DataFrameWriter API) doesn't have options syntax + // (userMetadata uses SQLConf in this case) + case TableCreationModes.Replace => + DeltaOperations.ReplaceTable(metadata, isManagedTable, orCreate = false, query.isDefined) + + // Legacy saveAsTable with Overwrite mode + case TableCreationModes.CreateOrReplace if options.exists(_.replaceWhere.isDefined) => + DeltaOperations.Write(mode, Option(table.partitionColumnNames), options.get.replaceWhere, + options.flatMap(_.userMetadata)) + + // New DataSourceV2 saveAsTable with overwrite mode behavior + case TableCreationModes.CreateOrReplace => + DeltaOperations.ReplaceTable(metadata, isManagedTable, orCreate = true, query.isDefined, + options.flatMap(_.userMetadata)) + } + + /** + * Similar to getOperation, here we disambiguate the catalog alterations we need to do based + * on the table operation, and whether we have reached here through legacy code or DataSourceV2 + * code paths. + */ + private def updateCatalog( + spark: SparkSession, + table: CatalogTable, + snapshot: Snapshot, + txn: OptimisticTransaction): Unit = { + val cleaned = cleanupTableDefinition(table, snapshot) + operation match { + case _ if tableByPath => // do nothing with the metastore if this is by path + case TableCreationModes.Create => + spark.sessionState.catalog.createTable( + cleaned, + ignoreIfExists = existingTableOpt.isDefined, + validateLocation = false) + case TableCreationModes.Replace | TableCreationModes.CreateOrReplace + if existingTableOpt.isDefined => + spark.sessionState.catalog.alterTable(table) + case TableCreationModes.Replace => + val ident = Identifier.of(table.identifier.database.toArray, table.identifier.table) + throw DeltaErrors.cannotReplaceMissingTableException(ident) + case TableCreationModes.CreateOrReplace => + spark.sessionState.catalog.createTable( + cleaned, + ignoreIfExists = false, + validateLocation = false) + } + } + + /** Clean up the information we pass on to store in the catalog. 
*/ + private def cleanupTableDefinition(table: CatalogTable, snapshot: Snapshot): CatalogTable = { + // These actually have no effect on the usability of Delta, but feature flagging legacy + // behavior for now + val storageProps = if (conf.getConf(DeltaSQLConf.DELTA_LEGACY_STORE_WRITER_OPTIONS_AS_PROPS)) { + // Legacy behavior + table.storage + } else { + table.storage.copy(properties = Map.empty) + } + + table.copy( + schema = new StructType(), + properties = Map.empty, + partitionColumnNames = Nil, + // Remove write specific options when updating the catalog + storage = storageProps, + tracksPartitionsInCatalog = true) + } + + /** + * With DataFrameWriterV2, methods like `replace()` or `createOrReplace()` mean that the + * metadata of the table should be replaced. If overwriteSchema=false is provided with these + * methods, then we will verify that the metadata match exactly. + */ + private def replaceMetadataIfNecessary( + txn: OptimisticTransaction, + tableDesc: CatalogTable, + options: DeltaOptions, + schema: StructType): Unit = { + val isReplace = (operation == TableCreationModes.CreateOrReplace || + operation == TableCreationModes.Replace) + // If a user explicitly specifies not to overwrite the schema, during a replace, we should + // tell them that it's not supported + val dontOverwriteSchema = options.options.contains(DeltaOptions.OVERWRITE_SCHEMA_OPTION) && + !options.canOverwriteSchema + if (isReplace && dontOverwriteSchema) { + throw DeltaErrors.illegalUsageException(DeltaOptions.OVERWRITE_SCHEMA_OPTION, "replacing") + } + if (txn.readVersion > -1L && isReplace && !dontOverwriteSchema) { + // When a table already exists, and we're using the DataFrameWriterV2 API to replace + // or createOrReplace a table, we blindly overwrite the metadata. + txn.updateMetadataForNewTable(getProvidedMetadata(table, schema.json)) + } + } + + /** + * Horrible hack to differentiate between DataFrameWriterV1 and V2 so that we can decide + * what to do with table metadata. In DataFrameWriterV1, mode("overwrite").saveAsTable, + * behaves as a CreateOrReplace table, but we have asked for "overwriteSchema" as an + * explicit option to overwrite partitioning or schema information. With DataFrameWriterV2, + * the behavior asked for by the user is clearer: .createOrReplace(), which means that we + * should overwrite schema and/or partitioning. Therefore we have this hack. + */ + private def isV1Writer: Boolean = { + Thread.currentThread().getStackTrace.exists(_.toString.contains( + classOf[DataFrameWriter[_]].getCanonicalName + ".")) + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeleteCommand.scala b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeleteCommand.scala new file mode 100644 index 00000000000..f49a42d2ed0 --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeleteCommand.scala @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * This file was derived from DeleteCommand.scala + * in the Delta Lake project at https://github.com/delta-io/delta. + * + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.databricks.sql.transaction.tahoe.rapids + +import com.databricks.sql.transaction.tahoe.{DeltaConfigs, DeltaLog, DeltaOperations, DeltaTableUtils, DeltaUDF, OptimisticTransaction} +import com.databricks.sql.transaction.tahoe.DeltaCommitTag._ +import com.databricks.sql.transaction.tahoe.RowTracking +import com.databricks.sql.transaction.tahoe.actions.{AddCDCFile, FileAction} +import com.databricks.sql.transaction.tahoe.commands.{DeleteCommandMetrics, DeleteMetric, DeltaCommand, DMLUtils} +import com.databricks.sql.transaction.tahoe.commands.MergeIntoCommandBase.totalBytesAndDistinctPartitionValues +import com.databricks.sql.transaction.tahoe.files.TahoeBatchFileIndex +import com.databricks.sql.transaction.tahoe.rapids.GpuDeleteCommand.{rewritingFilesMsg, FINDING_TOUCHED_FILES_MSG} +import com.nvidia.spark.rapids.delta.GpuDeltaMetricUpdateUDF + +import org.apache.spark.SparkContext +import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, EqualNullSafe, Expression, If, Literal, Not} +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.execution.command.LeafRunnableCommand +import org.apache.spark.sql.execution.metric.SQLMetrics +import org.apache.spark.sql.functions.input_file_name +import org.apache.spark.sql.types.LongType + +/** + * GPU version of Delta Lake DeleteCommand. + * + * Performs a Delete based on the search condition + * + * Algorithm: + * 1) Scan all the files and determine which files have + * the rows that need to be deleted. + * 2) Traverse the affected files and rebuild the touched files. + * 3) Use the Delta protocol to atomically write the remaining rows to new files and remove + * the affected files that are identified in step 1. 
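Editorial sketch (not part of this change): the heart of step 2 above is that a touched file is rebuilt by keeping only the rows for which the DELETE condition does not hold, using a null-safe comparison so that rows where the condition evaluates to null survive, mirroring the Not(EqualNullSafe(cond, TrueLiteral)) filter used later in rewriteFiles. The self-contained Spark snippet below shows that filter with hypothetical table and column names.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, not}

object DeleteRewriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("delete-rewrite").getOrCreate()
    import spark.implicits._

    // Rows of one "touched" file; a null status leaves the DELETE condition undecided.
    val touchedFileRows = Seq(
      (1, Option("obsolete")), (2, Option("active")), (3, Option.empty[String])
    ).toDF("id", "status")

    // DELETE WHERE status = 'obsolete': keep every row whose condition is not null-safe-equal
    // to true, so the row with a null status survives rather than being deleted.
    val deleteCond = col("status") === "obsolete"
    val survivors = touchedFileRows.where(not(deleteCond <=> true))
    survivors.show() // rows 2 and 3 remain; they would be written into the replacement file
    spark.stop()
  }
}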
+ */ +case class GpuDeleteCommand( + gpuDeltaLog: GpuDeltaLog, + target: LogicalPlan, + condition: Option[Expression]) + extends LeafRunnableCommand with DeltaCommand with DeleteCommandMetrics { + + override def innerChildren: Seq[QueryPlan[_]] = Seq(target) + + override val output: Seq[Attribute] = Seq(AttributeReference("num_affected_rows", LongType)()) + + @transient private lazy val sc: SparkContext = SparkContext.getOrCreate() + + // DeleteCommandMetrics does not include deletion vector metrics, so add them here because + // the commit command needs to collect these metrics for inclusion in the delta log event + override lazy val metrics = createMetrics ++ Map( + "numDeletionVectorsAdded" -> SQLMetrics.createMetric(sc, "number of deletion vectors added."), + "numDeletionVectorsRemoved" -> + SQLMetrics.createMetric(sc, "number of deletion vectors removed."), + "numDeletionVectorsUpdated" -> + SQLMetrics.createMetric(sc, "number of deletion vectors updated.") + ) + + final override def run(sparkSession: SparkSession): Seq[Row] = { + val deltaLog = gpuDeltaLog.deltaLog + recordDeltaOperation(gpuDeltaLog.deltaLog, "delta.dml.delete") { + gpuDeltaLog.withNewTransaction { txn => + DeltaLog.assertRemovable(txn.snapshot) + val deleteCommitTags = performDelete(sparkSession, deltaLog, txn) + val deleteActions = deleteCommitTags.actions + if (deleteActions.nonEmpty) { + txn.commitIfNeeded(deleteActions, DeltaOperations.Delete(condition.toSeq), + deleteCommitTags.stringTags) + } + } + // Re-cache all cached plans(including this relation itself, if it's cached) that refer to + // this data source relation. + sparkSession.sharedState.cacheManager.recacheByPlan(sparkSession, target) + } + + // Adjust for deletes at partition boundaries. Deletes at partition boundaries is a metadata + // operation, therefore we don't actually have any information around how many rows were deleted + // While this info may exist in the file statistics, it's not guaranteed that we have these + // statistics. 
To avoid any performance regressions, we currently just return a -1 in such cases + if (metrics("numRemovedFiles").value > 0 && metrics("numDeletedRows").value == 0) { + Seq(Row(-1L)) + } else { + Seq(Row(metrics("numDeletedRows").value)) + } + } + + def performDelete( + sparkSession: SparkSession, + deltaLog: DeltaLog, + txn: OptimisticTransaction): DMLUtils.TaggedCommitData = { + import com.databricks.sql.transaction.tahoe.implicits._ + + var numRemovedFiles: Long = 0 + var numAddedFiles: Long = 0 + var numAddedChangeFiles: Long = 0 + var scanTimeMs: Long = 0 + var rewriteTimeMs: Long = 0 + var numBytesAdded: Long = 0 + var changeFileBytes: Long = 0 + var numBytesRemoved: Long = 0 + var numFilesBeforeSkipping: Long = 0 + var numBytesBeforeSkipping: Long = 0 + var numFilesAfterSkipping: Long = 0 + var numBytesAfterSkipping: Long = 0 + var numPartitionsAfterSkipping: Option[Long] = None + var numPartitionsRemovedFrom: Option[Long] = None + var numPartitionsAddedTo: Option[Long] = None + var numDeletedRows: Option[Long] = None + var numCopiedRows: Option[Long] = None + + val startTime = System.nanoTime() + val numFilesTotal = txn.snapshot.numOfFiles + + val deleteActions: Seq[FileAction] = condition match { + case None => + // Case 1: Delete the whole table if the condition is true + val allFiles = txn.filterFiles(Nil) + + numRemovedFiles = allFiles.size + scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 + val (numBytes, numPartitions) = totalBytesAndDistinctPartitionValues(allFiles) + numBytesRemoved = numBytes + numFilesBeforeSkipping = numRemovedFiles + numBytesBeforeSkipping = numBytes + numFilesAfterSkipping = numRemovedFiles + numBytesAfterSkipping = numBytes + if (txn.metadata.partitionColumns.nonEmpty) { + numPartitionsAfterSkipping = Some(numPartitions) + numPartitionsRemovedFrom = Some(numPartitions) + numPartitionsAddedTo = Some(0) + } + val operationTimestamp = System.currentTimeMillis() + allFiles.map(_.removeWithTimestamp(operationTimestamp)) + case Some(cond) => + val (metadataPredicates, otherPredicates) = + DeltaTableUtils.splitMetadataAndDataPredicates( + cond, txn.metadata.partitionColumns, sparkSession) + + numFilesBeforeSkipping = txn.snapshot.numOfFiles + numBytesBeforeSkipping = txn.snapshot.sizeInBytes + + if (otherPredicates.isEmpty) { + // Case 2: The condition can be evaluated using metadata only. + // Delete a set of files without the need of scanning any data files. + val operationTimestamp = System.currentTimeMillis() + val candidateFiles = txn.filterFiles(metadataPredicates) + + scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 + numRemovedFiles = candidateFiles.size + numBytesRemoved = candidateFiles.map(_.size).sum + numFilesAfterSkipping = candidateFiles.size + val (numCandidateBytes, numCandidatePartitions) = + totalBytesAndDistinctPartitionValues(candidateFiles) + numBytesAfterSkipping = numCandidateBytes + if (txn.metadata.partitionColumns.nonEmpty) { + numPartitionsAfterSkipping = Some(numCandidatePartitions) + numPartitionsRemovedFrom = Some(numCandidatePartitions) + numPartitionsAddedTo = Some(0) + } + candidateFiles.map(_.removeWithTimestamp(operationTimestamp)) + } else { + // Case 3: Delete the rows based on the condition. 
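Editorial sketch (not part of this change): the case split above hinges on whether the DELETE condition references only partition columns (Case 2, answered purely from metadata) or also data columns (Case 3, which forces a scan and rewrite). The toy snippet below models that decision with plain Scala stand-ins for Catalyst predicates; the Predicate class and column names are hypothetical.

object DeletePredicateSplitSketch {
  // A predicate is modelled only by the column names it references.
  final case class Predicate(sql: String, references: Set[String])

  // Predicates that touch only partition columns can be answered from file metadata (Case 2);
  // anything else forces the scan-and-rewrite path (Case 3).
  def split(predicates: Seq[Predicate], partitionCols: Set[String])
      : (Seq[Predicate], Seq[Predicate]) =
    predicates.partition(_.references.subsetOf(partitionCols))

  def main(args: Array[String]): Unit = {
    val preds = Seq(
      Predicate("date = '2024-12-01'", Set("date")),
      Predicate("id % 2 = 0", Set("id")))
    val (metadataOnly, needsScan) = split(preds, partitionCols = Set("date"))
    println(s"metadata-only: $metadataOnly") // handled like Case 2
    println(s"needs rewrite: $needsScan")    // drives Case 3
  }
}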
+ val candidateFiles = txn.filterFiles(metadataPredicates ++ otherPredicates) + + numFilesAfterSkipping = candidateFiles.size + val (numCandidateBytes, numCandidatePartitions) = + totalBytesAndDistinctPartitionValues(candidateFiles) + numBytesAfterSkipping = numCandidateBytes + if (txn.metadata.partitionColumns.nonEmpty) { + numPartitionsAfterSkipping = Some(numCandidatePartitions) + } + + val nameToAddFileMap = generateCandidateFileMap(deltaLog.dataPath, candidateFiles) + + val fileIndex = new TahoeBatchFileIndex( + sparkSession, "delete", candidateFiles, deltaLog, deltaLog.dataPath, txn.snapshot) + // Keep everything from the resolved target except a new TahoeFileIndex + // that only involves the affected files instead of all files. + val newTarget = DeltaTableUtils.replaceFileIndex(target, fileIndex) + val data = Dataset.ofRows(sparkSession, newTarget) + val deletedRowCount = metrics("numDeletedRows") + val deletedRowUdf = DeltaUDF.boolean { + new GpuDeltaMetricUpdateUDF(deletedRowCount) + }.asNondeterministic() + val filesToRewrite = + withStatusCode("DELTA", FINDING_TOUCHED_FILES_MSG) { + if (candidateFiles.isEmpty) { + Array.empty[String] + } else { + data.filter(new Column(cond)) + .select(input_file_name()) + .filter(deletedRowUdf()) + .distinct() + .as[String] + .collect() + } + } + + numRemovedFiles = filesToRewrite.length + scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 + if (filesToRewrite.isEmpty) { + // Case 3.1: no row matches and no delete will be triggered + if (txn.metadata.partitionColumns.nonEmpty) { + numPartitionsRemovedFrom = Some(0) + numPartitionsAddedTo = Some(0) + } + Nil + } else { + // Case 3.2: some files need an update to remove the deleted files + // Do the second pass and just read the affected files + val baseRelation = buildBaseRelation( + sparkSession, txn, "delete", deltaLog.dataPath, filesToRewrite, nameToAddFileMap) + // Keep everything from the resolved target except a new TahoeFileIndex + // that only involves the affected files instead of all files. 
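Editorial sketch (not part of this change): the "finding touched files" step above tags every row that matches the delete condition with input_file_name() and collects the distinct file names, so that only those files are rewritten. The self-contained snippet below reproduces that pattern on a throwaway parquet table; the path and column names are hypothetical.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, input_file_name}

object FindTouchedFilesSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("touched-files").getOrCreate()
    import spark.implicits._

    // Write a tiny table so the example is self-contained; the path is arbitrary.
    Seq((1, "obsolete"), (2, "active"), (3, "obsolete"))
      .toDF("id", "status")
      .write.mode("overwrite").parquet("/tmp/touched_files_demo")

    // Tag each row matching the DELETE predicate with the file it came from and
    // collect the distinct file names: only these files need to be rewritten.
    val touched: Array[String] = spark.read.parquet("/tmp/touched_files_demo")
      .filter(col("status") === "obsolete")
      .select(input_file_name())
      .distinct()
      .as[String]
      .collect()

    touched.foreach(println)
    spark.stop()
  }
}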
+ val newTarget = DeltaTableUtils.replaceFileIndex(target, baseRelation.location) + val targetDF = Dataset.ofRows(sparkSession, newTarget) + val filterCond = Not(EqualNullSafe(cond, Literal.TrueLiteral)) + val rewrittenActions = rewriteFiles(txn, targetDF, filterCond, filesToRewrite.length) + val (changeFiles, rewrittenFiles) = rewrittenActions + .partition(_.isInstanceOf[AddCDCFile]) + numAddedFiles = rewrittenFiles.size + val removedFiles = filesToRewrite.map(f => + getTouchedFile(deltaLog.dataPath, f, nameToAddFileMap)) + val (removedBytes, removedPartitions) = + totalBytesAndDistinctPartitionValues(removedFiles) + numBytesRemoved = removedBytes + val (rewrittenBytes, rewrittenPartitions) = + totalBytesAndDistinctPartitionValues(rewrittenFiles) + numBytesAdded = rewrittenBytes + if (txn.metadata.partitionColumns.nonEmpty) { + numPartitionsRemovedFrom = Some(removedPartitions) + numPartitionsAddedTo = Some(rewrittenPartitions) + } + numAddedChangeFiles = changeFiles.size + changeFileBytes = changeFiles.collect { case f: AddCDCFile => f.size }.sum + rewriteTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - scanTimeMs + numDeletedRows = Some(metrics("numDeletedRows").value) + numCopiedRows = Some(metrics("numTouchedRows").value - metrics("numDeletedRows").value) + + val operationTimestamp = System.currentTimeMillis() + removeFilesFromPaths(deltaLog, nameToAddFileMap, filesToRewrite, + operationTimestamp) ++ rewrittenActions + } + } + } + metrics("numRemovedFiles").set(numRemovedFiles) + metrics("numAddedFiles").set(numAddedFiles) + val executionTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 + metrics("executionTimeMs").set(executionTimeMs) + metrics("scanTimeMs").set(scanTimeMs) + metrics("rewriteTimeMs").set(rewriteTimeMs) + metrics("numAddedChangeFiles").set(numAddedChangeFiles) + metrics("changeFileBytes").set(changeFileBytes) + metrics("numAddedBytes").set(numBytesAdded) + metrics("numRemovedBytes").set(numBytesRemoved) + metrics("numFilesBeforeSkipping").set(numFilesBeforeSkipping) + metrics("numBytesBeforeSkipping").set(numBytesBeforeSkipping) + metrics("numFilesAfterSkipping").set(numFilesAfterSkipping) + metrics("numBytesAfterSkipping").set(numBytesAfterSkipping) + numPartitionsAfterSkipping.foreach(metrics("numPartitionsAfterSkipping").set) + numPartitionsAddedTo.foreach(metrics("numPartitionsAddedTo").set) + numPartitionsRemovedFrom.foreach(metrics("numPartitionsRemovedFrom").set) + numCopiedRows.foreach(metrics("numCopiedRows").set) + metrics("numDeletionVectorsAdded").set(0) + metrics("numDeletionVectorsRemoved").set(0) + metrics("numDeletionVectorsUpdated").set(0) + txn.registerSQLMetrics(sparkSession, metrics) + // This is needed to make the SQL metrics visible in the Spark UI + val executionId = sparkSession.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + SQLMetrics.postDriverMetricUpdates( + sparkSession.sparkContext, executionId, metrics.values.toSeq) + + recordDeltaEvent( + deltaLog, + "delta.dml.delete.stats", + data = DeleteMetric( + condition = condition.map(_.sql).getOrElse("true"), + numFilesTotal, + numFilesAfterSkipping, + numAddedFiles, + numRemovedFiles, + numAddedFiles, + numAddedChangeFiles = numAddedChangeFiles, + numFilesBeforeSkipping, + numBytesBeforeSkipping, + numFilesAfterSkipping, + numBytesAfterSkipping, + numPartitionsAfterSkipping, + numPartitionsAddedTo, + numPartitionsRemovedFrom, + numCopiedRows, + numDeletedRows, + numBytesAdded, + numBytesRemoved, + changeFileBytes = changeFileBytes, + scanTimeMs, + rewriteTimeMs, + 
// We don't support deletion vectors + numDeletionVectorsAdded = 0, + numDeletionVectorsRemoved = 0, + numDeletionVectorsUpdated = 0) + + ) + + DMLUtils.TaggedCommitData(deleteActions) + .withTag(PreservedRowTrackingTag, RowTracking.isEnabled(txn.protocol, txn.metadata)) + .withTag(NoRowsCopiedTag, metrics("numCopiedRows").value == 0) + } + + /** + * Returns the list of `AddFile`s and `AddCDCFile`s that have been re-written. + */ + private def rewriteFiles( + txn: OptimisticTransaction, + baseData: DataFrame, + filterCondition: Expression, + numFilesToRewrite: Long): Seq[FileAction] = { + val shouldWriteCdc = DeltaConfigs.CHANGE_DATA_FEED.fromMetaData(txn.metadata) + + // number of total rows that we have seen / are either copying or deleting (sum of both). + val numTouchedRows = metrics("numTouchedRows") + val numTouchedRowsUdf = DeltaUDF.boolean { + new GpuDeltaMetricUpdateUDF(numTouchedRows) + }.asNondeterministic() + + withStatusCode( + "DELTA", rewritingFilesMsg(numFilesToRewrite)) { + val dfToWrite = if (shouldWriteCdc) { + import com.databricks.sql.transaction.tahoe.commands.cdc.CDCReader._ + // The logic here ends up being surprisingly elegant, with all source rows ending up in + // the output. Recall that we flipped the user-provided delete condition earlier, before the + // call to `rewriteFiles`. All rows which match this latest `filterCondition` are retained + // as table data, while all rows which don't match are removed from the rewritten table data + // but do get included in the output as CDC events. + baseData + .filter(numTouchedRowsUdf()) + .withColumn( + CDC_TYPE_COLUMN_NAME, + new Column(If(filterCondition, CDC_TYPE_NOT_CDC, CDC_TYPE_DELETE)) + ) + } else { + baseData + .filter(numTouchedRowsUdf()) + .filter(new Column(filterCondition)) + } + + txn.writeFiles(dfToWrite) + } + } +} + +object GpuDeleteCommand { + val FINDING_TOUCHED_FILES_MSG: String = "Finding files to rewrite for DELETE operation" + + def rewritingFilesMsg(numFilesToRewrite: Long): String = + s"Rewriting $numFilesToRewrite files for DELETE operation" +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeltaCatalog.scala b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeltaCatalog.scala new file mode 100644 index 00000000000..bd1260857ed --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDeltaCatalog.scala @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * This file was derived from DeltaDataSource.scala in the + * Delta Lake project at https://github.com/delta-io/delta. + * + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.databricks.sql.transaction.tahoe.rapids + +import java.util + +import com.databricks.sql.transaction.tahoe.{DeltaConfigs, DeltaErrors} +import com.databricks.sql.transaction.tahoe.commands.TableCreationModes +import com.databricks.sql.transaction.tahoe.metering.DeltaLogging +import com.databricks.sql.transaction.tahoe.sources.DeltaSourceUtils +import com.nvidia.spark.rapids.RapidsConf + +import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connector.catalog.{Identifier, StagedTable, StagingTableCatalog, Table} +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.execution.command.LeafRunnableCommand +import org.apache.spark.sql.execution.datasources.PartitioningUtils +import org.apache.spark.sql.types.StructType + +class GpuDeltaCatalog( + override val cpuCatalog: StagingTableCatalog, + override val rapidsConf: RapidsConf) + extends GpuDeltaCatalogBase with SupportsPathIdentifier with DeltaLogging { + + override protected def buildGpuCreateDeltaTableCommand( + rapidsConf: RapidsConf, + table: CatalogTable, + existingTableOpt: Option[CatalogTable], + mode: SaveMode, + query: Option[LogicalPlan], + operation: TableCreationModes.CreationMode, + tableByPath: Boolean): LeafRunnableCommand = { + GpuCreateDeltaTableCommand( + table, + existingTableOpt, + mode, + query, + operation, + tableByPath = tableByPath + )(rapidsConf) + } + + override protected def getExistingTableIfExists(table: TableIdentifier): Option[CatalogTable] = { + // If this is a path identifier, we cannot return an existing CatalogTable. The Create command + // will check the file system itself + if (isPathIdentifier(table)) return None + val tableExists = catalog.tableExists(table) + if (tableExists) { + val oldTable = catalog.getTableMetadata(table) + if (oldTable.tableType == CatalogTableType.VIEW) { + throw new AnalysisException( + s"$table is a view. You may not write data into a view.") + } + if (!DeltaSourceUtils.isDeltaTable(oldTable.provider)) { + throw new AnalysisException(s"$table is not a Delta table. 
Please drop this " + + "table first if you would like to recreate it with Delta Lake.") + } + Some(oldTable) + } else { + None + } + } + + override protected def verifyTableAndSolidify( + tableDesc: CatalogTable, + query: Option[LogicalPlan]): CatalogTable = { + + if (tableDesc.bucketSpec.isDefined) { + throw DeltaErrors.operationNotSupportedException("Bucketing", tableDesc.identifier) + } + + val schema = query.map { plan => + assert(tableDesc.schema.isEmpty, "Can't specify table schema in CTAS.") + plan.schema.asNullable + }.getOrElse(tableDesc.schema) + + PartitioningUtils.validatePartitionColumn( + schema, + tableDesc.partitionColumnNames, + caseSensitive = false) // Delta is case insensitive + + val validatedConfigurations = DeltaConfigs.validateConfigurations(tableDesc.properties) + + val db = tableDesc.identifier.database.getOrElse(catalog.getCurrentDatabase) + val tableIdentWithDB = tableDesc.identifier.copy(database = Some(db)) + tableDesc.copy( + identifier = tableIdentWithDB, + schema = schema, + properties = validatedConfigurations) + } + + override protected def createGpuStagedDeltaTableV2( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String], + operation: TableCreationModes.CreationMode): StagedTable = { + new GpuStagedDeltaTableV2WithLogging(ident, schema, partitions, properties, operation) + } + + override def loadTable(ident: Identifier, timestamp: Long): Table = { + cpuCatalog.loadTable(ident, timestamp) + } + + override def loadTable(ident: Identifier, version: String): Table = { + cpuCatalog.loadTable(ident, version) + } + + /** + * Creates a Delta table using GPU for writing the data + * + * @param ident The identifier of the table + * @param schema The schema of the table + * @param partitions The partition transforms for the table + * @param allTableProperties The table properties that configure the behavior of the table or + * provide information about the table + * @param writeOptions Options specific to the write during table creation or replacement + * @param sourceQuery A query if this CREATE request came from a CTAS or RTAS + * @param operation The specific table creation mode, whether this is a + * Create/Replace/Create or Replace + */ + override def createDeltaTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + allTableProperties: util.Map[String, String], + writeOptions: Map[String, String], + sourceQuery: Option[DataFrame], + operation: TableCreationModes.CreationMode + ): Table = recordFrameProfile( + "DeltaCatalog", "createDeltaTable") { + super.createDeltaTable( + ident, + schema, + partitions, + allTableProperties, + writeOptions, + sourceQuery, + operation) + } + + override def createTable( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): Table = + recordFrameProfile("DeltaCatalog", "createTable") { + super.createTable(ident, schema, partitions, properties) + } + + override def stageReplace( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = + recordFrameProfile("DeltaCatalog", "stageReplace") { + super.stageReplace(ident, schema, partitions, properties) + } + + override def stageCreateOrReplace( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = + recordFrameProfile("DeltaCatalog", "stageCreateOrReplace") { + 
super.stageCreateOrReplace(ident, schema, partitions, properties) + } + + override def stageCreate( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String]): StagedTable = + recordFrameProfile("DeltaCatalog", "stageCreate") { + super.stageCreate(ident, schema, partitions, properties) + } + + /** + * A staged Delta table, which creates a HiveMetaStore entry and appends data if this was a + * CTAS/RTAS command. We have a ugly way of using this API right now, but it's the best way to + * maintain old behavior compatibility between Databricks Runtime and OSS Delta Lake. + */ + protected class GpuStagedDeltaTableV2WithLogging( + ident: Identifier, + schema: StructType, + partitions: Array[Transform], + properties: util.Map[String, String], + operation: TableCreationModes.CreationMode) + extends GpuStagedDeltaTableV2(ident, schema, partitions, properties, operation) { + + override def commitStagedChanges(): Unit = recordFrameProfile( + "DeltaCatalog", "commitStagedChanges") { + super.commitStagedChanges() + } + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDoAutoCompaction.scala b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDoAutoCompaction.scala new file mode 100644 index 00000000000..6b7b24bbefb --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuDoAutoCompaction.scala @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * This file was derived from DoAutoCompaction.scala + * from https://github.com/delta-io/delta/pull/1156 + * in the Delta Lake project at https://github.com/delta-io/delta. + * + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.databricks.sql.transaction.tahoe.rapids + +import com.databricks.sql.transaction.tahoe._ +import com.databricks.sql.transaction.tahoe.actions.Action +import com.databricks.sql.transaction.tahoe.hooks.PostCommitHook +import com.databricks.sql.transaction.tahoe.metering.DeltaLogging + +import org.apache.spark.sql.SparkSession + +object GpuDoAutoCompaction extends PostCommitHook + with DeltaLogging + with Serializable { + override val name: String = "Triggers compaction if necessary" + + override def run(spark: SparkSession, + txn: OptimisticTransactionImpl, + committedVersion: Long, + postCommitSnapshot: Snapshot, + committedActions: Seq[Action]): Unit = { + val gpuTxn = txn.asInstanceOf[GpuOptimisticTransaction] + val newTxn = new GpuDeltaLog(gpuTxn.deltaLog, gpuTxn.rapidsConf).startTransaction() + // Note: The Databricks AutoCompact PostCommitHook cannot be used here + // (with a GpuOptimisticTransaction). It appears that AutoCompact creates a new transaction, + // thereby circumventing GpuOptimisticTransaction (which intercepts Parquet writes + // to go through the GPU). 
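Editorial sketch (not part of this change): GpuDoAutoCompaction is registered as a post-commit hook, that is, a callback the transaction invokes after a successful commit, with handleError translating failures into a post-commit-hook exception. The plain-Scala snippet below models that pattern in miniature; the trait and class names are hypothetical.

object PostCommitHookSketch {
  trait PostCommitHook {
    def name: String
    def run(committedVersion: Long): Unit
    def handleError(error: Throwable, version: Long): Unit =
      throw new RuntimeException(s"post-commit hook '$name' failed at version $version", error)
  }

  final class CompactIfNeeded(smallFileCount: () => Int, threshold: Int) extends PostCommitHook {
    override val name: String = "Triggers compaction if necessary"
    override def run(committedVersion: Long): Unit =
      if (smallFileCount() >= threshold) println(s"compacting after commit $committedVersion")
  }

  def commitWithHooks(version: Long, hooks: Seq[PostCommitHook]): Unit =
    hooks.foreach { h =>
      try h.run(version) catch { case e: Throwable => h.handleError(e, version) }
    }

  def main(args: Array[String]): Unit =
    commitWithHooks(42L, Seq(new CompactIfNeeded(() => 12, threshold = 10)))
}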
+ new GpuOptimizeExecutor(spark, newTxn, Seq.empty, Seq.empty, committedActions).optimize() + } + + override def handleError(error: Throwable, version: Long): Unit = + throw DeltaErrors.postCommitHookFailedException(this, version, name, error) +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuLowShuffleMergeCommand.scala b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuLowShuffleMergeCommand.scala new file mode 100644 index 00000000000..fddebda33bd --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuLowShuffleMergeCommand.scala @@ -0,0 +1,1083 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * This file was derived from MergeIntoCommand.scala + * in the Delta Lake project at https://github.com/delta-io/delta. + * + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.databricks.sql.transaction.tahoe.rapids + +import java.net.URI +import java.util.concurrent.TimeUnit + +import scala.collection.mutable + +import com.databricks.sql.io.RowIndexFilterType +import com.databricks.sql.transaction.tahoe._ +import com.databricks.sql.transaction.tahoe.DeltaOperations.MergePredicate +import com.databricks.sql.transaction.tahoe.DeltaParquetFileFormat.DeletionVectorDescriptorWithFilterType +import com.databricks.sql.transaction.tahoe.actions.{AddCDCFile, AddFile, DeletionVectorDescriptor, FileAction} +import com.databricks.sql.transaction.tahoe.commands.DeltaCommand +import com.databricks.sql.transaction.tahoe.rapids.MergeExecutor.{toDeletionVector, totalBytesAndDistinctPartitionValues, FILE_PATH_COL, INCR_METRICS_COL, INCR_METRICS_FIELD, ROW_DROPPED_COL, ROW_DROPPED_FIELD, SOURCE_ROW_PRESENT_COL, SOURCE_ROW_PRESENT_FIELD, TARGET_ROW_PRESENT_COL, TARGET_ROW_PRESENT_FIELD} +import com.databricks.sql.transaction.tahoe.schema.ImplicitMetadataOperation +import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf +import com.databricks.sql.transaction.tahoe.util.{AnalysisHelper, DeltaFileOperations} +import com.nvidia.spark.rapids.{GpuOverrides, RapidsConf, SparkPlanMeta} +import com.nvidia.spark.rapids.RapidsConf.DELTA_LOW_SHUFFLE_MERGE_DEL_VECTOR_BROADCAST_THRESHOLD +import com.nvidia.spark.rapids.delta._ +import com.nvidia.spark.rapids.delta.GpuDeltaParquetFileFormatUtils.{METADATA_ROW_DEL_COL, METADATA_ROW_DEL_FIELD, METADATA_ROW_IDX_COL, METADATA_ROW_IDX_FIELD} +import com.nvidia.spark.rapids.shims.FileSourceScanExecMeta +import org.roaringbitmap.longlong.Roaring64Bitmap + +import org.apache.spark.SparkContext +import org.apache.spark.internal.Logging +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.expressions.{Alias, And, Attribute, AttributeReference, CaseWhen, Expression, Literal, NamedExpression, PredicateHelper} +import 
org.apache.spark.sql.catalyst.expressions.Literal.TrueLiteral
+import org.apache.spark.sql.catalyst.plans.logical.{DeltaMergeAction, DeltaMergeIntoClause, DeltaMergeIntoMatchedClause, DeltaMergeIntoMatchedDeleteClause, DeltaMergeIntoMatchedUpdateClause, DeltaMergeIntoNotMatchedBySourceClause, DeltaMergeIntoNotMatchedBySourceDeleteClause, DeltaMergeIntoNotMatchedBySourceUpdateClause, DeltaMergeIntoNotMatchedClause, DeltaMergeIntoNotMatchedInsertClause, LogicalPlan, Project}
+import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
+import org.apache.spark.sql.execution.{SparkPlan, SQLExecution}
+import org.apache.spark.sql.execution.command.LeafRunnableCommand
+import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation}
+import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.{BooleanType, LongType, StringType, StructField, StructType}
+
+/**
+ * GPU version of Delta Lake's low shuffle merge implementation.
+ *
+ * Performs a merge of a source query/table into a Delta table.
+ *
+ * Issues an error message when the ON search_condition of the MERGE statement can match
+ * a single row from the target table with multiple rows of the source table-reference.
+ * Unlike the original implementation, it optimizes the write of touched but unmodified
+ * target files.
+ *
+ * Algorithm:
+ *
+ * Phase 1: Find the input files in target that are touched by the rows that satisfy
+ *    the condition and verify that no two source rows match with the same target row.
+ *    This is implemented as an inner-join using the given condition. See [[findTouchedFiles]]
+ *    for more details.
+ *
+ * Phase 2: Read the touched files again and write new files with updated and/or inserted rows
+ * without copying unmodified rows.
+ *
+ * Phase 3: Read the touched files again and write new files with unmodified rows in the target
+ * table, trying to keep their original order and avoid shuffles as much as possible.
+ *
+ * Phase 4: Use the Delta protocol to atomically remove the touched files and add the new files.
+ *
+ * @param source Source data to merge from
+ * @param target Target table to merge into
+ * @param gpuDeltaLog Delta log to use
+ * @param condition Condition for a source row to match with a target row
+ * @param matchedClauses All info related to matched clauses.
+ * @param notMatchedClauses All info related to not matched clauses.
+ * @param notMatchedBySourceClauses All info related to not matched by source clauses.
+ * @param migratedSchema The final schema of the target - may be changed by schema evolution.
+ */ +case class GpuLowShuffleMergeCommand( + @transient source: LogicalPlan, + @transient target: LogicalPlan, + @transient gpuDeltaLog: GpuDeltaLog, + condition: Expression, + matchedClauses: Seq[DeltaMergeIntoMatchedClause], + notMatchedClauses: Seq[DeltaMergeIntoNotMatchedClause], + notMatchedBySourceClauses: Seq[DeltaMergeIntoNotMatchedBySourceClause], + migratedSchema: Option[StructType])( + @transient val rapidsConf: RapidsConf) + extends LeafRunnableCommand + with DeltaCommand with PredicateHelper with AnalysisHelper with ImplicitMetadataOperation { + + import SQLMetrics._ + + override val otherCopyArgs: Seq[AnyRef] = Seq(rapidsConf) + + override val canMergeSchema: Boolean = conf.getConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE) + override val canOverwriteSchema: Boolean = false + + override val output: Seq[Attribute] = Seq( + AttributeReference("num_affected_rows", LongType)(), + AttributeReference("num_updated_rows", LongType)(), + AttributeReference("num_deleted_rows", LongType)(), + AttributeReference("num_inserted_rows", LongType)()) + + @transient private lazy val sc: SparkContext = SparkContext.getOrCreate() + @transient lazy val targetDeltaLog: DeltaLog = gpuDeltaLog.deltaLog + + override lazy val metrics = Map[String, SQLMetric]( + "numSourceRows" -> createMetric(sc, "number of source rows"), + "numSourceRowsInSecondScan" -> + createMetric(sc, "number of source rows (during repeated scan)"), + "numTargetRowsCopied" -> createMetric(sc, "number of target rows rewritten unmodified"), + "numTargetRowsInserted" -> createMetric(sc, "number of inserted rows"), + "numTargetRowsUpdated" -> createMetric(sc, "number of updated rows"), + "numTargetRowsDeleted" -> createMetric(sc, "number of deleted rows"), + "numTargetRowsMatchedUpdated" -> createMetric(sc, "number of target rows updated when matched"), + "numTargetRowsMatchedDeleted" -> createMetric(sc, "number of target rows deleted when matched"), + "numTargetRowsNotMatchedBySourceUpdated" -> createMetric(sc, + "number of target rows updated when not matched by source"), + "numTargetRowsNotMatchedBySourceDeleted" -> createMetric(sc, + "number of target rows deleted when not matched by source"), + "numTargetFilesBeforeSkipping" -> createMetric(sc, "number of target files before skipping"), + "numTargetFilesAfterSkipping" -> createMetric(sc, "number of target files after skipping"), + "numTargetFilesRemoved" -> createMetric(sc, "number of files removed to target"), + "numTargetFilesAdded" -> createMetric(sc, "number of files added to target"), + "numTargetChangeFilesAdded" -> + createMetric(sc, "number of change data capture files generated"), + "numTargetChangeFileBytes" -> + createMetric(sc, "total size of change data capture files generated"), + "numTargetBytesBeforeSkipping" -> createMetric(sc, "number of target bytes before skipping"), + "numTargetBytesAfterSkipping" -> createMetric(sc, "number of target bytes after skipping"), + "numTargetBytesRemoved" -> createMetric(sc, "number of target bytes removed"), + "numTargetBytesAdded" -> createMetric(sc, "number of target bytes added"), + "numTargetPartitionsAfterSkipping" -> + createMetric(sc, "number of target partitions after skipping"), + "numTargetPartitionsRemovedFrom" -> + createMetric(sc, "number of target partitions from which files were removed"), + "numTargetPartitionsAddedTo" -> + createMetric(sc, "number of target partitions to which files were added"), + "executionTimeMs" -> + createMetric(sc, "time taken to execute the entire operation"), + "scanTimeMs" -> + 
createMetric(sc, "time taken to scan the files for matches"), + "rewriteTimeMs" -> + createMetric(sc, "time taken to rewrite the matched files")) + + /** Whether this merge statement has only a single insert (NOT MATCHED) clause. */ + protected def isSingleInsertOnly: Boolean = matchedClauses.isEmpty && + notMatchedClauses.length == 1 + + override def run(spark: SparkSession): Seq[Row] = { + recordDeltaOperation(targetDeltaLog, "delta.dml.lowshufflemerge") { + val startTime = System.nanoTime() + val result = gpuDeltaLog.withNewTransaction { deltaTxn => + if (target.schema.size != deltaTxn.metadata.schema.size) { + throw DeltaErrors.schemaChangedSinceAnalysis( + atAnalysis = target.schema, latestSchema = deltaTxn.metadata.schema) + } + + if (canMergeSchema) { + updateMetadata( + spark, deltaTxn, migratedSchema.getOrElse(target.schema), + deltaTxn.metadata.partitionColumns, deltaTxn.metadata.configuration, + isOverwriteMode = false, rearrangeOnly = false) + } + + + val (executor, fallback) = { + val context = MergeExecutorContext(this, spark, deltaTxn, rapidsConf) + if (isSingleInsertOnly && spark.conf.get(DeltaSQLConf.MERGE_INSERT_ONLY_ENABLED)) { + (new InsertOnlyMergeExecutor(context), false) + } else { + val executor = new LowShuffleMergeExecutor(context) + (executor, executor.shouldFallback()) + } + } + + if (fallback) { + None + } else { + Some(runLowShuffleMerge(spark, startTime, deltaTxn, executor)) + } + } + + result match { + case Some(row) => row + case None => + // We should rollback to normal gpu + new GpuMergeIntoCommand(source, target, gpuDeltaLog, condition, matchedClauses, + notMatchedClauses, notMatchedBySourceClauses, migratedSchema)(rapidsConf) + .run(spark) + } + } + } + + + private def runLowShuffleMerge( + spark: SparkSession, + startTime: Long, + deltaTxn: GpuOptimisticTransactionBase, + mergeExecutor: MergeExecutor): Seq[Row] = { + val deltaActions = mergeExecutor.execute() + // Metrics should be recorded before commit (where they are written to delta logs). + metrics("executionTimeMs").set((System.nanoTime() - startTime) / 1000 / 1000) + deltaTxn.registerSQLMetrics(spark, metrics) + + // This is a best-effort sanity check. + if (metrics("numSourceRowsInSecondScan").value >= 0 && + metrics("numSourceRows").value != metrics("numSourceRowsInSecondScan").value) { + log.warn(s"Merge source has ${metrics("numSourceRows").value} rows in initial scan but " + + s"${metrics("numSourceRowsInSecondScan").value} rows in second scan") + if (conf.getConf(DeltaSQLConf.MERGE_FAIL_IF_SOURCE_CHANGED)) { + throw DeltaErrors.sourceNotDeterministicInMergeException(spark) + } + } + + deltaTxn.commit( + deltaActions, + DeltaOperations.Merge( + Option(condition), + matchedClauses.map(DeltaOperations.MergePredicate(_)), + notMatchedClauses.map(DeltaOperations.MergePredicate(_)), + // We do not support notMatchedBySourcePredicates yet and fall back to CPU + // See https://github.com/NVIDIA/spark-rapids/issues/8415 + notMatchedBySourcePredicates = Seq.empty[MergePredicate] + )) + + // Record metrics + val stats = GpuMergeStats.fromMergeSQLMetrics( + metrics, + condition, + matchedClauses, + notMatchedClauses, + deltaTxn.metadata.partitionColumns.nonEmpty) + recordDeltaEvent(targetDeltaLog, "delta.dml.merge.stats", data = stats) + + + spark.sharedState.cacheManager.recacheByPlan(spark, target) + + // This is needed to make the SQL metrics visible in the Spark UI. Also this needs + // to be outside the recordMergeOperation because this method will update some metric. 
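Editorial sketch (not part of this change): the command ultimately reports a single result row in which num_affected_rows is simply the sum of the update, delete and insert counters, as assembled a few lines below. The bookkeeping in isolation, using a hypothetical MergeCounters class:

object MergeResultRowSketch {
  final case class MergeCounters(updated: Long, deleted: Long, inserted: Long) {
    def numAffectedRows: Long = updated + deleted + inserted
  }

  def main(args: Array[String]): Unit = {
    val c = MergeCounters(updated = 10L, deleted = 3L, inserted = 5L)
    // Same shape as the returned row: (num_affected, num_updated, num_deleted, num_inserted).
    println((c.numAffectedRows, c.updated, c.deleted, c.inserted)) // (18,10,3,5)
  }
}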
+ val executionId = spark.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + SQLMetrics.postDriverMetricUpdates(spark.sparkContext, executionId, metrics.values.toSeq) + Seq(Row(metrics("numTargetRowsUpdated").value + metrics("numTargetRowsDeleted").value + + metrics("numTargetRowsInserted").value, metrics("numTargetRowsUpdated").value, + metrics("numTargetRowsDeleted").value, metrics("numTargetRowsInserted").value)) + } + + /** + * Execute the given `thunk` and return its result while recording the time taken to do it. + * + * @param sqlMetricName name of SQL metric to update with the time taken by the thunk + * @param thunk the code to execute + */ + def recordMergeOperation[A](sqlMetricName: String)(thunk: => A): A = { + val startTimeNs = System.nanoTime() + val r = thunk + val timeTakenMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTimeNs) + if (sqlMetricName != null && timeTakenMs > 0) { + metrics(sqlMetricName) += timeTakenMs + } + r + } + + /** Expressions to increment SQL metrics */ + def makeMetricUpdateUDF(name: String, deterministic: Boolean = false) + : Expression = { + // only capture the needed metric in a local variable + val metric = metrics(name) + var u = DeltaUDF.boolean(new GpuDeltaMetricUpdateUDF(metric)) + if (!deterministic) { + u = u.asNondeterministic() + } + u.apply().expr + } +} + +/** + * Context merge execution. + */ +case class MergeExecutorContext(cmd: GpuLowShuffleMergeCommand, + spark: SparkSession, + deltaTxn: OptimisticTransaction, + rapidsConf: RapidsConf) + +trait MergeExecutor extends AnalysisHelper with PredicateHelper with Logging { + + val context: MergeExecutorContext + + + /** + * Map to get target output attributes by name. + * The case sensitivity of the map is set accordingly to Spark configuration. + */ + @transient private lazy val targetOutputAttributesMap: Map[String, Attribute] = { + val attrMap: Map[String, Attribute] = context.cmd.target + .outputSet.view + .map(attr => attr.name -> attr).toMap + if (context.cmd.conf.caseSensitiveAnalysis) { + attrMap + } else { + CaseInsensitiveMap(attrMap) + } + } + + def execute(): Seq[FileAction] + + protected def targetOutputCols: Seq[NamedExpression] = { + context.deltaTxn.metadata.schema.map { col => + targetOutputAttributesMap + .get(col.name) + .map { a => + AttributeReference(col.name, col.dataType, col.nullable)(a.exprId) + } + .getOrElse(Alias(Literal(null), col.name)()) + } + } + + /** + * Build a DataFrame using the given `files` that has the same output columns (exprIds) + * as the `target` logical plan, so that existing update/insert expressions can be applied + * on this new plan. + */ + protected def buildTargetDFWithFiles(files: Seq[AddFile]): DataFrame = { + val targetOutputColsMap = { + val colsMap: Map[String, NamedExpression] = targetOutputCols.view + .map(col => col.name -> col).toMap + if (context.cmd.conf.caseSensitiveAnalysis) { + colsMap + } else { + CaseInsensitiveMap(colsMap) + } + } + + val plan = { + // We have to do surgery to use the attributes from `targetOutputCols` to scan the table. + // In cases of schema evolution, they may not be the same type as the original attributes. + val original = + context.deltaTxn.deltaLog.createDataFrame(context.deltaTxn.snapshot, files) + .queryExecution + .analyzed + val transformed = original.transform { + case LogicalRelation(base, _, catalogTbl, isStreaming) => + LogicalRelation( + base, + // We can ignore the new columns which aren't yet AttributeReferences. 
+ targetOutputCols.collect { case a: AttributeReference => a }, + catalogTbl, + isStreaming) + } + + // In case of schema evolution & column mapping, we would also need to rebuild the file + // format because under column mapping, the reference schema within DeltaParquetFileFormat + // that is used to populate metadata needs to be updated + if (context.deltaTxn.metadata.columnMappingMode != NoMapping) { + val updatedFileFormat = context.deltaTxn.deltaLog.fileFormat( + context.deltaTxn.deltaLog.unsafeVolatileSnapshot.protocol, context.deltaTxn.metadata) + DeltaTableUtils.replaceFileFormat(transformed, updatedFileFormat) + } else { + transformed + } + } + + // For each plan output column, find the corresponding target output column (by name) and + // create an alias + val aliases = plan.output.map { + case newAttrib: AttributeReference => + val existingTargetAttrib = targetOutputColsMap.getOrElse(newAttrib.name, + throw new AnalysisException( + s"Could not find ${newAttrib.name} among the existing target output " + + targetOutputCols.mkString(","))).asInstanceOf[AttributeReference] + + if (existingTargetAttrib.exprId == newAttrib.exprId) { + // It's not valid to alias an expression to its own exprId (this is considered a + // non-unique exprId by the analyzer), so we just use the attribute directly. + newAttrib + } else { + Alias(newAttrib, existingTargetAttrib.name)(exprId = existingTargetAttrib.exprId) + } + } + + Dataset.ofRows(context.spark, Project(aliases, plan)) + } + + + /** + * Repartitions the output DataFrame by the partition columns if table is partitioned + * and `merge.repartitionBeforeWrite.enabled` is set to true. + */ + protected def repartitionIfNeeded(df: DataFrame): DataFrame = { + val partitionColumns = context.deltaTxn.metadata.partitionColumns + // TODO: We should remove this method and use optimized write instead, see + // https://github.com/NVIDIA/spark-rapids/issues/10417 + if (partitionColumns.nonEmpty && context.spark.conf.get(DeltaSQLConf + .MERGE_REPARTITION_BEFORE_WRITE)) { + df.repartition(partitionColumns.map(col): _*) + } else { + df + } + } + + protected def sourceDF: DataFrame = { + // UDF to increment metrics + val incrSourceRowCountExpr = context.cmd.makeMetricUpdateUDF("numSourceRows") + Dataset.ofRows(context.spark, context.cmd.source) + .filter(new Column(incrSourceRowCountExpr)) + } + + /** Whether this merge statement has no insert (NOT MATCHED) clause. */ + protected def hasNoInserts: Boolean = context.cmd.notMatchedClauses.isEmpty + + +} + +/** + * This is an optimization of the case when there is no update clause for the merge. + * We perform an left anti join on the source data to find the rows to be inserted. + * + * This will currently only optimize for the case when there is a _single_ notMatchedClause. 
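Editorial sketch (not part of this change): an insert-only merge reduces to a left anti join, because the source rows that find no match in the target under the merge condition are exactly the rows to insert. The self-contained snippet below shows that join on toy DataFrames; the column names and join key are hypothetical.

import org.apache.spark.sql.SparkSession

object InsertOnlyMergeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("insert-only-merge").getOrCreate()
    import spark.implicits._

    val target = Seq((1, "a"), (2, "b")).toDF("id", "value")
    val source = Seq((2, "b-updated"), (3, "c")).toDF("id", "value")

    // Source rows with no match in the target are exactly the rows to insert.
    val toInsert = source.join(target, Seq("id"), "left_anti")
    toInsert.show() // only (3, "c"); (2, "b-updated") matched an existing target row and is dropped
    spark.stop()
  }
}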
+ */ +class InsertOnlyMergeExecutor(override val context: MergeExecutorContext) extends MergeExecutor { + override def execute(): Seq[FileAction] = { + context.cmd.recordMergeOperation(sqlMetricName = "rewriteTimeMs") { + + // UDFs to update metrics + val incrSourceRowCountExpr = context.cmd.makeMetricUpdateUDF("numSourceRows") + val incrInsertedCountExpr = context.cmd.makeMetricUpdateUDF("numTargetRowsInserted") + + val outputColNames = targetOutputCols.map(_.name) + // we use head here since we know there is only a single notMatchedClause + val outputExprs = context.cmd.notMatchedClauses.head.resolvedActions.map(_.expr) + val outputCols = outputExprs.zip(outputColNames).map { case (expr, name) => + new Column(Alias(expr, name)()) + } + + // source DataFrame + val sourceDF = Dataset.ofRows(context.spark, context.cmd.source) + .filter(new Column(incrSourceRowCountExpr)) + .filter(new Column(context.cmd.notMatchedClauses.head.condition + .getOrElse(Literal.TrueLiteral))) + + // Skip data based on the merge condition + val conjunctivePredicates = splitConjunctivePredicates(context.cmd.condition) + val targetOnlyPredicates = + conjunctivePredicates.filter(_.references.subsetOf(context.cmd.target.outputSet)) + val dataSkippedFiles = context.deltaTxn.filterFiles(targetOnlyPredicates) + + // target DataFrame + val targetDF = buildTargetDFWithFiles(dataSkippedFiles) + + val insertDf = sourceDF.join(targetDF, new Column(context.cmd.condition), "leftanti") + .select(outputCols: _*) + .filter(new Column(incrInsertedCountExpr)) + + val newFiles = context.deltaTxn + .writeFiles(repartitionIfNeeded(insertDf, + )) + + // Update metrics + context.cmd.metrics("numTargetFilesBeforeSkipping") += context.deltaTxn.snapshot.numOfFiles + context.cmd.metrics("numTargetBytesBeforeSkipping") += context.deltaTxn.snapshot.sizeInBytes + val (afterSkippingBytes, afterSkippingPartitions) = + totalBytesAndDistinctPartitionValues(dataSkippedFiles) + context.cmd.metrics("numTargetFilesAfterSkipping") += dataSkippedFiles.size + context.cmd.metrics("numTargetBytesAfterSkipping") += afterSkippingBytes + context.cmd.metrics("numTargetPartitionsAfterSkipping") += afterSkippingPartitions + context.cmd.metrics("numTargetFilesRemoved") += 0 + context.cmd.metrics("numTargetBytesRemoved") += 0 + context.cmd.metrics("numTargetPartitionsRemovedFrom") += 0 + val (addedBytes, addedPartitions) = totalBytesAndDistinctPartitionValues(newFiles) + context.cmd.metrics("numTargetFilesAdded") += newFiles.count(_.isInstanceOf[AddFile]) + context.cmd.metrics("numTargetBytesAdded") += addedBytes + context.cmd.metrics("numTargetPartitionsAddedTo") += addedPartitions + newFiles + } + } +} + + +/** + * This is an optimized algorithm for merge statement, where we avoid shuffling the unmodified + * target data. + * + * The algorithm is as follows: + * 1. Find touched target files in the target table by joining the source and target data, with + * collecting joined row identifiers as (`__metadata_file_path`, `__metadata_row_idx`) pairs. + * 2. Read the touched files again and write new files with updated and/or inserted rows + * without coping unmodified data from target table, but filtering target table with collected + * rows mentioned above. + * 3. Read the touched files again, filtering unmodified rows with collected row identifiers + * collected in first step, and saving them without shuffle. 
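Editorial sketch (not part of this change): the low shuffle executor needs, for every touched file, the set of row positions that the source join matched, and then routes each target row either through the merge-clause path or the copy-unmodified path. The snippet below models that routing with a Roaring64Bitmap per file (the same bitmap type the command imports); the file name and row indexes are hypothetical.

import org.roaringbitmap.longlong.Roaring64Bitmap

object LowShuffleRoutingSketch {
  def main(args: Array[String]): Unit = {
    // Row positions touched by the source join, per target file (hypothetical values).
    val touched: Map[String, Roaring64Bitmap] = {
      val bm = new Roaring64Bitmap()
      bm.addLong(3L)
      bm.addLong(17L)
      Map("part-00000.parquet" -> bm)
    }

    def isModified(file: String, rowIdx: Long): Boolean =
      touched.get(file).exists(_.contains(rowIdx))

    // Rows read back from touched files: matched rows get the merge clauses applied,
    // the rest are copied unmodified (and, ideally, without a shuffle).
    val rows = Seq(("part-00000.parquet", 3L), ("part-00000.parquet", 4L))
    rows.foreach { case (file, idx) =>
      val route = if (isModified(file, idx)) "apply merge clauses" else "copy unmodified"
      println(s"$file row $idx: $route")
    }
  }
}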
+ */ +class LowShuffleMergeExecutor(override val context: MergeExecutorContext) extends MergeExecutor { + + // We over-count numTargetRowsDeleted when there are multiple matches; + // this is the amount of the overcount, so we can subtract it to get a correct final metric. + private var multipleMatchDeleteOnlyOvercount: Option[Long] = None + + // UDFs to update metrics + private val incrSourceRowCountExpr: Expression = context.cmd. + makeMetricUpdateUDF("numSourceRowsInSecondScan") + private val incrUpdatedCountExpr: Expression = context.cmd + .makeMetricUpdateUDF("numTargetRowsUpdated") + private val incrUpdatedMatchedCountExpr: Expression = context.cmd + .makeMetricUpdateUDF("numTargetRowsMatchedUpdated") + private val incrUpdatedNotMatchedBySourceCountExpr: Expression = context.cmd + .makeMetricUpdateUDF("numTargetRowsNotMatchedBySourceUpdated") + private val incrInsertedCountExpr: Expression = context.cmd + .makeMetricUpdateUDF("numTargetRowsInserted") + private val incrDeletedCountExpr: Expression = context.cmd + .makeMetricUpdateUDF("numTargetRowsDeleted") + private val incrDeletedMatchedCountExpr: Expression = context.cmd + .makeMetricUpdateUDF("numTargetRowsMatchedDeleted") + private val incrDeletedNotMatchedBySourceCountExpr: Expression = context.cmd + .makeMetricUpdateUDF("numTargetRowsNotMatchedBySourceDeleted") + + private def updateOutput(resolvedActions: Seq[DeltaMergeAction], incrExpr: Expression) + : Seq[Expression] = { + resolvedActions.map(_.expr) :+ + Literal.FalseLiteral :+ + UnresolvedAttribute(TARGET_ROW_PRESENT_COL) :+ + UnresolvedAttribute(SOURCE_ROW_PRESENT_COL) :+ + incrExpr + } + + private def deleteOutput(incrExpr: Expression): Seq[Expression] = { + targetOutputCols :+ + TrueLiteral :+ + UnresolvedAttribute(TARGET_ROW_PRESENT_COL) :+ + UnresolvedAttribute(SOURCE_ROW_PRESENT_COL) :+ + incrExpr + } + + private def insertOutput(resolvedActions: Seq[DeltaMergeAction], incrExpr: Expression) + : Seq[Expression] = { + resolvedActions.map(_.expr) :+ + Literal.FalseLiteral :+ + UnresolvedAttribute(TARGET_ROW_PRESENT_COL) :+ + UnresolvedAttribute(SOURCE_ROW_PRESENT_COL) :+ + incrExpr + } + + private def clauseOutput(clause: DeltaMergeIntoClause): Seq[Expression] = clause match { + case u: DeltaMergeIntoMatchedUpdateClause => + updateOutput(u.resolvedActions, And(incrUpdatedCountExpr, incrUpdatedMatchedCountExpr)) + case _: DeltaMergeIntoMatchedDeleteClause => + deleteOutput(And(incrDeletedCountExpr, incrDeletedMatchedCountExpr)) + case i: DeltaMergeIntoNotMatchedInsertClause => + insertOutput(i.resolvedActions, incrInsertedCountExpr) + case u: DeltaMergeIntoNotMatchedBySourceUpdateClause => + updateOutput(u.resolvedActions, + And(incrUpdatedCountExpr, incrUpdatedNotMatchedBySourceCountExpr)) + case _: DeltaMergeIntoNotMatchedBySourceDeleteClause => + deleteOutput(And(incrDeletedCountExpr, incrDeletedNotMatchedBySourceCountExpr)) + } + + private def clauseCondition(clause: DeltaMergeIntoClause): Expression = { + // if condition is None, then expression always evaluates to true + clause.condition.getOrElse(TrueLiteral) + } + + /** + * Though low shuffle merge algorithm performs better than traditional merge algorithm in some + * cases, there are some case we should fallback to traditional merge executor: + * + * 1. Low shuffle merge algorithm requires generating metadata columns such as + * [[METADATA_ROW_IDX_COL]], [[METADATA_ROW_DEL_COL]], which only implemented on + * [[org.apache.spark.sql.rapids.GpuFileSourceScanExec]]. 
That means we need to fallback to + * this normal executor when [[org.apache.spark.sql.rapids.GpuFileSourceScanExec]] is disabled + * for some reason. + * 2. Low shuffle merge algorithm currently needs to broadcast deletion vector, which may + * introduce extra overhead. It maybe better to fallback to this algorithm when the changeset + * it too large. + */ + def shouldFallback(): Boolean = { + // Trying to detect if we can execute finding touched files. + val touchFilePlanOverrideSucceed = verifyGpuPlan(planForFindingTouchedFiles()) { planMeta => + def check(meta: SparkPlanMeta[SparkPlan]): Boolean = { + meta match { + case scan if scan.isInstanceOf[FileSourceScanExecMeta] => scan + .asInstanceOf[FileSourceScanExecMeta] + .wrapped + .schema + .fieldNames + .contains(METADATA_ROW_IDX_COL) && scan.canThisBeReplaced + case m => m.childPlans.exists(check) + } + } + + check(planMeta) + } + if (!touchFilePlanOverrideSucceed) { + logWarning("Unable to override file scan for low shuffle merge for finding touched files " + + "plan, fallback to tradition merge.") + return true + } + + // Trying to detect if we can execute the merge plan. + val mergePlanOverrideSucceed = verifyGpuPlan(planForMergeExecution(touchedFiles)) { planMeta => + var overrideCount = 0 + def count(meta: SparkPlanMeta[SparkPlan]): Unit = { + meta match { + case scan if scan.isInstanceOf[FileSourceScanExecMeta] => + if (scan.asInstanceOf[FileSourceScanExecMeta] + .wrapped.schema.fieldNames.contains(METADATA_ROW_DEL_COL) && scan.canThisBeReplaced) { + overrideCount += 1 + } + case m => m.childPlans.foreach(count) + } + } + + count(planMeta) + overrideCount == 2 + } + + if (!mergePlanOverrideSucceed) { + logWarning("Unable to override file scan for low shuffle merge for merge plan, fallback to " + + "tradition merge.") + return true + } + + val deletionVectorSize = touchedFiles.values.map(_._1.serializedSizeInBytes()).sum + val maxDelVectorSize = context.rapidsConf + .get(DELTA_LOW_SHUFFLE_MERGE_DEL_VECTOR_BROADCAST_THRESHOLD) + if (deletionVectorSize > maxDelVectorSize) { + logWarning( + s"""Low shuffle merge can't be executed because broadcast deletion vector count + |$deletionVectorSize is large than max value $maxDelVectorSize """.stripMargin) + return true + } + + false + } + + private def verifyGpuPlan(input: DataFrame)(checkPlanMeta: SparkPlanMeta[SparkPlan] => Boolean) + : Boolean = { + val overridePlan = GpuOverrides.wrapAndTagPlan(input.queryExecution.sparkPlan, + context.rapidsConf) + checkPlanMeta(overridePlan) + } + + override def execute(): Seq[FileAction] = { + val newFiles = context.cmd.withStatusCode("DELTA", + s"Rewriting ${touchedFiles.size} files and saving modified data") { + val df = planForMergeExecution(touchedFiles) + context.deltaTxn.writeFiles(df) + } + + // Update metrics + val (addedBytes, addedPartitions) = totalBytesAndDistinctPartitionValues(newFiles) + context.cmd.metrics("numTargetFilesAdded") += newFiles.count(_.isInstanceOf[AddFile]) + context.cmd.metrics("numTargetChangeFilesAdded") += newFiles.count(_.isInstanceOf[AddCDCFile]) + context.cmd.metrics("numTargetChangeFileBytes") += newFiles.collect { + case f: AddCDCFile => f.size + } + .sum + context.cmd.metrics("numTargetBytesAdded") += addedBytes + context.cmd.metrics("numTargetPartitionsAddedTo") += addedPartitions + + if (multipleMatchDeleteOnlyOvercount.isDefined) { + // Compensate for counting duplicates during the query. 
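Editorial sketch (not part of this change): when the only matched clause is an unconditional delete, a target row that matches several source rows is counted once per match, so the recorded overcount is subtracted from numTargetRowsDeleted afterwards, as done just below. The arithmetic in isolation, with hypothetical names:

object DeleteOvercountSketch {
  def correctedDeletes(countedDeletes: Long, multipleMatchOvercount: Option[Long]): Long =
    multipleMatchOvercount match {
      case Some(over) =>
        val actual = countedDeletes - over
        require(actual >= 0, "overcount cannot exceed the counted deletes")
        actual
      case None => countedDeletes
    }

  def main(args: Array[String]): Unit =
    println(correctedDeletes(countedDeletes = 7L, multipleMatchOvercount = Some(2L))) // prints 5
}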
+ val actualRowsDeleted = + context.cmd.metrics("numTargetRowsDeleted").value - multipleMatchDeleteOnlyOvercount.get + assert(actualRowsDeleted >= 0) + context.cmd.metrics("numTargetRowsDeleted").set(actualRowsDeleted) + } + + touchedFiles.values.map(_._2).map(_.remove).toSeq ++ newFiles + } + + private lazy val dataSkippedFiles: Seq[AddFile] = { + // Skip data based on the merge condition + val targetOnlyPredicates = splitConjunctivePredicates(context.cmd.condition) + .filter(_.references.subsetOf(context.cmd.target.outputSet)) + context.deltaTxn.filterFiles(targetOnlyPredicates) + } + + private lazy val dataSkippedTargetDF: DataFrame = { + addRowIndexMetaColumn(buildTargetDFWithFiles(dataSkippedFiles)) + } + + private lazy val touchedFiles: Map[String, (Roaring64Bitmap, AddFile)] = this.findTouchedFiles() + + private def planForFindingTouchedFiles(): DataFrame = { + + // Apply inner join to between source and target using the merge condition to find matches + // In addition, we attach two columns + // - METADATA_ROW_IDX column to identify target row in file + // - FILE_PATH_COL the target file name the row is from to later identify the files touched + // by matched rows + val targetDF = dataSkippedTargetDF.withColumn(FILE_PATH_COL, input_file_name()) + + sourceDF.join(targetDF, new Column(context.cmd.condition), "inner") + } + + private def planForMergeExecution(touchedFiles: Map[String, (Roaring64Bitmap, AddFile)]) + : DataFrame = { + getModifiedDF(touchedFiles).unionAll(getUnmodifiedDF(touchedFiles)) + } + + /** + * Find the target table files that contain the rows that satisfy the merge condition. This is + * implemented as an inner-join between the source query/table and the target table using + * the merge condition. + */ + private def findTouchedFiles(): Map[String, (Roaring64Bitmap, AddFile)] = + context.cmd.recordMergeOperation(sqlMetricName = "scanTimeMs") { + context.spark.udf.register("row_index_set", udaf(RoaringBitmapUDAF)) + // Process the matches from the inner join to record touched files and find multiple matches + val collectTouchedFiles = planForFindingTouchedFiles() + .select(col(FILE_PATH_COL), col(METADATA_ROW_IDX_COL)) + .groupBy(FILE_PATH_COL) + .agg( + expr(s"row_index_set($METADATA_ROW_IDX_COL) as row_idxes"), + count("*").as("count")) + .collect().map(row => { + val filename = row.getAs[String](FILE_PATH_COL) + val rowIdxSet = row.getAs[RoaringBitmapWrapper]("row_idxes").inner + val count = row.getAs[Long]("count") + (filename, (rowIdxSet, count)) + }) + .toMap + + val duplicateCount = { + val distinctMatchedRowCounts = collectTouchedFiles.values + .map(_._1.getLongCardinality).sum + val allMatchedRowCounts = collectTouchedFiles.values.map(_._2).sum + allMatchedRowCounts - distinctMatchedRowCounts + } + + val hasMultipleMatches = duplicateCount > 0 + + // Throw error if multiple matches are ambiguous or cannot be computed correctly. + val canBeComputedUnambiguously = { + // Multiple matches are not ambiguous when there is only one unconditional delete as + // all the matched row pairs in the 2nd join in `writeAllChanges` will get deleted. 
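Editorial sketch (not part of this change): duplicates are detected by comparing the total number of join matches against the number of distinct matched target rows, and they are only tolerated when the single matched clause is an unconditional delete. A plain-Scala rendering of both rules, with hypothetical names:

object MultipleMatchCheckSketch {
  // Per touched file: how many distinct target rows matched vs. how many join matches occurred.
  final case class FileMatches(distinctRows: Long, totalMatches: Long)

  def duplicateCount(files: Iterable[FileMatches]): Long =
    files.map(_.totalMatches).sum - files.map(_.distinctRows).sum

  def ambiguous(files: Iterable[FileMatches], singleUnconditionalDelete: Boolean): Boolean =
    duplicateCount(files) > 0 && !singleUnconditionalDelete

  def main(args: Array[String]): Unit = {
    val files = Seq(FileMatches(distinctRows = 10, totalMatches = 12))
    println(duplicateCount(files))                               // 2
    println(ambiguous(files, singleUnconditionalDelete = true))  // false: allowed
    println(ambiguous(files, singleUnconditionalDelete = false)) // true: must raise an error
  }
}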
+ val isUnconditionalDelete = context.cmd.matchedClauses.headOption match { + case Some(DeltaMergeIntoMatchedDeleteClause(None)) => true + case _ => false + } + context.cmd.matchedClauses.size == 1 && isUnconditionalDelete + } + + if (hasMultipleMatches && !canBeComputedUnambiguously) { + throw DeltaErrors.multipleSourceRowMatchingTargetRowInMergeException(context.spark) + } + + if (hasMultipleMatches) { + // This is only allowed for delete-only queries. + // This query will count the duplicates for numTargetRowsDeleted in Job 2, + // because we count matches after the join and not just the target rows. + // We have to compensate for this by subtracting the duplicates later, + // so we need to record them here. + multipleMatchDeleteOnlyOvercount = Some(duplicateCount) + } + + // Get the AddFiles using the touched file names. + val touchedFileNames = collectTouchedFiles.keys.toSeq + + val nameToAddFileMap = context.cmd.generateCandidateFileMap( + context.cmd.targetDeltaLog.dataPath, + dataSkippedFiles) + + val touchedAddFiles = touchedFileNames.map(f => + context.cmd.getTouchedFile(context.cmd.targetDeltaLog.dataPath, f, nameToAddFileMap)) + .map(f => (DeltaFileOperations + .absolutePath(context.cmd.targetDeltaLog.dataPath.toString, f.path) + .toString, f)).toMap + + // When the target table is empty, and the optimizer optimized away the join entirely + // numSourceRows will be incorrectly 0. + // We need to scan the source table once to get the correct + // metric here. + if (context.cmd.metrics("numSourceRows").value == 0 && + (dataSkippedFiles.isEmpty || dataSkippedTargetDF.take(1).isEmpty)) { + val numSourceRows = sourceDF.count() + context.cmd.metrics("numSourceRows").set(numSourceRows) + } + + // Update metrics + context.cmd.metrics("numTargetFilesBeforeSkipping") += context.deltaTxn.snapshot.numOfFiles + context.cmd.metrics("numTargetBytesBeforeSkipping") += context.deltaTxn.snapshot.sizeInBytes + val (afterSkippingBytes, afterSkippingPartitions) = + totalBytesAndDistinctPartitionValues(dataSkippedFiles) + context.cmd.metrics("numTargetFilesAfterSkipping") += dataSkippedFiles.size + context.cmd.metrics("numTargetBytesAfterSkipping") += afterSkippingBytes + context.cmd.metrics("numTargetPartitionsAfterSkipping") += afterSkippingPartitions + val (removedBytes, removedPartitions) = + totalBytesAndDistinctPartitionValues(touchedAddFiles.values.toSeq) + context.cmd.metrics("numTargetFilesRemoved") += touchedAddFiles.size + context.cmd.metrics("numTargetBytesRemoved") += removedBytes + context.cmd.metrics("numTargetPartitionsRemovedFrom") += removedPartitions + + collectTouchedFiles.map(kv => (kv._1, (kv._2._1, touchedAddFiles(kv._1)))) + } + + + /** + * Modify original data frame to insert + * [[GpuDeltaParquetFileFormatUtils.METADATA_ROW_IDX_COL]]. + */ + private def addRowIndexMetaColumn(baseDF: DataFrame): DataFrame = { + val rowIdxAttr = AttributeReference( + METADATA_ROW_IDX_COL, + METADATA_ROW_IDX_FIELD.dataType, + METADATA_ROW_IDX_FIELD.nullable)() + + val newPlan = baseDF.queryExecution.analyzed.transformUp { + case r@LogicalRelation(fs: HadoopFsRelation, _, _, _) => + val newSchema = StructType(fs.dataSchema.fields).add(METADATA_ROW_IDX_FIELD) + + // This is required to ensure that row index is correctly calculated. 
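+ // If the file were split across tasks or filters were pushed down into the Parquet reader,
+ // the reader could skip rows or start counting mid-file, so the generated row index would no
+ // longer match the row's physical position that the per-file row-index bitmaps record.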
+ val newFileFormat = fs.fileFormat.asInstanceOf[DeltaParquetFileFormat] + .copy(isSplittable = false, disablePushDowns = true) + + val newFs = fs.copy(dataSchema = newSchema, fileFormat = newFileFormat)(context.spark) + + val newOutput = r.output :+ rowIdxAttr + r.copy(relation = newFs, output = newOutput) + case p@Project(projectList, _) => + val newProjectList = projectList :+ rowIdxAttr + p.copy(projectList = newProjectList) + } + + Dataset.ofRows(context.spark, newPlan) + } + + /** + * The result is scanning target table with touched files, and added an extra + * [[METADATA_ROW_DEL_COL]] to indicate whether filtered by joining with source table in first + * step. + */ + private def getTouchedTargetDF(touchedFiles: Map[String, (Roaring64Bitmap, AddFile)]) + : DataFrame = { + // Generate a new target dataframe that has same output attributes exprIds as the target plan. + // This allows us to apply the existing resolved update/insert expressions. + val baseTargetDF = buildTargetDFWithFiles(touchedFiles.values.map(_._2).toSeq) + + val newPlan = { + val rowDelAttr = AttributeReference( + METADATA_ROW_DEL_COL, + METADATA_ROW_DEL_FIELD.dataType, + METADATA_ROW_DEL_FIELD.nullable)() + + baseTargetDF.queryExecution.analyzed.transformUp { + case r@LogicalRelation(fs: HadoopFsRelation, _, _, _) => + val newSchema = StructType(fs.dataSchema.fields).add(METADATA_ROW_DEL_FIELD) + + // This is required to ensure that row index is correctly calculated. + val newFileFormat = { + val oldFormat = fs.fileFormat.asInstanceOf[DeltaParquetFileFormat] + val dvs = touchedFiles.map(kv => (new URI(kv._1), + DeletionVectorDescriptorWithFilterType(toDeletionVector(kv._2._1), + RowIndexFilterType.UNKNOWN))) + val broadcastDVs = context.spark.sparkContext.broadcast(dvs) + + oldFormat.copy(isSplittable = false, + broadcastDvMap = Some(broadcastDVs), + disablePushDowns = true) + } + + val newFs = fs.copy(dataSchema = newSchema, fileFormat = newFileFormat)(context.spark) + + val newOutput = r.output :+ rowDelAttr + r.copy(relation = newFs, output = newOutput) + case p@Project(projectList, _) => + val newProjectList = projectList :+ rowDelAttr + p.copy(projectList = newProjectList) + } + } + + val df = Dataset.ofRows(context.spark, newPlan) + .withColumn(TARGET_ROW_PRESENT_COL, lit(true)) + + df + } + + /** + * Generate a plan by calculating modified rows. It's computed by joining source and target + * tables, where target table has been filtered by (`__metadata_file_name`, + * `__metadata_row_idx`) pairs collected in first step. + * + * Schema of `modifiedDF`: + * + * targetSchema + ROW_DROPPED_COL + TARGET_ROW_PRESENT_COL + + * SOURCE_ROW_PRESENT_COL + INCR_METRICS_COL + * INCR_METRICS_COL + * + * It consists of several parts: + * + * 1. Unmatched source rows which are inserted + * 2. Unmatched source rows which are deleted + * 3. Target rows which are updated + * 4. 
Target rows which are deleted + */ + private def getModifiedDF(touchedFiles: Map[String, (Roaring64Bitmap, AddFile)]): DataFrame = { + val sourceDF = this.sourceDF + .withColumn(SOURCE_ROW_PRESENT_COL, new Column(incrSourceRowCountExpr)) + + val targetDF = getTouchedTargetDF(touchedFiles) + + val joinedDF = { + val joinType = if (hasNoInserts && + context.spark.conf.get(DeltaSQLConf.MERGE_MATCHED_ONLY_ENABLED)) { + "inner" + } else { + "leftOuter" + } + val matchedTargetDF = targetDF.filter(METADATA_ROW_DEL_COL) + .drop(METADATA_ROW_DEL_COL) + + sourceDF.join(matchedTargetDF, new Column(context.cmd.condition), joinType) + } + + val modifiedRowsSchema = context.deltaTxn.metadata.schema + .add(ROW_DROPPED_FIELD) + .add(TARGET_ROW_PRESENT_FIELD.copy(nullable = true)) + .add(SOURCE_ROW_PRESENT_FIELD.copy(nullable = true)) + .add(INCR_METRICS_FIELD) + + // Here we generate a case when statement to handle all cases: + // CASE + // WHEN [source row has no match in the target] + // CASE WHEN [not matched condition 1] + // [not matched output 1] + // WHEN [not matched condition 2] + // [not matched output 2] + // ELSE + // [default output for an unmatched source row] + // WHEN [source row has a match in the target] + // CASE WHEN [matched condition 1] + // [matched output 1] + // WHEN [matched condition 2] + // [matched output 2] + // ELSE + // [default output for a matched row] + // END + + val notMatchedConditions = context.cmd.notMatchedClauses.map(clauseCondition) + val notMatchedExpr = { + val deletedNotMatchedRow = { + targetOutputCols :+ + Literal.TrueLiteral :+ + Literal.FalseLiteral :+ + Literal(null) :+ + Literal.TrueLiteral + } + if (context.cmd.notMatchedClauses.isEmpty) { + // If there is no `WHEN NOT MATCHED` clause, we should just delete the not-matched row + deletedNotMatchedRow + } else { + val notMatchedOutputs = context.cmd.notMatchedClauses.map(clauseOutput) + modifiedRowsSchema.zipWithIndex.map { + case (_, idx) => + CaseWhen(notMatchedConditions.zip(notMatchedOutputs.map(_(idx))), + deletedNotMatchedRow(idx)) + } + } + } + + val matchedConditions = context.cmd.matchedClauses.map(clauseCondition) + val matchedOutputs = context.cmd.matchedClauses.map(clauseOutput) + val matchedExprs = { + val notMatchedRow = { + targetOutputCols :+ + Literal.FalseLiteral :+ + Literal.TrueLiteral :+ + Literal(null) :+ + Literal.TrueLiteral + } + if (context.cmd.matchedClauses.isEmpty) { + // If there is no matched clause, this is an insert-only merge, so we should delete this row. + notMatchedRow + } else { + modifiedRowsSchema.zipWithIndex.map { + case (_, idx) => + CaseWhen(matchedConditions.zip(matchedOutputs.map(_(idx))), + notMatchedRow(idx)) + } + } + } + + val sourceRowHasNoMatch = col(TARGET_ROW_PRESENT_COL).isNull.expr + + val modifiedCols = modifiedRowsSchema.zipWithIndex.map { case (col, idx) => + val caseWhen = CaseWhen( + Seq(sourceRowHasNoMatch -> notMatchedExpr(idx)), + matchedExprs(idx)) + new Column(Alias(caseWhen, col.name)()) + } + + val modifiedDF = { + + // Make this a UDF to keep catalyst from being so aggressive that it even removes the join!
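+ // Because the UDF is opaque to the optimizer, the always-true filter built from it below
+ // cannot be constant-folded away, which keeps the join and the metric-updating expressions
+ // alive in the physical plan.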
+ val noopRowDroppedCol = udf(new GpuDeltaNoopUDF()).apply(!col(ROW_DROPPED_COL)) + + val modifiedDF = joinedDF.select(modifiedCols: _*) + // This will not filter anything since they always return true, but we need to avoid + // catalyst from optimizing these udf + .filter(noopRowDroppedCol && col(INCR_METRICS_COL)) + .drop(ROW_DROPPED_COL, INCR_METRICS_COL, TARGET_ROW_PRESENT_COL, SOURCE_ROW_PRESENT_COL) + + repartitionIfNeeded(modifiedDF) + } + + modifiedDF + } + + private def getUnmodifiedDF(touchedFiles: Map[String, (Roaring64Bitmap, AddFile)]): DataFrame = { + getTouchedTargetDF(touchedFiles) + .filter(!col(METADATA_ROW_DEL_COL)) + .drop(TARGET_ROW_PRESENT_COL, METADATA_ROW_DEL_COL) + } +} + + +object MergeExecutor { + + /** + * Spark UI will track all normal accumulators along with Spark tasks to show them on Web UI. + * However, the accumulator used by `MergeIntoCommand` can store a very large value since it + * tracks all files that need to be rewritten. We should ask Spark UI to not remember it, + * otherwise, the UI data may consume lots of memory. Hence, we use the prefix `internal.metrics.` + * to make this accumulator become an internal accumulator, so that it will not be tracked by + * Spark UI. + */ + val TOUCHED_FILES_ACCUM_NAME = "internal.metrics.MergeIntoDelta.touchedFiles" + + val ROW_ID_COL = "_row_id_" + val FILE_PATH_COL: String = GpuDeltaParquetFileFormatUtils.FILE_PATH_COL + val SOURCE_ROW_PRESENT_COL: String = "_source_row_present_" + val SOURCE_ROW_PRESENT_FIELD: StructField = StructField(SOURCE_ROW_PRESENT_COL, BooleanType, + nullable = false) + val TARGET_ROW_PRESENT_COL: String = "_target_row_present_" + val TARGET_ROW_PRESENT_FIELD: StructField = StructField(TARGET_ROW_PRESENT_COL, BooleanType, + nullable = false) + val ROW_DROPPED_COL: String = GpuDeltaMergeConstants.ROW_DROPPED_COL + val ROW_DROPPED_FIELD: StructField = StructField(ROW_DROPPED_COL, BooleanType, nullable = false) + val INCR_METRICS_COL: String = "_incr_metrics_" + val INCR_METRICS_FIELD: StructField = StructField(INCR_METRICS_COL, BooleanType, nullable = false) + val INCR_ROW_COUNT_COL: String = "_incr_row_count_" + + // Some Delta versions use Literal(null) which translates to a literal of NullType instead + // of the Literal(null, StringType) which is needed, so using a fixed version here + // rather than the version from Delta Lake. + val CDC_TYPE_NOT_CDC_LITERAL: Literal = Literal(null, StringType) + + def toDeletionVector(bitmap: Roaring64Bitmap): DeletionVectorDescriptor = { + DeletionVectorDescriptor.inlineInLog(RoaringBitmapWrapper(bitmap).serializeToBytes(), + bitmap.getLongCardinality) + } + + /** Count the number of distinct partition values among the AddFiles in the given set. */ + def totalBytesAndDistinctPartitionValues(files: Seq[FileAction]): (Long, Int) = { + val distinctValues = new mutable.HashSet[Map[String, String]]() + var bytes = 0L + val iter = files.collect { case a: AddFile => a }.iterator + while (iter.hasNext) { + val file = iter.next() + distinctValues += file.partitionValues + bytes += file.size + } + // If the only distinct value map is an empty map, then it must be an unpartitioned table. + // Return 0 in that case. 
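+ // For example, for an unpartitioned table every AddFile carries partitionValues == Map.empty,
+ // so the set holds exactly one (empty) map and we report 0 distinct partitions rather than 1.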
+ val numDistinctValues = + if (distinctValues.size == 1 && distinctValues.head.isEmpty) 0 else distinctValues.size + (bytes, numDistinctValues) + } +} \ No newline at end of file diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuMergeIntoCommand.scala b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuMergeIntoCommand.scala new file mode 100644 index 00000000000..71e8a413b00 --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuMergeIntoCommand.scala @@ -0,0 +1,1189 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * This file was derived from MergeIntoCommand.scala + * in the Delta Lake project at https://github.com/delta-io/delta. + * + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.databricks.sql.transaction.tahoe.rapids + +import java.util.concurrent.TimeUnit + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import com.databricks.sql.transaction.tahoe._ +import com.databricks.sql.transaction.tahoe.DeltaOperations.MergePredicate +import com.databricks.sql.transaction.tahoe.actions.{AddCDCFile, AddFile, FileAction} +import com.databricks.sql.transaction.tahoe.commands.DeltaCommand +import com.databricks.sql.transaction.tahoe.schema.ImplicitMetadataOperation +import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf +import com.databricks.sql.transaction.tahoe.util.{AnalysisHelper, SetAccumulator} +import com.fasterxml.jackson.databind.annotation.JsonDeserialize +import com.nvidia.spark.rapids.{BaseExprMeta, GpuOverrides, RapidsConf} +import com.nvidia.spark.rapids.delta._ + +import org.apache.spark.SparkContext +import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute +import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, AttributeReference, BasePredicate, Expression, Literal, NamedExpression, PredicateHelper, UnsafeProjection} +import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate +import org.apache.spark.sql.catalyst.plans.logical.{DeltaMergeIntoClause, DeltaMergeIntoMatchedClause, DeltaMergeIntoMatchedDeleteClause, DeltaMergeIntoMatchedUpdateClause, DeltaMergeIntoNotMatchedBySourceClause, DeltaMergeIntoNotMatchedClause, LogicalPlan, Project} +import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.execution.command.LeafRunnableCommand +import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.functions._ +import 
org.apache.spark.sql.types.{DataTypes, LongType, StringType, StructType} +case class GpuMergeDataSizes( + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + rows: Option[Long] = None, + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + files: Option[Long] = None, + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + bytes: Option[Long] = None, + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + partitions: Option[Long] = None) + +/** + * Represents the state of a single merge clause: + * - merge clause's (optional) predicate + * - action type (insert, update, delete) + * - action's expressions + */ +case class GpuMergeClauseStats( + condition: Option[String], + actionType: String, + actionExpr: Seq[String]) + +object GpuMergeClauseStats { + def apply(mergeClause: DeltaMergeIntoClause): GpuMergeClauseStats = { + GpuMergeClauseStats( + condition = mergeClause.condition.map(_.sql), + mergeClause.clauseType.toLowerCase(), + actionExpr = mergeClause.actions.map(_.sql)) + } +} + +/** State for a GPU merge operation */ +case class GpuMergeStats( + // Merge condition expression + conditionExpr: String, + + // Expressions used in old MERGE stats, now always Null + updateConditionExpr: String, + updateExprs: Seq[String], + insertConditionExpr: String, + insertExprs: Seq[String], + deleteConditionExpr: String, + + // Newer expressions used in MERGE with any number of MATCHED/NOT MATCHED + matchedStats: Seq[GpuMergeClauseStats], + notMatchedStats: Seq[GpuMergeClauseStats], + + // Data sizes of source and target at different stages of processing + source: GpuMergeDataSizes, + targetBeforeSkipping: GpuMergeDataSizes, + targetAfterSkipping: GpuMergeDataSizes, + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + sourceRowsInSecondScan: Option[Long], + + // Data change sizes + targetFilesRemoved: Long, + targetFilesAdded: Long, + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + targetChangeFilesAdded: Option[Long], + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + targetChangeFileBytes: Option[Long], + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + targetBytesRemoved: Option[Long], + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + targetBytesAdded: Option[Long], + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + targetPartitionsRemovedFrom: Option[Long], + @JsonDeserialize(contentAs = classOf[java.lang.Long]) + targetPartitionsAddedTo: Option[Long], + targetRowsCopied: Long, + targetRowsUpdated: Long, + targetRowsInserted: Long, + targetRowsDeleted: Long +) + +object GpuMergeStats { + + def fromMergeSQLMetrics( + metrics: Map[String, SQLMetric], + condition: Expression, + matchedClauses: Seq[DeltaMergeIntoMatchedClause], + notMatchedClauses: Seq[DeltaMergeIntoNotMatchedClause], + isPartitioned: Boolean): GpuMergeStats = { + + def metricValueIfPartitioned(metricName: String): Option[Long] = { + if (isPartitioned) Some(metrics(metricName).value) else None + } + + GpuMergeStats( + // Merge condition expression + conditionExpr = condition.sql, + + // Newer expressions used in MERGE with any number of MATCHED/NOT MATCHED + matchedStats = matchedClauses.map(GpuMergeClauseStats(_)), + notMatchedStats = notMatchedClauses.map(GpuMergeClauseStats(_)), + + // Data sizes of source and target at different stages of processing + source = GpuMergeDataSizes(rows = Some(metrics("numSourceRows").value)), + targetBeforeSkipping = + GpuMergeDataSizes( + files = Some(metrics("numTargetFilesBeforeSkipping").value), + bytes = 
Some(metrics("numTargetBytesBeforeSkipping").value)), + targetAfterSkipping = + GpuMergeDataSizes( + files = Some(metrics("numTargetFilesAfterSkipping").value), + bytes = Some(metrics("numTargetBytesAfterSkipping").value), + partitions = metricValueIfPartitioned("numTargetPartitionsAfterSkipping")), + sourceRowsInSecondScan = + metrics.get("numSourceRowsInSecondScan").map(_.value).filter(_ >= 0), + + // Data change sizes + targetFilesAdded = metrics("numTargetFilesAdded").value, + targetChangeFilesAdded = metrics.get("numTargetChangeFilesAdded").map(_.value), + targetChangeFileBytes = metrics.get("numTargetChangeFileBytes").map(_.value), + targetFilesRemoved = metrics("numTargetFilesRemoved").value, + targetBytesAdded = Some(metrics("numTargetBytesAdded").value), + targetBytesRemoved = Some(metrics("numTargetBytesRemoved").value), + targetPartitionsRemovedFrom = metricValueIfPartitioned("numTargetPartitionsRemovedFrom"), + targetPartitionsAddedTo = metricValueIfPartitioned("numTargetPartitionsAddedTo"), + targetRowsCopied = metrics("numTargetRowsCopied").value, + targetRowsUpdated = metrics("numTargetRowsUpdated").value, + targetRowsInserted = metrics("numTargetRowsInserted").value, + targetRowsDeleted = metrics("numTargetRowsDeleted").value, + + // Deprecated fields + updateConditionExpr = null, + updateExprs = null, + insertConditionExpr = null, + insertExprs = null, + deleteConditionExpr = null) + } +} + +/** + * GPU version of Delta Lake's MergeIntoCommand. + * + * Performs a merge of a source query/table into a Delta table. + * + * Issues an error message when the ON search_condition of the MERGE statement can match + * a single row from the target table with multiple rows of the source table-reference. + * + * Algorithm: + * + * Phase 1: Find the input files in target that are touched by the rows that satisfy + * the condition and verify that no two source rows match with the same target row. + * This is implemented as an inner-join using the given condition. See [[findTouchedFiles]] + * for more details. + * + * Phase 2: Read the touched files again and write new files with updated and/or inserted rows. + * + * Phase 3: Use the Delta protocol to atomically remove the touched files and add the new files. + * + * @param source Source data to merge from + * @param target Target table to merge into + * @param gpuDeltaLog Delta log to use + * @param condition Condition for a source row to match with a target row + * @param matchedClauses All info related to matched clauses. + * @param notMatchedClauses All info related to not matched clause. + * @param migratedSchema The final schema of the target - may be changed by schema evolution. 
+ */ +case class GpuMergeIntoCommand( + @transient source: LogicalPlan, + @transient target: LogicalPlan, + @transient gpuDeltaLog: GpuDeltaLog, + condition: Expression, + matchedClauses: Seq[DeltaMergeIntoMatchedClause], + notMatchedClauses: Seq[DeltaMergeIntoNotMatchedClause], + notMatchedBySourceClauses: Seq[DeltaMergeIntoNotMatchedBySourceClause], + migratedSchema: Option[StructType])( + @transient val rapidsConf: RapidsConf) + extends LeafRunnableCommand + with DeltaCommand with PredicateHelper with AnalysisHelper with ImplicitMetadataOperation { + + import GpuMergeIntoCommand._ + + import SQLMetrics._ + import com.databricks.sql.transaction.tahoe.commands.cdc.CDCReader._ + + override val otherCopyArgs: Seq[AnyRef] = Seq(rapidsConf) + + override val canMergeSchema: Boolean = conf.getConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE) + override val canOverwriteSchema: Boolean = false + + override val output: Seq[Attribute] = Seq( + AttributeReference("num_affected_rows", LongType)(), + AttributeReference("num_updated_rows", LongType)(), + AttributeReference("num_deleted_rows", LongType)(), + AttributeReference("num_inserted_rows", LongType)()) + + @transient private lazy val sc: SparkContext = SparkContext.getOrCreate() + @transient private lazy val targetDeltaLog: DeltaLog = gpuDeltaLog.deltaLog + /** + * Map to get target output attributes by name. + * The case sensitivity of the map is set accordingly to Spark configuration. + */ + @transient private lazy val targetOutputAttributesMap: Map[String, Attribute] = { + val attrMap: Map[String, Attribute] = target + .outputSet.view + .map(attr => attr.name -> attr).toMap + if (conf.caseSensitiveAnalysis) { + attrMap + } else { + CaseInsensitiveMap(attrMap) + } + } + + /** Whether this merge statement has only a single insert (NOT MATCHED) clause. */ + private def isSingleInsertOnly: Boolean = matchedClauses.isEmpty && notMatchedClauses.length == 1 + /** Whether this merge statement has only MATCHED clauses. */ + private def isMatchedOnly: Boolean = notMatchedClauses.isEmpty && matchedClauses.nonEmpty + + // We over-count numTargetRowsDeleted when there are multiple matches; + // this is the amount of the overcount, so we can subtract it to get a correct final metric. 
+ private var multipleMatchDeleteOnlyOvercount: Option[Long] = None + + override lazy val metrics = Map[String, SQLMetric]( + "numSourceRows" -> createMetric(sc, "number of source rows"), + "numSourceRowsInSecondScan" -> + createMetric(sc, "number of source rows (during repeated scan)"), + "numTargetRowsCopied" -> createMetric(sc, "number of target rows rewritten unmodified"), + "numTargetRowsInserted" -> createMetric(sc, "number of inserted rows"), + "numTargetRowsUpdated" -> createMetric(sc, "number of updated rows"), + "numTargetRowsDeleted" -> createMetric(sc, "number of deleted rows"), + "numTargetFilesBeforeSkipping" -> createMetric(sc, "number of target files before skipping"), + "numTargetFilesAfterSkipping" -> createMetric(sc, "number of target files after skipping"), + "numTargetFilesRemoved" -> createMetric(sc, "number of files removed to target"), + "numTargetFilesAdded" -> createMetric(sc, "number of files added to target"), + "numTargetChangeFilesAdded" -> + createMetric(sc, "number of change data capture files generated"), + "numTargetChangeFileBytes" -> + createMetric(sc, "total size of change data capture files generated"), + "numTargetBytesBeforeSkipping" -> createMetric(sc, "number of target bytes before skipping"), + "numTargetBytesAfterSkipping" -> createMetric(sc, "number of target bytes after skipping"), + "numTargetBytesRemoved" -> createMetric(sc, "number of target bytes removed"), + "numTargetBytesAdded" -> createMetric(sc, "number of target bytes added"), + "numTargetPartitionsAfterSkipping" -> + createMetric(sc, "number of target partitions after skipping"), + "numTargetPartitionsRemovedFrom" -> + createMetric(sc, "number of target partitions from which files were removed"), + "numTargetPartitionsAddedTo" -> + createMetric(sc, "number of target partitions to which files were added"), + "executionTimeMs" -> + createMetric(sc, "time taken to execute the entire operation"), + "scanTimeMs" -> + createMetric(sc, "time taken to scan the files for matches"), + "rewriteTimeMs" -> + createMetric(sc, "time taken to rewrite the matched files")) + + override def run(spark: SparkSession): Seq[Row] = { + recordDeltaOperation(targetDeltaLog, "delta.dml.merge") { + val startTime = System.nanoTime() + gpuDeltaLog.withNewTransaction { deltaTxn => + if (target.schema.size != deltaTxn.metadata.schema.size) { + throw DeltaErrors.schemaChangedSinceAnalysis( + atAnalysis = target.schema, latestSchema = deltaTxn.metadata.schema) + } + + if (canMergeSchema) { + updateMetadata( + spark, deltaTxn, migratedSchema.getOrElse(target.schema), + deltaTxn.metadata.partitionColumns, deltaTxn.metadata.configuration, + isOverwriteMode = false, rearrangeOnly = false) + } + + val deltaActions = { + if (isSingleInsertOnly && spark.conf.get(DeltaSQLConf.MERGE_INSERT_ONLY_ENABLED)) { + writeInsertsOnlyWhenNoMatchedClauses(spark, deltaTxn) + } else { + val filesToRewrite = findTouchedFiles(spark, deltaTxn) + val newWrittenFiles = withStatusCode("DELTA", "Writing merged data") { + writeAllChanges(spark, deltaTxn, filesToRewrite) + } + filesToRewrite.map(_.remove) ++ newWrittenFiles + } + } + + // Metrics should be recorded before commit (where they are written to delta logs). + metrics("executionTimeMs").set((System.nanoTime() - startTime) / 1000 / 1000) + deltaTxn.registerSQLMetrics(spark, metrics) + + // This is a best-effort sanity check. 
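+ // A mismatch between the two source scans usually means the source is non-deterministic (for
+ // example it uses rand() or reads data that changed mid-query), in which case the merge result
+ // cannot be trusted.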
+ if (metrics("numSourceRowsInSecondScan").value >= 0 && + metrics("numSourceRows").value != metrics("numSourceRowsInSecondScan").value) { + log.warn(s"Merge source has ${metrics("numSourceRows").value} rows in initial scan but " + + s"${metrics("numSourceRowsInSecondScan").value} rows in second scan") + if (conf.getConf(DeltaSQLConf.MERGE_FAIL_IF_SOURCE_CHANGED)) { + throw DeltaErrors.sourceNotDeterministicInMergeException(spark) + } + } + + deltaTxn.commit( + deltaActions, + DeltaOperations.Merge( + Option(condition), + matchedClauses.map(DeltaOperations.MergePredicate(_)), + notMatchedClauses.map(DeltaOperations.MergePredicate(_)), + // We do not support notMatchedBySourcePredicates yet and fall back to CPU + // See https://github.com/NVIDIA/spark-rapids/issues/8415 + notMatchedBySourcePredicates = Seq.empty[MergePredicate] + )) + + // Record metrics + val stats = GpuMergeStats.fromMergeSQLMetrics( + metrics, condition, matchedClauses, notMatchedClauses, + deltaTxn.metadata.partitionColumns.nonEmpty) + recordDeltaEvent(targetDeltaLog, "delta.dml.merge.stats", data = stats) + + } + spark.sharedState.cacheManager.recacheByPlan(spark, target) + } + // This is needed to make the SQL metrics visible in the Spark UI. Also this needs + // to be outside the recordMergeOperation because this method will update some metric. + val executionId = spark.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + SQLMetrics.postDriverMetricUpdates(spark.sparkContext, executionId, metrics.values.toSeq) + Seq(Row(metrics("numTargetRowsUpdated").value + metrics("numTargetRowsDeleted").value + + metrics("numTargetRowsInserted").value, metrics("numTargetRowsUpdated").value, + metrics("numTargetRowsDeleted").value, metrics("numTargetRowsInserted").value)) + } + + /** + * Find the target table files that contain the rows that satisfy the merge condition. This is + * implemented as an inner-join between the source query/table and the target table using + * the merge condition. 
+ */ + private def findTouchedFiles( + spark: SparkSession, + deltaTxn: OptimisticTransaction + ): Seq[AddFile] = recordMergeOperation(sqlMetricName = "scanTimeMs") { + + // Accumulator to collect all the distinct touched files + val touchedFilesAccum = new SetAccumulator[String]() + spark.sparkContext.register(touchedFilesAccum, TOUCHED_FILES_ACCUM_NAME) + + // UDFs to records touched files names and add them to the accumulator + val recordTouchedFileName = udf(new GpuDeltaRecordTouchedFileNameUDF(touchedFilesAccum)) + .asNondeterministic() + + // Skip data based on the merge condition + val targetOnlyPredicates = + splitConjunctivePredicates(condition).filter(_.references.subsetOf(target.outputSet)) + val dataSkippedFiles = deltaTxn.filterFiles(targetOnlyPredicates) + + // UDF to increment metrics + val incrSourceRowCountExpr = makeMetricUpdateUDF("numSourceRows") + val sourceDF = Dataset.ofRows(spark, source) + .filter(new Column(incrSourceRowCountExpr)) + + // Apply inner join to between source and target using the merge condition to find matches + // In addition, we attach two columns + // - a monotonically increasing row id for target rows to later identify whether the same + // target row is modified by multiple user or not + // - the target file name the row is from to later identify the files touched by matched rows + val targetDF = Dataset.ofRows(spark, buildTargetPlanWithFiles(deltaTxn, dataSkippedFiles)) + .withColumn(ROW_ID_COL, monotonically_increasing_id()) + .withColumn(FILE_NAME_COL, input_file_name()) + val joinToFindTouchedFiles = sourceDF.join(targetDF, new Column(condition), "inner") + + // Process the matches from the inner join to record touched files and find multiple matches + val collectTouchedFiles = joinToFindTouchedFiles + .select(col(ROW_ID_COL), recordTouchedFileName(col(FILE_NAME_COL)).as("one")) + + // Calculate frequency of matches per source row + val matchedRowCounts = collectTouchedFiles.groupBy(ROW_ID_COL).agg(sum("one").as("count")) + + // Get multiple matches and simultaneously collect (using touchedFilesAccum) the file names + // multipleMatchCount = # of target rows with more than 1 matching source row (duplicate match) + // multipleMatchSum = total # of duplicate matched rows + import spark.implicits._ + val (multipleMatchCount, multipleMatchSum) = matchedRowCounts + .filter("count > 1") + .select(coalesce(count("*"), lit(0)), coalesce(sum("count"), lit(0))) + .as[(Long, Long)] + .collect() + .head + + val hasMultipleMatches = multipleMatchCount > 0 + + // Throw error if multiple matches are ambiguous or cannot be computed correctly. + val canBeComputedUnambiguously = { + // Multiple matches are not ambiguous when there is only one unconditional delete as + // all the matched row pairs in the 2nd join in `writeAllChanges` will get deleted. + val isUnconditionalDelete = matchedClauses.headOption match { + case Some(DeltaMergeIntoMatchedDeleteClause(None)) => true + case _ => false + } + matchedClauses.size == 1 && isUnconditionalDelete + } + + if (hasMultipleMatches && !canBeComputedUnambiguously) { + throw DeltaErrors.multipleSourceRowMatchingTargetRowInMergeException(spark) + } + + if (hasMultipleMatches) { + // This is only allowed for delete-only queries. + // This query will count the duplicates for numTargetRowsDeleted in Job 2, + // because we count matches after the join and not just the target rows. + // We have to compensate for this by subtracting the duplicates later, + // so we need to record them here. 
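+ // Hypothetical example: if two target rows matched 3 and 2 source rows respectively, then
+ // multipleMatchCount = 2, multipleMatchSum = 5, and the overcount recorded below is 5 - 2 = 3.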
+ val duplicateCount = multipleMatchSum - multipleMatchCount + multipleMatchDeleteOnlyOvercount = Some(duplicateCount) + } + + // Get the AddFiles using the touched file names. + val touchedFileNames = touchedFilesAccum.value.iterator().asScala.toSeq + logTrace(s"findTouchedFiles: matched files:\n\t${touchedFileNames.mkString("\n\t")}") + + val nameToAddFileMap = generateCandidateFileMap(targetDeltaLog.dataPath, dataSkippedFiles) + val touchedAddFiles = touchedFileNames.map(f => + getTouchedFile(targetDeltaLog.dataPath, f, nameToAddFileMap)) + + // When the target table is empty, and the optimizer optimized away the join entirely + // numSourceRows will be incorrectly 0. We need to scan the source table once to get the correct + // metric here. + if (metrics("numSourceRows").value == 0 && + (dataSkippedFiles.isEmpty || targetDF.take(1).isEmpty)) { + val numSourceRows = sourceDF.count() + metrics("numSourceRows").set(numSourceRows) + } + + // Update metrics + metrics("numTargetFilesBeforeSkipping") += deltaTxn.snapshot.numOfFiles + metrics("numTargetBytesBeforeSkipping") += deltaTxn.snapshot.sizeInBytes + val (afterSkippingBytes, afterSkippingPartitions) = + totalBytesAndDistinctPartitionValues(dataSkippedFiles) + metrics("numTargetFilesAfterSkipping") += dataSkippedFiles.size + metrics("numTargetBytesAfterSkipping") += afterSkippingBytes + metrics("numTargetPartitionsAfterSkipping") += afterSkippingPartitions + val (removedBytes, removedPartitions) = totalBytesAndDistinctPartitionValues(touchedAddFiles) + metrics("numTargetFilesRemoved") += touchedAddFiles.size + metrics("numTargetBytesRemoved") += removedBytes + metrics("numTargetPartitionsRemovedFrom") += removedPartitions + touchedAddFiles + } + + /** + * This is an optimization of the case when there is no update clause for the merge. + * We perform an left anti join on the source data to find the rows to be inserted. + * + * This will currently only optimize for the case when there is a _single_ notMatchedClause. 
+ */ + private def writeInsertsOnlyWhenNoMatchedClauses( + spark: SparkSession, + deltaTxn: OptimisticTransaction + ): Seq[FileAction] = recordMergeOperation(sqlMetricName = "rewriteTimeMs") { + + // UDFs to update metrics + val incrSourceRowCountExpr = makeMetricUpdateUDF("numSourceRows") + val incrInsertedCountExpr = makeMetricUpdateUDF("numTargetRowsInserted") + + val outputColNames = getTargetOutputCols(deltaTxn).map(_.name) + // we use head here since we know there is only a single notMatchedClause + val outputExprs = notMatchedClauses.head.resolvedActions.map(_.expr) + val outputCols = outputExprs.zip(outputColNames).map { case (expr, name) => + new Column(Alias(expr, name)()) + } + + // source DataFrame + val sourceDF = Dataset.ofRows(spark, source) + .filter(new Column(incrSourceRowCountExpr)) + .filter(new Column(notMatchedClauses.head.condition.getOrElse(Literal.TrueLiteral))) + + // Skip data based on the merge condition + val conjunctivePredicates = splitConjunctivePredicates(condition) + val targetOnlyPredicates = + conjunctivePredicates.filter(_.references.subsetOf(target.outputSet)) + val dataSkippedFiles = deltaTxn.filterFiles(targetOnlyPredicates) + + // target DataFrame + val targetDF = Dataset.ofRows( + spark, buildTargetPlanWithFiles(deltaTxn, dataSkippedFiles)) + + val insertDf = sourceDF.join(targetDF, new Column(condition), "leftanti") + .select(outputCols: _*) + .filter(new Column(incrInsertedCountExpr)) + + val newFiles = deltaTxn + .writeFiles(repartitionIfNeeded(spark, insertDf, deltaTxn.metadata.partitionColumns)) + + // Update metrics + metrics("numTargetFilesBeforeSkipping") += deltaTxn.snapshot.numOfFiles + metrics("numTargetBytesBeforeSkipping") += deltaTxn.snapshot.sizeInBytes + val (afterSkippingBytes, afterSkippingPartitions) = + totalBytesAndDistinctPartitionValues(dataSkippedFiles) + metrics("numTargetFilesAfterSkipping") += dataSkippedFiles.size + metrics("numTargetBytesAfterSkipping") += afterSkippingBytes + metrics("numTargetPartitionsAfterSkipping") += afterSkippingPartitions + metrics("numTargetFilesRemoved") += 0 + metrics("numTargetBytesRemoved") += 0 + metrics("numTargetPartitionsRemovedFrom") += 0 + val (addedBytes, addedPartitions) = totalBytesAndDistinctPartitionValues(newFiles) + metrics("numTargetFilesAdded") += newFiles.count(_.isInstanceOf[AddFile]) + metrics("numTargetBytesAdded") += addedBytes + metrics("numTargetPartitionsAddedTo") += addedPartitions + newFiles + } + + /** + * Write new files by reading the touched files and updating/inserting data using the source + * query/table. This is implemented using a full|right-outer-join using the merge condition. + * + * Note that unlike the insert-only code paths with just one control column INCR_ROW_COUNT_COL, + * this method has two additional control columns ROW_DROPPED_COL for dropping deleted rows and + * CDC_TYPE_COL_NAME used for handling CDC when enabled. 
+ */ + private def writeAllChanges( + spark: SparkSession, + deltaTxn: OptimisticTransaction, + filesToRewrite: Seq[AddFile] + ): Seq[FileAction] = recordMergeOperation(sqlMetricName = "rewriteTimeMs") { + import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLiteral} + + val cdcEnabled = DeltaConfigs.CHANGE_DATA_FEED.fromMetaData(deltaTxn.metadata) + + var targetOutputCols = getTargetOutputCols(deltaTxn) + var outputRowSchema = deltaTxn.metadata.schema + + // When we have duplicate matches (only allowed when the whenMatchedCondition is a delete with + // no match condition) we will incorrectly generate duplicate CDC rows. + // Duplicate matches can be due to: + // - Duplicate rows in the source w.r.t. the merge condition + // - A target-only or source-only merge condition, which essentially turns our join into a cross + // join with the target/source satisfiying the merge condition. + // These duplicate matches are dropped from the main data output since this is a delete + // operation, but the duplicate CDC rows are not removed by default. + // See https://github.com/delta-io/delta/issues/1274 + + // We address this specific scenario by adding row ids to the target before performing our join. + // There should only be one CDC delete row per target row so we can use these row ids to dedupe + // the duplicate CDC delete rows. + + // We also need to address the scenario when there are duplicate matches with delete and we + // insert duplicate rows. Here we need to additionally add row ids to the source before the + // join to avoid dropping these valid duplicate inserted rows and their corresponding cdc rows. + + // When there is an insert clause, we set SOURCE_ROW_ID_COL=null for all delete rows because we + // need to drop the duplicate matches. + val isDeleteWithDuplicateMatchesAndCdc = multipleMatchDeleteOnlyOvercount.nonEmpty && cdcEnabled + + // Generate a new logical plan that has same output attributes exprIds as the target plan. + // This allows us to apply the existing resolved update/insert expressions. + val newTarget = buildTargetPlanWithFiles(deltaTxn, filesToRewrite) + val joinType = if (isMatchedOnly && + spark.conf.get(DeltaSQLConf.MERGE_MATCHED_ONLY_ENABLED)) { + "rightOuter" + } else { + "fullOuter" + } + + logDebug(s"""writeAllChanges using $joinType join: + | source.output: ${source.outputSet} + | target.output: ${target.outputSet} + | condition: $condition + | newTarget.output: ${newTarget.outputSet} + """.stripMargin) + + // UDFs to update metrics + // Make UDFs that appear in the custom join processor node deterministic, as they always + // return true and update a metric. Catalyst precludes non-deterministic UDFs that are not + // allowed outside a very specific set of Catalyst nodes (Project, Filter, Window, Aggregate). + val incrSourceRowCountExpr = makeMetricUpdateUDF("numSourceRowsInSecondScan") + val incrUpdatedCountExpr = makeMetricUpdateUDF("numTargetRowsUpdated", deterministic = true) + val incrInsertedCountExpr = makeMetricUpdateUDF("numTargetRowsInserted", deterministic = true) + val incrNoopCountExpr = makeMetricUpdateUDF("numTargetRowsCopied", deterministic = true) + val incrDeletedCountExpr = makeMetricUpdateUDF("numTargetRowsDeleted", deterministic = true) + + // Apply an outer join to find both, matches and non-matches. We are adding two boolean fields + // with value `true`, one to each side of the join. 
Whether this field is null or not after + // the outer join, will allow us to identify whether the resultant joined row was a + // matched inner result or an unmatched result with null on one side. + // We add row IDs to the targetDF if we have a delete-when-matched clause with duplicate + // matches and CDC is enabled, and additionally add row IDs to the source if we also have an + // insert clause. See above at isDeleteWithDuplicateMatchesAndCdc definition for more details. + var sourceDF = Dataset.ofRows(spark, source) + .withColumn(SOURCE_ROW_PRESENT_COL, new Column(incrSourceRowCountExpr)) + var targetDF = Dataset.ofRows(spark, newTarget) + .withColumn(TARGET_ROW_PRESENT_COL, lit(true)) + if (isDeleteWithDuplicateMatchesAndCdc) { + targetDF = targetDF.withColumn(TARGET_ROW_ID_COL, monotonically_increasing_id()) + if (notMatchedClauses.nonEmpty) { // insert clause + sourceDF = sourceDF.withColumn(SOURCE_ROW_ID_COL, monotonically_increasing_id()) + } + } + val joinedDF = sourceDF.join(targetDF, new Column(condition), joinType) + val joinedPlan = joinedDF.queryExecution.analyzed + + def resolveOnJoinedPlan(exprs: Seq[Expression]): Seq[Expression] = { + tryResolveReferencesForExpressions(spark, exprs, joinedPlan) + } + + // ==== Generate the expressions to process full-outer join output and generate target rows ==== + // If there are N columns in the target table, there will be N + 3 columns after processing + // - N columns for target table + // - ROW_DROPPED_COL to define whether the generated row should dropped or written + // - INCR_ROW_COUNT_COL containing a UDF to update the output row row counter + // - CDC_TYPE_COLUMN_NAME containing the type of change being performed in a particular row + + // To generate these N + 3 columns, we will generate N + 3 expressions and apply them to the + // rows in the joinedDF. The CDC column will be either used for CDC generation or dropped before + // performing the final write, and the other two will always be dropped after executing the + // metrics UDF and filtering on ROW_DROPPED_COL. + + // We produce rows for both the main table data (with CDC_TYPE_COLUMN_NAME = CDC_TYPE_NOT_CDC), + // and rows for the CDC data which will be output to CDCReader.CDC_LOCATION. + // See [[CDCReader]] for general details on how partitioning on the CDC type column works. + + // In the following two functions `matchedClauseOutput` and `notMatchedClauseOutput`, we + // produce a Seq[Expression] for each intended output row. + // Depending on the clause and whether CDC is enabled, we output between 0 and 3 rows, as a + // Seq[Seq[Expression]] + + // There is one corner case outlined above at isDeleteWithDuplicateMatchesAndCdc definition. + // When we have a delete-ONLY merge with duplicate matches we have N + 4 columns: + // N target cols, TARGET_ROW_ID_COL, ROW_DROPPED_COL, INCR_ROW_COUNT_COL, CDC_TYPE_COLUMN_NAME + // When we have a delete-when-matched merge with duplicate matches + an insert clause, we have + // N + 5 columns: + // N target cols, TARGET_ROW_ID_COL, SOURCE_ROW_ID_COL, ROW_DROPPED_COL, INCR_ROW_COUNT_COL, + // CDC_TYPE_COLUMN_NAME + // These ROW_ID_COL will always be dropped before the final write. 
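+ // Illustrative shape: for a hypothetical two-column target (id, value), each joined row is
+ // processed into the N + 3 = 5 expressions
+ // (id, value, ROW_DROPPED_COL, INCR_ROW_COUNT_COL, CDC_TYPE_COLUMN_NAME).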
+ + if (isDeleteWithDuplicateMatchesAndCdc) { + targetOutputCols = targetOutputCols :+ UnresolvedAttribute(TARGET_ROW_ID_COL) + outputRowSchema = outputRowSchema.add(TARGET_ROW_ID_COL, DataTypes.LongType) + if (notMatchedClauses.nonEmpty) { // there is an insert clause, make SRC_ROW_ID_COL=null + targetOutputCols = targetOutputCols :+ Alias(Literal(null), SOURCE_ROW_ID_COL)() + outputRowSchema = outputRowSchema.add(SOURCE_ROW_ID_COL, DataTypes.LongType) + } + } + + if (cdcEnabled) { + outputRowSchema = outputRowSchema + .add(ROW_DROPPED_COL, DataTypes.BooleanType) + .add(INCR_ROW_COUNT_COL, DataTypes.BooleanType) + .add(CDC_TYPE_COLUMN_NAME, DataTypes.StringType) + } + + def matchedClauseOutput(clause: DeltaMergeIntoMatchedClause): Seq[Seq[Expression]] = { + val exprs = clause match { + case u: DeltaMergeIntoMatchedUpdateClause => + // Generate update expressions and set ROW_DELETED_COL = false and + // CDC_TYPE_COLUMN_NAME = CDC_TYPE_NOT_CDC + val mainDataOutput = u.resolvedActions.map(_.expr) :+ FalseLiteral :+ + incrUpdatedCountExpr :+ CDC_TYPE_NOT_CDC_LITERAL + if (cdcEnabled) { + // For update preimage, we have do a no-op copy with ROW_DELETED_COL = false and + // CDC_TYPE_COLUMN_NAME = CDC_TYPE_UPDATE_PREIMAGE and INCR_ROW_COUNT_COL as a no-op + // (because the metric will be incremented in `mainDataOutput`) + val preImageOutput = targetOutputCols :+ FalseLiteral :+ TrueLiteral :+ + Literal(CDC_TYPE_UPDATE_PREIMAGE) + // For update postimage, we have the same expressions as for mainDataOutput but with + // INCR_ROW_COUNT_COL as a no-op (because the metric will be incremented in + // `mainDataOutput`), and CDC_TYPE_COLUMN_NAME = CDC_TYPE_UPDATE_POSTIMAGE + val postImageOutput = mainDataOutput.dropRight(2) :+ TrueLiteral :+ + Literal(CDC_TYPE_UPDATE_POSTIMAGE) + Seq(mainDataOutput, preImageOutput, postImageOutput) + } else { + Seq(mainDataOutput) + } + case _: DeltaMergeIntoMatchedDeleteClause => + // Generate expressions to set the ROW_DELETED_COL = true and CDC_TYPE_COLUMN_NAME = + // CDC_TYPE_NOT_CDC + val mainDataOutput = targetOutputCols :+ TrueLiteral :+ incrDeletedCountExpr :+ + CDC_TYPE_NOT_CDC_LITERAL + if (cdcEnabled) { + // For delete we do a no-op copy with ROW_DELETED_COL = false, INCR_ROW_COUNT_COL as a + // no-op (because the metric will be incremented in `mainDataOutput`) and + // CDC_TYPE_COLUMN_NAME = CDC_TYPE_DELETE + val deleteCdcOutput = targetOutputCols :+ FalseLiteral :+ TrueLiteral :+ + Literal(CDC_TYPE_DELETE) + Seq(mainDataOutput, deleteCdcOutput) + } else { + Seq(mainDataOutput) + } + } + exprs.map(resolveOnJoinedPlan) + } + + def notMatchedClauseOutput(clause: DeltaMergeIntoNotMatchedClause): Seq[Seq[Expression]] = { + // Generate insert expressions and set ROW_DELETED_COL = false and + // CDC_TYPE_COLUMN_NAME = CDC_TYPE_NOT_CDC + val insertExprs = clause.resolvedActions.map(_.expr) + val mainDataOutput = resolveOnJoinedPlan( + if (isDeleteWithDuplicateMatchesAndCdc) { + // Must be delete-when-matched merge with duplicate matches + insert clause + // Therefore we must keep the target row id and source row id. Since this is a not-matched + // clause we know the target row-id will be null. See above at + // isDeleteWithDuplicateMatchesAndCdc definition for more details. 
+ insertExprs :+ + Alias(Literal(null), TARGET_ROW_ID_COL)() :+ UnresolvedAttribute(SOURCE_ROW_ID_COL) :+ + FalseLiteral :+ incrInsertedCountExpr :+ CDC_TYPE_NOT_CDC_LITERAL + } else { + insertExprs :+ FalseLiteral :+ incrInsertedCountExpr :+ CDC_TYPE_NOT_CDC_LITERAL + } + ) + if (cdcEnabled) { + // For insert we have the same expressions as for mainDataOutput, but with + // INCR_ROW_COUNT_COL as a no-op (because the metric will be incremented in + // `mainDataOutput`), and CDC_TYPE_COLUMN_NAME = CDC_TYPE_INSERT + val insertCdcOutput = mainDataOutput.dropRight(2) :+ TrueLiteral :+ Literal(CDC_TYPE_INSERT) + Seq(mainDataOutput, insertCdcOutput) + } else { + Seq(mainDataOutput) + } + } + + def clauseCondition(clause: DeltaMergeIntoClause): Expression = { + // if condition is None, then expression always evaluates to true + val condExpr = clause.condition.getOrElse(TrueLiteral) + resolveOnJoinedPlan(Seq(condExpr)).head + } + + val targetRowHasNoMatch = resolveOnJoinedPlan(Seq(col(SOURCE_ROW_PRESENT_COL).isNull.expr)).head + val sourceRowHasNoMatch = resolveOnJoinedPlan(Seq(col(TARGET_ROW_PRESENT_COL).isNull.expr)).head + val matchedConditions = matchedClauses.map(clauseCondition) + val matchedOutputs = matchedClauses.map(matchedClauseOutput) + val notMatchedConditions = notMatchedClauses.map(clauseCondition) + val notMatchedOutputs = notMatchedClauses.map(notMatchedClauseOutput) + // TODO support notMatchedBySourceClauses which is new in DBR 12.2 + // https://github.com/NVIDIA/spark-rapids/issues/8415 + val notMatchedBySourceConditions = Seq.empty + val notMatchedBySourceOutputs = Seq.empty + val noopCopyOutput = + resolveOnJoinedPlan(targetOutputCols :+ FalseLiteral :+ incrNoopCountExpr :+ + CDC_TYPE_NOT_CDC_LITERAL) + val deleteRowOutput = + resolveOnJoinedPlan(targetOutputCols :+ TrueLiteral :+ TrueLiteral :+ + CDC_TYPE_NOT_CDC_LITERAL) + var outputDF = addMergeJoinProcessor(spark, joinedPlan, outputRowSchema, + targetRowHasNoMatch = targetRowHasNoMatch, + sourceRowHasNoMatch = sourceRowHasNoMatch, + matchedConditions = matchedConditions, + matchedOutputs = matchedOutputs, + notMatchedConditions = notMatchedConditions, + notMatchedOutputs = notMatchedOutputs, + notMatchedBySourceConditions = notMatchedBySourceConditions, + notMatchedBySourceOutputs = notMatchedBySourceOutputs, + noopCopyOutput = noopCopyOutput, + deleteRowOutput = deleteRowOutput) + + if (isDeleteWithDuplicateMatchesAndCdc) { + // When we have a delete when matched clause with duplicate matches we have to remove + // duplicate CDC rows. This scenario is further explained at + // isDeleteWithDuplicateMatchesAndCdc definition. + + // To remove duplicate CDC rows generated by the duplicate matches we dedupe by + // TARGET_ROW_ID_COL since there should only be one CDC delete row per target row. + // When there is an insert clause in addition to the delete clause we additionally dedupe by + // SOURCE_ROW_ID_COL and CDC_TYPE_COLUMN_NAME to avoid dropping valid duplicate inserted rows + // and their corresponding CDC rows. 
+ val columnsToDedupeBy = if (notMatchedClauses.nonEmpty) { // insert clause + Seq(TARGET_ROW_ID_COL, SOURCE_ROW_ID_COL, CDC_TYPE_COLUMN_NAME) + } else { + Seq(TARGET_ROW_ID_COL) + } + outputDF = outputDF + .dropDuplicates(columnsToDedupeBy) + .drop(ROW_DROPPED_COL, INCR_ROW_COUNT_COL, TARGET_ROW_ID_COL, SOURCE_ROW_ID_COL) + } else { + outputDF = outputDF.drop(ROW_DROPPED_COL, INCR_ROW_COUNT_COL) + } + + logDebug("writeAllChanges: join output plan:\n" + outputDF.queryExecution) + + // Write to Delta + val newFiles = deltaTxn + .writeFiles(repartitionIfNeeded(spark, outputDF, deltaTxn.metadata.partitionColumns)) + + // Update metrics + val (addedBytes, addedPartitions) = totalBytesAndDistinctPartitionValues(newFiles) + metrics("numTargetFilesAdded") += newFiles.count(_.isInstanceOf[AddFile]) + metrics("numTargetChangeFilesAdded") += newFiles.count(_.isInstanceOf[AddCDCFile]) + metrics("numTargetChangeFileBytes") += newFiles.collect{ case f: AddCDCFile => f.size }.sum + metrics("numTargetBytesAdded") += addedBytes + metrics("numTargetPartitionsAddedTo") += addedPartitions + if (multipleMatchDeleteOnlyOvercount.isDefined) { + // Compensate for counting duplicates during the query. + val actualRowsDeleted = + metrics("numTargetRowsDeleted").value - multipleMatchDeleteOnlyOvercount.get + assert(actualRowsDeleted >= 0) + metrics("numTargetRowsDeleted").set(actualRowsDeleted) + } + + newFiles + } + + private def addMergeJoinProcessor( + spark: SparkSession, + joinedPlan: LogicalPlan, + outputRowSchema: StructType, + targetRowHasNoMatch: Expression, + sourceRowHasNoMatch: Expression, + matchedConditions: Seq[Expression], + matchedOutputs: Seq[Seq[Seq[Expression]]], + notMatchedConditions: Seq[Expression], + notMatchedOutputs: Seq[Seq[Seq[Expression]]], + notMatchedBySourceConditions: Seq[Expression], + notMatchedBySourceOutputs: Seq[Seq[Seq[Expression]]], + noopCopyOutput: Seq[Expression], + deleteRowOutput: Seq[Expression]): Dataset[Row] = { + def wrap(e: Expression): BaseExprMeta[Expression] = { + GpuOverrides.wrapExpr(e, rapidsConf, None) + } + + val targetRowHasNoMatchMeta = wrap(targetRowHasNoMatch) + val sourceRowHasNoMatchMeta = wrap(sourceRowHasNoMatch) + val matchedConditionsMetas = matchedConditions.map(wrap) + val matchedOutputsMetas = matchedOutputs.map(_.map(_.map(wrap))) + val notMatchedConditionsMetas = notMatchedConditions.map(wrap) + val notMatchedOutputsMetas = notMatchedOutputs.map(_.map(_.map(wrap))) + val notMatchedBySourceConditionsMetas = notMatchedBySourceConditions.map(wrap) + val notMatchedBySourceOutputsMetas = notMatchedBySourceOutputs.map(_.map(_.map(wrap))) + val noopCopyOutputMetas = noopCopyOutput.map(wrap) + val deleteRowOutputMetas = deleteRowOutput.map(wrap) + val allMetas = Seq(targetRowHasNoMatchMeta, sourceRowHasNoMatchMeta) ++ + matchedConditionsMetas ++ matchedOutputsMetas.flatten.flatten ++ + notMatchedConditionsMetas ++ notMatchedOutputsMetas.flatten.flatten ++ + notMatchedBySourceConditionsMetas ++ notMatchedBySourceOutputsMetas.flatten.flatten ++ + noopCopyOutputMetas ++ deleteRowOutputMetas + allMetas.foreach(_.tagForGpu()) + val canReplace = allMetas.forall(_.canExprTreeBeReplaced) && rapidsConf.isOperatorEnabled( + "spark.rapids.sql.exec.RapidsProcessDeltaMergeJoinExec", false, false) + if (rapidsConf.shouldExplainAll || (rapidsConf.shouldExplain && !canReplace)) { + val exprExplains = allMetas.map(_.explain(rapidsConf.shouldExplainAll)) + val execWorkInfo = if (canReplace) { + "will run on GPU" + } else { + "cannot run on GPU because not all merge 
processing expressions can be replaced" + } + logWarning(s" $execWorkInfo:\n" + + s" ${exprExplains.mkString(" ")}") + } + + if (canReplace) { + val processedJoinPlan = RapidsProcessDeltaMergeJoin( + joinedPlan, + toAttributes(outputRowSchema), + targetRowHasNoMatch = targetRowHasNoMatch, + sourceRowHasNoMatch = sourceRowHasNoMatch, + matchedConditions = matchedConditions, + matchedOutputs = matchedOutputs, + notMatchedConditions = notMatchedConditions, + notMatchedOutputs = notMatchedOutputs, + notMatchedBySourceConditions = notMatchedBySourceConditions, + notMatchedBySourceOutputs = notMatchedBySourceOutputs, + noopCopyOutput = noopCopyOutput, + deleteRowOutput = deleteRowOutput) + Dataset.ofRows(spark, processedJoinPlan) + } else { + val joinedRowEncoder = ExpressionEncoder(RowEncoder.encoderFor(joinedPlan.schema)) + val outputRowEncoder = ExpressionEncoder(RowEncoder.encoderFor(outputRowSchema)). + resolveAndBind() + + val processor = new JoinedRowProcessor( + targetRowHasNoMatch = targetRowHasNoMatch, + sourceRowHasNoMatch = sourceRowHasNoMatch, + matchedConditions = matchedConditions, + matchedOutputs = matchedOutputs, + notMatchedConditions = notMatchedConditions, + notMatchedOutputs = notMatchedOutputs, + noopCopyOutput = noopCopyOutput, + deleteRowOutput = deleteRowOutput, + joinedAttributes = joinedPlan.output, + joinedRowEncoder = joinedRowEncoder, + outputRowEncoder = outputRowEncoder) + + Dataset.ofRows(spark, joinedPlan).mapPartitions(processor.processPartition)(outputRowEncoder) + } + } + + /** + * Build a new logical plan using the given `files` that has the same output columns (exprIds) + * as the `target` logical plan, so that existing update/insert expressions can be applied + * on this new plan. + */ + private def buildTargetPlanWithFiles( + deltaTxn: OptimisticTransaction, + files: Seq[AddFile]): LogicalPlan = { + val targetOutputCols = getTargetOutputCols(deltaTxn) + val targetOutputColsMap = { + val colsMap: Map[String, NamedExpression] = targetOutputCols.view + .map(col => col.name -> col).toMap + if (conf.caseSensitiveAnalysis) { + colsMap + } else { + CaseInsensitiveMap(colsMap) + } + } + + val plan = { + // We have to do surgery to use the attributes from `targetOutputCols` to scan the table. + // In cases of schema evolution, they may not be the same type as the original attributes. + val original = + deltaTxn.deltaLog.createDataFrame(deltaTxn.snapshot, files).queryExecution.analyzed + val transformed = original.transform { + case LogicalRelation(base, _, catalogTbl, isStreaming) => + LogicalRelation( + base, + // We can ignore the new columns which aren't yet AttributeReferences. 
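+ // (Columns introduced by schema evolution have no attribute in the original target output, so
+ // getTargetOutputCols materializes them as Alias(Literal(null), name) rather than as an
+ // AttributeReference; only the existing attributes are valid in the relation output here.)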
+ targetOutputCols.collect { case a: AttributeReference => a }, + catalogTbl, + isStreaming) + } + + // In case of schema evolution & column mapping, we would also need to rebuild the file format + // because under column mapping, the reference schema within DeltaParquetFileFormat + // that is used to populate metadata needs to be updated + if (deltaTxn.metadata.columnMappingMode != NoMapping) { + val updatedFileFormat = deltaTxn.deltaLog.fileFormat( + deltaTxn.deltaLog.unsafeVolatileSnapshot.protocol, deltaTxn.metadata) + DeltaTableUtils.replaceFileFormat(transformed, updatedFileFormat) + } else { + transformed + } + } + + // For each plan output column, find the corresponding target output column (by name) and + // create an alias + val aliases = plan.output.map { + case newAttrib: AttributeReference => + val existingTargetAttrib = targetOutputColsMap.get(newAttrib.name) + .getOrElse { + throw new AnalysisException( + s"Could not find ${newAttrib.name} among the existing target output " + + targetOutputCols.mkString(",")) + }.asInstanceOf[AttributeReference] + + if (existingTargetAttrib.exprId == newAttrib.exprId) { + // It's not valid to alias an expression to its own exprId (this is considered a + // non-unique exprId by the analyzer), so we just use the attribute directly. + newAttrib + } else { + Alias(newAttrib, existingTargetAttrib.name)(exprId = existingTargetAttrib.exprId) + } + } + + Project(aliases, plan) + } + + /** Expressions to increment SQL metrics */ + private def makeMetricUpdateUDF(name: String, deterministic: Boolean = false): Expression = { + // only capture the needed metric in a local variable + val metric = metrics(name) + var u = udf(new GpuDeltaMetricUpdateUDF(metric)) + if (!deterministic) { + u = u.asNondeterministic() + } + u.apply().expr + } + + private def getTargetOutputCols(txn: OptimisticTransaction): Seq[NamedExpression] = { + txn.metadata.schema.map { col => + targetOutputAttributesMap + .get(col.name) + .map { a => + AttributeReference(col.name, col.dataType, col.nullable)(a.exprId) + } + .getOrElse(Alias(Literal(null), col.name)() + ) + } + } + + /** + * Repartitions the output DataFrame by the partition columns if table is partitioned + * and `merge.repartitionBeforeWrite.enabled` is set to true. + */ + protected def repartitionIfNeeded( + spark: SparkSession, + df: DataFrame, + partitionColumns: Seq[String]): DataFrame = { + if (partitionColumns.nonEmpty && spark.conf.get(DeltaSQLConf.MERGE_REPARTITION_BEFORE_WRITE)) { + df.repartition(partitionColumns.map(col): _*) + } else { + df + } + } + + /** + * Execute the given `thunk` and return its result while recording the time taken to do it. + * + * @param sqlMetricName name of SQL metric to update with the time taken by the thunk + * @param thunk the code to execute + */ + private def recordMergeOperation[A](sqlMetricName: String)(thunk: => A): A = { + val startTimeNs = System.nanoTime() + val r = thunk + val timeTakenMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTimeNs) + if (sqlMetricName != null && timeTakenMs > 0) { + metrics(sqlMetricName) += timeTakenMs + } + r + } +} + +object GpuMergeIntoCommand { + /** + * Spark UI will track all normal accumulators along with Spark tasks to show them on Web UI. + * However, the accumulator used by `MergeIntoCommand` can store a very large value since it + * tracks all files that need to be rewritten. We should ask Spark UI to not remember it, + * otherwise, the UI data may consume lots of memory. 
Hence, we use the prefix `internal.metrics.` + * to make this accumulator become an internal accumulator, so that it will not be tracked by + * Spark UI. + */ + val TOUCHED_FILES_ACCUM_NAME = "internal.metrics.MergeIntoDelta.touchedFiles" + + val ROW_ID_COL = "_row_id_" + val TARGET_ROW_ID_COL = "_target_row_id_" + val SOURCE_ROW_ID_COL = "_source_row_id_" + val FILE_NAME_COL = "_file_name_" + val SOURCE_ROW_PRESENT_COL = "_source_row_present_" + val TARGET_ROW_PRESENT_COL = "_target_row_present_" + val ROW_DROPPED_COL = GpuDeltaMergeConstants.ROW_DROPPED_COL + val INCR_ROW_COUNT_COL = "_incr_row_count_" + + // Some Delta versions use Literal(null) which translates to a literal of NullType instead + // of the Literal(null, StringType) which is needed, so using a fixed version here + // rather than the version from Delta Lake. + val CDC_TYPE_NOT_CDC_LITERAL = Literal(null, StringType) + + /** + * @param targetRowHasNoMatch whether a joined row is a target row with no match in the source + * table + * @param sourceRowHasNoMatch whether a joined row is a source row with no match in the target + * table + * @param matchedConditions condition for each match clause + * @param matchedOutputs corresponding output for each match clause. for each clause, we + * have 1-3 output rows, each of which is a sequence of expressions + * to apply to the joined row + * @param notMatchedConditions condition for each not-matched clause + * @param notMatchedOutputs corresponding output for each not-matched clause. for each clause, + * we have 1-2 output rows, each of which is a sequence of + * expressions to apply to the joined row + * @param noopCopyOutput no-op expression to copy a target row to the output + * @param deleteRowOutput expression to drop a row from the final output. this is used for + * source rows that don't match any not-matched clauses + * @param joinedAttributes schema of our outer-joined dataframe + * @param joinedRowEncoder joinedDF row encoder + * @param outputRowEncoder final output row encoder + */ + class JoinedRowProcessor( + targetRowHasNoMatch: Expression, + sourceRowHasNoMatch: Expression, + matchedConditions: Seq[Expression], + matchedOutputs: Seq[Seq[Seq[Expression]]], + notMatchedConditions: Seq[Expression], + notMatchedOutputs: Seq[Seq[Seq[Expression]]], + noopCopyOutput: Seq[Expression], + deleteRowOutput: Seq[Expression], + joinedAttributes: Seq[Attribute], + joinedRowEncoder: ExpressionEncoder[Row], + outputRowEncoder: ExpressionEncoder[Row]) extends Serializable { + + private def generateProjection(exprs: Seq[Expression]): UnsafeProjection = { + UnsafeProjection.create(exprs, joinedAttributes) + } + + private def generatePredicate(expr: Expression): BasePredicate = { + GeneratePredicate.generate(expr, joinedAttributes) + } + + def processPartition(rowIterator: Iterator[Row]): Iterator[Row] = { + + val targetRowHasNoMatchPred = generatePredicate(targetRowHasNoMatch) + val sourceRowHasNoMatchPred = generatePredicate(sourceRowHasNoMatch) + val matchedPreds = matchedConditions.map(generatePredicate) + val matchedProjs = matchedOutputs.map(_.map(generateProjection)) + val notMatchedPreds = notMatchedConditions.map(generatePredicate) + val notMatchedProjs = notMatchedOutputs.map(_.map(generateProjection)) + val noopCopyProj = generateProjection(noopCopyOutput) + val deleteRowProj = generateProjection(deleteRowOutput) + val outputProj = UnsafeProjection.create(outputRowEncoder.schema) + + // this is accessing ROW_DROPPED_COL. 
If ROW_DROPPED_COL is not in outputRowEncoder.schema + // then CDC must be disabled and it's the column after our output cols + def shouldDeleteRow(row: InternalRow): Boolean = { + row.getBoolean( + outputRowEncoder.schema.getFieldIndex(ROW_DROPPED_COL) + .getOrElse(outputRowEncoder.schema.fields.size) + ) + } + + def processRow(inputRow: InternalRow): Iterator[InternalRow] = { + if (targetRowHasNoMatchPred.eval(inputRow)) { + // Target row did not match any source row, so just copy it to the output + Iterator(noopCopyProj.apply(inputRow)) + } else { + // identify which set of clauses to execute: matched or not-matched ones + val (predicates, projections, noopAction) = if (sourceRowHasNoMatchPred.eval(inputRow)) { + // Source row did not match with any target row, so insert the new source row + (notMatchedPreds, notMatchedProjs, deleteRowProj) + } else { + // Source row matched with target row, so update the target row + (matchedPreds, matchedProjs, noopCopyProj) + } + + // find (predicate, projection) pair whose predicate satisfies inputRow + val pair = (predicates zip projections).find { + case (predicate, _) => predicate.eval(inputRow) + } + + pair match { + case Some((_, projections)) => + projections.map(_.apply(inputRow)).iterator + case None => Iterator(noopAction.apply(inputRow)) + } + } + } + + val toRow = joinedRowEncoder.createSerializer() + val fromRow = outputRowEncoder.createDeserializer() + rowIterator + .map(toRow) + .flatMap(processRow) + .filter(!shouldDeleteRow(_)) + .map { notDeletedInternalRow => + fromRow(outputProj(notDeletedInternalRow)) + } + } + } + + /** Count the number of distinct partition values among the AddFiles in the given set. */ + def totalBytesAndDistinctPartitionValues(files: Seq[FileAction]): (Long, Int) = { + val distinctValues = new mutable.HashSet[Map[String, String]]() + var bytes = 0L + val iter = files.collect { case a: AddFile => a }.iterator + while (iter.hasNext) { + val file = iter.next() + distinctValues += file.partitionValues + bytes += file.size + } + // If the only distinct value map is an empty map, then it must be an unpartitioned table. + // Return 0 in that case. + val numDistinctValues = + if (distinctValues.size == 1 && distinctValues.head.isEmpty) 0 else distinctValues.size + (bytes, numDistinctValues) + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransaction.scala b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransaction.scala new file mode 100644 index 00000000000..e06aba55487 --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransaction.scala @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * This file was derived from OptimisticTransaction.scala and TransactionalWrite.scala + * in the Delta Lake project at https://github.com/delta-io/delta. + * + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.databricks.sql.transaction.tahoe.rapids + +import java.net.URI + +import scala.collection.mutable.ListBuffer + +import com.databricks.sql.transaction.tahoe._ +import com.databricks.sql.transaction.tahoe.actions.{AddFile, FileAction} +import com.databricks.sql.transaction.tahoe.constraints.{Constraint, Constraints} +import com.databricks.sql.transaction.tahoe.schema.InvariantViolationException +import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf +import com.nvidia.spark.rapids._ +import com.nvidia.spark.rapids.delta._ +import org.apache.commons.lang3.exception.ExceptionUtils +import org.apache.hadoop.fs.Path + +import org.apache.spark.SparkException +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FileFormatWriter} +import org.apache.spark.sql.functions.to_json +import org.apache.spark.sql.rapids.{BasicColumnarWriteJobStatsTracker, ColumnarWriteJobStatsTracker, GpuFileFormatWriter, GpuWriteJobStatsTracker} +import org.apache.spark.sql.rapids.delta.GpuIdentityColumn +import org.apache.spark.sql.types._ +import org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.util.{Clock, SerializableConfiguration} + +/** + * Used to perform a set of reads in a transaction and then commit a set of updates to the + * state of the log. All reads from the DeltaLog, MUST go through this instance rather + * than directly to the DeltaLog otherwise they will not be check for logical conflicts + * with concurrent updates. + * + * This class is not thread-safe. + * + * @param deltaLog The Delta Log for the table this transaction is modifying. + * @param snapshot The snapshot that this transaction is reading at. + * @param rapidsConf RAPIDS Accelerator config settings. + */ +class GpuOptimisticTransaction( + deltaLog: DeltaLog, + snapshot: Snapshot, + rapidsConf: RapidsConf)(implicit clock: Clock) + extends GpuOptimisticTransactionBase(deltaLog, snapshot, rapidsConf)(clock) { + + /** Creates a new OptimisticTransaction. + * + * @param deltaLog The Delta Log for the table this transaction is modifying. 
+ * @param rapidsConf RAPIDS Accelerator config settings + */ + def this(deltaLog: DeltaLog, rapidsConf: RapidsConf)(implicit clock: Clock) = { + this(deltaLog, deltaLog.update(), rapidsConf) + } + + private def getGpuStatsColExpr( + statsDataSchema: Seq[Attribute], + statsCollection: GpuStatisticsCollection): Expression = { + Dataset.ofRows(spark, LocalRelation(statsDataSchema)) + .select(to_json(statsCollection.statsCollector)) + .queryExecution.analyzed.expressions.head + } + + /** Return the pair of optional stats tracker and stats collection class */ + private def getOptionalGpuStatsTrackerAndStatsCollection( + output: Seq[Attribute], + partitionSchema: StructType, data: DataFrame): ( + Option[GpuDeltaJobStatisticsTracker], + Option[GpuStatisticsCollection]) = { + if (spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_COLLECT_STATS)) { + + val (statsDataSchema, statsCollectionSchema) = getStatsSchema(output, partitionSchema) + + val indexedCols = DeltaConfigs.DATA_SKIPPING_NUM_INDEXED_COLS.fromMetaData(metadata) + val prefixLength = + spark.sessionState.conf.getConf(DeltaSQLConf.DATA_SKIPPING_STRING_PREFIX_LENGTH) + val tableSchema = { + // If collecting stats using the table schema, then pass in statsCollectionSchema. + // Otherwise pass in statsDataSchema to collect stats using the DataFrame schema. + if (spark.sessionState.conf.getConf(DeltaSQLConf + .DELTA_COLLECT_STATS_USING_TABLE_SCHEMA)) { + statsCollectionSchema.toStructType + } else { + statsDataSchema.toStructType + } + } + + val _spark = spark + val protocol = deltaLog.unsafeVolatileSnapshot.protocol + + val statsCollection = new GpuStatisticsCollection { + override val spark = _spark + override val deletionVectorsSupported = + protocol.isFeatureSupported(DeletionVectorsTableFeature) + override val tableDataSchema = tableSchema + override val dataSchema = statsDataSchema.toStructType + override val numIndexedCols = indexedCols + override val stringPrefixLength: Int = prefixLength + } + + val statsColExpr = getGpuStatsColExpr(statsDataSchema, statsCollection) + + val statsSchema = statsCollection.statCollectionSchema + val explodedDataSchema = statsCollection.explodedDataSchema + val batchStatsToRow = (batch: ColumnarBatch, row: InternalRow) => { + GpuStatisticsCollection.batchStatsToRow(statsSchema, explodedDataSchema, batch, row) + } + (Some(new GpuDeltaJobStatisticsTracker(statsDataSchema, statsColExpr, batchStatsToRow)), + Some(statsCollection)) + } else { + (None, None) + } + } + + override def writeFiles( + inputData: Dataset[_], + writeOptions: Option[DeltaOptions], + additionalConstraints: Seq[Constraint]): Seq[FileAction] = { + hasWritten = true + + val spark = inputData.sparkSession + val (data, partitionSchema) = performCDCPartition(inputData) + val outputPath = deltaLog.dataPath + + val (normalizedQueryExecution, output, generatedColumnConstraints, dataHighWaterMarks) = { + // TODO: is none ok to pass here? + normalizeData(deltaLog, None, data) + } + val highWaterMarks = trackHighWaterMarks.getOrElse(dataHighWaterMarks) + + // Build a new plan with a stub GpuDeltaWrite node to work around undesired transitions between + // columns and rows when AQE is involved. Without this node in the plan, AdaptiveSparkPlanExec + // could be the root node of the plan. In that case we do not have enough context to know + // whether the AdaptiveSparkPlanExec should be columnar or not, since the GPU overrides do not + // see how the parent is using the AdaptiveSparkPlanExec outputs. 
By using this stub node that + // appears to be a data writing node to AQE (it derives from V2CommandExec), the + // AdaptiveSparkPlanExec will be planned as a child of this new node. That provides enough + // context to plan the AQE sub-plan properly with respect to columnar and row transitions. + // We could force the AQE node to be columnar here by explicitly replacing the node, but that + // breaks the connection between the queryExecution and the node that will actually execute. + val gpuWritePlan = Dataset.ofRows(spark, RapidsDeltaWrite(normalizedQueryExecution.logical)) + val queryExecution = gpuWritePlan.queryExecution + + val partitioningColumns = getPartitioningColumns(partitionSchema, output) + + val committer = getCommitter(outputPath) + + // If Statistics Collection is enabled, then create a stats tracker that will be injected during + // the FileFormatWriter.write call below and will collect per-file stats using + // StatisticsCollection + val (optionalStatsTracker, _) = getOptionalGpuStatsTrackerAndStatsCollection(output, + partitionSchema, data) + + // schema should be normalized, therefore we can do an equality check + val (statsDataSchema, _) = getStatsSchema(output, partitionSchema) + val identityTracker = GpuIdentityColumn.createIdentityColumnStatsTracker( + spark, + statsDataSchema, + metadata.schema, + highWaterMarks) + + val constraints = + Constraints.getAll(metadata, spark) ++ generatedColumnConstraints ++ additionalConstraints + + val isOptimize = isOptimizeCommand(queryExecution.analyzed) + + SQLExecution.withNewExecutionId(queryExecution, Option("deltaTransactionalWrite")) { + val outputSpec = FileFormatWriter.OutputSpec( + outputPath.toString, + Map.empty, + output) + + // Remove any unnecessary row conversions added as part of Spark planning + val queryPhysicalPlan = queryExecution.executedPlan match { + case GpuColumnarToRowExec(child, _) => child + case p => p + } + val gpuRapidsWrite = queryPhysicalPlan match { + case g: GpuRapidsDeltaWriteExec => Some(g) + case _ => None + } + + val empty2NullPlan = convertEmptyToNullIfNeeded(queryPhysicalPlan, + partitioningColumns, constraints) + val optimizedPlan = + applyOptimizeWriteIfNeeded(spark, empty2NullPlan, partitionSchema, isOptimize) + val planWithInvariants = addInvariantChecks(optimizedPlan, constraints) + val physicalPlan = convertToGpu(planWithInvariants) + + val statsTrackers: ListBuffer[ColumnarWriteJobStatsTracker] = ListBuffer() + + val hadoopConf = spark.sessionState.newHadoopConfWithOptions( + metadata.configuration ++ deltaLog.options) + + if (spark.conf.get(DeltaSQLConf.DELTA_HISTORY_METRICS_ENABLED)) { + val serializableHadoopConf = new SerializableConfiguration(hadoopConf) + val basicWriteJobStatsTracker = new BasicColumnarWriteJobStatsTracker( + serializableHadoopConf, + BasicWriteJobStatsTracker.metrics) + registerSQLMetrics(spark, basicWriteJobStatsTracker.driverSideMetrics) + statsTrackers.append(basicWriteJobStatsTracker) + gpuRapidsWrite.foreach { grw => + val tracker = new GpuWriteJobStatsTracker(serializableHadoopConf, + grw.basicMetrics, grw.taskMetrics) + statsTrackers.append(tracker) + } + } + + // Retain only a minimal selection of Spark writer options to avoid any potential + // compatibility issues + val options = writeOptions match { + case None => Map.empty[String, String] + case Some(writeOptions) => + writeOptions.options.filterKeys { key => + key.equalsIgnoreCase(DeltaOptions.MAX_RECORDS_PER_FILE) || + key.equalsIgnoreCase(DeltaOptions.COMPRESSION) + }.toMap + } + val 
deltaFileFormat = deltaLog.fileFormat(deltaLog.unsafeVolatileSnapshot.protocol, metadata) + val gpuFileFormat = if (deltaFileFormat.getClass == classOf[DeltaParquetFileFormat]) { + new GpuParquetFileFormat + } else { + throw new IllegalStateException(s"file format $deltaFileFormat is not supported") + } + + try { + logDebug(s"Physical plan for write:\n$physicalPlan") + GpuFileFormatWriter.write( + sparkSession = spark, + plan = physicalPlan, + fileFormat = gpuFileFormat, + committer = committer, + outputSpec = outputSpec, + hadoopConf = hadoopConf, + partitionColumns = partitioningColumns, + bucketSpec = None, + statsTrackers = optionalStatsTracker.toSeq ++ identityTracker.toSeq ++ statsTrackers, + options = options, + rapidsConf.stableSort, + rapidsConf.concurrentWriterPartitionFlushSize) + } catch { + case s: SparkException => + // Pull an InvariantViolationException up to the top level if it was the root cause. + val violationException = ExceptionUtils.getRootCause(s) + if (violationException.isInstanceOf[InvariantViolationException]) { + throw violationException + } else { + throw s + } + } + } + + val resultFiles = committer.addedStatuses.map { a => + a.copy(stats = optionalStatsTracker.map( + _.recordedStats(new Path(new URI(a.path)).getName)).getOrElse(a.stats)) + }.filter { + // In some cases, we can write out an empty `inputData`. Some examples of this (though, they + // may be fixed in the future) are the MERGE command when you delete with empty source, or + // empty target, or on disjoint tables. This is hard to catch before the write without + // collecting the DF ahead of time. Instead, we can return only the AddFiles that + // a) actually add rows, or + // b) don't have any stats so we don't know the number of rows at all + case a: AddFile => a.numLogicalRecords.forall(_ > 0) + case _ => true + } + + identityTracker.foreach { tracker => + updatedIdentityHighWaterMarks.appendAll(tracker.highWaterMarks.toSeq) + } + val fileActions = resultFiles.toSeq ++ committer.changeFiles + + // Check if auto-compaction is enabled. + // (Auto compaction checks are derived from the work in + // https://github.com/delta-io/delta/pull/1156). + lazy val autoCompactEnabled = + spark.sessionState.conf + .getConf[String](DeltaSQLConf.DELTA_AUTO_COMPACT_ENABLED) + .getOrElse { + DeltaConfigs.AUTO_COMPACT.fromMetaData(metadata) + .getOrElse("false") + }.toBoolean + + if (!isOptimize && autoCompactEnabled && fileActions.nonEmpty) { + registerPostCommitHook(GpuDoAutoCompaction) + } + + fileActions + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala new file mode 100644 index 00000000000..40e2651505c --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimisticTransactionBase.scala @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * This file was derived from OptimisticTransaction.scala and TransactionalWrite.scala + * in the Delta Lake project at https://github.com/delta-io/delta. + * + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.databricks.sql.transaction.tahoe.rapids + +import com.databricks.sql.transaction.tahoe._ +import com.databricks.sql.transaction.tahoe.actions.FileAction +import com.databricks.sql.transaction.tahoe.constraints.{Constraint, DeltaInvariantCheckerExec} +import com.databricks.sql.transaction.tahoe.files.TahoeBatchFileIndex +import com.databricks.sql.transaction.tahoe.metering.DeltaLogging +import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf +import com.nvidia.spark.rapids._ + +import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, NamedExpression} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} +import org.apache.spark.sql.rapids.GpuShuffleEnv +import org.apache.spark.sql.rapids.GpuV1WriteUtils.GpuEmpty2Null +import org.apache.spark.sql.rapids.delta.{DeltaShufflePartitionsUtil, GpuOptimizeWriteExchangeExec, OptimizeWriteExchangeExec} +import org.apache.spark.sql.types.{StringType, StructType} +import org.apache.spark.util.Clock + +/** + * Used to perform a set of reads in a transaction and then commit a set of updates to the + * state of the log. All reads from the DeltaLog, MUST go through this instance rather + * than directly to the DeltaLog otherwise they will not be check for logical conflicts + * with concurrent updates. + * + * This class is not thread-safe. + * + * @param deltaLog The Delta Log for the table this transaction is modifying. + * @param snapshot The snapshot that this transaction is reading at. + * @param rapidsConf RAPIDS Accelerator config settings. 
+ */ +abstract class GpuOptimisticTransactionBase + (deltaLog: DeltaLog, snapshot: Snapshot, val rapidsConf: RapidsConf) + (implicit clock: Clock) + extends OptimisticTransaction(deltaLog, Option.empty[CatalogTable], snapshot) + with DeltaLogging { + + /** + * Adds checking of constraints on the table + * @param plan Plan to generate the table to check against constraints + * @param constraints Constraints to check on the table + * @return GPU columnar plan to execute + */ + protected def addInvariantChecks(plan: SparkPlan, constraints: Seq[Constraint]): SparkPlan = { + val cpuInvariants = + DeltaInvariantCheckerExec.buildInvariantChecks(plan.output, constraints, plan.session) + GpuCheckDeltaInvariant.maybeConvertToGpu(cpuInvariants, rapidsConf) match { + case Some(gpuInvariants) => + val gpuPlan = convertToGpu(plan) + GpuDeltaInvariantCheckerExec(gpuPlan, gpuInvariants) + case None => + val cpuPlan = convertToCpu(plan) + DeltaInvariantCheckerExec(cpuPlan, constraints) + } + } + + /** GPU version of convertEmptyToNullIfNeeded */ + private def gpuConvertEmptyToNullIfNeeded( + plan: GpuExec, + partCols: Seq[Attribute], + constraints: Seq[Constraint]): SparkPlan = { + if (!spark.conf.get(DeltaSQLConf.CONVERT_EMPTY_TO_NULL_FOR_STRING_PARTITION_COL)) { + return plan + } + // No need to convert if there are no constraints. The empty strings will be converted later by + // FileFormatWriter and FileFormatDataWriter. Note that we might still do unnecessary convert + // here as the constraints might not be related to the string partition columns. A precise + // check will need to walk the constraints to see if such columns are really involved. It + // doesn't seem to worth the effort. + if (constraints.isEmpty) return plan + + val partSet = AttributeSet(partCols) + var needConvert = false + val projectList: Seq[NamedExpression] = plan.output.map { + case p if partSet.contains(p) && p.dataType == StringType => + needConvert = true + GpuAlias(GpuEmpty2Null(p), p.name)() + case attr => attr + } + if (needConvert) GpuProjectExec(projectList.toList, plan) else plan + } + + /** + * If there is any string partition column and there are constraints defined, add a projection to + * convert empty string to null for that column. The empty strings will be converted to null + * eventually even without this convert, but we want to do this earlier before check constraints + * so that empty strings are correctly rejected. Note that this should not cause the downstream + * logic in `FileFormatWriter` to add duplicate conversions because the logic there checks the + * partition column using the original plan's output. When the plan is modified with additional + * projections, the partition column check won't match and will not add more conversion. + * + * @param plan The original SparkPlan. + * @param partCols The partition columns. + * @param constraints The defined constraints. + * @return A SparkPlan potentially modified with an additional projection on top of `plan` + */ + override def convertEmptyToNullIfNeeded( + plan: SparkPlan, + partCols: Seq[Attribute], + constraints: Seq[Constraint]): SparkPlan = { + // Reuse the CPU implementation if the plan ends up on the CPU, otherwise do the + // equivalent on the GPU. 
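+    // The GPU path (gpuConvertEmptyToNullIfNeeded above) wraps string partition columns in
+    // GpuEmpty2Null behind a GpuProjectExec, and only when constraints are present and the
+    // conversion is enabled; any non-GPU plan falls through to the Delta CPU implementation.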
+ plan match { + case g: GpuExec => gpuConvertEmptyToNullIfNeeded(g, partCols, constraints) + case _ => super.convertEmptyToNullIfNeeded(plan, partCols, constraints) + } + } + + override def writeFiles( + inputData: Dataset[_], + additionalConstraints: Seq[Constraint]): Seq[FileAction] = { + writeFiles(inputData, None, additionalConstraints) + } + + protected def applyOptimizeWriteIfNeeded( + spark: SparkSession, + physicalPlan: SparkPlan, + partitionSchema: StructType, + isOptimize: Boolean): SparkPlan = { + val optimizeWriteEnabled = !isOptimize && + spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_WRITE_ENABLED) + .orElse(DeltaConfigs.OPTIMIZE_WRITE.fromMetaData(metadata)).getOrElse(false) + if (optimizeWriteEnabled) { + val planWithoutTopRepartition = + DeltaShufflePartitionsUtil.removeTopRepartition(physicalPlan) + val partitioning = DeltaShufflePartitionsUtil.partitioningForRebalance( + physicalPlan.output, partitionSchema, spark.sessionState.conf.numShufflePartitions) + planWithoutTopRepartition match { + case p: GpuExec => + val partMeta = GpuOverrides.wrapPart(partitioning, rapidsConf, None) + partMeta.tagForGpu() + if (partMeta.canThisBeReplaced) { + val plan = GpuOptimizeWriteExchangeExec(partMeta.convertToGpu(), p) + if (GpuShuffleEnv.useGPUShuffle(rapidsConf)) { + GpuCoalesceBatches(plan, TargetSize(rapidsConf.gpuTargetBatchSizeBytes)) + } else { + GpuShuffleCoalesceExec(plan, rapidsConf.gpuTargetBatchSizeBytes) + } + } else { + GpuColumnarToRowExec(OptimizeWriteExchangeExec(partitioning, p)) + } + case p => + OptimizeWriteExchangeExec(partitioning, p) + } + } else { + physicalPlan + } + } + + protected def isOptimizeCommand(plan: LogicalPlan): Boolean = { + val leaves = plan.collectLeaves() + leaves.size == 1 && leaves.head.collect { + case LogicalRelation(HadoopFsRelation( + index: TahoeBatchFileIndex, _, _, _, _, _), _, _, _) => + index.actionType.equals("Optimize") + }.headOption.getOrElse(false) + } + + protected def convertToCpu(plan: SparkPlan): SparkPlan = plan match { + case GpuRowToColumnarExec(p, _) => p + case p: GpuExec => GpuColumnarToRowExec(p) + case p => p + } + + protected def convertToGpu(plan: SparkPlan): SparkPlan = plan match { + case GpuColumnarToRowExec(p, _) => p + case p: GpuExec => p + case p => GpuRowToColumnarExec(p, TargetSize(rapidsConf.gpuTargetBatchSizeBytes)) + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimizeExecutor.scala b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimizeExecutor.scala new file mode 100644 index 00000000000..479776b760b --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuOptimizeExecutor.scala @@ -0,0 +1,420 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * This file was derived from: + * 1. DoAutoCompaction.scala from PR#1156 at https://github.com/delta-io/delta/pull/1156, + * 2. OptimizeTableCommand.scala from the Delta Lake project at https://github.com/delta-io/delta. + * + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.databricks.sql.transaction.tahoe.rapids + +import java.util.ConcurrentModificationException + +import scala.annotation.tailrec +import scala.collection.mutable.ArrayBuffer + +import com.databricks.sql.io.skipping.MultiDimClustering +import com.databricks.sql.io.skipping.liquid.{ClusteredTableUtils, ClusteringColumnInfo} +import com.databricks.sql.transaction.tahoe._ +import com.databricks.sql.transaction.tahoe.DeltaOperations.Operation +import com.databricks.sql.transaction.tahoe.actions.{Action, AddFile, FileAction, RemoveFile} +import com.databricks.sql.transaction.tahoe.commands.DeltaCommand +import com.databricks.sql.transaction.tahoe.commands.optimize._ +import com.databricks.sql.transaction.tahoe.files.SQLMetricsReporting +import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf +import com.nvidia.spark.rapids.delta.RapidsDeltaSQLConf + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext.SPARK_JOB_GROUP_ID +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.catalog.CatalogTable +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.execution.metric.SQLMetric +import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric +import org.apache.spark.util.ThreadUtils + +class GpuOptimizeExecutor( + sparkSession: SparkSession, + txn: OptimisticTransaction, + partitionPredicate: Seq[Expression], + zOrderByColumns: Seq[String], + prevCommitActions: Seq[Action]) + extends DeltaCommand with SQLMetricsReporting with Serializable { + + /** Timestamp to use in [[FileAction]] */ + private val operationTimestamp = System.currentTimeMillis + + private val isMultiDimClustering = zOrderByColumns.nonEmpty + private val isAutoCompact = prevCommitActions.nonEmpty + private val optimizeType = GpuOptimizeType(isMultiDimClustering, isAutoCompact) + + def optimize(): Seq[Row] = { + recordDeltaOperation(txn.deltaLog, "delta.optimize") { + val maxFileSize = optimizeType.maxFileSize + require(maxFileSize > 0, "maxFileSize must be > 0") + + val minNumFilesInDir = optimizeType.minNumFiles + val (candidateFiles, filesToProcess) = optimizeType.targetFiles + val partitionSchema = txn.metadata.partitionSchema + + // select all files in case of multi-dimensional clustering + val partitionsToCompact = filesToProcess + .groupBy(_.partitionValues) + .filter { case (_, filesInPartition) => filesInPartition.size >= minNumFilesInDir } + .toSeq + + val groupedJobs = groupFilesIntoBins(partitionsToCompact, maxFileSize) + val jobs = optimizeType.targetBins(groupedJobs) + + val maxThreads = + sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MAX_THREADS) + val updates = ThreadUtils.parmap(jobs, "OptimizeJob", maxThreads) { partitionBinGroup => + runOptimizeBinJob(txn, partitionBinGroup._1, partitionBinGroup._2, maxFileSize) + }.flatten + + val addedFiles = updates.collect { case a: AddFile => a } + val removedFiles = updates.collect { case r: RemoveFile => r } + if (addedFiles.nonEmpty) { + val operation = DeltaOperations.Optimize(partitionPredicate, zOrderByColumns) + 
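+        // createMetrics (defined at the bottom of this class) captures min/percentile/max output
+        // file sizes plus the added/removed file and byte counts recorded in the commit history.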
val metrics = createMetrics(sparkSession.sparkContext, addedFiles, removedFiles) + commitAndRetry(txn, operation, updates, metrics) { newTxn => + val newPartitionSchema = newTxn.metadata.partitionSchema + val candidateSetOld = candidateFiles.map(_.path).toSet + val candidateSetNew = newTxn.filterFiles(partitionPredicate).map(_.path).toSet + + // As long as all of the files that we compacted are still part of the table, + // and the partitioning has not changed it is valid to continue to try + // and commit this checkpoint. + if (candidateSetOld.subsetOf(candidateSetNew) && partitionSchema == newPartitionSchema) { + true + } else { + val deleted = candidateSetOld -- candidateSetNew + logWarning(s"The following compacted files were delete " + + s"during checkpoint ${deleted.mkString(",")}. Aborting the compaction.") + false + } + } + } + + val optimizeStats = OptimizeStats() + optimizeStats.addedFilesSizeStats.merge(addedFiles) + optimizeStats.removedFilesSizeStats.merge(removedFiles) + optimizeStats.numPartitionsOptimized = jobs.map(j => j._1).distinct.size + optimizeStats.numBatches = jobs.size + optimizeStats.totalConsideredFiles = candidateFiles.size + optimizeStats.totalFilesSkipped = optimizeStats.totalConsideredFiles - removedFiles.size + optimizeStats.totalClusterParallelism = sparkSession.sparkContext.defaultParallelism + + if (isMultiDimClustering) { + val inputFileStats = + ZOrderFileStats(removedFiles.size, removedFiles.map(_.size.getOrElse(0L)).sum) + optimizeStats.zOrderStats = Some(ZOrderStats( + strategyName = "all", // means process all files in a partition + inputCubeFiles = ZOrderFileStats(0, 0), + inputOtherFiles = inputFileStats, + inputNumCubes = 0, + mergedFiles = inputFileStats, + // There will one z-cube for each partition + numOutputCubes = optimizeStats.numPartitionsOptimized)) + } + + return Seq(Row(txn.deltaLog.dataPath.toString, optimizeStats.toOptimizeMetrics)) + } + } + + /** + * Utility methods to group files into bins for optimize. + * + * @param partitionsToCompact List of files to compact group by partition. + * Partition is defined by the partition values (partCol -> partValue) + * @param maxTargetFileSize Max size (in bytes) of the compaction output file. + * @return Sequence of bins. Each bin contains one or more files from the same + * partition and targeted for one output file. + */ + private def groupFilesIntoBins( + partitionsToCompact: Seq[(Map[String, String], Seq[AddFile])], + maxTargetFileSize: Long): Seq[(Map[String, String], Seq[AddFile])] = { + + partitionsToCompact.flatMap { + case (partition, files) => + val bins = new ArrayBuffer[Seq[AddFile]]() + + val currentBin = new ArrayBuffer[AddFile]() + var currentBinSize = 0L + + files.sortBy(_.size).foreach { file => + // Generally, a bin is a group of existing files, whose total size does not exceed the + // desired maxFileSize. They will be coalesced into a single output file. + // However, if isMultiDimClustering = true, all files in a partition will be read by the + // same job, the data will be range-partitioned and numFiles = totalFileSize / maxFileSize + // will be produced. See below. 
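+          // For example (illustrative numbers): with maxTargetFileSize = 100 MB and four 40 MB
+          // files sorted ascending, the third file would push the running bin to 120 MB, so the
+          // first bin is flushed with two files (80 MB) and the remaining two files form a second
+          // 80 MB bin; both bins then survive the at-least-two-files filter below.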
+ if (file.size + currentBinSize > maxTargetFileSize && !isMultiDimClustering) { + bins += currentBin.toVector + currentBin.clear() + currentBin += file + currentBinSize = file.size + } else { + currentBin += file + currentBinSize += file.size + } + } + + if (currentBin.nonEmpty) { + bins += currentBin.toVector + } + + bins.map(b => (partition, b)) + // select bins that have at least two files or in case of multi-dim clustering + // select all bins + .filter(_._2.size > 1 || isMultiDimClustering) + } + } + + private val isClusteredTable = ClusteredTableUtils.isSupported(txn.snapshot.protocol) + + private val clusteringColumns: Seq[String] = { + if (zOrderByColumns.nonEmpty) { + zOrderByColumns + } else if (isClusteredTable) { + ClusteringColumnInfo.extractLogicalNames(txn.snapshot) + } else { + Nil + } + } + + /** + * Utility method to run a Spark job to compact the files in given bin + * + * @param txn [[OptimisticTransaction]] instance in use to commit the changes to DeltaLog. + * @param partition Partition values of the partition that files in [[bin]] belongs to. + * @param bin List of files to compact into one large file. + * @param maxFileSize Targeted output file size in bytes + */ + private def runOptimizeBinJob( + txn: OptimisticTransaction, + partition: Map[String, String], + bin: Seq[AddFile], + maxFileSize: Long): Seq[FileAction] = { + val baseTablePath = txn.deltaLog.dataPath + + val input = txn.deltaLog.createDataFrame(txn.snapshot, bin, actionTypeOpt = Some("Optimize")) + val repartitionDF = if (isMultiDimClustering) { + val totalSize = bin.map(_.size).sum + val approxNumFiles = Math.max(1, totalSize / maxFileSize).toInt + MultiDimClustering.cluster( + input, + approxNumFiles, + clusteringColumns, + "zorder") + } else { + val useRepartition = sparkSession.sessionState.conf.getConf( + DeltaSQLConf.DELTA_OPTIMIZE_REPARTITION_ENABLED) + if (useRepartition) { + input.repartition(numPartitions = 1) + } else { + input.coalesce(numPartitions = 1) + } + } + + val partitionDesc = partition.toSeq.map(entry => entry._1 + "=" + entry._2).mkString(",") + + val partitionName = if (partition.isEmpty) "" else s" in partition ($partitionDesc)" + val description = s"$baseTablePath
Optimizing ${bin.size} files" + partitionName + sparkSession.sparkContext.setJobGroup( + sparkSession.sparkContext.getLocalProperty(SPARK_JOB_GROUP_ID), + description) + + val addFiles = txn.writeFiles(repartitionDF).collect { + case a: AddFile => + a.copy(dataChange = false) + case other => + throw new IllegalStateException( + s"Unexpected action $other with type ${other.getClass}. File compaction job output" + + s"should only have AddFiles") + } + val removeFiles = bin.map(f => f.removeWithTimestamp(operationTimestamp, dataChange = false)) + val updates = addFiles ++ removeFiles + updates + } + + private type PartitionedBin = (Map[String, String], Seq[AddFile]) + + private trait GpuOptimizeType { + def minNumFiles: Long + + def maxFileSize: Long = + sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_OPTIMIZE_MAX_FILE_SIZE) + + def targetFiles: (Seq[AddFile], Seq[AddFile]) + + def targetBins(jobs: Seq[PartitionedBin]): Seq[PartitionedBin] = jobs + } + + private case class GpuCompaction() extends GpuOptimizeType { + def minNumFiles: Long = 2 + + def targetFiles: (Seq[AddFile], Seq[AddFile]) = { + val minFileSize = sparkSession.sessionState.conf.getConf( + DeltaSQLConf.DELTA_OPTIMIZE_MIN_FILE_SIZE) + require(minFileSize > 0, "minFileSize must be > 0") + val candidateFiles = txn.filterFiles(partitionPredicate) + val filesToProcess = candidateFiles.filter(_.size < minFileSize) + (candidateFiles, filesToProcess) + } + } + + private case class GpuMultiDimOrdering() extends GpuOptimizeType { + def minNumFiles: Long = 1 + + def targetFiles: (Seq[AddFile], Seq[AddFile]) = { + // select all files in case of multi-dimensional clustering + val candidateFiles = txn.filterFiles(partitionPredicate) + (candidateFiles, candidateFiles) + } + } + + private case class GpuAutoCompaction() extends GpuOptimizeType { + def minNumFiles: Long = { + val minNumFiles = + sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_AUTO_COMPACT_MIN_NUM_FILES) + require(minNumFiles > 0, "minNumFiles must be > 0") + minNumFiles + } + + override def maxFileSize: Long = + sparkSession.sessionState.conf.getConf(DeltaSQLConf.DELTA_AUTO_COMPACT_MAX_FILE_SIZE) + .getOrElse(128 * 1024 * 1024) + + override def targetFiles: (Seq[AddFile], Seq[AddFile]) = { + val autoCompactTarget = + sparkSession.sessionState.conf.getConf(RapidsDeltaSQLConf.AUTO_COMPACT_TARGET) + // Filter the candidate files according to autoCompact.target config. + lazy val addedFiles = prevCommitActions.collect { case a: AddFile => a } + val candidateFiles = autoCompactTarget match { + case "table" => + txn.filterFiles() + case "commit" => + addedFiles + case "partition" => + val eligiblePartitions = addedFiles.map(_.partitionValues).toSet + txn.filterFiles().filter(f => eligiblePartitions.contains(f.partitionValues)) + case _ => + logError(s"Invalid config for autoCompact.target: $autoCompactTarget. " + + s"Falling back to the default value 'table'.") + txn.filterFiles() + } + val filesToProcess = candidateFiles.filter(_.size < maxFileSize) + (candidateFiles, filesToProcess) + } + + override def targetBins(jobs: Seq[PartitionedBin]): Seq[PartitionedBin] = { + var acc = 0L + val maxCompactBytes = + sparkSession.sessionState.conf.getConf(RapidsDeltaSQLConf.AUTO_COMPACT_MAX_COMPACT_BYTES) + // bins with more files are prior to less files. 
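+      // i.e. bins holding more files are compacted first; the running byte total `acc` is
+      // updated as a side effect inside takeWhile, so the first bin that pushes it past
+      // maxCompactBytes is excluded along with every bin after it.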
+ jobs + .sortBy { case (_, filesInBin) => -filesInBin.length } + .takeWhile { case (_, filesInBin) => + acc += filesInBin.map(_.size).sum + acc <= maxCompactBytes + } + } + } + + private object GpuOptimizeType { + + def apply(isMultiDimClustering: Boolean, isAutoCompact: Boolean): GpuOptimizeType = { + if (isMultiDimClustering) { + GpuMultiDimOrdering() + } else if (isAutoCompact) { + GpuAutoCompaction() + } else { + GpuCompaction() + } + } + } + + /** + * Attempts to commit the given actions to the log. In the case of a concurrent update, + * the given function will be invoked with a new transaction to allow custom conflict + * detection logic to indicate it is safe to try again, by returning `true`. + * + * This function will continue to try to commit to the log as long as `f` returns `true`, + * otherwise throws a subclass of [[ConcurrentModificationException]]. + */ + @tailrec + private def commitAndRetry( + txn: OptimisticTransaction, + optimizeOperation: Operation, + actions: Seq[Action], + metrics: Map[String, SQLMetric])(f: OptimisticTransaction => Boolean) + : Unit = { + try { + txn.registerSQLMetrics(sparkSession, metrics) + txn.commit(actions, optimizeOperation) + } catch { + case e: ConcurrentModificationException => + val newTxn = txn.deltaLog.startTransaction(Option.empty[CatalogTable]) + if (f(newTxn)) { + logInfo("Retrying commit after checking for semantic conflicts with concurrent updates.") + commitAndRetry(newTxn, optimizeOperation, actions, metrics)(f) + } else { + logWarning("Semantic conflicts detected. Aborting operation.") + throw e + } + } + } + + /** Create a map of SQL metrics for adding to the commit history. */ + private def createMetrics( + sparkContext: SparkContext, + addedFiles: Seq[AddFile], + removedFiles: Seq[RemoveFile]): Map[String, SQLMetric] = { + + def setAndReturnMetric(description: String, value: Long) = { + val metric = createMetric(sparkContext, description) + metric.set(value) + metric + } + + def totalSize(actions: Seq[FileAction]): Long = { + var totalSize = 0L + actions.foreach { file => + val fileSize = file match { + case addFile: AddFile => addFile.size + case removeFile: RemoveFile => removeFile.size.getOrElse(0L) + case default => + throw new IllegalArgumentException(s"Unknown FileAction type: ${default.getClass}") + } + totalSize += fileSize + } + totalSize + } + + val sizeStats = FileSizeStatsWithHistogram.create(addedFiles.map(_.size).sorted) + Map[String, SQLMetric]( + "minFileSize" -> setAndReturnMetric("minimum file size", sizeStats.get.min), + "p25FileSize" -> setAndReturnMetric("25th percentile file size", sizeStats.get.p25), + "p50FileSize" -> setAndReturnMetric("50th percentile file size", sizeStats.get.p50), + "p75FileSize" -> setAndReturnMetric("75th percentile file size", sizeStats.get.p75), + "maxFileSize" -> setAndReturnMetric("maximum file size", sizeStats.get.max), + "numAddedFiles" -> setAndReturnMetric("total number of files added.", addedFiles.size), + "numRemovedFiles" -> setAndReturnMetric("total number of files removed.", removedFiles.size), + "numAddedBytes" -> setAndReturnMetric("total number of bytes added", totalSize(addedFiles)), + "numRemovedBytes" -> + setAndReturnMetric("total number of bytes removed", totalSize(removedFiles))) + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuUpdateCommand.scala b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuUpdateCommand.scala new file mode 100644 index 
00000000000..b158062cf60 --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/databricks/sql/transaction/tahoe/rapids/GpuUpdateCommand.scala @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * This file was derived from UpdateCommand.scala + * in the Delta Lake project at https://github.com/delta-io/delta. + * + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.databricks.sql.transaction.tahoe.rapids + +import com.databricks.sql.transaction.tahoe.{DeltaLog, DeltaOperations, DeltaTableUtils, DeltaUDF, OptimisticTransaction} +import com.databricks.sql.transaction.tahoe.DeltaCommitTag._ +import com.databricks.sql.transaction.tahoe.RowTracking +import com.databricks.sql.transaction.tahoe.actions.{AddCDCFile, AddFile, FileAction} +import com.databricks.sql.transaction.tahoe.commands.{DeltaCommand, DMLUtils, UpdateCommand, UpdateMetric} +import com.databricks.sql.transaction.tahoe.files.{TahoeBatchFileIndex, TahoeFileIndex} +import com.nvidia.spark.rapids.delta.GpuDeltaMetricUpdateUDF +import org.apache.hadoop.fs.Path + +import org.apache.spark.SparkContext +import org.apache.spark.sql.{Column, Dataset, Row, SparkSession} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, Literal} +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.execution.command.LeafRunnableCommand +import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} +import org.apache.spark.sql.execution.metric.SQLMetrics.{createMetric, createTimingMetric} +import org.apache.spark.sql.functions.input_file_name +import org.apache.spark.sql.types.LongType + +case class GpuUpdateCommand( + gpuDeltaLog: GpuDeltaLog, + tahoeFileIndex: TahoeFileIndex, + target: LogicalPlan, + updateExpressions: Seq[Expression], + condition: Option[Expression]) + extends LeafRunnableCommand with DeltaCommand { + + override val output: Seq[Attribute] = { + Seq(AttributeReference("num_affected_rows", LongType)()) + } + + override def innerChildren: Seq[QueryPlan[_]] = Seq(target) + + @transient private lazy val sc: SparkContext = SparkContext.getOrCreate() + + override lazy val metrics = Map[String, SQLMetric]( + "numAddedFiles" -> createMetric(sc, "number of files added."), + "numRemovedFiles" -> createMetric(sc, "number of files removed."), + "numUpdatedRows" -> createMetric(sc, "number of rows updated."), + "numCopiedRows" -> createMetric(sc, "number of rows copied."), + "executionTimeMs" -> + createTimingMetric(sc, "time taken to execute the entire operation"), + "scanTimeMs" -> + createTimingMetric(sc, "time taken to scan the files for matches"), + "rewriteTimeMs" -> + createTimingMetric(sc, "time taken to rewrite the matched files"), + "numAddedChangeFiles" -> createMetric(sc, "number of change data capture files generated"), + 
"changeFileBytes" -> createMetric(sc, "total size of change data capture files generated"), + "numTouchedRows" -> createMetric(sc, "number of rows touched (copied + updated)"), + "numDeletionVectorsAdded" -> createMetric(sc, "number of deletion vectors added."), + "numDeletionVectorsRemoved" -> createMetric(sc, "number of deletion vectors removed."), + "numDeletionVectorsUpdated" -> createMetric(sc, "number of deletion vectors updated.") + ) + + final override def run(sparkSession: SparkSession): Seq[Row] = { + recordDeltaOperation(tahoeFileIndex.deltaLog, "delta.dml.update") { + val deltaLog = tahoeFileIndex.deltaLog + gpuDeltaLog.withNewTransaction { txn => + DeltaLog.assertRemovable(txn.snapshot) + performUpdate(sparkSession, deltaLog, txn) + } + // Re-cache all cached plans(including this relation itself, if it's cached) that refer to + // this data source relation. + sparkSession.sharedState.cacheManager.recacheByPlan(sparkSession, target) + } + Seq(Row(metrics("numUpdatedRows").value)) + } + + private def performUpdate( + sparkSession: SparkSession, deltaLog: DeltaLog, txn: OptimisticTransaction): Unit = { + import com.databricks.sql.transaction.tahoe.implicits._ + + var numTouchedFiles: Long = 0 + var numRewrittenFiles: Long = 0 + var numAddedChangeFiles: Long = 0 + var changeFileBytes: Long = 0 + var scanTimeMs: Long = 0 + var rewriteTimeMs: Long = 0 + + val startTime = System.nanoTime() + val numFilesTotal = txn.snapshot.numOfFiles + + val updateCondition = condition.getOrElse(Literal.TrueLiteral) + val (metadataPredicates, dataPredicates) = + DeltaTableUtils.splitMetadataAndDataPredicates( + updateCondition, txn.metadata.partitionColumns, sparkSession) + val candidateFiles = txn.filterFiles(metadataPredicates ++ dataPredicates) + val nameToAddFile = generateCandidateFileMap(deltaLog.dataPath, candidateFiles) + + scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 + + val filesToRewrite: Seq[AddFile] = if (candidateFiles.isEmpty) { + // Case 1: Do nothing if no row qualifies the partition predicates + // that are part of Update condition + Nil + } else if (dataPredicates.isEmpty) { + // Case 2: Update all the rows from the files that are in the specified partitions + // when the data filter is empty + candidateFiles + } else { + // Case 3: Find all the affected files using the user-specified condition + val fileIndex = new TahoeBatchFileIndex( + sparkSession, "update", candidateFiles, deltaLog, tahoeFileIndex.path, txn.snapshot) + // Keep everything from the resolved target except a new TahoeFileIndex + // that only involves the affected files instead of all files. 
+ val newTarget = DeltaTableUtils.replaceFileIndex(target, fileIndex) + val data = Dataset.ofRows(sparkSession, newTarget) + val updatedRowCount = metrics("numUpdatedRows") + val updatedRowUdf = DeltaUDF.boolean { + new GpuDeltaMetricUpdateUDF(updatedRowCount) + }.asNondeterministic() + val pathsToRewrite = + withStatusCode("DELTA", UpdateCommand.FINDING_TOUCHED_FILES_MSG) { + data.filter(new Column(updateCondition)) + .select(input_file_name()) + .filter(updatedRowUdf()) + .distinct() + .as[String] + .collect() + } + + scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 + + pathsToRewrite.map(getTouchedFile(deltaLog.dataPath, _, nameToAddFile)).toSeq + } + + numTouchedFiles = filesToRewrite.length + + val newActions = if (filesToRewrite.isEmpty) { + // Do nothing if no row qualifies the UPDATE condition + Nil + } else { + // Generate the new files containing the updated values + withStatusCode("DELTA", UpdateCommand.rewritingFilesMsg(filesToRewrite.size)) { + rewriteFiles(sparkSession, txn, tahoeFileIndex.path, + filesToRewrite.map(_.path), nameToAddFile, updateCondition) + } + } + + rewriteTimeMs = (System.nanoTime() - startTime) / 1000 / 1000 - scanTimeMs + + val (changeActions, addActions) = newActions.partition(_.isInstanceOf[AddCDCFile]) + numRewrittenFiles = addActions.size + numAddedChangeFiles = changeActions.size + changeFileBytes = changeActions.collect { case f: AddCDCFile => f.size }.sum + + val totalActions = if (filesToRewrite.isEmpty) { + // Do nothing if no row qualifies the UPDATE condition + Nil + } else { + // Delete the old files and return those delete actions along with the new AddFile actions for + // files containing the updated values + val operationTimestamp = System.currentTimeMillis() + val deleteActions = filesToRewrite.map(_.removeWithTimestamp(operationTimestamp)) + + deleteActions ++ newActions + } + + if (totalActions.nonEmpty) { + metrics("numAddedFiles").set(numRewrittenFiles) + metrics("numAddedChangeFiles").set(numAddedChangeFiles) + metrics("changeFileBytes").set(changeFileBytes) + metrics("numRemovedFiles").set(numTouchedFiles) + metrics("executionTimeMs").set((System.nanoTime() - startTime) / 1000 / 1000) + metrics("scanTimeMs").set(scanTimeMs) + metrics("rewriteTimeMs").set(rewriteTimeMs) + // In the case where the numUpdatedRows is not captured, we can siphon out the metrics from + // the BasicWriteStatsTracker. This is for case 2 where the update condition contains only + // metadata predicates and so the entire partition is re-written. + val outputRows = txn.getMetric("numOutputRows").map(_.value).getOrElse(-1L) + if (metrics("numUpdatedRows").value == 0 && outputRows != 0 && + metrics("numCopiedRows").value == 0) { + // We know that numTouchedRows = numCopiedRows + numUpdatedRows. + // Since an entire partition was re-written, no rows were copied. + // So numTouchedRows == numUpdateRows + metrics("numUpdatedRows").set(metrics("numTouchedRows").value) + } else { + // This is for case 3 where the update condition contains both metadata and data predicates + // so relevant files will have some rows updated and some rows copied. We don't need to + // consider case 1 here, where no files match the update condition, as we know that + // `totalActions` is empty. 
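+        // For example (illustrative numbers): if the rewrite touched 1,000 rows and 300 of them
+        // matched the UPDATE condition, numCopiedRows is set to 1,000 - 300 = 700.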
+ metrics("numCopiedRows").set( + metrics("numTouchedRows").value - metrics("numUpdatedRows").value) + } + metrics("numDeletionVectorsAdded").set(0) + metrics("numDeletionVectorsRemoved").set(0) + metrics("numDeletionVectorsUpdated").set(0) + txn.registerSQLMetrics(sparkSession, metrics) + val tags = DMLUtils.TaggedCommitData.EMPTY + .withTag(PreservedRowTrackingTag, RowTracking.isEnabled(txn.protocol, txn.metadata)) + .withTag(NoRowsCopiedTag, metrics("numCopiedRows").value == 0) + txn.commitIfNeeded(totalActions, DeltaOperations.Update(condition), tags.stringTags) + // This is needed to make the SQL metrics visible in the Spark UI + val executionId = sparkSession.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) + SQLMetrics.postDriverMetricUpdates( + sparkSession.sparkContext, executionId, metrics.values.toSeq) + } + + recordDeltaEvent( + deltaLog, + "delta.dml.update.stats", + data = UpdateMetric( + condition = condition.map(_.sql).getOrElse("true"), + numFilesTotal, + numTouchedFiles, + numRewrittenFiles, + numAddedChangeFiles, + changeFileBytes, + scanTimeMs, + rewriteTimeMs, + // We don't support deletion vectors + numDeletionVectorsAdded = 0, + numDeletionVectorsRemoved = 0, + numDeletionVectorsUpdated = 0) + ) + } + + /** + * Scan all the affected files and write out the updated files. + * + * When CDF is enabled, includes the generation of CDC preimage and postimage columns for + * changed rows. + * + * @return the list of [[AddFile]]s and [[AddCDCFile]]s that have been written. + */ + private def rewriteFiles( + spark: SparkSession, + txn: OptimisticTransaction, + rootPath: Path, + inputLeafFiles: Seq[String], + nameToAddFileMap: Map[String, AddFile], + condition: Expression): Seq[FileAction] = { + // Containing the map from the relative file path to AddFile + val baseRelation = buildBaseRelation( + spark, txn, "update", rootPath, inputLeafFiles, nameToAddFileMap) + val newTarget = DeltaTableUtils.replaceFileIndex(target, baseRelation.location) + val targetDf = Dataset.ofRows(spark, newTarget) + + // Number of total rows that we have seen, i.e. are either copying or updating (sum of both). + // This will be used later, along with numUpdatedRows, to determine numCopiedRows. + val numTouchedRows = metrics("numTouchedRows") + val numTouchedRowsUdf = DeltaUDF.boolean { + new GpuDeltaMetricUpdateUDF(numTouchedRows) + }.asNondeterministic() + + val updatedDataFrame = UpdateCommand.withUpdatedColumns( + target.output, + updateExpressions, + condition, + targetDf + .filter(numTouchedRowsUdf()) + .withColumn(UpdateCommand.CONDITION_COLUMN_NAME, new Column(condition)), + UpdateCommand.shouldOutputCdc(txn)) + + txn.writeFiles(updatedDataFrame) + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/DeltaProbe.scala b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/DeltaProbe.scala new file mode 100644 index 00000000000..aa38d0460d1 --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/DeltaProbe.scala @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.nvidia.spark.rapids.delta + +/** + * Implements the Delta Probe interface for probing the Delta Lake provider on Databricks. + * @note This is instantiated via reflection from ShimLoader. + */ +class DeltaProbeImpl extends DeltaProbe { + // Delta Lake is built-in for Databricks instances, so no probing is necessary. + override def getDeltaProvider: DeltaProvider = DeltaSpark341DBProvider +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/DeltaSpark341DBProvider.scala b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/DeltaSpark341DBProvider.scala new file mode 100644 index 00000000000..48673eaf167 --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/DeltaSpark341DBProvider.scala @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark.rapids.delta + +import com.databricks.sql.transaction.tahoe.rapids.GpuDeltaCatalog +import com.nvidia.spark.rapids.{AtomicCreateTableAsSelectExecMeta, AtomicReplaceTableAsSelectExecMeta, GpuExec} + +import org.apache.spark.sql.execution.datasources.v2.{AtomicCreateTableAsSelectExec, AtomicReplaceTableAsSelectExec} +import org.apache.spark.sql.execution.datasources.v2.rapids.{GpuAtomicCreateTableAsSelectExec, GpuAtomicReplaceTableAsSelectExec} + +object DeltaSpark341DBProvider extends DatabricksDeltaProviderBase { + + override def convertToGpu( + cpuExec: AtomicCreateTableAsSelectExec, + meta: AtomicCreateTableAsSelectExecMeta): GpuExec = { + GpuAtomicCreateTableAsSelectExec( + cpuExec.output, + new GpuDeltaCatalog(cpuExec.catalog, meta.conf), + cpuExec.ident, + cpuExec.partitioning, + cpuExec.query, + cpuExec.tableSpec, + cpuExec.writeOptions, + cpuExec.ifNotExists) + } + + override def convertToGpu( + cpuExec: AtomicReplaceTableAsSelectExec, + meta: AtomicReplaceTableAsSelectExecMeta): GpuExec = { + GpuAtomicReplaceTableAsSelectExec( + cpuExec.output, + new GpuDeltaCatalog(cpuExec.catalog, meta.conf), + cpuExec.ident, + cpuExec.partitioning, + cpuExec.query, + cpuExec.tableSpec, + cpuExec.writeOptions, + cpuExec.orCreate, + cpuExec.invalidateCache) + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala new file mode 100644 index 00000000000..abaece5feb3 --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark.rapids.delta + +import java.net.URI + +import com.databricks.sql.transaction.tahoe.{DeltaColumnMappingMode, DeltaParquetFileFormat, IdMapping} +import com.databricks.sql.transaction.tahoe.DeltaParquetFileFormat.{DeletionVectorDescriptorWithFilterType, IS_ROW_DELETED_COLUMN_NAME} +import com.nvidia.spark.rapids.{GpuMetric, RapidsConf, SparkPlanMeta} +import com.nvidia.spark.rapids.delta.GpuDeltaParquetFileFormatUtils.addMetadataColumnToIterator +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.Path + +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.FileSourceScanExec +import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.sources.Filter +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.vectorized.ColumnarBatch + +case class GpuDeltaParquetFileFormat( + override val columnMappingMode: DeltaColumnMappingMode, + override val referenceSchema: StructType, + isSplittable: Boolean, + disablePushDown: Boolean, + broadcastDvMap: Option[Broadcast[Map[URI, DeletionVectorDescriptorWithFilterType]]] +) extends GpuDeltaParquetFileFormatBase { + + if (columnMappingMode == IdMapping) { + val requiredReadConf = SQLConf.PARQUET_FIELD_ID_READ_ENABLED + require(SparkSession.getActiveSession.exists(_.sessionState.conf.getConf(requiredReadConf)), + s"${requiredReadConf.key} must be enabled to support Delta id column mapping mode") + val requiredWriteConf = SQLConf.PARQUET_FIELD_ID_WRITE_ENABLED + require(SparkSession.getActiveSession.exists(_.sessionState.conf.getConf(requiredWriteConf)), + s"${requiredWriteConf.key} must be enabled to support Delta id column mapping mode") + } + + override def isSplitable( + sparkSession: SparkSession, + options: Map[String, String], + path: Path): Boolean = isSplittable + + override def buildReaderWithPartitionValuesAndMetrics( + sparkSession: SparkSession, + dataSchema: StructType, + partitionSchema: StructType, + requiredSchema: StructType, + filters: Seq[Filter], + options: Map[String, String], + hadoopConf: Configuration, + metrics: Map[String, GpuMetric], + alluxioPathReplacementMap: Option[Map[String, String]]) + : PartitionedFile => Iterator[InternalRow] = { + + val dataReader = super.buildReaderWithPartitionValuesAndMetrics( + sparkSession, + dataSchema, + partitionSchema, + requiredSchema, + filters, + options, + hadoopConf, + metrics, + alluxioPathReplacementMap) + + val delVecs = broadcastDvMap + val maxDelVecScatterBatchSize = RapidsConf + .DELTA_LOW_SHUFFLE_MERGE_SCATTER_DEL_VECTOR_BATCH_SIZE + .get(sparkSession.sessionState.conf) + + val delVecScatterTimeMetric = metrics(GpuMetric.DELETION_VECTOR_SCATTER_TIME) + val delVecSizeMetric = metrics(GpuMetric.DELETION_VECTOR_SIZE) + + (file: PartitionedFile) => { + val input = dataReader(file) + val dv = delVecs.flatMap(_.value.get(new URI(file.filePath.toString()))) + .map { dv => + delVecSizeMetric += dv.descriptor.inlineData.length + RoaringBitmapWrapper.deserializeFromBytes(dv.descriptor.inlineData).inner + } + addMetadataColumnToIterator(prepareSchema(requiredSchema), + dv, + input.asInstanceOf[Iterator[ColumnarBatch]], + maxDelVecScatterBatchSize, + delVecScatterTimeMetric + ).asInstanceOf[Iterator[InternalRow]] + } + } +} + +object GpuDeltaParquetFileFormat { + def tagSupportForGpuFileSourceScan(meta: 
SparkPlanMeta[FileSourceScanExec]): Unit = { + val format = meta.wrapped.relation.fileFormat.asInstanceOf[DeltaParquetFileFormat] + val requiredSchema = meta.wrapped.requiredSchema + if (requiredSchema.exists(_.name == IS_ROW_DELETED_COLUMN_NAME)) { + meta.willNotWorkOnGpu( + s"reading metadata column $IS_ROW_DELETED_COLUMN_NAME is not supported") + } + if (format.hasDeletionVectorMap) { + meta.willNotWorkOnGpu("deletion vectors are not supported") + } + } + + def convertToGpu(fmt: DeltaParquetFileFormat): GpuDeltaParquetFileFormat = { + GpuDeltaParquetFileFormat(fmt.columnMappingMode, fmt.referenceSchema, fmt.isSplittable, + fmt.disablePushDowns, fmt.broadcastDvMap) + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/DeleteCommandMetaShim.scala b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/DeleteCommandMetaShim.scala new file mode 100644 index 00000000000..6cff03517dc --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/DeleteCommandMetaShim.scala @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.delta.shims + +import com.databricks.sql.transaction.tahoe.commands.DeletionVectorUtils +import com.databricks.sql.transaction.tahoe.sources.DeltaSQLConf +import com.nvidia.spark.rapids.delta.{DeleteCommandEdgeMeta, DeleteCommandMeta} + +object DeleteCommandMetaShim { + def tagForGpu(meta: DeleteCommandMeta): Unit = { + val dvFeatureEnabled = DeletionVectorUtils.deletionVectorsWritable( + meta.deleteCmd.deltaLog.unsafeVolatileSnapshot) + if (dvFeatureEnabled && meta.deleteCmd.conf.getConf( + DeltaSQLConf.DELETE_USE_PERSISTENT_DELETION_VECTORS)) { + // https://github.com/NVIDIA/spark-rapids/issues/8654 + meta.willNotWorkOnGpu("Deletion vector writes are not supported on GPU") + } + } + + def tagForGpu(meta: DeleteCommandEdgeMeta): Unit = { + val dvFeatureEnabled = DeletionVectorUtils.deletionVectorsWritable( + meta.deleteCmd.deltaLog.unsafeVolatileSnapshot) + if (dvFeatureEnabled && meta.deleteCmd.conf.getConf( + DeltaSQLConf.DELETE_USE_PERSISTENT_DELETION_VECTORS)) { + // https://github.com/NVIDIA/spark-rapids/issues/8654 + meta.willNotWorkOnGpu("Deletion vector writes are not supported on GPU") + } + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/DeltaLogShim.scala b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/DeltaLogShim.scala new file mode 100644 index 00000000000..6b35e7c3e45 --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/DeltaLogShim.scala @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.delta.shims + +import com.databricks.sql.transaction.tahoe.DeltaLog +import com.databricks.sql.transaction.tahoe.actions.Metadata + +import org.apache.spark.sql.execution.datasources.FileFormat + +object DeltaLogShim { + def fileFormat(deltaLog: DeltaLog): FileFormat = { + deltaLog.fileFormat(deltaLog.unsafeVolatileSnapshot.protocol, + deltaLog.unsafeVolatileSnapshot.metadata) + } + def getMetadata(deltaLog: DeltaLog): Metadata = { + deltaLog.unsafeVolatileSnapshot.metadata + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/InvariantViolationExceptionShim.scala b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/InvariantViolationExceptionShim.scala new file mode 100644 index 00000000000..31881d2e375 --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/InvariantViolationExceptionShim.scala @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.nvidia.spark.rapids.delta.shims + +import com.databricks.sql.transaction.tahoe.constraints.Constraints._ +import com.databricks.sql.transaction.tahoe.schema.DeltaInvariantViolationException +import com.databricks.sql.transaction.tahoe.schema.InvariantViolationException + +object InvariantViolationExceptionShim { + def apply(c: Check, m: Map[String, Any]): InvariantViolationException = { + DeltaInvariantViolationException(c, m) + } + + def apply(c: NotNull): InvariantViolationException = { + DeltaInvariantViolationException(c) + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/MergeIntoCommandMetaShim.scala b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/MergeIntoCommandMetaShim.scala new file mode 100644 index 00000000000..ebe801e66e9 --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/MergeIntoCommandMetaShim.scala @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.delta.shims + +import com.databricks.sql.transaction.tahoe.commands.{MergeIntoCommand, MergeIntoCommandEdge} +import com.databricks.sql.transaction.tahoe.rapids.{GpuDeltaLog, GpuLowShuffleMergeCommand, GpuMergeIntoCommand} +import com.nvidia.spark.rapids.{RapidsConf, RapidsReaderType} +import com.nvidia.spark.rapids.delta.{MergeIntoCommandEdgeMeta, MergeIntoCommandMeta} + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.execution.command.RunnableCommand + +object MergeIntoCommandMetaShim extends Logging { + def tagForGpu(meta: MergeIntoCommandMeta, mergeCmd: MergeIntoCommand): Unit = { + // see https://github.com/NVIDIA/spark-rapids/issues/8415 for more information + if (mergeCmd.notMatchedBySourceClauses.nonEmpty) { + meta.willNotWorkOnGpu("notMatchedBySourceClauses not supported on GPU") + } + } + + def tagForGpu(meta: MergeIntoCommandEdgeMeta, mergeCmd: MergeIntoCommandEdge): Unit = { + // see https://github.com/NVIDIA/spark-rapids/issues/8415 for more information + if (mergeCmd.notMatchedBySourceClauses.nonEmpty) { + meta.willNotWorkOnGpu("notMatchedBySourceClauses not supported on GPU") + } + } + + def convertToGpu(mergeCmd: MergeIntoCommand, conf: RapidsConf): RunnableCommand = { + // TODO: Currently we only support low shuffler merge only when parquet per file read is enabled + // due to the limitation of implementing row index metadata column. + if (conf.isDeltaLowShuffleMergeEnabled) { + if (conf.isParquetPerFileReadEnabled) { + GpuLowShuffleMergeCommand( + mergeCmd.source, + mergeCmd.target, + new GpuDeltaLog(mergeCmd.targetFileIndex.deltaLog, conf), + mergeCmd.condition, + mergeCmd.matchedClauses, + mergeCmd.notMatchedClauses, + mergeCmd.notMatchedBySourceClauses, + mergeCmd.migratedSchema)(conf) + } else { + logWarning(s"""Low shuffle merge disabled since ${RapidsConf.PARQUET_READER_TYPE} is + not set to ${RapidsReaderType.PERFILE}. Falling back to classic merge.""") + GpuMergeIntoCommand( + mergeCmd.source, + mergeCmd.target, + new GpuDeltaLog(mergeCmd.targetFileIndex.deltaLog, conf), + mergeCmd.condition, + mergeCmd.matchedClauses, + mergeCmd.notMatchedClauses, + mergeCmd.notMatchedBySourceClauses, + mergeCmd.migratedSchema)(conf) + } + } else { + GpuMergeIntoCommand( + mergeCmd.source, + mergeCmd.target, + new GpuDeltaLog(mergeCmd.targetFileIndex.deltaLog, conf), + mergeCmd.condition, + mergeCmd.matchedClauses, + mergeCmd.notMatchedClauses, + mergeCmd.notMatchedBySourceClauses, + mergeCmd.migratedSchema)(conf) + } + } + + def convertToGpu(mergeCmd: MergeIntoCommandEdge, conf: RapidsConf): RunnableCommand = { + // TODO: Currently we only support low shuffler merge only when parquet per file read is enabled + // due to the limitation of implementing row index metadata column. + if (conf.isDeltaLowShuffleMergeEnabled) { + if (conf.isParquetPerFileReadEnabled) { + GpuLowShuffleMergeCommand( + mergeCmd.source, + mergeCmd.target, + new GpuDeltaLog(mergeCmd.targetFileIndex.deltaLog, conf), + mergeCmd.condition, + mergeCmd.matchedClauses, + mergeCmd.notMatchedClauses, + mergeCmd.notMatchedBySourceClauses, + mergeCmd.migratedSchema)(conf) + } else { + logWarning(s"""Low shuffle merge is still disable since ${RapidsConf.PARQUET_READER_TYPE} is + not set to ${RapidsReaderType.PERFILE}. 
Falling back to classic merge.""") + GpuMergeIntoCommand( + mergeCmd.source, + mergeCmd.target, + new GpuDeltaLog(mergeCmd.targetFileIndex.deltaLog, conf), + mergeCmd.condition, + mergeCmd.matchedClauses, + mergeCmd.notMatchedClauses, + mergeCmd.notMatchedBySourceClauses, + mergeCmd.migratedSchema)(conf) + } + } else { + GpuMergeIntoCommand( + mergeCmd.source, + mergeCmd.target, + new GpuDeltaLog(mergeCmd.targetFileIndex.deltaLog, conf), + mergeCmd.condition, + mergeCmd.matchedClauses, + mergeCmd.notMatchedClauses, + mergeCmd.notMatchedBySourceClauses, + mergeCmd.migratedSchema)(conf) + } + } +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/MetadataShims.scala b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/MetadataShims.scala new file mode 100644 index 00000000000..e717df94d89 --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/MetadataShims.scala @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.nvidia.spark.rapids.delta.shims + +import com.databricks.sql.transaction.tahoe.stats.DeltaStatistics + +trait ShimUsesMetadataFields { + val NUM_RECORDS = DeltaStatistics.NUM_RECORDS + val MIN = DeltaStatistics.MIN + val MAX = DeltaStatistics.MAX + val NULL_COUNT = DeltaStatistics.NULL_COUNT +} diff --git a/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/ShimDeltaUDF.scala b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/ShimDeltaUDF.scala new file mode 100644 index 00000000000..52a76ab907a --- /dev/null +++ b/delta-lake/delta-spark350db143/src/main/scala/com/nvidia/spark/rapids/delta/shims/ShimDeltaUDF.scala @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark.rapids.delta.shims + +import com.databricks.sql.transaction.tahoe.DeltaUDF + +import org.apache.spark.sql.expressions.UserDefinedFunction + +object ShimDeltaUDF { + def stringStringUdf(f: String => String): UserDefinedFunction = DeltaUDF.stringFromString(f) +} diff --git a/delta-lake/delta-stub/pom.xml b/delta-lake/delta-stub/pom.xml index 5e855365fa9..deb2bb7233d 100644 --- a/delta-lake/delta-stub/pom.xml +++ b/delta-lake/delta-stub/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.1 + 24.12.0 ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.12 RAPIDS Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-stub diff --git a/dist/pom.xml b/dist/pom.xml index fb2024262ca..f7d69b761f2 100644 --- a/dist/pom.xml +++ b/dist/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.12 - 24.10.1 + 24.12.0 ../jdk-profiles/pom.xml rapids-4-spark_2.12 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 24.10.1 + 24.12.0 com.nvidia diff --git a/docs/additional-functionality/advanced_configs.md b/docs/additional-functionality/advanced_configs.md index 8f030b1aa29..07346a5b850 100644 --- a/docs/additional-functionality/advanced_configs.md +++ b/docs/additional-functionality/advanced_configs.md @@ -95,8 +95,8 @@ Name | Description | Default Value | Applicable at spark.rapids.sql.format.hive.text.write.enabled|When set to false disables Hive text table write acceleration|false|Runtime spark.rapids.sql.format.iceberg.enabled|When set to false disables all Iceberg acceleration|true|Runtime spark.rapids.sql.format.iceberg.read.enabled|When set to false disables Iceberg input acceleration|true|Runtime -spark.rapids.sql.format.json.enabled|When set to true enables all json input and output acceleration. (only input is currently supported anyways)|false|Runtime -spark.rapids.sql.format.json.read.enabled|When set to true enables json input acceleration|false|Runtime +spark.rapids.sql.format.json.enabled|When set to true enables all json input and output acceleration. (only input is currently supported anyways)|true|Runtime +spark.rapids.sql.format.json.read.enabled|When set to true enables json input acceleration|true|Runtime spark.rapids.sql.format.orc.enabled|When set to false disables all orc input and output acceleration|true|Runtime spark.rapids.sql.format.orc.floatTypesToString.enable|When reading an ORC file, the source data schemas(schemas of ORC file) may differ from the target schemas (schemas of the reader), we need to handle the castings from source type to target type. Since float/double numbers in GPU have different precision with CPU, when casting float/double to string, the result of GPU is different from result of CPU spark. Its default value is `true` (this means the strings result will differ from result of CPU). If it's set `false` explicitly and there exists casting from float/double to string in the job, then such behavior will cause an exception, and the job will fail.|true|Runtime spark.rapids.sql.format.orc.multiThreadedRead.maxNumFilesParallel|A limit on the maximum number of files per task processed in parallel on the CPU side before the file is sent to the GPU. This affects the amount of host memory used when reading the files in parallel. 
Used with MULTITHREADED reader, see spark.rapids.sql.format.orc.reader.type.|2147483647|Runtime @@ -129,6 +129,7 @@ Name | Description | Default Value | Applicable at spark.rapids.sql.join.leftOuter.enabled|When set to true left outer joins are enabled on the GPU|true|Runtime spark.rapids.sql.join.leftSemi.enabled|When set to true left semi joins are enabled on the GPU|true|Runtime spark.rapids.sql.join.rightOuter.enabled|When set to true right outer joins are enabled on the GPU|true|Runtime +spark.rapids.sql.json.read.datetime.enabled|JSON reading is not 100% compatible when reading dates and timestamps.|false|Runtime spark.rapids.sql.json.read.decimal.enabled|When reading a quoted string as a decimal Spark supports reading non-ascii unicode digits, and the RAPIDS Accelerator does not.|true|Runtime spark.rapids.sql.json.read.double.enabled|JSON reading is not 100% compatible when reading doubles.|true|Runtime spark.rapids.sql.json.read.float.enabled|JSON reading is not 100% compatible when reading floats.|true|Runtime @@ -277,7 +278,7 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.IsNaN|`isnan`|Checks if a value is NaN|true|None| spark.rapids.sql.expression.IsNotNull|`isnotnull`|Checks if a value is not null|true|None| spark.rapids.sql.expression.IsNull|`isnull`|Checks if a value is null|true|None| -spark.rapids.sql.expression.JsonToStructs|`from_json`|Returns a struct value with the given `jsonStr` and `schema`|false|This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case| +spark.rapids.sql.expression.JsonToStructs|`from_json`|Returns a struct value with the given `jsonStr` and `schema`|true|None| spark.rapids.sql.expression.JsonTuple|`json_tuple`|Returns a tuple like the function get_json_object, but it takes multiple names. All the input parameters and output column types are string.|false|This is disabled by default because Experimental feature that could be unstable or have performance issues.| spark.rapids.sql.expression.KnownFloatingPointNormalized| |Tag to prevent redundant normalization|true|None| spark.rapids.sql.expression.KnownNotNull| |Tag an expression as known to not be null|true|None| @@ -310,6 +311,7 @@ Name | SQL Function(s) | Description | Default Value | Notes spark.rapids.sql.expression.Minute|`minute`|Returns the minute component of the string/timestamp|true|None| spark.rapids.sql.expression.MonotonicallyIncreasingID|`monotonically_increasing_id`|Returns monotonically increasing 64-bit integers|true|None| spark.rapids.sql.expression.Month|`month`|Returns the month from a date or timestamp|true|None| +spark.rapids.sql.expression.MonthsBetween|`months_between`|If `timestamp1` is later than `timestamp2`, then the result is positive. If `timestamp1` and `timestamp2` are on the same day of month, or both are the last day of month, time of day will be ignored. 
Otherwise, the difference is calculated based on 31 days per month, and rounded to 8 digits unless roundOff=false.|true|None| spark.rapids.sql.expression.Multiply|`*`|Multiplication|true|None| spark.rapids.sql.expression.Murmur3Hash|`hash`|Murmur3 hash operator|true|None| spark.rapids.sql.expression.NaNvl|`nanvl`|Evaluates to `left` iff left is not NaN, `right` otherwise|true|None| diff --git a/docs/archives/CHANGELOG_24.02-to-24.06.md b/docs/archives/CHANGELOG_24.02-to-24.08.md similarity index 74% rename from docs/archives/CHANGELOG_24.02-to-24.06.md rename to docs/archives/CHANGELOG_24.02-to-24.08.md index d95307a1efe..0366869935d 100644 --- a/docs/archives/CHANGELOG_24.02-to-24.06.md +++ b/docs/archives/CHANGELOG_24.02-to-24.08.md @@ -1,5 +1,212 @@ # Change log -Generated on 2024-10-09 +Generated on 2024-12-08 +## Release 24.08 + +### Features +||| +|:---|:---| +|[#9259](https://github.com/NVIDIA/spark-rapids/issues/9259)|[FEA] Create Spark 4.0.0 shim and build env| +|[#10366](https://github.com/NVIDIA/spark-rapids/issues/10366)|[FEA] It would be nice if we could support Hive-style write bucketing table| +|[#10987](https://github.com/NVIDIA/spark-rapids/issues/10987)|[FEA] Implement lore framework to support all operators.| +|[#11087](https://github.com/NVIDIA/spark-rapids/issues/11087)|[FEA] Support regex pattern with brackets when rewrite to PrefixRange patten in rlike| +|[#22](https://github.com/NVIDIA/spark-rapids/issues/22)|[FEA] Add support for bucketed writes| +|[#9939](https://github.com/NVIDIA/spark-rapids/issues/9939)|[FEA] `GpuInsertIntoHiveTable` supports parquet format| + +### Performance +||| +|:---|:---| +|[#8750](https://github.com/NVIDIA/spark-rapids/issues/8750)|[FEA] Rework GpuSubstringIndex to use cudf::slice_strings| +|[#7404](https://github.com/NVIDIA/spark-rapids/issues/7404)|[FEA] explore a hash agg passthrough on partial aggregates| +|[#10976](https://github.com/NVIDIA/spark-rapids/issues/10976)|Rewrite `pattern1|pattern2|pattern3` to multiple contains in `rlike`| + +### Bugs Fixed +||| +|:---|:---| +|[#11287](https://github.com/NVIDIA/spark-rapids/issues/11287)|[BUG] String split APIs on empty string produce incorrect result| +|[#11270](https://github.com/NVIDIA/spark-rapids/issues/11270)|[BUG] test_regexp_replace[DATAGEN_SEED=1722297411, TZ=UTC] hanging there forever in pre-merge CI intermittently| +|[#9682](https://github.com/NVIDIA/spark-rapids/issues/9682)|[BUG] Casting FLOAT64 to DECIMAL(12,7) produces different rows from Apache Spark CPU| +|[#10809](https://github.com/NVIDIA/spark-rapids/issues/10809)|[BUG] cast(9.95 as decimal(3,1)), actual: 9.9, expected: 10.0| +|[#11266](https://github.com/NVIDIA/spark-rapids/issues/11266)|[BUG] test_broadcast_hash_join_constant_keys failed in databricks runtimes| +|[#11243](https://github.com/NVIDIA/spark-rapids/issues/11243)|[BUG] ArrayIndexOutOfBoundsException on a left outer join| +|[#11030](https://github.com/NVIDIA/spark-rapids/issues/11030)|Fix tests failures in string_test.py| +|[#11245](https://github.com/NVIDIA/spark-rapids/issues/11245)|[BUG] mvn verify for the source-javadoc fails and no pre-merge check catches it| +|[#11223](https://github.com/NVIDIA/spark-rapids/issues/11223)|[BUG] Remove unreferenced `CUDF_VER=xxx` in the CI script| +|[#11114](https://github.com/NVIDIA/spark-rapids/issues/11114)|[BUG] Update nightly tests for Scala 2.13 to use JDK 17 only| +|[#11229](https://github.com/NVIDIA/spark-rapids/issues/11229)|[BUG] test_delta_name_column_mapping_no_field_ids fails on Spark | 
+|[#11031](https://github.com/NVIDIA/spark-rapids/issues/11031)|Fix tests failures in multiple files | +|[#10948](https://github.com/NVIDIA/spark-rapids/issues/10948)|Figure out why `MapFromArrays ` appears in the tests for hive parquet write| +|[#11018](https://github.com/NVIDIA/spark-rapids/issues/11018)|Fix tests failures in hash_aggregate_test.py| +|[#11173](https://github.com/NVIDIA/spark-rapids/issues/11173)|[BUG] The `rs. serialization time` metric is misleading| +|[#11017](https://github.com/NVIDIA/spark-rapids/issues/11017)|Fix tests failures in url_test.py| +|[#11201](https://github.com/NVIDIA/spark-rapids/issues/11201)|[BUG] Delta Lake tables with name mapping can throw exceptions on read| +|[#11175](https://github.com/NVIDIA/spark-rapids/issues/11175)|[BUG] Clean up unused and duplicated 'org/roaringbitmap' folder in the spark3xx shims| +|[#11196](https://github.com/NVIDIA/spark-rapids/issues/11196)|[BUG] pipeline failed due to class not found exception: NoClassDefFoundError: com/nvidia/spark/rapids/GpuScalar| +|[#11189](https://github.com/NVIDIA/spark-rapids/issues/11189)|[BUG] regression in NDS after PR #11170| +|[#11167](https://github.com/NVIDIA/spark-rapids/issues/11167)|[BUG] UnsupportedOperationException during delta write with `optimize()`| +|[#11172](https://github.com/NVIDIA/spark-rapids/issues/11172)|[BUG] `get_json_object` returns wrong output with wildcard path| +|[#11148](https://github.com/NVIDIA/spark-rapids/issues/11148)|[BUG] Integration test `test_write_hive_bucketed_table` fails| +|[#11155](https://github.com/NVIDIA/spark-rapids/issues/11155)|[BUG] ArrayIndexOutOfBoundsException in BatchWithPartitionData.splitColumnarBatch| +|[#11152](https://github.com/NVIDIA/spark-rapids/issues/11152)|[BUG] LORE dumping consumes too much memory.| +|[#11029](https://github.com/NVIDIA/spark-rapids/issues/11029)|Fix tests failures in subquery_test.py| +|[#11150](https://github.com/NVIDIA/spark-rapids/issues/11150)|[BUG] hive_parquet_write_test.py::test_insert_hive_bucketed_table failure| +|[#11070](https://github.com/NVIDIA/spark-rapids/issues/11070)|[BUG] numpy2 fail fastparquet cases: numpy.dtype size changed| +|[#11136](https://github.com/NVIDIA/spark-rapids/issues/11136)|UnaryPositive expression doesn't extend UnaryExpression| +|[#11122](https://github.com/NVIDIA/spark-rapids/issues/11122)|[BUG] UT MetricRange failed 651070526 was not less than 1.5E8 in spark313| +|[#11119](https://github.com/NVIDIA/spark-rapids/issues/11119)|[BUG] window_function_test.py::test_window_group_limits_fallback_for_row_number fails in a distributed environment| +|[#11023](https://github.com/NVIDIA/spark-rapids/issues/11023)|Fix tests failures in dpp_test.py| +|[#11026](https://github.com/NVIDIA/spark-rapids/issues/11026)|Fix tests failures in map_test.py| +|[#11020](https://github.com/NVIDIA/spark-rapids/issues/11020)|Fix tests failures in grouping_sets_test.py| +|[#11113](https://github.com/NVIDIA/spark-rapids/issues/11113)|[BUG] Update premerge tests for Scala 2.13 to use JDK 17 only| +|[#11027](https://github.com/NVIDIA/spark-rapids/issues/11027)|Fix tests failures in sort_test.py| +|[#10775](https://github.com/NVIDIA/spark-rapids/issues/10775)|[BUG] Issues found by Spark UT Framework on RapidsStringExpressionsSuite| +|[#11033](https://github.com/NVIDIA/spark-rapids/issues/11033)|[BUG] CICD failed a case: cmp_test.py::test_empty_filter[>]| +|[#11103](https://github.com/NVIDIA/spark-rapids/issues/11103)|[BUG] UCX Shuffle With scala.MatchError | 
+|[#11007](https://github.com/NVIDIA/spark-rapids/issues/11007)|Fix tests failures in array_test.py| +|[#10801](https://github.com/NVIDIA/spark-rapids/issues/10801)|[BUG] JDK17 nightly build after Spark UT Framework is merged| +|[#11019](https://github.com/NVIDIA/spark-rapids/issues/11019)|Fix tests failures in window_function_test.py| +|[#11063](https://github.com/NVIDIA/spark-rapids/issues/11063)|[BUG] op time for GpuCoalesceBatches is more than actual| +|[#11006](https://github.com/NVIDIA/spark-rapids/issues/11006)|Fix test failures in arithmetic_ops_test.py| +|[#10995](https://github.com/NVIDIA/spark-rapids/issues/10995)|Fallback TimeZoneAwareExpression that only support UTC with zoneId instead of timeZone config| +|[#8652](https://github.com/NVIDIA/spark-rapids/issues/8652)|[BUG] array_item test failures on Spark 3.3.x| +|[#11053](https://github.com/NVIDIA/spark-rapids/issues/11053)|[BUG] Build on Databricks 330 fails| +|[#10925](https://github.com/NVIDIA/spark-rapids/issues/10925)| Concat cannot accept no parameter| +|[#10975](https://github.com/NVIDIA/spark-rapids/issues/10975)|[BUG] regex `^.*literal` cannot be rewritten as `contains(literal)` for multiline strings| +|[#10956](https://github.com/NVIDIA/spark-rapids/issues/10956)|[BUG] hive_parquet_write_test.py: test_write_compressed_parquet_into_hive_table integration test failures| +|[#10772](https://github.com/NVIDIA/spark-rapids/issues/10772)|[BUG] Issues found by Spark UT Framework on RapidsDataFrameAggregateSuite| +|[#10986](https://github.com/NVIDIA/spark-rapids/issues/10986)|[BUG]Cast from string to float using hand-picked values failed in CastOpSuite| +|[#10972](https://github.com/NVIDIA/spark-rapids/issues/10972)|Spark 4.0 compile errors | +|[#10794](https://github.com/NVIDIA/spark-rapids/issues/10794)|[BUG] Incorrect cast of string columns containing various infinity notations with trailing spaces | +|[#10964](https://github.com/NVIDIA/spark-rapids/issues/10964)|[BUG] Improve stability of pre-merge jenkinsfile| +|[#10714](https://github.com/NVIDIA/spark-rapids/issues/10714)|Signature changed for `PythonUDFRunner.writeUDFs` | +|[#10712](https://github.com/NVIDIA/spark-rapids/issues/10712)|[AUDIT] BatchScanExec/DataSourceV2Relation to group splits by join keys if they differ from partition keys| +|[#10673](https://github.com/NVIDIA/spark-rapids/issues/10673)|[AUDIT] Rename plan nodes for PythonMapInArrowExec| +|[#10710](https://github.com/NVIDIA/spark-rapids/issues/10710)|[AUDIT] `uncacheTableOrView` changed in CommandUtils | +|[#10711](https://github.com/NVIDIA/spark-rapids/issues/10711)|[AUDIT] Match DataSourceV2ScanExecBase changes to groupPartitions method | +|[#10669](https://github.com/NVIDIA/spark-rapids/issues/10669)|Supporting broadcast of multiple filtering keys in DynamicPruning | + +### PRs +||| +|:---|:---| +|[#11400](https://github.com/NVIDIA/spark-rapids/pull/11400)|[DOC] update notes in download page for the decompressing gzip issue [skip ci]| +|[#11355](https://github.com/NVIDIA/spark-rapids/pull/11355)|Update changelog for the v24.08 release [skip ci]| +|[#11353](https://github.com/NVIDIA/spark-rapids/pull/11353)|Update download doc for v24.08.1 [skip ci]| +|[#11352](https://github.com/NVIDIA/spark-rapids/pull/11352)|Update version to 24.08.1-SNAPSHOT [skip ci]| +|[#11337](https://github.com/NVIDIA/spark-rapids/pull/11337)|Update changelog for the v24.08 release [skip ci]| +|[#11335](https://github.com/NVIDIA/spark-rapids/pull/11335)|Fix Delta Lake truncation of min/max string values| 
+|[#11304](https://github.com/NVIDIA/spark-rapids/pull/11304)|Update changelog for v24.08.0 release [skip ci]| +|[#11303](https://github.com/NVIDIA/spark-rapids/pull/11303)|Update rapids JNI and private dependency to 24.08.0| +|[#11296](https://github.com/NVIDIA/spark-rapids/pull/11296)|[DOC] update doc for 2408 release [skip CI]| +|[#11309](https://github.com/NVIDIA/spark-rapids/pull/11309)|[Doc ]Update lore doc about the range [skip ci]| +|[#11292](https://github.com/NVIDIA/spark-rapids/pull/11292)|Add work around for string split with empty input.| +|[#11278](https://github.com/NVIDIA/spark-rapids/pull/11278)|Fix formatting of advanced configs doc| +|[#10917](https://github.com/NVIDIA/spark-rapids/pull/10917)|Adopt changes from JNI for casting from float to decimal| +|[#11269](https://github.com/NVIDIA/spark-rapids/pull/11269)|Revert "upgrade ucx to 1.17.0"| +|[#11260](https://github.com/NVIDIA/spark-rapids/pull/11260)|Mitigate intermittent test_buckets and shuffle_smoke_test OOM issue| +|[#11268](https://github.com/NVIDIA/spark-rapids/pull/11268)|Fix degenerate conditional nested loop join detection| +|[#11244](https://github.com/NVIDIA/spark-rapids/pull/11244)|Fix ArrayIndexOutOfBoundsException on join counts with constant join keys| +|[#11259](https://github.com/NVIDIA/spark-rapids/pull/11259)|CI Docker to support integration tests with Rocky OS + jdk17 [skip ci]| +|[#11247](https://github.com/NVIDIA/spark-rapids/pull/11247)|Fix `string_test.py` errors on Spark 4.0| +|[#11246](https://github.com/NVIDIA/spark-rapids/pull/11246)|Rework Maven Source Plugin Skip| +|[#11149](https://github.com/NVIDIA/spark-rapids/pull/11149)|Rework on substring index| +|[#11236](https://github.com/NVIDIA/spark-rapids/pull/11236)|Remove the unused vars from the version-def CI script| +|[#11237](https://github.com/NVIDIA/spark-rapids/pull/11237)|Fork jvm for maven-source-plugin| +|[#11200](https://github.com/NVIDIA/spark-rapids/pull/11200)|Multi-get_json_object| +|[#11230](https://github.com/NVIDIA/spark-rapids/pull/11230)|Skip test where Delta Lake may not be fully compatible with Spark| +|[#11220](https://github.com/NVIDIA/spark-rapids/pull/11220)|Avoid failing spark bug SPARK-44242 while generate run_dir| +|[#11226](https://github.com/NVIDIA/spark-rapids/pull/11226)|Fix auto merge conflict 11212| +|[#11129](https://github.com/NVIDIA/spark-rapids/pull/11129)|Spark 4: Fix miscellaneous tests including logic, repart, hive_delimited.| +|[#11163](https://github.com/NVIDIA/spark-rapids/pull/11163)|Support `MapFromArrays` on GPU| +|[#11219](https://github.com/NVIDIA/spark-rapids/pull/11219)|Fix hash_aggregate_test.py to run with ANSI enabled| +|[#11186](https://github.com/NVIDIA/spark-rapids/pull/11186)|from_json Json to Struct Exception Logging| +|[#11180](https://github.com/NVIDIA/spark-rapids/pull/11180)|More accurate estimation for the result serialization time in RapidsShuffleThreadedWriterBase| +|[#11194](https://github.com/NVIDIA/spark-rapids/pull/11194)|Fix ANSI mode test failures in url_test.py| +|[#11202](https://github.com/NVIDIA/spark-rapids/pull/11202)|Fix read from Delta Lake table with name column mapping and missing Parquet IDs| +|[#11185](https://github.com/NVIDIA/spark-rapids/pull/11185)|Fix multi-release jar problem| +|[#11144](https://github.com/NVIDIA/spark-rapids/pull/11144)|Build the Scala2.13 dist jar with JDK17| +|[#11197](https://github.com/NVIDIA/spark-rapids/pull/11197)|Fix class not found error: com/nvidia/spark/rapids/GpuScalar| 
+|[#11191](https://github.com/NVIDIA/spark-rapids/pull/11191)|Fix dynamic pruning regression in GpuFileSourceScanExec| +|[#10994](https://github.com/NVIDIA/spark-rapids/pull/10994)|Add Spark 4.0.0 Build Profile and Other Supporting Changes| +|[#11192](https://github.com/NVIDIA/spark-rapids/pull/11192)|Append new authorized user to blossom-ci whitelist [skip ci]| +|[#11179](https://github.com/NVIDIA/spark-rapids/pull/11179)|Allow more expressions to be tiered| +|[#11141](https://github.com/NVIDIA/spark-rapids/pull/11141)|Enable some Rapids config in RapidsSQLTestsBaseTrait for Spark UT| +|[#11170](https://github.com/NVIDIA/spark-rapids/pull/11170)|Avoid listFiles or inputFiles on relations with static partitioning| +|[#11159](https://github.com/NVIDIA/spark-rapids/pull/11159)|Drop spark31x shims| +|[#10951](https://github.com/NVIDIA/spark-rapids/pull/10951)|Case when performance improvement: reduce the `copy_if_else`| +|[#11165](https://github.com/NVIDIA/spark-rapids/pull/11165)|Fix some GpuBroadcastToRowExec by not dropping columns| +|[#11126](https://github.com/NVIDIA/spark-rapids/pull/11126)|Coalesce batches after a logical coalesce operation| +|[#11164](https://github.com/NVIDIA/spark-rapids/pull/11164)|fix the bucketed write error for non-utc cases| +|[#11132](https://github.com/NVIDIA/spark-rapids/pull/11132)|Add deletion vector metrics for low shuffle merge.| +|[#11156](https://github.com/NVIDIA/spark-rapids/pull/11156)|Fix batch splitting for partition column size on row-count-only batches| +|[#11153](https://github.com/NVIDIA/spark-rapids/pull/11153)|Fix LORE dump oom.| +|[#11102](https://github.com/NVIDIA/spark-rapids/pull/11102)|Fix ANSI mode failures in subquery_test.py| +|[#11151](https://github.com/NVIDIA/spark-rapids/pull/11151)|Fix the test error of the bucketed write for the non-utc case| +|[#11147](https://github.com/NVIDIA/spark-rapids/pull/11147)|upgrade ucx to 1.17.0| +|[#11138](https://github.com/NVIDIA/spark-rapids/pull/11138)|Update fastparquet to 2024.5.0 for numpy2 compatibility| +|[#11137](https://github.com/NVIDIA/spark-rapids/pull/11137)|Handle the change for UnaryPositive now extending RuntimeReplaceable| +|[#11094](https://github.com/NVIDIA/spark-rapids/pull/11094)|Add `HiveHash` support on GPU| +|[#11139](https://github.com/NVIDIA/spark-rapids/pull/11139)|Improve MetricsSuite to allow more gc jitter| +|[#11133](https://github.com/NVIDIA/spark-rapids/pull/11133)|Fix `test_window_group_limits_fallback`| +|[#11097](https://github.com/NVIDIA/spark-rapids/pull/11097)|Fix miscellaneous integ tests for Spark 4| +|[#11118](https://github.com/NVIDIA/spark-rapids/pull/11118)|Fix issue with DPP and AQE on reused broadcast exchanges| +|[#11043](https://github.com/NVIDIA/spark-rapids/pull/11043)|Dataproc serverless test fixes| +|[#10965](https://github.com/NVIDIA/spark-rapids/pull/10965)|Profiler: Disable collecting async allocation events by default| +|[#11117](https://github.com/NVIDIA/spark-rapids/pull/11117)|Update Scala2.13 premerge CI against JDK17| +|[#11084](https://github.com/NVIDIA/spark-rapids/pull/11084)|Introduce LORE framework.| +|[#11099](https://github.com/NVIDIA/spark-rapids/pull/11099)|Spark 4: Handle ANSI mode in sort_test.py| +|[#11115](https://github.com/NVIDIA/spark-rapids/pull/11115)|Fix match error in RapidsShuffleIterator.scala [scala2.13]| +|[#11088](https://github.com/NVIDIA/spark-rapids/pull/11088)|Support regex patterns with brackets when rewriting to PrefixRange pattern in rlike.| 
+|[#10950](https://github.com/NVIDIA/spark-rapids/pull/10950)|Add a heuristic to skip second or third agg pass| +|[#11048](https://github.com/NVIDIA/spark-rapids/pull/11048)|Fixed array_tests for Spark 4.0.0| +|[#11049](https://github.com/NVIDIA/spark-rapids/pull/11049)|Fix some cast_tests for Spark 4.0.0| +|[#11066](https://github.com/NVIDIA/spark-rapids/pull/11066)|Replaced spark3xx-common references to spark-shared| +|[#11083](https://github.com/NVIDIA/spark-rapids/pull/11083)|Exclude a case based on JDK version in Spark UT| +|[#10997](https://github.com/NVIDIA/spark-rapids/pull/10997)|Fix some test issues in Spark UT and keep RapidsTestSettings update-to-date| +|[#11073](https://github.com/NVIDIA/spark-rapids/pull/11073)|Disable ANSI mode for window function tests| +|[#11076](https://github.com/NVIDIA/spark-rapids/pull/11076)|Improve the diagnostics for 'conv' fallback explain| +|[#11092](https://github.com/NVIDIA/spark-rapids/pull/11092)|Add GpuBucketingUtils shim to Spark 4.0.0| +|[#11062](https://github.com/NVIDIA/spark-rapids/pull/11062)|fix duplicate counted metrics like op time for GpuCoalesceBatches| +|[#11044](https://github.com/NVIDIA/spark-rapids/pull/11044)|Fixed Failing tests in arithmetic_ops_tests for Spark 4.0.0| +|[#11086](https://github.com/NVIDIA/spark-rapids/pull/11086)|upgrade blossom-ci actions version [skip ci]| +|[#10957](https://github.com/NVIDIA/spark-rapids/pull/10957)|Support bucketing write for GPU| +|[#10979](https://github.com/NVIDIA/spark-rapids/pull/10979)|[FEA] Introduce low shuffle merge.| +|[#10996](https://github.com/NVIDIA/spark-rapids/pull/10996)|Fallback non-UTC TimeZoneAwareExpression with zoneId| +|[#11072](https://github.com/NVIDIA/spark-rapids/pull/11072)|Workaround numpy2 failed fastparquet compatibility tests| +|[#11046](https://github.com/NVIDIA/spark-rapids/pull/11046)|Calculate parallelism to speed up pre-merge CI| +|[#11054](https://github.com/NVIDIA/spark-rapids/pull/11054)|fix flaky array_item test failures| +|[#11051](https://github.com/NVIDIA/spark-rapids/pull/11051)|[FEA] Increase parallelism of deltalake test on databricks| +|[#10993](https://github.com/NVIDIA/spark-rapids/pull/10993)|`binary-dedupe` changes for Spark 4.0.0| +|[#11060](https://github.com/NVIDIA/spark-rapids/pull/11060)|Add in the ability to fingerprint JSON columns| +|[#11059](https://github.com/NVIDIA/spark-rapids/pull/11059)|Revert "Add in the ability to fingerprint JSON columns (#11002)" [skip ci]| +|[#11039](https://github.com/NVIDIA/spark-rapids/pull/11039)|Concat() Exception bug fix| +|[#11002](https://github.com/NVIDIA/spark-rapids/pull/11002)|Add in the ability to fingerprint JSON columns| +|[#10977](https://github.com/NVIDIA/spark-rapids/pull/10977)|Rewrite multiple literal choice regex to multiple contains in rlike| +|[#11035](https://github.com/NVIDIA/spark-rapids/pull/11035)|Fix auto merge conflict 11034 [skip ci]| +|[#11040](https://github.com/NVIDIA/spark-rapids/pull/11040)|Append new authorized user to blossom-ci whitelist [skip ci]| +|[#11036](https://github.com/NVIDIA/spark-rapids/pull/11036)|Update blossom-ci ACL to secure format [skip ci]| +|[#11032](https://github.com/NVIDIA/spark-rapids/pull/11032)|Fix a hive write test failure for Spark 350| +|[#10998](https://github.com/NVIDIA/spark-rapids/pull/10998)|Improve log to print more lines in build [skip ci]| +|[#10992](https://github.com/NVIDIA/spark-rapids/pull/10992)|Addressing the Named Parameter change in Spark 4.0.0| +|[#10943](https://github.com/NVIDIA/spark-rapids/pull/10943)|Fix Spark UT 
issues in RapidsDataFrameAggregateSuite| +|[#10963](https://github.com/NVIDIA/spark-rapids/pull/10963)|Add rapids configs to enable GPU running in Spark UT| +|[#10978](https://github.com/NVIDIA/spark-rapids/pull/10978)|More compilation fixes for Spark 4.0.0| +|[#10953](https://github.com/NVIDIA/spark-rapids/pull/10953)|Speed up the integration tests by running them in parallel on the Databricks cluster| +|[#10958](https://github.com/NVIDIA/spark-rapids/pull/10958)|Fix a hive write test failure| +|[#10970](https://github.com/NVIDIA/spark-rapids/pull/10970)|Move Support for `RaiseError` to a Shim Excluding Spark 4.0.0| +|[#10966](https://github.com/NVIDIA/spark-rapids/pull/10966)|Add default value for REF of premerge jenkinsfile to avoid bad overwritten [skip ci]| +|[#10959](https://github.com/NVIDIA/spark-rapids/pull/10959)|Add new ID to blossom-ci allow list [skip ci]| +|[#10952](https://github.com/NVIDIA/spark-rapids/pull/10952)|Add shims to take care of the signature change for writeUDFs in PythonUDFRunner| +|[#10931](https://github.com/NVIDIA/spark-rapids/pull/10931)|Add Support for Renaming of PythonMapInArrow| +|[#10949](https://github.com/NVIDIA/spark-rapids/pull/10949)|Change dependency version to 24.08.0-SNAPSHOT| +|[#10857](https://github.com/NVIDIA/spark-rapids/pull/10857)|[Spark 4.0] Account for `PartitionedFileUtil.splitFiles` signature change.| +|[#10912](https://github.com/NVIDIA/spark-rapids/pull/10912)|GpuInsertIntoHiveTable supports parquet format| +|[#10863](https://github.com/NVIDIA/spark-rapids/pull/10863)|[Spark 4.0] Account for `CommandUtils.uncacheTableOrView` signature change.| +|[#10944](https://github.com/NVIDIA/spark-rapids/pull/10944)|Added Shim for BatchScanExec to Support Spark 4.0| +|[#10946](https://github.com/NVIDIA/spark-rapids/pull/10946)|Unarchive Spark test jar for spark.read(ability)| +|[#10945](https://github.com/NVIDIA/spark-rapids/pull/10945)|Add Support for Multiple Filtering Keys for Subquery Broadcast| +|[#10871](https://github.com/NVIDIA/spark-rapids/pull/10871)|Add classloader diagnostics to initShuffleManager error message| +|[#10933](https://github.com/NVIDIA/spark-rapids/pull/10933)|Fixed Databricks build| +|[#10929](https://github.com/NVIDIA/spark-rapids/pull/10929)|Append new authorized user to blossom-ci whitelist [skip ci]| + ## Release 24.06 ### Features diff --git a/docs/compatibility.md b/docs/compatibility.md index 71a3927df13..0c745069032 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -316,118 +316,102 @@ case. ## JSON -The JSON format read is an experimental feature which is expected to have some issues, so we disable -it by default. If you would like to test it, you need to enable `spark.rapids.sql.format.json.enabled` and -`spark.rapids.sql.format.json.read.enabled`. +JSON, despite being a standard format, has some ambiguity in it. Spark also offers the ability to allow +some invalid JSON to be parsed. We have tried to provide JSON parsing that is compatible with +what Apache Spark does support. Note that Spark itself has changed through different releases, and we will +try to call out which releases we offer different results for. JSON parsing is enabled by default +except for date and timestamp types where we still have work to complete. If you wish to disable +JSON Scan you can set `spark.rapids.sql.format.json.enabled` or +`spark.rapids.sql.format.json.read.enabled` to false. To disable `from_json` you can set +`spark.rapids.sql.expression.JsonToStructs` to false. 
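As a quick illustration of the toggles above, here is a minimal sketch of setting them at runtime from a Scala Spark session; the session variable `spark` is an assumption, and the same keys can just as well be passed with `--conf` at submit time.

```scala
// Minimal sketch (not part of this patch): turning the GPU JSON paths off at runtime.
// Assumes an active SparkSession named `spark` with the RAPIDS Accelerator installed.
spark.conf.set("spark.rapids.sql.format.json.enabled", "false")      // disables JSON scan acceleration
spark.conf.set("spark.rapids.sql.format.json.read.enabled", "false") // disables JSON read acceleration
spark.conf.set("spark.rapids.sql.expression.JsonToStructs", "false") // disables from_json on the GPU
```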
-### Invalid JSON
+### Limits
-In Apache Spark on the CPU if a line in the JSON file is invalid the entire row is considered
-invalid and will result in nulls being returned for all columns. It is considered invalid if it
-violates the JSON specification, but with a few extensions.
+In versions of Spark before 3.5.0 there is no maximum to how deeply nested JSON can be. After
+3.5.0 this was updated to be 1,000 by default. The current GPU implementation of JSON Scan and
+`from_json` limits this to 254 no matter what version of Spark is used. If the nesting level is
+over this, the JSON is considered invalid and all values will be returned as nulls.
+`get_json_object` and `json_tuple` have a maximum nesting depth of 64. An exception is thrown if
+the nesting depth goes over the maximum.
- * Single quotes are allowed to quote strings and keys
- * Unquoted values like NaN and Infinity can be parsed as floating point values
- * Control characters do not need to be replaced with the corresponding escape sequences in a
-   quoted string.
- * Garbage at the end of a row, if there is valid JSON at the beginning of the row, is ignored.
+Spark 3.5.0 and above impose a maximum string length of 20,000,000 and a maximum number length of
+1,000. We do not have any of these limits on the GPU.
-The GPU implementation does the same kinds of validations, but many of them are done on a per-column
-basis, which, for example, means if a number is formatted incorrectly, it is likely only that value
-will be considered invalid and return a null instead of nulls for the entire row.
+We, like Spark, cannot support a JSON string that is larger than 2 GiB in size.
-There are options that can be used to enable and disable many of these features which are mostly
-listed below.
+### JSON Validation
-### JSON options
+Spark supports the option `allowNonNumericNumbers`. Versions of Spark prior to 3.3.0 were inconsistent between
+quoted and non-quoted values ([SPARK-38060](https://issues.apache.org/jira/browse/SPARK-38060)). The
+GPU implementation is consistent with 3.3.0 and above.
-Spark supports passing options to the JSON parser when reading a dataset. In most cases if the RAPIDS Accelerator
-sees one of these options that it does not support it will fall back to the CPU. In some cases we do not. The
-following options are documented below.
+### JSON Floating Point Types
-- `allowNumericLeadingZeros` - Allows leading zeros in numbers (e.g. 00012). By default this is set to false.
-  When it is false Spark considers the JSON invalid if it encounters this type of number. The RAPIDS
-  Accelerator supports validating columns that are returned to the user with this option on or off.
+Parsing floating-point values has the same limitations as [casting from string to float](#string-to-float).
-- `allowUnquotedControlChars` - Allows JSON Strings to contain unquoted control characters (ASCII characters with
-  value less than 32, including tab and line feed characters) or not. By default this is set to false. If the schema
-  is provided while reading JSON file, then this flag has no impact on the RAPIDS Accelerator as it always allows
-  unquoted control characters but Spark sees these are invalid are returns nulls. However, if the schema is not provided
-  and this option is false, then RAPIDS Accelerator's behavior is same as Spark where an exception is thrown
-  as discussed in `JSON Schema discovery` section.
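To make the validation and floating point notes above concrete, the following is a minimal, hypothetical sketch of reading doubles from JSON with `allowNonNumericNumbers` enabled; the input path and the `spark` session are assumptions, not part of this change.

```scala
// Minimal sketch: reading a double column from JSON with allowNonNumericNumbers on.
// With this option enabled, unquoted NaN and Infinity parse as floating point values,
// consistent with the Spark 3.3.0+ behavior described above.
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

val schema = StructType(Seq(StructField("value", DoubleType)))
val df = spark.read
  .schema(schema)
  .option("allowNonNumericNumbers", "true")
  .json("/tmp/example.jsonl") // hypothetical input path
```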
+### JSON Integral Types -- `allowNonNumericNumbers` - Allows `NaN` and `Infinity` values to be parsed (note that these are not valid numeric - values in the [JSON specification](https://json.org)). Spark versions prior to 3.3.0 have inconsistent behavior and will - parse some variants of `NaN` and `Infinity` even when this option is disabled - ([SPARK-38060](https://issues.apache.org/jira/browse/SPARK-38060)). The RAPIDS Accelerator behavior is consistent with - Spark version 3.3.0 and later. +Versions of Spark prior to 3.3.0 would parse quoted integer values, like "1". But 3.3.0 and above consider +these to be invalid and will return `null` when parsed as an integral type. The GPU implementation +follows 3.3.0 and above. -### Nesting -In versions of Spark before 3.5.0 there is no maximum to how deeply nested JSON can be. After -3.5.0 this was updated to be 1000 by default. The current GPU implementation limits this to 254 -no matter what version of Spark is used. If the nesting level is over this the JSON is considered -invalid and all values will be returned as nulls. +### JSON Decimal Types -Mixed types can have some problems. If an item being read could have some lines that are arrays -and others that are structs/dictionaries it is possible an error will be thrown. +Spark supports parsing decimal types formatted either as floating point numbers or as integral numbers, even if they are +in a quoted string. If the value is in a quoted string, the locale of the JVM is used to determine the number format. +If the locale is not `US`, which is the default, we will fall back to the CPU because we do not currently +parse those numbers correctly. The `US` format removes all commas ',' from the quoted string. +As a part of this, Spark also supports non-Arabic numerals, but we do not support parsing them; +see [issue 10532](https://github.com/NVIDIA/spark-rapids/issues/10532). -Dates and Timestamps have some issues and may return values for technically invalid inputs. +### JSON Date/Timestamp Types -Floating point numbers have issues generally like with the rest of Spark, and we can parse them into -a valid floating point number, but it might not match 100% with the way Spark does it. +Dates and timestamps are not supported by default in the JSON parser, since the GPU implementation is not 100% +compatible with Apache Spark. +If needed, they can be turned on through the config `spark.rapids.sql.json.read.datetime.enabled`. +This config works for both JSON scan and `from_json`. Once enabled, the JSON parser still does +not support the `TimestampNTZ` type and will fall back to the CPU if `spark.sql.timestampType` is set +to `TIMESTAMP_NTZ` or if an explicit schema is provided that contains the `TimestampNTZ` type. -Strings are supported, but the data returned might not be normalized in the same way as the CPU -implementation. Generally this comes down to the GPU not modifying the input, whereas Spark will -do things like remove extra white space and parse numbers before turning them back into a string. +There is currently no support for reading numeric values as timestamps and null values are returned instead +([#4940](https://github.com/NVIDIA/spark-rapids/issues/4940)). A workaround would be to read as longs and then cast to timestamp. -### JSON Floating Point +### JSON Arrays and Structs with Overflowing Numbers -Parsing floating-point values has the same limitations as [casting from string to float](#string-to-float).
+Spark is inconsistent between versions in how it handles overflowing numbers that are nested in either an array +or a non-top-level struct. In some versions only the value that overflowed is marked as null. In other versions the +wrapping array or struct is marked as null. We currently only mark the individual value as null. This matches +versions 3.4.2 and above of Spark for structs. On most versions of Spark a single value that overflows within an array +invalidates the entire array. -Prior to Spark 3.3.0, reading JSON strings such as `"+Infinity"` when specifying that the data type is `FloatType` -or `DoubleType` caused these values to be parsed even when `allowNonNumericNumbers` is set to false. Also, Spark -versions prior to 3.3.0 only supported the `"Infinity"` and `"-Infinity"` representations of infinity and did not -support `"+INF"`, `"-INF"`, or `"+Infinity"`, which Spark considers valid when unquoted. The GPU JSON reader is -consistent with the behavior in Spark 3.3.0 and later. +### Duplicate Struct Names -Another limitation of the GPU JSON reader is that it will parse strings containing non-string boolean or numeric values where -Spark will treat them as invalid inputs and will just return `null`. +The JSON specification technically allows for duplicate keys in a struct, but does not explain what to +do with them. In Spark, which value wins is inconsistent between operators, and for `get_json_object` it +depends on the query being performed. We do not always match what Spark does. We do match it in many cases, +but we consider this enough of a corner case that we have not tried to make it work in all cases. -### JSON Timestamps/Dates +We also do not support schemas where there are duplicate column names. We just fall back to the CPU for those cases. -The JSON parser does not support the `TimestampNTZ` type and will fall back to CPU if `spark.sql.timestampType` is -set to `TIMESTAMP_NTZ` or if an explicit schema is provided that contains the `TimestampNTZ` type. - -There is currently no support for reading numeric values as timestamps and null values are returned instead -([#4940](https://github.com/NVIDIA/spark-rapids/issues/4940)). A workaround would be to read as longs and then cast -to timestamp. +### JSON Normalization (String Types) -### JSON Schema discovery +In versions of Spark prior to 4.0.0 input JSON strings were parsed to JSON tokens and then converted back to +strings. This effectively normalizes the output string. So things like single quotes are transformed into double +quotes, floating point numbers are parsed and converted back to strings possibly changing the format, and +escaped characters are converted back to their simplest form. We try to support this on the GPU as well. Single quotes +will be converted to double quotes. Only `get_json_object` and `json_tuple` attempt to normalize floating point +numbers. There is no implementation on the GPU right now that tries to normalize escape characters. -Spark SQL can automatically infer the schema of a JSON dataset if schema is not provided explicitly. The CPU -handles schema discovery and there is no GPU acceleration of this. By default Spark will read/parse the entire -dataset to determine the schema. This means that some options/errors which are ignored by the GPU may still -result in an exception if used with schema discovery. +### `from_json` Function -### `from_json` function -`JsonToStructs` of `from_json` is based on the same code as reading a JSON lines file.
There are +`JsonToStructs` or `from_json` is based on the same code as reading a JSON lines file. There are a few differences with it. -The `from_json` function is disabled by default because it is experimental and has some known -incompatibilities with Spark, and can be enabled by setting -`spark.rapids.sql.expression.JsonToStructs=true`. You don't need to set -`spark.rapids.sql.format.json.enabled` and`spark.rapids.sql.format.json.read.enabled` to true. - -There is no schema discovery as a schema is required as input to `from_json` - -In addition to `structs`, a top level `map` type is supported, but only if the key and value are -strings. - -### `to_json` function +The main difference is that `from_json` supports parsing Maps and Arrays directly from a JSON column, whereas +JSON Scan only supports parsing top level structs. The GPU implementation of `from_json` has support for parsing +a `MAP` as a top level schema, but does not currently support arrays at the top level. -The `to_json` function is disabled by default because it is experimental and has some known incompatibilities -with Spark, and can be enabled by setting `spark.rapids.sql.expression.StructsToJson=true`. +### `to_json` Function Known issues are: @@ -435,7 +419,7 @@ Known issues are: produce `-4.1243574E26` but the GPU may produce `-4.124357351E26`. - Not all JSON options are respected -### get_json_object +### `get_json_object` Function Known issue: - [Floating-point number normalization error](https://github.com/NVIDIA/spark-rapids-jni/issues/1922). `get_json_object` floating-point number normalization on the GPU could sometimes return incorrect results if the string contains high-precision values, see the String to Float and Float to String section for more details. @@ -477,17 +461,16 @@ These are the known edge cases where running on the GPU will produce different r next to a newline or a repetition that produces zero or more results ([#5610](https://github.com/NVIDIA/spark-rapids/pull/5610))` - Word and non-word boundaries, `\b` and `\B` -- Line anchor `$` will incorrectly match any of the unicode characters `\u0085`, `\u2028`, or `\u2029` followed by - another line-terminator, such as `\n`. For example, the pattern `TEST$` will match `TEST\u0085\n` on the GPU but - not on the CPU ([#7585](https://github.com/NVIDIA/spark-rapids/issues/7585)). The following regular expression patterns are not yet supported on the GPU and will fall back to the CPU. - Line anchors `^` and `$` are not supported in some contexts, such as when combined with a choice (`^|a` or `$|a`). - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts. -- String anchor `\z` is not supported -- Patterns containing an end of line or string anchor immediately next to a newline or repetition that produces zero +- String anchor `\z` is not supported. +- Patterns containing an end-of-line or string anchor immediately next to a newline or repetition that produces zero or more results +- Patterns containing end-of-line anchors like `$` or `\Z` immediately followed by + escape sequences (e.g., `\w`, `\b`) are not supported. 
- Line anchor `$` and string anchors `\Z` are not supported in patterns containing `\W` or `\D` - Line and string anchors are not supported by `string_split` and `str_to_map` - Lazy quantifiers within a choice block such as `(2|\u2029??)+` @@ -652,16 +635,19 @@ guaranteed to produce the same results as the CPU: - `yyyy/MM/dd` - `yyyy-MM-dd` - `yyyyMMdd` +- `yyyymmdd` - `yyyy/MM/dd HH:mm:ss` - `yyyy-MM-dd HH:mm:ss` +- `yyyyMMdd HH:mm:ss` LEGACY timeParserPolicy support has the following limitations when running on the GPU: - Only 4 digit years are supported - The proleptic Gregorian calendar is used instead of the hybrid Julian+Gregorian calendar that Spark uses in legacy mode -- When format is `yyyyMMdd`, GPU only supports 8 digit strings. Spark supports like 7 digit - `2024101` string while GPU does not support. Only tested `UTC` and `Asia/Shanghai` timezones. +- When the format is or contains `yyyyMMdd` or `yyyymmdd`, the GPU only supports 8 digit strings for these formats. + Spark also supports 7 digit strings such as `2024101`, while the GPU does not. Only tested with the `UTC` and + `Asia/Shanghai` timezones. ## Formatting dates and timestamps as strings diff --git a/docs/configs.md index 4ef4d8efb3c..75076bafe7c 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -10,7 +10,7 @@ The following is the list of options that `rapids-plugin-4-spark` supports. On startup use: `--conf [conf key]=[conf value]`. For example: ``` -${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.10.1-cuda11.jar \ +${SPARK_HOME}/bin/spark-shell --jars rapids-4-spark_2.12-24.12.0-cuda11.jar \ --conf spark.plugins=com.nvidia.spark.SQLPlugin \ --conf spark.rapids.sql.concurrentGpuTasks=2 ``` @@ -37,7 +37,7 @@ Name | Description | Default Value | Applicable at spark.rapids.memory.gpu.minAllocFraction|The fraction of total GPU memory that limits the minimum size of the RMM pool. The value must be less than or equal to the setting for spark.rapids.memory.gpu.allocFraction.|0.25|Startup spark.rapids.memory.host.spillStorageSize|Amount of off-heap host memory to use for buffering spilled GPU data before spilling to local disk. Use -1 to set the amount to the combined size of pinned and pageable memory pools.|-1|Startup spark.rapids.memory.pinnedPool.size|The size of the pinned memory pool in bytes unless otherwise specified. Use 0 to disable the pool.|0|Startup -spark.rapids.sql.batchSizeBytes|Set the target number of bytes for a GPU batch. Splits sizes for input data is covered by separate configs. The maximum setting is 2 GB to avoid exceeding the cudf row count limit of a column.|1073741824|Runtime +spark.rapids.sql.batchSizeBytes|Set the target number of bytes for a GPU batch. Splits sizes for input data is covered by separate configs.|1073741824|Runtime spark.rapids.sql.concurrentGpuTasks|Set the number of tasks that can execute concurrently per GPU. Tasks may temporarily block when the number of concurrent tasks in the executor exceeds this amount. Allowing too many concurrent tasks on the same GPU may lead to GPU out of memory errors.|2|Runtime spark.rapids.sql.enabled|Enable (true) or disable (false) sql operations on the GPU|true|Runtime spark.rapids.sql.explain|Explain why some parts of a query were not placed on a GPU or not.
Possible values are ALL: print everything, NONE: print nothing, NOT_ON_GPU: print only parts of a query that did not go on the GPU|NOT_ON_GPU|Runtime @@ -45,7 +45,6 @@ Name | Description | Default Value | Applicable at spark.rapids.sql.multiThreadedRead.numThreads|The maximum number of threads on each executor to use for reading small files in parallel. This can not be changed at runtime after the executor has started. Used with COALESCING and MULTITHREADED readers, see spark.rapids.sql.format.parquet.reader.type, spark.rapids.sql.format.orc.reader.type, or spark.rapids.sql.format.avro.reader.type for a discussion of reader types. If it is not set explicitly and spark.executor.cores is set, it will be tried to assign value of `max(MULTITHREAD_READ_NUM_THREADS_DEFAULT, spark.executor.cores)`, where MULTITHREAD_READ_NUM_THREADS_DEFAULT = 20.|20|Startup spark.rapids.sql.reader.batchSizeBytes|Soft limit on the maximum number of bytes the reader reads per batch. The readers will read chunks of data until this limit is met or exceeded. Note that the reader may estimate the number of bytes that will be used on the GPU in some cases based on the schema and number of rows in each batch.|2147483647|Runtime spark.rapids.sql.reader.batchSizeRows|Soft limit on the maximum number of rows the reader will read per batch. The orc and parquet readers will read row groups until this limit is met or exceeded. The limit is respected by the csv reader.|2147483647|Runtime -spark.rapids.sql.shuffle.spillThreads|Number of threads used to spill shuffle data to disk in the background.|6|Runtime spark.rapids.sql.udfCompiler.enabled|When set to true, Scala UDFs will be considered for compilation as Catalyst expressions|false|Runtime For more advanced configs, please refer to the [RAPIDS Accelerator for Apache Spark Advanced Configuration](./additional-functionality/advanced_configs.md) page. diff --git a/docs/dev/shimplify.md b/docs/dev/shimplify.md index a8f075016ae..cd9100ff447 100644 --- a/docs/dev/shimplify.md +++ b/docs/dev/shimplify.md @@ -65,7 +65,15 @@ validations: * The file is stored under the *owner shim* directory. * All files participating listing the `buildver` of the current Maven build session are symlinked to -`target/${buildver}/generated/src/(main|test)/(scala|java)`. Thus, instead of hardcoding distinct +`target/${buildver}/generated/src/(main|test)/(scala|java)` +except for template classes requiring spark.version.classifier in the package name. + +* If the package name of a class such as RapidsShuffleManager contains `$_spark.version.classifier_` +(because it is source-identical across shims up to the package name) it will be materialized in the +`target/${buildver}/generated/src/(main|test)/(scala|java)` with `spark.version.classifier` +interpolated into the package name. + +Thus, instead of hardcoding distinct lists of directories for `build-helper` Maven plugin to add (one for each shim) after the full transition to shimplify, the pom will have only 4 add source statements that is independent of the number of supported shims. 
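To picture the interpolation step described above, here is a rough, hypothetical Python sketch of the naming scheme; it is only an illustration of the idea, not the actual shimplify build tooling.

```python
# Hypothetical helper mirroring the substitution described above: a template
# package containing the spark.version.classifier placeholder is materialized
# once per buildver with the classifier spliced into the package name.
def materialize_package(template_package: str, classifier: str) -> str:
    return template_package.replace("$_spark.version.classifier_", classifier)

# For example, a RapidsShuffleManager-style template materialized for the
# spark320 shim (the package prefix shown here is illustrative):
print(materialize_package("com.nvidia.spark.rapids.$_spark.version.classifier_",
                          "spark320"))
# -> com.nvidia.spark.rapids.spark320
```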
diff --git a/docs/dev/shims.md b/docs/dev/shims.md index 281915f18c5..0d62eb4cae8 100644 --- a/docs/dev/shims.md +++ b/docs/dev/shims.md @@ -68,17 +68,17 @@ Using JarURLConnection URLs we create a Parallel World of the current version wi Spark 3.0.2's URLs: ```text -jar:file:/home/spark/rapids-4-spark_2.12-24.10.1.jar!/ -jar:file:/home/spark/rapids-4-spark_2.12-24.10.1.jar!/spark-shared/ -jar:file:/home/spark/rapids-4-spark_2.12-24.10.1.jar!/spark302/ +jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/ +jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ +jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark302/ ``` Spark 3.2.0's URLs : ```text -jar:file:/home/spark/rapids-4-spark_2.12-24.10.1.jar!/ -jar:file:/home/spark/rapids-4-spark_2.12-24.10.1.jar!/spark-shared/ -jar:file:/home/spark/rapids-4-spark_2.12-24.10.1.jar!/spark320/ +jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/ +jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark-shared/ +jar:file:/home/spark/rapids-4-spark_2.12-24.12.0.jar!/spark320/ ``` ### Late Inheritance in Public Classes diff --git a/docs/dev/testing.md b/docs/dev/testing.md index af4d97d1699..9f1c33091f1 100644 --- a/docs/dev/testing.md +++ b/docs/dev/testing.md @@ -5,5 +5,5 @@ nav_order: 2 parent: Developer Overview --- An overview of testing can be found within the repository at: -* [Unit tests](https://github.com/NVIDIA/spark-rapids/tree/branch-24.10/tests#readme) -* [Integration testing](https://github.com/NVIDIA/spark-rapids/tree/branch-24.10/integration_tests#readme) +* [Unit tests](https://github.com/NVIDIA/spark-rapids/tree/branch-24.12/tests#readme) +* [Integration testing](https://github.com/NVIDIA/spark-rapids/tree/branch-24.12/integration_tests#readme) diff --git a/docs/download.md b/docs/download.md index 572d342a07b..60c62071f8b 100644 --- a/docs/download.md +++ b/docs/download.md @@ -27,7 +27,8 @@ The plugin is tested on the following architectures: ### Software Requirements: - OS: Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8 + OS: Spark RAPIDS is compatible with any Linux distribution with glibc >= 2.28 (Please check ldd --version output). glibc 2.28 was released August 1, 2018. + Tested on Ubuntu 20.04, Ubuntu 22.04, Rocky Linux 8 and Rocky Linux 9 NVIDIA Driver*: R470+ diff --git a/docs/supported_ops.md b/docs/supported_ops.md index 8f6a9ca0e5f..acf7133af40 100644 --- a/docs/supported_ops.md +++ b/docs/supported_ops.md @@ -9279,7 +9279,7 @@ are limited. JsonToStructs `from_json` Returns a struct value with the given `jsonStr` and `schema` -This is disabled by default because it is currently in beta and undergoes continuous enhancements. Please consult the [compatibility documentation](../compatibility.md#json-supporting-types) to determine whether you can enable this configuration for your use case +None project jsonStr @@ -9320,7 +9320,7 @@ are limited. NS -PS
MAP only supports keys and values that are of STRING type;
UTC is only supported TZ for child TIMESTAMP;
unsupported child types NULL, BINARY, CALENDAR, MAP, UDT, DAYTIME, YEARMONTH
+PS
MAP only supports keys and values that are of STRING type and is only supported at the top level;
UTC is only supported TZ for child TIMESTAMP;
unsupported child types NULL, BINARY, CALENDAR, MAP, UDT, DAYTIME, YEARMONTH
PS
UTC is only supported TZ for child TIMESTAMP;
unsupported child types NULL, BINARY, CALENDAR, MAP, UDT, DAYTIME, YEARMONTH
@@ -11493,6 +11493,103 @@ are limited. +MonthsBetween +`months_between` +If `timestamp1` is later than `timestamp2`, then the result is positive. If `timestamp1` and `timestamp2` are on the same day of month, or both are the last day of month, time of day will be ignored. Otherwise, the difference is calculated based on 31 days per month, and rounded to 8 digits unless roundOff=false. +None +project +timestamp1 + + + + + + + + +PS
UTC is only supported TZ for TIMESTAMP
+ + + + + + + + + + + + + +timestamp2 + + + + + + + + +PS
UTC is only supported TZ for TIMESTAMP
+ + + + + + + + + + + + + +round +PS
Literal value only
+ + + + + + + + + + + + + + + + + + + + + +result + + + + + + +S + + + + + + + + + + + + + + + Multiply `*` Multiplication @@ -11637,6 +11734,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + Murmur3Hash `hash` Murmur3 hash operator @@ -11762,34 +11887,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - NamedLambdaVariable A parameter to a higher order SQL function @@ -12041,6 +12138,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + Or `or` Logical OR @@ -12185,34 +12310,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - ParseUrl `parse_url` Extracts a part from a URL @@ -12435,6 +12532,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + PosExplode `posexplode_outer`, `posexplode` Given an input array produces a sequence of rows for each value in the array @@ -12630,34 +12755,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - PreciseTimestampConversion Expression used internally to convert the TimestampType to Long and back without losing precision, i.e. in microseconds. Used in time windowing @@ -12952,6 +13049,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + Quarter `quarter` Returns the quarter of the year for date, in the range 1 to 4 @@ -13077,34 +13202,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - RaiseError `raise_error` Throw an exception @@ -13355,6 +13452,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + RegExpExtractAll `regexp_extract_all` Extract all strings matching a regular expression corresponding to the regex group index @@ -13572,34 +13697,6 @@ are limited. 
-Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - Remainder `%`, `mod` Remainder or modulo @@ -13776,6 +13873,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + Rint `rint` Rounds up a double value to the nearest double equal to an integer @@ -13976,34 +14101,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - ScalaUDF User Defined Function, the UDF can choose to implement a RAPIDS accelerated interface to get better performance. @@ -14254,6 +14351,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + ShiftLeft `shiftleft` Bitwise shift left (<<) @@ -14402,34 +14527,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - ShiftRightUnsigned `shiftrightunsigned` Bitwise unsigned shift right (>>>) @@ -14653,33 +14750,61 @@ are limited. -Sinh -`sinh` -Hyperbolic sine -None -project -input - - - - - - -S - - - - - - - - - - - - - - +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + +Sinh +`sinh` +Hyperbolic sine +None +project +input + + + + + + +S + + + + + + + + + + + + + + result @@ -14802,34 +14927,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - SortArray `sort_array` Returns a sorted array with the input array and the ascending / descending order @@ -15057,6 +15154,34 @@ are limited. NS +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + Sqrt `sqrt` Square root @@ -15229,34 +15354,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - StartsWith Starts with @@ -15502,6 +15599,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + StringLocate `locate`, `position` Substring search operator @@ -15696,34 +15821,6 @@ are limited. 
-Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - StringRepeat `repeat` StringRepeat operator that repeats the given strings with numbers of times given by repeatTimes @@ -15895,6 +15992,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + StringSplit `split` Splits `str` around occurrences that match `regex` @@ -16089,34 +16214,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - StringTranslate `translate` StringTranslate operator @@ -16288,6 +16385,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + StringTrimLeft `ltrim` StringTrimLeft operator @@ -16487,34 +16612,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - Substring `substr`, `substring` Substring operator @@ -16709,6 +16806,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + Subtract `-` Subtraction @@ -16951,34 +17076,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - Tanh `tanh` Hyperbolic tangent @@ -17151,6 +17248,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + ToDegrees `degrees` Converts radians to degrees @@ -17393,40 +17518,12 @@ are limited. - - - - - - - - -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH + + + + + + TransformKeys @@ -17577,6 +17674,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + UnaryMinus `negative` Negate a numeric value @@ -17801,34 +17926,6 @@ are limited. 
-Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - UnboundedPreceding$ Special boundary for a window frame, indicating all rows preceding the current row @@ -17982,6 +18079,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + Upper `ucase`, `upper` String uppercase operator @@ -18232,34 +18357,6 @@ are limited. NS -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - XxHash64 `xxhash64` xxhash64 hash operator @@ -18576,6 +18673,34 @@ are limited. S +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + ApproximatePercentile `approx_percentile`, `percentile_approx` Approximate percentile @@ -18766,34 +18891,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - Average `avg`, `mean` Average aggregate operator @@ -19084,6 +19181,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + CollectSet `collect_set` Collect a set of unique elements, not supported in reduction @@ -19229,34 +19354,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - Count `count` Count aggregate operator @@ -19547,6 +19644,34 @@ are limited. NS +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + Last `last_value`, `last` last aggregate operator @@ -19692,34 +19817,6 @@ are limited. NS -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - Max `max` Max aggregate operator @@ -20009,6 +20106,34 @@ are limited. NS +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + Min `min` Min aggregate operator @@ -20154,34 +20279,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - MinBy `min_by` MinBy aggregate operator. 
It may produce different results than CPU when multiple rows in a group have same minimum value in the ordering column and different associated values in the value column. @@ -20516,6 +20613,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + PivotFirst PivotFirst operator @@ -20660,34 +20785,6 @@ are limited. NS -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - StddevPop `stddev_pop` Aggregation computing population standard deviation @@ -20978,6 +21075,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + Sum `sum` Sum aggregate operator @@ -21123,34 +21248,6 @@ are limited. -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - VariancePop `var_pop` Aggregation computing population variance @@ -21441,6 +21538,34 @@ are limited. +Expression +SQL Functions(s) +Description +Notes +Context +Param/Output +BOOLEAN +BYTE +SHORT +INT +LONG +FLOAT +DOUBLE +DATE +TIMESTAMP +STRING +DECIMAL +NULL +BINARY +CALENDAR +ARRAY +MAP +STRUCT +UDT +DAYTIME +YEARMONTH + + NormalizeNaNAndZero Normalize NaN and zero @@ -21520,34 +21645,6 @@ are limited. NS -Expression -SQL Functions(s) -Description -Notes -Context -Param/Output -BOOLEAN -BYTE -SHORT -INT -LONG -FLOAT -DOUBLE -DATE -TIMESTAMP -STRING -DECIMAL -NULL -BINARY -CALENDAR -ARRAY -MAP -STRUCT -UDT -DAYTIME -YEARMONTH - - HiveGenericUDF Hive Generic UDF, the UDF can choose to implement a RAPIDS accelerated interface to get better performance @@ -23058,8 +23155,8 @@ dates or timestamps, or for a lack of type coercion support. S S S -S -PS
UTC is only supported TZ for TIMESTAMP
+PS
DATE is not supported by default due to compatibility
+PS
TIMESTAMP is not supported by default due to compatibility;
UTC is only supported TZ for TIMESTAMP
S S diff --git a/integration_tests/README.md b/integration_tests/README.md index 7c1486418aa..f8b6d9510ff 100644 --- a/integration_tests/README.md +++ b/integration_tests/README.md @@ -263,7 +263,7 @@ individually, so you don't risk running unit tests along with the integration te http://www.scalatest.org/user_guide/using_the_scalatest_shell ```shell -spark-shell --jars rapids-4-spark-tests_2.12-24.10.1-tests.jar,rapids-4-spark-integration-tests_2.12-24.10.1-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar +spark-shell --jars rapids-4-spark-tests_2.12-24.12.0-tests.jar,rapids-4-spark-integration-tests_2.12-24.12.0-tests.jar,scalatest_2.12-3.0.5.jar,scalactic_2.12-3.0.5.jar ``` First you import the `scalatest_shell` and tell the tests where they can find the test files you @@ -286,7 +286,7 @@ If you just want to verify the SQL replacement is working you will need to add t assumes CUDA 11.0 is being used and the Spark distribution is built with Scala 2.12. ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.10.1-cuda11.jar" ./runtests.py +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.12.0-cuda11.jar" ./runtests.py ``` You don't have to enable the plugin for this to work, the test framework will do that for you. @@ -443,7 +443,7 @@ To run cudf_udf tests, need following configuration changes: As an example, here is the `spark-submit` command with the cudf_udf parameter on CUDA 11.0: ``` -$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.10.1-cuda11.jar,rapids-4-spark-tests_2.12-24.10.1.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-24.10.1-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-24.10.1-cuda11.jar" ./runtests.py --cudf_udf +$SPARK_HOME/bin/spark-submit --jars "rapids-4-spark_2.12-24.12.0-cuda11.jar,rapids-4-spark-tests_2.12-24.12.0.jar" --conf spark.rapids.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.memory.gpu.allocFraction=0.3 --conf spark.rapids.python.concurrentPythonWorkers=2 --py-files "rapids-4-spark_2.12-24.12.0-cuda11.jar" --conf spark.executorEnv.PYTHONPATH="rapids-4-spark_2.12-24.12.0-cuda11.jar" ./runtests.py --cudf_udf ``` ### Enabling fuzz tests diff --git a/integration_tests/ScaleTest.md b/integration_tests/ScaleTest.md index a72125f59f9..bea34954cab 100644 --- a/integration_tests/ScaleTest.md +++ b/integration_tests/ScaleTest.md @@ -97,7 +97,7 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.sql.parquet.datetimeRebaseModeInWrite=CORRECTED \ --jars $SPARK_HOME/examples/jars/scopt_2.12-3.7.1.jar \ --class com.nvidia.spark.rapids.tests.scaletest.ScaleTest \ -./target/rapids-4-spark-integration-tests_2.12-24.10.1-spark332.jar \ +./target/rapids-4-spark-integration-tests_2.12-24.12.0-spark332.jar \ 10 \ 100 \ parquet \ diff --git a/integration_tests/pom.xml b/integration_tests/pom.xml index 19fcf18ba83..bac78bce0df 100644 --- a/integration_tests/pom.xml +++ b/integration_tests/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.12 - 24.10.1 + 24.12.0 ../shim-deps/pom.xml rapids-4-spark-integration-tests_2.12 - 24.10.1 + 24.12.0 integration_tests diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index 22a23349791..9bd72b2ada0 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -28,10 +28,10 @@ else [[ ! 
-x "$(command -v zip)" ]] && { echo "fail to find zip command in $PATH"; exit 1; } PY4J_TMP=("${SPARK_HOME}"/python/lib/py4j-*-src.zip) PY4J_FILE=${PY4J_TMP[0]} - # PySpark uses ".dev0" for "-SNAPSHOT", ".dev" for "preview" + # PySpark uses ".dev0" for "-SNAPSHOT" and either ".dev" for "preview" or ".devN" for "previewN" # https://github.com/apache/spark/blob/66f25e314032d562567620806057fcecc8b71f08/dev/create-release/release-build.sh#L267 VERSION_STRING=$(PYTHONPATH=${SPARK_HOME}/python:${PY4J_FILE} python -c \ - "import pyspark, re; print(re.sub('\.dev0?$', '', pyspark.__version__))" + "import pyspark, re; print(re.sub('\.dev[012]?$', '', pyspark.__version__))" ) SCALA_VERSION=`$SPARK_HOME/bin/pyspark --version 2>&1| grep Scala | awk '{split($4,v,"."); printf "%s.%s", v[1], v[2]}'` diff --git a/integration_tests/src/main/python/aqe_test.py b/integration_tests/src/main/python/aqe_test.py index 3e10f6e9148..5b3b04efdfb 100755 --- a/integration_tests/src/main/python/aqe_test.py +++ b/integration_tests/src/main/python/aqe_test.py @@ -19,7 +19,7 @@ from conftest import is_databricks_runtime, is_not_utc from data_gen import * from marks import ignore_order, allow_non_gpu -from spark_session import with_cpu_session, is_databricks113_or_later, is_before_spark_330 +from spark_session import with_cpu_session, is_databricks113_or_later, is_before_spark_330, is_databricks_version_or_later # allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653' not_utc_aqe_allow=['ShuffleExchangeExec', 'HashAggregateExec'] if is_not_utc() else [] @@ -338,6 +338,8 @@ def do_it(spark): # this should be fixed by https://github.com/NVIDIA/spark-rapids/issues/11120 aqe_join_with_dpp_fallback=["FilterExec"] if (is_databricks_runtime() or is_before_spark_330()) else [] +if is_databricks_version_or_later(14, 3): + aqe_join_with_dpp_fallback.append("CollectLimitExec") # Verify that DPP and AQE can coexist in even some odd cases involving multiple tables @ignore_order(local=True) diff --git a/integration_tests/src/main/python/cast_test.py b/integration_tests/src/main/python/cast_test.py index 044f1d46322..83cc4922b3b 100644 --- a/integration_tests/src/main/python/cast_test.py +++ b/integration_tests/src/main/python/cast_test.py @@ -18,7 +18,7 @@ from conftest import is_not_utc, is_supported_time_zone, is_dataproc_serverless_runtime from data_gen import * from spark_session import * -from marks import allow_non_gpu, approximate_float, datagen_overrides, tz_sensitive_test +from marks import allow_non_gpu, approximate_float, datagen_overrides, disable_ansi_mode, tz_sensitive_test from pyspark.sql.types import * from spark_init_internal import spark_version from datetime import date, datetime @@ -26,13 +26,27 @@ _decimal_gen_36_5 = DecimalGen(precision=36, scale=5) -def test_cast_empty_string_to_int(): + +def test_cast_empty_string_to_int_ansi_off(): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, StringGen(pattern="")).selectExpr( 'CAST(a as BYTE)', 'CAST(a as SHORT)', 'CAST(a as INTEGER)', - 'CAST(a as LONG)')) + 'CAST(a as LONG)'), + conf=ansi_disabled_conf) + + +@pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/11552") +def test_cast_empty_string_to_int_ansi_on(): + assert_gpu_and_cpu_error( + lambda spark : unary_op_df(spark, StringGen(pattern="")).selectExpr( + 'CAST(a as BYTE)', + 'CAST(a as SHORT)', + 'CAST(a as INTEGER)', + 'CAST(a as LONG)').collect(), + conf=ansi_enabled_conf, + error_message="cannot be cast to ") # These 
tests are not intended to be exhaustive. The scala test CastOpSuite should cover # just about everything for non-nested values. This is intended to check that the @@ -61,12 +75,22 @@ def test_cast_nested(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type))) -def test_cast_string_date_valid_format(): +def test_cast_string_date_valid_format_ansi_off(): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, StringGen(date_start_1_1_1)).select(f.col('a').cast(DateType())), - conf = {'spark.rapids.sql.hasExtendedYearValues': 'false'}) + conf = copy_and_update(ansi_disabled_conf, {'spark.rapids.sql.hasExtendedYearValues': False})) + + +@pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/11556") +def test_cast_string_date_valid_format_ansi_on(): + # In Spark 3.2.0+ the valid format changed, and we cannot support all formats. + # This provides values that are valid in all of those formats. + assert_gpu_and_cpu_error( + lambda spark : unary_op_df(spark, StringGen(date_start_1_1_1)).select(f.col('a').cast(DateType())).collect(), + conf = copy_and_update(ansi_enabled_conf, {'spark.rapids.sql.hasExtendedYearValues': False}), + error_message="One or more values could not be converted to DateType") invalid_values_string_to_date = ['200', ' 1970A', '1970 A', '1970T', # not conform to "yyyy" after trim '1970 T', ' 1970-01T', '1970-01 A', # not conform to "yyyy-[M]M" after trim @@ -94,8 +118,8 @@ def test_cast_string_date_invalid_ansi_before_320(): data_rows = [(v,) for v in values_string_to_data] assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), - conf={'spark.rapids.sql.hasExtendedYearValues': 'false', - 'spark.sql.ansi.enabled': 'true'}, ) + conf={'spark.rapids.sql.hasExtendedYearValues': False, + 'spark.sql.ansi.enabled': True}, ) # test Spark versions >= 320 and databricks, ANSI mode, valid values @pytest.mark.skipif(is_before_spark_320(), reason="Spark versions(< 320) not support Ansi mode when casting string to date") @@ -103,8 +127,8 @@ def test_cast_string_date_valid_ansi(): data_rows = [(v,) for v in valid_values_string_to_date] assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())), - conf={'spark.rapids.sql.hasExtendedYearValues': 'false', - 'spark.sql.ansi.enabled': 'true'}) + conf={'spark.rapids.sql.hasExtendedYearValues': False, + 'spark.sql.ansi.enabled': True}) # test Spark versions >= 320, ANSI mode @pytest.mark.skipif(is_before_spark_320(), reason="ansi cast(string as date) throws exception only in 3.2.0+") @@ -112,8 +136,8 @@ def test_cast_string_date_valid_ansi(): def test_cast_string_date_invalid_ansi(invalid): assert_gpu_and_cpu_error( lambda spark: spark.createDataFrame([(invalid,)], "a string").select(f.col('a').cast(DateType())).collect(), - conf={'spark.rapids.sql.hasExtendedYearValues': 'false', - 'spark.sql.ansi.enabled': 'true'}, + conf={'spark.rapids.sql.hasExtendedYearValues': False, + 'spark.sql.ansi.enabled': True}, error_message="DateTimeException") @@ -144,7 +168,8 @@ def test_cast_string_date_non_ansi(): data_rows = [(v,) for v in values_string_to_data] assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.createDataFrame(data_rows, "a 
string").select(f.col('a').cast(DateType())), - conf={'spark.rapids.sql.hasExtendedYearValues': 'false'}) + conf=copy_and_update(ansi_disabled_conf, {'spark.rapids.sql.hasExtendedYearValues': False})) + @pytest.mark.parametrize('data_gen', [StringGen(date_start_1_1_1), StringGen(date_start_1_1_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'), @@ -153,32 +178,65 @@ def test_cast_string_date_non_ansi(): ids=idfn) @tz_sensitive_test @allow_non_gpu(*non_utc_allow) -def test_cast_string_ts_valid_format(data_gen): +def test_cast_string_ts_valid_format_ansi_off(data_gen): # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. # This provides values that are valid in all of those formats. assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(TimestampType())), - conf = {'spark.rapids.sql.hasExtendedYearValues': 'false', - 'spark.rapids.sql.castStringToTimestamp.enabled': 'true'}) + conf = copy_and_update(ansi_disabled_conf, + {'spark.rapids.sql.hasExtendedYearValues': False, + 'spark.rapids.sql.castStringToTimestamp.enabled': True})) + + +@pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/11556") +@pytest.mark.parametrize('data_gen', [StringGen(date_start_1_1_1)], + ids=idfn) +@tz_sensitive_test +@allow_non_gpu(*non_utc_allow) +def test_cast_string_ts_valid_format_ansi_on(data_gen): + # In Spark 3.2.0+ the valid format changed, and we cannot support all of the format. + # This provides values that are valid in all of those formats. + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(TimestampType())), + conf = copy_and_update(ansi_enabled_conf, + {'spark.rapids.sql.hasExtendedYearValues': False, + 'spark.rapids.sql.castStringToTimestamp.enabled': True})) + @allow_non_gpu('ProjectExec', 'Cast', 'Alias') @pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0+ do we have issues with extended years") -def test_cast_string_date_fallback(): +def test_cast_string_date_fallback_ansi_off(): + """ + This tests that STRING->DATE conversion is run on CPU, via a fallback. + The point of this test is to exercise the fallback, and not to examine any errors in casting. + There is no change in behaviour between Apache Spark and the plugin, since they're both + exercising the CPU implementation. Therefore, this needn't be tested with ANSI enabled. + """ assert_gpu_fallback_collect( # Cast back to String because this goes beyond what python can support for years lambda spark : unary_op_df(spark, StringGen('([0-9]|-|\\+){4,12}')).select(f.col('a').cast(DateType()).cast(StringType())), - 'Cast') + 'Cast', + conf=ansi_disabled_conf) @allow_non_gpu('ProjectExec', 'Cast', 'Alias') @pytest.mark.skipif(is_before_spark_320(), reason="Only in Spark 3.2.0+ do we have issues with extended years") def test_cast_string_timestamp_fallback(): + """ + This tests that STRING->TIMESTAMP conversion is run on CPU, via a fallback. + The point of this test is to exercise the fallback, and not to examine any errors in casting. + There is no change in behaviour between Apache Spark and the plugin, since they're both + exercising the CPU implementation. Therefore, this needn't be tested with ANSI enabled. 
+ """ assert_gpu_fallback_collect( # Cast back to String because this goes beyond what python can support for years lambda spark : unary_op_df(spark, StringGen('([0-9]|-|\\+){4,12}')).select(f.col('a').cast(TimestampType()).cast(StringType())), 'Cast', - conf = {'spark.rapids.sql.castStringToTimestamp.enabled': 'true'}) + conf = copy_and_update(ansi_disabled_conf, + {'spark.rapids.sql.castStringToTimestamp.enabled': True})) +@disable_ansi_mode # In ANSI mode, there are restrictions to casting DECIMAL to other types. + # ANSI mode behaviour is tested in test_ansi_cast_decimal_to. @approximate_float @pytest.mark.parametrize('data_gen', [ decimal_gen_32bit, @@ -191,10 +249,10 @@ def test_cast_string_timestamp_fallback(): DecimalGen(precision=38, scale=10), DecimalGen(precision=36, scale=-5), DecimalGen(precision=38, scale=-10)], ids=meta_idfn('from:')) @pytest.mark.parametrize('to_type', [ByteType(), ShortType(), IntegerType(), LongType(), FloatType(), DoubleType(), StringType()], ids=meta_idfn('to:')) -def test_cast_decimal_to(data_gen, to_type): +def test_with_ansi_disabled_cast_decimal_to(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type), f.col('a')), - conf = {'spark.rapids.sql.castDecimalToFloat.enabled': 'true'}) + conf = {'spark.rapids.sql.castDecimalToFloat.enabled': True}) @approximate_float @pytest.mark.parametrize('data_gen', [ @@ -210,6 +268,8 @@ def test_ansi_cast_decimal_to(data_gen, to_type): conf = {'spark.rapids.sql.castDecimalToFloat.enabled': True, 'spark.sql.ansi.enabled': True}) + +@disable_ansi_mode # With ANSI enabled, casting from wider to narrower types will fail. @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/10050') @pytest.mark.parametrize('data_gen', [ DecimalGen(7, 1), @@ -226,10 +286,24 @@ def test_ansi_cast_decimal_to(data_gen, to_type): DecimalType(30, -4), DecimalType(38, -10), DecimalType(1, -1)], ids=meta_idfn('to:')) -def test_cast_decimal_to_decimal(data_gen, to_type): +def test_with_ansi_disabled_cast_decimal_to_decimal(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type), f.col('a'))) + +@pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/11550") +@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/10050') +@pytest.mark.parametrize('data_gen', [ + DecimalGen(3, 0)], ids=meta_idfn('from:')) +@pytest.mark.parametrize('to_type', [ + DecimalType(1, -1)], ids=meta_idfn('to:')) +def test_ansi_cast_failures_decimal_to_decimal(data_gen, to_type): + assert_gpu_and_cpu_error( + lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type), f.col('a')).collect(), + conf=ansi_enabled_conf, + error_message="overflow occurred") + + @pytest.mark.parametrize('data_gen', [byte_gen, short_gen, int_gen, long_gen], ids=idfn) @pytest.mark.parametrize('to_type', [ DecimalType(2, 0), @@ -240,10 +314,21 @@ def test_cast_decimal_to_decimal(data_gen, to_type): DecimalType(10, 2), DecimalType(18, 0), DecimalType(18, 2)], ids=idfn) -def test_cast_integral_to_decimal(data_gen, to_type): +def test_cast_integral_to_decimal_ansi_off(data_gen, to_type): + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).select( + f.col('a').cast(to_type)), + conf=ansi_disabled_conf) + + +@pytest.mark.skip("https://github.com/NVIDIA/spark-rapids/issues/11550") +@pytest.mark.parametrize('data_gen', [long_gen], 
ids=idfn) +@pytest.mark.parametrize('to_type', [DecimalType(2, 0)], ids=idfn) +def test_cast_integral_to_decimal_ansi_on(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select( - f.col('a').cast(to_type))) + f.col('a').cast(to_type)), + conf=ansi_enabled_conf) def test_cast_byte_to_decimal_overflow(): assert_gpu_and_cpu_are_equal_collect( @@ -278,11 +363,28 @@ def test_cast_long_to_decimal_overflow(): DecimalType(30, 3), DecimalType(5, -3), DecimalType(3, 0)], ids=idfn) -def test_cast_floating_point_to_decimal(data_gen, to_type): +def test_cast_floating_point_to_decimal_ansi_off(data_gen, to_type): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).select( f.col('a'), f.col('a').cast(to_type)), - conf={'spark.rapids.sql.castFloatToDecimal.enabled': 'true'}) + conf=copy_and_update( + ansi_disabled_conf, + {'spark.rapids.sql.castFloatToDecimal.enabled': True})) + + +@pytest.mark.skip("https://github.com/NVIDIA/spark-rapids/issues/11550") +@pytest.mark.parametrize('data_gen', [FloatGen(special_cases=_float_special_cases)]) +@pytest.mark.parametrize('to_type', [DecimalType(7, 1)]) +def test_cast_floating_point_to_decimal_ansi_on(data_gen, to_type): + assert_gpu_and_cpu_error( + lambda spark : unary_op_df(spark, data_gen).select( + f.col('a'), + f.col('a').cast(to_type)).collect(), + conf=copy_and_update( + ansi_enabled_conf, + {'spark.rapids.sql.castFloatToDecimal.enabled': True}), + error_message="[NUMERIC_VALUE_OUT_OF_RANGE.WITH_SUGGESTION]") + # casting these types to string should be passed basic_gens_for_cast_to_string = [ByteGen, ShortGen, IntegerGen, LongGen, StringGen, BooleanGen, DateGen, TimestampGen] @@ -323,7 +425,7 @@ def _assert_cast_to_string_equal (data_gen, conf): @pytest.mark.parametrize('data_gen', all_array_gens_for_cast_to_string, ids=idfn) -@pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.parametrize('legacy', [True, False]) @allow_non_gpu(*non_utc_allow) def test_cast_array_to_string(data_gen, legacy): _assert_cast_to_string_equal( @@ -347,18 +449,18 @@ def test_cast_double_to_string(): assert from_cpu_float == from_gpu_float @pytest.mark.parametrize('data_gen', [ArrayGen(sub) for sub in not_matched_struct_array_gens_for_cast_to_string], ids=idfn) -@pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.parametrize('legacy', [True, False]) @pytest.mark.xfail(reason='casting this type to string is not exact match') def test_cast_array_with_unmatched_element_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, - {"spark.rapids.sql.castFloatToString.enabled" : "true", + {"spark.rapids.sql.castFloatToString.enabled" : True, "spark.sql.legacy.castComplexTypesToString.enabled": legacy} ) @pytest.mark.parametrize('data_gen', basic_map_gens_for_cast_to_string, ids=idfn) -@pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.parametrize('legacy', [True, False]) @allow_non_gpu(*non_utc_allow) def test_cast_map_to_string(data_gen, legacy): _assert_cast_to_string_equal( @@ -367,18 +469,18 @@ def test_cast_map_to_string(data_gen, legacy): @pytest.mark.parametrize('data_gen', not_matched_map_gens_for_cast_to_string, ids=idfn) -@pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.parametrize('legacy', [True, False]) @pytest.mark.xfail(reason='casting this type to string is not exact match') def test_cast_map_with_unmatched_element_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, - 
{"spark.rapids.sql.castFloatToString.enabled" : "true", + {"spark.rapids.sql.castFloatToString.enabled" : True, "spark.sql.legacy.castComplexTypesToString.enabled": legacy} ) @pytest.mark.parametrize('data_gen', [StructGen([[str(i), gen] for i, gen in enumerate(basic_array_struct_gens_for_cast_to_string)] + [["map", MapGen(ByteGen(nullable=False), null_gen)]])], ids=idfn) -@pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.parametrize('legacy', [True, False]) @allow_non_gpu(*non_utc_allow) def test_cast_struct_to_string(data_gen, legacy): _assert_cast_to_string_equal( @@ -400,7 +502,7 @@ def was_broken_for_nested_null(spark): assert_gpu_and_cpu_are_equal_collect( was_broken_for_nested_null, - {"spark.sql.legacy.castComplexTypesToString.enabled": 'true' if cast_conf == 'LEGACY' else 'false'} + {"spark.sql.legacy.castComplexTypesToString.enabled": True if cast_conf == 'LEGACY' else False} ) # https://github.com/NVIDIA/spark-rapids/issues/2315 @@ -417,16 +519,16 @@ def broken_df(spark): assert_gpu_and_cpu_are_equal_collect( broken_df, - {"spark.sql.legacy.castComplexTypesToString.enabled": 'true' if cast_conf == 'LEGACY' else 'false'} + {"spark.sql.legacy.castComplexTypesToString.enabled": True if cast_conf == 'LEGACY' else False} ) @pytest.mark.parametrize('data_gen', [StructGen([["first", element_gen]]) for element_gen in not_matched_struct_array_gens_for_cast_to_string], ids=idfn) -@pytest.mark.parametrize('legacy', ['true', 'false']) +@pytest.mark.parametrize('legacy', [True, False]) @pytest.mark.xfail(reason='casting this type to string is not an exact match') def test_cast_struct_with_unmatched_element_to_string(data_gen, legacy): _assert_cast_to_string_equal( data_gen, - {"spark.rapids.sql.castFloatToString.enabled" : "true", + {"spark.rapids.sql.castFloatToString.enabled" : True, "spark.sql.legacy.castComplexTypesToString.enabled": legacy} ) @@ -481,13 +583,17 @@ def getDf(spark): # non ansi mode, will get null @pytest.mark.parametrize('type', [DoubleType(), FloatType()], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_cast_float_to_timestamp_for_nan_inf(type): +def test_with_ansi_off_cast_float_to_timestamp_for_nan_inf(type): + """ + Tests the behaviour of floats when cast to timestamp, with ANSI disabled. + ANSI mode tests are covered in test_cast_float_to_timestamp_ansi_for_nan_inf. + """ def fun(spark): data = [(float("inf"),), (float("-inf"),), (float("nan"),)] schema = StructType([StructField("value", type, True)]) df = spark.createDataFrame(data, schema) return df.select(f.col('value').cast(TimestampType())) - assert_gpu_and_cpu_are_equal_collect(fun) + assert_gpu_and_cpu_are_equal_collect(fun, conf=ansi_disabled_conf) # gen for casting long to timestamp, range is about in [0000, 9999] long_gen_to_timestamp = LongGen(max_val=math.floor((9999-1970) * 365 * 86400), @@ -554,11 +660,20 @@ def test_cast_timestamp_to_numeric_ansi_no_overflow(): "cast(value as float)", "cast(value as double)"), conf=ansi_enabled_conf) + +@pytest.mark.skipif(is_databricks_runtime() and is_databricks_version_or_later(14, 3), + reason="https://github.com/NVIDIA/spark-rapids/issues/11555") +@pytest.mark.skipif(not is_databricks_runtime() and is_spark_400_or_later(), + reason="https://github.com/NVIDIA/spark-rapids/issues/11555") def test_cast_timestamp_to_numeric_non_ansi(): + """ + Test timestamp->numeric conversions with ANSI off. 
+ """ assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, timestamp_gen) .selectExpr("cast(a as byte)", "cast(a as short)", "cast(a as int)", "cast(a as long)", - "cast(a as float)", "cast(a as double)")) + "cast(a as float)", "cast(a as double)"), + conf=ansi_disabled_conf) @allow_non_gpu(*non_utc_allow) def test_cast_timestamp_to_string(): @@ -735,9 +850,16 @@ def test_cast_fallback_not_UTC(from_gen, to_type): lambda spark: unary_op_df(spark, from_gen).selectExpr("CAST(a AS {}) as casted".format(to_type)), "Cast", {"spark.sql.session.timeZone": "+08", - "spark.rapids.sql.castStringToTimestamp.enabled": "true"}) + "spark.rapids.sql.castStringToTimestamp.enabled": True}) -def test_cast_date_integral_and_fp(): + +def test_cast_date_integral_and_fp_ansi_off(): + """ + This tests that a date column can be cast to different numeric/floating-point types. + This needs to be tested with ANSI disabled, because none of these conversions are + ANSI-compliant. + """ assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, date_gen).selectExpr( - "cast(a as boolean)", "cast(a as byte)", "cast(a as short)", "cast(a as int)", "cast(a as long)", "cast(a as float)", "cast(a as double)")) + "cast(a as boolean)", "cast(a as byte)", "cast(a as short)", "cast(a as int)", "cast(a as long)", "cast(a as float)", "cast(a as double)"), + conf=ansi_disabled_conf) diff --git a/integration_tests/src/main/python/collection_ops_test.py b/integration_tests/src/main/python/collection_ops_test.py index 099eb28c053..4aef35b0b59 100644 --- a/integration_tests/src/main/python/collection_ops_test.py +++ b/integration_tests/src/main/python/collection_ops_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,10 +17,11 @@ from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error from data_gen import * from pyspark.sql.types import * + from string_test import mk_str_gen import pyspark.sql.functions as f import pyspark.sql.utils -from spark_session import with_cpu_session, with_gpu_session, is_before_spark_334, is_before_spark_351, is_before_spark_342, is_before_spark_340, is_spark_350 +from spark_session import with_cpu_session, with_gpu_session, is_before_spark_334, is_before_spark_342, is_before_spark_340, is_databricks_version_or_later, is_spark_350, is_spark_400_or_later from conftest import get_datagen_seed from marks import allow_non_gpu @@ -326,8 +327,12 @@ def test_sequence_illegal_boundaries(start_gen, stop_gen, step_gen): @pytest.mark.parametrize('stop_gen', sequence_too_long_length_gens, ids=idfn) @allow_non_gpu(*non_utc_allow) def test_sequence_too_long_sequence(stop_gen): - msg = "Too long sequence" if is_before_spark_334() or (not is_before_spark_340() and is_before_spark_342()) \ - or is_spark_350() else "Unsuccessful try to create array with" + msg = "Too long sequence" if is_before_spark_334() \ + or (not is_before_spark_340() and is_before_spark_342()) \ + or (is_spark_350() and not is_databricks_version_or_later(14, 3)) \ + else "Can't create array" if ((is_databricks_version_or_later(14, 3)) + or is_spark_400_or_later()) \ + else "Unsuccessful try to create array with" assert_gpu_and_cpu_error( # To avoid OOM, reduce the row number to 1, it is enough to verify this case. 
lambda spark:unary_op_df(spark, stop_gen, 1).selectExpr( diff --git a/integration_tests/src/main/python/conditionals_test.py b/integration_tests/src/main/python/conditionals_test.py index b95ed53f398..aaa390476a4 100644 --- a/integration_tests/src/main/python/conditionals_test.py +++ b/integration_tests/src/main/python/conditionals_test.py @@ -379,3 +379,30 @@ def test_case_when_all_then_values_are_scalars_with_nulls(): "tab", sql_without_else, conf = {'spark.rapids.sql.case_when.fuse': 'true'}) + +@pytest.mark.parametrize('combine_string_contains_enabled', [True, False]) +def test_combine_string_contains_in_case_when(combine_string_contains_enabled): + data_gen = [("c1", string_gen)] + sql = """ + SELECT + CASE + WHEN INSTR(c1, 'a') > 0 THEN 'a' + WHEN INSTR(c1, 'b') > 0 THEN 'b' + WHEN INSTR(c1, 'c') > 0 THEN 'c' + ELSE '' + END as output_1, + CASE + WHEN INSTR(c1, 'c') > 0 THEN 'c' + WHEN INSTR(c1, 'd') > 0 THEN 'd' + WHEN INSTR(c1, 'e') > 0 THEN 'e' + ELSE '' + END as output_2 + from tab + """ + # spark.rapids.sql.combined.expressions.enabled is true by default + assert_gpu_and_cpu_are_equal_sql( + lambda spark : gen_df(spark, data_gen), + "tab", + sql, + { "spark.rapids.sql.expression.combined.GpuContains" : combine_string_contains_enabled} + ) diff --git a/integration_tests/src/main/python/datasourcev2_write_test.py b/integration_tests/src/main/python/datasourcev2_write_test.py index 1f4bc133d2a..4fffd10ab44 100644 --- a/integration_tests/src/main/python/datasourcev2_write_test.py +++ b/integration_tests/src/main/python/datasourcev2_write_test.py @@ -18,7 +18,7 @@ from data_gen import gen_df, decimal_gens, non_utc_allow from marks import * from spark_session import is_hive_available, is_spark_330_or_later, with_cpu_session, with_gpu_session -from hive_parquet_write_test import _hive_bucket_gens, _hive_array_gens, _hive_struct_gens +from hive_parquet_write_test import _hive_bucket_gens_sans_bools, _hive_array_gens, _hive_struct_gens from hive_parquet_write_test import read_single_bucket _hive_write_conf = { @@ -33,9 +33,11 @@ @allow_non_gpu(*non_utc_allow) def test_write_hive_bucketed_table(spark_tmp_table_factory, file_format): num_rows = 2048 - + # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and + # https://github.com/rapidsai/cudf/issues/6763 . 
+ # Once the first issue is fixed, add back boolean_gen def gen_table(spark): - gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens)] + gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens_sans_bools)] types_sql_str = ','.join('{} {}'.format( name, gen.data_type.simpleString()) for name, gen in gen_list) col_names_str = ','.join(name for name, gen in gen_list) diff --git a/integration_tests/src/main/python/date_time_test.py b/integration_tests/src/main/python/date_time_test.py index f83c238a70c..1a7024dac85 100644 --- a/integration_tests/src/main/python/date_time_test.py +++ b/integration_tests/src/main/python/date_time_test.py @@ -17,7 +17,7 @@ from conftest import is_utc, is_supported_time_zone, get_test_tz from data_gen import * from datetime import date, datetime, timezone -from marks import allow_non_gpu, datagen_overrides, disable_ansi_mode, ignore_order, incompat, tz_sensitive_test +from marks import allow_non_gpu, approximate_float, datagen_overrides, disable_ansi_mode, ignore_order, incompat, tz_sensitive_test from pyspark.sql.types import * from spark_session import with_cpu_session, is_before_spark_330, is_before_spark_350 import pyspark.sql.functions as f @@ -139,6 +139,39 @@ def test_datediff(data_gen): hms_fallback = ['ProjectExec'] if not is_supported_time_zone() else [] +@allow_non_gpu(*non_utc_tz_allow) +def test_months_between(): + assert_gpu_and_cpu_are_equal_collect( + lambda spark : binary_op_df(spark, timestamp_gen).selectExpr('months_between(a, b, false)')) + +@allow_non_gpu(*non_utc_tz_allow) +def test_months_between_first_day(): + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2024-01-01", false)')) + +@allow_non_gpu(*non_utc_tz_allow) +def test_months_between_last_day(): + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2023-12-31", false)')) + +@allow_non_gpu(*non_utc_tz_allow) +@approximate_float() +def test_months_between_round(): + assert_gpu_and_cpu_are_equal_collect( + lambda spark : binary_op_df(spark, timestamp_gen).selectExpr('months_between(a, b, true)')) + +@allow_non_gpu(*non_utc_tz_allow) +@approximate_float() +def test_months_between_first_day_round(): + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2024-01-01", true)')) + +@allow_non_gpu(*non_utc_tz_allow) +@approximate_float() +def test_months_between_last_day_round(): + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2023-12-31", true)')) + @allow_non_gpu(*hms_fallback) def test_hour(): assert_gpu_and_cpu_are_equal_collect( @@ -459,19 +492,39 @@ def test_to_timestamp(parser_policy): .select(f.col("a"), f.to_timestamp(f.col("a"), "yyyy-MM-dd HH:mm:ss")), { "spark.sql.legacy.timeParserPolicy": parser_policy}) +# mm: minute; MM: month @pytest.mark.skipif(not is_supported_time_zone(), reason="not all time zones are supported now, refer to https://github.com/NVIDIA/spark-rapids/issues/6839, please update after all time zones are supported") +@pytest.mark.parametrize("format", ['yyyyMMdd', 'yyyymmdd'], ids=idfn) # Test years after 1900, refer to issues: https://github.com/NVIDIA/spark-rapids/issues/11543, https://github.com/NVIDIA/spark-rapids/issues/11539 @pytest.mark.skipif(get_test_tz() != "Asia/Shanghai" and get_test_tz() != "UTC", 
reason="https://github.com/NVIDIA/spark-rapids/issues/11562") -def test_yyyyMMdd_format_for_legacy_mode(): +def test_formats_for_legacy_mode(format): gen = StringGen('(19[0-9]{2}|[2-9][0-9]{3})([0-9]{4})') assert_gpu_and_cpu_are_equal_sql( lambda spark : unary_op_df(spark, gen), "tab", - '''select unix_timestamp(a, 'yyyyMMdd'), - from_unixtime(unix_timestamp(a, 'yyyyMMdd'), 'yyyyMMdd'), - date_format(to_timestamp(a, 'yyyyMMdd'), 'yyyyMMdd') + '''select unix_timestamp(a, '{}'), + from_unixtime(unix_timestamp(a, '{}'), '{}'), + date_format(to_timestamp(a, '{}'), '{}') + from tab + '''.format(format, format, format, format, format), + {'spark.sql.legacy.timeParserPolicy': 'LEGACY', + 'spark.rapids.sql.incompatibleDateFormats.enabled': True}) + +# mm: minute; MM: month +@pytest.mark.skipif(not is_supported_time_zone(), reason="not all time zones are supported now, refer to https://github.com/NVIDIA/spark-rapids/issues/6839, please update after all time zones are supported") +@pytest.mark.skipif(get_test_tz() != "Asia/Shanghai" and get_test_tz() != "UTC", reason="https://github.com/NVIDIA/spark-rapids/issues/11562") +def test_formats_for_legacy_mode_other_formats(): + format = "yyyyMMdd HH:mm:ss" + # Test years after 1900, + gen = StringGen('(19[0-9]{2}|[2-9][0-9]{3})([0-9]{4}) [0-9]{2}:[0-9]{2}:[0-9]{2}') + assert_gpu_and_cpu_are_equal_sql( + lambda spark : unary_op_df(spark, gen), + "tab", + '''select unix_timestamp(a, '{}'), + from_unixtime(unix_timestamp(a, '{}'), '{}'), + date_format(to_timestamp(a, '{}'), '{}') from tab - ''', + '''.format(format, format, format, format, format), {'spark.sql.legacy.timeParserPolicy': 'LEGACY', 'spark.rapids.sql.incompatibleDateFormats.enabled': True}) diff --git a/integration_tests/src/main/python/dpp_test.py b/integration_tests/src/main/python/dpp_test.py index b362a4175f3..3d5ee1a5afa 100644 --- a/integration_tests/src/main/python/dpp_test.py +++ b/integration_tests/src/main/python/dpp_test.py @@ -20,7 +20,7 @@ from conftest import spark_tmp_table_factory from data_gen import * from marks import ignore_order, allow_non_gpu, datagen_overrides, disable_ansi_mode -from spark_session import is_before_spark_320, with_cpu_session, is_before_spark_312, is_databricks_runtime, is_databricks113_or_later +from spark_session import is_before_spark_320, with_cpu_session, is_before_spark_312, is_databricks_runtime, is_databricks113_or_later, is_databricks_version_or_later # non-positive values here can produce a degenerative join, so here we ensure that most values are # positive to ensure the join will produce rows. See https://github.com/NVIDIA/spark-rapids/issues/10147 @@ -167,10 +167,17 @@ def fn(spark): ''' ] +# On some Databricks versions (>=14.3), some query plans include a `CollectLimitExec`, +# when filtering partitions. This exec falls back to CPU. These tests allow for `CollectLimit` to +# run on the CPU, if everything else in the plan execute as expected. +# Further details are furnished at https://github.com/NVIDIA/spark-rapids/issues/11764. 
+dpp_fallback_execs=["CollectLimitExec"] if is_databricks_version_or_later(14,3) else [] + @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 # When BroadcastExchangeExec is available on filtering side, and it can be reused: # DynamicPruningExpression(InSubqueryExec(value, GpuSubqueryBroadcastExec))) @ignore_order +@allow_non_gpu(*dpp_fallback_execs) @datagen_overrides(seed=0, reason="https://github.com/NVIDIA/spark-rapids/issues/10147") @pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn) @pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn) @@ -245,6 +252,7 @@ def test_dpp_bypass(spark_tmp_table_factory, store_format, s_index, aqe_enabled) # then Spark will plan an extra Aggregate to collect filtering values: # DynamicPruningExpression(InSubqueryExec(value, SubqueryExec(Aggregate(...)))) @ignore_order +@allow_non_gpu(*dpp_fallback_execs) @pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn) @pytest.mark.parametrize('s_index', list(range(len(_statements))), ids=idfn) @pytest.mark.parametrize('aqe_enabled', [ @@ -285,10 +293,11 @@ def test_dpp_skip(spark_tmp_table_factory, store_format, s_index, aqe_enabled): non_exist_classes='DynamicPruningExpression', conf=dict(_dpp_fallback_conf + [('spark.sql.adaptive.enabled', aqe_enabled)])) +dpp_like_any_fallback_execs=['FilterExec', 'CollectLimitExec'] if is_databricks_version_or_later(14,3) else ['FilterExec'] # GPU verification on https://issues.apache.org/jira/browse/SPARK-34436 @ignore_order -@allow_non_gpu('FilterExec') +@allow_non_gpu(*dpp_like_any_fallback_execs) @pytest.mark.parametrize('store_format', ['parquet', 'orc'], ids=idfn) @pytest.mark.parametrize('aqe_enabled', [ 'false', @@ -327,6 +336,7 @@ def create_dim_table_for_like(spark): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 +@allow_non_gpu(*dpp_fallback_execs) # Test handling DPP expressions from a HashedRelation that rearranges columns @pytest.mark.parametrize('aqe_enabled', [ 'false', diff --git a/integration_tests/src/main/python/hash_aggregate_test.py b/integration_tests/src/main/python/hash_aggregate_test.py index 734b4dfb708..444e4131724 100644 --- a/integration_tests/src/main/python/hash_aggregate_test.py +++ b/integration_tests/src/main/python/hash_aggregate_test.py @@ -204,6 +204,8 @@ _decimal_gen_36_neg5 = DecimalGen(precision=36, scale=-5) _decimal_gen_38_10 = DecimalGen(precision=38, scale=10) +kudo_enabled_conf_key = "spark.rapids.shuffle.kudo.serializer.enabled" + def get_params(init_list, marked_params=[]): """ @@ -307,7 +309,8 @@ def get_params(init_list, marked_params=[]): @nightly_gpu_mem_consuming_case @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('precision', [38, 37, 36, 35, 34, 33, 32, 31], ids=idfn) -def test_hash_reduction_decimal_overflow_sum(precision): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_reduction_decimal_overflow_sum(precision, kudo_enabled): constant = '9' * precision count = pow(10, 38 - precision) assert_gpu_and_cpu_are_equal_collect( @@ -318,16 +321,20 @@ def test_hash_reduction_decimal_overflow_sum(precision): # run out of memory in some setups. These should not happen in production, because # we really are just doing a really bad job at multiplying to get this result so # some optimizations are conspiring against us. 
- conf = {'spark.rapids.sql.batchSizeBytes': '128m'}) + conf = {'spark.rapids.sql.batchSizeBytes': '128m', + kudo_enabled_conf_key: kudo_enabled}) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', [_longs_with_nulls], ids=idfn) @pytest.mark.parametrize('override_split_until_size', [None, 1], ids=idfn) @pytest.mark.parametrize('override_batch_size_bytes', [None, 1], ids=idfn) -def test_hash_grpby_sum_count_action(data_gen, override_split_until_size, override_batch_size_bytes): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_grpby_sum_count_action(data_gen, override_split_until_size, + override_batch_size_bytes, kudo_enabled): conf = { - 'spark.rapids.sql.test.overrides.splitUntilSize': override_split_until_size + 'spark.rapids.sql.test.overrides.splitUntilSize': override_split_until_size, + kudo_enabled_conf_key: kudo_enabled } if override_batch_size_bytes is not None: conf["spark.rapids.sql.batchSizeBytes"] = override_batch_size_bytes @@ -340,23 +347,29 @@ def test_hash_grpby_sum_count_action(data_gen, override_split_until_size, overri @allow_non_gpu("SortAggregateExec", "SortExec", "ShuffleExchangeExec") @ignore_order @pytest.mark.parametrize('data_gen', _grpkey_nested_structs_with_array_basic_child + _grpkey_list_with_non_nested_children, ids=idfn) -def test_hash_grpby_list_min_max(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_grpby_list_min_max(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( - lambda spark: gen_df(spark, data_gen, length=100).coalesce(1).groupby('a').agg(f.min('b'), f.max('b')) - ) + lambda spark: gen_df(spark, data_gen, length=100).coalesce(1).groupby('a').agg(f.min( + 'b'), f.max('b')), + conf = {kudo_enabled_conf_key: kudo_enabled}) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', [_longs_with_nulls], ids=idfn) -def test_hash_reduction_sum_count_action(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_reduction_sum_count_action(data_gen, kudo_enabled): assert_gpu_and_cpu_row_counts_equal( - lambda spark: gen_df(spark, data_gen, length=100).agg(f.sum('b')) + lambda spark: gen_df(spark, data_gen, length=100).agg(f.sum('b')), + conf = {kudo_enabled_conf_key: kudo_enabled} ) # Make sure that we can do computation in the group by columns @ignore_order @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 -def test_computation_in_grpby_columns(): - conf = {'spark.rapids.sql.batchSizeBytes' : '250'} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_computation_in_grpby_columns(kudo_enabled): + conf = {'spark.rapids.sql.batchSizeBytes' : '250', + kudo_enabled_conf_key: kudo_enabled} data_gen = [ ('a', RepeatSeqGen(StringGen('a{1,20}'), length=50)), ('b', short_gen)] @@ -371,10 +384,12 @@ def test_computation_in_grpby_columns(): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_grpby_sum(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_grpby_sum(data_gen, conf, kudo_enabled): + new_conf = copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, 
data_gen, length=100).groupby('a').agg(f.sum('b')), - conf = conf) + conf = new_conf) @shuffle_test @approximate_float @@ -383,10 +398,12 @@ def test_hash_grpby_sum(data_gen, conf): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', [_grpkey_short_sum_full_decimals, _grpkey_short_sum_full_neg_scale_decimals], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_grpby_sum_full_decimal(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_grpby_sum_full_decimal(data_gen, conf, kudo_enabled): + new_conf = copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100).groupby('a').agg(f.sum('b')), - conf = conf) + conf = new_conf) @approximate_float @datagen_overrides(seed=0, reason="https://github.com/NVIDIA/spark-rapids/issues/9822") @@ -394,10 +411,12 @@ def test_hash_grpby_sum_full_decimal(data_gen, conf): @incompat @pytest.mark.parametrize('data_gen', numeric_gens + decimal_gens + [DecimalGen(precision=36, scale=5)], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_reduction_sum(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_reduction_sum(data_gen, conf, kudo_enabled): + new_conf = copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen, length=100).selectExpr("SUM(a)"), - conf = conf) + conf = new_conf) @approximate_float @ignore_order @@ -406,11 +425,13 @@ def test_hash_reduction_sum(data_gen, conf): @pytest.mark.parametrize('data_gen', numeric_gens + decimal_gens + [ DecimalGen(precision=38, scale=0), DecimalGen(precision=38, scale=-10)], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @datagen_overrides(seed=0, permanent=True, reason='https://github.com/NVIDIA/spark-rapids/issues/9779') -def test_hash_reduction_sum_full_decimal(data_gen, conf): +def test_hash_reduction_sum_full_decimal(data_gen, conf, kudo_enabled): + new_conf = copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen, length=100).selectExpr("SUM(a)"), - conf = conf) + conf = new_conf) @approximate_float @ignore_order @@ -419,10 +440,12 @@ def test_hash_reduction_sum_full_decimal(data_gen, conf): @pytest.mark.parametrize('data_gen', _init_list + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals, _grpkey_short_very_big_decimals, _grpkey_short_sum_full_decimals], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_grpby_avg(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_grpby_avg(data_gen, conf, kudo_enabled): + new_conf = copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=200).groupby('a').agg(f.avg('b')), - conf=conf + conf=new_conf ) # tracks https://github.com/NVIDIA/spark-rapids/issues/154 @@ -438,30 +461,38 @@ def test_hash_grpby_avg(data_gen, conf): @pytest.mark.parametrize('data_gen', [ StructGen(children=[('a', int_gen), ('b', 
int_gen)],nullable=False, special_cases=[((None, None), 400.0), ((None, -1542301795), 100.0)])], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @pytest.mark.xfail(condition=is_databricks104_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/4963') -def test_hash_avg_nulls_partial_only(data_gen): +def test_hash_avg_nulls_partial_only(data_gen, kudo_enabled): + conf = copy_and_update(_float_conf_partial, {kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=2).agg(f.avg('b')), - conf=_float_conf_partial - ) + conf=conf) @approximate_float @ignore_order @incompat @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn) -def test_intersect_all(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_intersect_all(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( - lambda spark : gen_df(spark, data_gen, length=100).intersectAll(gen_df(spark, data_gen, length=100))) + lambda spark : gen_df(spark, data_gen, length=100).intersectAll(gen_df(spark, data_gen, + length=100)), + conf = {kudo_enabled_conf_key: kudo_enabled}) @approximate_float @ignore_order @incompat @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _init_list_with_decimalbig, ids=idfn) -def test_exceptAll(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_exceptAll(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( - lambda spark : gen_df(spark, data_gen, length=100).exceptAll(gen_df(spark, data_gen, length=100).filter('a != b'))) + lambda spark : (gen_df(spark, data_gen, length=100) + .exceptAll(gen_df(spark, data_gen, length=100) + .filter('a != b'))), + conf = {kudo_enabled_conf_key: kudo_enabled}) # Spark fails to sort some decimal values due to overflow when calculating the sorting prefix. 
# See https://issues.apache.org/jira/browse/SPARK-40129 @@ -488,13 +519,14 @@ def test_exceptAll(data_gen): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _pivot_gens_with_decimals, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_grpby_pivot(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_grpby_pivot(data_gen, conf, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .groupby('a') .pivot('b') .agg(f.sum('c')), - conf = conf) + conf = copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled})) @approximate_float @ignore_order(local=True) @@ -503,13 +535,14 @@ def test_hash_grpby_pivot(data_gen, conf): @pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) @datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/10062') -def test_hash_multiple_grpby_pivot(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_multiple_grpby_pivot(data_gen, conf, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .groupby('a','b') .pivot('b') .agg(f.sum('c'), f.max('c')), - conf=conf) + conf=copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled})) @approximate_float @ignore_order(local=True) @@ -517,13 +550,14 @@ def test_hash_multiple_grpby_pivot(data_gen, conf): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_reduction_pivot(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_reduction_pivot(data_gen, conf, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .groupby() .pivot('b') .agg(f.sum('c')), - conf=conf) + conf=copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled})) @approximate_float @@ -533,7 +567,8 @@ def test_hash_reduction_pivot(data_gen, conf): @incompat @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', [_grpkey_floats_with_nulls_and_nans], ids=idfn) -def test_hash_pivot_groupby_duplicates_fallback(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_pivot_groupby_duplicates_fallback(data_gen, kudo_enabled): # PivotFirst will not work on the GPU when pivot_values has duplicates assert_gpu_fallback_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -541,7 +576,7 @@ def test_hash_pivot_groupby_duplicates_fallback(data_gen): .pivot('b', ['10.0', '10.0']) .agg(f.sum('c')), "PivotFirst", - conf=_float_conf) + conf=copy_and_update(_float_conf, {kudo_enabled_conf_key: kudo_enabled}) ) _repeat_agg_column_for_collect_op = [ RepeatSeqGen(BooleanGen(), length=15), @@ -610,43 +645,53 @@ def test_hash_pivot_groupby_duplicates_fallback(data_gen): @ignore_order(local=True) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', [decimal_gen_128bit], ids=idfn) -def test_decimal128_count_reduction(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def 
test_decimal128_count_reduction(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, data_gen).selectExpr('count(a)')) + lambda spark: unary_op_df(spark, data_gen).selectExpr('count(a)'), + conf = {kudo_enabled_conf_key: kudo_enabled}) # very simple test for just a count on decimals 128 values until we can support more with them @ignore_order(local=True) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', [decimal_gen_128bit], ids=idfn) -def test_decimal128_count_group_by(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_decimal128_count_group_by(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, byte_gen, data_gen) .groupby('a') - .agg(f.count('b'))) + .agg(f.count('b')), + conf = {kudo_enabled_conf_key: kudo_enabled}) # very simple test for just a min/max on decimals 128 values until we can support more with them @ignore_order(local=True) @pytest.mark.parametrize('data_gen', [decimal_gen_128bit], ids=idfn) -def test_decimal128_min_max_reduction(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_decimal128_min_max_reduction(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, data_gen).selectExpr('min(a)', 'max(a)')) + lambda spark: unary_op_df(spark, data_gen).selectExpr('min(a)', 'max(a)'), + conf = {kudo_enabled_conf_key: kudo_enabled}) # very simple test for just a min/max on decimals 128 values until we can support more with them @ignore_order(local=True) @pytest.mark.parametrize('data_gen', [decimal_gen_128bit], ids=idfn) -def test_decimal128_min_max_group_by(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_decimal128_min_max_group_by(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, byte_gen, data_gen) .groupby('a') - .agg(f.min('b'), f.max('b'))) + .agg(f.min('b'), f.max('b')), + conf = {kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _all_basic_gens_with_all_nans_cases, ids=idfn) -def test_min_max_group_by(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_min_max_group_by(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, byte_gen, data_gen) .groupby('a') - .agg(f.min('b'), f.max('b'))) + .agg(f.min('b'), f.max('b')), + conf = {kudo_enabled_conf_key: kudo_enabled}) # To avoid ordering issues with collect_list, sorting the arrays that are returned. 
# NOTE: sorting the arrays locally, because sort_array() does not yet @@ -657,18 +702,21 @@ def test_min_max_group_by(data_gen): @ignore_order(local=True, arrays=["blist"]) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_list_op, ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', [True, False], ids=idfn) -def test_hash_groupby_collect_list(data_gen, use_obj_hash_agg): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_collect_list(data_gen, use_obj_hash_agg, kudo_enabled): def doit(spark): return gen_df(spark, data_gen, length=100)\ .groupby('a')\ .agg(f.collect_list('b').alias("blist")) assert_gpu_and_cpu_are_equal_collect( doit, - conf={'spark.sql.execution.useObjectHashAggregateExec': str(use_obj_hash_agg).lower()}) + conf={'spark.sql.execution.useObjectHashAggregateExec': str(use_obj_hash_agg).lower(), + kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @pytest.mark.parametrize('use_obj_hash_agg', [True, False], ids=idfn) -def test_hash_groupby_collect_list_of_maps(use_obj_hash_agg): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_collect_list_of_maps(use_obj_hash_agg, kudo_enabled): gens = [("a", RepeatSeqGen(LongGen(), length=20)), ("b", simple_string_to_string_map_gen)] def doit(spark): df = gen_df(spark, gens, length=100) \ @@ -680,27 +728,32 @@ def doit(spark): return spark.createDataFrame(df.rdd, schema=df.schema).select("a", f.explode("blist")) assert_gpu_and_cpu_are_equal_collect( doit, - conf={'spark.sql.execution.useObjectHashAggregateExec': str(use_obj_hash_agg).lower()}) + conf={'spark.sql.execution.useObjectHashAggregateExec': str(use_obj_hash_agg).lower(), + kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_hash_groupby_collect_set(data_gen): +def test_hash_groupby_collect_set(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .groupby('a') - .agg(f.sort_array(f.collect_set('b')), f.count('b'))) + .agg(f.sort_array(f.collect_set('b')), f.count('b')), + conf = {kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_hash_groupby_collect_set_on_nested_type(data_gen): +def test_hash_groupby_collect_set_on_nested_type(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .groupby('a') - .agg(f.sort_array(f.collect_set('b')))) + .agg(f.sort_array(f.collect_set('b'))), + conf= {kudo_enabled_conf_key: kudo_enabled}) # NOTE: sorting the arrays locally, because sort_array() does not yet @@ -710,9 +763,11 @@ def test_hash_groupby_collect_set_on_nested_type(data_gen): @ignore_order(local=True, arrays=["collect_set"]) @allow_non_gpu("ProjectExec", *non_utc_allow) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op_nested, ids=idfn) -def test_hash_groupby_collect_set_on_nested_array_type(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_collect_set_on_nested_array_type(data_gen, kudo_enabled): conf = 
copy_and_update(_float_conf, { "spark.rapids.sql.castFloatToString.enabled": "true", + kudo_enabled_conf_key: kudo_enabled }) def do_it(spark): @@ -726,19 +781,23 @@ def do_it(spark): @ignore_order(local=True) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_hash_reduction_collect_set(data_gen): +def test_hash_reduction_collect_set(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) - .agg(f.sort_array(f.collect_set('b')), f.count('b'))) + .agg(f.sort_array(f.collect_set('b')), f.count('b')), + conf = {kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_hash_reduction_collect_set_on_nested_type(data_gen): +def test_hash_reduction_collect_set_on_nested_type(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) - .agg(f.sort_array(f.collect_set('b')))) + .agg(f.sort_array(f.collect_set('b'))), + conf= {kudo_enabled_conf_key: kudo_enabled}) # NOTE: sorting the arrays locally, because sort_array() does not yet @@ -748,9 +807,11 @@ def test_hash_reduction_collect_set_on_nested_type(data_gen): @ignore_order(local=True, arrays=["collect_set"]) @allow_non_gpu("ProjectExec", *non_utc_allow) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_set_op_nested, ids=idfn) -def test_hash_reduction_collect_set_on_nested_array_type(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_reduction_collect_set_on_nested_array_type(data_gen, kudo_enabled): conf = copy_and_update(_float_conf, { "spark.rapids.sql.castFloatToString.enabled": "true", + kudo_enabled_conf_key: kudo_enabled }) def do_it(spark): @@ -763,8 +824,9 @@ def do_it(spark): @ignore_order(local=True) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _full_gen_data_for_collect_op, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_hash_groupby_collect_with_single_distinct(data_gen): +def test_hash_groupby_collect_with_single_distinct(data_gen, kudo_enabled): # test collect_ops with other distinct aggregations assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -772,7 +834,8 @@ def test_hash_groupby_collect_with_single_distinct(data_gen): .agg(f.sort_array(f.collect_list('b')), f.sort_array(f.collect_set('b')), f.countDistinct('c'), - f.count('c'))) + f.count('c')), + conf = {kudo_enabled_conf_key: kudo_enabled}) def hash_groupby_single_distinct_collect_impl(data_gen, conf): @@ -798,41 +861,46 @@ def hash_groupby_single_distinct_collect_impl(data_gen, conf): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _gen_data_for_collect_op, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_hash_groupby_single_distinct_collect(data_gen): +def test_hash_groupby_single_distinct_collect(data_gen, kudo_enabled): """ Tests distinct collect, with ANSI disabled. 
The corresponding ANSI-enabled condition is tested in test_hash_groupby_single_distinct_collect_ansi_enabled """ - ansi_disabled_conf = {'spark.sql.ansi.enabled': False} + ansi_disabled_conf = {'spark.sql.ansi.enabled': False, + kudo_enabled_conf_key: kudo_enabled} hash_groupby_single_distinct_collect_impl(data_gen=data_gen, conf=ansi_disabled_conf) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', [_gen_data_for_collect_op[0]], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) @allow_non_gpu('ObjectHashAggregateExec', 'ShuffleExchangeExec') -def test_hash_groupby_single_distinct_collect_ansi_enabled(data_gen): +def test_hash_groupby_single_distinct_collect_ansi_enabled(data_gen, kudo_enabled): """ Tests distinct collect, with ANSI enabled. Enabling ANSI mode causes the plan to include ObjectHashAggregateExec, which runs on CPU. """ - hash_groupby_single_distinct_collect_impl(data_gen=data_gen, conf=ansi_enabled_conf) + hash_groupby_single_distinct_collect_impl(data_gen=data_gen, + conf=copy_and_update(ansi_enabled_conf, {kudo_enabled_conf_key: kudo_enabled})) @ignore_order(local=True) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _gen_data_for_collect_op, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_hash_groupby_collect_with_multi_distinct(data_gen): +def test_hash_groupby_collect_with_multi_distinct(data_gen, kudo_enabled): def spark_fn(spark_session): return gen_df(spark_session, data_gen, length=100).groupby('a').agg( f.sort_array(f.collect_list('b')), f.sort_array(f.collect_set('b')), f.countDistinct('b'), f.countDistinct('c')) - assert_gpu_and_cpu_are_equal_collect(spark_fn) + assert_gpu_and_cpu_are_equal_collect(spark_fn, conf = {kudo_enabled_conf_key: kudo_enabled}) _replace_modes_non_distinct = [ # Spark: GPU(Final) -> CPU(Partial) @@ -851,13 +919,16 @@ def spark_fn(spark_session): @pytest.mark.parametrize('replace_mode', _replace_modes_non_distinct, ids=idfn) @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', ['false', 'true'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) def test_hash_groupby_collect_partial_replace_fallback(data_gen, replace_mode, aqe_enabled, - use_obj_hash_agg): + use_obj_hash_agg, + kudo_enabled): conf = {'spark.rapids.sql.hashAgg.replaceMode': replace_mode, 'spark.sql.adaptive.enabled': aqe_enabled, - 'spark.sql.execution.useObjectHashAggregateExec': use_obj_hash_agg} + 'spark.sql.execution.useObjectHashAggregateExec': use_obj_hash_agg, + kudo_enabled_conf_key: kudo_enabled} cpu_clz, gpu_clz = ['CollectList', 'CollectSet'], ['GpuCollectList', 'GpuCollectSet'] exist_clz, non_exist_clz = [], [] @@ -901,14 +972,17 @@ def test_hash_groupby_collect_partial_replace_fallback(data_gen, @pytest.mark.parametrize('replace_mode', _replace_modes_single_distinct, ids=idfn) @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', ['false', 'true'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @pytest.mark.xfail(condition=is_databricks104_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/4963') def test_hash_groupby_collect_partial_replace_with_distinct_fallback(data_gen, replace_mode, aqe_enabled, - use_obj_hash_agg): + use_obj_hash_agg, + kudo_enabled): 
conf = {'spark.rapids.sql.hashAgg.replaceMode': replace_mode, 'spark.sql.adaptive.enabled': aqe_enabled, - 'spark.sql.execution.useObjectHashAggregateExec': use_obj_hash_agg} + 'spark.sql.execution.useObjectHashAggregateExec': use_obj_hash_agg, + kudo_enabled_conf_key: kudo_enabled} # test with single Distinct assert_cpu_and_gpu_are_equal_collect_with_capture( lambda spark: gen_df(spark, data_gen, length=100) @@ -975,10 +1049,11 @@ def exact_percentile_reduction(df): @datagen_overrides(seed=0, reason="https://github.com/NVIDIA/spark-rapids/issues/10233") @pytest.mark.parametrize('data_gen', exact_percentile_reduction_data_gen, ids=idfn) -def test_exact_percentile_reduction(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_exact_percentile_reduction(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( - lambda spark: exact_percentile_reduction(gen_df(spark, data_gen)) - ) + lambda spark: exact_percentile_reduction(gen_df(spark, data_gen)), + conf = {kudo_enabled_conf_key: kudo_enabled}) exact_percentile_reduction_cpu_fallback_data_gen = [ [('val', data_gen), @@ -992,9 +1067,10 @@ def test_exact_percentile_reduction(data_gen): @pytest.mark.parametrize('data_gen', exact_percentile_reduction_cpu_fallback_data_gen, ids=idfn) @pytest.mark.parametrize('replace_mode', ['partial', 'final|complete'], ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', ['false', 'true'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @pytest.mark.xfail(condition=is_databricks104_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/9494') def test_exact_percentile_reduction_partial_fallback_to_cpu(data_gen, replace_mode, - use_obj_hash_agg): + use_obj_hash_agg, kudo_enabled): cpu_clz, gpu_clz = ['Percentile'], ['GpuPercentileDefault'] exist_clz, non_exist_clz = [], [] # For aggregations without distinct, Databricks runtime removes the partial Aggregate stage ( @@ -1017,7 +1093,8 @@ def test_exact_percentile_reduction_partial_fallback_to_cpu(data_gen, replace_m exist_classes=','.join(exist_clz), non_exist_classes=','.join(non_exist_clz), conf={'spark.rapids.sql.hashAgg.replaceMode': replace_mode, - 'spark.sql.execution.useObjectHashAggregateExec': use_obj_hash_agg} + 'spark.sql.execution.useObjectHashAggregateExec': use_obj_hash_agg, + kudo_enabled_conf_key: kudo_enabled} ) @@ -1051,10 +1128,11 @@ def exact_percentile_groupby(df): @ignore_order @pytest.mark.parametrize('data_gen', exact_percentile_groupby_data_gen, ids=idfn) -def test_exact_percentile_groupby(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_exact_percentile_groupby(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( - lambda spark: exact_percentile_groupby(gen_df(spark, data_gen)) - ) + lambda spark: exact_percentile_groupby(gen_df(spark, data_gen)), + conf = {kudo_enabled_conf_key: kudo_enabled}) exact_percentile_groupby_cpu_fallback_data_gen = [ [('key', RepeatSeqGen(IntegerGen(), length=100)), @@ -1070,8 +1148,10 @@ def test_exact_percentile_groupby(data_gen): @pytest.mark.parametrize('data_gen', exact_percentile_groupby_cpu_fallback_data_gen, ids=idfn) @pytest.mark.parametrize('replace_mode', ['partial', 'final|complete'], ids=idfn) @pytest.mark.parametrize('use_obj_hash_agg', ['false', 'true'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @pytest.mark.xfail(condition=is_databricks104_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/9494') 
-def test_exact_percentile_groupby_partial_fallback_to_cpu(data_gen, replace_mode, use_obj_hash_agg): +def test_exact_percentile_groupby_partial_fallback_to_cpu(data_gen, replace_mode, + use_obj_hash_agg, kudo_enabled): cpu_clz, gpu_clz = ['Percentile'], ['GpuPercentileDefault'] exist_clz, non_exist_clz = [], [] # For aggregations without distinct, Databricks runtime removes the partial Aggregate stage ( @@ -1094,15 +1174,16 @@ def test_exact_percentile_groupby_partial_fallback_to_cpu(data_gen, replace_mode exist_classes=','.join(exist_clz), non_exist_classes=','.join(non_exist_clz), conf={'spark.rapids.sql.hashAgg.replaceMode': replace_mode, - 'spark.sql.execution.useObjectHashAggregateExec': use_obj_hash_agg} - ) + 'spark.sql.execution.useObjectHashAggregateExec': use_obj_hash_agg, + kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu('ObjectHashAggregateExec', 'ShuffleExchangeExec', 'HashAggregateExec', 'HashPartitioning', 'ApproximatePercentile', 'Alias', 'Literal', 'AggregateExpression') -def test_hash_groupby_typed_imperative_agg_without_gpu_implementation_fallback(): +def test_hash_groupby_typed_imperative_agg_without_gpu_implementation_fallback(kudo_enabled): assert_cpu_and_gpu_are_equal_sql_with_capture( lambda spark: gen_df(spark, [('k', RepeatSeqGen(LongGen(), length=20)), ('v', UniqueLongGen())], length=100), @@ -1110,7 +1191,8 @@ def test_hash_groupby_typed_imperative_agg_without_gpu_implementation_fallback() non_exist_classes='GpuApproximatePercentile,GpuObjectHashAggregateExec', table_name='table', sql="""select k, - approx_percentile(v, array(0.25, 0.5, 0.75)) from table group by k""") + approx_percentile(v, array(0.25, 0.5, 0.75)) from table group by k""", + conf = {kudo_enabled_conf_key: kudo_enabled}) @approximate_float @ignore_order @@ -1118,7 +1200,8 @@ def test_hash_groupby_typed_imperative_agg_without_gpu_implementation_fallback() @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_multiple_mode_query(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_multiple_mode_query(data_gen, conf, kudo_enabled): print_params(data_gen) assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) @@ -1132,7 +1215,7 @@ def test_hash_multiple_mode_query(data_gen, conf): f.max('a'), f.sumDistinct('b'), f.countDistinct('c') - ), conf=conf) + ), conf=copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled})) @approximate_float @@ -1143,11 +1226,12 @@ def test_hash_multiple_mode_query(data_gen, conf): @pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_multiple_mode_query_avg_distincts(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_multiple_mode_query_avg_distincts(data_gen, conf, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .selectExpr('avg(distinct a)', 'avg(distinct b)','avg(distinct c)'), - conf=conf) + conf=copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled})) @approximate_float @@ -1157,8 +1241,11 @@ def test_hash_multiple_mode_query_avg_distincts(data_gen, conf): @disable_ansi_mode # 
https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf): - local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf, kudo_enabled): + local_conf = copy_and_update(conf, + {'spark.sql.legacy.allowParameterlessCount': 'true', + kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, data_gen, length=100), "hash_agg_table", @@ -1181,8 +1268,10 @@ def test_hash_query_multiple_distincts_with_non_distinct(data_gen, conf): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_query_max_with_multiple_distincts(data_gen, conf): - local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_query_max_with_multiple_distincts(data_gen, conf, kudo_enabled): + local_conf = copy_and_update(conf, {'spark.sql.legacy.allowParameterlessCount': 'true', + kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, data_gen, length=100), "hash_agg_table", @@ -1196,11 +1285,12 @@ def test_hash_query_max_with_multiple_distincts(data_gen, conf): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _init_list, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_count_with_filter(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_count_with_filter(data_gen, conf, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .selectExpr('count(a) filter (where c > 50)'), - conf=conf) + conf=copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled})) @approximate_float @@ -1209,7 +1299,8 @@ def test_hash_count_with_filter(data_gen, conf): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', _init_list + [_grpkey_short_mid_decimals, _grpkey_short_big_decimals], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_multiple_filters(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_multiple_filters(data_gen, conf, kudo_enabled): assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, data_gen, length=100), "hash_agg_table", @@ -1217,15 +1308,17 @@ def test_hash_multiple_filters(data_gen, conf): 'count(b) filter (where c > 100),' + 'avg(b) filter (where b > 20),' + 'min(a), max(b) filter (where c > 250) from hash_agg_table group by a', - conf) + conf = copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled})) @approximate_float @ignore_order @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', [_grpkey_floats_with_nan_zero_grouping_keys, _grpkey_doubles_with_nan_zero_grouping_keys], ids=idfn) -def 
test_hash_agg_with_nan_keys(data_gen): - local_conf = copy_and_update(_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_agg_with_nan_keys(data_gen, kudo_enabled): + local_conf = copy_and_update(_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true', + kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, data_gen, length=1024), "hash_agg_table", @@ -1245,8 +1338,10 @@ def test_hash_agg_with_nan_keys(data_gen): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', [_grpkey_structs_with_non_nested_children, _grpkey_nested_structs], ids=idfn) -def test_hash_agg_with_struct_keys(data_gen): - local_conf = copy_and_update(_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_agg_with_struct_keys(data_gen, kudo_enabled): + local_conf = copy_and_update(_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true', + kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, data_gen, length=1024), "hash_agg_table", @@ -1267,8 +1362,10 @@ def test_hash_agg_with_struct_keys(data_gen): 'Cast', 'Literal', 'Alias', 'AggregateExpression', 'ShuffleExchangeExec', 'HashPartitioning') @pytest.mark.parametrize('data_gen', [_grpkey_nested_structs_with_array_child], ids=idfn) -def test_hash_agg_with_struct_of_array_fallback(data_gen): - local_conf = copy_and_update(_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_agg_with_struct_of_array_fallback(data_gen, kudo_enabled): + local_conf = copy_and_update(_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true', + kudo_enabled_conf_key: kudo_enabled}) assert_cpu_and_gpu_are_equal_sql_with_capture( lambda spark : gen_df(spark, data_gen, length=100), 'select a, ' @@ -1290,12 +1387,13 @@ def test_hash_agg_with_struct_of_array_fallback(data_gen): @ignore_order @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', [ _grpkey_floats_with_nulls_and_nans ], ids=idfn) -def test_count_distinct_with_nan_floats(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_count_distinct_with_nan_floats(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, data_gen, length=1024), "hash_agg_table", 'select a, count(distinct b) as count_distinct_bees from hash_agg_table group by a', - _float_conf) + copy_and_update(_float_conf, {kudo_enabled_conf_key: kudo_enabled})) # TODO: Literal tests @@ -1304,27 +1402,33 @@ def test_count_distinct_with_nan_floats(data_gen): _nested_gens = array_gens_sample + struct_gens_sample + map_gens_sample + [binary_gen] @pytest.mark.parametrize('data_gen', decimal_gens, ids=idfn) -def test_first_last_reductions_decimal_types(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_first_last_reductions_decimal_types(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( # Coalesce and sort are to make sure that first and last, which are non-deterministic # become deterministic lambda spark: unary_op_df(spark, data_gen).coalesce(1).selectExpr( - 'first(a)', 'last(a)', 'first(a, true)', 'last(a, true)')) + 'first(a)', 'last(a)', 
'first(a, true)', 'last(a, true)'), + conf = {kudo_enabled_conf_key: kudo_enabled}) @pytest.mark.parametrize('data_gen', _nested_gens, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_first_last_reductions_nested_types(data_gen): +def test_first_last_reductions_nested_types(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( # Coalesce and sort are to make sure that first and last, which are non-deterministic # become deterministic lambda spark: unary_op_df(spark, data_gen).coalesce(1).selectExpr( - 'first(a)', 'last(a)', 'first(a, true)', 'last(a, true)')) + 'first(a)', 'last(a)', 'first(a, true)', 'last(a, true)'), + conf = {kudo_enabled_conf_key: kudo_enabled}) @pytest.mark.parametrize('data_gen', _all_basic_gens_with_all_nans_cases, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @allow_non_gpu(*non_utc_allow) -def test_generic_reductions(data_gen): - local_conf = copy_and_update(_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true'}) +def test_generic_reductions(data_gen, kudo_enabled): + local_conf = copy_and_update(_float_conf, {'spark.sql.legacy.allowParameterlessCount': 'true', + kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_collect( # Coalesce and sort are to make sure that first and last, which are non-deterministic # become deterministic @@ -1342,43 +1446,50 @@ def test_generic_reductions(data_gen): # min_by and max_by are supported for pyspark since 3.3.0 so tested with sql @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_basic_gens + nested_gens_sample, ids=idfn) -def test_hash_groupby_min_max_by_unique(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_min_max_by_unique(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_sql( lambda spark: three_col_df(spark, byte_gen, data_gen, UniqueLongGen()), "tbl", - "SELECT a, min_by(b, c), max_by(b, c) FROM tbl GROUP BY a") + "SELECT a, min_by(b, c), max_by(b, c) FROM tbl GROUP BY a", + conf = {kudo_enabled_conf_key: kudo_enabled}) # When the ordering column is not unique this gpu will always return the minimal/maximal value # while spark's result is non-deterministic. So we need to set the column b and c to be # the same to make the result comparable. 
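# A hedged, stand-alone sketch (not one of the tests here) of the tie problem the
# comment above describes: when the ordering column has duplicates, Spark may legally
# return either candidate row from min_by/max_by, so CPU and GPU results are only
# comparable when the ordering key is unique or when b and c are the same column.
# The helper name below is hypothetical and is for illustration only.
def _min_by_tie_sketch(spark):
    df = spark.createDataFrame([(1, 10, 0), (1, 20, 0)], ["a", "b", "c"])
    df.createOrReplaceTempView("tie_tbl")
    # Both rows share c == 0, so either 10 or 20 is a valid min_by(b, c) result.
    return spark.sql("SELECT a, min_by(b, c) FROM tie_tbl GROUP BY a")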
@ignore_order(local=True) @pytest.mark.parametrize('data_gen', basic_gen_no_floats + struct_gens_sample_with_decimal128 + array_gens_sample, ids=idfn) -def test_hash_groupby_min_max_by_same(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_min_max_by_same(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_sql( lambda spark: two_col_df(spark, byte_gen, data_gen), "tbl", - "SELECT a, min_by(b, b), max_by(b, b) FROM tbl GROUP BY a") + "SELECT a, min_by(b, b), max_by(b, b) FROM tbl GROUP BY a", + conf = {kudo_enabled_conf_key: kudo_enabled}) -def test_reduction_with_min_max_by_unique(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_reduction_with_min_max_by_unique(kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: two_col_df(spark, int_gen, UniqueLongGen()).selectExpr( - "min_by(a, b)", "max_by(a, b)") - ) + "min_by(a, b)", "max_by(a, b)"), + conf = {kudo_enabled_conf_key: kudo_enabled}) # When the ordering column is not unique this gpu will always return the minimal/maximal value # while spark's result is non-deterministic. So we need to set the column b and c to be # the same to make the result comparable. @pytest.mark.parametrize('data_gen', basic_gen_no_floats + struct_gens_sample_with_decimal128 + array_gens_sample, ids=idfn) -def test_reduction_with_max_by_same(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_reduction_with_max_by_same(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr( - "min_by(a, a)", "max_by(a, a)") - ) + "min_by(a, a)", "max_by(a, a)"), + conf = {kudo_enabled_conf_key: kudo_enabled}) @pytest.mark.parametrize('data_gen', all_gen + _nested_gens, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @allow_non_gpu(*non_utc_allow) -def test_count(data_gen): +def test_count(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen) \ .selectExpr( @@ -1386,42 +1497,49 @@ def test_count(data_gen): 'count()', 'count()', 'count(1)'), - conf = {'spark.sql.legacy.allowParameterlessCount': 'true'}) + conf = {'spark.sql.legacy.allowParameterlessCount': 'true', + kudo_enabled_conf_key: kudo_enabled}) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', all_basic_gens, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_distinct_count_reductions(data_gen): +def test_distinct_count_reductions(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).selectExpr( - 'count(DISTINCT a)')) + 'count(DISTINCT a)'), + conf= {kudo_enabled_conf_key: kudo_enabled}) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', [float_gen, double_gen], ids=idfn) -def test_distinct_float_count_reductions(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_distinct_float_count_reductions(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).selectExpr( - 'count(DISTINCT a)')) + 'count(DISTINCT a)'), + conf = {kudo_enabled_conf_key: kudo_enabled}) @approximate_float @disable_ansi_mode # 
https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', numeric_gens + [decimal_gen_64bit, decimal_gen_128bit], ids=idfn) -def test_arithmetic_reductions(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_arithmetic_reductions(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : unary_op_df(spark, data_gen).selectExpr( 'sum(a)', 'avg(a)'), - conf = _float_conf) + conf = copy_and_update(_float_conf, {kudo_enabled_conf_key: kudo_enabled})) @pytest.mark.parametrize('data_gen', all_basic_gens + decimal_gens + _nested_gens, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_collect_list_reductions(data_gen): +def test_collect_list_reductions(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( # coalescing because collect_list is not deterministic lambda spark: unary_op_df(spark, data_gen).coalesce(1).selectExpr('collect_list(a)'), - conf=_float_conf) + conf= copy_and_update(_float_conf, {kudo_enabled_conf_key: kudo_enabled}) ) _no_neg_zero_all_basic_gens = [byte_gen, short_gen, int_gen, long_gen, # -0.0 cannot work because of -0.0 == 0.0 in cudf for distinct and @@ -1435,11 +1553,12 @@ def test_collect_list_reductions(data_gen): @pytest.mark.parametrize('data_gen', _no_neg_zero_all_basic_gens + decimal_gens + _struct_only_nested_gens, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_collect_set_reductions(data_gen): +def test_collect_set_reductions(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen).selectExpr('sort_array(collect_set(a))'), - conf=_float_conf) + conf=copy_and_update(_float_conf, {kudo_enabled_conf_key: kudo_enabled})) def test_collect_empty(): assert_gpu_and_cpu_are_equal_collect( @@ -1449,8 +1568,9 @@ def test_collect_empty(): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen + _nested_gens, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_groupby_first_last(data_gen): +def test_groupby_first_last(data_gen, kudo_enabled): gen_fn = [('a', RepeatSeqGen(LongGen(), length=20)), ('b', data_gen)] agg_fn = lambda df: df.groupBy('a').agg( f.first('b'), f.last('b'), f.first('b', True), f.last('b', True)) @@ -1459,12 +1579,14 @@ def test_groupby_first_last(data_gen): # We set parallelism 1 to prevent nondeterministic results because of distributed setup. 
lambda spark: agg_fn(gen_df(spark, gen_fn, num_slices=1)), # Disable RADIX sort as the CPU sort is not stable if it is - conf={'spark.sql.sort.enableRadixSort': False}) + conf={'spark.sql.sort.enableRadixSort': False, + kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen + _struct_only_nested_gens, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_sorted_groupby_first_last(data_gen): +def test_sorted_groupby_first_last(data_gen, kudo_enabled): gen_fn = [('a', RepeatSeqGen(LongGen(), length=20)), ('b', data_gen)] # sort by more than the group by columns to be sure that first/last don't remove the ordering agg_fn = lambda df: df.orderBy('a', 'b').groupBy('a').agg( @@ -1474,7 +1596,8 @@ def test_sorted_groupby_first_last(data_gen): # We set parallelism and partitions to 1 to prevent nondeterministic results because # of distributed setups. lambda spark: agg_fn(gen_df(spark, gen_fn, num_slices=1)), - conf = {'spark.sql.shuffle.partitions': '1'}) + conf = {'spark.sql.shuffle.partitions': '1', + kudo_enabled_conf_key: kudo_enabled}) # Spark has a sorting bug with decimals, see https://issues.apache.org/jira/browse/SPARK-40129. # Have pytest do the sorting rather than Spark as a workaround. @@ -1482,11 +1605,13 @@ def test_sorted_groupby_first_last(data_gen): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('count_func', [f.count, f.countDistinct]) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_agg_count(data_gen, count_func): +def test_agg_count(data_gen, count_func, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : gen_df(spark, [('a', data_gen), ('b', data_gen)], - length=1024).groupBy('a').agg(count_func("b"))) + length=1024).groupBy('a').agg(count_func("b")), + conf = {kudo_enabled_conf_key: kudo_enabled}) # Spark has a sorting bug with decimals, see https://issues.apache.org/jira/browse/SPARK-40129. # Have pytest do the sorting rather than Spark as a workaround. 
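The hunks in this file all follow the same mechanical pattern: each aggregation test gains a kudo_enabled parameter, and the kudo shuffle-serializer flag is merged into whatever conf the test already passed. The following is a minimal sketch of that shape only, assuming the integration-test helpers (copy_and_update, assert_gpu_and_cpu_are_equal_collect, unary_op_df, idfn, long_gen, _float_conf) are in scope from the test framework; the test name is hypothetical and not part of this patch.

import pytest

# Same key the patch introduces in join_test.py; the real test modules define it once.
kudo_enabled_conf_key = "spark.rapids.shuffle.kudo.serializer.enabled"

@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn)
def test_kudo_toggle_sketch(kudo_enabled):
    # Merge the kudo flag into the existing conf instead of replacing it.
    conf = copy_and_update(_float_conf, {kudo_enabled_conf_key: kudo_enabled})
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, long_gen).selectExpr('sum(a)', 'count(a)'),
        conf=conf)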
@@ -1497,11 +1622,13 @@ def test_agg_count(data_gen, count_func): [ArrayGen(StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]])) , binary_gen], ids=idfn) @pytest.mark.parametrize('count_func', [f.count, f.countDistinct]) -def test_groupby_list_types_fallback(data_gen, count_func): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_groupby_list_types_fallback(data_gen, count_func, kudo_enabled): assert_gpu_fallback_collect( lambda spark : gen_df(spark, [('a', data_gen), ('b', data_gen)], length=1024).groupBy('a').agg(count_func("b")), - "HashAggregateExec") + "HashAggregateExec", + conf = {kudo_enabled_conf_key: kudo_enabled}) def subquery_create_temp_views(spark, expr): t1 = "select * from values (1,2) as t1(a,b)" @@ -1525,10 +1652,12 @@ def subquery_create_temp_views(spark, expr): "select sum(distinct(if(c > (select sum(distinct(a)) from t1), d, 0))) as csum " + "from t2 group by c" ]) -def test_subquery_in_agg(adaptive, expr): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_subquery_in_agg(adaptive, expr, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: subquery_create_temp_views(spark, expr), - conf = {"spark.sql.adaptive.enabled" : adaptive}) + conf = {"spark.sql.adaptive.enabled" : adaptive, + kudo_enabled_conf_key: kudo_enabled}) # TODO support multi-level structs https://github.com/NVIDIA/spark-rapids/issues/2438 @@ -1558,12 +1687,13 @@ def workaround_dedupe_by_value(df, num_cols): ], nullable=False), ], ids=idfn) @ignore_order(local=True) -def test_struct_groupby_count(key_data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_struct_groupby_count(key_data_gen, kudo_enabled): def group_by_count(spark): df = two_col_df(spark, key_data_gen, IntegerGen()) assert_single_level_struct(df) return workaround_dedupe_by_value(df.groupBy(df.a).count(), 3) - assert_gpu_and_cpu_are_equal_collect(group_by_count) + assert_gpu_and_cpu_are_equal_collect(group_by_count, conf = {kudo_enabled_conf_key: kudo_enabled}) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @@ -1578,13 +1708,15 @@ def group_by_count(spark): ], nullable=False) ], ids=idfn) @ignore_order(local=True) -def test_struct_cast_groupby_count(cast_struct_tostring, key_data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_struct_cast_groupby_count(cast_struct_tostring, key_data_gen, kudo_enabled): def _group_by_struct_or_cast(spark): df = two_col_df(spark, key_data_gen, IntegerGen()) assert_single_level_struct(df) return df.groupBy(df.a.cast(StringType())).count() assert_gpu_and_cpu_are_equal_collect(_group_by_struct_or_cast, { - 'spark.sql.legacy.castComplexTypesToString.enabled': cast_struct_tostring == 'LEGACY' + 'spark.sql.legacy.castComplexTypesToString.enabled': cast_struct_tostring == 'LEGACY', + kudo_enabled_conf_key: kudo_enabled }) @@ -1601,12 +1733,13 @@ def _group_by_struct_or_cast(spark): ]))], nullable=False), ], ids=idfn) @ignore_order(local=True) -def test_struct_count_distinct(key_data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_struct_count_distinct(key_data_gen, kudo_enabled): def _count_distinct_by_struct(spark): df = gen_df(spark, key_data_gen) assert_single_level_struct(df) return df.agg(f.countDistinct(df.a)) - assert_gpu_and_cpu_are_equal_collect(_count_distinct_by_struct) + assert_gpu_and_cpu_are_equal_collect(_count_distinct_by_struct, conf = {kudo_enabled_conf_key: 
kudo_enabled}) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @@ -1623,96 +1756,112 @@ def _count_distinct_by_struct(spark): ]))], nullable=False), ], ids=idfn) @ignore_order(local=True) -def test_struct_count_distinct_cast(cast_struct_tostring, key_data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_struct_count_distinct_cast(cast_struct_tostring, key_data_gen, kudo_enabled): def _count_distinct_by_struct(spark): df = gen_df(spark, key_data_gen) assert_single_level_struct(df) return df.agg(f.countDistinct(df.a.cast(StringType()))) assert_gpu_and_cpu_are_equal_collect(_count_distinct_by_struct, { - 'spark.sql.legacy.castComplexTypesToString.enabled': cast_struct_tostring == 'LEGACY' + 'spark.sql.legacy.castComplexTypesToString.enabled': cast_struct_tostring == 'LEGACY', + kudo_enabled_conf_key: kudo_enabled }) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @ignore_order(local=True) -def test_reduction_nested_struct(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_reduction_nested_struct(kudo_enabled): def do_it(spark): df = unary_op_df(spark, StructGen([('aa', StructGen([('aaa', IntegerGen(min_val=0, max_val=4))]))])) return df.agg(f.sum(df.a.aa.aaa)) - assert_gpu_and_cpu_are_equal_collect(do_it) + assert_gpu_and_cpu_are_equal_collect(do_it, conf = {kudo_enabled_conf_key: kudo_enabled}) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @ignore_order(local=True) -def test_reduction_nested_array(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_reduction_nested_array(kudo_enabled): def do_it(spark): df = unary_op_df(spark, ArrayGen(StructGen([('aa', IntegerGen(min_val=0, max_val=4))]))) return df.agg(f.sum(df.a[1].aa)) - assert_gpu_and_cpu_are_equal_collect(do_it) + assert_gpu_and_cpu_are_equal_collect(do_it, conf = {kudo_enabled_conf_key: kudo_enabled}) # The map here is a child not a top level, because we only support GetMapValue on String to String maps. 
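# A hedged, stand-alone sketch (not one of the tests here) of the shape the comment
# above describes: column a is an array<map<string,string>>, and df.a[1]["a"] is the
# access that lowers to GetMapValue once the array element is extracted. The helper
# name is hypothetical and is for illustration only.
from pyspark.sql.types import ArrayType, MapType, StringType, StructField, StructType

def _nested_map_access_sketch(spark):
    schema = StructType([StructField("a", ArrayType(MapType(StringType(), StringType())))])
    # One row holding an array of string-to-string maps.
    df = spark.createDataFrame([([{"a": "x"}, {"a": "y"}],)], schema)
    # a[1]['a'] picks the second map (0-based) and reads key 'a'; min() then aggregates it.
    return df.selectExpr("min(a[1]['a'])")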
@disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @ignore_order(local=True) -def test_reduction_nested_map(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_reduction_nested_map(kudo_enabled): def do_it(spark): df = unary_op_df(spark, ArrayGen(MapGen(StringGen('a{1,5}', nullable=False), StringGen('[ab]{1,5}')))) return df.agg(f.min(df.a[1]["a"])) - assert_gpu_and_cpu_are_equal_collect(do_it) + assert_gpu_and_cpu_are_equal_collect(do_it, conf = {kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 -def test_agg_nested_struct(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_agg_nested_struct(kudo_enabled): def do_it(spark): df = two_col_df(spark, StringGen('k{1,5}'), StructGen([('aa', StructGen([('aaa', IntegerGen(min_val=0, max_val=4))]))])) return df.groupBy('a').agg(f.sum(df.b.aa.aaa)) - assert_gpu_and_cpu_are_equal_collect(do_it) + assert_gpu_and_cpu_are_equal_collect(do_it, conf = {kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 -def test_agg_nested_array(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_agg_nested_array(kudo_enabled): def do_it(spark): df = two_col_df(spark, StringGen('k{1,5}'), ArrayGen(StructGen([('aa', IntegerGen(min_val=0, max_val=4))]))) return df.groupBy('a').agg(f.sum(df.b[1].aa)) - assert_gpu_and_cpu_are_equal_collect(do_it) + assert_gpu_and_cpu_are_equal_collect(do_it, conf = {kudo_enabled_conf_key: kudo_enabled}) # The map here is a child not a top level, because we only support GetMapValue on String to String maps. @ignore_order(local=True) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 -def test_agg_nested_map(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_agg_nested_map(kudo_enabled): def do_it(spark): df = two_col_df(spark, StringGen('k{1,5}'), ArrayGen(MapGen(StringGen('a{1,5}', nullable=False), StringGen('[ab]{1,5}')))) return df.groupBy('a').agg(f.min(df.b[1]["a"])) - assert_gpu_and_cpu_are_equal_collect(do_it) + assert_gpu_and_cpu_are_equal_collect(do_it, conf = {kudo_enabled_conf_key: kudo_enabled}) @incompat @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) -def test_hash_groupby_approx_percentile_reduction(aqe_enabled): - conf = {'spark.sql.adaptive.enabled': aqe_enabled} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_reduction(aqe_enabled, kudo_enabled): + conf = {'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled} compare_percentile_approx( lambda spark: gen_df(spark, [('v', DoubleGen())], length=100), [0.05, 0.25, 0.5, 0.75, 0.95], conf, reduction = True) @incompat @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) -def test_hash_groupby_approx_percentile_reduction_single_row(aqe_enabled): - conf = {'spark.sql.adaptive.enabled': aqe_enabled} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_reduction_single_row(aqe_enabled, kudo_enabled): + conf = {'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled} compare_percentile_approx( lambda spark: gen_df(spark, [('v', DoubleGen())], length=1), [0.05, 0.25, 0.5, 0.75, 0.95], conf, reduction = True) @incompat 
@pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) -def test_hash_groupby_approx_percentile_reduction_no_rows(aqe_enabled): - conf = {'spark.sql.adaptive.enabled': aqe_enabled} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_reduction_no_rows(aqe_enabled, kudo_enabled): + conf = {'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled} compare_percentile_approx( lambda spark: gen_df(spark, [('v', DoubleGen())], length=0), [0.05, 0.25, 0.5, 0.75, 0.95], conf, reduction = True) @incompat @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) -def test_hash_groupby_approx_percentile_byte(aqe_enabled): - conf = {'spark.sql.adaptive.enabled': aqe_enabled} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_byte(aqe_enabled, kudo_enabled): + conf = {'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled} compare_percentile_approx( lambda spark: gen_df(spark, [('k', StringGen(nullable=False)), ('v', ByteGen())], length=100), @@ -1721,8 +1870,10 @@ def test_hash_groupby_approx_percentile_byte(aqe_enabled): @incompat @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/11198 @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) -def test_hash_groupby_approx_percentile_byte_scalar(aqe_enabled): - conf = {'spark.sql.adaptive.enabled': aqe_enabled} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_byte_scalar(aqe_enabled, kudo_enabled): + conf = {'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled} compare_percentile_approx( lambda spark: gen_df(spark, [('k', StringGen(nullable=False)), ('v', ByteGen())], length=100), @@ -1730,8 +1881,10 @@ def test_hash_groupby_approx_percentile_byte_scalar(aqe_enabled): @incompat @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) -def test_hash_groupby_approx_percentile_long_repeated_keys(aqe_enabled): - conf = {'spark.sql.adaptive.enabled': aqe_enabled} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_long_repeated_keys(aqe_enabled, kudo_enabled): + conf = {'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled} compare_percentile_approx( lambda spark: gen_df(spark, [('k', RepeatSeqGen(LongGen(), length=20)), ('v', UniqueLongGen())], length=100), @@ -1739,8 +1892,10 @@ def test_hash_groupby_approx_percentile_long_repeated_keys(aqe_enabled): @incompat @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) -def test_hash_groupby_approx_percentile_long(aqe_enabled): - conf = {'spark.sql.adaptive.enabled': aqe_enabled} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_long(aqe_enabled, kudo_enabled): + conf = {'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled} compare_percentile_approx( lambda spark: gen_df(spark, [('k', StringGen(nullable=False)), ('v', UniqueLongGen())], length=100), @@ -1749,8 +1904,10 @@ def test_hash_groupby_approx_percentile_long(aqe_enabled): @incompat @disable_ansi_mode # ANSI mode is tested in test_hash_groupby_approx_percentile_long_single_ansi @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) -def test_hash_groupby_approx_percentile_long_single(aqe_enabled): - conf = 
{'spark.sql.adaptive.enabled': aqe_enabled} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_long_single(aqe_enabled, kudo_enabled): + conf = {'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled} compare_percentile_approx( lambda spark: gen_df(spark, [('k', StringGen(nullable=False)), ('v', UniqueLongGen())], length=100), @@ -1760,13 +1917,15 @@ def test_hash_groupby_approx_percentile_long_single(aqe_enabled): @incompat @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) @allow_non_gpu('ObjectHashAggregateExec', 'ShuffleExchangeExec') -def test_hash_groupby_approx_percentile_long_single_ansi(aqe_enabled): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_long_single_ansi(aqe_enabled, kudo_enabled): """ Tests approx_percentile with ANSI mode enabled. Note: In ANSI mode, the test query exercises ObjectHashAggregateExec and ShuffleExchangeExec, which fall back to CPU. """ - conf = {'spark.sql.adaptive.enabled': aqe_enabled} + conf = {'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled} conf.update(ansi_enabled_conf) compare_percentile_approx( lambda spark: gen_df(spark, [('k', StringGen(nullable=False)), @@ -1776,8 +1935,10 @@ def test_hash_groupby_approx_percentile_long_single_ansi(aqe_enabled): @incompat @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) -def test_hash_groupby_approx_percentile_double(aqe_enabled): - conf = {'spark.sql.adaptive.enabled': aqe_enabled} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_double(aqe_enabled, kudo_enabled): + conf = {'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled} compare_percentile_approx( lambda spark: gen_df(spark, [('k', StringGen(nullable=False)), ('v', DoubleGen())], length=100), @@ -1785,8 +1946,10 @@ def test_hash_groupby_approx_percentile_double(aqe_enabled): @incompat @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) -def test_hash_groupby_approx_percentile_double_single(aqe_enabled): - conf = {'spark.sql.adaptive.enabled': aqe_enabled} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_double_single(aqe_enabled, kudo_enabled): + conf = {'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled} compare_percentile_approx( lambda spark: gen_df(spark, [('k', StringGen(nullable=False)), ('v', DoubleGen())], length=100), @@ -1794,13 +1957,15 @@ def test_hash_groupby_approx_percentile_double_single(aqe_enabled): @incompat @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @ignore_order(local=True) @allow_non_gpu('TakeOrderedAndProjectExec', 'Alias', 'Cast', 'ObjectHashAggregateExec', 'AggregateExpression', 'ApproximatePercentile', 'Literal', 'ShuffleExchangeExec', 'HashPartitioning', 'CollectLimitExec') -def test_hash_groupby_approx_percentile_partial_fallback_to_cpu(aqe_enabled): +def test_hash_groupby_approx_percentile_partial_fallback_to_cpu(aqe_enabled, kudo_enabled): conf = { 'spark.rapids.sql.hashAgg.replaceMode': 'partial', - 'spark.sql.adaptive.enabled': aqe_enabled + 'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled } def approx_percentile_query(spark): @@ -1813,66 +1978,80 @@ def 
approx_percentile_query(spark): @incompat @ignore_order(local=True) -def test_hash_groupby_approx_percentile_decimal32(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_decimal32(kudo_enabled): compare_percentile_approx( lambda spark: gen_df(spark, [('k', RepeatSeqGen(ByteGen(nullable=False), length=2)), ('v', DecimalGen(6, 2))]), - [0.05, 0.25, 0.5, 0.75, 0.95]) + [0.05, 0.25, 0.5, 0.75, 0.95], + conf = {kudo_enabled_conf_key: kudo_enabled}) @incompat @ignore_order(local=True) @disable_ansi_mode # ANSI mode is tested with test_hash_groupby_approx_percentile_decimal_single_ansi. -def test_hash_groupby_approx_percentile_decimal32_single(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_decimal32_single(kudo_enabled): compare_percentile_approx( lambda spark: gen_df(spark, [('k', RepeatSeqGen(ByteGen(nullable=False), length=2)), ('v', DecimalGen(6, 2))]), - 0.05) + 0.05, + conf = {kudo_enabled_conf_key: kudo_enabled}) @incompat @ignore_order(local=True) @allow_non_gpu('ObjectHashAggregateExec', 'ShuffleExchangeExec') -def test_hash_groupby_approx_percentile_decimal_single_ansi(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_decimal_single_ansi(kudo_enabled): compare_percentile_approx( lambda spark: gen_df(spark, [('k', RepeatSeqGen(ByteGen(nullable=False), length=2)), ('v', DecimalGen(6, 2))]), - 0.05, conf=ansi_enabled_conf) + 0.05, + conf=copy_and_update(ansi_enabled_conf, {kudo_enabled_conf_key: kudo_enabled})) @incompat @ignore_order(local=True) -def test_hash_groupby_approx_percentile_decimal64(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_decimal64(kudo_enabled): compare_percentile_approx( lambda spark: gen_df(spark, [('k', RepeatSeqGen(ByteGen(nullable=False), length=2)), ('v', DecimalGen(10, 9))]), - [0.05, 0.25, 0.5, 0.75, 0.95]) + [0.05, 0.25, 0.5, 0.75, 0.95], + conf = {kudo_enabled_conf_key: kudo_enabled}) @incompat @disable_ansi_mode # ANSI mode is tested with test_hash_groupby_approx_percentile_decimal_single_ansi. @ignore_order(local=True) -def test_hash_groupby_approx_percentile_decimal64_single(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_decimal64_single(kudo_enabled): compare_percentile_approx( lambda spark: gen_df(spark, [('k', RepeatSeqGen(ByteGen(nullable=False), length=2)), ('v', DecimalGen(10, 9))]), - 0.05) + 0.05, + conf = {kudo_enabled_conf_key: kudo_enabled}) @incompat @ignore_order(local=True) -def test_hash_groupby_approx_percentile_decimal128(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_decimal128(kudo_enabled): compare_percentile_approx( lambda spark: gen_df(spark, [('k', RepeatSeqGen(ByteGen(nullable=False), length=2)), ('v', DecimalGen(19, 18))]), - [0.05, 0.25, 0.5, 0.75, 0.95]) + [0.05, 0.25, 0.5, 0.75, 0.95], + conf = {kudo_enabled_conf_key: kudo_enabled}) @incompat @disable_ansi_mode # ANSI mode is tested with test_hash_groupby_approx_percentile_decimal_single_ansi. 
@ignore_order(local=True) -def test_hash_groupby_approx_percentile_decimal128_single(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_groupby_approx_percentile_decimal128_single(kudo_enabled): compare_percentile_approx( lambda spark: gen_df(spark, [('k', RepeatSeqGen(ByteGen(nullable=False), length=2)), ('v', DecimalGen(19, 18))]), - 0.05) + 0.05, + conf = {kudo_enabled_conf_key: kudo_enabled}) # The percentile approx tests differ from other tests because we do not expect the CPU and GPU to produce the same # results due to the different algorithms being used. Instead we compute an exact percentile on the CPU and then @@ -1967,20 +2146,22 @@ def create_percentile_sql(func_name, percentiles, reduction): @disable_ansi_mode # ANSI mode is tested in test_hash_grpby_avg_nulls_ansi @pytest.mark.parametrize('data_gen', [_grpkey_strings_with_extra_nulls], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_grpby_avg_nulls(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_grpby_avg_nulls(data_gen, conf, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100).groupby('a') .agg(f.avg('c')), - conf=conf - ) + conf=copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled})) @ignore_order @allow_non_gpu('HashAggregateExec', 'Alias', 'AggregateExpression', 'Cast', 'HashPartitioning', 'ShuffleExchangeExec', 'Average') @pytest.mark.parametrize('data_gen', [_grpkey_strings_with_extra_nulls], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_grpby_avg_nulls_ansi(data_gen, conf): - local_conf = copy_and_update(conf, {'spark.sql.ansi.enabled': 'true'}) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_grpby_avg_nulls_ansi(data_gen, conf, kudo_enabled): + local_conf = copy_and_update(conf, {'spark.sql.ansi.enabled': 'true', + kudo_enabled_conf_key: kudo_enabled}) assert_gpu_fallback_collect( lambda spark: gen_df(spark, data_gen, length=100).groupby('a') .agg(f.avg('c')), @@ -1992,20 +2173,22 @@ def test_hash_grpby_avg_nulls_ansi(data_gen, conf): @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('data_gen', [_grpkey_strings_with_extra_nulls], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_reduction_avg_nulls(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_reduction_avg_nulls(data_gen, conf, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark: gen_df(spark, data_gen, length=100) .agg(f.avg('c')), - conf=conf - ) + conf=copy_and_update(conf, {kudo_enabled_conf_key: kudo_enabled})) @ignore_order @allow_non_gpu('HashAggregateExec', 'Alias', 'AggregateExpression', 'Cast', 'HashPartitioning', 'ShuffleExchangeExec', 'Average') @pytest.mark.parametrize('data_gen', [_grpkey_strings_with_extra_nulls], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_hash_reduction_avg_nulls_ansi(data_gen, conf): - local_conf = copy_and_update(conf, {'spark.sql.ansi.enabled': 'true'}) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_reduction_avg_nulls_ansi(data_gen, conf, kudo_enabled): + local_conf = copy_and_update(conf, {'spark.sql.ansi.enabled': 'true', + 
kudo_enabled_conf_key: kudo_enabled}) assert_gpu_fallback_collect( lambda spark: gen_df(spark, data_gen, length=100) .agg(f.avg('c')), @@ -2018,43 +2201,47 @@ def test_hash_reduction_avg_nulls_ansi(data_gen, conf): @allow_non_gpu('HashAggregateExec', 'Alias', 'AggregateExpression', 'Cast', 'HashPartitioning', 'ShuffleExchangeExec', 'Sum') @pytest.mark.parametrize('data_gen', _no_overflow_ansi_gens, ids=idfn) -def test_sum_fallback_when_ansi_enabled(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_sum_fallback_when_ansi_enabled(data_gen, kudo_enabled): def do_it(spark): df = gen_df(spark, [('a', data_gen), ('b', data_gen)], length=100) return df.groupBy('a').agg(f.sum("b")) assert_gpu_fallback_collect(do_it, 'Sum', - conf={'spark.sql.ansi.enabled': 'true'}) + conf={'spark.sql.ansi.enabled': 'true', kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @allow_non_gpu('HashAggregateExec', 'Alias', 'AggregateExpression', 'Cast', 'HashPartitioning', 'ShuffleExchangeExec', 'Average') @pytest.mark.parametrize('data_gen', _no_overflow_ansi_gens, ids=idfn) -def test_avg_fallback_when_ansi_enabled(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_avg_fallback_when_ansi_enabled(data_gen, kudo_enabled): def do_it(spark): df = gen_df(spark, [('a', data_gen), ('b', data_gen)], length=100) return df.groupBy('a').agg(f.avg("b")) assert_gpu_fallback_collect(do_it, 'Average', - conf={'spark.sql.ansi.enabled': 'true'}) + conf={'spark.sql.ansi.enabled': 'true', kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @allow_non_gpu('HashAggregateExec', 'Alias', 'AggregateExpression', 'HashPartitioning', 'ShuffleExchangeExec', 'Count', 'Literal') @pytest.mark.parametrize('data_gen', _no_overflow_ansi_gens, ids=idfn) -def test_count_fallback_when_ansi_enabled(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_count_fallback_when_ansi_enabled(data_gen, kudo_enabled): def do_it(spark): df = gen_df(spark, [('a', data_gen), ('b', data_gen)], length=100) return df.groupBy('a').agg(f.count("b"), f.count("*")) assert_gpu_fallback_collect(do_it, 'Count', - conf={'spark.sql.ansi.enabled': 'true'}) + conf={'spark.sql.ansi.enabled': 'true', kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', _no_overflow_ansi_gens, ids=idfn) -def test_no_fallback_when_ansi_enabled(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_no_fallback_when_ansi_enabled(data_gen, kudo_enabled): def do_it(spark): df = gen_df(spark, [('a', data_gen), ('b', data_gen)], length=100) # coalescing because first/last are not deterministic @@ -2062,7 +2249,7 @@ def do_it(spark): return df.groupBy('a').agg(f.first("b"), f.last("b"), f.min("b"), f.max("b")) assert_gpu_and_cpu_are_equal_collect(do_it, - conf={'spark.sql.ansi.enabled': 'true'}) + conf={'spark.sql.ansi.enabled': 'true', kudo_enabled_conf_key: kudo_enabled}) # Tests for standard deviation and variance aggregations. 
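# A hedged, stand-alone sketch (not one of the tests here) of the aggregate family this
# section exercises; the query shape is illustrative only and the helper name is
# hypothetical. The StddevPop/StddevSamp/VariancePop/VarianceSamp expressions named in
# the partial-replace fallback test further down correspond to these SQL functions.
def _std_variance_sketch(spark):
    df = spark.range(1000).selectExpr("id % 7 AS k", "CAST(id AS DOUBLE) AS v")
    df.createOrReplaceTempView("data_table")
    return spark.sql(
        "SELECT k, stddev_pop(v), stddev_samp(v), var_pop(v), var_samp(v) "
        "FROM data_table GROUP BY k")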
@ignore_order(local=True) @@ -2070,9 +2257,11 @@ def do_it(spark): @incompat @pytest.mark.parametrize('data_gen', _init_list_with_decimals, ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) -def test_std_variance(data_gen, conf): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_std_variance(data_gen, conf, kudo_enabled): local_conf = copy_and_update(conf, { - 'spark.rapids.sql.castDecimalToFloat.enabled': 'true'}) + 'spark.rapids.sql.castDecimalToFloat.enabled': 'true', + kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, data_gen, length=1000), "data_table", @@ -2101,8 +2290,10 @@ def test_std_variance(data_gen, conf): @pytest.mark.parametrize('data_gen', [_grpkey_strings_with_extra_nulls], ids=idfn) @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) @pytest.mark.parametrize('ansi_enabled', ['true', 'false']) -def test_std_variance_nulls(data_gen, conf, ansi_enabled): - local_conf = copy_and_update(conf, {'spark.sql.ansi.enabled': ansi_enabled}) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_std_variance_nulls(data_gen, conf, ansi_enabled, kudo_enabled): + local_conf = copy_and_update(conf, {'spark.sql.ansi.enabled': ansi_enabled, + kudo_enabled_conf_key: kudo_enabled}) assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, data_gen, length=1000), "data_table", @@ -2138,13 +2329,16 @@ def test_std_variance_nulls(data_gen, conf, ansi_enabled): @pytest.mark.parametrize('conf', get_params(_confs, params_markers_for_confs), ids=idfn) @pytest.mark.parametrize('replace_mode', _replace_modes_non_distinct, ids=idfn) @pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @pytest.mark.xfail(condition=is_databricks104_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/4963') def test_std_variance_partial_replace_fallback(data_gen, conf, replace_mode, - aqe_enabled): + aqe_enabled, + kudo_enabled): local_conf = copy_and_update(conf, {'spark.rapids.sql.hashAgg.replaceMode': replace_mode, - 'spark.sql.adaptive.enabled': aqe_enabled}) + 'spark.sql.adaptive.enabled': aqe_enabled, + kudo_enabled_conf_key: kudo_enabled}) exist_clz = ['StddevPop', 'StddevSamp', 'VariancePop', 'VarianceSamp', 'GpuStddevPop', 'GpuStddevSamp', 'GpuVariancePop', 'GpuVarianceSamp'] @@ -2189,8 +2383,9 @@ def test_std_variance_partial_replace_fallback(data_gen, null_gen] + array_gens_sample + struct_gens_sample @ignore_order(local=True) @pytest.mark.parametrize('data_gen', gens_for_max_min, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_min_max_in_groupby_and_reduction(data_gen): +def test_min_max_in_groupby_and_reduction(data_gen, kudo_enabled): df_gen = [('a', data_gen), ('b', RepeatSeqGen(IntegerGen(), length=20))] # test max @@ -2198,44 +2393,48 @@ def test_min_max_in_groupby_and_reduction(data_gen): lambda spark : gen_df(spark, df_gen), "hash_agg_table", 'select b, max(a) from hash_agg_table group by b', - _float_conf) + copy_and_update(_float_conf, {kudo_enabled_conf_key: kudo_enabled})) assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, df_gen), "hash_agg_table", 'select max(a) from hash_agg_table', - _float_conf) + copy_and_update(_float_conf, {kudo_enabled_conf_key: kudo_enabled})) # test min assert_gpu_and_cpu_are_equal_sql( 
lambda spark : gen_df(spark, df_gen, length=1024), "hash_agg_table", 'select b, min(a) from hash_agg_table group by b', - _float_conf) + copy_and_update(_float_conf, {kudo_enabled_conf_key: kudo_enabled})) assert_gpu_and_cpu_are_equal_sql( lambda spark : gen_df(spark, df_gen, length=1024), "hash_agg_table", 'select min(a) from hash_agg_table', - _float_conf) + copy_and_update(_float_conf, {kudo_enabled_conf_key: kudo_enabled})) # Some Spark implementations will optimize this aggregation as a # complete aggregation (i.e.: only one aggregation node in the plan) @ignore_order(local=True) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 -def test_hash_aggregate_complete_with_grouping_expressions(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_aggregate_complete_with_grouping_expressions(kudo_enabled): assert_gpu_and_cpu_are_equal_sql( lambda spark : spark.range(10).withColumn("id2", f.col("id")), "hash_agg_complete_table", - "select id, avg(id) from hash_agg_complete_table group by id, id2 + 1") + "select id, avg(id) from hash_agg_complete_table group by id, id2 + 1", + conf = {kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('cast_key_to', ["byte", "short", "int", "long", "string", "DECIMAL(38,5)"], ids=idfn) -def test_hash_agg_force_pre_sort(cast_key_to): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_agg_force_pre_sort(cast_key_to, kudo_enabled): def do_it(spark): gen = StructGen([("key", UniqueLongGen()), ("value", long_gen)], nullable=False) df = gen_df(spark, gen) return df.selectExpr("CAST((key div 10) as " + cast_key_to + ") as key", "value").groupBy("key").sum("value") assert_gpu_and_cpu_are_equal_collect(do_it, conf={'spark.rapids.sql.agg.forceSinglePassPartialSort': True, - 'spark.rapids.sql.agg.singlePassPartialSortEnabled': True}) + 'spark.rapids.sql.agg.singlePassPartialSortEnabled': True, + kudo_enabled_conf_key: kudo_enabled}) diff --git a/integration_tests/src/main/python/hive_parquet_write_test.py b/integration_tests/src/main/python/hive_parquet_write_test.py index e66b889a986..540db74a1ad 100644 --- a/integration_tests/src/main/python/hive_parquet_write_test.py +++ b/integration_tests/src/main/python/hive_parquet_write_test.py @@ -25,9 +25,10 @@ # "GpuInsertIntoHiveTable" for Parquet write. _write_to_hive_conf = {"spark.sql.hive.convertMetastoreParquet": False} -_hive_bucket_gens = [ - boolean_gen, byte_gen, short_gen, int_gen, long_gen, string_gen, float_gen, double_gen, +_hive_bucket_gens_sans_bools = [ + byte_gen, short_gen, int_gen, long_gen, string_gen, float_gen, double_gen, DateGen(start=date(1590, 1, 1)), _restricted_timestamp()] +_hive_bucket_gens = [boolean_gen] + _hive_bucket_gens_sans_bools _hive_basic_gens = _hive_bucket_gens + [ DecimalGen(precision=19, scale=1, nullable=True), diff --git a/integration_tests/src/main/python/hive_write_test.py b/integration_tests/src/main/python/hive_write_test.py index 945cc4806fb..af825a99810 100644 --- a/integration_tests/src/main/python/hive_write_test.py +++ b/integration_tests/src/main/python/hive_write_test.py @@ -29,8 +29,11 @@ def _restricted_timestamp(nullable=True): end=datetime(2262, 4, 11, tzinfo=timezone.utc), nullable=nullable) +# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and +# https://github.com/rapidsai/cudf/issues/6763 . 
+# Once the first issue is fixed, add back boolean_gen _basic_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, - string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), + string_gen, DateGen(start=date(1590, 1, 1)), _restricted_timestamp() ] + decimal_gens @@ -45,8 +48,11 @@ def _restricted_timestamp(nullable=True): ArrayGen(ArrayGen(string_gen, max_length=10), max_length=10), ArrayGen(StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]]))] +# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and +# https://github.com/rapidsai/cudf/issues/6763 . +# Once the first issue is fixed, add back boolean_gen _map_gens = [simple_string_to_string_map_gen] + [MapGen(f(nullable=False), f()) for f in [ - BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, + ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, lambda nullable=True: _restricted_timestamp(nullable=nullable), lambda nullable=True: DateGen(start=date(1590, 1, 1), nullable=nullable), lambda nullable=True: DecimalGen(precision=15, scale=1, nullable=nullable), diff --git a/integration_tests/src/main/python/join_test.py b/integration_tests/src/main/python/join_test.py index 703fbe80230..936310bedeb 100644 --- a/integration_tests/src/main/python/join_test.py +++ b/integration_tests/src/main/python/join_test.py @@ -96,6 +96,8 @@ 'spark.sql.shuffle.partitions': '2', } +kudo_enabled_conf_key = "spark.rapids.shuffle.kudo.serializer.enabled" + def create_df(spark, data_gen, left_length, right_length): left = binary_op_df(spark, data_gen, length=left_length) right = binary_op_df(spark, data_gen, length=right_length).withColumnRenamed("a", "r_a")\ @@ -125,53 +127,77 @@ def join_batch_size_test_params(*args): @ignore_order(local=True) @pytest.mark.parametrize('join_type', ['Left', 'Inner', 'LeftSemi', 'LeftAnti'], ids=idfn) @pytest.mark.parametrize("aqe_enabled", ["true", "false"], ids=idfn) -def test_right_broadcast_nested_loop_join_without_condition_empty(join_type, aqe_enabled): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_right_broadcast_nested_loop_join_without_condition_empty(join_type, aqe_enabled, kudo_enabled): def do_join(spark): left, right = create_df(spark, long_gen, 50, 0) return left.join(broadcast(right), how=join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf={ "spark.sql.adaptive.enabled": aqe_enabled }) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + "spark.sql.adaptive.enabled": aqe_enabled, + kudo_enabled_conf_key: kudo_enabled + }) @ignore_order(local=True) @pytest.mark.parametrize('join_type', ['Left', 'Inner', 'LeftSemi', 'LeftAnti'], ids=idfn) @pytest.mark.parametrize("aqe_enabled", ["true", "false"], ids=idfn) -def test_left_broadcast_nested_loop_join_without_condition_empty(join_type, aqe_enabled): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_left_broadcast_nested_loop_join_without_condition_empty(join_type, aqe_enabled, kudo_enabled): def do_join(spark): left, right = create_df(spark, long_gen, 0, 50) return left.join(broadcast(right), how=join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf={ "spark.sql.adaptive.enabled": aqe_enabled }) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + "spark.sql.adaptive.enabled": aqe_enabled, + kudo_enabled_conf_key: kudo_enabled + }) @ignore_order(local=True) @pytest.mark.parametrize('join_type', ['Left', 'Inner', 'LeftSemi', 'LeftAnti'], ids=idfn) 
@pytest.mark.parametrize("aqe_enabled", ["true", "false"], ids=idfn) -def test_broadcast_nested_loop_join_without_condition_empty(join_type, aqe_enabled): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_nested_loop_join_without_condition_empty(join_type, aqe_enabled, kudo_enabled): def do_join(spark): left, right = create_df(spark, long_gen, 0, 0) return left.join(broadcast(right), how=join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf={ "spark.sql.adaptive.enabled": aqe_enabled }) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + "spark.sql.adaptive.enabled": aqe_enabled, + kudo_enabled_conf_key: kudo_enabled + }) @ignore_order(local=True) @pytest.mark.parametrize('join_type', ['Left', 'Inner', 'LeftSemi', 'LeftAnti'], ids=idfn) -def test_right_broadcast_nested_loop_join_without_condition_empty_small_batch(join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_right_broadcast_nested_loop_join_without_condition_empty_small_batch(join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, long_gen, 50, 0) return left.join(broadcast(right), how=join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf={'spark.sql.adaptive.enabled': 'true'}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + 'spark.sql.adaptive.enabled': 'true', + kudo_enabled_conf_key: kudo_enabled + }) @ignore_order(local=True) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'LeftSemi', 'LeftAnti'], ids=idfn) -def test_empty_broadcast_hash_join(join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_empty_broadcast_hash_join(join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, long_gen, 50, 0) return left.join(right.hint("broadcast"), left.a == right.r_a, join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf={'spark.sql.adaptive.enabled': 'true'}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + 'spark.sql.adaptive.enabled': 'true', + kudo_enabled_conf_key: kudo_enabled + }) @pytest.mark.parametrize('join_type', ['Left', 'Inner', 'LeftSemi', 'LeftAnti'], ids=idfn) -def test_broadcast_hash_join_constant_keys(join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_hash_join_constant_keys(join_type, kudo_enabled): def do_join(spark): left = spark.range(10).withColumn("s", lit(1)) right = spark.range(10000).withColumn("r_s", lit(1)) return left.join(right.hint("broadcast"), left.s == right.r_s, join_type) - assert_gpu_and_cpu_row_counts_equal(do_join, conf={'spark.sql.adaptive.enabled': 'true'}) + assert_gpu_and_cpu_row_counts_equal(do_join, conf={ + 'spark.sql.adaptive.enabled': 'true', + kudo_enabled_conf_key: kudo_enabled + }) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 @@ -181,21 +207,29 @@ def do_join(spark): (all_gen, '1g'), (join_small_batch_gens, '1000')), ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) -def test_sortmerge_join(data_gen, join_type, batch_size): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_sortmerge_join(data_gen, join_type, batch_size, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) return left.join(right, left.a == right.r_a, join_type) - conf = copy_and_update(_sortmerge_join_conf, {'spark.rapids.sql.batchSizeBytes': batch_size}) + conf = copy_and_update(_sortmerge_join_conf, { + 
'spark.rapids.sql.batchSizeBytes': batch_size, + kudo_enabled_conf_key: kudo_enabled + }) assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', basic_nested_gens + [decimal_gen_128bit], ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) -def test_sortmerge_join_ridealong(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_sortmerge_join_ridealong(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_ridealong_df(spark, short_gen, data_gen, 500, 500) return left.join(right, left.key == right.r_key, join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf=_sortmerge_join_conf) + conf = copy_and_update(_sortmerge_join_conf, { + kudo_enabled_conf_key: kudo_enabled + }) + assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf) # For floating point values the normalization is done using a higher order function. We could probably work around this # for now it falls back to the CPU @@ -205,11 +239,15 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', single_level_array_gens + [binary_gen], ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) -def test_sortmerge_join_wrong_key_fallback(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_sortmerge_join_wrong_key_fallback(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) return left.join(right, left.a == right.r_a, join_type) - assert_gpu_fallback_collect(do_join, 'SortMergeJoinExec', conf=_sortmerge_join_conf) + conf = copy_and_update(_sortmerge_join_conf, { + kudo_enabled_conf_key: kudo_enabled + }) + assert_gpu_fallback_collect(do_join, 'SortMergeJoinExec', conf=conf) # For spark to insert a shuffled hash join it has to be enabled with # "spark.sql.join.preferSortMergeJoin" = "false" and both sides have to @@ -231,10 +269,12 @@ def do_join(spark): @pytest.mark.parametrize('data_gen', basic_nested_gens + [decimal_gen_128bit], ids=idfn) @pytest.mark.parametrize('join_type', all_non_sized_join_types, ids=idfn) @pytest.mark.parametrize('sub_part_enabled', ['false', 'true'], ids=['SubPartition_OFF', 'SubPartition_ON']) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_hash_join_ridealong_non_sized(data_gen, join_type, sub_part_enabled): +def test_hash_join_ridealong_non_sized(data_gen, join_type, sub_part_enabled, kudo_enabled): confs = { - "spark.rapids.sql.test.subPartitioning.enabled": sub_part_enabled + "spark.rapids.sql.test.subPartitioning.enabled": sub_part_enabled, + kudo_enabled_conf_key: kudo_enabled } hash_join_ridealong(data_gen, join_type, confs) @@ -242,10 +282,12 @@ def test_hash_join_ridealong_non_sized(data_gen, join_type, sub_part_enabled): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', basic_nested_gens + [decimal_gen_128bit], ids=idfn) @pytest.mark.parametrize('join_type', all_symmetric_sized_join_types, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_hash_join_ridealong_symmetric(data_gen, join_type): +def test_hash_join_ridealong_symmetric(data_gen, join_type, kudo_enabled): confs = { "spark.rapids.sql.join.useShuffledSymmetricHashJoin": "true", + kudo_enabled_conf_key: kudo_enabled } hash_join_ridealong(data_gen, join_type, confs) @@ -253,10 +295,12 @@ 
def test_hash_join_ridealong_symmetric(data_gen, join_type): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', basic_nested_gens + [decimal_gen_128bit], ids=idfn) @pytest.mark.parametrize('join_type', all_asymmetric_sized_join_types, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_hash_join_ridealong_asymmetric(data_gen, join_type): +def test_hash_join_ridealong_asymmetric(data_gen, join_type, kudo_enabled): confs = { "spark.rapids.sql.join.useShuffledAsymmetricHashJoin": "true", + kudo_enabled_conf_key: kudo_enabled } hash_join_ridealong(data_gen, join_type, confs) @@ -267,24 +311,29 @@ def test_hash_join_ridealong_asymmetric(data_gen, join_type): # Not all join types can be translated to a broadcast join, but this tests them to be sure we # can handle what spark is doing @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_broadcast_join_right_table(data_gen, join_type): +def test_broadcast_join_right_table(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) return left.join(broadcast(right), left.a == right.r_a, join_type) - assert_gpu_and_cpu_are_equal_collect(do_join) + conf = {kudo_enabled_conf_key: kudo_enabled} + assert_gpu_and_cpu_are_equal_collect(do_join, conf = conf) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', basic_nested_gens + [decimal_gen_128bit], ids=idfn) # Not all join types can be translated to a broadcast join, but this tests them to be sure we # can handle what spark is doing @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_broadcast_join_right_table_ridealong(data_gen, join_type): +def test_broadcast_join_right_table_ridealong(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_ridealong_df(spark, short_gen, data_gen, 500, 500) return left.join(broadcast(right), left.key == right.r_key, join_type) - assert_gpu_and_cpu_are_equal_collect(do_join) + + conf = {kudo_enabled_conf_key: kudo_enabled} + assert_gpu_and_cpu_are_equal_collect(do_join, conf = conf) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -293,13 +342,16 @@ def do_join(spark): # Not all join types can be translated to a broadcast join, but this tests them to be sure we # can handle what spark is doing @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_broadcast_join_right_table_with_job_group(data_gen, join_type): +def test_broadcast_join_right_table_with_job_group(data_gen, join_type, kudo_enabled): with_cpu_session(lambda spark : spark.sparkContext.setJobGroup("testjob1", "test", False)) def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) return left.join(broadcast(right), left.a == right.r_a, join_type) - assert_gpu_and_cpu_are_equal_collect(do_join) + + conf = {kudo_enabled_conf_key: kudo_enabled} + assert_gpu_and_cpu_are_equal_collect(do_join, conf = conf) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -308,12 +360,16 @@ def do_join(spark): 
@pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params( (all_gen + basic_nested_gens, '1g'), (join_small_batch_gens + [basic_struct_gen, ArrayGen(string_gen)], '100')), ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_cartesian_join(data_gen, batch_size): +def test_cartesian_join(data_gen, batch_size, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) return left.crossJoin(right) - assert_gpu_and_cpu_are_equal_collect(do_join, conf={'spark.rapids.sql.batchSizeBytes': batch_size}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + 'spark.rapids.sql.batchSizeBytes': batch_size, + kudo_enabled_conf_key: kudo_enabled + }) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -322,11 +378,15 @@ def do_join(spark): @pytest.mark.xfail(condition=is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/334') @pytest.mark.parametrize('batch_size', ['100', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches -def test_cartesian_join_special_case_count(batch_size): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_cartesian_join_special_case_count(batch_size, kudo_enabled): def do_join(spark): left, right = create_df(spark, int_gen, 50, 25) return left.crossJoin(right).selectExpr('COUNT(*)') - assert_gpu_and_cpu_are_equal_collect(do_join, conf={'spark.rapids.sql.batchSizeBytes': batch_size}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + 'spark.rapids.sql.batchSizeBytes': batch_size, + kudo_enabled_conf_key: kudo_enabled + }) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -335,11 +395,15 @@ def do_join(spark): @pytest.mark.xfail(condition=is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/334') @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches -def test_cartesian_join_special_case_group_by_count(batch_size): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_cartesian_join_special_case_group_by_count(batch_size, kudo_enabled): def do_join(spark): left, right = create_df(spark, int_gen, 50, 25) return left.crossJoin(right).groupBy('a').count() - assert_gpu_and_cpu_are_equal_collect(do_join, conf={'spark.rapids.sql.batchSizeBytes': batch_size}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + 'spark.rapids.sql.batchSizeBytes': batch_size, + kudo_enabled_conf_key: kudo_enabled + }) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -348,8 +412,9 @@ def do_join(spark): @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params( (all_gen, '1g'), (join_small_batch_gens, '100')), ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_cartesian_join_with_condition(data_gen, batch_size): +def test_cartesian_join_with_condition(data_gen, batch_size, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) # This test is impacted by https://github.com/NVIDIA/spark-rapids/issues/294 @@ -357,7 +422,10 @@ def do_join(spark): # but these take a long time to verify so we run with smaller numbers by default # that 
do not expose the error return left.join(right, left.b >= right.r_b, "cross") - conf = copy_and_update(_sortmerge_join_conf, {'spark.rapids.sql.batchSizeBytes': batch_size}) + conf = copy_and_update(_sortmerge_join_conf, { + 'spark.rapids.sql.batchSizeBytes': batch_size, + kudo_enabled_conf_key: kudo_enabled + }) assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 @@ -366,22 +434,30 @@ def do_join(spark): @pytest.mark.parametrize('data_gen,batch_size', join_batch_size_test_params( (all_gen + basic_nested_gens, '1g'), (join_small_batch_gens, '100')), ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_broadcast_nested_loop_join(data_gen, batch_size): +def test_broadcast_nested_loop_join(data_gen, batch_size, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) return left.crossJoin(broadcast(right)) - assert_gpu_and_cpu_are_equal_collect(do_join, conf={'spark.rapids.sql.batchSizeBytes': batch_size}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + 'spark.rapids.sql.batchSizeBytes': batch_size, + kudo_enabled_conf_key: kudo_enabled + }) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('batch_size', ['100', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches -def test_broadcast_nested_loop_join_special_case_count(batch_size): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_nested_loop_join_special_case_count(batch_size, kudo_enabled): def do_join(spark): left, right = create_df(spark, int_gen, 50, 25) return left.crossJoin(broadcast(right)).selectExpr('COUNT(*)') - assert_gpu_and_cpu_are_equal_collect(do_join, conf={'spark.rapids.sql.batchSizeBytes': batch_size}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + 'spark.rapids.sql.batchSizeBytes': batch_size, + kudo_enabled_conf_key: kudo_enabled + }) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -389,11 +465,15 @@ def do_join(spark): @pytest.mark.xfail(condition=is_databricks_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/334') @pytest.mark.parametrize('batch_size', ['1000', '1g'], ids=idfn) # set the batch size so we can test multiple stream batches -def test_broadcast_nested_loop_join_special_case_group_by_count(batch_size): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_nested_loop_join_special_case_group_by_count(batch_size, kudo_enabled): def do_join(spark): left, right = create_df(spark, int_gen, 50, 25) return left.crossJoin(broadcast(right)).groupBy('a').count() - assert_gpu_and_cpu_are_equal_collect(do_join, conf={'spark.rapids.sql.batchSizeBytes': batch_size}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + 'spark.rapids.sql.batchSizeBytes': batch_size, + kudo_enabled_conf_key: kudo_enabled + }) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -402,8 +482,9 @@ def do_join(spark): (join_ast_gen, '1g'), ([int_gen], 100)), ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Inner', 'LeftSemi', 'LeftAnti', 'Cross'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) 
@allow_non_gpu(*non_utc_allow) -def test_right_broadcast_nested_loop_join_with_ast_condition(data_gen, join_type, batch_size): +def test_right_broadcast_nested_loop_join_with_ast_condition(data_gen, join_type, batch_size, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) # This test is impacted by https://github.com/NVIDIA/spark-rapids/issues/294 @@ -411,14 +492,18 @@ def do_join(spark): # but these take a long time to verify so we run with smaller numbers by default # that do not expose the error return left.join(broadcast(right), (left.b >= right.r_b), join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf={'spark.rapids.sql.batchSizeBytes': batch_size}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + 'spark.rapids.sql.batchSizeBytes': batch_size, + kudo_enabled_conf_key: kudo_enabled + }) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_left_broadcast_nested_loop_join_with_ast_condition(data_gen): +def test_left_broadcast_nested_loop_join_with_ast_condition(data_gen, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) # This test is impacted by https://github.com/NVIDIA/spark-rapids/issues/294 @@ -426,14 +511,15 @@ def do_join(spark): # but these take a long time to verify so we run with smaller numbers by default # that do not expose the error return broadcast(left).join(right, (left.b >= right.r_b), 'Right') - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf = {kudo_enabled_conf_key: kudo_enabled}) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', [IntegerGen(), LongGen(), pytest.param(FloatGen(), marks=[incompat]), pytest.param(DoubleGen(), marks=[incompat])], ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Cross'], ids=idfn) -def test_broadcast_nested_loop_join_with_condition_post_filter(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_nested_loop_join_with_condition_post_filter(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) # This test is impacted by https://github.com/NVIDIA/spark-rapids/issues/294 @@ -442,12 +528,13 @@ def do_join(spark): # that do not expose the error # AST does not support cast or logarithm yet, so this must be implemented as a post-filter return left.join(broadcast(right), left.a > f.log(right.r_a), join_type) - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf = {kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', [IntegerGen(), LongGen(), pytest.param(FloatGen(), marks=[incompat]), pytest.param(DoubleGen(), marks=[incompat])], ids=idfn) @pytest.mark.parametrize('join_type', ['Cross', 'Left', 'LeftSemi', 'LeftAnti'], ids=idfn) -def test_broadcast_nested_loop_join_with_condition(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_nested_loop_join_with_condition(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = 
create_df(spark, data_gen, 50, 25) # AST does not support cast or logarithm yet which is supposed to be extracted into child @@ -458,39 +545,46 @@ def do_join(spark): # (1) adapt double to integer since AST current doesn't support it. # (2) switch to right side build to pass checks of 'Left', 'LeftSemi', 'LeftAnti' join types return left.join(broadcast(right), f.round(left.a).cast('integer') > f.round(f.log(right.r_a).cast('integer')), join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf={"spark.rapids.sql.castFloatToIntegralTypes.enabled": True}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + "spark.rapids.sql.castFloatToIntegralTypes.enabled": True, + kudo_enabled_conf_key: kudo_enabled + }) @allow_non_gpu('BroadcastExchangeExec', 'BroadcastNestedLoopJoinExec', 'Cast', 'GreaterThan', 'Log') @ignore_order(local=True) @pytest.mark.parametrize('data_gen', [IntegerGen(), LongGen(), pytest.param(FloatGen(), marks=[incompat]), pytest.param(DoubleGen(), marks=[incompat])], ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn) -def test_broadcast_nested_loop_join_with_condition_fallback(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_nested_loop_join_with_condition_fallback(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) # AST does not support double type which is not split-able into child nodes. return broadcast(left).join(right, left.a > f.log(right.r_a), join_type) - assert_gpu_fallback_collect(do_join, 'BroadcastNestedLoopJoinExec') + assert_gpu_fallback_collect(do_join, 'BroadcastNestedLoopJoinExec', + conf = {kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, date_gen, timestamp_gen], ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_broadcast_nested_loop_join_with_array_contains(data_gen, join_type): +def test_broadcast_nested_loop_join_with_array_contains(data_gen, join_type, kudo_enabled): arr_gen = ArrayGen(data_gen) literal = with_cpu_session(lambda spark: gen_scalar(data_gen)) def do_join(spark): left, right = create_df(spark, arr_gen, 50, 25) # Array_contains will be pushed down into project child nodes return broadcast(left).join(right, array_contains(left.a, literal.cast(data_gen.data_type)) < array_contains(right.r_a, literal.cast(data_gen.data_type))) - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf = {kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_right_broadcast_nested_loop_join_condition_missing(data_gen, join_type): +def test_right_broadcast_nested_loop_join_condition_missing(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) # This test is impacted by https://github.com/NVIDIA/spark-rapids/issues/294 @@ -500,13 +594,14 @@ def do_join(spark): # Compute the distinct of the join result to verify the 
join produces a proper dataframe # for downstream processing. return left.join(broadcast(right), how=join_type).distinct() - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf = {kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['Right'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_left_broadcast_nested_loop_join_condition_missing(data_gen, join_type): +def test_left_broadcast_nested_loop_join_condition_missing(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) # This test is impacted by https://github.com/NVIDIA/spark-rapids/issues/294 @@ -516,45 +611,52 @@ def do_join(spark): # Compute the distinct of the join result to verify the join produces a proper dataframe # for downstream processing. return broadcast(left).join(right, how=join_type).distinct() - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf = {kudo_enabled_conf_key: kudo_enabled}) @pytest.mark.parametrize('data_gen', all_gen + single_level_array_gens + [binary_gen], ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_right_broadcast_nested_loop_join_condition_missing_count(data_gen, join_type): +def test_right_broadcast_nested_loop_join_condition_missing_count(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) return left.join(broadcast(right), how=join_type).selectExpr('COUNT(*)') - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf = {kudo_enabled_conf_key: kudo_enabled}) @pytest.mark.parametrize('data_gen', all_gen + single_level_array_gens + [binary_gen], ids=idfn) @pytest.mark.parametrize('join_type', ['Right'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_left_broadcast_nested_loop_join_condition_missing_count(data_gen, join_type): +def test_left_broadcast_nested_loop_join_condition_missing_count(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) return broadcast(left).join(right, how=join_type).selectExpr('COUNT(*)') - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf = {kudo_enabled_conf_key: kudo_enabled}) @allow_non_gpu('BroadcastExchangeExec', 'BroadcastNestedLoopJoinExec', 'GreaterThanOrEqual', *non_utc_allow) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['LeftOuter', 'LeftSemi', 'LeftAnti', 'FullOuter'], ids=idfn) -def test_broadcast_nested_loop_join_with_conditionals_build_left_fallback(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_nested_loop_join_with_conditionals_build_left_fallback(data_gen, join_type, + kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) return broadcast(left).join(right, (left.b >= right.r_b), join_type) - assert_gpu_fallback_collect(do_join, 'BroadcastNestedLoopJoinExec') + assert_gpu_fallback_collect(do_join, 'BroadcastNestedLoopJoinExec', + conf = 
{kudo_enabled_conf_key: kudo_enabled}) @allow_non_gpu('BroadcastExchangeExec', 'BroadcastNestedLoopJoinExec', 'GreaterThanOrEqual', *non_utc_allow) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', all_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['RightOuter', 'FullOuter'], ids=idfn) -def test_broadcast_nested_loop_with_conditionals_build_right_fallback(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_nested_loop_with_conditionals_build_right_fallback(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) return left.join(broadcast(right), (left.b >= right.r_b), join_type) - assert_gpu_fallback_collect(do_join, 'BroadcastNestedLoopJoinExec') + assert_gpu_fallback_collect(do_join, 'BroadcastNestedLoopJoinExec', + conf = {kudo_enabled_conf_key: kudo_enabled}) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -566,25 +668,28 @@ def do_join(spark): # Specify 200 shuffle partitions to test cases where streaming side is empty # as in https://github.com/NVIDIA/spark-rapids/issues/7516 @pytest.mark.parametrize('shuffle_conf', [{}, {'spark.sql.shuffle.partitions': 200}], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_broadcast_join_left_table(data_gen, join_type, shuffle_conf): +def test_broadcast_join_left_table(data_gen, join_type, shuffle_conf, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 250, 500) return broadcast(left).join(right, left.a == right.r_a, join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf=shuffle_conf) + conf = copy_and_update(shuffle_conf, {kudo_enabled_conf_key: kudo_enabled}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn) @pytest.mark.parametrize('join_type', all_join_types, ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_broadcast_join_with_conditionals(data_gen, join_type): +def test_broadcast_join_with_conditionals(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) return left.join(broadcast(right), (left.a == right.r_a) & (left.b >= right.r_b), join_type) - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf = {kudo_enabled_conf_key: kudo_enabled}) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -592,14 +697,15 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', [long_gen], ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn) -def test_broadcast_join_with_condition_ast_op_fallback(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_join_with_condition_ast_op_fallback(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) # AST does not support cast or logarithm yet return left.join(broadcast(right), (left.a == right.r_a) & (left.b > f.log(right.r_b)), join_type) 
exec = 'SortMergeJoinExec' if join_type in ['Right', 'FullOuter'] else 'BroadcastHashJoinExec' - assert_gpu_fallback_collect(do_join, exec) + assert_gpu_fallback_collect(do_join, exec, conf = {kudo_enabled_conf_key: kudo_enabled}) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -607,38 +713,42 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', join_no_ast_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn) -def test_broadcast_join_with_condition_ast_type_fallback(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_join_with_condition_ast_type_fallback(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 50, 25) # AST does not support cast or logarithm yet return left.join(broadcast(right), (left.a == right.r_a) & (left.b > right.r_b), join_type) exec = 'SortMergeJoinExec' if join_type in ['Right', 'FullOuter'] else 'BroadcastHashJoinExec' - assert_gpu_fallback_collect(do_join, exec) + assert_gpu_fallback_collect(do_join, exec, conf = {kudo_enabled_conf_key: kudo_enabled}) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', join_no_ast_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Cross'], ids=idfn) -def test_broadcast_join_with_condition_post_filter(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_join_with_condition_post_filter(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) return left.join(broadcast(right), (left.a == right.r_a) & (left.b > right.r_b), join_type) - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf = {kudo_enabled_conf_key: kudo_enabled}) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', join_ast_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_sortmerge_join_with_condition_ast(data_gen, join_type): +def test_sortmerge_join_with_condition_ast(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) return left.join(right, (left.a == right.r_a) & (left.b >= right.r_b), join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf=_sortmerge_join_conf) + conf = copy_and_update(_sortmerge_join_conf, {kudo_enabled_conf_key: kudo_enabled}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -646,12 +756,14 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', [long_gen], ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn) -def test_sortmerge_join_with_condition_ast_op_fallback(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], 
ids=idfn) +def test_sortmerge_join_with_condition_ast_op_fallback(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) # AST does not support cast or logarithm yet return left.join(right, (left.a == right.r_a) & (left.b > f.log(right.r_b)), join_type) - assert_gpu_fallback_collect(do_join, 'SortMergeJoinExec', conf=_sortmerge_join_conf) + conf = copy_and_update(_sortmerge_join_conf, {kudo_enabled_conf_key: kudo_enabled}) + assert_gpu_fallback_collect(do_join, 'SortMergeJoinExec', conf=conf) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -659,11 +771,13 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', join_no_ast_gen, ids=idfn) @pytest.mark.parametrize('join_type', ['Left', 'Right', 'FullOuter', 'LeftSemi', 'LeftAnti'], ids=idfn) -def test_sortmerge_join_with_condition_ast_type_fallback(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_sortmerge_join_with_condition_ast_type_fallback(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) return left.join(right, (left.a == right.r_a) & (left.b > right.r_b), join_type) - assert_gpu_fallback_collect(do_join, 'SortMergeJoinExec', conf=_sortmerge_join_conf) + conf = copy_and_update(_sortmerge_join_conf, {kudo_enabled_conf_key: kudo_enabled}) + assert_gpu_fallback_collect(do_join, 'SortMergeJoinExec', conf=conf) _mixed_df1_with_nulls = [('a', RepeatSeqGen(LongGen(nullable=(True, 20.0)), length= 10)), @@ -674,20 +788,22 @@ def do_join(spark): @ignore_order @pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'LeftSemi', 'LeftAnti', 'FullOuter', 'Cross'], ids=idfn) -def test_broadcast_join_mixed(join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_join_mixed(join_type, kudo_enabled): def do_join(spark): left = gen_df(spark, _mixed_df1_with_nulls, length=500) right = gen_df(spark, _mixed_df2_with_nulls, length=500).withColumnRenamed("a", "r_a")\ .withColumnRenamed("b", "r_b").withColumnRenamed("c", "r_c") return left.join(broadcast(right), left.a.eqNullSafe(right.r_a), join_type) - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={kudo_enabled_conf_key: kudo_enabled}) @ignore_order @allow_non_gpu('DataWritingCommandExec,ExecutedCommandExec,WriteFilesExec') @pytest.mark.xfail(condition=is_emr_runtime(), reason='https://github.com/NVIDIA/spark-rapids/issues/821') @pytest.mark.parametrize('repartition', ["true", "false"], ids=idfn) -def test_join_bucketed_table(repartition, spark_tmp_table_factory): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_join_bucketed_table(repartition, spark_tmp_table_factory, kudo_enabled): def do_join(spark): table_name = spark_tmp_table_factory.get() data = [("http://fooblog.com/blog-entry-116.html", "https://fooblog.com/blog-entry-116.html"), @@ -702,7 +818,10 @@ def do_join(spark): return testurls.repartition(20).join(resolved, "Url", "inner") else: return testurls.join(resolved, "Url", "inner") - assert_gpu_and_cpu_are_equal_collect(do_join, conf={'spark.sql.autoBroadcastJoinThreshold': '-1'}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf={ + 'spark.sql.autoBroadcastJoinThreshold': '-1', + kudo_enabled_conf_key: kudo_enabled + }) # Because we disable ShuffleExchangeExec in some cases we need to 
allow it to not be on the GPU # and we do the result sorting in python to avoid that shuffle also being off the GPU @@ -711,7 +830,8 @@ def do_join(spark): @pytest.mark.parametrize('join_type', ['Left', 'Right', 'Inner', 'LeftSemi', 'LeftAnti'], ids=idfn) @pytest.mark.parametrize('cache_side', ['cache_left', 'cache_right'], ids=idfn) @pytest.mark.parametrize('cpu_side', ['cache', 'not_cache'], ids=idfn) -def test_half_cache_join(join_type, cache_side, cpu_side): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_half_cache_join(join_type, cache_side, cpu_side, kudo_enabled): left_gen = [('a', SetValuesGen(LongType(), range(500))), ('b', IntegerGen())] right_gen = [('r_a', SetValuesGen(LongType(), range(500))), ('c', LongGen())] def do_join(spark): @@ -743,46 +863,56 @@ def do_join(spark): # Even though Spark does not know the size of an RDD input so it will not do a broadcast join unless # we tell it to, this is just to be safe - assert_gpu_and_cpu_are_equal_collect(do_join, {'spark.sql.autoBroadcastJoinThreshold': '1'}) + assert_gpu_and_cpu_are_equal_collect(do_join, { + 'spark.sql.autoBroadcastJoinThreshold': '1', + kudo_enabled_conf_key: kudo_enabled + }) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_sortmerge_join_struct_as_key(data_gen, join_type): +def test_sortmerge_join_struct_as_key(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) return left.join(right, left.a == right.r_a, join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf=_sortmerge_join_conf) + conf = copy_and_update(_sortmerge_join_conf, {kudo_enabled_conf_key: kudo_enabled}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_sortmerge_join_struct_mixed_key(data_gen, join_type): +def test_sortmerge_join_struct_mixed_key(data_gen, join_type, kudo_enabled): def do_join(spark): left = two_col_df(spark, data_gen, int_gen, length=500) right = two_col_df(spark, data_gen, int_gen, length=500) return left.join(right, (left.a == right.a) & (left.b == right.b), join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf=_sortmerge_join_conf) + conf = copy_and_update(_sortmerge_join_conf, {kudo_enabled_conf_key: kudo_enabled}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn)
@allow_non_gpu(*non_utc_allow) -def test_sortmerge_join_struct_mixed_key_with_null_filter(data_gen, join_type): +def test_sortmerge_join_struct_mixed_key_with_null_filter(data_gen, join_type, kudo_enabled): def do_join(spark): left = two_col_df(spark, data_gen, int_gen, length=500) right = two_col_df(spark, data_gen, int_gen, length=500) return left.join(right, (left.a == right.a) & (left.b == right.b), join_type) # Disable constraintPropagation to test null filter on built table with nullable structures. - conf = {'spark.sql.constraintPropagation.enabled': 'false', **_sortmerge_join_conf} + conf = {'spark.sql.constraintPropagation.enabled': 'false', + kudo_enabled_conf_key: kudo_enabled, + **_sortmerge_join_conf} assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 @@ -790,25 +920,27 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_broadcast_join_right_struct_as_key(data_gen, join_type): +def test_broadcast_join_right_struct_as_key(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) return left.join(broadcast(right), left.a == right.r_a, join_type) - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf = {kudo_enabled_conf_key: kudo_enabled}) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_broadcast_join_right_struct_mixed_key(data_gen, join_type): +def test_broadcast_join_right_struct_mixed_key(data_gen, join_type, kudo_enabled): def do_join(spark): left = two_col_df(spark, data_gen, int_gen, length=500) right = two_col_df(spark, data_gen, int_gen, length=250) return left.join(broadcast(right), (left.a == right.a) & (left.b == right.b), join_type) - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf = {kudo_enabled_conf_key: kudo_enabled}) # local sort because of https://github.com/NVIDIA/spark-rapids/issues/84 # After 3.1.0 is the min spark version we can drop this @@ -816,11 +948,14 @@ def do_join(spark): @pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/2140') @pytest.mark.parametrize('data_gen', [basic_struct_gen_with_floats], ids=idfn) @pytest.mark.parametrize('join_type', ['Inner', 'Left', 'Right', 'Cross', 'LeftSemi', 'LeftAnti'], ids=idfn) -def test_sortmerge_join_struct_with_floats_key(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_sortmerge_join_struct_with_floats_key(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 500, 250) return left.join(right, left.a == right.r_a, join_type) - assert_gpu_and_cpu_are_equal_collect(do_join, conf=_sortmerge_join_conf) + conf = copy_and_update(_sortmerge_join_conf, + {kudo_enabled_conf_key: kudo_enabled}) + 
assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf) @allow_non_gpu('SortMergeJoinExec', 'SortExec', 'NormalizeNaNAndZero', 'CreateNamedStruct', 'GetStructField', 'Literal', 'If', 'IsNull', 'ShuffleExchangeExec', 'HashPartitioning', @@ -828,15 +963,19 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.parametrize('data_gen', struct_gens, ids=idfn) @pytest.mark.parametrize('join_type', ['FullOuter'], ids=idfn) -def test_sortmerge_join_struct_as_key_fallback(data_gen, join_type): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_sortmerge_join_struct_as_key_fallback(data_gen, join_type, kudo_enabled): def do_join(spark): left, right = create_df(spark, data_gen, 500, 500) return left.join(right, left.a == right.r_a, join_type) - assert_gpu_fallback_collect(do_join, 'SortMergeJoinExec', conf=_sortmerge_join_conf) + conf = copy_and_update(_sortmerge_join_conf, + {kudo_enabled_conf_key: kudo_enabled}) + assert_gpu_fallback_collect(do_join, 'SortMergeJoinExec', conf=conf) # Regression test for https://github.com/NVIDIA/spark-rapids/issues/3775 @ignore_order(local=True) -def test_struct_self_join(spark_tmp_table_factory): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_struct_self_join(spark_tmp_table_factory, kudo_enabled): def do_join(spark): data = [ (("Adam ", "", "Green"), "1", "M", 1000), @@ -863,7 +1002,7 @@ def do_join(spark): resultdf.createOrReplaceTempView(resultdf_name) return spark.sql("select a.* from {} a, {} b where a.name=b.name".format( resultdf_name, resultdf_name)) - assert_gpu_and_cpu_are_equal_collect(do_join) + assert_gpu_and_cpu_are_equal_collect(do_join, conf = {kudo_enabled_conf_key: kudo_enabled}) # ExistenceJoin occurs in the context of existential subqueries (which is rewritten to SemiJoin) if # there is an additional condition that may qualify left records even though they don't have @@ -883,7 +1022,9 @@ def do_join(spark): ]) @pytest.mark.parametrize('conditionalJoin', [False, True], ids=['ast:off', 'ast:on']) @pytest.mark.parametrize('forceBroadcastHashJoin', [False, True], ids=['broadcastHJ:off', 'broadcastHJ:on']) -def test_existence_join(numComplementsToExists, aqeEnabled, conditionalJoin, forceBroadcastHashJoin, spark_tmp_table_factory): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_existence_join(numComplementsToExists, aqeEnabled, conditionalJoin, + forceBroadcastHashJoin, spark_tmp_table_factory, kudo_enabled): leftTable = spark_tmp_table_factory.get() rightTable = spark_tmp_table_factory.get() def do_join(spark): @@ -933,12 +1074,14 @@ def do_join(spark): assert_cpu_and_gpu_are_equal_collect_with_capture(do_join, existenceJoinRegex, conf={ "spark.sql.adaptive.enabled": aqeEnabled, - "spark.sql.autoBroadcastJoinThreshold": bhjThreshold + "spark.sql.autoBroadcastJoinThreshold": bhjThreshold, + kudo_enabled_conf_key: kudo_enabled }) @ignore_order @pytest.mark.parametrize('aqeEnabled', [True, False], ids=['aqe:on', 'aqe:off']) -def test_existence_join_in_broadcast_nested_loop_join(spark_tmp_table_factory, aqeEnabled): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_existence_join_in_broadcast_nested_loop_join(spark_tmp_table_factory, aqeEnabled, kudo_enabled): left_table_name = spark_tmp_table_factory.get() right_table_name = spark_tmp_table_factory.get() @@ -958,11 +1101,13 @@ def do_join(spark): capture_regexp = r"GpuBroadcastNestedLoopJoin ExistenceJoin\(exists#[0-9]+\)," 
assert_cpu_and_gpu_are_equal_collect_with_capture(do_join, capture_regexp, - conf={"spark.sql.adaptive.enabled": aqeEnabled}) + conf={"spark.sql.adaptive.enabled": aqeEnabled, + kudo_enabled_conf_key: kudo_enabled}) @ignore_order @pytest.mark.parametrize('aqeEnabled', [True, False], ids=['aqe:on', 'aqe:off']) -def test_degenerate_broadcast_nested_loop_existence_join(spark_tmp_table_factory, aqeEnabled): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_degenerate_broadcast_nested_loop_existence_join(spark_tmp_table_factory, aqeEnabled, kudo_enabled): left_table_name = spark_tmp_table_factory.get() right_table_name = spark_tmp_table_factory.get() @@ -982,13 +1127,15 @@ def do_join(spark): capture_regexp = r"GpuBroadcastNestedLoopJoin ExistenceJoin\(exists#[0-9]+\)," assert_cpu_and_gpu_are_equal_collect_with_capture(do_join, capture_regexp, - conf={"spark.sql.adaptive.enabled": aqeEnabled}) + conf={"spark.sql.adaptive.enabled": aqeEnabled, + kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @pytest.mark.parametrize('data_gen', [StringGen(), IntegerGen()], ids=idfn) @pytest.mark.parametrize("aqe_enabled", [True, False], ids=idfn) @pytest.mark.parametrize("join_reorder_enabled", [True, False], ids=idfn) -def test_multi_table_hash_join(data_gen, aqe_enabled, join_reorder_enabled): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_multi_table_hash_join(data_gen, aqe_enabled, join_reorder_enabled, kudo_enabled): def do_join(spark): t1 = binary_op_df(spark, data_gen, length=1000) t2 = binary_op_df(spark, data_gen, length=800) @@ -999,14 +1146,15 @@ def do_join(spark): .join(t4, t3.a == t4.a, 'Inner') conf = copy_and_update(_hash_join_conf, { 'spark.sql.adaptive.enabled': aqe_enabled, - 'spark.rapids.sql.optimizer.joinReorder.enabled': join_reorder_enabled + 'spark.rapids.sql.optimizer.joinReorder.enabled': join_reorder_enabled, + kudo_enabled_conf_key: kudo_enabled }) assert_gpu_and_cpu_are_equal_collect(do_join, conf=conf) limited_integral_gens = [byte_gen, ShortGen(max_val=BYTE_MAX), IntegerGen(max_val=BYTE_MAX), LongGen(max_val=BYTE_MAX)] -def hash_join_different_key_integral_types(left_gen, right_gen, join_type): +def hash_join_different_key_integral_types(left_gen, right_gen, join_type, kudo_enabled): def do_join(spark): left = unary_op_df(spark, left_gen, length=50) right = unary_op_df(spark, right_gen, length=500) @@ -1014,7 +1162,8 @@ def do_join(spark): _all_conf = copy_and_update(_hash_join_conf, { "spark.rapids.sql.join.useShuffledSymmetricHashJoin": "true", "spark.rapids.sql.join.useShuffledAsymmetricHashJoin": "true", - "spark.rapids.sql.test.subPartitioning.enabled": True + "spark.rapids.sql.test.subPartitioning.enabled": True, + kudo_enabled_conf_key: kudo_enabled }) assert_gpu_and_cpu_are_equal_collect(do_join, conf=_all_conf) @@ -1023,24 +1172,27 @@ def do_join(spark): @pytest.mark.parametrize('left_gen', limited_integral_gens, ids=idfn) @pytest.mark.parametrize('right_gen', limited_integral_gens, ids=idfn) @pytest.mark.parametrize('join_type', all_non_sized_join_types, ids=idfn) -def test_hash_join_different_key_integral_types_non_sized(left_gen, right_gen, join_type): - hash_join_different_key_integral_types(left_gen, right_gen, join_type) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_join_different_key_integral_types_non_sized(left_gen, right_gen, join_type, kudo_enabled): + hash_join_different_key_integral_types(left_gen, right_gen, join_type, kudo_enabled) 
@validate_execs_in_gpu_plan('GpuShuffledSymmetricHashJoinExec') @ignore_order(local=True) @pytest.mark.parametrize('left_gen', limited_integral_gens, ids=idfn) @pytest.mark.parametrize('right_gen', limited_integral_gens, ids=idfn) @pytest.mark.parametrize('join_type', all_symmetric_sized_join_types, ids=idfn) -def test_hash_join_different_key_integral_types_symmetric(left_gen, right_gen, join_type): - hash_join_different_key_integral_types(left_gen, right_gen, join_type) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_join_different_key_integral_types_symmetric(left_gen, right_gen, join_type, kudo_enabled): + hash_join_different_key_integral_types(left_gen, right_gen, join_type, kudo_enabled) @validate_execs_in_gpu_plan('GpuShuffledAsymmetricHashJoinExec') @ignore_order(local=True) @pytest.mark.parametrize('left_gen', limited_integral_gens, ids=idfn) @pytest.mark.parametrize('right_gen', limited_integral_gens, ids=idfn) @pytest.mark.parametrize('join_type', all_asymmetric_sized_join_types, ids=idfn) -def test_hash_join_different_key_integral_types_asymmetric(left_gen, right_gen, join_type): - hash_join_different_key_integral_types(left_gen, right_gen, join_type) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_join_different_key_integral_types_asymmetric(left_gen, right_gen, join_type, kudo_enabled): + hash_join_different_key_integral_types(left_gen, right_gen, join_type, kudo_enabled) bloom_filter_confs = { @@ -1068,8 +1220,10 @@ def do_join(spark): @pytest.mark.parametrize("is_multi_column", [False, True], ids=idfn) @pytest.mark.skipif(is_databricks_runtime(), reason="https://github.com/NVIDIA/spark-rapids/issues/8921") @pytest.mark.skipif(is_before_spark_330(), reason="Bloom filter joins added in Spark 3.3.0") -def test_bloom_filter_join(batch_size, is_multi_column): - conf = {"spark.rapids.sql.batchSizeBytes": batch_size} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_bloom_filter_join(batch_size, is_multi_column, kudo_enabled): + conf = {"spark.rapids.sql.batchSizeBytes": batch_size, + kudo_enabled_conf_key: kudo_enabled} check_bloom_filter_join(confs=conf, expected_classes="GpuBloomFilterMightContain,GpuBloomFilterAggregate", is_multi_column=is_multi_column) @@ -1079,8 +1233,10 @@ def test_bloom_filter_join(batch_size, is_multi_column): @pytest.mark.parametrize("is_multi_column", [False, True], ids=idfn) @pytest.mark.skipif(is_databricks_runtime(), reason="https://github.com/NVIDIA/spark-rapids/issues/8921") @pytest.mark.skipif(is_before_spark_330(), reason="Bloom filter joins added in Spark 3.3.0") -def test_bloom_filter_join_cpu_probe(is_multi_column): - conf = {"spark.rapids.sql.expression.BloomFilterMightContain": "false"} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_bloom_filter_join_cpu_probe(is_multi_column, kudo_enabled): + conf = {"spark.rapids.sql.expression.BloomFilterMightContain": "false", + kudo_enabled_conf_key: kudo_enabled} check_bloom_filter_join(confs=conf, expected_classes="BloomFilterMightContain,GpuBloomFilterAggregate", is_multi_column=is_multi_column) @@ -1090,8 +1246,10 @@ def test_bloom_filter_join_cpu_probe(is_multi_column): @pytest.mark.parametrize("is_multi_column", [False, True], ids=idfn) @pytest.mark.skipif(is_databricks_runtime(), reason="https://github.com/NVIDIA/spark-rapids/issues/8921") @pytest.mark.skipif(is_before_spark_330(), reason="Bloom filter joins added in Spark 3.3.0") -def 
test_bloom_filter_join_cpu_build(is_multi_column): - conf = {"spark.rapids.sql.expression.BloomFilterAggregate": "false"} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_bloom_filter_join_cpu_build(is_multi_column, kudo_enabled): + conf = {"spark.rapids.sql.expression.BloomFilterAggregate": "false", + kudo_enabled_conf_key: kudo_enabled} check_bloom_filter_join(confs=conf, expected_classes="GpuBloomFilterMightContain,BloomFilterAggregate", is_multi_column=is_multi_column) @@ -1102,8 +1260,10 @@ def test_bloom_filter_join_cpu_build(is_multi_column): @pytest.mark.parametrize("is_multi_column", [False, True], ids=idfn) @pytest.mark.skipif(is_databricks_runtime(), reason="https://github.com/NVIDIA/spark-rapids/issues/8921") @pytest.mark.skipif(is_before_spark_330(), reason="Bloom filter joins added in Spark 3.3.0") -def test_bloom_filter_join_split_cpu_build(agg_replace_mode, is_multi_column): - conf = {"spark.rapids.sql.hashAgg.replaceMode": agg_replace_mode} +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_bloom_filter_join_split_cpu_build(agg_replace_mode, is_multi_column, kudo_enabled): + conf = {"spark.rapids.sql.hashAgg.replaceMode": agg_replace_mode, + kudo_enabled_conf_key: kudo_enabled} check_bloom_filter_join(confs=conf, expected_classes="GpuBloomFilterMightContain,BloomFilterAggregate,GpuBloomFilterAggregate", is_multi_column=is_multi_column) @@ -1111,14 +1271,16 @@ def test_bloom_filter_join_split_cpu_build(agg_replace_mode, is_multi_column): @ignore_order(local=True) @pytest.mark.skipif(is_databricks_runtime(), reason="https://github.com/NVIDIA/spark-rapids/issues/8921") @pytest.mark.skipif(is_before_spark_330(), reason="Bloom filter joins added in Spark 3.3.0") -def test_bloom_filter_join_with_merge_some_null_filters(spark_tmp_path): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_bloom_filter_join_with_merge_some_null_filters(spark_tmp_path, kudo_enabled): data_path1 = spark_tmp_path + "/BLOOM_JOIN_DATA1" data_path2 = spark_tmp_path + "/BLOOM_JOIN_DATA2" with_cpu_session(lambda spark: spark.range(100000).coalesce(1).write.parquet(data_path1)) with_cpu_session(lambda spark: spark.range(100000).withColumn("id2", col("id").cast("string"))\ .coalesce(1).write.parquet(data_path2)) confs = copy_and_update(bloom_filter_confs, - {"spark.sql.files.maxPartitionBytes": "1000"}) + {"spark.sql.files.maxPartitionBytes": "1000", + kudo_enabled_conf_key: kudo_enabled}) def do_join(spark): left = spark.read.parquet(data_path1) right = spark.read.parquet(data_path2) @@ -1128,7 +1290,8 @@ def do_join(spark): @ignore_order(local=True) @pytest.mark.skipif(is_databricks_runtime(), reason="https://github.com/NVIDIA/spark-rapids/issues/8921") @pytest.mark.skipif(is_before_spark_330(), reason="Bloom filter joins added in Spark 3.3.0") -def test_bloom_filter_join_with_merge_all_null_filters(spark_tmp_path): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_bloom_filter_join_with_merge_all_null_filters(spark_tmp_path, kudo_enabled): data_path1 = spark_tmp_path + "/BLOOM_JOIN_DATA1" data_path2 = spark_tmp_path + "/BLOOM_JOIN_DATA2" with_cpu_session(lambda spark: spark.range(100000).write.parquet(data_path1)) @@ -1138,13 +1301,15 @@ def do_join(spark): left = spark.read.parquet(data_path1) right = spark.read.parquet(data_path2) return right.filter("cast(id2 as bigint) % 3 = 4").join(left, left.id == right.id, "inner") - assert_gpu_and_cpu_are_equal_collect(do_join, 
bloom_filter_confs) + conf = copy_and_update(bloom_filter_confs, {kudo_enabled_conf_key: kudo_enabled}) + assert_gpu_and_cpu_are_equal_collect(do_join, conf) @ignore_order(local=True) @allow_non_gpu("ProjectExec", "FilterExec", "BroadcastHashJoinExec", "ColumnarToRowExec", "BroadcastExchangeExec", "BatchScanExec") @pytest.mark.parametrize("disable_build", [True, False]) -def test_broadcast_hash_join_fix_fallback_by_inputfile(spark_tmp_path, disable_build): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_hash_join_fix_fallback_by_inputfile(spark_tmp_path, disable_build, kudo_enabled): data_path_parquet = spark_tmp_path + "/parquet" data_path_orc = spark_tmp_path + "/orc" # The smaller one (orc) will be the build side (a broadcast) @@ -1174,13 +1339,15 @@ def do_join(spark): do_join, conf={"spark.sql.autoBroadcastJoinThreshold": "10M", "spark.sql.sources.useV1SourceList": "", - "spark.rapids.sql.input." + scan_name: False}) + "spark.rapids.sql.input." + scan_name: False, + kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @allow_non_gpu("ProjectExec", "BroadcastNestedLoopJoinExec", "ColumnarToRowExec", "BroadcastExchangeExec", "BatchScanExec") @pytest.mark.parametrize("disable_build", [True, False]) -def test_broadcast_nested_join_fix_fallback_by_inputfile(spark_tmp_path, disable_build): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_broadcast_nested_join_fix_fallback_by_inputfile(spark_tmp_path, disable_build, kudo_enabled): data_path_parquet = spark_tmp_path + "/parquet" data_path_orc = spark_tmp_path + "/orc" # The smaller one (orc) will be the build side (a broadcast) @@ -1209,14 +1376,17 @@ def do_join(spark): do_join, conf={"spark.sql.autoBroadcastJoinThreshold": "-1", "spark.sql.sources.useV1SourceList": "", - "spark.rapids.sql.input." + scan_name: False}) + "spark.rapids.sql.input." 
+ scan_name: False, + kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) @pytest.mark.parametrize("join_type", ["Inner", "LeftOuter", "RightOuter"], ids=idfn) @pytest.mark.parametrize("batch_size", ["500", "1g"], ids=idfn) -def test_distinct_join(join_type, batch_size): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_distinct_join(join_type, batch_size, kudo_enabled): join_conf = { - "spark.rapids.sql.batchSizeBytes": batch_size + "spark.rapids.sql.batchSizeBytes": batch_size, + kudo_enabled_conf_key: kudo_enabled } def do_join(spark): left_df = spark.range(1024).withColumn("x", f.col("id") + 1) @@ -1230,13 +1400,15 @@ def do_join(spark): @pytest.mark.parametrize("is_right_host_shuffle", [False, True], ids=idfn) @pytest.mark.parametrize("is_left_smaller", [False, True], ids=idfn) @pytest.mark.parametrize("batch_size", ["1024", "1g"], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) def test_sized_join(join_type, is_left_host_shuffle, is_right_host_shuffle, - is_left_smaller, batch_size): + is_left_smaller, batch_size, kudo_enabled): join_conf = { "spark.rapids.sql.join.useShuffledSymmetricHashJoin": "true", "spark.rapids.sql.join.useShuffledAsymmetricHashJoin": "true", "spark.sql.autoBroadcastJoinThreshold": "1", - "spark.rapids.sql.batchSizeBytes": batch_size + "spark.rapids.sql.batchSizeBytes": batch_size, + kudo_enabled_conf_key: kudo_enabled } left_size, right_size = (2048, 1024) if is_left_smaller else (1024, 2048) def do_join(spark): @@ -1266,7 +1438,8 @@ def do_join(spark): @pytest.mark.parametrize("is_left_smaller", [False, True], ids=idfn) @pytest.mark.parametrize("is_ast_supported", [False, True], ids=idfn) @pytest.mark.parametrize("batch_size", ["1024", "1g"], ids=idfn) -def test_sized_join_conditional(join_type, is_ast_supported, is_left_smaller, batch_size): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_sized_join_conditional(join_type, is_ast_supported, is_left_smaller, batch_size, kudo_enabled): if join_type != "Inner" and not is_ast_supported: pytest.skip("Only inner joins support a non-AST condition") join_conf = { @@ -1274,7 +1447,8 @@ def test_sized_join_conditional(join_type, is_ast_supported, is_left_smaller, ba "spark.rapids.sql.join.useShuffledAsymmetricHashJoin": "true", "spark.rapids.sql.join.use" "spark.sql.autoBroadcastJoinThreshold": "1", - "spark.rapids.sql.batchSizeBytes": batch_size + "spark.rapids.sql.batchSizeBytes": batch_size, + kudo_enabled_conf_key: kudo_enabled } left_size, right_size = (2048, 1024) if is_left_smaller else (1024, 2048) def do_join(spark): @@ -1300,13 +1474,15 @@ def do_join(spark): @pytest.mark.parametrize("is_left_replicated", [False, True], ids=idfn) @pytest.mark.parametrize("is_conditional", [False, True], ids=idfn) @pytest.mark.parametrize("is_outer_side_small", [False, True], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) def test_sized_join_high_key_replication(join_type, is_left_replicated, is_conditional, - is_outer_side_small): + is_outer_side_small, kudo_enabled): join_conf = { "spark.rapids.sql.join.useShuffledSymmetricHashJoin": "true", "spark.rapids.sql.join.useShuffledAsymmetricHashJoin": "true", "spark.rapids.sql.join.use" - "spark.sql.autoBroadcastJoinThreshold": "1" + "spark.sql.autoBroadcastJoinThreshold": "1", + kudo_enabled_conf_key: kudo_enabled } left_size, right_size = (30000, 40000) left_key_gen, right_key_gen = ( diff --git 
a/integration_tests/src/main/python/json_matrix_test.py b/integration_tests/src/main/python/json_matrix_test.py index 136a4b041f8..50fbe9745dc 100644 --- a/integration_tests/src/main/python/json_matrix_test.py +++ b/integration_tests/src/main/python/json_matrix_test.py @@ -66,6 +66,7 @@ def read_json_as_text(spark, data_path, column_name): WITH_COMMENTS_FILE = "withComments.json" WITH_COMMENTS_SCHEMA = StructType([StructField("str", StringType())]) +WITH_COMMENTS_MAP_SCHEMA = MapType(StringType(), StringType()) @allow_non_gpu('FileSourceScanExec') @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) @@ -86,6 +87,14 @@ def test_from_json_allow_comments_on(std_input_path): 'JsonToStructs', conf =_enable_json_to_structs_conf) +@allow_non_gpu(TEXT_INPUT_EXEC, 'ProjectExec') +def test_from_json_allow_comments_on_map(std_input_path): + schema = WITH_COMMENTS_MAP_SCHEMA + assert_gpu_fallback_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_COMMENTS_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowComments': "true"})), + 'JsonToStructs', + conf =_enable_json_to_structs_conf) + # Off is the default so it really needs to work @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) def test_scan_json_allow_comments_off(std_input_path, read_func, spark_tmp_table_factory): @@ -104,6 +113,14 @@ def test_from_json_allow_comments_off(std_input_path): lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_COMMENTS_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowComments': "false"})), conf =_enable_json_to_structs_conf) +# Off is the default so it really needs to work +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_allow_comments_off_map(std_input_path): + schema = WITH_COMMENTS_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_COMMENTS_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowComments': "false"})), + conf =_enable_json_to_structs_conf) + # Off is the default so it really needs to work @allow_non_gpu(TEXT_INPUT_EXEC) def test_get_json_object_allow_comments_off(std_input_path): @@ -119,24 +136,30 @@ def test_json_tuple_allow_comments_off(std_input_path): WITH_SQ_FILE = "withSingleQuotes.json" WITH_SQ_SCHEMA = StructType([StructField("str", StringType())]) +WITH_SQ_MAP_SCHEMA = MapType(StringType(), StringType()) @allow_non_gpu('FileSourceScanExec') @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) def test_scan_json_allow_single_quotes_off(std_input_path, read_func, spark_tmp_table_factory): - assert_gpu_fallback_collect( + assert_gpu_and_cpu_are_equal_collect( read_func(std_input_path + '/' + WITH_SQ_FILE, WITH_SQ_SCHEMA, spark_tmp_table_factory, {"allowSingleQuotes": "false"}), - 'FileSourceScanExec', conf=_enable_all_types_json_scan_conf) @allow_non_gpu('ProjectExec', TEXT_INPUT_EXEC) def test_from_json_allow_single_quotes_off(std_input_path): schema = WITH_SQ_SCHEMA - assert_gpu_fallback_collect( + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_SQ_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowSingleQuotes': "false"})), + conf =_enable_json_to_structs_conf) + +@allow_non_gpu('ProjectExec', TEXT_INPUT_EXEC) +def test_from_json_allow_single_quotes_off_map(std_input_path): + schema = 
WITH_SQ_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_SQ_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowSingleQuotes': "false"})), - 'JsonToStructs', conf =_enable_json_to_structs_conf) # On is the default so it really needs to work @@ -157,6 +180,14 @@ def test_from_json_allow_single_quotes_on(std_input_path): lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_SQ_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowSingleQuotes': "true"})), conf =_enable_json_to_structs_conf) +# On is the default so it really needs to work +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_allow_single_quotes_on_map(std_input_path): + schema = WITH_SQ_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_SQ_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowSingleQuotes': "true"})), + conf =_enable_json_to_structs_conf) + # On is the default so it really needs to work @allow_non_gpu(TEXT_INPUT_EXEC) def test_get_json_object_allow_single_quotes_on(std_input_path): @@ -172,6 +203,7 @@ def test_json_tuple_allow_single_quotes_on(std_input_path): WITH_UNQUOTE_FIELD_NAMES_FILE = "withUnquotedFieldNames.json" WITH_UNQUOTE_FIELD_NAMES_SCHEMA = StructType([StructField("str", StringType())]) +WITH_UNQUOTE_FIELD_NAMES_MAP_SCHEMA = MapType(StringType(), StringType()) @allow_non_gpu('FileSourceScanExec') @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) @@ -192,6 +224,14 @@ def test_from_json_allow_unquoted_field_names_on(std_input_path): 'JsonToStructs', conf =_enable_json_to_structs_conf) +@allow_non_gpu('ProjectExec', TEXT_INPUT_EXEC) +def test_from_json_allow_unquoted_field_names_on_map(std_input_path): + schema = WITH_UNQUOTE_FIELD_NAMES_MAP_SCHEMA + assert_gpu_fallback_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_UNQUOTE_FIELD_NAMES_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowUnquotedFieldNames': "true"})), + 'JsonToStructs', + conf =_enable_json_to_structs_conf) + # Off is the default so it really needs to work @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) def test_scan_json_allow_unquoted_field_names_off(std_input_path, read_func, spark_tmp_table_factory): @@ -204,12 +244,20 @@ def test_scan_json_allow_unquoted_field_names_off(std_input_path, read_func, spa # Off is the default so it really needs to work @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 -def test_from_json_allow_unquoted_field_names_on(std_input_path): +def test_from_json_allow_unquoted_field_names_off(std_input_path): schema = WITH_UNQUOTE_FIELD_NAMES_SCHEMA assert_gpu_and_cpu_are_equal_collect( lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_UNQUOTE_FIELD_NAMES_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowUnquotedFieldNames': "false"})), conf =_enable_json_to_structs_conf) +# Off is the default so it really needs to work +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_allow_unquoted_field_names_off_map(std_input_path): + schema = WITH_UNQUOTE_FIELD_NAMES_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, 
std_input_path + '/' + WITH_UNQUOTE_FIELD_NAMES_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {'allowUnquotedFieldNames': "false"})), + conf =_enable_json_to_structs_conf) + # Off is the default so it really needs to work @allow_non_gpu(TEXT_INPUT_EXEC) def test_get_json_object_allow_unquoted_field_names_off(std_input_path): @@ -228,6 +276,7 @@ def test_json_tuple_allow_unquoted_field_names_off(std_input_path): StructField("int", IntegerType()), StructField("float", FloatType()), StructField("decimal", DecimalType(10, 3))]) +WITH_NUMERIC_LEAD_ZEROS_MAP_SCHEMA = MapType(StringType(), StringType()) @approximate_float() @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) @@ -247,6 +296,13 @@ def test_from_json_allow_numeric_leading_zeros_on(std_input_path): lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_NUMERIC_LEAD_ZEROS_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowNumericLeadingZeros": "true"})), conf =_enable_json_to_structs_conf) +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_allow_numeric_leading_zeros_on_map(std_input_path): + schema = WITH_NUMERIC_LEAD_ZEROS_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_NUMERIC_LEAD_ZEROS_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowNumericLeadingZeros": "true"})), + conf =_enable_json_to_structs_conf) + # Off is the default so it really needs to work @approximate_float() @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) @@ -268,6 +324,14 @@ def test_from_json_allow_numeric_leading_zeros_off(std_input_path): lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_NUMERIC_LEAD_ZEROS_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowNumericLeadingZeros": "false"})), conf =_enable_json_to_structs_conf) +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_allow_numeric_leading_zeros_off_map(std_input_path): + schema = WITH_NUMERIC_LEAD_ZEROS_MAP_SCHEMA + + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_NUMERIC_LEAD_ZEROS_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowNumericLeadingZeros": "false"})), + conf =_enable_json_to_structs_conf) + # Off is the default so it really needs to work @allow_non_gpu(TEXT_INPUT_EXEC) def test_get_json_object_allow_numeric_leading_zeros_off(std_input_path): @@ -286,6 +350,7 @@ def test_json_tuple_allow_numeric_leading_zeros_off(std_input_path): WITH_NONNUMERIC_NUMBERS_SCHEMA = StructType([ StructField("float", FloatType()), StructField("double", DoubleType())]) +WITH_NONNUMERIC_NUMBERS_MAP_SCHEMA = MapType(StringType(), StringType()) @approximate_float() @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) @@ -307,6 +372,14 @@ def test_from_json_allow_nonnumeric_numbers_off(std_input_path): lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_NONNUMERIC_NUMBERS_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowNonNumericNumbers": "false"})), conf =_enable_json_to_structs_conf) +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +@pytest.mark.xfail(condition = is_before_spark_330(), reason = 
'https://github.com/NVIDIA/spark-rapids/issues/10493') +def test_from_json_allow_nonnumeric_numbers_off_map(std_input_path): + schema = WITH_NONNUMERIC_NUMBERS_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_NONNUMERIC_NUMBERS_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowNonNumericNumbers": "false"})), + conf =_enable_json_to_structs_conf) + # On is the default for scan so it really needs to work @approximate_float() @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) @@ -329,6 +402,14 @@ def test_from_json_allow_nonnumeric_numbers_on(std_input_path): lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_NONNUMERIC_NUMBERS_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowNonNumericNumbers": "true"})), conf =_enable_json_to_structs_conf) +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +@pytest.mark.xfail(condition = is_before_spark_330(), reason = 'https://github.com/NVIDIA/spark-rapids/issues/10493') +def test_from_json_allow_nonnumeric_numbers_on_map(std_input_path): + schema = WITH_NONNUMERIC_NUMBERS_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_NONNUMERIC_NUMBERS_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowNonNumericNumbers": "true"})), + conf =_enable_json_to_structs_conf) + # Off is the default for get_json_object so we want this to work @allow_non_gpu(TEXT_INPUT_EXEC) def test_get_json_object_allow_nonnumeric_numbers_off(std_input_path): @@ -346,6 +427,7 @@ def test_json_tuple_allow_nonnumeric_numbers_off(std_input_path): WITH_BS_ESC_FILE = "withBackslashEscapingAnyCharacter.json" WITH_BS_ESC_SCHEMA = StructType([ StructField("str", StringType())]) +WITH_BS_ESC_MAP_SCHEMA = MapType(StringType(), StringType()) # Off is the default for scan so it really needs to work @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same @@ -365,6 +447,14 @@ def test_from_json_allow_backslash_escape_any_off(std_input_path): lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_BS_ESC_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowBackslashEscapingAnyCharacter": "false"})), conf =_enable_json_to_structs_conf) +# Off is the default for from_json so it really needs to work +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_allow_backslash_escape_any_off_map(std_input_path): + schema = WITH_BS_ESC_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_BS_ESC_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowBackslashEscapingAnyCharacter": "false"})), + conf =_enable_json_to_structs_conf) + @allow_non_gpu('FileSourceScanExec') @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) def test_scan_json_allow_backslash_escape_any_on(std_input_path, read_func, spark_tmp_table_factory): @@ -384,6 +474,14 @@ def test_from_json_allow_backslash_escape_any_on(std_input_path): 'JsonToStructs', conf =_enable_json_to_structs_conf) +@allow_non_gpu(TEXT_INPUT_EXEC, 'ProjectExec') +def test_from_json_allow_backslash_escape_any_on_map(std_input_path): + schema = WITH_BS_ESC_MAP_SCHEMA + assert_gpu_fallback_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_BS_ESC_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowBackslashEscapingAnyCharacter": "true"})), + 'JsonToStructs', + conf =_enable_json_to_structs_conf) + # Off is the default for get_json_object so we want this to work @allow_non_gpu(TEXT_INPUT_EXEC) def test_get_json_object_allow_backslash_escape_any_off(std_input_path): @@ -400,6 +498,7 @@ def test_json_tuple_allow_backslash_escape_any_off(std_input_path): WITH_UNQUOTED_CONTROL_FILE = "withUnquotedControlChars.json" WITH_UNQUOTED_CONTROL_SCHEMA = StructType([ StructField("str", StringType())]) +WITH_UNQUOTED_CONTROL_MAP_SCHEMA = MapType(StringType(), StringType()) # Off is the default for scan so it really needs to work @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) @@ -419,6 +518,14 @@ def test_from_json_allow_unquoted_control_chars_off(std_input_path): lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_UNQUOTED_CONTROL_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowUnquotedControlChars": "false"})), conf =_enable_json_to_structs_conf) +# Off is the default for from_json so it really needs to work +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_allow_unquoted_control_chars_off_map(std_input_path): + schema = WITH_UNQUOTED_CONTROL_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_UNQUOTED_CONTROL_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowUnquotedControlChars": "false"})), + conf =_enable_json_to_structs_conf) + @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql]) def test_scan_json_allow_unquoted_control_chars_on(std_input_path, read_func, spark_tmp_table_factory): assert_gpu_and_cpu_are_equal_collect( @@ -435,6 +542,13 @@ def test_from_json_allow_unquoted_control_chars_on(std_input_path): lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_UNQUOTED_CONTROL_FILE, 
"json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowUnquotedControlChars": "true"})), conf =_enable_json_to_structs_conf) +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_allow_unquoted_control_chars_on_map(std_input_path): + schema = WITH_UNQUOTED_CONTROL_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_UNQUOTED_CONTROL_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"allowUnquotedControlChars": "true"})), + conf =_enable_json_to_structs_conf) + # On is the default for get_json_object so we want this to work @allow_non_gpu(TEXT_INPUT_EXEC) def test_get_json_object_allow_unquoted_control_chars_on(std_input_path): @@ -453,6 +567,7 @@ def test_json_tuple_allow_unquoted_control_chars_on(std_input_path): WITH_DEC_LOCALE_NON_ARIBIC_FILE = "decimal_locale_formatted_strings_non_aribic.json" WITH_DEC_LOCALE_SCHEMA = StructType([ StructField("data", DecimalType(10, 5))]) +WITH_DEC_LOCALE_MAP_SCHEMA = MapType(StringType(), StringType()) NON_US_DEC_LOCALES=["it-CH","ko-KR","h-TH-x-lvariant-TH","ru-RU","de-DE","iw-IL","hi-IN","ar-QA","zh-CN","ko-KR"] # US is the default locale so we kind of what it to work @@ -493,6 +608,23 @@ def test_from_json_dec_locale(std_input_path, locale): 'JsonToStructs', conf =_enable_json_to_structs_conf) +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_dec_locale_US_map(std_input_path): + schema = WITH_DEC_LOCALE_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_DEC_LOCALE_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema)), + conf =_enable_json_to_structs_conf) + +# This will not fall back because we only support map +# and locals impact decimal parsing, not strings. +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +@pytest.mark.parametrize('locale', NON_US_DEC_LOCALES) +def test_from_json_dec_locale_map(std_input_path, locale): + schema = WITH_DEC_LOCALE_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_DEC_LOCALE_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"locale": locale})), + conf =_enable_json_to_structs_conf) + #There is no way to set a locale for these, and it really should not matter @allow_non_gpu(TEXT_INPUT_EXEC) def test_get_json_object_dec_locale(std_input_path): @@ -551,6 +683,25 @@ def test_from_json_dec_locale_non_aribic(std_input_path, locale): 'JsonToStructs', conf =_enable_json_to_structs_conf) +# This will not fail because we only support map +# and decimal is needed to trigger the translation issue +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_dec_locale_US_non_aribic_map(std_input_path): + schema = WITH_DEC_LOCALE_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_DEC_LOCALE_NON_ARIBIC_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema)), + conf =_enable_json_to_structs_conf) + +# This will not fall back because we only support map +# and locals impact decimal parsing, not strings. 
+@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +@pytest.mark.parametrize('locale', NON_US_DEC_LOCALES) +def test_from_json_dec_locale_non_aribic_map(std_input_path, locale): + schema = WITH_DEC_LOCALE_MAP_SCHEMA + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + WITH_DEC_LOCALE_NON_ARIBIC_FILE, "json").select(f.col('json'), f.from_json(f.col('json'), schema, {"locale": locale})), + conf =_enable_json_to_structs_conf) + #There is no way to set a locale for these, and it really should not matter @allow_non_gpu(TEXT_INPUT_EXEC) def test_get_json_object_dec_locale_non_aribic(std_input_path): @@ -577,6 +728,7 @@ def test_json_tuple_dec_locale_non_aribic(std_input_path): "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -589,6 +741,34 @@ def test_json_tuple_dec_locale_non_aribic(std_input_path): COMMON_SCAN_TEST_FILES = COMMON_TEST_FILES + [ "scan_emtpy_lines.json"] + +@pytest.mark.parametrize('input_file', [ + "int_formatted.json", + "float_formatted.json", + "sci_formatted.json", + "int_formatted_strings.json", + "float_formatted_strings.json", + "sci_formatted_strings.json", + "decimal_locale_formatted_strings.json", + "single_quoted_strings.json", + "boolean_formatted.json", + "int_array_formatted.json", + "int_struct_formatted.json", + "int_mixed_array_struct_formatted.json", + "bad_whitespace.json", + "escaped_strings.json", + "nested_escaped_strings.json", + "repeated_columns.json", # This works for maps, but not others. + "mixed_objects.json", + "timestamp_formatted_strings.json", + "timestamp_tz_formatted_strings.json"]) +@allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 +def test_from_json_map_string_string(std_input_path, input_file): + schema = MapType(StringType(), StringType()) + assert_gpu_and_cpu_are_equal_collect( + lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").select(f.col('json'), f.from_json(f.col('json'), schema)), + conf =_enable_json_to_structs_conf) + @pytest.mark.parametrize('input_file', COMMON_SCAN_TEST_FILES) @pytest.mark.parametrize('read_func', [read_json_df]) # we have done so many tests already that we don't need both read func. 
They are the same def test_scan_json_bytes(std_input_path, read_func, spark_tmp_table_factory, input_file): @@ -671,6 +851,7 @@ def test_from_json_longs(std_input_path, input_file): "invalid_ridealong_columns.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -702,6 +883,7 @@ def test_scan_json_decs(std_input_path, read_func, spark_tmp_table_factory, inpu "invalid_ridealong_columns.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -731,10 +913,11 @@ def test_from_json_decs(std_input_path, input_file, dt): "invalid_ridealong_columns.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15318')), "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(condition=is_spark_400_or_later(), reason='https://github.com/NVIDIA/spark-rapids/issues/11154')), "bad_whitespace.json", "escaped_strings.json", - pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10534')), + pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11632')), pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", "timestamp_formatted_strings.json", @@ -761,10 +944,11 @@ def test_scan_json_strings(std_input_path, read_func, spark_tmp_table_factory, i "invalid_ridealong_columns.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/15318')), "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", - pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10534')), + pytest.param("nested_escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11632')), pytest.param("repeated_columns.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11361')), "mixed_objects.json", "timestamp_formatted_strings.json", @@ -789,6 +973,7 @@ def test_from_json_strings(std_input_path, input_file): "invalid_ridealong_columns.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -818,6 +1003,7 @@ def test_get_json_object_formats(std_input_path, input_file): "invalid_ridealong_columns.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -857,6 +1043,7 @@ def test_get_json_object_child_formats(std_input_path, input_file): "invalid_ridealong_columns.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", pytest.param("escaped_strings.json", 
marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11386')), @@ -905,6 +1092,7 @@ def test_from_json_bools(std_input_path, input_file): "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -935,6 +1123,7 @@ def test_scan_json_floats(std_input_path, read_func, spark_tmp_table_factory, in "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -963,6 +1152,7 @@ def test_from_json_floats(std_input_path, input_file): "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -993,6 +1183,7 @@ def test_scan_json_doubles(std_input_path, read_func, spark_tmp_table_factory, i "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -1020,6 +1211,7 @@ def test_from_json_doubles(std_input_path, input_file): "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9664')), @@ -1032,7 +1224,9 @@ def test_from_json_doubles(std_input_path, input_file): @pytest.mark.parametrize('read_func', [read_json_df]) @allow_non_gpu(*non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_scan_json_corrected_dates(std_input_path, read_func, spark_tmp_table_factory, input_file): - conf = copy_and_update(_enable_all_types_json_scan_conf, {"spark.sql.legacy.timeParserPolicy": "CORRECTED"}) + conf = copy_and_update(_enable_all_types_json_scan_conf, + {"spark.sql.legacy.timeParserPolicy": "CORRECTED", + "spark.rapids.sql.json.read.datetime.enabled": "true"}) assert_gpu_and_cpu_are_equal_collect( read_func(std_input_path + '/' + input_file, StructType([StructField("data", DateType())]), @@ -1051,6 +1245,7 @@ def test_scan_json_corrected_dates(std_input_path, read_func, spark_tmp_table_fa "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", pytest.param("escaped_strings.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9664')), @@ -1062,7 +1257,9 @@ def test_scan_json_corrected_dates(std_input_path, read_func, spark_tmp_table_fa @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_corrected_dates(std_input_path, input_file): schema = StructType([StructField("data", DateType())]) - conf = copy_and_update(_enable_json_to_structs_conf, {"spark.sql.legacy.timeParserPolicy": "CORRECTED"}) + conf = copy_and_update(_enable_json_to_structs_conf, + {"spark.sql.legacy.timeParserPolicy": "CORRECTED", + "spark.rapids.sql.json.read.datetime.enabled": "true"}) assert_gpu_and_cpu_are_equal_collect( lambda spark : read_json_as_text(spark, 
std_input_path + '/' + input_file, "json").select(f.col('json'), f.from_json(f.col('json'), schema)), conf = conf) @@ -1079,6 +1276,7 @@ def test_from_json_corrected_dates(std_input_path, input_file): pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10535')), "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -1091,7 +1289,9 @@ def test_from_json_corrected_dates(std_input_path, input_file): @pytest.mark.parametrize('read_func', [read_json_df]) @allow_non_gpu(*non_utc_allow) def test_scan_json_corrected_timestamps(std_input_path, read_func, spark_tmp_table_factory, input_file): - conf = copy_and_update(_enable_all_types_json_scan_conf, {"spark.sql.legacy.timeParserPolicy": "CORRECTED"}) + conf = copy_and_update(_enable_all_types_json_scan_conf, + {"spark.sql.legacy.timeParserPolicy": "CORRECTED", + "spark.rapids.sql.json.read.datetime.enabled": "true"}) assert_gpu_and_cpu_are_equal_collect( read_func(std_input_path + '/' + input_file, StructType([StructField("data", TimestampType())]), @@ -1110,6 +1310,7 @@ def test_scan_json_corrected_timestamps(std_input_path, read_func, spark_tmp_tab pytest.param("boolean_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10535')), "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -1121,7 +1322,9 @@ def test_scan_json_corrected_timestamps(std_input_path, read_func, spark_tmp_tab @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) def test_from_json_corrected_timestamps(std_input_path, input_file): schema = StructType([StructField("data", TimestampType())]) - conf = copy_and_update(_enable_json_to_structs_conf, {"spark.sql.legacy.timeParserPolicy": "CORRECTED"}) + conf = copy_and_update(_enable_json_to_structs_conf, + {"spark.sql.legacy.timeParserPolicy": "CORRECTED", + "spark.rapids.sql.json.read.datetime.enabled": "true"}) assert_gpu_and_cpu_are_equal_collect( lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").select(f.col('json'), f.from_json(f.col('json'), schema)), conf = conf) @@ -1138,6 +1341,7 @@ def test_from_json_corrected_timestamps(std_input_path, input_file): "boolean_formatted.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10573')), "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11491')), "bad_whitespace.json", "escaped_strings.json", @@ -1167,6 +1371,7 @@ def test_scan_json_long_arrays(std_input_path, read_func, spark_tmp_table_factor "boolean_formatted.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10573')), "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11491')), "bad_whitespace.json", "escaped_strings.json", @@ -1194,6 +1399,7 @@ def test_from_json_long_arrays(std_input_path, input_file): "boolean_formatted.json", 
pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10574')), "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -1223,6 +1429,7 @@ def test_scan_json_string_arrays(std_input_path, read_func, spark_tmp_table_fact "boolean_formatted.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10574')), "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -1249,7 +1456,11 @@ def test_from_json_string_arrays(std_input_path, input_file): "single_quoted_strings.json", "boolean_formatted.json", "int_array_formatted.json", - pytest.param("int_struct_formatted.json", marks=pytest.mark.xfail(condition=is_before_spark_342(),reason='https://github.com/NVIDIA/spark-rapids/issues/10588')), + "int_struct_formatted.json", + pytest.param("int_struct_formatted_problematic_rows.json", marks=pytest.mark.xfail( + condition=is_before_spark_342() or is_databricks_version_or_later(14, 3), + reason='Before Spark 3.4.2? https://github.com/NVIDIA/spark-rapids/issues/10588. ' + 'Databricks 14.3 or later? https://github.com/NVIDIA/spark-rapids/issues/11711.')), pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(condition=is_before_spark_342(),reason='https://github.com/NVIDIA/spark-rapids/issues/10588')), "bad_whitespace.json", "escaped_strings.json", @@ -1278,7 +1489,11 @@ def test_scan_json_long_structs(std_input_path, read_func, spark_tmp_table_facto "single_quoted_strings.json", "boolean_formatted.json", "int_array_formatted.json", - pytest.param("int_struct_formatted.json", marks=pytest.mark.xfail(condition=is_before_spark_342(),reason='https://github.com/NVIDIA/spark-rapids/issues/10588')), + "int_struct_formatted.json", + pytest.param("int_struct_formatted_problematic_rows.json", marks=pytest.mark.xfail( + condition=is_before_spark_342() or is_databricks_version_or_later(14, 3), + reason='Before Spark 3.4.2? https://github.com/NVIDIA/spark-rapids/issues/10588. ' + 'Databricks 14.3 or later? 
https://github.com/NVIDIA/spark-rapids/issues/11711.')), pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(condition=is_before_spark_342(),reason='https://github.com/NVIDIA/spark-rapids/issues/10588')), "bad_whitespace.json", "escaped_strings.json", @@ -1306,6 +1521,7 @@ def test_from_json_long_structs(std_input_path, input_file): "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -1335,6 +1551,7 @@ def test_scan_json_string_structs(std_input_path, read_func, spark_tmp_table_fac "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -1363,6 +1580,7 @@ def test_from_json_string_structs(std_input_path, input_file): "boolean_formatted.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10573')), # This does not fail on 38,0 "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11491')), "bad_whitespace.json", "escaped_strings.json", @@ -1392,6 +1610,7 @@ def test_scan_json_dec_arrays(std_input_path, read_func, spark_tmp_table_factory "boolean_formatted.json", pytest.param("int_array_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10573')), # This does not fail on 38,0 "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", pytest.param("int_mixed_array_struct_formatted.json", marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/11491')), "bad_whitespace.json", "escaped_strings.json", @@ -1419,6 +1638,7 @@ def test_from_json_dec_arrays(std_input_path, input_file, dt): "boolean_formatted.json", "int_array_formatted.json", "int_struct_formatted.json", + "int_struct_formatted_problematic_rows.json", "int_mixed_array_struct_formatted.json", "bad_whitespace.json", "escaped_strings.json", @@ -1445,10 +1665,13 @@ def test_scan_json_mixed_struct(std_input_path, read_func, spark_tmp_table_facto pytest.param("mixed_objects.json", "data STRUCT>>", marks=pytest.mark.xfail(condition=is_before_spark_330(), reason='https://github.com/NVIDIA/spark-rapids/issues/11390')), ("mixed_objects.json", "company STRUCT>>>>"), + ("mixed_objects.json", "MAP") ]) @allow_non_gpu(TEXT_INPUT_EXEC, *non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_from_json_mixed_corrected(std_input_path, input_file, schema): - conf = copy_and_update(_enable_json_to_structs_conf, {"spark.sql.legacy.timeParserPolicy": "CORRECTED"}) + conf = copy_and_update(_enable_json_to_structs_conf, + {"spark.sql.legacy.timeParserPolicy": "CORRECTED", + "spark.rapids.sql.json.read.datetime.enabled": "true"}) assert_gpu_and_cpu_are_equal_collect( lambda spark : read_json_as_text(spark, std_input_path + '/' + input_file, "json").selectExpr('json', "from_json(json, '" + schema + "') as parsed"), diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py index fe1d9064933..6e8165846e7 100644 --- a/integration_tests/src/main/python/json_test.py +++ b/integration_tests/src/main/python/json_test.py @@ -200,7 
+200,9 @@ def test_json_date_formats_round_trip(spark_tmp_path, date_format, v1_enabled_li gen = StructGen([('a', DateGen())], nullable=False) data_path = spark_tmp_path + '/JSON_DATA' schema = gen.data_type - updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + updated_conf = copy_and_update(_enable_all_types_conf, + {'spark.sql.sources.useV1SourceList': v1_enabled_list, + 'spark.rapids.sql.json.read.datetime.enabled': 'true'}) def create_test_data(spark): write = gen_df(spark, gen).write @@ -238,7 +240,9 @@ def create_test_data(spark): write.json(data_path) with_cpu_session(lambda spark: create_test_data(spark)) - updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + updated_conf = copy_and_update(_enable_all_types_conf, + {'spark.sql.sources.useV1SourceList': v1_enabled_list, + 'spark.rapids.sql.json.read.datetime.enabled': 'true'}) def do_read(spark): read = spark.read.schema(schema) @@ -280,7 +284,8 @@ def create_test_data(spark): updated_conf = copy_and_update(_enable_all_types_conf, { 'spark.sql.sources.useV1SourceList': v1_enabled_list, - 'spark.sql.timestampType': timestamp_type + 'spark.sql.timestampType': timestamp_type, + 'spark.rapids.sql.json.read.datetime.enabled': 'true' }) def do_read(spark): @@ -343,7 +348,8 @@ def test_basic_json_read(std_input_path, filename, schema, read_func, allow_non_ allow_numeric_leading_zeros, ansi_enabled, spark_tmp_table_factory, date_format): updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.ansi.enabled': ansi_enabled, - 'spark.sql.legacy.timeParserPolicy': 'CORRECTED'}) + 'spark.sql.legacy.timeParserPolicy': 'CORRECTED', + 'spark.rapids.sql.json.read.datetime.enabled': 'true'}) options = {"allowNonNumericNumbers": allow_non_numeric_numbers, "allowNumericLeadingZeros": allow_numeric_leading_zeros, } @@ -390,7 +396,8 @@ def test_basic_from_json(std_input_path, filename, schema, allow_non_numeric_num allow_numeric_leading_zeros, ansi_enabled, date_format): updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.ansi.enabled': ansi_enabled, - 'spark.sql.legacy.timeParserPolicy': 'CORRECTED'}) + 'spark.sql.legacy.timeParserPolicy': 'CORRECTED', + 'spark.rapids.sql.json.read.datetime.enabled': 'true'}) options = {"allowNonNumericNumbers": allow_non_numeric_numbers, "allowNumericLeadingZeros": allow_numeric_leading_zeros, } @@ -511,7 +518,8 @@ def test_read_optional_fields(spark_tmp_table_factory, std_input_path, read_func def test_json_read_valid_dates(std_input_path, filename, schema, read_func, ansi_enabled, time_parser_policy, spark_tmp_table_factory): updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.ansi.enabled': ansi_enabled, - 'spark.sql.legacy.timeParserPolicy': time_parser_policy}) + 'spark.sql.legacy.timeParserPolicy': time_parser_policy, + 'spark.rapids.sql.json.read.datetime.enabled': 'true'}) f = read_func(std_input_path + '/' + filename, schema, spark_tmp_table_factory, {}) if time_parser_policy == 'LEGACY' and ansi_enabled == 'true': assert_gpu_fallback_collect( @@ -546,7 +554,8 @@ def test_json_read_generated_dates(spark_tmp_table_factory, spark_tmp_path, date updated_conf = copy_and_update(_enable_all_types_conf, { 'spark.sql.ansi.enabled': ansi_enabled, - 'spark.sql.legacy.timeParserPolicy': 'CORRECTED'}) + 'spark.sql.legacy.timeParserPolicy': 'CORRECTED', + 'spark.rapids.sql.json.read.datetime.enabled': 'true'}) options = { 'allowNumericLeadingZeros': 
allow_numeric_leading_zeros } if date_format: @@ -572,7 +581,8 @@ def test_json_read_invalid_dates(std_input_path, filename, schema, read_func, an time_parser_policy, spark_tmp_table_factory): updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.ansi.enabled': ansi_enabled, - 'spark.sql.legacy.timeParserPolicy': time_parser_policy }) + 'spark.sql.legacy.timeParserPolicy': time_parser_policy, + 'spark.rapids.sql.json.read.datetime.enabled': 'true'}) options = { 'dateFormat': date_format } if date_format else {} f = read_func(std_input_path + '/' + filename, schema, spark_tmp_table_factory, options) if time_parser_policy == 'EXCEPTION': @@ -605,7 +615,8 @@ def test_json_read_valid_timestamps(std_input_path, filename, schema, read_func, spark_tmp_table_factory): updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.ansi.enabled': ansi_enabled, - 'spark.sql.legacy.timeParserPolicy': time_parser_policy}) + 'spark.sql.legacy.timeParserPolicy': time_parser_policy, + 'spark.rapids.sql.json.read.datetime.enabled': 'true'}) f = read_func(std_input_path + '/' + filename, schema, spark_tmp_table_factory, {}) assert_gpu_and_cpu_are_equal_collect(f, conf=updated_conf) @@ -668,6 +679,53 @@ def test_from_json_map(): .select(f.from_json(f.col('a'), 'MAP')), conf=_enable_all_types_conf) +@allow_non_gpu(*non_utc_allow) +def test_from_json_map_with_invalid(): + # The test here is working around some inconsistencies in how the keys are parsed for maps + # on the GPU the keys are dense, but on the CPU they are sparse + json_string_gen = StringGen(r'{"a": "[0-9]{0,5}"(, "b": "[A-Z]{0,5}")?}') \ + .with_special_pattern('', weight=50) \ + .with_special_pattern(' ', weight=50) \ + .with_special_pattern('null', weight=50) \ + .with_special_pattern('invalid', weight=50) \ + .with_special_pattern(r'{"a": "[0-9]{0,5}"', weight=50) \ + .with_special_pattern(r'{"a": "[0-9]{0,5}', weight=50) \ + .with_special_pattern(r'{"a": "[0-9]{0,5}"}abc', weight=50) \ + .with_special_pattern(r'{"a": "[0-9]{0,5}"}{"b": "B"}', weight=50) + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, json_string_gen) \ + .select(f.from_json(f.col('a'), 'MAP')), + conf=_enable_all_types_conf) + +@allow_non_gpu(*non_utc_allow) +@pytest.mark.parametrize('allow_single_quotes', ['true', 'false']) +@pytest.mark.parametrize('allow_non_numeric_numbers', ['true', 'false']) +@pytest.mark.parametrize('allow_unquoted_chars', ['true', 'false']) +def test_from_json_map_with_options(allow_single_quotes, + allow_non_numeric_numbers, + allow_unquoted_chars): + # Test the input with: + # - Double quotes + # - Single quotes + # - Numbers with leading zeros + # - Non-numeric numbers + # - Unquoted control characters in quoted strings + json_string_gen = StringGen(r'{"a": "[0-9]{0,5}"}') \ + .with_special_pattern(r"""{'a': "[0-9]{0,5}"}""", weight=50) \ + .with_special_pattern(r'{"a": 0[0-9]{0,5}}', weight=50) \ + .with_special_pattern(r'{"a": [+-]?(INF|Infinity|NaN)}', weight=50) \ + .with_special_pattern(r'{"(a|a\r\n\tb)": "(xyz|01\r\n\t23)"}', weight=50) + options = {"allowSingleQuotes": allow_single_quotes, + # Cannot test `allowNumericLeadingZeros==true` because the GPU output always has + # leading zeros while the CPU output does not, thus test will always fail. 
+ "allowNumericLeadingZeros": "false", + "allowNonNumericNumbers": allow_non_numeric_numbers, + "allowUnquotedControlChars": allow_unquoted_chars} + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, json_string_gen, length=20) \ + .select(f.from_json(f.col('a'), 'MAP', options)), + conf=_enable_all_types_conf) + @allow_non_gpu('ProjectExec', 'JsonToStructs') def test_from_json_map_fallback(): # The test here is working around some inconsistencies in how the keys are parsed for maps @@ -694,8 +752,7 @@ def test_from_json_map_fallback(): ]) @allow_non_gpu(*non_utc_allow) def test_from_json_struct(schema): - # note that column 'a' does not use leading zeroes due to https://github.com/NVIDIA/spark-rapids/issues/10534 - json_string_gen = StringGen(r'{\'a\': [1-9]{0,5}, "b": \'[A-Z]{0,5}\', "c": 1\d\d\d}') \ + json_string_gen = StringGen(r'{\'a\': [0-9]{0,5}, "b": \'[A-Z]{0,5}\', "c": 1\d\d\d}') \ .with_special_pattern('', weight=50) \ .with_special_pattern('null', weight=50) assert_gpu_and_cpu_are_equal_collect( @@ -708,8 +765,7 @@ def test_from_json_struct(schema): ]) @allow_non_gpu("ProjectExec") def test_from_json_struct_fallback_dupe_keys(schema): - # note that column 'a' does not use leading zeroes due to https://github.com/NVIDIA/spark-rapids/issues/10534 - json_string_gen = StringGen(r'{\'a\': [1-9]{0,5}, "b": \'[A-Z]{0,5}\', "c": 1\d\d\d}') \ + json_string_gen = StringGen(r'{\'a\': [0-9]{0,5}, "b": \'[A-Z]{0,5}\', "c": 1\d\d\d}') \ .with_special_pattern('', weight=50) \ .with_special_pattern('null', weight=50) assert_gpu_fallback_collect( @@ -938,6 +994,36 @@ def test_from_json_struct_of_list(schema): .select(f.from_json('a', schema)), conf=_enable_all_types_conf) +@allow_non_gpu(*non_utc_allow) +def test_from_json_struct_of_list_with_mismatched_schema(): + json_string_gen = StringGen(r'{"teacher": "[A-Z]{1}[a-z]{2,5}",' \ + r'"student": \["[A-Z]{1}[a-z]{2,5}"\]}') \ + .with_special_pattern('', weight=50) \ + .with_special_pattern('null', weight=50) + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, json_string_gen) \ + .select(f.from_json('a', 'struct>>')), + conf=_enable_all_types_conf) + +@pytest.mark.parametrize('schema', ['struct', + 'struct>>', + 'struct>>']) +@allow_non_gpu(*non_utc_allow) +@pytest.mark.xfail(reason='https://github.com/rapidsai/cudf/issues/17349') +def test_from_json_struct_of_list_with_mixed_nested_types_input(schema): + json_string_gen = StringGen(r'{"teacher": "[A-Z]{1}[a-z]{2,5}",' \ + r'"student": \[{"name": "[A-Z]{1}[a-z]{2,5}", "class": "junior"},' \ + r'{"name": "[A-Z]{1}[a-z]{2,5}", "class": "freshman"}\]}') \ + .with_special_pattern('', weight=50) \ + .with_special_pattern('null', weight=50) \ + .with_special_pattern('{"student": \["[A-Z]{1}[a-z]{2,5}"\]}', weight=100) \ + .with_special_pattern('{"student": \[[1-9]{1,5}\]}', weight=100) \ + .with_special_pattern('{"student": {"[A-Z]{1}[a-z]{2,5}": "[A-Z]{1}[a-z]{2,5}"}}', weight=100) + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, json_string_gen) \ + .select(f.from_json('a', schema)), + conf=_enable_all_types_conf) + @pytest.mark.parametrize('schema', [ 'struct' ]) @@ -1004,7 +1090,7 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name): conf=all_confs) -@pytest.mark.parametrize('data_gen', [byte_gen, +_to_json_datagens=[byte_gen, boolean_gen, short_gen, int_gen, @@ -1024,36 +1110,84 @@ def test_read_case_col_name(spark_tmp_path, v1_enabled_list, col_name): .with_special_case('\\\'a\\\''), 
pytest.param(StringGen('\u001a', nullable=True), marks=pytest.mark.xfail( reason='https://github.com/NVIDIA/spark-rapids/issues/9705')) -], ids=idfn) +] + +@pytest.mark.parametrize('data_gen', _to_json_datagens, ids=idfn) @pytest.mark.parametrize('ignore_null_fields', [True, False]) -@pytest.mark.parametrize('pretty', [ - pytest.param(True, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/9517')), - False -]) @pytest.mark.parametrize('timezone', [ 'UTC', - 'Etc/UTC', - pytest.param('UTC+07:00', marks=pytest.mark.allow_non_gpu('ProjectExec')), + 'Etc/UTC' ]) -@pytest.mark.xfail(condition = is_not_utc(), reason = 'xfail non-UTC time zone tests because of https://github.com/NVIDIA/spark-rapids/issues/9653') -def test_structs_to_json(spark_tmp_path, data_gen, ignore_null_fields, pretty, timezone): +@allow_non_gpu(*non_utc_project_allow) +def test_structs_to_json(spark_tmp_path, data_gen, ignore_null_fields, timezone): struct_gen = StructGen([ ('a', data_gen), ("b", StructGen([('child', data_gen)], nullable=True)), ("c", ArrayGen(StructGen([('child', data_gen)], nullable=True))), - ("d", MapGen(LongGen(nullable=False), data_gen)), ("d", MapGen(StringGen('[A-Za-z0-9]{0,10}', nullable=False), data_gen)), - ("e", ArrayGen(MapGen(LongGen(nullable=False), data_gen), nullable=True)), + ("e", ArrayGen(MapGen(StringGen('[A-Z]{5}', nullable=False), data_gen), nullable=True)), ], nullable=False) gen = StructGen([('my_struct', struct_gen)], nullable=False) options = { 'ignoreNullFields': ignore_null_fields, - 'pretty': pretty, 'timeZone': timezone} def struct_to_json(spark): df = gen_df(spark, gen) - return df.withColumn("my_json", f.to_json("my_struct", options)).drop("my_struct") + return df.select( + f.to_json("my_struct", options).alias("ms")) + + conf = copy_and_update(_enable_all_types_conf, + { 'spark.rapids.sql.expression.StructsToJson': True }) + + assert_gpu_and_cpu_are_equal_collect( + lambda spark : struct_to_json(spark), + conf=conf) + +@pytest.mark.parametrize('data_gen', _to_json_datagens, ids=idfn) +@pytest.mark.parametrize('ignore_null_fields', [True, False]) +@pytest.mark.parametrize('timezone', [ + 'UTC', + 'Etc/UTC' +]) +@allow_non_gpu(*non_utc_project_allow) +def test_arrays_to_json(spark_tmp_path, data_gen, ignore_null_fields, timezone): + array_gen = ArrayGen(data_gen, nullable=True) + gen = StructGen([("my_array", array_gen)], nullable=False) + + options = { 'ignoreNullFields': ignore_null_fields, + 'timeZone': timezone} + + def struct_to_json(spark): + df = gen_df(spark, gen) + return df.select( + f.to_json("my_array", options).alias("ma")) + + conf = copy_and_update(_enable_all_types_conf, + { 'spark.rapids.sql.expression.StructsToJson': True }) + + assert_gpu_and_cpu_are_equal_collect( + lambda spark : struct_to_json(spark), + conf=conf) + +@pytest.mark.parametrize('data_gen', _to_json_datagens, ids=idfn) +@pytest.mark.parametrize('ignore_null_fields', [True, False]) +@pytest.mark.parametrize('timezone', [ + 'UTC', + 'Etc/UTC' +]) +@allow_non_gpu(*non_utc_project_allow) +def test_maps_to_json(spark_tmp_path, data_gen, ignore_null_fields, timezone): + map_gen = MapGen(StringGen('[A-Z]{1,10}', nullable=False), data_gen, nullable=True) + gen = StructGen([("my_map", map_gen)], nullable=False) + + options = { 'ignoreNullFields': ignore_null_fields, + 'timeZone': timezone} + + def struct_to_json(spark): + df = gen_df(spark, gen) + return df.select( + f.to_json("my_map", options).alias("mm")) conf = copy_and_update(_enable_all_types_conf, { 
'spark.rapids.sql.expression.StructsToJson': True }) @@ -1064,16 +1198,13 @@ def struct_to_json(spark): @pytest.mark.parametrize('data_gen', [timestamp_gen], ids=idfn) @pytest.mark.parametrize('timestamp_format', [ - 'yyyy-MM-dd\'T\'HH:mm:ss[.SSS][XXX]', - pytest.param('yyyy-MM-dd\'T\'HH:mm:ss.SSSXXX', marks=pytest.mark.allow_non_gpu('ProjectExec')), - pytest.param('dd/MM/yyyy\'T\'HH:mm:ss[.SSS][XXX]', marks=pytest.mark.allow_non_gpu('ProjectExec')), + 'yyyy-MM-dd\'T\'HH:mm:ss[.SSS][XXX]' ]) @pytest.mark.parametrize('timezone', [ 'UTC', - 'Etc/UTC', - pytest.param('UTC+07:00', marks=pytest.mark.allow_non_gpu('ProjectExec')), + 'Etc/UTC' ]) -@pytest.mark.skipif(is_not_utc(), reason='Duplicated as original test case designed which it is parameterized by timezone. https://github.com/NVIDIA/spark-rapids/issues/9653.') +@allow_non_gpu(*non_utc_project_allow) def test_structs_to_json_timestamp(spark_tmp_path, data_gen, timestamp_format, timezone): struct_gen = StructGen([ ("b", StructGen([('child', data_gen)], nullable=True)), @@ -1202,6 +1333,29 @@ def struct_to_json(spark): conf=conf) +@allow_non_gpu('ProjectExec') +def test_structs_to_json_fallback_pretty(spark_tmp_path): + struct_gen = StructGen([ + ('a', long_gen), + ("b", byte_gen), + ("c", ArrayGen(short_gen)) + ], nullable=False) + gen = StructGen([('my_struct', struct_gen)], nullable=False) + + options = { 'pretty': True } + + def struct_to_json(spark): + df = gen_df(spark, gen) + return df.withColumn("my_json", f.to_json("my_struct", options)).drop("my_struct") + + conf = copy_and_update(_enable_all_types_conf, + { 'spark.rapids.sql.expression.StructsToJson': True }) + + assert_gpu_fallback_collect( + lambda spark : struct_to_json(spark), + 'ProjectExec', + conf=conf) + ##################################################### # Some from_json tests ported over from Apache Spark ##################################################### @@ -1230,7 +1384,6 @@ def test_spark_from_json(): # from_json - input=empty array, schema=struct, output=single row with null # from_json - input=empty object, schema=struct, output=single row with null # SPARK-19543: from_json empty input column -@pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10483') @pytest.mark.parametrize('data', [ [[r'''[]''']], [[r'''{ }''']], @@ -1300,7 +1453,6 @@ def test_spark_from_json_single_item_array_to_struct(): lambda spark : spark.createDataFrame(data, 'json STRING').select(f.col('json'), f.from_json(f.col('json'), schema)), conf =_enable_all_types_conf) -@pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10484') #from_json - input=array, schema=struct, output=single row @allow_non_gpu('ProjectExec') def test_spark_from_json_struct_with_corrupted_row(): @@ -1375,9 +1527,10 @@ def test_spark_from_json_timestamp_format_option_zoneid_but_default_format(zone_ schema = StructType([StructField("t", TimestampType())]) data = [[r'''{"t": "2016-01-01 00:00:00"}'''], [r'''{"t": "2023-07-27 12:21:05"}''']] + conf = copy_and_update(_enable_all_types_conf, {"spark.rapids.sql.json.read.datetime.enabled": "true"}) assert_gpu_and_cpu_are_equal_collect( lambda spark : spark.createDataFrame(data, 'json STRING').select(f.col('json'), f.from_json(f.col('json'), schema, {'timeZone': zone_id})), - conf =_enable_all_types_conf) + conf =conf) # from_json with option (timestampFormat) # no timestamp format appears to actually work @@ -1391,7 +1544,6 @@ def test_spark_from_json_timestamp_format(): conf =_enable_all_types_conf) # from_json missing fields 
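# The hunks above repeatedly layer an opt-in flag for GPU JSON date/time parsing on top of an
# existing conf dict before reading DateType or TimestampType columns. A minimal sketch of that
# pattern; copy_and_update and _enable_all_types_conf are assumed to come from the surrounding
# test modules, and the flag name is the one used in the hunks above.
_datetime_read_conf = copy_and_update(_enable_all_types_conf, {
    "spark.sql.legacy.timeParserPolicy": "CORRECTED",        # match the CPU parser rules
    "spark.rapids.sql.json.read.datetime.enabled": "true"})  # allow date/timestamp reads on the GPU
# The merged dict is then passed as conf= to the assert_gpu_and_cpu_* helpers, as in the tests above.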
-@pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10489') @allow_non_gpu(*non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_spark_from_json_missing_fields_with_cr(): schema = StructType([StructField("a", LongType(), False), StructField("b", StringType(), False), StructField("c", StringType(), False)]) @@ -1432,9 +1584,10 @@ def test_spark_from_json_date_with_format(): data = [["""{"time": "26/08/2015"}"""], ["""{"time": "01/01/2024"}"""]] schema = StructType([StructField("d", DateType())]) + conf = copy_and_update(_enable_all_types_conf, {"spark.rapids.sql.json.read.datetime.enabled": "true"}) assert_gpu_and_cpu_are_equal_collect( lambda spark : spark.createDataFrame(data, 'json STRING').select(f.col('json'), f.from_json(f.col('json'), schema, {'dateFormat': 'dd/MM/yyyy'})), - conf =_enable_all_types_conf) + conf =conf) # TEST from_json missing columns @allow_non_gpu(*non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 @@ -1446,7 +1599,6 @@ def test_spark_from_json_missing_columns(): conf =_enable_all_types_conf) # TEST from_json invalid json -@pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/10483') @allow_non_gpu(*non_utc_allow) # https://github.com/NVIDIA/spark-rapids/issues/10453 def test_spark_from_json_invalid_json(): schema = StructType([StructField("a", IntegerType())]) @@ -1454,3 +1606,11 @@ def test_spark_from_json_invalid_json(): assert_gpu_and_cpu_are_equal_collect( lambda spark : spark.createDataFrame(data, 'json STRING').select(f.col('json'), f.from_json(f.col('json'), schema)), conf =_enable_all_types_conf) + +@allow_non_gpu(*non_utc_allow) +def test_from_json_input_wrapped_in_whitespaces(): + json_string_gen = StringGen(r'[ \r\n\t]{0,5}({"key":( |\r|\n|\t|)"[A-Za-z]{0,5}"}|null|invalid|)[ \r\n\t]{0,5}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, json_string_gen) \ + .select(f.from_json('a', 'struct')), + conf=_enable_all_types_conf) diff --git a/integration_tests/src/main/python/misc_expr_test.py b/integration_tests/src/main/python/misc_expr_test.py index 3251df08d27..0895d451b9d 100644 --- a/integration_tests/src/main/python/misc_expr_test.py +++ b/integration_tests/src/main/python/misc_expr_test.py @@ -19,7 +19,7 @@ from marks import incompat, approximate_float from pyspark.sql.types import * import pyspark.sql.functions as f -from spark_session import is_before_spark_400 +from spark_session import is_databricks_version_or_later, is_spark_400_or_later def test_mono_id(): assert_gpu_and_cpu_are_equal_collect( @@ -34,8 +34,8 @@ def test_part_id(): f.spark_partition_id())) -@pytest.mark.skipif(condition=not is_before_spark_400(), - reason="raise_error() not currently implemented for Spark 4.0. " +@pytest.mark.skipif(condition=is_spark_400_or_later() or is_databricks_version_or_later(14, 3), + reason="raise_error() not currently implemented for Spark 4.0, or Databricks 14.3. " "See https://github.com/NVIDIA/spark-rapids/issues/10107.") def test_raise_error(): data_gen = ShortGen(nullable=False, min_val=0, max_val=20, special_cases=[]) diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py index 789a261b52b..19894d29aa6 100644 --- a/integration_tests/src/main/python/orc_test.py +++ b/integration_tests/src/main/python/orc_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -112,8 +112,11 @@ def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, #E at org.apache.orc.TypeDescription.parseInt(TypeDescription.java:244) #E at org.apache.orc.TypeDescription.parseType(TypeDescription.java:362) # ... +# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and +# https://github.com/rapidsai/cudf/issues/6763 . +# Once the first issue is fixed, add back boolean_gen orc_basic_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, - string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), + string_gen, DateGen(start=date(1590, 1, 1)), orc_timestamp_gen] + decimal_gens orc_basic_struct_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(orc_basic_gens)]) @@ -201,8 +204,11 @@ def test_read_round_trip(spark_tmp_path, orc_gens, read_func, reader_confs, v1_e read_func(data_path), conf=all_confs) +# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and +# https://github.com/rapidsai/cudf/issues/6763 . +# Once the first issue is fixed, add back boolean_gen orc_pred_push_gens = [ - byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, boolean_gen, + byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, # Once https://github.com/NVIDIA/spark-rapids/issues/139 is fixed replace this with # date_gen @@ -277,8 +283,11 @@ def test_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, rea def test_simple_partitioned_read(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed # we should go with a more standard set of generators + # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and + # https://github.com/rapidsai/cudf/issues/6763 . + # Once the first issue is fixed, add back boolean_gen orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, - string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), + string_gen, DateGen(start=date(1590, 1, 1)), orc_timestamp_gen] gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] first_data_path = spark_tmp_path + '/ORC_DATA/key=0/key2=20' @@ -344,8 +353,11 @@ def test_partitioned_read_just_partitions(spark_tmp_path, v1_enabled_list, reade def test_merge_schema_read(spark_tmp_path, v1_enabled_list, reader_confs): # Once https://github.com/NVIDIA/spark-rapids/issues/131 is fixed # we should go with a more standard set of generators + # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and + # https://github.com/rapidsai/cudf/issues/6763 . 
+ # Once the first issue is fixed, add back boolean_gen orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, - string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), + string_gen, DateGen(start=date(1590, 1, 1)), orc_timestamp_gen] first_gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] first_data_path = spark_tmp_path + '/ORC_DATA/key=0' @@ -825,8 +837,11 @@ def test_read_round_trip_for_multithreaded_combining(spark_tmp_path, gens, keep_ @pytest.mark.parametrize('keep_order', [True, pytest.param(False, marks=pytest.mark.ignore_order(local=True))]) @allow_non_gpu(*non_utc_allow_orc_scan) def test_simple_partitioned_read_for_multithreaded_combining(spark_tmp_path, keep_order): + # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and + # https://github.com/rapidsai/cudf/issues/6763 . + # Once the first issue is fixed, add back boolean_gen orc_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, - string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), + string_gen, DateGen(start=date(1590, 1, 1)), orc_timestamp_gen] gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] first_data_path = spark_tmp_path + '/ORC_DATA/key=0/key2=20' @@ -845,12 +860,16 @@ def test_simple_partitioned_read_for_multithreaded_combining(spark_tmp_path, kee assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.read.orc(data_path), conf=all_confs) -@pytest.mark.skipif(is_spark_340_or_later() and (not (is_databricks_runtime() and spark_version() == "3.4.1")), reason="https://github.com/NVIDIA/spark-rapids/issues/8324") + +@pytest.mark.skipif(is_spark_340_or_later() and not is_databricks_runtime(), + reason="https://github.com/NVIDIA/spark-rapids/issues/8324") @pytest.mark.parametrize('data_file', ['fixed-length-char-column-from-hive.orc']) @pytest.mark.parametrize('reader', [read_orc_df, read_orc_sql]) def test_read_hive_fixed_length_char(std_input_path, data_file, reader): """ Test that a file containing CHAR data is readable as STRING. + The plugin behaviour matches all Spark versions prior to 3.4.0, + and Databricks version 13.3 (i.e. 3.4.1) and after. """ assert_gpu_and_cpu_are_equal_collect( reader(std_input_path + '/' + data_file), @@ -858,20 +877,30 @@ def test_read_hive_fixed_length_char(std_input_path, data_file, reader): @allow_non_gpu("ProjectExec") -@pytest.mark.skipif(is_before_spark_340() or (is_databricks_runtime() and spark_version() == "3.4.1"), reason="https://github.com/NVIDIA/spark-rapids/issues/8324") +@pytest.mark.skipif(is_before_spark_340(), + reason="https://github.com/NVIDIA/spark-rapids/issues/8324") +@pytest.mark.skipif(is_databricks_version_or_later(13, 3), + reason="The SELECT * query does not involve ProjectExec " + "on Databricks versions >= 13.3. " + "Can't test Project fallback without ProjectExec.") @pytest.mark.parametrize('data_file', ['fixed-length-char-column-from-hive.orc']) @pytest.mark.parametrize('reader', [read_orc_df, read_orc_sql]) def test_project_fallback_when_reading_hive_fixed_length_char(std_input_path, data_file, reader): """ - Test that a file containing CHAR data is readable as STRING. + Test that reading a file containing fixed-width CHAR data (e.g. CHAR(3)) as a STRING column + causes the ProjectExec to fall back to CPU. Note: This test can be removed when https://github.com/NVIDIA/spark-rapids/issues/8324 is resolved. + + This test does not apply to Databricks >= 13.3, because there would be + no ProjectExec to fall back to CPU. 
""" assert_gpu_fallback_collect( reader(std_input_path + '/' + data_file), cpu_fallback_class_name="ProjectExec", conf={}) + @pytest.mark.parametrize('read_func', [read_orc_df, read_orc_sql]) @pytest.mark.parametrize('v1_enabled_list', ["", "orc"]) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) @@ -913,7 +942,10 @@ def test_orc_column_name_with_dots(spark_tmp_path, reader_confs): ("f.g", int_gen), ("h", string_gen)])), ("i.j", long_gen)])), - ("k", boolean_gen)] + # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and + # https://github.com/rapidsai/cudf/issues/6763 . + # Once the first issue is fixed, add back boolean_gen for column k + ("k", int_gen)] with_cpu_session(lambda spark: gen_df(spark, gens).write.orc(data_path)) assert_gpu_and_cpu_are_equal_collect(lambda spark: reader(spark), conf=all_confs) assert_gpu_and_cpu_are_equal_collect(lambda spark: reader(spark).selectExpr("`a.b`"), conf=all_confs) @@ -931,7 +963,10 @@ def test_orc_with_null_column(spark_tmp_path, reader_confs): def gen_null_df(spark): return spark.createDataFrame( [(None, None, None, None, None)], - "c1 int, c2 long, c3 float, c4 double, c5 boolean") + # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and + # https://github.com/rapidsai/cudf/issues/6763 . + # Once the first issue is fixed, add back boolean_gen + "c1 int, c2 long, c3 float, c4 double, c5 int") assert_gpu_and_cpu_writes_are_equal_collect( lambda spark, path: gen_null_df(spark).write.orc(path), @@ -952,7 +987,10 @@ def test_orc_with_null_column_with_1m_rows(spark_tmp_path, reader_confs): def gen_null_df(spark): return spark.createDataFrame( data, - "c1 int, c2 long, c3 float, c4 double, c5 boolean") + # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and + # https://github.com/rapidsai/cudf/issues/6763 . + # Once the first issue is fixed, add back boolean_gen + "c1 int, c2 long, c3 float, c4 double, c5 int") assert_gpu_and_cpu_writes_are_equal_collect( lambda spark, path: gen_null_df(spark).write.orc(path), lambda spark, path: spark.read.orc(path), diff --git a/integration_tests/src/main/python/orc_write_test.py b/integration_tests/src/main/python/orc_write_test.py index f4928196c82..7e415c79a46 100644 --- a/integration_tests/src/main/python/orc_write_test.py +++ b/integration_tests/src/main/python/orc_write_test.py @@ -15,7 +15,7 @@ import pytest from asserts import assert_gpu_and_cpu_writes_are_equal_collect, assert_gpu_fallback_write -from spark_session import is_before_spark_320, is_before_spark_400, is_spark_321cdh, is_spark_cdh, with_cpu_session, with_gpu_session +from spark_session import is_before_spark_320, is_databricks_version_or_later, is_spark_321cdh, is_spark_400_or_later, is_spark_cdh, with_cpu_session, with_gpu_session from conftest import is_not_utc from datetime import date, datetime, timezone from data_gen import * @@ -24,9 +24,11 @@ from pyspark.sql.types import * pytestmark = pytest.mark.nightly_resource_consuming_test - +# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and +# https://github.com/rapidsai/cudf/issues/6763 . +# Once the first issue is fixed, add back boolean_gen. 
orc_write_basic_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, - string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)), + string_gen, DateGen(start=date(1590, 1, 1)), TimestampGen(start=datetime(1970, 1, 1, tzinfo=timezone.utc)) ] + \ decimal_gens @@ -52,7 +54,8 @@ all_nulls_map_gen, all_empty_map_gen] -orc_write_basic_struct_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(orc_write_basic_gens)]) +orc_write_basic_struct_gen = StructGen( + [['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(orc_write_basic_gens)]) orc_write_struct_gens_sample = [orc_write_basic_struct_gen, StructGen([['child0', byte_gen], ['child1', orc_write_basic_struct_gen]]), @@ -62,15 +65,18 @@ ArrayGen(ArrayGen(short_gen, max_length=10), max_length=10), ArrayGen(ArrayGen(string_gen, max_length=10), max_length=10), ArrayGen(StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]]))] - +# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and +# https://github.com/rapidsai/cudf/issues/6763 . +# Once the first issue is fixed, add back boolean_gen. orc_write_basic_map_gens = [simple_string_to_string_map_gen] + [MapGen(f(nullable=False), f()) for f in [ - BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, + ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, # Using timestamps from 1970 to work around a cudf ORC bug # https://github.com/NVIDIA/spark-rapids/issues/140. lambda nullable=True: TimestampGen(start=datetime(1970, 1, 1, tzinfo=timezone.utc), nullable=nullable), lambda nullable=True: DateGen(start=date(1590, 1, 1), nullable=nullable), lambda nullable=True: DecimalGen(precision=15, scale=1, nullable=nullable), - lambda nullable=True: DecimalGen(precision=36, scale=5, nullable=nullable)]] + lambda nullable=True: DecimalGen(precision=36, scale=5, nullable=nullable)]] + [MapGen( + f(nullable=False), f(nullable=False)) for f in [IntegerGen]] orc_write_gens_list = [orc_write_basic_gens, orc_write_struct_gens_sample, @@ -79,6 +85,7 @@ pytest.param([date_gen], marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/139')), pytest.param([timestamp_gen], marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/140'))] +bool_gen = [BooleanGen(nullable=True), BooleanGen(nullable=False)] @pytest.mark.parametrize('orc_gens', orc_write_gens_list, ids=idfn) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) @allow_non_gpu(*non_utc_allow) @@ -91,6 +98,30 @@ def test_write_round_trip(spark_tmp_path, orc_gens, orc_impl): data_path, conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True}) +@pytest.mark.parametrize('orc_gens', [bool_gen], ids=idfn) +@pytest.mark.parametrize('orc_impl', ["native", "hive"]) +@allow_non_gpu('ExecutedCommandExec', 'DataWritingCommandExec', 'WriteFilesExec') +def test_write_round_trip_bools_only_fallback(spark_tmp_path, orc_gens, orc_impl): + gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] + data_path = spark_tmp_path + '/ORC_DATA' + assert_gpu_and_cpu_writes_are_equal_collect( + lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.orc(path), + lambda spark, path: spark.read.orc(path), + data_path, + conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True}) + +@pytest.mark.parametrize('orc_gens', [bool_gen], ids=idfn) +@pytest.mark.parametrize('orc_impl', ["native", "hive"]) +def 
test_write_round_trip_bools_only_no_fallback(spark_tmp_path, orc_gens, orc_impl): + gen_list = [('_c' + str(i), gen) for i, gen in enumerate(orc_gens)] + data_path = spark_tmp_path + '/ORC_DATA' + assert_gpu_and_cpu_writes_are_equal_collect( + lambda spark, path: gen_df(spark, gen_list).coalesce(1).write.orc(path), + lambda spark, path: spark.read.orc(path), + data_path, + conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True, + 'spark.rapids.sql.format.orc.write.boolType.enabled': True}) + @pytest.mark.parametrize('orc_gen', orc_write_odd_empty_strings_gens_sample, ids=idfn) @pytest.mark.parametrize('orc_impl', ["native", "hive"]) def test_write_round_trip_corner(spark_tmp_path, orc_gen, orc_impl): @@ -103,7 +134,8 @@ def test_write_round_trip_corner(spark_tmp_path, orc_gen, orc_impl): conf={'spark.sql.orc.impl': orc_impl, 'spark.rapids.sql.format.orc.write.enabled': True}) orc_part_write_gens = [ - byte_gen, short_gen, int_gen, long_gen, boolean_gen, + # Add back boolean_gen when https://github.com/rapidsai/cudf/issues/6763 is fixed + byte_gen, short_gen, int_gen, long_gen, # Some file systems have issues with UTF8 strings so to help the test pass even there StringGen('(\\w| ){0,50}'), # Once https://github.com/NVIDIA/spark-rapids/issues/139 is fixed replace this with @@ -345,7 +377,10 @@ def test_orc_write_column_name_with_dots(spark_tmp_path): ("f.g", int_gen), ("h", string_gen)])), ("i.j", long_gen)])), - ("k", boolean_gen)] + # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and + # https://github.com/rapidsai/cudf/issues/6763 . + # Once the first issue is fixed, add back boolean_gen for column k + ("k", int_gen)] assert_gpu_and_cpu_writes_are_equal_collect( lambda spark, path: gen_df(spark, gens).coalesce(1).write.orc(path), lambda spark, path: spark.read.orc(path), @@ -360,8 +395,8 @@ def test_orc_do_not_lowercase_columns(spark_tmp_path): # The wording of the `is not exists` error message in Spark 4.x is unfortunate, but accurate: # https://github.com/apache/spark/blob/4501285a49e4c0429c9cf2c105f044e1c8a93d21/python/pyspark/errors/error-conditions.json#L487 - expected_error_message = "No StructField named acol" if is_before_spark_400() else \ - "Key `acol` is not exists." + expected_error_message = "Key `acol` is not exists." 
if is_spark_400_or_later() or is_databricks_version_or_later(14, 3) \ + else "No StructField named acol" assert_gpu_and_cpu_writes_are_equal_collect( # column is uppercase lambda spark, path: spark.range(0, 1000).select(col("id").alias("Acol")).write.orc(path), diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py index f9236f42076..6aa234003ba 100644 --- a/integration_tests/src/main/python/parquet_test.py +++ b/integration_tests/src/main/python/parquet_test.py @@ -299,12 +299,19 @@ def test_parquet_read_round_trip_binary_as_string(std_input_path, read_func, rea @pytest.mark.parametrize('compress', parquet_compress_options) @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -def test_parquet_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, reader_confs): +@pytest.mark.parametrize('cpu_decompress', [True, False]) +def test_parquet_compress_read_round_trip(spark_tmp_path, compress, v1_enabled_list, reader_confs, cpu_decompress): data_path = spark_tmp_path + '/PARQUET_DATA' with_cpu_session( lambda spark : binary_op_df(spark, long_gen).write.parquet(data_path), conf={'spark.sql.parquet.compression.codec': compress}) all_confs = copy_and_update(reader_confs, {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + if cpu_decompress: + all_confs = copy_and_update(all_confs, { + 'spark.rapids.sql.format.parquet.decompressCpu' : 'true', + 'spark.rapids.sql.format.parquet.decompressCpu.snappy' : 'true', + 'spark.rapids.sql.format.parquet.decompressCpu.zstd' : 'true' + }) assert_gpu_and_cpu_are_equal_collect( lambda spark : spark.read.parquet(data_path), conf=all_confs) @@ -517,6 +524,8 @@ def test_parquet_read_buffer_allocation_empty_blocks(spark_tmp_path, v1_enabled_ lambda spark : spark.read.parquet(data_path).filter("id < 2 or id > 990"), conf=all_confs) + +@disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) @pytest.mark.skipif(is_databricks_runtime(), reason="https://github.com/NVIDIA/spark-rapids/issues/7733") @@ -829,6 +838,8 @@ def test_parquet_read_nano_as_longs_true(std_input_path): 'FileSourceScanExec', conf=conf) + +@disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 def test_many_column_project(): def _create_wide_data_frame(spark, num_cols): schema_dict = {} @@ -1317,27 +1328,64 @@ def test_parquet_read_case_insensitivity(spark_tmp_path): ) -# test read INT32 as INT8/INT16/Date -@pytest.mark.parametrize('reader_confs', reader_opt_confs) -@pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) -def test_parquet_int32_downcast(spark_tmp_path, reader_confs, v1_enabled_list): +def run_test_parquet_int32_downcast(spark_tmp_path, + reader_confs, + v1_enabled_list, + ansi_conf): + """ + This tests whether Parquet files with columns written as INT32 can be + read as having INT8, INT16 and DATE columns, with ANSI mode enabled/disabled. + """ data_path = spark_tmp_path + '/PARQUET_DATA' write_schema = [("d", date_gen), ('s', short_gen), ('b', byte_gen)] + + # For test setup, write with ANSI disabled. + # Otherwise, CAST(d AS INT) will fail on Spark CPU. 
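    # Illustrative note (hedged): ansi_conf only affects the read verification at the
    # end of this helper; the callers below pass it explicitly, e.g.
    #   run_test_parquet_int32_downcast(..., ansi_conf=ansi_disabled_conf)  # ANSI off
    #   run_test_parquet_int32_downcast(..., ansi_conf=ansi_enabled_conf)   # ANSI on
    # while the setup write that follows always runs with ansi_disabled_conf.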
with_cpu_session( lambda spark: gen_df(spark, write_schema).selectExpr( "cast(d as Int) as d", "cast(s as Int) as s", - "cast(b as Int) as b").write.parquet(data_path)) + "cast(b as Int) as b").write.parquet(data_path), conf=ansi_disabled_conf) read_schema = StructType([StructField("d", DateType()), StructField("s", ShortType()), StructField("b", ByteType())]) conf = copy_and_update(reader_confs, - {'spark.sql.sources.useV1SourceList': v1_enabled_list}) + {'spark.sql.sources.useV1SourceList': v1_enabled_list}, + ansi_conf) assert_gpu_and_cpu_are_equal_collect( lambda spark: spark.read.schema(read_schema).parquet(data_path), conf=conf) + +@pytest.mark.parametrize('reader_confs', reader_opt_confs) +@pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) +def test_parquet_int32_downcast_ansi_disabled(spark_tmp_path, reader_confs, v1_enabled_list): + """ + This tests whether Parquet files with columns written as INT32 can be + read as having INT8, INT16 and DATE columns, with ANSI mode disabled. + """ + run_test_parquet_int32_downcast(spark_tmp_path, + reader_confs, + v1_enabled_list, + ansi_disabled_conf) + + +def test_parquet_int32_downcast_ansi_enabled(spark_tmp_path): + """ + This is the flipside of test_parquet_int32_downcast_ansi_disabled. + This tests whether Parquet files with columns written as INT32 can be + read as having INT8, INT16 and DATE columns, now tested with ANSI + enabled. + A limited combination of test parameters is used to test ANSI enabled, + in the interest of brevity. + """ + run_test_parquet_int32_downcast(spark_tmp_path, + reader_confs=native_parquet_file_reader_conf, + v1_enabled_list="", + ansi_conf=ansi_enabled_conf) + + @pytest.mark.parametrize('reader_confs', reader_opt_confs) @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) @pytest.mark.parametrize("types", [("byte", "short"), ("byte", "int"), ("short", "int")], ids=idfn) @@ -1372,6 +1420,10 @@ def test_parquet_nested_column_missing(spark_tmp_path, reader_confs, v1_enabled_ lambda spark: spark.read.schema(read_schema).parquet(data_path), conf=conf) +@pytest.mark.skipif(condition=is_databricks_runtime() and is_databricks_version_or_later(14,3), + reason="https://github.com/NVIDIA/spark-rapids/issues/11512") +@pytest.mark.skipif(condition=is_spark_400_or_later(), + reason="https://github.com/NVIDIA/spark-rapids/issues/11512") def test_parquet_check_schema_compatibility(spark_tmp_path): data_path = spark_tmp_path + '/PARQUET_DATA' gen_list = [('int', int_gen), ('long', long_gen), ('dec32', decimal_gen_32bit)] @@ -1384,13 +1436,6 @@ def test_parquet_check_schema_compatibility(spark_tmp_path): conf={}, error_message='Parquet column cannot be converted') - read_dec32_as_dec64 = StructType( - [StructField('int', IntegerType()), StructField('dec32', DecimalType(15, 10))]) - assert_gpu_and_cpu_error( - lambda spark: spark.read.schema(read_dec32_as_dec64).parquet(data_path).collect(), - conf={}, - error_message='Parquet column cannot be converted') - # For nested types, GPU throws incompatible exception with a different message from CPU.
def test_parquet_check_schema_compatibility_nested_types(spark_tmp_path): @@ -1436,6 +1481,75 @@ def test_parquet_check_schema_compatibility_nested_types(spark_tmp_path): lambda spark: spark.read.schema(read_map_str_str_as_str_int).parquet(data_path).collect()), error_message='Parquet column cannot be converted') + +@pytest.mark.parametrize('from_decimal_gen, to_decimal_gen', [ + # Widening precision and scale by the same amount + (DecimalGen(5, 2), DecimalGen(7, 4)), + (DecimalGen(5, 2), DecimalGen(10, 7)), + (DecimalGen(5, 2), DecimalGen(20, 17)), + (DecimalGen(10, 2), DecimalGen(12, 4)), + (DecimalGen(10, 2), DecimalGen(20, 12)), + (DecimalGen(20, 2), DecimalGen(22, 4)), + # Increasing precision by larger amount than scale + (DecimalGen(5, 2), DecimalGen(6, 3)), + (DecimalGen(5, 2), DecimalGen(12, 5)), + (DecimalGen(5, 2), DecimalGen(22, 10)), + # Narrowing precision and scale + (DecimalGen(7, 4), DecimalGen(5, 2)), + (DecimalGen(10, 7), DecimalGen(5, 2)), + (DecimalGen(20, 17), DecimalGen(5, 2)), + # Increasing precision and decreasing scale + (DecimalGen(5, 4), DecimalGen(7, 2)), + (DecimalGen(10, 6), DecimalGen(12, 4)), + (DecimalGen(20, 7), DecimalGen(22, 5)), + # Increasing precision by a smaller amount than scale + (DecimalGen(5, 2), DecimalGen(6, 4)), + (DecimalGen(10, 4), DecimalGen(12, 7)) +], ids=idfn) +def test_parquet_decimal_precision_scale_change(spark_tmp_path, from_decimal_gen, to_decimal_gen): + """Test decimal precision and scale changes when reading Parquet files with RAPIDS acceleration.""" + data_path = f"{spark_tmp_path}/PARQUET_DECIMAL_DATA" + + # Write test data with CPU + with_cpu_session( + lambda spark: unary_op_df(spark, from_decimal_gen) + .coalesce(1) + .write.parquet(data_path) + ) + + # Create target schema for reading + read_schema = StructType([ + StructField("a", to_decimal_gen.data_type) + ]) + + # Determine if we expect an error based on precision and scale changes + expect_error = ( + to_decimal_gen.scale < from_decimal_gen.scale or + (to_decimal_gen.precision - to_decimal_gen.scale) < + (from_decimal_gen.precision - from_decimal_gen.scale) + ) + + spark_conf = {} + if is_before_spark_400(): + # In Spark versions earlier than 4.0, the vectorized Parquet reader throws an exception + # if the read scale differs from the write scale. We disable the vectorized reader, + # forcing Spark to use the non-vectorized path for CPU case. This configuration + # is ignored by the plugin. 
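        # Worked example of the expect_error rule above (illustration only): reading
        # DECIMAL(5,2) as DECIMAL(7,4) keeps 3 integer digits (7-4 == 5-2) and only
        # widens the scale, so it should succeed; reading DECIMAL(7,4) as DECIMAL(5,2)
        # narrows the scale (2 < 4), so 'Parquet column cannot be converted' is expected.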
+ spark_conf['spark.sql.parquet.enableVectorizedReader'] = 'false' + + if expect_error: + assert_gpu_and_cpu_error( + lambda spark: spark.read.schema(read_schema).parquet(data_path).collect(), + conf={}, + error_message="Parquet column cannot be converted" + ) + else: + assert_gpu_and_cpu_are_equal_collect( + lambda spark: spark.read.schema(read_schema).parquet(data_path), + conf=spark_conf + ) + + @pytest.mark.skipif(is_before_spark_320() or is_spark_321cdh(), reason='Encryption is not supported before Spark 3.2.0 or Parquet < 1.12') @pytest.mark.skipif(os.environ.get('INCLUDE_PARQUET_HADOOP_TEST_JAR', 'false') == 'false', reason='INCLUDE_PARQUET_HADOOP_TEST_JAR is disabled') @pytest.mark.parametrize('v1_enabled_list', ["", "parquet"]) @@ -1463,13 +1577,16 @@ def test_parquet_read_encryption(spark_tmp_path, reader_confs, v1_enabled_list): assert_spark_exception( lambda: with_gpu_session( lambda spark: spark.read.parquet(data_path).collect()), - error_message='Could not read footer for file') + error_message='Could not read footer') # Common message fragment between all Spark versions. + # Note that this isn't thrown explicitly by the plugin. assert_spark_exception( lambda: with_gpu_session( lambda spark: spark.read.parquet(data_path).collect(), conf=conf), error_message='The GPU does not support reading encrypted Parquet files') + +@disable_ansi_mode # https://github.com/NVIDIA/spark-rapids/issues/5114 def test_parquet_read_count(spark_tmp_path): parquet_gens = [int_gen, string_gen, double_gen] gen_list = [('_c' + str(i), gen) for i, gen in enumerate(parquet_gens)] diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index 805a0b8137c..775b4a9d1cb 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -29,6 +29,11 @@ pytestmark = pytest.mark.nightly_resource_consuming_test +conf_key_parquet_datetimeRebaseModeInWrite = 'spark.sql.parquet.datetimeRebaseModeInWrite' +conf_key_parquet_int96RebaseModeInWrite = 'spark.sql.parquet.int96RebaseModeInWrite' +conf_key_parquet_datetimeRebaseModeInRead = 'spark.sql.parquet.datetimeRebaseModeInRead' +conf_key_parquet_int96RebaseModeInRead = 'spark.sql.parquet.int96RebaseModeInRead' + # test with original parquet file reader, the multi-file parallel reader for cloud, and coalesce file reader for # non-cloud original_parquet_file_reader_conf={'spark.rapids.sql.format.parquet.reader.type': 'PERFILE'} @@ -37,8 +42,8 @@ reader_opt_confs = [original_parquet_file_reader_conf, multithreaded_parquet_file_reader_conf, coalesce_parquet_file_reader_conf] parquet_decimal_struct_gen= StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(decimal_gens)]) -writer_confs={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': 'CORRECTED', - 'spark.sql.legacy.parquet.int96RebaseModeInWrite': 'CORRECTED'} +writer_confs={conf_key_parquet_datetimeRebaseModeInWrite: 'CORRECTED', + conf_key_parquet_int96RebaseModeInWrite: 'CORRECTED'} parquet_basic_gen =[byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, date_gen, TimestampGen(), binary_gen] @@ -158,8 +163,8 @@ def test_write_ts_millis(spark_tmp_path, ts_type, ts_rebase): lambda spark, path: unary_op_df(spark, gen).write.parquet(path), lambda spark, path: spark.read.parquet(path), data_path, - conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase, - 'spark.sql.legacy.parquet.int96RebaseModeInWrite': ts_rebase, 
+ conf={conf_key_parquet_datetimeRebaseModeInWrite: ts_rebase, + conf_key_parquet_int96RebaseModeInWrite: ts_rebase, 'spark.sql.parquet.outputTimestampType': ts_type}) @@ -285,8 +290,8 @@ def test_write_sql_save_table(spark_tmp_path, parquet_gens, spark_tmp_table_fact def writeParquetUpgradeCatchException(spark, df, data_path, spark_tmp_table_factory, int96_rebase, datetime_rebase, ts_write): spark.conf.set('spark.sql.parquet.outputTimestampType', ts_write) - spark.conf.set('spark.sql.legacy.parquet.datetimeRebaseModeInWrite', datetime_rebase) - spark.conf.set('spark.sql.legacy.parquet.int96RebaseModeInWrite', int96_rebase) # for spark 310 + spark.conf.set(conf_key_parquet_datetimeRebaseModeInWrite, datetime_rebase) + spark.conf.set(conf_key_parquet_int96RebaseModeInWrite, int96_rebase) # for spark 310 with pytest.raises(Exception) as e_info: df.coalesce(1).write.format("parquet").mode('overwrite').option("path", data_path).saveAsTable(spark_tmp_table_factory.get()) assert e_info.match(r".*SparkUpgradeException.*") @@ -544,8 +549,8 @@ def generate_map_with_empty_validity(spark, path): def test_parquet_write_fails_legacy_datetime(spark_tmp_path, data_gen, ts_write, ts_rebase_write): data_path = spark_tmp_path + '/PARQUET_DATA' all_confs = {'spark.sql.parquet.outputTimestampType': ts_write, - 'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase_write, - 'spark.sql.legacy.parquet.int96RebaseModeInWrite': ts_rebase_write} + conf_key_parquet_datetimeRebaseModeInWrite: ts_rebase_write, + conf_key_parquet_int96RebaseModeInWrite: ts_rebase_write} def writeParquetCatchException(spark, data_gen, data_path): with pytest.raises(Exception) as e_info: unary_op_df(spark, data_gen).coalesce(1).write.parquet(data_path) @@ -563,12 +568,12 @@ def test_parquet_write_roundtrip_datetime_with_legacy_rebase(spark_tmp_path, dat ts_rebase_write, ts_rebase_read): data_path = spark_tmp_path + '/PARQUET_DATA' all_confs = {'spark.sql.parquet.outputTimestampType': ts_write, - 'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase_write[0], - 'spark.sql.legacy.parquet.int96RebaseModeInWrite': ts_rebase_write[1], + conf_key_parquet_datetimeRebaseModeInWrite: ts_rebase_write[0], + conf_key_parquet_int96RebaseModeInWrite: ts_rebase_write[1], # The rebase modes in read configs should be ignored and overridden by the same # modes in write configs, which are retrieved from the written files. 
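                 # For illustration (hedged, mirroring the comment above): even if
                 # ts_rebase_write is ('LEGACY', 'LEGACY') and ts_rebase_read is
                 # ('CORRECTED', 'CORRECTED'), the round trip below should still match
                 # the CPU, because the modes recorded at write time take precedence
                 # when the files are read back.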
- 'spark.sql.legacy.parquet.datetimeRebaseModeInRead': ts_rebase_read[0], - 'spark.sql.legacy.parquet.int96RebaseModeInRead': ts_rebase_read[1]} + conf_key_parquet_datetimeRebaseModeInRead: ts_rebase_read[0], + conf_key_parquet_int96RebaseModeInRead: ts_rebase_read[1]} assert_gpu_and_cpu_writes_are_equal_collect( lambda spark, path: unary_op_df(spark, data_gen).coalesce(1).write.parquet(path), lambda spark, path: spark.read.parquet(path), @@ -597,7 +602,8 @@ def test_it(spark): spark.sql("CREATE TABLE {} LOCATION '{}/ctas' AS SELECT * FROM {}".format( ctas_with_existing_name, data_path, src_name)) except pyspark.sql.utils.AnalysisException as e: - if allow_non_empty or e.desc.find('non-empty directory') == -1: + description = e._desc if (is_spark_400_or_later() or is_databricks_version_or_later(14, 3)) else e.desc + if allow_non_empty or description.find('non-empty directory') == -1: raise e with_gpu_session(test_it, conf) @@ -825,8 +831,8 @@ def write_partitions(spark, table_path): ) def hive_timestamp_value(spark_tmp_table_factory, spark_tmp_path, ts_rebase, func): - conf={'spark.sql.legacy.parquet.datetimeRebaseModeInWrite': ts_rebase, - 'spark.sql.legacy.parquet.int96RebaseModeInWrite': ts_rebase} + conf={conf_key_parquet_datetimeRebaseModeInWrite: ts_rebase, + conf_key_parquet_int96RebaseModeInWrite: ts_rebase} def create_table(spark, path): tmp_table = spark_tmp_table_factory.get() diff --git a/integration_tests/src/main/python/repart_test.py b/integration_tests/src/main/python/repart_test.py index 7f299373ff6..19759b77f5d 100644 --- a/integration_tests/src/main/python/repart_test.py +++ b/integration_tests/src/main/python/repart_test.py @@ -57,6 +57,8 @@ struct_of_maps = StructGen([['child0', BooleanGen()]] + [ ['child%d' % (i + 1), gen] for i, gen in enumerate(map_gens)]) +kudo_enabled_conf_key = "spark.rapids.shuffle.kudo.serializer.enabled" + @pytest.mark.parametrize('data_gen', [pytest.param((StructGen([['child0', DecimalGen(7, 2)]]), StructGen([['child1', IntegerGen()]]))), # left_struct(child0 = 4 level nested struct, child1 = Int) @@ -78,11 +80,13 @@ StructGen([['child1', MapGen(BooleanGen(nullable=False), boolean_gen)]], nullable=False))], ids=idfn) # This tests the union of DF of structs with different types of cols as long as the struct itself # isn't null. This is a limitation in cudf because we don't support nested types as literals -def test_union_struct_missing_children(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_union_struct_missing_children(data_gen, kudo_enabled): left_gen, right_gen = data_gen assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, left_gen).unionByName(binary_op_df( - spark, right_gen), True)) + spark, right_gen), True), + conf = {kudo_enabled_conf_key: kudo_enabled}) @pytest.mark.parametrize('data_gen', all_gen + map_gens + array_gens_sample + [all_basic_struct_gen, @@ -90,9 +94,11 @@ def test_union_struct_missing_children(data_gen): nested_struct, struct_of_maps], ids=idfn) # This tests union of two DFs of two cols each. 
The types of the left col and right col is the same -def test_union(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_union(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( - lambda spark : binary_op_df(spark, data_gen).union(binary_op_df(spark, data_gen))) + lambda spark : binary_op_df(spark, data_gen).union(binary_op_df(spark, data_gen)), + conf = {kudo_enabled_conf_key: kudo_enabled}) @pytest.mark.parametrize('data_gen', all_gen + map_gens + array_gens_sample + [all_basic_struct_gen, @@ -100,9 +106,11 @@ def test_union(data_gen): nested_struct, struct_of_maps], ids=idfn) # This tests union of two DFs of two cols each. The types of the left col and right col is the same -def test_unionAll(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_unionAll(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( - lambda spark : binary_op_df(spark, data_gen).unionAll(binary_op_df(spark, data_gen))) + lambda spark : binary_op_df(spark, data_gen).unionAll(binary_op_df(spark, data_gen)), + conf = {kudo_enabled_conf_key: kudo_enabled}) @pytest.mark.parametrize('data_gen', all_gen + map_gens + array_gens_sample + [all_basic_struct_gen, @@ -114,10 +122,13 @@ def test_unionAll(data_gen): struct_of_maps], ids=idfn) # This tests the union of two DFs of structs with missing child column names. The missing child # column will be replaced by nulls in the output DF. This is a feature added in 3.1+ -def test_union_by_missing_col_name(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_union_by_missing_col_name(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( lambda spark : binary_op_df(spark, data_gen).withColumnRenamed("a", "x") - .unionByName(binary_op_df(spark, data_gen).withColumnRenamed("a", "y"), True)) + .unionByName(binary_op_df(spark, data_gen).withColumnRenamed("a", + "y"), True), + conf = {kudo_enabled_conf_key: kudo_enabled}) # the first number ('1' and '2') is the nest level @@ -133,7 +144,8 @@ def test_union_by_missing_col_name(data_gen): nest_1_one, nest_1_two, nest_2_one, nest_2_two]) @pytest.mark.skipif(is_before_spark_330(), reason="This is supported only in Spark 3.3.0+") -def test_union_by_missing_field_name_in_arrays_structs(gen_pair): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_union_by_missing_field_name_in_arrays_structs(gen_pair, kudo_enabled): """ This tests the union of two DFs of arrays of structs with missing field names. The missing field will be replaced be nulls in the output DF. 
This is a feature added in 3.3+ @@ -142,8 +154,8 @@ def test_union_by_missing_field_name_in_arrays_structs(gen_pair): """ def assert_union_equal(gen1, gen2): assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen1).unionByName(unary_op_df(spark, gen2), True) - ) + lambda spark: unary_op_df(spark, gen1).unionByName(unary_op_df(spark, gen2), True), + conf = {kudo_enabled_conf_key: kudo_enabled}) assert_union_equal(gen_pair[0], gen_pair[1]) assert_union_equal(gen_pair[1], gen_pair[0]) @@ -155,9 +167,12 @@ def assert_union_equal(gen1, gen2): StructGen([['child0', DecimalGen(7, 2)]]), nested_struct, struct_of_maps], ids=idfn) -def test_union_by_name(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_union_by_name(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( - lambda spark : binary_op_df(spark, data_gen).unionByName(binary_op_df(spark, data_gen))) + lambda spark : binary_op_df(spark, data_gen).unionByName(binary_op_df(spark, + data_gen)), + conf = {kudo_enabled_conf_key: kudo_enabled}) @pytest.mark.parametrize('data_gen', [ @@ -166,19 +181,23 @@ def test_union_by_name(data_gen): pytest.param([('array' + str(i), gen) for i, gen in enumerate(array_gens_sample + [ArrayGen(BinaryGen(max_length=5), max_length=5)])]), pytest.param([('map' + str(i), gen) for i, gen in enumerate(map_gens_sample)]), ], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_coalesce_types(data_gen): +def test_coalesce_types(data_gen, kudo_enabled): assert_gpu_and_cpu_are_equal_collect( - lambda spark: gen_df(spark, data_gen).coalesce(2)) + lambda spark: gen_df(spark, data_gen).coalesce(2), + conf = {kudo_enabled_conf_key: kudo_enabled}) @pytest.mark.parametrize('num_parts', [1, 10, 100, 1000, 2000], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_coalesce_df(num_parts, length): +def test_coalesce_df(num_parts, length, kudo_enabled): #This should change eventually to be more than just the basic gens gen_list = [('_c' + str(i), gen) for i, gen in enumerate(all_basic_gens + decimal_gens + [binary_gen])] assert_gpu_and_cpu_are_equal_collect( - lambda spark : gen_df(spark, gen_list, length=length).coalesce(num_parts)) + lambda spark : gen_df(spark, gen_list, length=length).coalesce(num_parts), + conf = {kudo_enabled_conf_key: kudo_enabled}) @pytest.mark.parametrize('data_gen', [ pytest.param([('_c' + str(i), gen) for i, gen in enumerate(all_basic_gens + decimal_gens + [binary_gen])]), @@ -188,15 +207,17 @@ def test_coalesce_df(num_parts, length): ], ids=idfn) @pytest.mark.parametrize('num_parts', [1, 10, 2345], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. 
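# Hedged note on the kudo parametrization used throughout this file: each test simply
# layers the serializer toggle onto whatever conf it already needed, e.g.
#   conf = copy_and_update(base_conf, {kudo_enabled_conf_key: kudo_enabled})
# (base_conf is a stand-in for the test's existing settings; the simplest tests just
# pass {kudo_enabled_conf_key: kudo_enabled} directly.)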
@allow_non_gpu(*non_utc_allow) -def test_repartition_df(data_gen, num_parts, length): +def test_repartition_df(data_gen, num_parts, length, kudo_enabled): from pyspark.sql.functions import lit assert_gpu_and_cpu_are_equal_collect( # Add a computed column to avoid shuffle being optimized back to a CPU shuffle lambda spark : gen_df(spark, data_gen, length=length).withColumn('x', lit(1)).repartition(num_parts), # disable sort before shuffle so round robin works for maps - conf = {'spark.sql.execution.sortBeforeRepartition': 'false'}) + conf = {'spark.sql.execution.sortBeforeRepartition': 'false', + kudo_enabled_conf_key: kudo_enabled}) @pytest.mark.parametrize('data_gen', [ pytest.param([('_c' + str(i), gen) for i, gen in enumerate(all_basic_gens + decimal_gens)]), @@ -205,45 +226,53 @@ def test_repartition_df(data_gen, num_parts, length): ], ids=idfn) @pytest.mark.parametrize('num_parts', [1, 10, 2345], ids=idfn) @pytest.mark.parametrize('length', [0, 2048, 4096], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. @allow_non_gpu(*non_utc_allow) -def test_repartition_df_for_round_robin(data_gen, num_parts, length): +def test_repartition_df_for_round_robin(data_gen, num_parts, length, kudo_enabled): from pyspark.sql.functions import lit assert_gpu_and_cpu_are_equal_collect( # Add a computed column to avoid shuffle being optimized back to a CPU shuffle lambda spark : gen_df(spark, data_gen, length=length).withColumn('x', lit(1)).repartition(num_parts), # Enable sort for round robin partition - conf = {'spark.sql.execution.sortBeforeRepartition': 'true'}) # default is true + conf = {'spark.sql.execution.sortBeforeRepartition': 'true', + kudo_enabled_conf_key: kudo_enabled}) # default is true @allow_non_gpu('ShuffleExchangeExec', 'RoundRobinPartitioning') @pytest.mark.parametrize('data_gen', [[('a', simple_string_to_string_map_gen)]], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. -def test_round_robin_sort_fallback(data_gen): +def test_round_robin_sort_fallback(data_gen, kudo_enabled): from pyspark.sql.functions import lit assert_gpu_fallback_collect( # Add a computed column to avoid shuffle being optimized back to a CPU shuffle like in test_repartition_df lambda spark : gen_df(spark, data_gen).withColumn('extra', lit(1)).repartition(13), - 'ShuffleExchangeExec') + 'ShuffleExchangeExec', + conf = {kudo_enabled_conf_key: kudo_enabled}) @allow_non_gpu("ProjectExec", "ShuffleExchangeExec") @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. 
@pytest.mark.parametrize('num_parts', [2, 10, 17, 19, 32], ids=idfn) @pytest.mark.parametrize('gen', [([('ag', ArrayGen(StructGen([('b1', long_gen)])))], ['ag'])], ids=idfn) -def test_hash_repartition_exact_fallback(gen, num_parts): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_repartition_exact_fallback(gen, num_parts, kudo_enabled): data_gen = gen[0] part_on = gen[1] assert_gpu_fallback_collect( lambda spark : gen_df(spark, data_gen, length=1024) \ .repartition(num_parts, *part_on) \ .withColumn('id', f.spark_partition_id()) \ - .selectExpr('*'), "ShuffleExchangeExec") + .selectExpr('*'), "ShuffleExchangeExec", + conf = {kudo_enabled_conf_key: kudo_enabled}) @allow_non_gpu("ProjectExec") @pytest.mark.parametrize('data_gen', [ArrayGen(StructGen([('b1', long_gen)]))], ids=idfn) -def test_hash_fallback(data_gen): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_hash_fallback(data_gen, kudo_enabled): assert_gpu_fallback_collect( lambda spark : unary_op_df(spark, data_gen, length=1024) \ - .selectExpr('*', 'hash(a) as h'), "ProjectExec") + .selectExpr('*', 'hash(a) as h'), "ProjectExec", + conf = {kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. @pytest.mark.parametrize('num_parts', [1, 2, 10, 17, 19, 32], ids=idfn) @@ -279,8 +308,9 @@ def test_hash_fallback(data_gen): ([('a', decimal_gen_64bit), ('b', decimal_gen_64bit), ('c', decimal_gen_64bit)], ['a', 'b', 'c']), ([('a', decimal_gen_128bit), ('b', decimal_gen_128bit), ('c', decimal_gen_128bit)], ['a', 'b', 'c']), ], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_hash_repartition_exact(gen, num_parts): +def test_hash_repartition_exact(gen, num_parts, kudo_enabled): data_gen = gen[0] part_on = gen[1] assert_gpu_and_cpu_are_equal_collect( @@ -288,7 +318,8 @@ def test_hash_repartition_exact(gen, num_parts): .repartition(num_parts, *part_on)\ .withColumn('id', f.spark_partition_id())\ .withColumn('hashed', f.hash(*part_on))\ - .selectExpr('*', 'pmod(hashed, {})'.format(num_parts))) + .selectExpr('*', 'pmod(hashed, {})'.format(num_parts)), + conf = {kudo_enabled_conf_key: kudo_enabled}) @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. @@ -311,18 +342,19 @@ def test_hash_repartition_exact_longs_no_overflow(num_parts, is_ansi_mode): @ignore_order(local=True) # To avoid extra data shuffle by 'sort on Spark' for this repartition test. 
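# Illustrative arithmetic for the rewritten overflow test below (not a new assertion):
# overflowVal = (1 << 63) - maxVal, so for the row holding maxVal the sum
# maxVal + overflowVal == 2**63, one past Long.MaxValue (2**63 - 1); e.g. with
# maxVal = 1000, overflowVal = 9223372036854774808 and the addition must overflow
# under ANSI mode, which is exactly the error the test expects.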
@pytest.mark.parametrize('num_parts', [17], ids=idfn) +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) @allow_non_gpu(*non_utc_allow) -def test_hash_repartition_long_overflow_ansi_exception(num_parts): - data_gen = [('a', long_gen)] - part_on = [f.col('a') + 15] - conf = ansi_enabled_conf +def test_hash_repartition_long_overflow_ansi_exception(num_parts, kudo_enabled): + conf = copy_and_update(ansi_enabled_conf, {kudo_enabled_conf_key: kudo_enabled}) def test_function(spark): - return gen_df(spark, data_gen, length=1024) \ - .withColumn('plus15', f.col('a') + 15) \ - .repartition(num_parts, f.col('plus15')) \ + df = gen_df(spark, [('a', long_gen)], length=1024) + maxVal = df.selectExpr("max(a) as m").head()['m'] + overflowVal = (1 << 63) - maxVal + return df.withColumn('plus', f.col('a') + overflowVal) \ + .repartition(num_parts, f.col('plus')) \ .withColumn('id', f.spark_partition_id()) \ - .withColumn('hashed', f.hash(*part_on)) \ + .withColumn('hashed', f.hash(f.col('a') + overflowVal)) \ .selectExpr('*', 'pmod(hashed, {})'.format(num_parts)) assert_gpu_and_cpu_error( @@ -332,11 +364,13 @@ def test_function(spark): # Test a query that should cause Spark to leverage getShuffleRDD @ignore_order(local=True) -def test_union_with_filter(): +@pytest.mark.parametrize("kudo_enabled", ["true", "false"], ids=idfn) +def test_union_with_filter(kudo_enabled): def doit(spark): dfa = spark.range(1, 100).withColumn("id2", f.col("id")) dfb = dfa.groupBy("id").agg(f.size(f.collect_set("id2")).alias("idc")) dfc = dfb.filter(f.col("idc") == 1).select("id") return dfc.union(dfc) - conf = { "spark.sql.adaptive.enabled": "true" } + conf = { "spark.sql.adaptive.enabled": "true", + kudo_enabled_conf_key: kudo_enabled} assert_gpu_and_cpu_are_equal_collect(doit, conf) diff --git a/integration_tests/src/main/python/schema_evolution_test.py b/integration_tests/src/main/python/schema_evolution_test.py index ff501324cc0..57af4a1126e 100644 --- a/integration_tests/src/main/python/schema_evolution_test.py +++ b/integration_tests/src/main/python/schema_evolution_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -34,7 +34,9 @@ # List of additional column data generators to use when adding columns _additional_gens = [ - boolean_gen, + # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and + # https://github.com/rapidsai/cudf/issues/6763 . + # Once the first issue is fixed, add back boolean_gen byte_gen, short_gen, int_gen, @@ -49,7 +51,10 @@ # simple_string_to_string_map_gen), ArrayGen(_custom_date_gen), struct_gen_decimal128, - StructGen([("c0", ArrayGen(long_gen)), ("c1", boolean_gen)]), + # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and + # https://github.com/rapidsai/cudf/issues/6763 . 
+ # Once the first issue is fixed, switch c1 back from int_gen to boolean_gen + StructGen([("c0", ArrayGen(long_gen)), ("c1", int_gen)]), ] def get_additional_columns(): diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 7099436f608..03c022bb201 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -23,7 +23,7 @@ from pyspark.sql.types import * import pyspark.sql.utils import pyspark.sql.functions as f -from spark_session import with_cpu_session, with_gpu_session, is_databricks104_or_later, is_before_spark_320, is_before_spark_400 +from spark_session import with_cpu_session, with_gpu_session, is_databricks104_or_later, is_databricks_version_or_later, is_before_spark_320, is_spark_400_or_later _regexp_conf = { 'spark.rapids.sql.regexp.enabled': 'true' } @@ -104,10 +104,6 @@ def test_substring_index(data_gen,delim): @allow_non_gpu('ProjectExec') -@pytest.mark.skipif(condition=not is_before_spark_400(), - reason="Bug in Apache Spark 4.0 causes NumberFormatExceptions from substring_index(), " - "if called with index==null. For further information, see: " - "https://issues.apache.org/jira/browse/SPARK-48989.") @pytest.mark.parametrize('data_gen', [mk_str_gen('([ABC]{0,3}_?){0,7}')], ids=idfn) def test_unsupported_fallback_substring_index(data_gen): delim_gen = StringGen(pattern="_") @@ -327,6 +323,10 @@ def test_rtrim(data_gen): 'TRIM(TRAILING NULL FROM a)', 'TRIM(TRAILING "" FROM a)')) +@pytest.mark.skipif(condition=is_spark_400_or_later() or is_databricks_version_or_later(14, 3), + reason="startsWith(None)/endswith(None) seems to cause an NPE in Column.fn() on Apache Spark 4.0, " + "and Databricks 14.3. " + "See https://issues.apache.org/jira/browse/SPARK-48995.") def test_startswith(): gen = mk_str_gen('[Ab\ud720]{3}A.{0,3}Z[Ab\ud720]{3}') assert_gpu_and_cpu_are_equal_collect( @@ -351,8 +351,9 @@ def assert_gpu_did_fallback(op): assert_gpu_did_fallback(f.col("a").startswith(f.col("a"))) -@pytest.mark.skipif(condition=not is_before_spark_400(), - reason="endswith(None) seems to cause an NPE in Column.fn() on Apache Spark 4.0. " +@pytest.mark.skipif(condition=is_spark_400_or_later() or is_databricks_version_or_later(14, 3), + reason="startsWith(None)/endswith(None) seems to cause an NPE in Column.fn() on Apache Spark 4.0, " + "and Databricks 14.3. "
"See https://issues.apache.org/jira/browse/SPARK-48995.") def test_endswith(): gen = mk_str_gen('[Ab\ud720]{3}A.{0,3}Z[Ab\ud720]{3}') diff --git a/integration_tests/src/main/python/url_test.py b/integration_tests/src/main/python/url_test.py index 9d601c72675..e1bf9c821a8 100644 --- a/integration_tests/src/main/python/url_test.py +++ b/integration_tests/src/main/python/url_test.py @@ -148,7 +148,7 @@ url_gen = StringGen(url_pattern) -supported_parts = ['PROTOCOL', 'HOST', 'QUERY', 'PATH'] +supported_parts = ['PROTOCOL', 'HOST', 'QUERY', 'PATH', 'invalid', 'path'] unsupported_parts = ['REF', 'FILE', 'AUTHORITY', 'USERINFO'] @pytest.mark.parametrize('data_gen', [url_gen, edge_cases_gen], ids=idfn) diff --git a/integration_tests/src/test/resources/int_struct_formatted.json b/integration_tests/src/test/resources/int_struct_formatted.json index e3ac75fbf14..34e9557b461 100644 --- a/integration_tests/src/test/resources/int_struct_formatted.json +++ b/integration_tests/src/test/resources/int_struct_formatted.json @@ -2,4 +2,3 @@ {"data": {"A": 1}} {"data": {"B": 50}} {"data": {"B": -128, "A": 127}} -{"data": {"B": 99999999999999999999, "A": -9999999999999999999}} diff --git a/integration_tests/src/test/resources/int_struct_formatted_problematic_rows.json b/integration_tests/src/test/resources/int_struct_formatted_problematic_rows.json new file mode 100644 index 00000000000..c51e69b5297 --- /dev/null +++ b/integration_tests/src/test/resources/int_struct_formatted_problematic_rows.json @@ -0,0 +1,2 @@ +{"data": {"B": 99999999999999999999, "A": -9999999999999999999}} +{"data": {"A": 0, "B": "0"}} diff --git a/integration_tests/src/test/scala/com/nvidia/spark/rapids/functionsSuite.scala b/integration_tests/src/test/scala/com/nvidia/spark/rapids/functionsSuite.scala new file mode 100644 index 00000000000..04e2f8a48f1 --- /dev/null +++ b/integration_tests/src/test/scala/com/nvidia/spark/rapids/functionsSuite.scala @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark.rapids + +import com.nvidia.spark.functions._ + +import org.apache.spark.sql.{Column, Row} +import org.apache.spark.sql.api.java._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.nvidia.SparkTestBase +import org.apache.spark.sql.types._ + +class functionsSuite extends SparkTestBase { + test("basic 0 arg df_udf") { + val zero = df_udf(() => lit(0)) + withSparkSession{ spark => + spark.udf.register("zero", zero) + assertSame(Array( + Row(0L, 0), + Row(1L, 0)), + spark.range(2).selectExpr("id", "zero()").collect()) + assertSame(Array( + Row(0L, 0), + Row(1L, 0)), + spark.range(2).select(col("id"), zero()).collect()) + } + } + + test("basic 1 arg df_udf") { + val inc = df_udf((input: Column) => input + 1) + withSparkSession { spark => + spark.udf.register("inc", inc) + assertSame(Array( + Row(0L, 1L), + Row(1L, 2L)), + spark.range(2).selectExpr("id", "inc(id)").collect()) + assertSame(Array( + Row(0L, 1L), + Row(1L, 2L)), + spark.range(2).select(col("id"), inc(col("id"))).collect()) + } + } + + + test("basic 2 arg df_udf") { + val add = df_udf((a: Column, b:Column) => a + b) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 0L), + Row(1L, 2L)), + spark.range(2).selectExpr("id", "add(id, id)").collect()) + assertSame(Array( + Row(0L, 0L), + Row(1L, 2L)), + spark.range(2).select(col("id"), add(col("id"), col("id"))).collect()) + } + } + + test("basic 3 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column) => a + b + c) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 0L), + Row(1L, 3L)), + spark.range(2).selectExpr("id", "add(id, id, id)").collect()) + assertSame(Array( + Row(0L, 0L), + Row(1L, 3L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), col("id"))).collect()) + } + } + + test("basic 4 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column) => a + b + c + d) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 1L), + Row(1L, 4L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id)").collect()) + assertSame(Array( + Row(0L, 1L), + Row(1L, 4L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), col("id"))).collect()) + } + } + + test("basic 5 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column, e:Column) => + a + b + c + d + e) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 2L), + Row(1L, 5L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1)").collect()) + assertSame(Array( + Row(0L, 2L), + Row(1L, 5L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1))).collect()) + } + } + + test("basic 6 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column, e:Column, f:Column) => + a + b + c + d + e + f) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 2L), + Row(1L, 6L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id)").collect()) + assertSame(Array( + Row(0L, 2L), + Row(1L, 6L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"))).collect()) + } + } + + test("basic 7 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column) => a + b + c + d + e + f + g) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 2L), + Row(1L, 
7L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id)").collect()) + assertSame(Array( + Row(0L, 2L), + Row(1L, 7L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"))).collect()) + } + } + + test("basic 8 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column, h:Column) => a + b + c + d + e + f + g + h) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 4L), + Row(1L, 9L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id, 2)").collect()) + assertSame(Array( + Row(0L, 4L), + Row(1L, 9L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"), lit(2))).collect()) + } + } + + test("basic 9 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column, h:Column, i:Column) => + a + b + c + d + e + f + g + h + i) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 4L), + Row(1L, 10L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id, 2, id)").collect()) + assertSame(Array( + Row(0L, 4L), + Row(1L, 10L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"), lit(2), col("id"))).collect()) + } + } + + test("basic 10 arg df_udf") { + val add = df_udf((a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column, h:Column, i:Column, j:Column) => + a + b + c + d + e + f + g + h + i + j) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 4L), + Row(1L, 11L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id, 2, id, id)").collect()) + assertSame(Array( + Row(0L, 4L), + Row(1L, 11L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"), lit(2), col("id"), col("id"))).collect()) + } + } + + test("nested df_udf") { + val add = df_udf((a: Column, b:Column) => a + b) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 22L), + Row(1L, 25L)), + spark.range(2).selectExpr("id", "add(add(id, 12), add(add(id, id), 10))").collect()) + } + } + + test("complex df_udf") { + val extractor = df_udf((json: Column) => { + val schema = StructType(Seq(StructField("values", ArrayType(LongType)))) + val extracted_json = from_json(json, schema, Map.empty[String, String]) + aggregate(extracted_json("values"), + lit(0L), + (a, b) => coalesce(a, lit(0L)) + coalesce(b, lit(0L)), + a => a) + }) + withSparkSession { spark => + import spark.implicits._ + spark.udf.register("extractor", extractor) + assertSame(Array( + Row(6L), + Row(3L)), + Seq("""{"values":[1,2,3]}""", + """{"values":[1, null, null, 2]}""").toDF("json").selectExpr("extractor(json)").collect()) + } + } + + test("j basic 0 arg df_udf") { + val zero = df_udf(new UDF0[Column] { + override def call(): Column = lit(0) + }) + withSparkSession{ spark => + spark.udf.register("zero", zero) + assertSame(Array( + Row(0L, 0), + Row(1L, 0)), + spark.range(2).selectExpr("id", "zero()").collect()) + assertSame(Array( + Row(0L, 0), + Row(1L, 0)), + spark.range(2).select(col("id"), zero()).collect()) + } + } + + test("jbasic 1 arg df_udf") { + val inc = df_udf(new UDF1[Column, Column] { + override def call(a: Column): Column = a + 1 + }) + withSparkSession { spark => + spark.udf.register("inc", inc) + assertSame(Array( 
+ Row(0L, 1L), + Row(1L, 2L)), + spark.range(2).selectExpr("id", "inc(id)").collect()) + assertSame(Array( + Row(0L, 1L), + Row(1L, 2L)), + spark.range(2).select(col("id"), inc(col("id"))).collect()) + } + } + + test("jbasic 2 arg df_udf") { + val add = df_udf(new UDF2[Column, Column, Column] { + override def call(a: Column, b:Column): Column = a + b + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 0L), + Row(1L, 2L)), + spark.range(2).selectExpr("id", "add(id, id)").collect()) + assertSame(Array( + Row(0L, 0L), + Row(1L, 2L)), + spark.range(2).select(col("id"), add(col("id"), col("id"))).collect()) + } + } + + test("jbasic 3 arg df_udf") { + val add = df_udf(new UDF3[Column, Column, Column, Column] { + override def call(a: Column, b: Column, c: Column): Column = a + b + c + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 0L), + Row(1L, 3L)), + spark.range(2).selectExpr("id", "add(id, id, id)").collect()) + assertSame(Array( + Row(0L, 0L), + Row(1L, 3L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), col("id"))).collect()) + } + } + + test("jbasic 4 arg df_udf") { + val add = df_udf(new UDF4[Column, Column, Column, Column, Column] { + override def call(a: Column, b:Column, c:Column, d:Column): Column = a + b + c + d + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 1L), + Row(1L, 4L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id)").collect()) + assertSame(Array( + Row(0L, 1L), + Row(1L, 4L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), col("id"))).collect()) + } + } + + test("jbasic 5 arg df_udf") { + val add = df_udf(new UDF5[Column, Column, Column, Column, Column, Column] { + override def call(a: Column, b: Column, c: Column, d: Column, e: Column): Column = + a + b + c + d + e + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 2L), + Row(1L, 5L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1)").collect()) + assertSame(Array( + Row(0L, 2L), + Row(1L, 5L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1))).collect()) + } + } + + test("jbasic 6 arg df_udf") { + val add = df_udf(new UDF6[Column, Column, Column, Column, Column, Column, Column] { + override def call(a: Column, b:Column, c:Column, d:Column, e:Column, f:Column) = + a + b + c + d + e + f + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 2L), + Row(1L, 6L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id)").collect()) + assertSame(Array( + Row(0L, 2L), + Row(1L, 6L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"))).collect()) + } + } + + test("jbasic 7 arg df_udf") { + val add = df_udf(new UDF7[Column, Column, Column, Column, Column, Column, Column, + Column] { + override def call(a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column): Column = a + b + c + d + e + f + g + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 2L), + Row(1L, 7L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id)").collect()) + assertSame(Array( + Row(0L, 2L), + Row(1L, 7L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"))).collect()) + } + } + + test("jbasic 8 arg df_udf") { + val add = df_udf(new UDF8[Column, 
Column, Column, Column, Column, Column, Column, + Column, Column] { + override def call(a: Column, b: Column, c: Column, d: Column, e: Column, + f: Column, g: Column, h: Column): Column = a + b + c + d + e + f + g + h + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 4L), + Row(1L, 9L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id, 2)").collect()) + assertSame(Array( + Row(0L, 4L), + Row(1L, 9L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"), lit(2))).collect()) + } + } + + test("jbasic 9 arg df_udf") { + val add = df_udf(new UDF9[Column, Column, Column, Column, Column, Column, Column, + Column, Column, Column] { + override def call(a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column, h:Column, i:Column): Column = + a + b + c + d + e + f + g + h + i + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 4L), + Row(1L, 10L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id, 2, id)").collect()) + assertSame(Array( + Row(0L, 4L), + Row(1L, 10L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"), lit(2), col("id"))).collect()) + } + } + + test("jbasic 10 arg df_udf") { + val add = df_udf(new UDF10[Column, Column, Column, Column, Column, Column, Column, + Column, Column, Column, Column] { + override def call(a: Column, b:Column, c:Column, d:Column, e:Column, + f:Column, g:Column, h:Column, i:Column, j:Column): Column = + a + b + c + d + e + f + g + h + i + j + }) + withSparkSession { spark => + spark.udf.register("add", add) + assertSame(Array( + Row(0L, 4L), + Row(1L, 11L)), + spark.range(2).selectExpr("id", "add(id, id, 1, id, 1, id, id, 2, id, id)").collect()) + assertSame(Array( + Row(0L, 4L), + Row(1L, 11L)), + spark.range(2).select(col("id"), add(col("id"), col("id"), lit(1), + col("id"), lit(1), col("id"), col("id"), lit(2), col("id"), col("id"))).collect()) + } + } +} \ No newline at end of file diff --git a/integration_tests/src/test/scala/org/apache/spark/sql/nvidia/SparkTestBase.scala b/integration_tests/src/test/scala/org/apache/spark/sql/nvidia/SparkTestBase.scala new file mode 100644 index 00000000000..2bd6697ffad --- /dev/null +++ b/integration_tests/src/test/scala/org/apache/spark/sql/nvidia/SparkTestBase.scala @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.nvidia + +import java.io.File +import java.nio.file.Files +import java.util.{Locale, TimeZone} + +import org.scalatest.BeforeAndAfterAll +import org.scalatest.funsuite.AnyFunSuite + +import org.apache.spark.SparkConf +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{Row, SparkSession} + +object SparkSessionHolder extends Logging { + private var spark = createSparkSession() + private var origConf = spark.conf.getAll + private var origConfKeys = origConf.keys.toSet + + private def setAllConfs(confs: Array[(String, String)]): Unit = confs.foreach { + case (key, value) if spark.conf.get(key, null) != value => + spark.conf.set(key, value) + case _ => // No need to modify it + } + + private def createSparkSession(): SparkSession = { + SparkSession.cleanupAnyExistingSession() + + TimeZone.setDefault(TimeZone.getTimeZone("UTC")) + Locale.setDefault(Locale.US) + + val builder = SparkSession.builder() + .master("local[1]") + .config("spark.sql.extensions", "com.nvidia.spark.DFUDFPlugin") + .config("spark.sql.warehouse.dir", sparkWarehouseDir.getAbsolutePath) + .appName("dataframe udf tests") + + builder.getOrCreate() + } + + private def reinitSession(): Unit = { + spark = createSparkSession() + origConf = spark.conf.getAll + origConfKeys = origConf.keys.toSet + } + + def sparkSession: SparkSession = { + if (SparkSession.getActiveSession.isEmpty) { + reinitSession() + } + spark + } + + def resetSparkSessionConf(): Unit = { + if (SparkSession.getActiveSession.isEmpty) { + reinitSession() + } else { + setAllConfs(origConf.toArray) + val currentKeys = spark.conf.getAll.keys.toSet + val toRemove = currentKeys -- origConfKeys + if (toRemove.contains("spark.shuffle.manager")) { + // cannot unset the config so need to reinitialize + reinitSession() + } else { + toRemove.foreach(spark.conf.unset) + } + } + logDebug(s"RESET CONF TO: ${spark.conf.getAll}") + } + + def withSparkSession[U](conf: SparkConf, f: SparkSession => U): U = { + resetSparkSessionConf() + logDebug(s"SETTING CONF: ${conf.getAll.toMap}") + setAllConfs(conf.getAll) + logDebug(s"RUN WITH CONF: ${spark.conf.getAll}\n") + f(spark) + } + + private lazy val sparkWarehouseDir: File = { + new File(System.getProperty("java.io.tmpdir")).mkdirs() + val path = Files.createTempDirectory("spark-warehouse") + val file = new File(path.toString) + file.deleteOnExit() + file + } +} + +/** + * Base to be able to run tests with a spark context + */ +trait SparkTestBase extends AnyFunSuite with BeforeAndAfterAll { + def withSparkSession[U](f: SparkSession => U): U = { + withSparkSession(new SparkConf, f) + } + + def withSparkSession[U](conf: SparkConf, f: SparkSession => U): U = { + SparkSessionHolder.withSparkSession(conf, f) + } + + override def afterAll(): Unit = { + super.afterAll() + SparkSession.cleanupAnyExistingSession() + } + + def assertSame(expected: Any, actual: Any, epsilon: Double = 0.0, + path: List[String] = List.empty): Unit = { + def assertDoublesAreEqualWithinPercentage(expected: Double, + actual: Double, path: List[String]): Unit = { + if (expected != actual) { + if (expected != 0) { + val v = Math.abs((expected - actual) / expected) + assert(v <= epsilon, + s"$path: ABS($expected - $actual) / ABS($actual) == $v is not <= $epsilon ") + } else { + val v = Math.abs(expected - actual) + assert(v <= epsilon, s"$path: ABS($expected - $actual) == $v is not <= $epsilon ") + } + } + } + (expected, actual) match { + case (a: Float, b: Float) if a.isNaN && b.isNaN => + case (a: Double, b: 
Double) if a.isNaN && b.isNaN => + case (null, null) => + case (null, other) => fail(s"$path: expected is null, but actual is $other") + case (other, null) => fail(s"$path: expected is $other, but actual is null") + case (a: Array[_], b: Array[_]) => + assert(a.length == b.length, + s"$path: expected (${a.toList}) and actual (${b.toList}) lengths don't match") + a.indices.foreach { i => + assertSame(a(i), b(i), epsilon, path :+ i.toString) + } + case (a: Map[_, _], b: Map[_, _]) => + throw new IllegalStateException(s"Maps are not supported yet for comparison $a vs $b") + case (a: Iterable[_], b: Iterable[_]) => + assert(a.size == b.size, + s"$path: expected (${a.toList}) and actual (${b.toList}) lengths don't match") + var i = 0 + a.zip(b).foreach { + case (l, r) => + assertSame(l, r, epsilon, path :+ i.toString) + i += 1 + } + case (a: Product, b: Product) => + assertSame(a.productIterator.toSeq, b.productIterator.toSeq, epsilon, path) + case (a: Row, b: Row) => + assertSame(a.toSeq, b.toSeq, epsilon, path) + // 0.0 == -0.0, turn float/double to bits before comparison, to distinguish 0.0 and -0.0. + case (a: Double, b: Double) if epsilon <= 0 => + assert(java.lang.Double.doubleToRawLongBits(a) == java.lang.Double.doubleToRawLongBits(b), s"$path: $a != $b") + case (a: Double, b: Double) if epsilon > 0 => + assertDoublesAreEqualWithinPercentage(a, b, path) + case (a: Float, b: Float) if epsilon <= 0 => + assert(java.lang.Float.floatToRawIntBits(a) == java.lang.Float.floatToRawIntBits(b), s"$path: $a != $b") + case (a: Float, b: Float) if epsilon > 0 => + assertDoublesAreEqualWithinPercentage(a, b, path) + case (a, b) => + assert(a == b, s"$path: $a != $b") + } + } +} diff --git a/jdk-profiles/pom.xml b/jdk-profiles/pom.xml index ce10ccb0db3..689f3576b89 100644 --- a/jdk-profiles/pom.xml +++ b/jdk-profiles/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.12 - 24.10.1 + 24.12.0 com.nvidia rapids-4-spark-jdk-profiles_2.12 pom Shim JDK Profiles - 24.10.1 + 24.12.0 jdk8 diff --git a/jenkins/Jenkinsfile-blossom.premerge-databricks b/jenkins/Jenkinsfile-blossom.premerge-databricks index 5b0a2bf1226..2f8b926898a 100644 --- a/jenkins/Jenkinsfile-blossom.premerge-databricks +++ b/jenkins/Jenkinsfile-blossom.premerge-databricks @@ -91,7 +91,7 @@ pipeline { // 'name' and 'value' only supprt literal string in the declarative Jenkins // Refer to Jenkins issue https://issues.jenkins.io/browse/JENKINS-62127 name 'DB_RUNTIME' - values '11.3', '12.2', '13.3' + values '11.3', '12.2', '13.3', '14.3' } } stages { @@ -175,20 +175,23 @@ void databricksBuild() { } } - stage("Test agaist $SPARK_MAJOR DB") { - script { - container('cpu') { - try { - withCredentials([file(credentialsId: 'SPARK_DATABRICKS_PRIVKEY', variable: 'DATABRICKS_PRIVKEY')]) { - def TEST_PARAMS = " -w $DATABRICKS_HOST -t $DATABRICKS_TOKEN -c $CLUSTER_ID -e TEST_MODE=$TEST_MODE" + - " -p $DATABRICKS_PRIVKEY -l ./jenkins/databricks/test.sh -v $BASE_SPARK_VERSION -d /home/ubuntu/test.sh" - if (params.SPARK_CONF) { - TEST_PARAMS += " -f ${params.SPARK_CONF}" + // TODO: Temporarily skip tests on Databricks 14.3 until the test failures are fixed + if (env.DB_RUNTIME != '14.3') { + stage("Test against $SPARK_MAJOR DB") { + script { + container('cpu') { + try { + withCredentials([file(credentialsId: 'SPARK_DATABRICKS_PRIVKEY', variable: 'DATABRICKS_PRIVKEY')]) { + def TEST_PARAMS = " -w $DATABRICKS_HOST -t $DATABRICKS_TOKEN -c $CLUSTER_ID -e TEST_MODE=$TEST_MODE" + + " -p $DATABRICKS_PRIVKEY -l ./jenkins/databricks/test.sh -v $BASE_SPARK_VERSION -d /home/ubuntu/test.sh" + if (params.SPARK_CONF)
{ + TEST_PARAMS += " -f ${params.SPARK_CONF}" + } + sh "python3 ./jenkins/databricks/run-tests.py $TEST_PARAMS" } - sh "python3 ./jenkins/databricks/run-tests.py $TEST_PARAMS" + } finally { + common.publishPytestResult(this, "${STAGE_NAME}") } - } finally { - common.publishPytestResult(this, "${STAGE_NAME}") } } } diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh index 25bade91968..f6ff6e913b6 100755 --- a/jenkins/databricks/build.sh +++ b/jenkins/databricks/build.sh @@ -73,6 +73,14 @@ initialize() # the version of Spark used when we install the Databricks jars in .m2 BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=${BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS:-$BASE_SPARK_VERSION} SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS=${BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS}-databricks + DBR_VER=$(cat /databricks/DBR_VERSION) + if [ $DBR_VER == '14.3' ]; then + DBR_VER=$(echo $DBR_VER | sed 's/\.//g') + # We are appending 143 in addition to the base spark version because Databricks 14.3 + # and Databricks 15.4 are both based on spark version 3.5.0 + BUILDVER="$BUILDVER$DBR_VER" + SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS="$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS-$DBR_VER" + fi # pull normal Spark artifacts and ignore errors then install databricks jars, then build again. # this should match the databricks init script. diff --git a/jenkins/databricks/clusterutils.py b/jenkins/databricks/clusterutils.py index 2c31fd0e6be..8e1b272ef5a 100644 --- a/jenkins/databricks/clusterutils.py +++ b/jenkins/databricks/clusterutils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -85,6 +85,8 @@ def wait_for_cluster_start(workspace, clusterid, token, retries=20, printLoc=sys if current_state in ['INTERNAL_ERROR', 'SKIPPED', 'TERMINATED'] or p >= 60: if p >= retries: print("Waited %d times already, stopping" % p) + # Output the cluster ID to stdout so a calling script can get it easily + print(clusterid, file=sys.stdout) sys.exit(4) p = p + 1 print("Done starting cluster", file=printLoc) diff --git a/jenkins/databricks/create.py b/jenkins/databricks/create.py index fa2c129da7b..4354886e5b7 100644 --- a/jenkins/databricks/create.py +++ b/jenkins/databricks/create.py @@ -27,16 +27,16 @@ def main(): workspace = 'https://dbc-9ff9942e-a9c4.cloud.databricks.com' token = '' sshkey = '' - cluster_name = 'CI-GPU-databricks-24.10.1' + cluster_name = 'CI-GPU-databricks-24.12.0' idletime = 240 - runtime = '7.0.x-gpu-ml-scala2.12' + runtime = '13.3.x-gpu-ml-scala2.12' num_workers = 1 worker_type = 'g4dn.xlarge' driver_type = 'g4dn.xlarge' cloud_provider = 'aws' # comma separated init scripts in Databricks workspace, e.g. /foo,/bar,... 
init_scripts = '' - aws_zone='us-west-2c' + aws_zone='auto' try: diff --git a/jenkins/databricks/deploy.sh b/jenkins/databricks/deploy.sh index 6c89af57631..1079ee7dc6a 100755 --- a/jenkins/databricks/deploy.sh +++ b/jenkins/databricks/deploy.sh @@ -29,7 +29,12 @@ SCALA_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=scala.binary.version - VERSION_NUM=${BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS//.} SPARK_VERSION_STR=spark$VERSION_NUM SPARK_PLUGIN_JAR_VERSION=`mvn help:evaluate -q -pl dist -Dexpression=project.version -DforceStdout` -DB_SHIM_NAME=${SPARK_VERSION_STR}db +# Append 143 into the db shim version because Databricks 14.3.x and 15.4.x are both based on spark version 3.5.0 +if [[ "$DB_RUNTIME" == "14.3"* ]]; then + DB_SHIM_NAME="${SPARK_VERSION_STR}db143" +else + DB_SHIM_NAME="${SPARK_VERSION_STR}db" +fi DBJARFPATH=./aggregator/target/${DB_SHIM_NAME}/rapids-4-spark-aggregator_$SCALA_VERSION-$SPARK_PLUGIN_JAR_VERSION-${DB_SHIM_NAME}.jar echo "Databricks jar is: $DBJARFPATH" MVN="mvn -Dmaven.wagon.http.retryHandler.count=3 -DretryFailedDeploymentCount=3" diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh index deaf127cd5a..16b90b95c0e 100755 --- a/jenkins/databricks/init_cudf_udf.sh +++ b/jenkins/databricks/init_cudf_udf.sh @@ -20,7 +20,7 @@ set -ex -CUDF_VER=${CUDF_VER:-24.10} +CUDF_VER=${CUDF_VER:-24.12} CUDA_VER=${CUDA_VER:-11.8} # Need to explicitly add conda into PATH environment, to activate conda environment. diff --git a/jenkins/databricks/install_deps.py b/jenkins/databricks/install_deps.py index 7b77396b3f8..23453912827 100644 --- a/jenkins/databricks/install_deps.py +++ b/jenkins/databricks/install_deps.py @@ -42,6 +42,11 @@ def define_deps(spark_version, scala_version): elif spark_version.startswith('3.4'): spark_prefix = '----ws_3_4' mvn_prefix = '--mvn' + elif spark_version.startswith('3.5'): + spark_prefix = '----ws_3_5' + mvn_prefix = '--mvn' + else: + raise Exception(f"Unsupported Databricks version {spark_version}") spark_suffix = f'hive-{hive_version}__hadoop-{hadoop_version}_{scala_version}' @@ -69,7 +74,7 @@ def define_deps(spark_version, scala_version): Artifact('org.apache.spark', f'spark-core_{scala_version}', f'{spark_prefix}--core--core-{spark_suffix}_deploy.jar'), Artifact('org.apache.spark', f'spark-versions_{scala_version}', - f'spark--versions--*--shim_{scala_version}_deploy.jar'), + f'spark--versions--*--shim*_{scala_version}_deploy.jar'), Artifact('org.apache.spark', f'databricks-versions_{scala_version}', f'common--build-info--build-info-spark_*_{scala_version}_deploy.jar'), # Spark Hive Patches @@ -125,15 +130,17 @@ def define_deps(spark_version, scala_version): Artifact('com.fasterxml.jackson.core', 'jackson-annotations', f'{prefix_ws_sp_mvn_hadoop}--com.fasterxml.jackson.core--jackson-annotations--com.fasterxml.jackson.core__jackson-annotations__*.jar'), Artifact('org.apache.spark', f'spark-avro_{scala_version}', - f'{spark_prefix}--vendor--avro--avro-*.jar'), + f'{prefix_ws_sp_mvn_hadoop}--org.apache.avro--avro--org.apache.avro*.jar' if spark_version.startswith('3.5') else f'{spark_prefix}--vendor--avro--avro-*.jar'), Artifact('org.apache.avro', 'avro-mapred', f'{prefix_ws_sp_mvn_hadoop}--org.apache.avro--avro-mapred--org.apache.avro__avro-mapred__*.jar'), Artifact('org.apache.avro', 'avro', f'{prefix_ws_sp_mvn_hadoop}--org.apache.avro--avro--org.apache.avro__avro__*.jar'), + Artifact('com.github.luben', 'zstd-jni', +
f'{prefix_ws_sp_mvn_hadoop}--com.github.luben--zstd-jni--com.github.luben__zstd-jni__*.jar'), ] # Parquet - if spark_version.startswith('3.4'): + if spark_version.startswith('3.4') or spark_version.startswith('3.5'): deps += [ Artifact('org.apache.parquet', 'parquet-hadoop', f'{spark_prefix}--third_party--parquet-mr--parquet-hadoop--parquet-hadoop-shaded--*--libparquet-hadoop-internal.jar'), @@ -162,7 +169,7 @@ def define_deps(spark_version, scala_version): # log4j-core - if spark_version.startswith('3.3') or spark_version.startswith('3.4'): + if spark_version.startswith('3.3') or spark_version.startswith('3.4') or spark_version.startswith('3.5'): deps += Artifact('org.apache.logging.log4j', 'log4j-core', f'{prefix_ws_sp_mvn_hadoop}--org.apache.logging.log4j--log4j-core--org.apache.logging.log4j__log4j-core__*.jar'), @@ -172,7 +179,7 @@ def define_deps(spark_version, scala_version): f'{prefix_ws_sp_mvn_hadoop}--org.scala-lang.modules--scala-parser-combinators_{scala_version}-*.jar') ] - if spark_version.startswith('3.4'): + if spark_version.startswith('3.4') or spark_version.startswith('3.5'): deps += [ # Spark Internal Logging Artifact('org.apache.spark', f'spark-common-utils_{scala_version}', f'{spark_prefix}--common--utils--common-utils-hive-2.3__hadoop-3.2_2.12_deploy.jar'), @@ -180,6 +187,12 @@ def define_deps(spark_version, scala_version): Artifact('org.apache.spark', f'spark-sql-api_{scala_version}', f'{spark_prefix}--sql--api--sql-api-hive-2.3__hadoop-3.2_2.12_deploy.jar') ] + if spark_version.startswith('3.5'): + deps += [ + Artifact('org.scala-lang.modules', f'scala-collection-compat_{scala_version}', + f'{prefix_ws_sp_mvn_hadoop}--org.scala-lang.modules--scala-collection-compat_{scala_version}--org.scala-lang.modules__scala-collection-compat_{scala_version}__2.11.0.jar'), + Artifact('org.apache.avro', f'avro-connector', f'{spark_prefix}--connector--avro--avro-hive-2.3__hadoop-3.2_2.12_shaded---606136534--avro-unshaded-hive-2.3__hadoop-3.2_2.12_deploy.jar') + ] return deps diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh index 7f7ba8d65a9..00735e02c84 100755 --- a/jenkins/spark-nightly-build.sh +++ b/jenkins/spark-nightly-build.sh @@ -19,6 +19,11 @@ set -ex . jenkins/version-def.sh +## MVN_OPT : maven options environment, e.g. MVN_OPT='-Dspark-rapids-jni.version=xxx' to specify spark-rapids-jni dependency's version. +MVN="mvn -Dmaven.wagon.http.retryHandler.count=3 -DretryFailedDeploymentCount=3 ${MVN_OPT} -Psource-javadoc" + +DIST_PL="dist" +DIST_PATH="$DIST_PL" # The path of the dist module is used only outside of the mvn cmd SCALA_BINARY_VER=${SCALA_BINARY_VER:-"2.12"} if [ $SCALA_BINARY_VER == "2.13" ]; then # Run scala2.13 build and test against JDK17 @@ -26,18 +31,14 @@ if [ $SCALA_BINARY_VER == "2.13" ]; then update-java-alternatives --set $JAVA_HOME java -version - cd scala2.13 - ln -sf ../jenkins jenkins + MVN="$MVN -f scala2.13/" + DIST_PATH="scala2.13/$DIST_PL" fi WORKSPACE=${WORKSPACE:-$(pwd)} ## export 'M2DIR' so that shims can get the correct Spark dependency info export M2DIR=${M2DIR:-"$WORKSPACE/.m2"} -## MVN_OPT : maven options environment, e.g. MVN_OPT='-Dspark-rapids-jni.version=xxx' to specify spark-rapids-jni dependency's version. 
-MVN="mvn -Dmaven.wagon.http.retryHandler.count=3 -DretryFailedDeploymentCount=3 ${MVN_OPT} -Psource-javadoc" - -DIST_PL="dist" function mvnEval { $MVN help:evaluate -q -pl $DIST_PL $MVN_URM_MIRROR -Prelease320 -Dmaven.repo.local=$M2DIR -DforceStdout -Dexpression=$1 } @@ -80,7 +81,7 @@ function distWithReducedPom { mvnCmd="deploy:deploy-file" if (( ${#CLASSIFIERS_ARR[@]} > 1 )); then # try move tmp artifacts back to target folder for simplifying separate release process - mv ${TMP_PATH}/${ART_ID}-${ART_VER}-*.jar ${DIST_PL}/target/ + mv ${TMP_PATH}/${ART_ID}-${ART_VER}-*.jar ${DIST_PATH}/target/ fi mvnExtraFlags="-Durl=${URM_URL}-local -DrepositoryId=snapshots -Dtypes=${DEPLOY_TYPES} -Dfiles=${DEPLOY_FILES} -Dclassifiers=${DEPLOY_CLASSIFIERS}" ;; @@ -166,7 +167,7 @@ if (( ${#CLASSIFIERS_ARR[@]} > 1 )); then # move artifacts to temp for deployment later artifactFile="${ART_ID}-${ART_VER}-${classifier}.jar" - mv ${DIST_PL}/target/${artifactFile} ${TMP_PATH}/ + mv ${DIST_PATH}/target/${artifactFile} ${TMP_PATH}/ # update deployment properties DEPLOY_TYPES="${DEPLOY_TYPES},jar" DEPLOY_FILES="${DEPLOY_FILES},${DIST_PL}/target/${artifactFile}" diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index e09558425e3..150de339e09 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -191,9 +191,6 @@ ci_scala213() { update-java-alternatives --set $JAVA_HOME java -version - cd scala2.13 - ln -sf ../jenkins jenkins - # Download a Scala 2.13 version of Spark prepare_spark 3.3.0 2.13 @@ -202,15 +199,15 @@ ci_scala213() { do echo "Spark version (Scala 2.13): $version" env -u SPARK_HOME \ - $MVN_CMD -U -B $MVN_URM_MIRROR -Dbuildver=$version clean install $MVN_BUILD_ARGS -Dpytest.TEST_TAGS='' + $MVN_CMD -f scala2.13/ -U -B $MVN_URM_MIRROR -Dbuildver=$version clean install $MVN_BUILD_ARGS -Dpytest.TEST_TAGS='' # Run filecache tests env -u SPARK_HOME SPARK_CONF=spark.rapids.filecache.enabled=true \ - $MVN_CMD -B $MVN_URM_MIRROR -Dbuildver=$version test -rf tests $MVN_BUILD_ARGS -Dpytest.TEST_TAGS='' \ + $MVN_CMD -f scala2.13/ -B $MVN_URM_MIRROR -Dbuildver=$version test -rf tests $MVN_BUILD_ARGS -Dpytest.TEST_TAGS='' \ -DwildcardSuites=org.apache.spark.sql.rapids.filecache.FileCacheIntegrationSuite done - $MVN_CMD -U -B $MVN_URM_MIRROR clean package $MVN_BUILD_ARGS -DskipTests=true - cd .. 
# Run integration tests in the project root dir to leverage test cases and resource files + $MVN_CMD -f scala2.13/ -U -B $MVN_URM_MIRROR clean package $MVN_BUILD_ARGS -DskipTests=true + export TEST_TAGS="not premerge_ci_1" export TEST_TYPE="pre-commit" # SPARK_HOME (and related) must be set to a Spark built with Scala 2.13 diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh index e09fe78cbf7..65cc2975380 100755 --- a/jenkins/spark-tests.sh +++ b/jenkins/spark-tests.sh @@ -41,7 +41,7 @@ RAPIDS_TEST_JAR="$ARTF_ROOT/rapids-4-spark-integration-tests_${SCALA_BINARY_VER} export INCLUDE_SPARK_AVRO_JAR=${INCLUDE_SPARK_AVRO_JAR:-"true"} if [[ "${INCLUDE_SPARK_AVRO_JAR}" == "true" ]]; then - $WGET_CMD $PROJECT_REPO/org/apache/spark/spark-avro_$SCALA_BINARY_VER/$SPARK_VER/spark-avro_$SCALA_BINARY_VER-${SPARK_VER}.jar + $WGET_CMD $SPARK_REPO/org/apache/spark/spark-avro_$SCALA_BINARY_VER/$SPARK_VER/spark-avro_$SCALA_BINARY_VER-${SPARK_VER}.jar fi $WGET_CMD $PROJECT_TEST_REPO/com/nvidia/rapids-4-spark-integration-tests_$SCALA_BINARY_VER/$PROJECT_TEST_VER/rapids-4-spark-integration-tests_$SCALA_BINARY_VER-$PROJECT_TEST_VER-pytest.tar.gz @@ -94,7 +94,7 @@ $WGET_CMD $SPARK_REPO/org/apache/spark/$SPARK_VER/spark-$SPARK_VER-$BIN_HADOOP_V # Download parquet-hadoop jar for parquet-read encryption tests PARQUET_HADOOP_VER=`mvn help:evaluate -q -N -Dexpression=parquet.hadoop.version -DforceStdout -Dbuildver=${SHUFFLE_SPARK_SHIM/spark/}` if [[ "$(printf '%s\n' "1.12.0" "$PARQUET_HADOOP_VER" | sort -V | head -n1)" = "1.12.0" ]]; then - $WGET_CMD $PROJECT_REPO/org/apache/parquet/parquet-hadoop/$PARQUET_HADOOP_VER/parquet-hadoop-$PARQUET_HADOOP_VER-tests.jar + $WGET_CMD $SPARK_REPO/org/apache/parquet/parquet-hadoop/$PARQUET_HADOOP_VER/parquet-hadoop-$PARQUET_HADOOP_VER-tests.jar fi export SPARK_HOME="$ARTF_ROOT/spark-$SPARK_VER-$BIN_HADOOP_VER" diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh index a56515751aa..7acdd6204a5 100755 --- a/jenkins/version-def.sh +++ b/jenkins/version-def.sh @@ -29,8 +29,8 @@ IFS=$PRE_IFS CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"} CLASSIFIER=${CLASSIFIER:-"$CUDA_CLASSIFIER"} # default as CUDA_CLASSIFIER for compatibility -PROJECT_VER=${PROJECT_VER:-"24.10.1"} -PROJECT_TEST_VER=${PROJECT_TEST_VER:-"24.10.1"} +PROJECT_VER=${PROJECT_VER:-"24.12.0"} +PROJECT_TEST_VER=${PROJECT_TEST_VER:-"24.12.0"} SPARK_VER=${SPARK_VER:-"3.2.0"} SPARK_VER_213=${SPARK_VER_213:-"3.3.0"} # Make a best attempt to set the default value for the shuffle shim. diff --git a/pom.xml b/pom.xml index 4e356af5db5..470f198e5fb 100644 --- a/pom.xml +++ b/pom.xml @@ -23,7 +23,7 @@ rapids-4-spark-parent_2.12 RAPIDS Accelerator for Apache Spark Root Project The root project of the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 pom https://nvidia.github.io/spark-rapids/ @@ -105,6 +105,7 @@ ${spark320.version} ${spark320.version} 1.12.1 + rapids-4-spark-delta-20x delta-lake/delta-20x @@ -125,6 +126,7 @@ ${spark321.version} ${spark321.version} 1.12.2 + rapids-4-spark-delta-20x
delta-lake/delta-20x @@ -145,6 +147,7 @@ ${spark321cdh.version} ${spark321cdh.version} 1.10.1 + rapids-4-spark-delta-20x true + release350db143 + + + buildver + 350db143 + + + + + 3.4.4 + spark350db143 + ${spark350db143.version} + ${spark350db143.version} + 3.3.1 + true + 1.12.0 + rapids-4-spark-delta-${spark.version.classifier} + ${spark330.iceberg.version} + + + shim-deps/databricks + delta-lake/delta-spark350db143 + + release351 @@ -562,6 +644,7 @@ ${spark351.version} ${spark351.version} 1.13.1 + rapids-4-spark-delta-stub ${spark330.iceberg.version} 2.0.7
@@ -582,6 +665,28 @@ ${spark352.version} ${spark352.version} 1.13.1 + rapids-4-spark-delta-stub + ${spark330.iceberg.version} + 2.0.7 +
+ + delta-lake/delta-stub + + + + release353 + + + buildver + 353 + + + + 353 + ${spark353.version} + ${spark353.version} + 1.13.1 + rapids-4-spark-delta-stub ${spark330.iceberg.version} 2.0.7 @@ -603,6 +708,7 @@ ${spark400.version} ${spark400.version} 1.13.1 + rapids-4-spark-delta-stub ${spark330.iceberg.version} 2.0.7
@@ -707,7 +813,8 @@ - + + 350db143 . @@ -722,8 +829,8 @@ spark${buildver} cuda11 ${cuda.version} - 24.10.0 - 24.10.0 + 24.12.0-SNAPSHOT + 24.12.0-SNAPSHOT 2.12 2.8.0 incremental @@ -749,6 +856,15 @@ ${spark.version.classifier}.com.nvidia.shaded.spark none package + + + DEFINE_FOR_EVERY_SPARK_SHIM + + ${rapids.delta.artifactId1} + ${rapids.delta.artifactId1} true UTF-8 @@ -776,14 +892,17 @@ 3.4.1 3.4.2 3.4.3 + 3.4.4 3.3.0.3.3.7180.0-274 3.3.2.3.3.7190.0-91 3.3.0-databricks 3.3.2-databricks 3.4.1-databricks + 3.5.0-databricks-143 3.5.0 3.5.1 3.5.2 + 3.5.3 4.0.0-SNAPSHOT 3.12.4 @@ -1467,6 +1586,11 @@ This will force full Scala code rebuild in downstream modules. Minimum Maven version 3.6.x required [3.6,) + + At least one of rapids.delta.artifactId1, rapids.delta.artifactId2 ... is required in the POM profile "release${buildver}" + rapids.delta.artifactId1 + ^rapids-4-spark-delta-.* + Only Java 8, 11, and 17 are supported! @@ -1568,14 +1692,11 @@ This will force full Scala code rebuild in downstream modules. ${maven.scalastyle.skip} - - - - - - - - + + + + + Checking scalastyle for all modules using following paths: ${scalastyle.dirs} diff --git a/scala2.13/README.md b/scala2.13/README.md index 4096363cf52..f2e5200c9c7 100644 --- a/scala2.13/README.md +++ b/scala2.13/README.md @@ -25,8 +25,7 @@ You can use Maven to build the plugin. Like with Scala 2.12, we recommend buildi phase. ```shell script -cd scala2.13 -mvn verify +mvn verify -f scala2.13/ ``` After a successful build, the RAPIDS Accelerator jar will be in the `scala2.13/dist/target/` directory. @@ -45,7 +44,6 @@ You can also use the `buildall` script in the parent directory to build against of Apache Spark. ```shell script -cd .. ./build/buildall --profile=noSnapshotsScala213 ``` @@ -72,4 +70,4 @@ That way any new dependencies or other changes will be picked up in the Scala 2. You should be able to open the `scala2.13` directory directly in IntelliJ as a separate project. You can build and debug as normal, although there are slight differences in how to navigate the source. In particular, when you select a particular build profile, you will only be able to navigate the source used by modules that are included for that -spark version. \ No newline at end of file +spark version. diff --git a/scala2.13/aggregator/pom.xml b/scala2.13/aggregator/pom.xml index 6322f0f9701..053e9370deb 100644 --- a/scala2.13/aggregator/pom.xml +++ b/scala2.13/aggregator/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.1 + 24.12.0 ../jdk-profiles/pom.xml rapids-4-spark-aggregator_2.13 RAPIDS Accelerator for Apache Spark Aggregator Creates an aggregated shaded package of the RAPIDS plugin for Apache Spark - 24.10.1 + 24.12.0 aggregator @@ -71,6 +71,28 @@ ${spark-rapids-private.version} ${spark.version.classifier}
+ + + com.nvidia + ${rapids.delta.artifactId1}_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + + + com.nvidia + ${rapids.delta.artifactId2}_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + + + com.nvidia + ${rapids.delta.artifactId3}_${scala.binary.version} + ${project.version} + ${spark.version.classifier} +
@@ -262,507 +284,4 @@ - - - - release320 - - - - buildver - 320 - - - - - com.nvidia - rapids-4-spark-delta-20x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release321 - - - buildver - 321 - - - - - com.nvidia - rapids-4-spark-delta-20x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release321cdh - - - buildver - 321cdh - - - - - com.nvidia - rapids-4-spark-delta-20x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release322 - - - buildver - 322 - - - - - com.nvidia - rapids-4-spark-delta-20x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release323 - - - buildver - 323 - - - - - com.nvidia - rapids-4-spark-delta-20x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release324 - - - buildver - 324 - - - - - com.nvidia - rapids-4-spark-delta-20x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release330 - - - true - - - buildver - 330 - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release330cdh - - - buildver - 330cdh - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release332cdh - - - buildver - 332cdh - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release330db - - - buildver - 330db - - - - - com.nvidia - rapids-4-spark-delta-spark330db_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release331 - - - buildver - 331 - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release332 - - - buildver - 332 - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release332db - - - buildver - 332db - - - - - com.nvidia - rapids-4-spark-delta-spark332db_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release341db - - - buildver - 341db - - - - - com.nvidia - rapids-4-spark-delta-spark341db_${scala.binary.version} - ${project.version} 
- ${spark.version.classifier} - - - - - release333 - - - buildver - 333 - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release334 - - - buildver - 334 - - - - - com.nvidia - rapids-4-spark-delta-21x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-22x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - com.nvidia - rapids-4-spark-delta-23x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release340 - - - buildver - 340 - - - - - com.nvidia - rapids-4-spark-delta-24x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release341 - - - buildver - 341 - - - - - com.nvidia - rapids-4-spark-delta-24x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release342 - - - buildver - 342 - - - - - com.nvidia - rapids-4-spark-delta-24x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release343 - - - buildver - 343 - - - - - com.nvidia - rapids-4-spark-delta-24x_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release350 - - - buildver - 350 - - - - - com.nvidia - rapids-4-spark-delta-stub_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release351 - - - buildver - 351 - - - - - com.nvidia - rapids-4-spark-delta-stub_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - release352 - - - buildver - 352 - - - - - com.nvidia - rapids-4-spark-delta-stub_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - - release400 - - - buildver - 400 - - - - - com.nvidia - rapids-4-spark-delta-stub_${scala.binary.version} - ${project.version} - ${spark.version.classifier} - - - - - diff --git a/scala2.13/api_validation/pom.xml b/scala2.13/api_validation/pom.xml index 2b42c69e42b..7e73ed7ae72 100644 --- a/scala2.13/api_validation/pom.xml +++ b/scala2.13/api_validation/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.1 + 24.12.0 ../shim-deps/pom.xml rapids-4-spark-api-validation_2.13 - 24.10.1 + 24.12.0 api_validation diff --git a/scala2.13/datagen/pom.xml b/scala2.13/datagen/pom.xml index 8f74c241cab..b5f638aa9cf 100644 --- a/scala2.13/datagen/pom.xml +++ b/scala2.13/datagen/pom.xml @@ -21,18 +21,19 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.1 + 24.12.0 ../shim-deps/pom.xml datagen_2.13 Data Generator Tools for generating large amounts of data - 24.10.1 + 24.12.0 datagen **/* package + ${project.build.outputDirectory}/datagen-version-info.properties diff --git a/scala2.13/delta-lake/delta-20x/pom.xml b/scala2.13/delta-lake/delta-20x/pom.xml index 683c7d93d4c..0eee940615f 100644 --- a/scala2.13/delta-lake/delta-20x/pom.xml +++ b/scala2.13/delta-lake/delta-20x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.1 + 24.12.0 ../../jdk-profiles/pom.xml rapids-4-spark-delta-20x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.0.x Support Delta Lake 2.0.x support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-20x diff --git 
a/scala2.13/delta-lake/delta-21x/pom.xml b/scala2.13/delta-lake/delta-21x/pom.xml index 48a61e6be3e..2b457c80aea 100644 --- a/scala2.13/delta-lake/delta-21x/pom.xml +++ b/scala2.13/delta-lake/delta-21x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.1 + 24.12.0 ../../jdk-profiles/pom.xml rapids-4-spark-delta-21x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.1.x Support Delta Lake 2.1.x support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-21x diff --git a/scala2.13/delta-lake/delta-22x/pom.xml b/scala2.13/delta-lake/delta-22x/pom.xml index 95ce774c349..42e5685444c 100644 --- a/scala2.13/delta-lake/delta-22x/pom.xml +++ b/scala2.13/delta-lake/delta-22x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.1 + 24.12.0 ../../jdk-profiles/pom.xml rapids-4-spark-delta-22x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.2.x Support Delta Lake 2.2.x support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-22x diff --git a/scala2.13/delta-lake/delta-23x/pom.xml b/scala2.13/delta-lake/delta-23x/pom.xml index e2773a45775..05a38a75ce6 100644 --- a/scala2.13/delta-lake/delta-23x/pom.xml +++ b/scala2.13/delta-lake/delta-23x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.10.1 + 24.12.0 ../../pom.xml rapids-4-spark-delta-23x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.3.x Support Delta Lake 2.3.x support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-23x diff --git a/scala2.13/delta-lake/delta-24x/pom.xml b/scala2.13/delta-lake/delta-24x/pom.xml index 71db006ad1a..c1b106d0f55 100644 --- a/scala2.13/delta-lake/delta-24x/pom.xml +++ b/scala2.13/delta-lake/delta-24x/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.1 + 24.12.0 ../../jdk-profiles/pom.xml rapids-4-spark-delta-24x_2.13 RAPIDS Accelerator for Apache Spark Delta Lake 2.4.x Support Delta Lake 2.4.x support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-24x diff --git a/scala2.13/delta-lake/delta-spark330db/pom.xml b/scala2.13/delta-lake/delta-spark330db/pom.xml index fc9fa866490..2fa49fe5847 100644 --- a/scala2.13/delta-lake/delta-spark330db/pom.xml +++ b/scala2.13/delta-lake/delta-spark330db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.1 + 24.12.0 ../../shim-deps/pom.xml rapids-4-spark-delta-spark330db_2.13 RAPIDS Accelerator for Apache Spark Databricks 11.3 Delta Lake Support Databricks 11.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-spark330db diff --git a/scala2.13/delta-lake/delta-spark332db/pom.xml b/scala2.13/delta-lake/delta-spark332db/pom.xml index 1b08c59eabe..a8cae9d2f82 100644 --- a/scala2.13/delta-lake/delta-spark332db/pom.xml +++ b/scala2.13/delta-lake/delta-spark332db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.1 + 24.12.0 ../../shim-deps/pom.xml rapids-4-spark-delta-spark332db_2.13 RAPIDS Accelerator for Apache Spark Databricks 12.2 Delta Lake Support Databricks 12.2 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-spark332db diff --git a/scala2.13/delta-lake/delta-spark341db/pom.xml b/scala2.13/delta-lake/delta-spark341db/pom.xml index cb48f620886..9583419dc7c 100644 --- a/scala2.13/delta-lake/delta-spark341db/pom.xml +++ 
b/scala2.13/delta-lake/delta-spark341db/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.1 + 24.12.0 ../../shim-deps/pom.xml rapids-4-spark-delta-spark341db_2.13 RAPIDS Accelerator for Apache Spark Databricks 13.3 Delta Lake Support Databricks 13.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 false diff --git a/scala2.13/delta-lake/delta-spark350db143/pom.xml b/scala2.13/delta-lake/delta-spark350db143/pom.xml new file mode 100644 index 00000000000..da47b99455c --- /dev/null +++ b/scala2.13/delta-lake/delta-spark350db143/pom.xml @@ -0,0 +1,85 @@ + + + + 4.0.0 + + + com.nvidia + rapids-4-spark-shim-deps-parent_2.13 + 24.12.0 + ../../shim-deps/pom.xml + + + rapids-4-spark-delta-spark350db143_2.13 + RAPIDS Accelerator for Apache Spark Databricks 14.3 Delta Lake Support + Databricks 14.3 Delta Lake support for the RAPIDS Accelerator for Apache Spark + 24.12.0 + + + false + **/* + package + + + + + org.roaringbitmap + RoaringBitmap + + + com.nvidia + rapids-4-spark-sql_${scala.binary.version} + ${project.version} + ${spark.version.classifier} + provided + + + + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-common-sources + generate-sources + + add-source + + + + ${project.basedir}/../common/src/main/scala + ${project.basedir}/../common/src/main/databricks/scala + + + + + + + net.alchim31.maven + scala-maven-plugin + + + org.apache.rat + apache-rat-plugin + + + + diff --git a/scala2.13/delta-lake/delta-stub/pom.xml b/scala2.13/delta-lake/delta-stub/pom.xml index 5698b32e431..989450c3e7e 100644 --- a/scala2.13/delta-lake/delta-stub/pom.xml +++ b/scala2.13/delta-lake/delta-stub/pom.xml @@ -22,14 +22,14 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.1 + 24.12.0 ../../jdk-profiles/pom.xml rapids-4-spark-delta-stub_2.13 RAPIDS Accelerator for Apache Spark Delta Lake Stub Delta Lake stub for the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 ../delta-lake/delta-stub diff --git a/scala2.13/dist/pom.xml b/scala2.13/dist/pom.xml index 3634bf3b78d..d11161e9d7e 100644 --- a/scala2.13/dist/pom.xml +++ b/scala2.13/dist/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-jdk-profiles_2.13 - 24.10.1 + 24.12.0 ../jdk-profiles/pom.xml rapids-4-spark_2.13 RAPIDS Accelerator for Apache Spark Distribution Creates the distribution package of the RAPIDS plugin for Apache Spark - 24.10.1 + 24.12.0 com.nvidia diff --git a/scala2.13/integration_tests/pom.xml b/scala2.13/integration_tests/pom.xml index 1b9d2f2ea64..0f82e0b9186 100644 --- a/scala2.13/integration_tests/pom.xml +++ b/scala2.13/integration_tests/pom.xml @@ -22,11 +22,11 @@ com.nvidia rapids-4-spark-shim-deps-parent_2.13 - 24.10.1 + 24.12.0 ../shim-deps/pom.xml rapids-4-spark-integration-tests_2.13 - 24.10.1 + 24.12.0 integration_tests diff --git a/scala2.13/jdk-profiles/pom.xml b/scala2.13/jdk-profiles/pom.xml index febb2bf230a..808031d488b 100644 --- a/scala2.13/jdk-profiles/pom.xml +++ b/scala2.13/jdk-profiles/pom.xml @@ -22,13 +22,13 @@ com.nvidia rapids-4-spark-parent_2.13 - 24.10.1 + 24.12.0 com.nvidia rapids-4-spark-jdk-profiles_2.13 pom Shim JDK Profiles - 24.10.1 + 24.12.0 jdk8 diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index d920a059d85..baebe599dc1 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -23,7 +23,7 @@ rapids-4-spark-parent_2.13 RAPIDS Accelerator for Apache Spark Root Project The root project of the RAPIDS Accelerator for Apache Spark - 24.10.1 + 24.12.0 pom https://nvidia.github.io/spark-rapids/ @@ -105,6 +105,7 @@
${spark320.version} ${spark320.version} 1.12.1 + rapids-4-spark-delta-20x delta-lake/delta-20x @@ -125,6 +126,7 @@ ${spark321.version} ${spark321.version} 1.12.2 + rapids-4-spark-delta-20x delta-lake/delta-20x @@ -145,6 +147,7 @@ ${spark321cdh.version} ${spark321cdh.version} 1.10.1 + rapids-4-spark-delta-20x true --> + release350db143 + + + buildver + 350db143 + + + + + 3.4.4 + spark350db143 + ${spark350db143.version} + ${spark350db143.version} + 3.3.1 + true + 1.12.0 + rapids-4-spark-delta-${spark.version.classifier} + ${spark330.iceberg.version} + + + shim-deps/databricks + delta-lake/delta-spark350db143 + + release351 @@ -562,6 +644,7 @@ ${spark351.version} ${spark351.version} 1.13.1 + rapids-4-spark-delta-stub ${spark330.iceberg.version} 2.0.7 @@ -582,6 +665,28 @@ ${spark352.version} ${spark352.version} 1.13.1 + rapids-4-spark-delta-stub + ${spark330.iceberg.version} + 2.0.7 + + + delta-lake/delta-stub + + + + release353 + + + buildver + 353 + + + + 353 + ${spark353.version} + ${spark353.version} + 1.13.1 + rapids-4-spark-delta-stub ${spark330.iceberg.version} 2.0.7 @@ -603,6 +708,7 @@ ${spark400.version} ${spark400.version} 1.13.1 + rapids-4-spark-delta-stub ${spark330.iceberg.version} 2.0.7 @@ -707,7 +813,8 @@ - + + 350db143 . @@ -722,8 +829,8 @@ spark${buildver} cuda11 ${cuda.version} - 24.10.0 - 24.10.0 + 24.12.0-SNAPSHOT + 24.12.0-SNAPSHOT 2.13 2.8.0 incremental @@ -749,6 +856,15 @@ ${spark.version.classifier}.com.nvidia.shaded.spark none package + + + DEFINE_FOR_EVERY_SPARK_SHIM + + ${rapids.delta.artifactId1} + ${rapids.delta.artifactId1} true UTF-8 @@ -776,14 +892,17 @@ 3.4.1 3.4.2 3.4.3 + 3.4.4 3.3.0.3.3.7180.0-274 3.3.2.3.3.7190.0-91 3.3.0-databricks 3.3.2-databricks 3.4.1-databricks + 3.5.0-databricks-143 3.5.0 3.5.1 3.5.2 + 3.5.3 4.0.0-SNAPSHOT 3.12.4 @@ -1467,6 +1586,11 @@ This will force full Scala code rebuild in downstream modules. Minimum Maven version 3.6.x required [3.6,) + + At least one of rapids.delta.artifactId1, rapids.delta.artifactId2 ... is required in the POM profile "release${buildver}" + rapids.delta.artifactId1 + ^rapids-4-spark-delta-.* +