diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index b3cbbb6ad14..4b8071303c1 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -33,45 +33,49 @@ jobs: args: ${{ env.args }} # This job only runs for pull request comments - if: contains( '\ - abellina,\ - anfeng,\ - firestarman,\ - GaryShen2008,\ - jlowe,\ - kuhushukla,\ - mythrocks,\ - nartal1,\ - nvdbaranec,\ - NvTimLiu,\ - razajafri,\ - revans2,\ - rwlee,\ - sameerz,\ - tgravescs,\ - wbo4958,\ - wjxiz1992,\ - sperlingxx,\ - hyperbolic2346,\ - gerashegalov,\ - ttnghia,\ - nvliyuan,\ - res-life,\ - HaoYang670,\ - NVnavkumar,\ - amahussein,\ - mattahrens,\ - YanxuanLiu,\ - cindyyuanjiang,\ - thirtiseven,\ - winningsix,\ - viadea,\ - yinqingh,\ - parthosa,\ - liurenjie1024,\ - binmahone,\ - zpuller,\ - ', format('{0},', github.actor)) && github.event.comment.body == 'build' + if: | + github.event.comment.body == 'build' && + ( + github.actor == 'abellina' || + github.actor == 'anfeng' || + github.actor == 'firestarman' || + github.actor == 'GaryShen2008' || + github.actor == 'jlowe' || + github.actor == 'kuhushukla' || + github.actor == 'mythrocks' || + github.actor == 'nartal1' || + github.actor == 'nvdbaranec' || + github.actor == 'NvTimLiu' || + github.actor == 'razajafri' || + github.actor == 'revans2' || + github.actor == 'rwlee' || + github.actor == 'sameerz' || + github.actor == 'tgravescs' || + github.actor == 'wbo4958' || + github.actor == 'wjxiz1992' || + github.actor == 'sperlingxx' || + github.actor == 'hyperbolic2346' || + github.actor == 'gerashegalov' || + github.actor == 'ttnghia' || + github.actor == 'nvliyuan' || + github.actor == 'res-life' || + github.actor == 'HaoYang670' || + github.actor == 'NVnavkumar' || + github.actor == 'amahussein' || + github.actor == 'mattahrens' || + github.actor == 'YanxuanLiu' || + github.actor == 'cindyyuanjiang' || + github.actor == 'thirtiseven' || + github.actor == 'winningsix' || + github.actor == 'viadea' || + github.actor == 'yinqingh' || + github.actor == 'parthosa' || + github.actor == 'liurenjie1024' || + github.actor == 'binmahone' || + github.actor == 'zpuller' || + github.actor == 'pxLi' || + github.actor == 'Feng-Jiang28' + ) steps: - name: Check if comment is issued by authorized person run: blossom-ci diff --git a/CHANGELOG.md b/CHANGELOG.md index 0340afa5931..788fed0a98f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,121 @@ # Change log -Generated on 2024-05-20 +Generated on 2024-06-13 + +## Release 24.06 + +### Features +||| +|:---|:---| +|[#10850](https://github.com/NVIDIA/spark-rapids/issues/10850)|[FEA] Refine the test framework introduced in #10745| +|[#6969](https://github.com/NVIDIA/spark-rapids/issues/6969)|[FEA] Support parse_url | +|[#10496](https://github.com/NVIDIA/spark-rapids/issues/10496)|[FEA] Drop support for CentOS7| +|[#10760](https://github.com/NVIDIA/spark-rapids/issues/10760)|[FEA]Support ArrayFilter| +|[#10721](https://github.com/NVIDIA/spark-rapids/issues/10721)|[FEA] Dump the complete set of build-info properties to the Spark eventLog| +|[#10666](https://github.com/NVIDIA/spark-rapids/issues/10666)|[FEA] Create Spark 3.4.3 shim| + +### Performance +||| +|:---|:---| +|[#8963](https://github.com/NVIDIA/spark-rapids/issues/8963)|[FEA] Use custom kernel for parse_url| +|[#10817](https://github.com/NVIDIA/spark-rapids/issues/10817)|[FOLLOW ON] Combining regex parsing in transpiling and regex rewrite in `rlike`| 
+|[#10821](https://github.com/NVIDIA/spark-rapids/issues/10821)|Rewrite `pattern[A-B]{X,Y}` (a pattern string followed by X to Y chars in range A - B) in `RLIKE` to a custom kernel| + +### Bugs Fixed +||| +|:---|:---| +|[#10928](https://github.com/NVIDIA/spark-rapids/issues/10928)|[BUG] 24.06 test_conditional_with_side_effects_case_when test failed on Scala 2.13 with DATAGEN_SEED=1716656294| +|[#10941](https://github.com/NVIDIA/spark-rapids/issues/10941)|[BUG] Failed to build on databricks due to GpuOverrides.scala:4264: not found: type GpuSubqueryBroadcastMeta| +|[#10902](https://github.com/NVIDIA/spark-rapids/issues/10902)|Spark UT failed: SPARK-37360: Timestamp type inference for a mix of TIMESTAMP_NTZ and TIMESTAMP_LTZ| +|[#10899](https://github.com/NVIDIA/spark-rapids/issues/10899)|[BUG] format_number Spark UT failed because Type conversion is not allowed| +|[#10913](https://github.com/NVIDIA/spark-rapids/issues/10913)|[BUG] rlike with empty pattern failed with 'NoSuchElementException' when enabling regex rewrite| +|[#10774](https://github.com/NVIDIA/spark-rapids/issues/10774)|[BUG] Issues found by Spark UT Framework on RapidsRegexpExpressionsSuite| +|[#10606](https://github.com/NVIDIA/spark-rapids/issues/10606)|[BUG] Update Plugin to use the new `getPartitionedFile` method| +|[#10806](https://github.com/NVIDIA/spark-rapids/issues/10806)|[BUG] orc_write_test.py::test_write_round_trip_corner failed with DATAGEN_SEED=1715517863| +|[#10831](https://github.com/NVIDIA/spark-rapids/issues/10831)|[BUG] Failed to read data from iceberg| +|[#10810](https://github.com/NVIDIA/spark-rapids/issues/10810)|[BUG] NPE when running `ParseUrl` tests in `RapidsStringExpressionsSuite`| +|[#10797](https://github.com/NVIDIA/spark-rapids/issues/10797)|[BUG] udf_test test_single_aggregate_udf, test_group_aggregate_udf and test_group_apply_udf_more_types failed on DB 13.3| +|[#10719](https://github.com/NVIDIA/spark-rapids/issues/10719)|[BUG] test_exact_percentile_groupby FAILED: hash_aggregate_test.py::test_exact_percentile_groupby with DATAGEN seed 1713362217| +|[#10738](https://github.com/NVIDIA/spark-rapids/issues/10738)|[BUG] test_exact_percentile_groupby_partial_fallback_to_cpu failed with DATAGEN_SEED=1713928179| +|[#10768](https://github.com/NVIDIA/spark-rapids/issues/10768)|[DOC] Dead links with tools pages| +|[#10751](https://github.com/NVIDIA/spark-rapids/issues/10751)|[BUG] Cascaded Pandas UDFs not working as expected on Databricks when plugin is enabled| +|[#10318](https://github.com/NVIDIA/spark-rapids/issues/10318)|[BUG] `fs.azure.account.keyInvalid` configuration issue while reading from Unity Catalog Tables on Azure DB| +|[#10722](https://github.com/NVIDIA/spark-rapids/issues/10722)|[BUG] "Could not find any rapids-4-spark jars in classpath" error when debugging UT in IDEA| +|[#10724](https://github.com/NVIDIA/spark-rapids/issues/10724)|[BUG] Failed to convert string with invisible characters to float| +|[#10633](https://github.com/NVIDIA/spark-rapids/issues/10633)|[BUG] ScanJson and JsonToStructs can give almost random errors| +|[#10659](https://github.com/NVIDIA/spark-rapids/issues/10659)|[BUG] from_json ArrayIndexOutOfBoundsException in 24.02| +|[#10656](https://github.com/NVIDIA/spark-rapids/issues/10656)|[BUG] Databricks cache tests failing with host memory OOM| + +### PRs +||| +|:---|:---| +|[#11052](https://github.com/NVIDIA/spark-rapids/pull/11052)|Add spark343 shim for scala2.13 dist jar| +|[#10981](https://github.com/NVIDIA/spark-rapids/pull/10981)|Update latest changelog [skip ci]| 
+|[#10984](https://github.com/NVIDIA/spark-rapids/pull/10984)|[DOC] Update docs for 24.06.0 release [skip ci]| +|[#10974](https://github.com/NVIDIA/spark-rapids/pull/10974)|Update rapids JNI and private dependency to 24.06.0| +|[#10947](https://github.com/NVIDIA/spark-rapids/pull/10947)|Prevent contains-PrefixRange optimization if not preceded by wildcards| +|[#10934](https://github.com/NVIDIA/spark-rapids/pull/10934)|Revert "Add Support for Multiple Filtering Keys for Subquery Broadcast "| +|[#10870](https://github.com/NVIDIA/spark-rapids/pull/10870)|Add support for self-contained profiling| +|[#10903](https://github.com/NVIDIA/spark-rapids/pull/10903)|Use upper case for LEGACY_TIME_PARSER_POLICY to fix a spark UT| +|[#10900](https://github.com/NVIDIA/spark-rapids/pull/10900)|Fix type convert error in format_number scalar input| +|[#10868](https://github.com/NVIDIA/spark-rapids/pull/10868)|Disable default cuDF pinned pool| +|[#10914](https://github.com/NVIDIA/spark-rapids/pull/10914)|Fix NoSuchElementException when rlike with empty pattern| +|[#10858](https://github.com/NVIDIA/spark-rapids/pull/10858)|Add Support for Multiple Filtering Keys for Subquery Broadcast | +|[#10861](https://github.com/NVIDIA/spark-rapids/pull/10861)|refine ut framework including Part 1 and Part 2| +|[#10872](https://github.com/NVIDIA/spark-rapids/pull/10872)|[DOC] ignore released plugin links to reduce the bother info [skip ci]| +|[#10839](https://github.com/NVIDIA/spark-rapids/pull/10839)|Replace anonymous classes for SortOrder and FIlterExec overrides| +|[#10873](https://github.com/NVIDIA/spark-rapids/pull/10873)|Auto merge PRs to branch-24.08 from branch-24.06 [skip ci]| +|[#10860](https://github.com/NVIDIA/spark-rapids/pull/10860)|[Spark 4.0] Account for `PartitionedFileUtil.getPartitionedFile` signature change.| +|[#10822](https://github.com/NVIDIA/spark-rapids/pull/10822)|Rewrite regex pattern `literal[a-b]{x}` to custom kernel in rlike| +|[#10833](https://github.com/NVIDIA/spark-rapids/pull/10833)|Filter out unused json_path tokens| +|[#10855](https://github.com/NVIDIA/spark-rapids/pull/10855)|Fix auto merge conflict 10845 [[skip ci]]| +|[#10826](https://github.com/NVIDIA/spark-rapids/pull/10826)|Add NVTX ranges to identify Spark stages and tasks| +|[#10846](https://github.com/NVIDIA/spark-rapids/pull/10846)|Update latest changelog [skip ci]| +|[#10836](https://github.com/NVIDIA/spark-rapids/pull/10836)|Catch exceptions when trying to examine Iceberg scan for metadata queries| +|[#10824](https://github.com/NVIDIA/spark-rapids/pull/10824)|Support zstd for GPU shuffle compression| +|[#10828](https://github.com/NVIDIA/spark-rapids/pull/10828)|Added DateTimeUtilsShims [Databricks]| +|[#10829](https://github.com/NVIDIA/spark-rapids/pull/10829)|Fix `Inheritance Shadowing` to add support for Spark 4.0.0| +|[#10811](https://github.com/NVIDIA/spark-rapids/pull/10811)|Fix NPE in GpuParseUrl for null keys.| +|[#10723](https://github.com/NVIDIA/spark-rapids/pull/10723)|Implement chunked ORC reader| +|[#10715](https://github.com/NVIDIA/spark-rapids/pull/10715)|Rewrite some rlike expression to StartsWith/Contains| +|[#10820](https://github.com/NVIDIA/spark-rapids/pull/10820)|workaround #10801 temporally| +|[#10812](https://github.com/NVIDIA/spark-rapids/pull/10812)|Replace ThreadPoolExecutor creation with ThreadUtils API| +|[#10816](https://github.com/NVIDIA/spark-rapids/pull/10816)|Fix a test error for DB13.3| +|[#10813](https://github.com/NVIDIA/spark-rapids/pull/10813)|Fix the errors for Pandas UDF tests on DB13.3| 
+|[#10795](https://github.com/NVIDIA/spark-rapids/pull/10795)|Remove fixed seed for exact `percentile` integration tests| +|[#10805](https://github.com/NVIDIA/spark-rapids/pull/10805)|Drop Support for CentOS 7| +|[#10800](https://github.com/NVIDIA/spark-rapids/pull/10800)|Add number normalization test and address followup for getJsonObject| +|[#10796](https://github.com/NVIDIA/spark-rapids/pull/10796)|fixing build break on DBR| +|[#10791](https://github.com/NVIDIA/spark-rapids/pull/10791)|Fix auto merge conflict 10779 [skip ci]| +|[#10636](https://github.com/NVIDIA/spark-rapids/pull/10636)|Update actions version [skip ci]| +|[#10743](https://github.com/NVIDIA/spark-rapids/pull/10743)|initial PR for the framework reusing Vanilla Spark's unit tests| +|[#10767](https://github.com/NVIDIA/spark-rapids/pull/10767)|Add rows-only batches support to RebatchingRoundoffIterator| +|[#10763](https://github.com/NVIDIA/spark-rapids/pull/10763)|Add in the GpuArrayFilter command| +|[#10766](https://github.com/NVIDIA/spark-rapids/pull/10766)|Fix dead links related to tools documentation [skip ci]| +|[#10644](https://github.com/NVIDIA/spark-rapids/pull/10644)|Add logging to Integration test runs in local and local-cluster mode| +|[#10756](https://github.com/NVIDIA/spark-rapids/pull/10756)|Fix Authorization Failure While Reading Tables From Unity Catalog| +|[#10752](https://github.com/NVIDIA/spark-rapids/pull/10752)|Add SparkRapidsBuildInfoEvent to the event log| +|[#10754](https://github.com/NVIDIA/spark-rapids/pull/10754)|Substitute whoami for $USER| +|[#10755](https://github.com/NVIDIA/spark-rapids/pull/10755)|[DOC] Update README for prioritize-commits script [skip ci]| +|[#10728](https://github.com/NVIDIA/spark-rapids/pull/10728)|Let big data gen set nullability recursively| +|[#10740](https://github.com/NVIDIA/spark-rapids/pull/10740)|Use parse_url kernel for PATH parsing| +|[#10734](https://github.com/NVIDIA/spark-rapids/pull/10734)|Add short circuit path for get-json-object when there is separate wildcard path| +|[#10725](https://github.com/NVIDIA/spark-rapids/pull/10725)|Initial definition for Spark 4.0.0 shim| +|[#10635](https://github.com/NVIDIA/spark-rapids/pull/10635)|Use new getJsonObject kernel for json_tuple| +|[#10739](https://github.com/NVIDIA/spark-rapids/pull/10739)|Use fixed seed for some random failed tests| +|[#10720](https://github.com/NVIDIA/spark-rapids/pull/10720)|Add Shims for Spark 3.4.3| +|[#10716](https://github.com/NVIDIA/spark-rapids/pull/10716)|Remove the mixedType config for JSON as it has no downsides any longer| +|[#10733](https://github.com/NVIDIA/spark-rapids/pull/10733)|Fix "Could not find any rapids-4-spark jars in classpath" error when debugging UT in IDEA| +|[#10718](https://github.com/NVIDIA/spark-rapids/pull/10718)|Change parameters for memory limit in Parquet chunked reader| +|[#10292](https://github.com/NVIDIA/spark-rapids/pull/10292)|Upgrade to UCX 1.16.0| +|[#10709](https://github.com/NVIDIA/spark-rapids/pull/10709)|Removing some authorizations for departed users [skip ci]| +|[#10726](https://github.com/NVIDIA/spark-rapids/pull/10726)|Append new authorized user to blossom-ci whitelist [skip ci]| +|[#10708](https://github.com/NVIDIA/spark-rapids/pull/10708)|Updated dump tool to verify get_json_object| +|[#10706](https://github.com/NVIDIA/spark-rapids/pull/10706)|Fix auto merge conflict 10704 [skip ci]| +|[#10675](https://github.com/NVIDIA/spark-rapids/pull/10675)|Fix merge conflict with branch-24.04 [skip ci]| 
+|[#10678](https://github.com/NVIDIA/spark-rapids/pull/10678)|Append new authorized user to blossom-ci whitelist [skip ci]| +|[#10662](https://github.com/NVIDIA/spark-rapids/pull/10662)|Audit script - Check commits from shuffle and storage directories [skip ci]| +|[#10655](https://github.com/NVIDIA/spark-rapids/pull/10655)|Update rapids jni/private dependency to 24.06| +|[#10652](https://github.com/NVIDIA/spark-rapids/pull/10652)|Substitute murmurHash32 for spark32BitMurmurHash3| ## Release 24.04 @@ -85,8 +201,12 @@ Generated on 2024-05-20 ### PRs ||| |:---|:---| +|[#10844](https://github.com/NVIDIA/spark-rapids/pull/10844)|Update rapids private dependency to 24.04.3| +|[#10788](https://github.com/NVIDIA/spark-rapids/pull/10788)|[DOC] Update archive page for v24.04.1 [skip ci]| +|[#10784](https://github.com/NVIDIA/spark-rapids/pull/10784)|Update latest changelog [skip ci]| |[#10782](https://github.com/NVIDIA/spark-rapids/pull/10782)|Update latest changelog [skip ci]| |[#10780](https://github.com/NVIDIA/spark-rapids/pull/10780)|[DOC]Update download page for v24.04.1 [skip ci]| +|[#10778](https://github.com/NVIDIA/spark-rapids/pull/10778)|Update version to 24.04.1-SNAPSHOT| |[#10777](https://github.com/NVIDIA/spark-rapids/pull/10777)|Update rapids JNI dependency: private to 24.04.2| |[#10683](https://github.com/NVIDIA/spark-rapids/pull/10683)|Update latest changelog [skip ci]| |[#10681](https://github.com/NVIDIA/spark-rapids/pull/10681)|Update rapids JNI dependency to 24.04.0, private to 24.04.1| @@ -172,307 +292,7 @@ Generated on 2024-05-20 |[#10348](https://github.com/NVIDIA/spark-rapids/pull/10348)|Remove redundant joinOutputRows metric| |[#10321](https://github.com/NVIDIA/spark-rapids/pull/10321)|Bump up dependency version to 24.04.0-SNAPSHOT| |[#10330](https://github.com/NVIDIA/spark-rapids/pull/10330)|Add tryAcquire to GpuSemaphore| -|[#10331](https://github.com/NVIDIA/spark-rapids/pull/10331)|Revert "Update to libcudf unsigned sum aggregation types change (#10267)"| |[#10258](https://github.com/NVIDIA/spark-rapids/pull/10258)|Init project version 24.04.0-SNAPSHOT| -## Release 24.02 - -### Features -||| -|:---|:---| -|[#9926](https://github.com/NVIDIA/spark-rapids/issues/9926)|[FEA] Add config option for the parquet reader input read limit.| -|[#10270](https://github.com/NVIDIA/spark-rapids/issues/10270)|[FEA] Add support for single quotes when reading JSON| -|[#10253](https://github.com/NVIDIA/spark-rapids/issues/10253)|[FEA] Enable mixed types as string in GpuJsonToStruct| -|[#9692](https://github.com/NVIDIA/spark-rapids/issues/9692)|[FEA] Remove Pascal support| -|[#8806](https://github.com/NVIDIA/spark-rapids/issues/8806)|[FEA] Support lazy quantifier and specified group index in regexp_extract function| -|[#10079](https://github.com/NVIDIA/spark-rapids/issues/10079)|[FEA] Add string parameter support for `unix_timestamp` for non-UTC time zones| -|[#9667](https://github.com/NVIDIA/spark-rapids/issues/9667)|[FEA][JSON] Add support for non default `dateFormat` in `from_json`| -|[#9173](https://github.com/NVIDIA/spark-rapids/issues/9173)|[FEA] Support format_number | -|[#10145](https://github.com/NVIDIA/spark-rapids/issues/10145)|[FEA] Support to_utc_timestamp| -|[#9927](https://github.com/NVIDIA/spark-rapids/issues/9927)|[FEA] Support to_date with non-UTC timezones without DST| -|[#10006](https://github.com/NVIDIA/spark-rapids/issues/10006)|[FEA] Support ```ParseToTimestamp``` for non-UTC time zones| -|[#9096](https://github.com/NVIDIA/spark-rapids/issues/9096)|[FEA] Add Spark 3.3.4 
support| -|[#9585](https://github.com/NVIDIA/spark-rapids/issues/9585)|[FEA] support ascii function| -|[#9260](https://github.com/NVIDIA/spark-rapids/issues/9260)|[FEA] Create Spark 3.4.2 shim and build env| -|[#10076](https://github.com/NVIDIA/spark-rapids/issues/10076)|[FEA] Add performance test framework for non-UTC time zone features.| -|[#9881](https://github.com/NVIDIA/spark-rapids/issues/9881)|[TASK] Remove `spark.rapids.sql.nonUTC.enabled` configuration option| -|[#9801](https://github.com/NVIDIA/spark-rapids/issues/9801)|[FEA] Support DateFormat on GPU with a non-UTC timezone| -|[#6834](https://github.com/NVIDIA/spark-rapids/issues/6834)|[FEA] Support GpuHour expression for timezones other than UTC| -|[#6842](https://github.com/NVIDIA/spark-rapids/issues/6842)|[FEA] Support TimeZone aware operations for value extraction| -|[#1860](https://github.com/NVIDIA/spark-rapids/issues/1860)|[FEA] Optimize row based window operations for BOUNDED ranges| -|[#9606](https://github.com/NVIDIA/spark-rapids/issues/9606)|[FEA] Support unix_timestamp with CST(China Time Zone) support| -|[#9815](https://github.com/NVIDIA/spark-rapids/issues/9815)|[FEA] Support ```unix_timestamp``` for non-DST timezones| -|[#8807](https://github.com/NVIDIA/spark-rapids/issues/8807)|[FEA] support ‘yyyyMMdd’ format in from_unixtime function| -|[#9605](https://github.com/NVIDIA/spark-rapids/issues/9605)|[FEA] Support from_unixtime with CST(China Time Zone) support| -|[#6836](https://github.com/NVIDIA/spark-rapids/issues/6836)|[FEA] Support FromUnixTime for non UTC timezones| -|[#9175](https://github.com/NVIDIA/spark-rapids/issues/9175)|[FEA] Support Databricks 13.3| -|[#6881](https://github.com/NVIDIA/spark-rapids/issues/6881)|[FEA] Support RAPIDS Spark plugin on ARM| -|[#9274](https://github.com/NVIDIA/spark-rapids/issues/9274)|[FEA] Regular deploy process to include arm artifacts| -|[#9844](https://github.com/NVIDIA/spark-rapids/issues/9844)|[FEA] Let Gpu arrow python runners support writing one batch one time for the single threaded model.| -|[#7309](https://github.com/NVIDIA/spark-rapids/issues/7309)|[FEA] Detect multiple versions of the RAPIDS jar on the classpath at the same time| - -### Performance -||| -|:---|:---| -|[#9442](https://github.com/NVIDIA/spark-rapids/issues/9442)|[FEA] For hash joins where the build side can change use the smaller table for the build side| -|[#10142](https://github.com/NVIDIA/spark-rapids/issues/10142)|[TASK] Benchmark existing timestamp functions that work in non-UTC time zone (non-DST)| - -### Bugs Fixed -||| -|:---|:---| -|[#10548](https://github.com/NVIDIA/spark-rapids/issues/10548)|[BUG] test_dpp_bypass / test_dpp_via_aggregate_subquery failures in CI Databricks 13.3| -|[#10530](https://github.com/NVIDIA/spark-rapids/issues/10530)|test_delta_merge_match_delete_only java.lang.OutOfMemoryError: GC overhead limit exceeded| -|[#10464](https://github.com/NVIDIA/spark-rapids/issues/10464)|[BUG] spark334 and spark342 shims missed in scala2.13 dist jar| -|[#10473](https://github.com/NVIDIA/spark-rapids/issues/10473)|[BUG] Leak when running RANK query| -|[#10432](https://github.com/NVIDIA/spark-rapids/issues/10432)|Plug-in Build Failing for Databricks 11.3 | -|[#9974](https://github.com/NVIDIA/spark-rapids/issues/9974)|[BUG] host memory Leak in MultiFileCoalescingPartitionReaderBase in UTC time zone| -|[#10359](https://github.com/NVIDIA/spark-rapids/issues/10359)|[BUG] Build failure on Databricks nightly run with `GpuMapInPandasExecMeta`| 
-|[#10327](https://github.com/NVIDIA/spark-rapids/issues/10327)|[BUG] Unit test FAILED against : SPARK-24957: average with decimal followed by aggregation returning wrong result | -|[#10324](https://github.com/NVIDIA/spark-rapids/issues/10324)|[BUG] hash_aggregate_test.py test FAILED: Type conversion is not allowed from Table {...}| -|[#10291](https://github.com/NVIDIA/spark-rapids/issues/10291)|[BUG] SIGSEGV in libucp.so| -|[#9212](https://github.com/NVIDIA/spark-rapids/issues/9212)|[BUG] `from_json` fails with cuDF error `Invalid list size computation error`| -|[#10264](https://github.com/NVIDIA/spark-rapids/issues/10264)|[BUG] hash aggregate test failures due to type conversion errors| -|[#10262](https://github.com/NVIDIA/spark-rapids/issues/10262)|[BUG] Test "SPARK-24957: average with decimal followed by aggregation returning wrong result" failed.| -|[#9353](https://github.com/NVIDIA/spark-rapids/issues/9353)|[BUG] [JSON] A mix of lists and structs within the same column is not supported| -|[#10099](https://github.com/NVIDIA/spark-rapids/issues/10099)|[BUG] orc_test.py::test_orc_scan_with_aggregate_pushdown fails with a standalone cluster on spark 3.3.0| -|[#10047](https://github.com/NVIDIA/spark-rapids/issues/10047)|[BUG] CudfException during conditional hash join while running nds query64| -|[#9779](https://github.com/NVIDIA/spark-rapids/issues/9779)|[BUG] 330cdh failed test_hash_reduction_sum_full_decimal on CI| -|[#10197](https://github.com/NVIDIA/spark-rapids/issues/10197)|[BUG] Disable GetJsonObject by default and update docs| -|[#10165](https://github.com/NVIDIA/spark-rapids/issues/10165)|[BUG] Databricks 13.3 executor side broadcast failure| -|[#10224](https://github.com/NVIDIA/spark-rapids/issues/10224)|[BUG] DBR builds fails when installing Maven| -|[#10222](https://github.com/NVIDIA/spark-rapids/issues/10222)|[BUG] to_utc_timestamp and from_utc_timestamp fallback when TZ is supported time zone| -|[#10195](https://github.com/NVIDIA/spark-rapids/issues/10195)|[BUG] test_window_aggs_for_negative_rows_partitioned failure in CI| -|[#10182](https://github.com/NVIDIA/spark-rapids/issues/10182)|[BUG] test_dpp_bypass / test_dpp_via_aggregate_subquery failures in CI (databricks)| -|[#10169](https://github.com/NVIDIA/spark-rapids/issues/10169)|[BUG] Host column vector leaks when running `test_cast_timestamp_to_date`| -|[#10050](https://github.com/NVIDIA/spark-rapids/issues/10050)|[BUG] test_cast_decimal_to_decimal[to:DecimalType(1,-1)-from:Decimal(5,-3)] fails with DATAGEN_SEED=1702439569| -|[#10088](https://github.com/NVIDIA/spark-rapids/issues/10088)|[BUG] GpuExplode single row split to fit cuDF limits| -|[#10174](https://github.com/NVIDIA/spark-rapids/issues/10174)|[BUG] json_test.py::test_from_json_struct_timestamp failed on: Part of the plan is not columnar | -|[#10186](https://github.com/NVIDIA/spark-rapids/issues/10186)|[BUG] test_to_date_with_window_functions failed in non-UTC nightly CI| -|[#10154](https://github.com/NVIDIA/spark-rapids/issues/10154)|[BUG] 'spark-test.sh' integration tests FAILED on 'ps: command not found" in Rocky Docker environment| -|[#10175](https://github.com/NVIDIA/spark-rapids/issues/10175)|[BUG] string_test.py::test_format_number_float_special FAILED : AssertionError 'NaN' == | -|[#10166](https://github.com/NVIDIA/spark-rapids/issues/10166)|Detect Undeclared Shim in POM.xml| -|[#10170](https://github.com/NVIDIA/spark-rapids/issues/10170)|[BUG] `test_cast_timestamp_to_date` fails with `TZ=Asia/Hebron`| 
-|[#10149](https://github.com/NVIDIA/spark-rapids/issues/10149)|[BUG] GPU illegal access detected during delta_byte_array.parquet read| -|[#9905](https://github.com/NVIDIA/spark-rapids/issues/9905)|[BUG] GpuJsonScan incorrect behavior when parsing dates| -|[#10163](https://github.com/NVIDIA/spark-rapids/issues/10163)|Spark 3.3.4 Shim Build Failure| -|[#10105](https://github.com/NVIDIA/spark-rapids/issues/10105)|[BUG] scala:compile is not thread safe unless compiler bridge already exists | -|[#10026](https://github.com/NVIDIA/spark-rapids/issues/10026)|[BUG] test_hash_agg_with_nan_keys failed with a DATAGEN_SEED=1702335559| -|[#10075](https://github.com/NVIDIA/spark-rapids/issues/10075)|[BUG] `non-pinned blocking alloc with spill` unit test failed in HostAllocSuite| -|[#10134](https://github.com/NVIDIA/spark-rapids/issues/10134)|[BUG] test_window_aggs_for_batched_finite_row_windows_partitioned failed on Scala 2.13 with DATAGEN_SEED=1704033145| -|[#10118](https://github.com/NVIDIA/spark-rapids/issues/10118)|[BUG] non-UTC Nightly CI failed| -|[#10136](https://github.com/NVIDIA/spark-rapids/issues/10136)|[BUG] The canonicalized version of `GpuFileSourceScanExec`s that suppose to be semantic-equal can be different | -|[#10110](https://github.com/NVIDIA/spark-rapids/issues/10110)|[BUG] disable collect_list and collect_set for window operations by default.| -|[#10129](https://github.com/NVIDIA/spark-rapids/issues/10129)|[BUG] Unit test suite fails with `Null data pointer` in GpuTimeZoneDB| -|[#10089](https://github.com/NVIDIA/spark-rapids/issues/10089)|[BUG] DATAGEN_SEED= environment does not override the marker datagen_overrides| -|[#10108](https://github.com/NVIDIA/spark-rapids/issues/10108)|[BUG] @datagen_overrides seed is sticky when it shouldn't be| -|[#10064](https://github.com/NVIDIA/spark-rapids/issues/10064)|[BUG] test_unsupported_fallback_regexp_replace failed with DATAGEN_SEED=1702662063| -|[#10117](https://github.com/NVIDIA/spark-rapids/issues/10117)|[BUG] test_from_utc_timestamp failed on Cloudera Env when TZ is Iran| -|[#9914](https://github.com/NVIDIA/spark-rapids/issues/9914)|[BUG] Report GPU OOM on recent passed CI premerges.| -|[#10094](https://github.com/NVIDIA/spark-rapids/issues/10094)|[BUG] spark351 PR check failure MockTaskContext method isFailed in class TaskContext of type ()Boolean is not defined| -|[#10017](https://github.com/NVIDIA/spark-rapids/issues/10017)|[BUG] test_casting_from_double_to_timestamp failed for DATAGEN_SEED=1702329497| -|[#9992](https://github.com/NVIDIA/spark-rapids/issues/9992)|[BUG] conditionals_test.py::test_conditional_with_side_effects_cast[String] failed with DATAGEN_SEED=1701976979| -|[#9743](https://github.com/NVIDIA/spark-rapids/issues/9743)|[BUG][AUDIT] SPARK-45652 - SPJ: Handle empty input partitions after dynamic filtering| -|[#9859](https://github.com/NVIDIA/spark-rapids/issues/9859)|[AUDIT] [SPARK-45786] Inaccurate Decimal multiplication and division results| -|[#9555](https://github.com/NVIDIA/spark-rapids/issues/9555)|[BUG] Scala 2.13 build with JDK 11 or 17 fails OpcodeSuite tests| -|[#10073](https://github.com/NVIDIA/spark-rapids/issues/10073)|[BUG] test_csv_prefer_date_with_infer_schema failed with DATAGEN_SEED=1702847907| -|[#10004](https://github.com/NVIDIA/spark-rapids/issues/10004)|[BUG] If a host memory buffer is spilled, it cannot be unspilled| -|[#10063](https://github.com/NVIDIA/spark-rapids/issues/10063)|[BUG] CI build failure with 341db: method getKillReason has weaker access privileges; it should be public| 
-|[#10055](https://github.com/NVIDIA/spark-rapids/issues/10055)|[BUG] array_test.py::test_array_transform_non_deterministic failed with non-UTC time zone| -|[#10056](https://github.com/NVIDIA/spark-rapids/issues/10056)|[BUG] Unit tests ToPrettyStringSuite FAILED on spark-3.5.0| -|[#10048](https://github.com/NVIDIA/spark-rapids/issues/10048)|[BUG] Fix ```out of range``` error from ```pySpark``` in ```test_timestamp_millis``` and other two integration test cases| -|[#4204](https://github.com/NVIDIA/spark-rapids/issues/4204)|casting double to string does not match Spark| -|[#9938](https://github.com/NVIDIA/spark-rapids/issues/9938)|Better to do some refactor for the Python UDF code| -|[#10018](https://github.com/NVIDIA/spark-rapids/issues/10018)|[BUG] `GpuToUnixTimestampImproved` off by 1 on GPU when handling timestamp before epoch| -|[#10012](https://github.com/NVIDIA/spark-rapids/issues/10012)|[BUG] test_str_to_map_expr_random_delimiters with DATAGEN_SEED=1702166057 hangs| -|[#10029](https://github.com/NVIDIA/spark-rapids/issues/10029)|[BUG] doc links fail with 404 for shims.md| -|[#9472](https://github.com/NVIDIA/spark-rapids/issues/9472)|[BUG] Non-Deterministic expressions in an array_transform can cause errors| -|[#9884](https://github.com/NVIDIA/spark-rapids/issues/9884)|[BUG] delta_lake_delete_test.py failed assertion [DATAGEN_SEED=1701225104, IGNORE_ORDER...| -|[#9977](https://github.com/NVIDIA/spark-rapids/issues/9977)|[BUG] test_cast_date_integral fails on databricks 3.4.1| -|[#9936](https://github.com/NVIDIA/spark-rapids/issues/9936)|[BUG] Nightly CI of non-UTC time zone reports 'year 0 is out of range' error| -|[#9941](https://github.com/NVIDIA/spark-rapids/issues/9941)|[BUG] A potential data corruption in Pandas UDFs| -|[#9897](https://github.com/NVIDIA/spark-rapids/issues/9897)|[BUG] Error message for multiple jars on classpath is wrong| -|[#9916](https://github.com/NVIDIA/spark-rapids/issues/9916)|[BUG] ```test_cast_string_ts_valid_format``` failed at ```seed = 1701362564```| -|[#9559](https://github.com/NVIDIA/spark-rapids/issues/9559)|[BUG] precommit regularly fails with error trying to download a dependency| -|[#9708](https://github.com/NVIDIA/spark-rapids/issues/9708)|[BUG] test_cast_string_ts_valid_format fails with DATAGEN_SEED=1699978422| - -### PRs -||| -|:---|:---| -|[#10555](https://github.com/NVIDIA/spark-rapids/pull/10555)|Update change log [skip ci]| -|[#10551](https://github.com/NVIDIA/spark-rapids/pull/10551)|Try to make degenerative joins here impossible for these tests| -|[#10546](https://github.com/NVIDIA/spark-rapids/pull/10546)|Update changelog [skip ci]| -|[#10541](https://github.com/NVIDIA/spark-rapids/pull/10541)|Fix Delta log cache size settings during integration tests| -|[#10525](https://github.com/NVIDIA/spark-rapids/pull/10525)|Update changelog for v24.02.0 release [skip ci]| -|[#10465](https://github.com/NVIDIA/spark-rapids/pull/10465)|Add missed shims for scala2.13| -|[#10511](https://github.com/NVIDIA/spark-rapids/pull/10511)|Update rapids jni and private dependency version to 24.02.1| -|[#10513](https://github.com/NVIDIA/spark-rapids/pull/10513)|Fix scalar leak in SumBinaryFixer (#10510)| -|[#10475](https://github.com/NVIDIA/spark-rapids/pull/10475)|Fix scalar leak in RankFixer| -|[#10461](https://github.com/NVIDIA/spark-rapids/pull/10461)|Preserve tags on FileSourceScanExec| -|[#10459](https://github.com/NVIDIA/spark-rapids/pull/10459)|[DOC] Fix table rendering issue in github.io download UI page on branch-24.02 [skip ci] | 
-|[#10443](https://github.com/NVIDIA/spark-rapids/pull/10443)|Update change log for v24.02.0 release [skip ci]| -|[#10439](https://github.com/NVIDIA/spark-rapids/pull/10439)|Reverts NVIDIA/spark-rapids#10232 and fixes the plugin build on Databricks 11.3| -|[#10380](https://github.com/NVIDIA/spark-rapids/pull/10380)|Init changelog 24.02 [skip ci]| -|[#10367](https://github.com/NVIDIA/spark-rapids/pull/10367)|Update rapids JNI and private version to release 24.02.0| -|[#10414](https://github.com/NVIDIA/spark-rapids/pull/10414)|[DOC] Fix 24.02.0 documentation errors [skip ci]| -|[#10403](https://github.com/NVIDIA/spark-rapids/pull/10403)|Cherry-pick: Fix a memory leak in json tuple (#10360)| -|[#10387](https://github.com/NVIDIA/spark-rapids/pull/10387)|[DOC] Update docs for 24.02.0 release [skip ci]| -|[#10399](https://github.com/NVIDIA/spark-rapids/pull/10399)|Update NOTICE-binary| -|[#10389](https://github.com/NVIDIA/spark-rapids/pull/10389)|Change version and branch to 24.02 in docs [skip ci]| -|[#10384](https://github.com/NVIDIA/spark-rapids/pull/10384)|[DOC] Update docs for 23.12.2 release [skip ci] | -|[#10309](https://github.com/NVIDIA/spark-rapids/pull/10309)|[DOC] add custom 404 page and fix some document issue [skip ci]| -|[#10352](https://github.com/NVIDIA/spark-rapids/pull/10352)|xfail mixed type test| -|[#10355](https://github.com/NVIDIA/spark-rapids/pull/10355)|Revert "Support barrier mode for mapInPandas/mapInArrow (#10343)"| -|[#10353](https://github.com/NVIDIA/spark-rapids/pull/10353)|Use fixed seed for test_from_json_struct_decimal| -|[#10343](https://github.com/NVIDIA/spark-rapids/pull/10343)|Support barrier mode for mapInPandas/mapInArrow| -|[#10345](https://github.com/NVIDIA/spark-rapids/pull/10345)|Fix auto merge conflict 10339 [skip ci]| -|[#9991](https://github.com/NVIDIA/spark-rapids/pull/9991)|Start to use explicit memory limits in the parquet chunked reader| -|[#10328](https://github.com/NVIDIA/spark-rapids/pull/10328)|Fix typo in spark-tests.sh [skip ci]| -|[#10279](https://github.com/NVIDIA/spark-rapids/pull/10279)|Run '--packages' only with default cuda11 jar| -|[#10273](https://github.com/NVIDIA/spark-rapids/pull/10273)|Support reading JSON data with single quotes around attribute names and values| -|[#10306](https://github.com/NVIDIA/spark-rapids/pull/10306)|Fix performance regression in from_json| -|[#10272](https://github.com/NVIDIA/spark-rapids/pull/10272)|Add FullOuter support to GpuShuffledSymmetricHashJoinExec| -|[#10260](https://github.com/NVIDIA/spark-rapids/pull/10260)|Add perf test for time zone operators| -|[#10275](https://github.com/NVIDIA/spark-rapids/pull/10275)|Add tests for window Python udf with array input| -|[#10278](https://github.com/NVIDIA/spark-rapids/pull/10278)|Clean up $M2_CACHE to avoid side-effect of previous dependency:get [skip ci]| -|[#10268](https://github.com/NVIDIA/spark-rapids/pull/10268)|Add config to enable mixed types as string in GpuJsonToStruct & GpuJsonScan| -|[#10297](https://github.com/NVIDIA/spark-rapids/pull/10297)|Revert "UCX 1.16.0 upgrade (#10190)"| -|[#10289](https://github.com/NVIDIA/spark-rapids/pull/10289)|Add gerashegalov to CODEOWNERS [skip ci]| -|[#10290](https://github.com/NVIDIA/spark-rapids/pull/10290)|Fix merge conflict with 23.12 [skip ci]| -|[#10190](https://github.com/NVIDIA/spark-rapids/pull/10190)|UCX 1.16.0 upgrade| -|[#10211](https://github.com/NVIDIA/spark-rapids/pull/10211)|Use parse_url kernel for QUERY literal and column key| 
-|[#10267](https://github.com/NVIDIA/spark-rapids/pull/10267)|Update to libcudf unsigned sum aggregation types change| -|[#10208](https://github.com/NVIDIA/spark-rapids/pull/10208)|Added Support for Lazy Quantifier| -|[#9993](https://github.com/NVIDIA/spark-rapids/pull/9993)|Enable mixed types as string in GpuJsonScan| -|[#10246](https://github.com/NVIDIA/spark-rapids/pull/10246)|Refactor full join iterator to allow access to build tracker| -|[#10257](https://github.com/NVIDIA/spark-rapids/pull/10257)|Enable auto-merge from branch-24.02 to branch-24.04 [skip CI]| -|[#10178](https://github.com/NVIDIA/spark-rapids/pull/10178)|Mark hash reduction decimal overflow test as a permanent seed override| -|[#10244](https://github.com/NVIDIA/spark-rapids/pull/10244)|Use POSIX mode in assembly plugin to avoid issues with large UID/GID| -|[#10238](https://github.com/NVIDIA/spark-rapids/pull/10238)|Smoke test with '--package' to fetch the plugin jar| -|[#10201](https://github.com/NVIDIA/spark-rapids/pull/10201)|Deploy release candidates to local maven repo for dependency check[skip ci]| -|[#10240](https://github.com/NVIDIA/spark-rapids/pull/10240)|Improved inner joins with large build side| -|[#10220](https://github.com/NVIDIA/spark-rapids/pull/10220)|Disable GetJsonObject by default and add tests for as many issues with it as possible| -|[#10230](https://github.com/NVIDIA/spark-rapids/pull/10230)|Fix Databricks 13.3 BroadcastHashJoin using executor side broadcast fed by ColumnarToRow [Databricks]| -|[#10232](https://github.com/NVIDIA/spark-rapids/pull/10232)|Fixed 330db Shims to Adopt the PythonRunner Changes| -|[#10225](https://github.com/NVIDIA/spark-rapids/pull/10225)|Download Maven from apache.org archives [skip ci]| -|[#10210](https://github.com/NVIDIA/spark-rapids/pull/10210)|Add string parameter support for unix_timestamp for non-UTC time zones| -|[#10223](https://github.com/NVIDIA/spark-rapids/pull/10223)|Fix to_utc_timestamp and from_utc_timestamp fallback when TZ is supported time zone| -|[#10205](https://github.com/NVIDIA/spark-rapids/pull/10205)|Deterministic ordering in window tests| -|[#10204](https://github.com/NVIDIA/spark-rapids/pull/10204)|Further prevent degenerative joins in dpp_test| -|[#10156](https://github.com/NVIDIA/spark-rapids/pull/10156)|Update string to float compatibility doc[skip ci]| -|[#10193](https://github.com/NVIDIA/spark-rapids/pull/10193)|Fix explode with carry-along columns on GpuExplode single row retry handling| -|[#10191](https://github.com/NVIDIA/spark-rapids/pull/10191)|Updating the config documentation for filecache configs [skip ci]| -|[#10131](https://github.com/NVIDIA/spark-rapids/pull/10131)|With a single row GpuExplode tries to split the generator array| -|[#10179](https://github.com/NVIDIA/spark-rapids/pull/10179)|Fix build regression against Spark 3.2.x| -|[#10189](https://github.com/NVIDIA/spark-rapids/pull/10189)|test needs marks for non-UTC and for non_supported timezones| -|[#10176](https://github.com/NVIDIA/spark-rapids/pull/10176)|Fix format_number NaN symbol in high jdk version| -|[#10074](https://github.com/NVIDIA/spark-rapids/pull/10074)|Update the legacy mode check: only take effect when reading date/timestamp column| -|[#10167](https://github.com/NVIDIA/spark-rapids/pull/10167)|Defined Shims Should Be Declared In POM | -|[#10168](https://github.com/NVIDIA/spark-rapids/pull/10168)|Prevent a degenerative join in test_dpp_reuse_broadcast_exchange| -|[#10171](https://github.com/NVIDIA/spark-rapids/pull/10171)|Fix `test_cast_timestamp_to_date` 
when running in a DST time zone| -|[#9975](https://github.com/NVIDIA/spark-rapids/pull/9975)|Improve dateFormat support in GpuJsonScan and make tests consistent with GpuStructsToJson| -|[#9790](https://github.com/NVIDIA/spark-rapids/pull/9790)|Support float case of format_number with format_float kernel| -|[#10144](https://github.com/NVIDIA/spark-rapids/pull/10144)|Support to_utc_timestamp| -|[#10162](https://github.com/NVIDIA/spark-rapids/pull/10162)|Fix Spark 334 Build| -|[#10146](https://github.com/NVIDIA/spark-rapids/pull/10146)|Refactor the window code so it is not mostly kept in a few very large files| -|[#10155](https://github.com/NVIDIA/spark-rapids/pull/10155)|Install procps tools for rocky docker images [skip ci]| -|[#10153](https://github.com/NVIDIA/spark-rapids/pull/10153)|Disable multi-threaded Maven | -|[#10100](https://github.com/NVIDIA/spark-rapids/pull/10100)|Enable to_date (via gettimestamp and casting timestamp to date) for non-UTC time zones| -|[#10140](https://github.com/NVIDIA/spark-rapids/pull/10140)|Removed Unnecessary Whitespaces From Spark 3.3.4 Shim [skip ci]| -|[#10148](https://github.com/NVIDIA/spark-rapids/pull/10148)|fix test_hash_agg_with_nan_keys floating point sum failure| -|[#10150](https://github.com/NVIDIA/spark-rapids/pull/10150)|Increase timeouts in HostAllocSuite to avoid timeout failures on slow machines| -|[#10143](https://github.com/NVIDIA/spark-rapids/pull/10143)|Fix `test_window_aggs_for_batched_finite_row_windows_partitioned` fail| -|[#9887](https://github.com/NVIDIA/spark-rapids/pull/9887)|Reduce time-consuming of pre-merge| -|[#10130](https://github.com/NVIDIA/spark-rapids/pull/10130)|Change unit tests that force ooms to specify the oom type (gpu|cpu)| -|[#10138](https://github.com/NVIDIA/spark-rapids/pull/10138)|Update copyright dates in NOTICE files [skip ci]| -|[#10139](https://github.com/NVIDIA/spark-rapids/pull/10139)|Add Delta Lake 2.3.0 to list of versions to test for Spark 3.3.x| -|[#10135](https://github.com/NVIDIA/spark-rapids/pull/10135)|Fix CI: can't find script when there is pushd in script [skip ci]| -|[#10137](https://github.com/NVIDIA/spark-rapids/pull/10137)|Fix the canonicalizing for GPU file scan| -|[#10132](https://github.com/NVIDIA/spark-rapids/pull/10132)|Disable collect_list and collect_set for window by default| -|[#10084](https://github.com/NVIDIA/spark-rapids/pull/10084)|Refactor GpuJsonToStruct to reduce code duplication and manage resources more efficiently| -|[#10087](https://github.com/NVIDIA/spark-rapids/pull/10087)|Additional unit tests for GeneratedInternalRowToCudfRowIterator| -|[#10082](https://github.com/NVIDIA/spark-rapids/pull/10082)|Add Spark 3.3.4 Shim| -|[#10054](https://github.com/NVIDIA/spark-rapids/pull/10054)|Support Ascii function for ascii and latin-1| -|[#10127](https://github.com/NVIDIA/spark-rapids/pull/10127)|Fix merge conflict with branch-23.12| -|[#10097](https://github.com/NVIDIA/spark-rapids/pull/10097)|[DOC] Update docs for 23.12.1 release [skip ci]| -|[#10109](https://github.com/NVIDIA/spark-rapids/pull/10109)|Fixes a bug where datagen seed overrides were sticky and adds datagen_seed_override_disabled| -|[#10093](https://github.com/NVIDIA/spark-rapids/pull/10093)|Fix test_unsupported_fallback_regexp_replace| -|[#10119](https://github.com/NVIDIA/spark-rapids/pull/10119)|Fix from_utc_timestamp case failure on Cloudera when TZ is Iran| -|[#10106](https://github.com/NVIDIA/spark-rapids/pull/10106)|Add `isFailed()` to MockTaskContext and Remove MockTaskContextBase.scala| 
-|[#10112](https://github.com/NVIDIA/spark-rapids/pull/10112)|Remove datagen seed override for test_conditional_with_side_effects_cast| -|[#10104](https://github.com/NVIDIA/spark-rapids/pull/10104)|[DOC] Add in docs about memory debugging [skip ci]| -|[#9925](https://github.com/NVIDIA/spark-rapids/pull/9925)|Use threads, cache Scala compiler in GH mvn workflow| -|[#9967](https://github.com/NVIDIA/spark-rapids/pull/9967)|Added Spark-3.4.2 Shims| -|[#10061](https://github.com/NVIDIA/spark-rapids/pull/10061)|Use parse_url kernel for QUERY parsing| -|[#10101](https://github.com/NVIDIA/spark-rapids/pull/10101)|[DOC] Add column order error docs [skip ci]| -|[#10078](https://github.com/NVIDIA/spark-rapids/pull/10078)|Add perf test for non-UTC operators| -|[#10096](https://github.com/NVIDIA/spark-rapids/pull/10096)|Shim MockTaskContext to fix Spark 3.5.1 build| -|[#10092](https://github.com/NVIDIA/spark-rapids/pull/10092)|Implement Math.round using floor on GPU| -|[#10085](https://github.com/NVIDIA/spark-rapids/pull/10085)|Update tests that originally restricted the Spark timestamp range| -|[#10090](https://github.com/NVIDIA/spark-rapids/pull/10090)|Replace GPU-unsupported `\z` with an alternative RLIKE expression| -|[#10095](https://github.com/NVIDIA/spark-rapids/pull/10095)|Temporarily fix date format failed cases for non-UTC time zone.| -|[#9999](https://github.com/NVIDIA/spark-rapids/pull/9999)|Add some odd time zones for timezone transition tests| -|[#9962](https://github.com/NVIDIA/spark-rapids/pull/9962)|Add 3.5.1-SNAPSHOT Shim| -|[#10071](https://github.com/NVIDIA/spark-rapids/pull/10071)|Cleanup usage of non-utc configuration here| -|[#10057](https://github.com/NVIDIA/spark-rapids/pull/10057)|Add support for StringConcatFactory.makeConcatWithConstants (#9555)| -|[#9996](https://github.com/NVIDIA/spark-rapids/pull/9996)|Test full timestamp output range in PySpark| -|[#10081](https://github.com/NVIDIA/spark-rapids/pull/10081)|Add a fallback Cloudera Maven repo URL [skip ci]| -|[#10065](https://github.com/NVIDIA/spark-rapids/pull/10065)|Improve host memory spill interfaces| -|[#10069](https://github.com/NVIDIA/spark-rapids/pull/10069)|Revert "Support split broadcast join condition into ast and non-ast […| -|[#10070](https://github.com/NVIDIA/spark-rapids/pull/10070)|Fix 332db build failure| -|[#10060](https://github.com/NVIDIA/spark-rapids/pull/10060)|Fix failed cases for non-utc time zone| -|[#10038](https://github.com/NVIDIA/spark-rapids/pull/10038)|Remove spark.rapids.sql.nonUTC.enabled configuration option| -|[#10059](https://github.com/NVIDIA/spark-rapids/pull/10059)|Fixed Failing ToPrettyStringSuite Test for 3.5.0| -|[#10013](https://github.com/NVIDIA/spark-rapids/pull/10013)|Extended configuration of OOM injection mode| -|[#10052](https://github.com/NVIDIA/spark-rapids/pull/10052)|Set seed=0 for some integration test cases| -|[#10053](https://github.com/NVIDIA/spark-rapids/pull/10053)|Remove invalid user from CODEOWNER file [skip ci]| -|[#10049](https://github.com/NVIDIA/spark-rapids/pull/10049)|Fix out of range error from pySpark in test_timestamp_millis and other two integration test cases| -|[#9721](https://github.com/NVIDIA/spark-rapids/pull/9721)|Support date_format via Gpu for non-UTC time zone| -|[#9470](https://github.com/NVIDIA/spark-rapids/pull/9470)|Use float to string kernel| -|[#9845](https://github.com/NVIDIA/spark-rapids/pull/9845)|Use parse_url kernel for HOST parsing| -|[#10024](https://github.com/NVIDIA/spark-rapids/pull/10024)|Support hour minute second for non-UTC 
time zone| -|[#9973](https://github.com/NVIDIA/spark-rapids/pull/9973)|Batching support for row-based bounded window functions | -|[#10042](https://github.com/NVIDIA/spark-rapids/pull/10042)|Update tests to not have hard coded fallback when not needed| -|[#9816](https://github.com/NVIDIA/spark-rapids/pull/9816)|Support unix_timestamp and to_unix_timestamp with non-UTC timezones (non-DST)| -|[#9902](https://github.com/NVIDIA/spark-rapids/pull/9902)|Some refactor for the Python UDF code| -|[#10023](https://github.com/NVIDIA/spark-rapids/pull/10023)|GPU supports `yyyyMMdd` format by post process for the `from_unixtime` function| -|[#10033](https://github.com/NVIDIA/spark-rapids/pull/10033)|Remove GpuToTimestampImproved and spark.rapids.sql.improvedTimeOps.enabled| -|[#10016](https://github.com/NVIDIA/spark-rapids/pull/10016)|Fix infinite loop in test_str_to_map_expr_random_delimiters| -|[#9481](https://github.com/NVIDIA/spark-rapids/pull/9481)|Use parse_url kernel for PROTOCOL parsing| -|[#10030](https://github.com/NVIDIA/spark-rapids/pull/10030)|Update links in shims.md| -|[#10015](https://github.com/NVIDIA/spark-rapids/pull/10015)|Fix array_transform to not recompute the argument| -|[#10011](https://github.com/NVIDIA/spark-rapids/pull/10011)|Add cpu oom retry split handling to InternalRowToColumnarBatchIterator| -|[#10019](https://github.com/NVIDIA/spark-rapids/pull/10019)|Fix auto merge conflict 10010 [skip ci]| -|[#9760](https://github.com/NVIDIA/spark-rapids/pull/9760)|Support split broadcast join condition into ast and non-ast| -|[#9827](https://github.com/NVIDIA/spark-rapids/pull/9827)|Enable ORC timestamp and decimal predicate push down tests| -|[#10002](https://github.com/NVIDIA/spark-rapids/pull/10002)|Use Spark 3.3.3 instead of 3.3.2 for Scala 2.13 premerge builds| -|[#10000](https://github.com/NVIDIA/spark-rapids/pull/10000)|Optimize from_unixtime| -|[#10003](https://github.com/NVIDIA/spark-rapids/pull/10003)|Fix merge conflict with branch-23.12| -|[#9984](https://github.com/NVIDIA/spark-rapids/pull/9984)|Fix 340+(including DB341+) does not support casting date to integral/float| -|[#9972](https://github.com/NVIDIA/spark-rapids/pull/9972)|Fix year 0 is out of range in test_from_json_struct_timestamp | -|[#9814](https://github.com/NVIDIA/spark-rapids/pull/9814)|Support from_unixtime via Gpu for non-UTC time zone| -|[#9929](https://github.com/NVIDIA/spark-rapids/pull/9929)|Add host memory retries for GeneratedInternalRowToCudfRowIterator| -|[#9957](https://github.com/NVIDIA/spark-rapids/pull/9957)|Update cases for cast between integral and (date/time)| -|[#9959](https://github.com/NVIDIA/spark-rapids/pull/9959)|Append new authorized user to blossom-ci whitelist [skip ci]| -|[#9942](https://github.com/NVIDIA/spark-rapids/pull/9942)|Fix a potential data corruption for Pandas UDF| -|[#9922](https://github.com/NVIDIA/spark-rapids/pull/9922)|Fix `allowMultipleJars` recommend setting message| -|[#9947](https://github.com/NVIDIA/spark-rapids/pull/9947)|Fix merge conflict with branch-23.12| -|[#9908](https://github.com/NVIDIA/spark-rapids/pull/9908)|Register default allocator for host memory| -|[#9944](https://github.com/NVIDIA/spark-rapids/pull/9944)|Fix Java OOM caused by incorrect state of shouldCapture when exception occurred| -|[#9937](https://github.com/NVIDIA/spark-rapids/pull/9937)|Refactor to use CLASSIFIER instead of CUDA_CLASSIFIER [skip ci]| -|[#9904](https://github.com/NVIDIA/spark-rapids/pull/9904)|Params for build and test CI scripts on Databricks| 
-|[#9719](https://github.com/NVIDIA/spark-rapids/pull/9719)|Support fine grained timezone checker instead of type based| -|[#9918](https://github.com/NVIDIA/spark-rapids/pull/9918)|Prevent generation of 'year 0 is out of range' strings in IT| -|[#9852](https://github.com/NVIDIA/spark-rapids/pull/9852)|Avoid generating duplicate nan keys with MapGen(FloatGen)| -|[#9674](https://github.com/NVIDIA/spark-rapids/pull/9674)|Add cache action to speed up mvn workflow [skip ci]| -|[#9900](https://github.com/NVIDIA/spark-rapids/pull/9900)|Revert "Remove Databricks 13.3 from release 23.12 (#9890)"| -|[#9889](https://github.com/NVIDIA/spark-rapids/pull/9889)|Fix test_cast_string_ts_valid_format test| -|[#9888](https://github.com/NVIDIA/spark-rapids/pull/9888)|Update nightly build and deploy script for arm artifacts [skip ci]| -|[#9833](https://github.com/NVIDIA/spark-rapids/pull/9833)|Fix a hang for Pandas UDFs on DB 13.3| -|[#9656](https://github.com/NVIDIA/spark-rapids/pull/9656)|Update for new retry state machine JNI APIs| -|[#9654](https://github.com/NVIDIA/spark-rapids/pull/9654)|Detect multiple jars on the classpath when init plugin| -|[#9857](https://github.com/NVIDIA/spark-rapids/pull/9857)|Skip redundant steps in nightly build [skip ci]| -|[#9812](https://github.com/NVIDIA/spark-rapids/pull/9812)|Update JNI and private dep version to 24.02.0-SNAPSHOT| -|[#9716](https://github.com/NVIDIA/spark-rapids/pull/9716)|Initiate project version 24.02.0-SNAPSHOT| - ## Older Releases Changelog of older releases can be found at [docs/archives](/docs/archives) diff --git a/build/buildall b/build/buildall index e8c0610deb7..b3c473be141 100755 --- a/build/buildall +++ b/build/buildall @@ -265,7 +265,7 @@ function build_single_shim() { -Dmaven.scaladoc.skip \ -Dmaven.scalastyle.skip="$SKIP_CHECKS" \ -pl tools -am > "$LOG_FILE" 2>&1 || { - [[ "$LOG_FILE" != "/dev/tty" ]] && echo "$LOG_FILE:" && tail -20 "$LOG_FILE" || true + [[ "$LOG_FILE" != "/dev/tty" ]] && echo "$LOG_FILE:" && tail -500 "$LOG_FILE" || true exit 255 } } diff --git a/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala b/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala index 91335afe4e6..14e0d4e0970 100644 --- a/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala +++ b/datagen/src/main/scala/org/apache/spark/sql/tests/datagen/bigDataGen.scala @@ -16,21 +16,22 @@ package org.apache.spark.sql.tests.datagen +import com.fasterxml.jackson.core.{JsonFactoryBuilder, JsonParser, JsonToken} +import com.fasterxml.jackson.core.json.JsonReadFeature import java.math.{BigDecimal => JavaBigDecimal} import java.sql.{Date, Timestamp} import java.time.{Duration, Instant, LocalDate, LocalDateTime} import java.util - import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import scala.math.BigDecimal.RoundingMode import scala.util.Random -import org.apache.spark.sql.{Column, DataFrame, SparkSession} +import org.apache.spark.sql.{Column, DataFrame, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, XXH64} import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils} -import org.apache.spark.sql.functions.col +import org.apache.spark.sql.functions.{approx_count_distinct, avg, coalesce, col, count, lit, stddev, struct, transform, udf, when} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.random.XORShiftRandom 
@@ -79,22 +80,28 @@ class RowLocation(val rowNum: Long, val subRows: Array[Int] = null) { * hash. This makes the generated data correlated for all column/child columns. * @param tableNum a unique ID for the table this is a part of. * @param columnNum the location of the column in the data being generated + * @param substringNum the location of the substring column * @param correlatedKeyGroup the correlated key group this column is a part of, if any. */ -case class ColumnLocation(tableNum: Int, columnNum: Int, correlatedKeyGroup: Option[Long] = None) { - def forNextColumn(): ColumnLocation = ColumnLocation(tableNum, columnNum + 1) +case class ColumnLocation(tableNum: Int, + columnNum: Int, + substringNum: Int, + correlatedKeyGroup: Option[Long] = None) { + def forNextColumn(): ColumnLocation = ColumnLocation(tableNum, columnNum + 1, 0) + def forNextSubstring: ColumnLocation = ColumnLocation(tableNum, columnNum, substringNum + 1) /** * Create a new ColumnLocation that is specifically for a given key group */ def forCorrelatedKeyGroup(keyGroup: Long): ColumnLocation = - ColumnLocation(tableNum, columnNum, Some(keyGroup)) + ColumnLocation(tableNum, columnNum, substringNum, Some(keyGroup)) /** * Hash the location into a single long value. */ - lazy val hashLoc: Long = XXH64.hashLong(tableNum, correlatedKeyGroup.getOrElse(columnNum)) + lazy val hashLoc: Long = XXH64.hashLong(tableNum, + correlatedKeyGroup.getOrElse(XXH64.hashLong(columnNum, substringNum))) } /** @@ -115,6 +122,9 @@ case class ColumnConf(columnLoc: ColumnLocation, def forNextColumn(nullable: Boolean): ColumnConf = ColumnConf(columnLoc.forNextColumn(), nullable, numTableRows) + def forNextSubstring: ColumnConf = + ColumnConf(columnLoc.forNextSubstring, nullable = true, numTableRows) + /** * Create a new configuration based on this, but for a given correlated key group. */ @@ -303,6 +313,23 @@ case class VarLengthGeneratorFunction(minLength: Int, maxLength: Int) extends } } +case class StdDevLengthGen(mean: Double, + stdDev: Double, + mapping: LocationToSeedMapping = null) extends + LengthGeneratorFunction { + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): LengthGeneratorFunction = + StdDevLengthGen(mean, stdDev, mapping) + + override def apply(rowLoc: RowLocation): Int = { + val r = DataGen.getRandomFor(rowLoc, mapping) + val g = r.nextGaussian() // g has a mean of 0 and a stddev of 1.0 + val adjusted = mean + (g * stdDev) + // If the range of seed is too small compared to the stddev and mean we will + // end up with an invalid distribution, but they asked for it. + math.max(0, math.round(adjusted).toInt) + } +} + /** * Generate nulls with a given probability. * @param prob 0.0 to 1.0 for how often nulls should appear in the output. @@ -562,11 +589,8 @@ case class DataGenExpr(child: Expression, } } -/** - * Base class for generating a column/sub-column. This holds configuration for the column, - * and handles what is needed to convert it into GeneratorFunction - */ -abstract class DataGen(var conf: ColumnConf, +abstract class CommonDataGen( + var conf: ColumnConf, defaultValueRange: Option[(Any, Any)], var seedMapping: LocationToSeedMapping = FlatDistribution(), var nullMapping: LocationToSeedMapping = FlatDistribution(), @@ -576,26 +600,25 @@ abstract class DataGen(var conf: ColumnConf, protected var valueRange: Option[(Any, Any)] = defaultValueRange /** - * Set a value range for this data gen. 
+ * Set a value range */ - def setValueRange(min: Any, max: Any): DataGen = { + def setValueRange(min: Any, max: Any): CommonDataGen = { valueRange = Some((min, max)) this } /** - * Set a custom GeneratorFunction to use for this column. + * Set a custom GeneratorFunction */ - def setValueGen(f: GeneratorFunction): DataGen = { + def setValueGen(f: GeneratorFunction): CommonDataGen = { userProvidedValueGen = Some(f) this } /** - * Set a NullGeneratorFunction for this column. This will not be used - * if the column is not nullable. + * Set a NullGeneratorFunction */ - def setNullGen(f: NullGeneratorFunction): DataGen = { + def setNullGen(f: NullGeneratorFunction): CommonDataGen = { this.userProvidedNullGen = Some(f) this } @@ -604,12 +627,12 @@ abstract class DataGen(var conf: ColumnConf, * Set the probability of a null appearing in the output. The probability should be * 0.0 to 1.0. */ - def setNullProbability(probability: Double): DataGen = { + def setNullProbability(probability: Double): CommonDataGen = { this.userProvidedNullGen = Some(NullProbabilityGenerationFunction(probability)) this } - def setNullProbabilityRecursively(probability: Double): DataGen = { + def setNullProbabilityRecursively(probability: Double): CommonDataGen = { this.userProvidedNullGen = Some(NullProbabilityGenerationFunction(probability)) children.foreach { case (_, dataGen) => @@ -621,7 +644,7 @@ abstract class DataGen(var conf: ColumnConf, /** * Set a specific location to seed mapping for the value generation. */ - def setSeedMapping(seedMapping: LocationToSeedMapping): DataGen = { + def setSeedMapping(seedMapping: LocationToSeedMapping): CommonDataGen = { this.seedMapping = seedMapping this } @@ -629,7 +652,7 @@ abstract class DataGen(var conf: ColumnConf, /** * Set a specific location to seed mapping for the null generation. */ - def setNullMapping(nullMapping: LocationToSeedMapping): DataGen = { + def setNullMapping(nullMapping: LocationToSeedMapping): CommonDataGen = { this.nullMapping = nullMapping this } @@ -638,7 +661,7 @@ abstract class DataGen(var conf: ColumnConf, * Set a specific LengthGeneratorFunction to use. This will only be used if * the datatype needs a length. */ - def setLengthGen(lengthGen: LengthGeneratorFunction): DataGen = { + def setLengthGen(lengthGen: LengthGeneratorFunction): CommonDataGen = { this.lengthGen = lengthGen this } @@ -646,25 +669,30 @@ abstract class DataGen(var conf: ColumnConf, /** * Set the length generation to be a fixed length. */ - def setLength(len: Int): DataGen = { + def setLength(len: Int): CommonDataGen = { this.lengthGen = FixedLengthGeneratorFunction(len) this } - def setLength(minLen: Int, maxLen: Int) = { + def setLength(minLen: Int, maxLen: Int): CommonDataGen = { this.lengthGen = VarLengthGeneratorFunction(minLen, maxLen) this } + def setGaussianLength(mean: Double, stdDev: Double): CommonDataGen = { + this.lengthGen = StdDevLengthGen(mean, stdDev) + this + } + /** * Add this column to a specific correlated key group. This should not be * called directly by users. 
*/ def setCorrelatedKeyGroup(keyGroup: Long, - minSeed: Long, maxSeed: Long, - seedMapping: LocationToSeedMapping): DataGen = { + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): CommonDataGen = { conf = conf.forCorrelatedKeyGroup(keyGroup) - .forSeedRange(minSeed, maxSeed) + .forSeedRange(minSeed, maxSeed) this.seedMapping = seedMapping this } @@ -672,7 +700,7 @@ abstract class DataGen(var conf: ColumnConf, /** * Set a range of seed values that should be returned by the LocationToSeedMapping */ - def setSeedRange(min: Long, max: Long): DataGen = { + def setSeedRange(min: Long, max: Long): CommonDataGen = { conf = conf.forSeedRange(min, max) this } @@ -681,7 +709,7 @@ abstract class DataGen(var conf: ColumnConf, * Get the default value generator for this specific data gen. */ protected def getValGen: GeneratorFunction - def children: Seq[(String, DataGen)] + def children: Seq[(String, CommonDataGen)] /** * Get the final ready to use GeneratorFunction for the data generator. @@ -690,8 +718,8 @@ abstract class DataGen(var conf: ColumnConf, val sm = seedMapping.withColumnConf(conf) val lg = lengthGen.withLocationToSeedMapping(sm) var valGen = userProvidedValueGen.getOrElse(getValGen) - .withLocationToSeedMapping(sm) - .withLengthGeneratorFunction(lg) + .withLocationToSeedMapping(sm) + .withLengthGeneratorFunction(lg) valueRange.foreach { case (min, max) => valGen = valGen.withValueRange(min, max) @@ -700,35 +728,75 @@ abstract class DataGen(var conf: ColumnConf, val nullColConf = conf.forNulls val nm = nullMapping.withColumnConf(nullColConf) userProvidedNullGen.get - .withWrapped(valGen) - .withLocationToSeedMapping(nm) + .withWrapped(valGen) + .withLocationToSeedMapping(nm) } else { valGen } } - /** - * Get the data type for this column - */ - def dataType: DataType - /** * Is this column nullable or not. */ def nullable: Boolean = conf.nullable /** - * Get a child column for a given name, if it has one. + * Get a child for a given name, if it has one. */ - final def apply(name: String): DataGen = { + final def apply(name: String): CommonDataGen = { get(name).getOrElse{ throw new IllegalStateException(s"Could not find a child $name for $this") } } - def get(name: String): Option[DataGen] = None + def get(name: String): Option[CommonDataGen] = None +} + + +/** + * Base class for generating a column/sub-column. This holds configuration + * for the column, and handles what is needed to convert it into GeneratorFunction + */ +abstract class DataGen( + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)], + seedMapping: LocationToSeedMapping = FlatDistribution(), + nullMapping: LocationToSeedMapping = FlatDistribution(), + lengthGen: LengthGeneratorFunction = FixedLengthGeneratorFunction(10)) extends + CommonDataGen(conf, defaultValueRange, seedMapping, nullMapping, lengthGen) { + + /** + * Get the data type for this column + */ + def dataType: DataType + + override def get(name: String): Option[DataGen] = None + + def getSubstringGen: Option[SubstringDataGen] = None + + def substringGen: SubstringDataGen = + getSubstringGen.getOrElse( + throw new IllegalArgumentException("substring data gen was not set")) + + def setSubstringGen(f : ColumnConf => SubstringDataGen): Unit = + setSubstringGen(Option(f(conf.forNextSubstring))) + + def setSubstringGen(subgen: Option[SubstringDataGen]): Unit = + throw new IllegalArgumentException("substring data gens can only be set for a STRING") } +/** + * Base class for generating a sub-string. 
This holds configuration + * for the substring, and handles what is needed to convert it into a GeneratorFunction + */ +abstract class SubstringDataGen( + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)], + seedMapping: LocationToSeedMapping = FlatDistribution(), + nullMapping: LocationToSeedMapping = FlatDistribution(), + lengthGen: LengthGeneratorFunction = FixedLengthGeneratorFunction(10)) extends + CommonDataGen(conf, defaultValueRange, seedMapping, nullMapping, lengthGen) {} + /** * A special GeneratorFunction that just returns the computed seed. This is helpful for * debugging distributions or if you want long values without any abstraction in between. @@ -1494,155 +1562,866 @@ class FloatGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)]) override def children: Seq[(String, DataGen)] = Seq.empty } -trait JSONType { - def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit -} +case class JsonPathElement(name: String, is_array: Boolean) +case class JsonLevel(path: Array[JsonPathElement], data_type: String, length: Int, value: String) {} + +object JsonColumnStats { + private def printHelp(): Unit = { + println("JSON Fingerprinting Tool:") + println("PARAMS: ") + println(" is a path to a Spark dataframe to read in") + println(" is a path in a Spark file system to write out fingerprint data to.") + println() + println("OPTIONS:") + println(" --json= where is the name of a top level String column") + println(" --anon= where is a SEED used to anonymize the JSON keys ") + println(" and column names.") + println(" --input_format= where is parquet or ORC. Defaults to parquet.") + println(" --overwrite to enable overwriting the fingerprint output.") + println(" --debug to enable some debug information to be printed out") + println(" --help to print out this help message") + println() + } + + def main(args: Array[String]): Unit = { + var inputPath = Option.empty[String] + var outputPath = Option.empty[String] + val jsonColumns = ArrayBuffer.empty[String] + var anonSeed = Option.empty[Long] + var debug = false + var argsDone = false + var format = "parquet" + var overwrite = false + + args.foreach { + case a if !argsDone && a.startsWith("--json=") => + jsonColumns += a.substring("--json=".length) + case a if !argsDone && a.startsWith("--anon=") => + anonSeed = Some(a.substring("--anon=".length).toLong) + case a if !argsDone && a.startsWith("--input_format=") => + format = a.substring("--input_format=".length).toLowerCase(java.util.Locale.US) + case "--overwrite" if !argsDone => + overwrite = true + case "--debug" if !argsDone => + debug = true + case "--help" if !argsDone => + printHelp() + System.exit(0) + case "--" if !argsDone => + argsDone = true + case a if !argsDone && a.startsWith("--") => // "--" was covered above already + println(s"ERROR $a is not a supported argument") + printHelp() + System.exit(-1) + case a if inputPath.isEmpty => + inputPath = Some(a) + case a if outputPath.isEmpty => + outputPath = Some(a) + case a => + println(s"ERROR only two arguments are supported. 
Found $a") + printHelp() + System.exit(-1) + } + if (outputPath.isEmpty) { + println("ERROR both an inputPath and an outputPath are required") + printHelp() + System.exit(-1) + } + + val spark = SparkSession.builder.getOrCreate() + spark.sparkContext.setLogLevel("WARN") + + val df = spark.read.format(format).load(inputPath.get) + jsonColumns.foreach { column => + val fp = fingerPrint(df, df(column), anonSeed) + val name = anonSeed.map(s => anonymizeString(column, s)).getOrElse(column) + val fullOutPath = s"${outputPath.get}/$name" + var writer = fp.write + if (overwrite) { + writer = writer.mode("overwrite") + } + if (debug) { + anonSeed.foreach { s => + println(s"Keys and columns will be anonymized with seed $s") + } + println(s"Writing $column fingerprint to $fullOutPath") + spark.time(writer.parquet(fullOutPath)) + println(s"Wrote ${spark.read.parquet(fullOutPath).count} rows") + spark.read.parquet(fullOutPath).show() + } else { + writer.parquet(fullOutPath) + } + } + } -object JSONType { - def selectType(depth: Int, - maxDepth: Int, - r: Random): JSONType = { - val toSelectFrom = if (depth < maxDepth) { - Seq(QuotedJSONString, JSONLong, JSONDouble, JSONArray, JSONObject) - } else { - Seq(QuotedJSONString, JSONLong, JSONDouble) - } - val index = r.nextInt(toSelectFrom.length) - toSelectFrom(index) - } -} - -object QuotedJSONString extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - val strValue = r.nextString(r.nextInt(maxStringLength + 1)) - .replace("\\", "\\\\") - .replace("\"", "\\\"") - .replace("\n", "\\n") - .replace("\r", "\\r") - .replace("\b", "\\b") - .replace("\f", "\\f") - sb.append('"') - sb.append(strValue) - sb.append('"') - } -} - -object JSONLong extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - sb.append(r.nextLong()) - } -} - -object JSONDouble extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - sb.append(r.nextDouble() * 4096.0) - } -} - -object JSONArray extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - val childType = JSONType.selectType(depth, maxDepth, r) - val length = r.nextInt(maxArrayLength + 1) - sb.append("[") + case class JsonNodeStats(count: Long, meanLen: Double, stdDevLength: Double, dc: Long) + + class JsonNode() { + private val forDataType = + mutable.HashMap[String, (JsonNodeStats, mutable.HashMap[String, JsonNode])]() + + def getChild(name: String, isArray: Boolean): JsonNode = { + val dt = if (isArray) { "ARRAY" } else { "OBJECT" } + val typed = forDataType.getOrElse(dt, + throw new IllegalArgumentException(s"$dt is not a set data type yet.")) + typed._2.getOrElse(name, + throw new IllegalArgumentException(s"$name is not a child when the type is $dt")) + } + + def contains(name: String, isArray: Boolean): Boolean = { + val dt = if (isArray) { "ARRAY" } else { "OBJECT" } + forDataType.get(dt).exists { children => + children._2.contains(name) + } + } + + def addChild(name: String, isArray: Boolean): JsonNode = { + val 
dt = if (isArray) { "ARRAY" } else { "OBJECT" } + val found = forDataType.getOrElse(dt, + throw new IllegalArgumentException(s"$dt was not already added as a data type")) + if (found._2.contains(name)) { + throw new IllegalArgumentException(s"$dt already has a child named $name") + } + val node = new JsonNode() + found._2.put(name, node) + node + } + + def addChoice(dt: String, stats: JsonNodeStats): Unit = { + if (forDataType.contains(dt)) { + throw new IllegalArgumentException(s"$dt was already added as a data type") + } + forDataType.put(dt, (stats, new mutable.HashMap[String, JsonNode]())) + } + + override def toString: String = { + forDataType.toString() + } + + def totalCount: Long = { + forDataType.values.map{ case (stats, _) => stats.count}.sum + } + + private def makeNoChoiceGenRecursive(dt: String, + children: mutable.HashMap[String, JsonNode], + cc: ColumnConf): (SubstringDataGen, ColumnConf) = { + var c = cc + val ret = dt match { + case "LONG" => new JSONLongGen(c) + case "DOUBLE" => new JSONDoubleGen(c) + case "BOOLEAN" => new JSONBoolGen(c) + case "NULL" => new JSONNullGen(false, c) + case "VALUE_NULL" => new JSONNullGen(true, c) + case "ERROR" => new JSONErrorGen(c) + case "STRING" => new JSONStringGen(c) + case "ARRAY" => + val child = if (children.isEmpty) { + // A corner case, we will just make it a BOOL column and it will be ignored + val tmp = new JSONBoolGen(c) + c = c.forNextSubstring + tmp + } else { + val tmp = children.values.head.makeGenRecursive(c) + c = tmp._2 + tmp._1 + } + new JSONArrayGen(child, c) + case "OBJECT" => + val childGens = if (children.isEmpty) { + Seq.empty + } else { + children.toSeq.map { + case (k, node) => + val tmp = node.makeGenRecursive(c) + c = tmp._2 + (k, tmp._1) + } + } + new JSONObjectGen(childGens, c) + case other => + throw new IllegalArgumentException(s"$other is not a leaf node type") + } + (ret, c.forNextSubstring) + } + + private def makeGenRecursive(cc: ColumnConf): (SubstringDataGen, ColumnConf) = { + var c = cc + // We are going to recursively walk the tree for all of the values. + if (forDataType.size == 1) { + // We don't need a choice at all. This makes it simpler.. + val (dt, (_, children)) = forDataType.head + makeNoChoiceGenRecursive(dt, children, c) + } else { + val totalSum = forDataType.map(f => f._2._1.count).sum.toDouble + var runningSum = 0L + val allChoices = ArrayBuffer[(Double, String, SubstringDataGen)]() + forDataType.foreach { + case (dt, (stats, children)) => + val tmp = makeNoChoiceGenRecursive(dt, children, c) + c = tmp._2 + runningSum += stats.count + allChoices.append((runningSum/totalSum, dt, tmp._1)) + } + + val ret = new JSONChoiceGen(allChoices.toSeq, c) + (ret, c.forNextSubstring) + } + } + + def makeGen(cc: ColumnConf): SubstringDataGen = { + val (ret, _) = makeGenRecursive(cc) + ret + } + + def setStatsSingle(dg: CommonDataGen, + dt: String, + stats: JsonNodeStats, + nullPct: Double): Unit = { + + val includeLength = dt != "OBJECT" && dt != "BOOLEAN" && dt != "NULL" && dt != "VALUE_NULL" + val includeNullPct = nullPct > 0.0 + if (includeLength) { + dg.setGaussianLength(stats.meanLen, stats.stdDevLength) + } + if (includeNullPct) { + dg.setNullProbability(nullPct) + } + dg.setSeedRange(1, stats.dc) + } + + def setStats(dg: CommonDataGen, + parentCount: Option[Long]): Unit = { + // We are going to recursively walk the tree... + if (forDataType.size == 1) { + // We don't need a choice at all. This makes it simpler.. 
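When a JSON path was observed with more than one data type, `makeGenRecursive` above turns the per-type counts into cumulative probabilities (`runningSum / totalSum`), and `JSONChoiceGenFunc` further down selects a child generator by scanning that list with a uniform draw. A self-contained sketch of the same selection rule, using made-up counts:

```scala
import scala.util.Random

// Made-up per-type counts observed for a single JSON path.
val counts = Seq("LONG" -> 700L, "STRING" -> 250L, "VALUE_NULL" -> 50L)
val total = counts.map(_._2).sum.toDouble

// Build cumulative weights, as makeGenRecursive does for allChoices.
var running = 0L
val cumulative = counts.map { case (dt, c) =>
  running += c
  (running / total, dt)
}

// Take the first entry whose cumulative weight covers the draw,
// mirroring the index walk in JSONChoiceGenFunc.
def pick(r: Random): String = {
  val d = r.nextDouble()
  cumulative.collectFirst { case (w, dt) if w >= d => dt }.getOrElse(counts.last._1)
}

val r = new Random(0L)
println(Seq.fill(20)(pick(r)).mkString(",")) // mostly LONG, occasionally STRING or VALUE_NULL
```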
+ val (dt, (stats, children)) = forDataType.head + val nullPct = parentCount.map { pc => + (pc - stats.count).toDouble/pc + }.getOrElse(0.0) + setStatsSingle(dg, dt, stats, nullPct) + val myCount = if (dt == "OBJECT") { + Some(totalCount) + } else { + None + } + children.foreach { + case (name, node) => + node.setStats(dg(name), myCount) + } + } else { + // We have choices to make between different types. + // The null percent cannot be calculated for each individual choice + // but is calculated on the group as a whole instead + parentCount.foreach { pc => + val tc = totalCount + val choiceNullPct = (pc - tc).toDouble / pc + if (choiceNullPct > 0.0) { + dg.setNullProbability(choiceNullPct) + } + } + forDataType.foreach { + case (dt, (stats, children)) => + // When there is a choice the name to access it is the data type + val choiceDg = dg(dt) + setStatsSingle(choiceDg, dt, stats, 0.0) + children.foreach { + case (name, node) => + val myCount = if (dt == "OBJECT") { + // Here we only want the count for the OBJECTs + Some(stats.count) + } else { + None + } + node.setStats(choiceDg(name), myCount) + } + } + } + } + } + + private lazy val jsonFactory = new JsonFactoryBuilder() + // The two options below enabled for Hive compatibility + .enable(JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS) + .enable(JsonReadFeature.ALLOW_SINGLE_QUOTES) + .build() + + private def processNext(parser: JsonParser, + currentPath: ArrayBuffer[JsonPathElement], + output: ArrayBuffer[JsonLevel]): Unit = { + parser.currentToken() match { + case JsonToken.START_OBJECT => + parser.nextToken() + while (parser.currentToken() != JsonToken.END_OBJECT) { + processNext(parser, currentPath, output) + } + output.append(JsonLevel(currentPath.toArray, "OBJECT", 0, "")) + parser.nextToken() + case JsonToken.START_ARRAY => + currentPath.append(JsonPathElement("data", is_array = true)) + parser.nextToken() + var length = 0 + while (parser.currentToken() != JsonToken.END_ARRAY) { + length += 1 + processNext(parser, currentPath, output) + } + currentPath.remove(currentPath.length - 1) + output.append(JsonLevel(currentPath.toArray, "ARRAY", length, "")) + parser.nextToken() + case JsonToken.FIELD_NAME => + currentPath.append(JsonPathElement(parser.getCurrentName, is_array = false)) + parser.nextToken() + processNext(parser, currentPath, output) + currentPath.remove(currentPath.length - 1) + case JsonToken.VALUE_NUMBER_INT => + val length = parser.getValueAsString.getBytes("UTF-8").length + output.append(JsonLevel(currentPath.toArray, "LONG", length, parser.getValueAsString)) + parser.nextToken() + case JsonToken.VALUE_NUMBER_FLOAT => + val length = parser.getValueAsString.getBytes("UTF-8").length + output.append(JsonLevel(currentPath.toArray, "DOUBLE", length, parser.getValueAsString)) + parser.nextToken() + case JsonToken.VALUE_TRUE | JsonToken.VALUE_FALSE => + val length = parser.getValueAsString.getBytes("UTF-8").length + output.append(JsonLevel(currentPath.toArray, "BOOLEAN", length, parser.getValueAsString)) + parser.nextToken() + case JsonToken.VALUE_NULL | null => + output.append(JsonLevel(currentPath.toArray, "VALUE_NULL", 4, "NULL")) + parser.nextToken() + case JsonToken.VALUE_STRING => + val length = parser.getValueAsString.getBytes("UTF-8").length + output.append(JsonLevel(currentPath.toArray, "STRING", length, parser.getValueAsString)) + parser.nextToken() + case other => + throw new IllegalStateException(s"DON'T KNOW HOW TO DEAL WITH $other") + } + } + + def jsonStatsUdf(json: String): Array[JsonLevel] = { + val output = 
new ArrayBuffer[JsonLevel]() + try { + val currentPath = new ArrayBuffer[JsonPathElement]() + if (json == null) { + output.append(JsonLevel(Array.empty, "NULL", 0, "")) + } else { + val parser = jsonFactory.createParser(json) + try { + parser.nextToken() + processNext(parser, currentPath, output) + } finally { + parser.close() + } + } + } catch { + case _: com.fasterxml.jackson.core.JsonParseException => + output.clear() + output.append(JsonLevel(Array.empty, "ERROR", json.getBytes("UTF-8").length, json)) + } + output.toArray + } + + private lazy val extractPaths = udf(json => jsonStatsUdf(json)) + + def anonymizeString(str: String, seed: Long): String = { + val length = str.length + val data = new Array[Byte](length) + val hash = XXH64.hashLong(str.hashCode, seed) + val r = new Random() + r.setSeed(hash) (0 until length).foreach { i => - if (i > 0) { - sb.append(",") + val tmp = r.nextInt(16) + data(i) = (tmp + 'A').toByte + } + new String(data) + } + + private lazy val anonPath = udf((str, seed) => anonymizeString(str, seed)) + + def anonymizeFingerPrint(df: DataFrame, anonSeed: Long): DataFrame = { + df.withColumn("tmp", transform(col("path"), + o => { + val name = o("name") + val isArray = o("is_array") + val anon = anonPath(name, lit(anonSeed)) + val newName = when(isArray, name).otherwise(anon).alias("name") + struct(newName, isArray) + })) + .drop("path").withColumnRenamed("tmp", "path") + .orderBy("path", "dt") + .selectExpr("path", "dt","c","mean_len","stddev_len","distinct","version") + } + + def fingerPrint(df: DataFrame, column: Column, anonymize: Option[Long] = None): DataFrame = { + val ret = df.select(extractPaths(column).alias("paths")) + .selectExpr("explode_outer(paths) as p") + .selectExpr("p.path as path", "p.data_type as dt", "p.length as len", "p.value as value") + .groupBy(col("path"), col("dt")).agg( + count(lit(1)).alias("c"), + avg(col("len")).alias("mean_len"), + coalesce(stddev(col("len")), lit(0.0)).alias("stddev_len"), + approx_count_distinct(col("value")).alias("distinct")) + .orderBy("path", "dt").withColumn("version", lit("0.1")) + .selectExpr("path", "dt","c","mean_len","stddev_len","distinct","version") + + anonymize.map { anonSeed => + anonymizeFingerPrint(ret, anonSeed) + }.getOrElse(ret) + } + + def apply(aggForColumn: DataFrame, genColumn: ColumnGen): Unit = + apply(aggForColumn, genColumn.dataGen) + + private val expectedSchema = StructType.fromDDL( + "path ARRAY>," + + "dt STRING," + + "c BIGINT," + + "mean_len DOUBLE," + + "stddev_len DOUBLE," + + "distinct BIGINT," + + "version STRING") + + def apply(aggForColumn: DataFrame, gen: DataGen): Unit = { + val aggData = aggForColumn.orderBy("path", "dt").collect() + val rootNode: JsonNode = new JsonNode() + assert(aggData.length > 0) + val schema = aggData.head.schema + assert(schema.length == expectedSchema.length) + schema.fields.zip(expectedSchema.fields).foreach { + case(found, expected) => + assert(found.name == expected.name) + // TODO we can worry about the exact types later if we need to + } + assert(aggData.head.getString(6) == "0.1") + aggData.foreach { row => + val fullPath = row.getAs[mutable.WrappedArray[Row]](0) + val parsedPath = fullPath.map(r => (r.getString(0), r.getBoolean(1))).toList + val dt = row.getString(1) + val count = row.getLong(2) + val meanLen = row.getDouble(3) + val stdLen = row.getDouble(4) + val dc = row.getLong(5) + + val stats = JsonNodeStats(count, meanLen, stdLen, dc) + var currentNode = rootNode + // Find everything up to the last path element + if 
(parsedPath.length > 1) { + parsedPath.slice(0, parsedPath.length - 1).foreach { + case (name, isArray) => + currentNode = currentNode.getChild(name, isArray) + } + } + + if (parsedPath.nonEmpty) { + // For the last path element (that is not the root element) we might need to add it + // as a child + val (name, isArray) = parsedPath.last + if (!currentNode.contains(name, isArray)) { + currentNode.addChild(name, isArray) + } + currentNode = currentNode.getChild(name, isArray) } - childType.appendRandomValue(sb, i, maxStringLength, maxArrayLength, maxObjectLength, - depth + 1, maxDepth, r) + currentNode.addChoice(dt, stats) } - sb.append("]") + + gen.setSubstringGen(cc => rootNode.makeGen(cc)) + rootNode.setStats(gen.substringGen, None) } } -object JSONObject extends JSONType { - override def appendRandomValue(sb: StringBuilder, - index: Int, - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - depth: Int, - maxDepth: Int, - r: Random): Unit = { - val length = r.nextInt(maxObjectLength) + 1 - sb.append("{") - (0 until length).foreach { i => - if (i > 0) { - sb.append(",") + +case class JSONStringGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val len = lengthGen(rowLoc) + val r = DataGen.getRandomFor(rowLoc, mapping) + val buffer = new Array[Byte](len) + var at = 0 + while (at < len) { + // Value range is 32 (Space) to 126 (~) + buffer(at) = (r.nextInt(126 - 31) + 32).toByte + at += 1 + } + val strVal = new String(buffer, 0, len) + .replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\b", "\\b") + .replace("\f", "\\f") + '"' + strVal + '"' + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONStringGenFunc = + JSONStringGenFunc(lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONStringGenFunc = + JSONStringGenFunc(lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONStringGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONStringGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONLongGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val len = math.max(lengthGen(rowLoc), 1) // We need at least 1 long for a valid value + val r = DataGen.getRandomFor(rowLoc, mapping) + val buffer = new Array[Byte](len) + var at = 0 + while (at < len) { + if (at == 0) { + // No leading 0's + buffer(at) = (r.nextInt(9) + '1').toByte + } else { + buffer(at) = (r.nextInt(10) + '0').toByte } - sb.append("\"key_") - sb.append(i) - sb.append("_") - sb.append(depth ) - sb.append("\":") - val childType = JSONType.selectType(depth, maxDepth, r) - childType.appendRandomValue(sb, i, maxStringLength, maxArrayLength, maxObjectLength, - depth + 1, maxDepth, r) + at += 1 } - sb.append("}") + new String(buffer, 0, len) } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONLongGenFunc = + JSONLongGenFunc(lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: 
LocationToSeedMapping): JSONLongGenFunc = + JSONLongGenFunc(lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") } -case class JSONGenFunc( - maxStringLength: Int, - maxArrayLength: Int, - maxObjectLength: Int, - maxDepth: Int, - lengthGen: LengthGeneratorFunction = null, - mapping: LocationToSeedMapping = null) extends GeneratorFunction { +class JSONLongGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONLongGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONDoubleGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { override def apply(rowLoc: RowLocation): Any = { + val len = math.max(lengthGen(rowLoc), 3) // We have to have at least 3 chars NUM.NUM val r = DataGen.getRandomFor(rowLoc, mapping) - val sb = new StringBuilder() - JSONObject.appendRandomValue(sb, 0, maxStringLength, maxArrayLength, maxObjectLength, - 0, maxDepth, r) - // For now I am going to have some hard coded keys - UTF8String.fromString(sb.toString()) + val beforeLen = if (len == 3) { 1 } else { r.nextInt(len - 3) + 1 } + val buffer = new Array[Byte](len) + var at = 0 + while (at < len) { + if (at == 0) { + // No leading 0's + buffer(at) = (r.nextInt(9) + '1').toByte + } else if (at == beforeLen) { + buffer(at) = '.' + } else { + buffer(at) = (r.nextInt(10) + '0').toByte + } + at += 1 + } + UTF8String.fromBytes(buffer, 0, len) } - override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): GeneratorFunction = - JSONGenFunc(maxStringLength, maxArrayLength, maxObjectLength, maxDepth, lengthGen, mapping) + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONDoubleGenFunc = + JSONDoubleGenFunc(lengthGen, mapping) - override def withLocationToSeedMapping(mapping: LocationToSeedMapping): GeneratorFunction = - JSONGenFunc(maxStringLength, maxArrayLength, maxObjectLength, maxDepth, lengthGen, mapping) + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONDoubleGenFunc = + JSONDoubleGenFunc(lengthGen, mapping) override def withValueRange(min: Any, max: Any): GeneratorFunction = - throw new IllegalArgumentException("value ranges are not supported for strings") + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONDoubleGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONDoubleGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONBoolGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val r = DataGen.getRandomFor(rowLoc, mapping) + val ret = if (r.nextBoolean()) "true" else "false" + UTF8String.fromString(ret) + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONBoolGenFunc = + JSONBoolGenFunc(lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONBoolGenFunc = + JSONBoolGenFunc(lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + 
throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONBoolGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONBoolGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONNullGenFunc(nullAsString: Boolean, + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = + if (nullAsString) { + UTF8String.fromString("null") + } else { + null + } + + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONNullGenFunc = + JSONNullGenFunc(nullAsString, lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONNullGenFunc = + JSONNullGenFunc(nullAsString, lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONNullGen(nullAsString: Boolean, + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONNullGenFunc(nullAsString) + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONErrorGenFunc(lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val len = lengthGen(rowLoc) + val r = DataGen.getRandomFor(rowLoc, mapping) + val buffer = new Array[Byte](len) + var at = 0 + while (at < len) { + // Value range is 32 (Space) to 126 (~) + // But it is almost impossible to show up as valid JSON + buffer(at) = (r.nextInt(126 - 31) + 32).toByte + at += 1 + } + UTF8String.fromBytes(buffer, 0, len) + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONErrorGenFunc = + JSONErrorGenFunc(lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONErrorGenFunc = + JSONErrorGenFunc(lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONErrorGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override protected def getValGen: GeneratorFunction = JSONErrorGenFunc() + + override def children: Seq[(String, SubstringDataGen)] = Seq.empty +} + +case class JSONArrayGenFunc(child: GeneratorFunction, + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + val len = lengthGen(rowLoc) + val data = new Array[String](len) + val childRowLoc = rowLoc.withNewChild() + var i = 0 + while (i < len) { + childRowLoc.setLastChildIndex(i) + val v = child(childRowLoc) + if (v == null) { + // A null in an array must look like "null" + data(i) = "null" + } else { + data(i) = v.toString + } + i += 1 + } + val ret = data.mkString("[", ",", "]") + UTF8String.fromString(ret) + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONArrayGenFunc = + JSONArrayGenFunc(child, lengthGen, mapping) + + override def 
withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONArrayGenFunc = + JSONArrayGenFunc(child, lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONArrayGen(child: SubstringDataGen, + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override def setCorrelatedKeyGroup(keyGroup: Long, + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): SubstringDataGen = { + super.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + child.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + this + } + + override protected def getValGen: GeneratorFunction = JSONArrayGenFunc(child.getGen) + + override def get(name: String): Option[SubstringDataGen] = { + if ("data".equalsIgnoreCase(name) || "child".equalsIgnoreCase(name)) { + Some(child) + } else { + None + } + } + + override def children: Seq[(String, SubstringDataGen)] = Seq(("data", child)) +} + +case class JSONObjectGenFunc(childGens: Array[(String, GeneratorFunction)], + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + override def apply(rowLoc: RowLocation): Any = { + // TODO randomize the order of the children??? + // TODO duplicate child values??? + // The row location does not change for a struct/object + val data = childGens.map { + case (k, gen) => + val key = k.replace("\\", "\\\\") + .replace("\"", "\\\"") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\b", "\\b") + .replace("\f", "\\f") + val v = gen.apply(rowLoc) + if (v == null) { + "" + } else { + '"' + key + "\":" + v + } + } + val ret = data.filterNot(_.isEmpty).mkString("{",",","}") + UTF8String.fromString(ret) + } + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONObjectGenFunc = + JSONObjectGenFunc(childGens, lengthGen, mapping) + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONObjectGenFunc = + JSONObjectGenFunc(childGens, lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONObjectGen(val children: Seq[(String, SubstringDataGen)], + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override def setCorrelatedKeyGroup(keyGroup: Long, + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): SubstringDataGen = { + super.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + children.foreach { + case (_, gen) => + gen.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + } + this + } + + override def get(name: String): Option[SubstringDataGen] = + children.collectFirst { + case (childName, dataGen) if childName.equalsIgnoreCase(name) => dataGen + } + + override protected def getValGen: GeneratorFunction = { + val childGens = children.map(c => (c._1, c._2.getGen)).toArray + JSONObjectGenFunc(childGens) + } +} + +case class JSONChoiceGenFunc(choices: List[(Double, GeneratorFunction)], + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + override def apply(rowLoc: RowLocation): Any = { + val r = DataGen.getRandomFor(rowLoc, mapping) + val l = r.nextDouble() + var index = 0 + while 
(choices(index)._1 < l) { + index += 1 + } + val childRowLoc = rowLoc.withNewChild() + choices(index)._2(childRowLoc) + } + + override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): JSONChoiceGenFunc = + JSONChoiceGenFunc(choices, lengthGen, mapping) + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): JSONChoiceGenFunc = + JSONChoiceGenFunc(choices, lengthGen, mapping) + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for JSON") +} + +class JSONChoiceGen(val choices: Seq[(Double, String, SubstringDataGen)], + conf: ColumnConf, + defaultValueRange: Option[(Any, Any)] = None) + extends SubstringDataGen(conf, defaultValueRange) { + + override val children: Seq[(String, SubstringDataGen)] = + choices.map { case (_, name, gen) => (name, gen) } + + override def setCorrelatedKeyGroup(keyGroup: Long, + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): SubstringDataGen = { + super.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + children.foreach { + case (_, gen) => + gen.setCorrelatedKeyGroup(keyGroup, minSeed, maxSeed, seedMapping) + } + this + } + + override def get(name: String): Option[SubstringDataGen] = + children.collectFirst { + case (childName, dataGen) if childName.equalsIgnoreCase(name) => dataGen + } + + override protected def getValGen: GeneratorFunction = { + val childGens = choices.map(c => (c._1, c._3.getGen)).toList + JSONChoiceGenFunc(childGens) + } } case class ASCIIGenFunc( @@ -1672,14 +2451,46 @@ case class ASCIIGenFunc( throw new IllegalArgumentException("value ranges are not supported for strings") } -class StringGen(conf: ColumnConf, defaultValueRange: Option[(Any, Any)]) - extends DataGen(conf, defaultValueRange) { +/** + * This is here to wrap the substring gen function so that its length/settings + * are the ones used when generating a string, and not what was set for the string. + */ +case class SubstringGenFunc( + substringGen: GeneratorFunction, + lengthGen: LengthGeneratorFunction = null, + mapping: LocationToSeedMapping = null) extends GeneratorFunction { + + override def apply(rowLoc: RowLocation): Any = { + substringGen(rowLoc) + } + + // The length and location seed mapping are just ignored for this... 
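Taken together, the intended workflow for the fingerprinting support added in this patch is: run `JsonColumnStats.fingerPrint` once against real data, persist the resulting statistics, and later hand them to `JsonColumnStats.apply` so a string column generates JSON with matching shape, lengths, and null rates. A hedged sketch of that flow; the paths, the `payload` column name, and the `payloadCol` parameter are illustrative assumptions, and the `ColumnGen` must wrap a string column because only `StringGen` accepts a substring generator:

```scala
import org.apache.spark.sql.SparkSession

// Sketch: fingerprint a JSON column once, then reuse the stats when generating data.
def fingerprintAndApply(payloadCol: ColumnGen): Unit = {
  val spark = SparkSession.builder.getOrCreate()

  // 1) Fingerprint an existing JSON string column and persist the statistics.
  val source = spark.read.parquet("/data/events")                 // illustrative input path
  val fp = JsonColumnStats.fingerPrint(source, source("payload")) // "payload" column is assumed
  fp.write.mode("overwrite").parquet("/fingerprints/payload")     // illustrative output path

  // 2) Load the statistics and attach them to the generator column. apply() builds
  //    the substring generator tree and sets lengths, null probabilities, and seed
  //    ranges from the collected stats.
  val agg = spark.read.parquet("/fingerprints/payload")
  JsonColumnStats(agg, payloadCol)
}
```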
+ override def withLengthGeneratorFunction(lengthGen: LengthGeneratorFunction): GeneratorFunction = + this + + override def withLocationToSeedMapping(mapping: LocationToSeedMapping): GeneratorFunction = + this + + override def withValueRange(min: Any, max: Any): GeneratorFunction = + throw new IllegalArgumentException("value ranges are not supported for strings") +} + +class StringGen(conf: ColumnConf, + defaultValueRange: Option[(Any, Any)], + var substringDataGen: Option[SubstringDataGen] = None) + extends DataGen(conf, defaultValueRange) { override def dataType: DataType = StringType - override protected def getValGen: GeneratorFunction = ASCIIGenFunc() + override protected def getValGen: GeneratorFunction = + substringDataGen.map(s => SubstringGenFunc(s.getGen)).getOrElse(ASCIIGenFunc()) override def children: Seq[(String, DataGen)] = Seq.empty + + override def setSubstringGen(subgen: Option[SubstringDataGen]): Unit = + substringDataGen = subgen + + override def getSubstringGen: Option[SubstringDataGen] = substringDataGen } case class StructGenFunc(childGens: Array[GeneratorFunction]) extends GeneratorFunction { @@ -1854,7 +2665,6 @@ class MapGen(key: DataGen, override def children: Seq[(String, DataGen)] = Seq(("key", key), ("value", value)) } - object ColumnGen { private def genInternal(rowNumber: Column, dataType: DataType, @@ -1869,8 +2679,8 @@ object ColumnGen { */ class ColumnGen(val dataGen: DataGen) { def setCorrelatedKeyGroup(kg: Long, - minSeed: Long, maxSeed: Long, - seedMapping: LocationToSeedMapping): ColumnGen = { + minSeed: Long, maxSeed: Long, + seedMapping: LocationToSeedMapping): ColumnGen = { dataGen.setCorrelatedKeyGroup(kg, minSeed, maxSeed, seedMapping) this } @@ -1930,6 +2740,11 @@ class ColumnGen(val dataGen: DataGen) { this } + def setGaussianLength(mean: Double, stdDev: Double): ColumnGen = { + dataGen.setGaussianLength(mean, stdDev) + this + } + final def apply(name: String): DataGen = { get(name).getOrElse { throw new IllegalArgumentException(s"$name not a child of $this") @@ -1941,8 +2756,16 @@ class ColumnGen(val dataGen: DataGen) { def gen(rowNumber: Column): Column = { ColumnGen.genInternal(rowNumber, dataGen.dataType, dataGen.nullable, dataGen.getGen) } + + def getSubstring: Option[SubstringDataGen] = dataGen.getSubstringGen + + def substringGen: SubstringDataGen = dataGen.substringGen + + def setSubstringGen(f : ColumnConf => SubstringDataGen): Unit = + dataGen.setSubstringGen(f) } + sealed trait KeyGroupType /** @@ -2192,7 +3015,7 @@ object DBGen { numRows: Long, mapping: OrderedTypeMapping): Seq[(String, ColumnGen)] = { // a bit of a hack with the column num so that we update it before each time... - var conf = ColumnConf(ColumnLocation(tableId, -1), true, numRows) + var conf = ColumnConf(ColumnLocation(tableId, -1, 0), true, numRows) st.toArray.map { sf => if (!mapping.canMap(sf.dataType, mapping)) { throw new IllegalArgumentException(s"$sf is not supported at this time") diff --git a/dist/maven-antrun/build-parallel-worlds.xml b/dist/maven-antrun/build-parallel-worlds.xml index 524b15addf9..07838616340 100644 --- a/dist/maven-antrun/build-parallel-worlds.xml +++ b/dist/maven-antrun/build-parallel-worlds.xml @@ -1,6 +1,6 @@ - diff --git a/dist/scripts/binary-dedupe.sh b/dist/scripts/binary-dedupe.sh index 183e86b1524..356b0b4dbae 100755 --- a/dist/scripts/binary-dedupe.sh +++ b/dist/scripts/binary-dedupe.sh @@ -1,6 +1,6 @@ #!/bin/bash -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -34,10 +34,10 @@ case "$OSTYPE" in esac STEP=0 -export SPARK3XX_COMMON_TXT="$PWD/spark3xx-common.txt" -export SPARK3XX_COMMON_COPY_LIST="$PWD/spark-common-copy-list.txt" +export SPARK_SHARED_TXT="$PWD/spark-shared.txt" +export SPARK_SHARED_COPY_LIST="$PWD/spark-shared-copy-list.txt" export DELETE_DUPLICATES_TXT="$PWD/delete-duplicates.txt" -export SPARK3XX_COMMON_DIR="$PWD/spark3xx-common" +export SPARK_SHARED_DIR="$PWD/spark-shared" # This script de-duplicates .class files at the binary level. # We could also diff classes using scalap / javap outputs. @@ -47,17 +47,17 @@ export SPARK3XX_COMMON_DIR="$PWD/spark3xx-common" # The following pipeline determines identical classes across shims in this build. # - checksum all class files -# - move the varying-prefix spark3xy to the left so it can be easily skipped for uniq and sort +# - move the varying-prefix sparkxyz to the left so it can be easily skipped for uniq and sort # - sort by path, secondary sort by checksum, print one line per group # - produce uniq count for paths # - filter the paths with count=1, the class files without diverging checksums -# - put the path starting with /spark3xy back together for the final list +# - put the path starting with /sparkxyz back together for the final list echo "Retrieving class files hashing to a single value ..." echo "$((++STEP))/ SHA1 of all non-META files > tmp-sha1-files.txt" -find ./parallel-world/spark3* -name META-INF -prune -o \( -type f -print \) | \ - xargs $SHASUM > tmp-sha1-files.txt +find ./parallel-world/spark[34]* -name META-INF -prune -o -name webapps -prune -o \( -type f -print0 \) | \ + xargs --null $SHASUM > tmp-sha1-files.txt echo "$((++STEP))/ make shim column 1 > tmp-shim-sha-package-files.txt" < tmp-sha1-files.txt awk -F/ '$1=$1' | \ @@ -68,10 +68,10 @@ echo "$((++STEP))/ sort by path, sha1; output first from each group > tmp-count- sort -k3 -k2,2 -u tmp-shim-sha-package-files.txt | \ uniq -f 2 -c > tmp-count-shim-sha-package-files.txt -echo "$((++STEP))/ files with unique sha1 > $SPARK3XX_COMMON_TXT" +echo "$((++STEP))/ files with unique sha1 > $SPARK_SHARED_TXT" grep '^\s\+1 .*' tmp-count-shim-sha-package-files.txt | \ awk '{$1=""; $3=""; print $0 }' | \ - tr -s ' ' | sed 's/\ /\//g' > "$SPARK3XX_COMMON_TXT" + tr -s ' ' | sed 's/\ /\//g' > "$SPARK_SHARED_TXT" function retain_single_copy() { set -e @@ -93,10 +93,10 @@ function retain_single_copy() { package_class="${package_class_with_spaces// //}" # get the reference copy out of the way - echo "$package_class" >> "from-$shim-to-spark3xx-common.txt" + echo "$package_class" >> "from-$shim-to-spark-shared.txt" # expanding directories separately because full path # glob is broken for class file name including the "$" character - for pw in ./parallel-world/spark3* ; do + for pw in ./parallel-world/spark[34]* ; do delete_path="$pw/$package_class" [[ -f "$delete_path" ]] && echo "$delete_path" || true done >> "$DELETE_DUPLICATES_TXT" || exit 255 @@ -106,26 +106,26 @@ function retain_single_copy() { # standalone debugging # truncate incremental files : > "$DELETE_DUPLICATES_TXT" -rm -f from-spark3*-to-spark3xx-common.txt -rm -rf "$SPARK3XX_COMMON_DIR" -mkdir -p "$SPARK3XX_COMMON_DIR" +rm -f from-spark[34]*-to-spark-shared.txt +rm -rf "$SPARK_SHARED_DIR" +mkdir -p "$SPARK_SHARED_DIR" -echo "$((++STEP))/ retaining a single copy of spark3xx-common classes" +echo "$((++STEP))/ retaining a single copy 
of spark-shared classes" while read spark_common_class; do retain_single_copy "$spark_common_class" -done < "$SPARK3XX_COMMON_TXT" +done < "$SPARK_SHARED_TXT" -echo "$((++STEP))/ rsyncing common classes to $SPARK3XX_COMMON_DIR" -for copy_list in from-spark3*-to-spark3xx-common.txt; do +echo "$((++STEP))/ rsyncing common classes to $SPARK_SHARED_DIR" +for copy_list in from-spark[34]*-to-spark-shared.txt; do echo Initializing rsync of "$copy_list" IFS='-' <<< "$copy_list" read -ra copy_list_parts # declare -p copy_list_parts shim="${copy_list_parts[1]}" # use rsync to reduce process forking - rsync --files-from="$copy_list" ./parallel-world/"$shim" "$SPARK3XX_COMMON_DIR" + rsync --files-from="$copy_list" ./parallel-world/"$shim" "$SPARK_SHARED_DIR" done -mv "$SPARK3XX_COMMON_DIR" parallel-world/ +mv "$SPARK_SHARED_DIR" parallel-world/ # TODO further dedupe by FEATURE version lines: # spark30x-common @@ -137,9 +137,9 @@ mv "$SPARK3XX_COMMON_DIR" parallel-world/ # # At this point the duplicate classes have not been removed from version-specific jar # locations such as parallel-world/spark312. -# For each unshimmed class file look for all of its copies inside /spark3* and +# For each unshimmed class file look for all of its copies inside /spark[34]* and # and count the number of distinct checksums. There are two representative cases -# 1) The class is contributed to the unshimmed location via the unshimmed-from-each-spark3xx list. These are classes +# 1) The class is contributed to the unshimmed location via the unshimmed-from-each-spark34 list. These are classes # carrying the shim classifier in their package name such as # com.nvidia.spark.rapids.spark312.RapidsShuffleManager. They are unique by construction, # and will have zero copies in any non-spark312 shims. Although such classes are currently excluded from @@ -157,25 +157,25 @@ mv "$SPARK3XX_COMMON_DIR" parallel-world/ # Determine the list of unshimmed class files UNSHIMMED_LIST_TXT=unshimmed-result.txt echo "$((++STEP))/ creating sorted list of unshimmed classes > $UNSHIMMED_LIST_TXT" -find ./parallel-world -name '*.class' -not -path './parallel-world/spark3*' | \ +find ./parallel-world -name '*.class' -not -path './parallel-world/spark[34-]*' | \ cut -d/ -f 3- | sort > "$UNSHIMMED_LIST_TXT" function verify_same_sha_for_unshimmed() { set -e class_file="$1" - # the raw spark3xx-common.txt file list contains all single-sha1 classes + # the raw spark-shared.txt file list contains all single-sha1 classes # including the ones that are unshimmed. Instead of expensively recomputing # sha1 look up if there is an entry with the unshimmed class as a suffix class_file_quoted=$(printf '%q' "$class_file") - # TODO currently RapidsShuffleManager is "removed" from /spark3* by construction in + # TODO currently RapidsShuffleManager is "removed" from /spark* by construction in # dist pom.xml via ant. We could delegate this logic to this script # and make both simmpler - if [[ ! "$class_file_quoted" =~ (com/nvidia/spark/rapids/spark3.*/.*ShuffleManager.class|org/apache/spark/sql/rapids/shims/spark3.*/ProxyRapidsShuffleInternalManager.class) ]]; then + if [[ ! "$class_file_quoted" =~ (com/nvidia/spark/rapids/spark[34].*/.*ShuffleManager.class|org/apache/spark/sql/rapids/shims/spark[34].*/ProxyRapidsShuffleInternalManager.class) ]]; then - if ! grep -q "/spark.\+/$class_file_quoted" "$SPARK3XX_COMMON_TXT"; then + if ! 
grep -q "/spark.\+/$class_file_quoted" "$SPARK_SHARED_TXT"; then echo >&2 "$class_file is not bitwise-identical across shims" exit 255 fi @@ -192,7 +192,7 @@ done < "$UNSHIMMED_LIST_TXT" echo "$((++STEP))/ removing duplicates of unshimmed classes" while read unshimmed_class; do - for pw in ./parallel-world/spark3* ; do + for pw in ./parallel-world/spark[34]* ; do unshimmed_path="$pw/$unshimmed_class" [[ -f "$unshimmed_path" ]] && echo "$unshimmed_path" || true done >> "$DELETE_DUPLICATES_TXT" diff --git a/docs/archive.md b/docs/archive.md index 6cce30557f4..f4eeab11a40 100644 --- a/docs/archive.md +++ b/docs/archive.md @@ -5,11 +5,143 @@ nav_order: 15 --- Below are archived releases for RAPIDS Accelerator for Apache Spark. +## Release v24.04.1 +### Hardware Requirements: + +The plugin is tested on the following architectures: + + GPU Models: NVIDIA V100, T4, A10/A100, L4 and H100 GPUs + +### Software Requirements: + + OS: Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8 + + NVIDIA Driver*: R470+ + + Runtime: + Scala 2.12, 2.13 + Python, Java Virtual Machine (JVM) compatible with your spark-version. + + * Check the Spark documentation for Python and Java version compatibility with your specific + Spark version. For instance, visit `https://spark.apache.org/docs/3.4.1` for Spark 3.4.1. + + Supported Spark versions: + Apache Spark 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.2.4 + Apache Spark 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.3.4 + Apache Spark 3.4.0, 3.4.1, 3.4.2 + Apache Spark 3.5.0, 3.5.1 + + Supported Databricks runtime versions for Azure and AWS: + Databricks 11.3 ML LTS (GPU, Scala 2.12, Spark 3.3.0) + Databricks 12.2 ML LTS (GPU, Scala 2.12, Spark 3.3.2) + Databricks 13.3 ML LTS (GPU, Scala 2.12, Spark 3.4.1) + + Supported Dataproc versions (Debian/Ubuntu): + GCP Dataproc 2.0 + GCP Dataproc 2.1 + + Supported Dataproc Serverless versions: + Spark runtime 1.1 LTS + Spark runtime 2.0 + Spark runtime 2.1 + +*Some hardware may have a minimum driver version greater than R470. Check the GPU spec sheet +for your hardware's minimum driver version. + +*For Cloudera and EMR support, please refer to the +[Distributions](https://docs.nvidia.com/spark-rapids/user-guide/latest/faq.html#which-distributions-are-supported) section of the FAQ. 
+ +### RAPIDS Accelerator's Support Policy for Apache Spark +The RAPIDS Accelerator maintains support for Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html) + +### Download RAPIDS Accelerator for Apache Spark v24.04.1 + +| Processor | Scala Version | Download Jar | Download Signature | +|-----------|---------------|--------------|--------------------| +| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.04.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.04.1/rapids-4-spark_2.12-24.04.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.04.1/rapids-4-spark_2.12-24.04.1.jar.asc) | +| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.04.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.04.1/rapids-4-spark_2.13-24.04.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.04.1/rapids-4-spark_2.13-24.04.1.jar.asc) | +| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.04.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.04.1/rapids-4-spark_2.12-24.04.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.04.1/rapids-4-spark_2.12-24.04.1-cuda11-arm64.jar.asc) | +| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.04.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.04.1/rapids-4-spark_2.13-24.04.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.04.1/rapids-4-spark_2.13-24.04.1-cuda11-arm64.jar.asc) | + +This package is built against CUDA 11.8. It is tested on V100, T4, A10, A100, L4 and H100 GPUs with +CUDA 11.8 through CUDA 12.0. + +### Verify signature +* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com). +* Import the public key: `gpg --import PUB_KEY` +* Verify the signature for Scala 2.12 jar: + `gpg --verify rapids-4-spark_2.12-24.04.1.jar.asc rapids-4-spark_2.12-24.04.1.jar` +* Verify the signature for Scala 2.13 jar: + `gpg --verify rapids-4-spark_2.13-24.04.1.jar.asc rapids-4-spark_2.13-24.04.1.jar` + +The output of signature verify: + + gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) " + +### Release Notes +* New functionality and performance improvements for this release include: +* Performance improvements for S3 reading. +Refer to perfio.s3.enabled in [advanced_configs](./additional-functionality/advanced_configs.md) for more details. +* Performance improvements when doing a joins on unique keys. +* Enhanced decompression kernels for zstd and snappy. +* Enhanced Parquet reading performance with modular kernels. +* Added compatibility with Spark version 3.5.1. +* Deprecated support for Databricks 10.4 ML LTS. +* For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases). + +For a detailed list of changes, please refer to the +[CHANGELOG](https://github.com/NVIDIA/spark-rapids/blob/main/CHANGELOG.md). + +## Archived releases + +As new releases come out, previous ones will still be available in [archived releases](./archive.md). + ## Release v24.04.0 ### Hardware Requirements: The plugin is tested on the following architectures: - @@ -67,14 +67,14 @@ for your hardware's minimum driver version. 
+ + GPU Models: NVIDIA V100, T4, A10/A100, L4 and H100 GPUs + +### Software Requirements: + + OS: Ubuntu 20.04, Ubuntu 22.04, CentOS 7, or Rocky Linux 8 + + NVIDIA Driver*: R470+ + + Runtime: + Scala 2.12, 2.13 + Python, Java Virtual Machine (JVM) compatible with your spark-version. + + * Check the Spark documentation for Python and Java version compatibility with your specific + Spark version. For instance, visit `https://spark.apache.org/docs/3.4.1` for Spark 3.4.1. + + Supported Spark versions: + Apache Spark 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.2.4 + Apache Spark 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.3.4 + Apache Spark 3.4.0, 3.4.1, 3.4.2 + Apache Spark 3.5.0, 3.5.1 + + Supported Databricks runtime versions for Azure and AWS: + Databricks 11.3 ML LTS (GPU, Scala 2.12, Spark 3.3.0) + Databricks 12.2 ML LTS (GPU, Scala 2.12, Spark 3.3.2) + Databricks 13.3 ML LTS (GPU, Scala 2.12, Spark 3.4.1) + + Supported Dataproc versions (Debian/Ubuntu): + GCP Dataproc 2.0 + GCP Dataproc 2.1 + + Supported Dataproc Serverless versions: + Spark runtime 1.1 LTS + Spark runtime 2.0 + Spark runtime 2.1 + +*Some hardware may have a minimum driver version greater than R470. Check the GPU spec sheet +for your hardware's minimum driver version. + +*For Cloudera and EMR support, please refer to the +[Distributions](https://docs.nvidia.com/spark-rapids/user-guide/latest/faq.html#which-distributions-are-supported) section of the FAQ. + ### RAPIDS Accelerator's Support Policy for Apache Spark The RAPIDS Accelerator maintains support for Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html) @@ -74,7 +206,7 @@ The plugin is tested on the following architectures: Supported Spark versions: Apache Spark 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.2.4 Apache Spark 3.3.0, 3.3.1, 3.3.2, 3.3.3 - Apache Spark 3.4.0, 3.4.1 + Apache Spark 3.4.0, 3.4.1, 3.4.2 Apache Spark 3.5.0 Supported Databricks runtime versions for Azure and AWS: diff --git a/docs/archives/CHANGELOG_24.02.md b/docs/archives/CHANGELOG_24.02.md new file mode 100644 index 00000000000..732035502f0 --- /dev/null +++ b/docs/archives/CHANGELOG_24.02.md @@ -0,0 +1,300 @@ +# Change log\nGenerated on 2024-06-05 +## Release 24.02 + +### Features +||| +|:---|:---| +|[#9926](https://github.com/NVIDIA/spark-rapids/issues/9926)|[FEA] Add config option for the parquet reader input read limit.| +|[#10270](https://github.com/NVIDIA/spark-rapids/issues/10270)|[FEA] Add support for single quotes when reading JSON| +|[#10253](https://github.com/NVIDIA/spark-rapids/issues/10253)|[FEA] Enable mixed types as string in GpuJsonToStruct| +|[#9692](https://github.com/NVIDIA/spark-rapids/issues/9692)|[FEA] Remove Pascal support| +|[#8806](https://github.com/NVIDIA/spark-rapids/issues/8806)|[FEA] Support lazy quantifier and specified group index in regexp_extract function| +|[#10079](https://github.com/NVIDIA/spark-rapids/issues/10079)|[FEA] Add string parameter support for `unix_timestamp` for non-UTC time zones| +|[#9667](https://github.com/NVIDIA/spark-rapids/issues/9667)|[FEA][JSON] Add support for non default `dateFormat` in `from_json`| +|[#9173](https://github.com/NVIDIA/spark-rapids/issues/9173)|[FEA] Support format_number | +|[#10145](https://github.com/NVIDIA/spark-rapids/issues/10145)|[FEA] Support to_utc_timestamp| +|[#9927](https://github.com/NVIDIA/spark-rapids/issues/9927)|[FEA] Support to_date with non-UTC timezones without DST| +|[#10006](https://github.com/NVIDIA/spark-rapids/issues/10006)|[FEA] Support ```ParseToTimestamp``` for non-UTC time 
zones| +|[#9096](https://github.com/NVIDIA/spark-rapids/issues/9096)|[FEA] Add Spark 3.3.4 support| +|[#9585](https://github.com/NVIDIA/spark-rapids/issues/9585)|[FEA] support ascii function| +|[#9260](https://github.com/NVIDIA/spark-rapids/issues/9260)|[FEA] Create Spark 3.4.2 shim and build env| +|[#10076](https://github.com/NVIDIA/spark-rapids/issues/10076)|[FEA] Add performance test framework for non-UTC time zone features.| +|[#9881](https://github.com/NVIDIA/spark-rapids/issues/9881)|[TASK] Remove `spark.rapids.sql.nonUTC.enabled` configuration option| +|[#9801](https://github.com/NVIDIA/spark-rapids/issues/9801)|[FEA] Support DateFormat on GPU with a non-UTC timezone| +|[#6834](https://github.com/NVIDIA/spark-rapids/issues/6834)|[FEA] Support GpuHour expression for timezones other than UTC| +|[#6842](https://github.com/NVIDIA/spark-rapids/issues/6842)|[FEA] Support TimeZone aware operations for value extraction| +|[#1860](https://github.com/NVIDIA/spark-rapids/issues/1860)|[FEA] Optimize row based window operations for BOUNDED ranges| +|[#9606](https://github.com/NVIDIA/spark-rapids/issues/9606)|[FEA] Support unix_timestamp with CST(China Time Zone) support| +|[#9815](https://github.com/NVIDIA/spark-rapids/issues/9815)|[FEA] Support ```unix_timestamp``` for non-DST timezones| +|[#8807](https://github.com/NVIDIA/spark-rapids/issues/8807)|[FEA] support ‘yyyyMMdd’ format in from_unixtime function| +|[#9605](https://github.com/NVIDIA/spark-rapids/issues/9605)|[FEA] Support from_unixtime with CST(China Time Zone) support| +|[#6836](https://github.com/NVIDIA/spark-rapids/issues/6836)|[FEA] Support FromUnixTime for non UTC timezones| +|[#9175](https://github.com/NVIDIA/spark-rapids/issues/9175)|[FEA] Support Databricks 13.3| +|[#6881](https://github.com/NVIDIA/spark-rapids/issues/6881)|[FEA] Support RAPIDS Spark plugin on ARM| +|[#9274](https://github.com/NVIDIA/spark-rapids/issues/9274)|[FEA] Regular deploy process to include arm artifacts| +|[#9844](https://github.com/NVIDIA/spark-rapids/issues/9844)|[FEA] Let Gpu arrow python runners support writing one batch one time for the single threaded model.| +|[#7309](https://github.com/NVIDIA/spark-rapids/issues/7309)|[FEA] Detect multiple versions of the RAPIDS jar on the classpath at the same time| + +### Performance +||| +|:---|:---| +|[#9442](https://github.com/NVIDIA/spark-rapids/issues/9442)|[FEA] For hash joins where the build side can change use the smaller table for the build side| +|[#10142](https://github.com/NVIDIA/spark-rapids/issues/10142)|[TASK] Benchmark existing timestamp functions that work in non-UTC time zone (non-DST)| + +### Bugs Fixed +||| +|:---|:---| +|[#10548](https://github.com/NVIDIA/spark-rapids/issues/10548)|[BUG] test_dpp_bypass / test_dpp_via_aggregate_subquery failures in CI Databricks 13.3| +|[#10530](https://github.com/NVIDIA/spark-rapids/issues/10530)|test_delta_merge_match_delete_only java.lang.OutOfMemoryError: GC overhead limit exceeded| +|[#10464](https://github.com/NVIDIA/spark-rapids/issues/10464)|[BUG] spark334 and spark342 shims missed in scala2.13 dist jar| +|[#10473](https://github.com/NVIDIA/spark-rapids/issues/10473)|[BUG] Leak when running RANK query| +|[#10432](https://github.com/NVIDIA/spark-rapids/issues/10432)|Plug-in Build Failing for Databricks 11.3 | +|[#9974](https://github.com/NVIDIA/spark-rapids/issues/9974)|[BUG] host memory Leak in MultiFileCoalescingPartitionReaderBase in UTC time zone| +|[#10359](https://github.com/NVIDIA/spark-rapids/issues/10359)|[BUG] Build failure on Databricks 
nightly run with `GpuMapInPandasExecMeta`| +|[#10327](https://github.com/NVIDIA/spark-rapids/issues/10327)|[BUG] Unit test FAILED against : SPARK-24957: average with decimal followed by aggregation returning wrong result | +|[#10324](https://github.com/NVIDIA/spark-rapids/issues/10324)|[BUG] hash_aggregate_test.py test FAILED: Type conversion is not allowed from Table {...}| +|[#10291](https://github.com/NVIDIA/spark-rapids/issues/10291)|[BUG] SIGSEGV in libucp.so| +|[#9212](https://github.com/NVIDIA/spark-rapids/issues/9212)|[BUG] `from_json` fails with cuDF error `Invalid list size computation error`| +|[#10264](https://github.com/NVIDIA/spark-rapids/issues/10264)|[BUG] hash aggregate test failures due to type conversion errors| +|[#10262](https://github.com/NVIDIA/spark-rapids/issues/10262)|[BUG] Test "SPARK-24957: average with decimal followed by aggregation returning wrong result" failed.| +|[#9353](https://github.com/NVIDIA/spark-rapids/issues/9353)|[BUG] [JSON] A mix of lists and structs within the same column is not supported| +|[#10099](https://github.com/NVIDIA/spark-rapids/issues/10099)|[BUG] orc_test.py::test_orc_scan_with_aggregate_pushdown fails with a standalone cluster on spark 3.3.0| +|[#10047](https://github.com/NVIDIA/spark-rapids/issues/10047)|[BUG] CudfException during conditional hash join while running nds query64| +|[#9779](https://github.com/NVIDIA/spark-rapids/issues/9779)|[BUG] 330cdh failed test_hash_reduction_sum_full_decimal on CI| +|[#10197](https://github.com/NVIDIA/spark-rapids/issues/10197)|[BUG] Disable GetJsonObject by default and update docs| +|[#10165](https://github.com/NVIDIA/spark-rapids/issues/10165)|[BUG] Databricks 13.3 executor side broadcast failure| +|[#10224](https://github.com/NVIDIA/spark-rapids/issues/10224)|[BUG] DBR builds fails when installing Maven| +|[#10222](https://github.com/NVIDIA/spark-rapids/issues/10222)|[BUG] to_utc_timestamp and from_utc_timestamp fallback when TZ is supported time zone| +|[#10195](https://github.com/NVIDIA/spark-rapids/issues/10195)|[BUG] test_window_aggs_for_negative_rows_partitioned failure in CI| +|[#10182](https://github.com/NVIDIA/spark-rapids/issues/10182)|[BUG] test_dpp_bypass / test_dpp_via_aggregate_subquery failures in CI (databricks)| +|[#10169](https://github.com/NVIDIA/spark-rapids/issues/10169)|[BUG] Host column vector leaks when running `test_cast_timestamp_to_date`| +|[#10050](https://github.com/NVIDIA/spark-rapids/issues/10050)|[BUG] test_cast_decimal_to_decimal[to:DecimalType(1,-1)-from:Decimal(5,-3)] fails with DATAGEN_SEED=1702439569| +|[#10088](https://github.com/NVIDIA/spark-rapids/issues/10088)|[BUG] GpuExplode single row split to fit cuDF limits| +|[#10174](https://github.com/NVIDIA/spark-rapids/issues/10174)|[BUG] json_test.py::test_from_json_struct_timestamp failed on: Part of the plan is not columnar | +|[#10186](https://github.com/NVIDIA/spark-rapids/issues/10186)|[BUG] test_to_date_with_window_functions failed in non-UTC nightly CI| +|[#10154](https://github.com/NVIDIA/spark-rapids/issues/10154)|[BUG] 'spark-test.sh' integration tests FAILED on 'ps: command not found" in Rocky Docker environment| +|[#10175](https://github.com/NVIDIA/spark-rapids/issues/10175)|[BUG] string_test.py::test_format_number_float_special FAILED : AssertionError 'NaN' == | +|[#10166](https://github.com/NVIDIA/spark-rapids/issues/10166)|Detect Undeclared Shim in POM.xml| +|[#10170](https://github.com/NVIDIA/spark-rapids/issues/10170)|[BUG] `test_cast_timestamp_to_date` fails with `TZ=Asia/Hebron`| 
+|[#10149](https://github.com/NVIDIA/spark-rapids/issues/10149)|[BUG] GPU illegal access detected during delta_byte_array.parquet read| +|[#9905](https://github.com/NVIDIA/spark-rapids/issues/9905)|[BUG] GpuJsonScan incorrect behavior when parsing dates| +|[#10163](https://github.com/NVIDIA/spark-rapids/issues/10163)|Spark 3.3.4 Shim Build Failure| +|[#10105](https://github.com/NVIDIA/spark-rapids/issues/10105)|[BUG] scala:compile is not thread safe unless compiler bridge already exists | +|[#10026](https://github.com/NVIDIA/spark-rapids/issues/10026)|[BUG] test_hash_agg_with_nan_keys failed with a DATAGEN_SEED=1702335559| +|[#10075](https://github.com/NVIDIA/spark-rapids/issues/10075)|[BUG] `non-pinned blocking alloc with spill` unit test failed in HostAllocSuite| +|[#10134](https://github.com/NVIDIA/spark-rapids/issues/10134)|[BUG] test_window_aggs_for_batched_finite_row_windows_partitioned failed on Scala 2.13 with DATAGEN_SEED=1704033145| +|[#10118](https://github.com/NVIDIA/spark-rapids/issues/10118)|[BUG] non-UTC Nightly CI failed| +|[#10136](https://github.com/NVIDIA/spark-rapids/issues/10136)|[BUG] The canonicalized version of `GpuFileSourceScanExec`s that suppose to be semantic-equal can be different | +|[#10110](https://github.com/NVIDIA/spark-rapids/issues/10110)|[BUG] disable collect_list and collect_set for window operations by default.| +|[#10129](https://github.com/NVIDIA/spark-rapids/issues/10129)|[BUG] Unit test suite fails with `Null data pointer` in GpuTimeZoneDB| +|[#10089](https://github.com/NVIDIA/spark-rapids/issues/10089)|[BUG] DATAGEN_SEED= environment does not override the marker datagen_overrides| +|[#10108](https://github.com/NVIDIA/spark-rapids/issues/10108)|[BUG] @datagen_overrides seed is sticky when it shouldn't be| +|[#10064](https://github.com/NVIDIA/spark-rapids/issues/10064)|[BUG] test_unsupported_fallback_regexp_replace failed with DATAGEN_SEED=1702662063| +|[#10117](https://github.com/NVIDIA/spark-rapids/issues/10117)|[BUG] test_from_utc_timestamp failed on Cloudera Env when TZ is Iran| +|[#9914](https://github.com/NVIDIA/spark-rapids/issues/9914)|[BUG] Report GPU OOM on recent passed CI premerges.| +|[#10094](https://github.com/NVIDIA/spark-rapids/issues/10094)|[BUG] spark351 PR check failure MockTaskContext method isFailed in class TaskContext of type ()Boolean is not defined| +|[#10017](https://github.com/NVIDIA/spark-rapids/issues/10017)|[BUG] test_casting_from_double_to_timestamp failed for DATAGEN_SEED=1702329497| +|[#9992](https://github.com/NVIDIA/spark-rapids/issues/9992)|[BUG] conditionals_test.py::test_conditional_with_side_effects_cast[String] failed with DATAGEN_SEED=1701976979| +|[#9743](https://github.com/NVIDIA/spark-rapids/issues/9743)|[BUG][AUDIT] SPARK-45652 - SPJ: Handle empty input partitions after dynamic filtering| +|[#9859](https://github.com/NVIDIA/spark-rapids/issues/9859)|[AUDIT] [SPARK-45786] Inaccurate Decimal multiplication and division results| +|[#9555](https://github.com/NVIDIA/spark-rapids/issues/9555)|[BUG] Scala 2.13 build with JDK 11 or 17 fails OpcodeSuite tests| +|[#10073](https://github.com/NVIDIA/spark-rapids/issues/10073)|[BUG] test_csv_prefer_date_with_infer_schema failed with DATAGEN_SEED=1702847907| +|[#10004](https://github.com/NVIDIA/spark-rapids/issues/10004)|[BUG] If a host memory buffer is spilled, it cannot be unspilled| +|[#10063](https://github.com/NVIDIA/spark-rapids/issues/10063)|[BUG] CI build failure with 341db: method getKillReason has weaker access privileges; it should be public| 
+|[#10055](https://github.com/NVIDIA/spark-rapids/issues/10055)|[BUG] array_test.py::test_array_transform_non_deterministic failed with non-UTC time zone| +|[#10056](https://github.com/NVIDIA/spark-rapids/issues/10056)|[BUG] Unit tests ToPrettyStringSuite FAILED on spark-3.5.0| +|[#10048](https://github.com/NVIDIA/spark-rapids/issues/10048)|[BUG] Fix ```out of range``` error from ```pySpark``` in ```test_timestamp_millis``` and other two integration test cases| +|[#4204](https://github.com/NVIDIA/spark-rapids/issues/4204)|casting double to string does not match Spark| +|[#9938](https://github.com/NVIDIA/spark-rapids/issues/9938)|Better to do some refactor for the Python UDF code| +|[#10018](https://github.com/NVIDIA/spark-rapids/issues/10018)|[BUG] `GpuToUnixTimestampImproved` off by 1 on GPU when handling timestamp before epoch| +|[#10012](https://github.com/NVIDIA/spark-rapids/issues/10012)|[BUG] test_str_to_map_expr_random_delimiters with DATAGEN_SEED=1702166057 hangs| +|[#10029](https://github.com/NVIDIA/spark-rapids/issues/10029)|[BUG] doc links fail with 404 for shims.md| +|[#9472](https://github.com/NVIDIA/spark-rapids/issues/9472)|[BUG] Non-Deterministic expressions in an array_transform can cause errors| +|[#9884](https://github.com/NVIDIA/spark-rapids/issues/9884)|[BUG] delta_lake_delete_test.py failed assertion [DATAGEN_SEED=1701225104, IGNORE_ORDER...| +|[#9977](https://github.com/NVIDIA/spark-rapids/issues/9977)|[BUG] test_cast_date_integral fails on databricks 3.4.1| +|[#9936](https://github.com/NVIDIA/spark-rapids/issues/9936)|[BUG] Nightly CI of non-UTC time zone reports 'year 0 is out of range' error| +|[#9941](https://github.com/NVIDIA/spark-rapids/issues/9941)|[BUG] A potential data corruption in Pandas UDFs| +|[#9897](https://github.com/NVIDIA/spark-rapids/issues/9897)|[BUG] Error message for multiple jars on classpath is wrong| +|[#9916](https://github.com/NVIDIA/spark-rapids/issues/9916)|[BUG] ```test_cast_string_ts_valid_format``` failed at ```seed = 1701362564```| +|[#9559](https://github.com/NVIDIA/spark-rapids/issues/9559)|[BUG] precommit regularly fails with error trying to download a dependency| +|[#9708](https://github.com/NVIDIA/spark-rapids/issues/9708)|[BUG] test_cast_string_ts_valid_format fails with DATAGEN_SEED=1699978422| + +### PRs +||| +|:---|:---| +|[#10555](https://github.com/NVIDIA/spark-rapids/pull/10555)|Update change log [skip ci]| +|[#10551](https://github.com/NVIDIA/spark-rapids/pull/10551)|Try to make degenerative joins here impossible for these tests| +|[#10546](https://github.com/NVIDIA/spark-rapids/pull/10546)|Update changelog [skip ci]| +|[#10541](https://github.com/NVIDIA/spark-rapids/pull/10541)|Fix Delta log cache size settings during integration tests| +|[#10525](https://github.com/NVIDIA/spark-rapids/pull/10525)|Update changelog for v24.02.0 release [skip ci]| +|[#10465](https://github.com/NVIDIA/spark-rapids/pull/10465)|Add missed shims for scala2.13| +|[#10511](https://github.com/NVIDIA/spark-rapids/pull/10511)|Update rapids jni and private dependency version to 24.02.1| +|[#10513](https://github.com/NVIDIA/spark-rapids/pull/10513)|Fix scalar leak in SumBinaryFixer (#10510)| +|[#10475](https://github.com/NVIDIA/spark-rapids/pull/10475)|Fix scalar leak in RankFixer| +|[#10461](https://github.com/NVIDIA/spark-rapids/pull/10461)|Preserve tags on FileSourceScanExec| +|[#10459](https://github.com/NVIDIA/spark-rapids/pull/10459)|[DOC] Fix table rendering issue in github.io download UI page on branch-24.02 [skip ci] | 
+|[#10443](https://github.com/NVIDIA/spark-rapids/pull/10443)|Update change log for v24.02.0 release [skip ci]| +|[#10439](https://github.com/NVIDIA/spark-rapids/pull/10439)|Reverts NVIDIA/spark-rapids#10232 and fixes the plugin build on Databricks 11.3| +|[#10380](https://github.com/NVIDIA/spark-rapids/pull/10380)|Init changelog 24.02 [skip ci]| +|[#10367](https://github.com/NVIDIA/spark-rapids/pull/10367)|Update rapids JNI and private version to release 24.02.0| +|[#10414](https://github.com/NVIDIA/spark-rapids/pull/10414)|[DOC] Fix 24.02.0 documentation errors [skip ci]| +|[#10403](https://github.com/NVIDIA/spark-rapids/pull/10403)|Cherry-pick: Fix a memory leak in json tuple (#10360)| +|[#10387](https://github.com/NVIDIA/spark-rapids/pull/10387)|[DOC] Update docs for 24.02.0 release [skip ci]| +|[#10399](https://github.com/NVIDIA/spark-rapids/pull/10399)|Update NOTICE-binary| +|[#10389](https://github.com/NVIDIA/spark-rapids/pull/10389)|Change version and branch to 24.02 in docs [skip ci]| +|[#10384](https://github.com/NVIDIA/spark-rapids/pull/10384)|[DOC] Update docs for 23.12.2 release [skip ci] | +|[#10309](https://github.com/NVIDIA/spark-rapids/pull/10309)|[DOC] add custom 404 page and fix some document issue [skip ci]| +|[#10352](https://github.com/NVIDIA/spark-rapids/pull/10352)|xfail mixed type test| +|[#10355](https://github.com/NVIDIA/spark-rapids/pull/10355)|Revert "Support barrier mode for mapInPandas/mapInArrow (#10343)"| +|[#10353](https://github.com/NVIDIA/spark-rapids/pull/10353)|Use fixed seed for test_from_json_struct_decimal| +|[#10343](https://github.com/NVIDIA/spark-rapids/pull/10343)|Support barrier mode for mapInPandas/mapInArrow| +|[#10345](https://github.com/NVIDIA/spark-rapids/pull/10345)|Fix auto merge conflict 10339 [skip ci]| +|[#9991](https://github.com/NVIDIA/spark-rapids/pull/9991)|Start to use explicit memory limits in the parquet chunked reader| +|[#10328](https://github.com/NVIDIA/spark-rapids/pull/10328)|Fix typo in spark-tests.sh [skip ci]| +|[#10279](https://github.com/NVIDIA/spark-rapids/pull/10279)|Run '--packages' only with default cuda11 jar| +|[#10273](https://github.com/NVIDIA/spark-rapids/pull/10273)|Support reading JSON data with single quotes around attribute names and values| +|[#10306](https://github.com/NVIDIA/spark-rapids/pull/10306)|Fix performance regression in from_json| +|[#10272](https://github.com/NVIDIA/spark-rapids/pull/10272)|Add FullOuter support to GpuShuffledSymmetricHashJoinExec| +|[#10260](https://github.com/NVIDIA/spark-rapids/pull/10260)|Add perf test for time zone operators| +|[#10275](https://github.com/NVIDIA/spark-rapids/pull/10275)|Add tests for window Python udf with array input| +|[#10278](https://github.com/NVIDIA/spark-rapids/pull/10278)|Clean up $M2_CACHE to avoid side-effect of previous dependency:get [skip ci]| +|[#10268](https://github.com/NVIDIA/spark-rapids/pull/10268)|Add config to enable mixed types as string in GpuJsonToStruct & GpuJsonScan| +|[#10297](https://github.com/NVIDIA/spark-rapids/pull/10297)|Revert "UCX 1.16.0 upgrade (#10190)"| +|[#10289](https://github.com/NVIDIA/spark-rapids/pull/10289)|Add gerashegalov to CODEOWNERS [skip ci]| +|[#10290](https://github.com/NVIDIA/spark-rapids/pull/10290)|Fix merge conflict with 23.12 [skip ci]| +|[#10190](https://github.com/NVIDIA/spark-rapids/pull/10190)|UCX 1.16.0 upgrade| +|[#10211](https://github.com/NVIDIA/spark-rapids/pull/10211)|Use parse_url kernel for QUERY literal and column key| 
+|[#10267](https://github.com/NVIDIA/spark-rapids/pull/10267)|Update to libcudf unsigned sum aggregation types change| +|[#10208](https://github.com/NVIDIA/spark-rapids/pull/10208)|Added Support for Lazy Quantifier| +|[#9993](https://github.com/NVIDIA/spark-rapids/pull/9993)|Enable mixed types as string in GpuJsonScan| +|[#10246](https://github.com/NVIDIA/spark-rapids/pull/10246)|Refactor full join iterator to allow access to build tracker| +|[#10257](https://github.com/NVIDIA/spark-rapids/pull/10257)|Enable auto-merge from branch-24.02 to branch-24.04 [skip CI]| +|[#10178](https://github.com/NVIDIA/spark-rapids/pull/10178)|Mark hash reduction decimal overflow test as a permanent seed override| +|[#10244](https://github.com/NVIDIA/spark-rapids/pull/10244)|Use POSIX mode in assembly plugin to avoid issues with large UID/GID| +|[#10238](https://github.com/NVIDIA/spark-rapids/pull/10238)|Smoke test with '--package' to fetch the plugin jar| +|[#10201](https://github.com/NVIDIA/spark-rapids/pull/10201)|Deploy release candidates to local maven repo for dependency check[skip ci]| +|[#10240](https://github.com/NVIDIA/spark-rapids/pull/10240)|Improved inner joins with large build side| +|[#10220](https://github.com/NVIDIA/spark-rapids/pull/10220)|Disable GetJsonObject by default and add tests for as many issues with it as possible| +|[#10230](https://github.com/NVIDIA/spark-rapids/pull/10230)|Fix Databricks 13.3 BroadcastHashJoin using executor side broadcast fed by ColumnarToRow [Databricks]| +|[#10232](https://github.com/NVIDIA/spark-rapids/pull/10232)|Fixed 330db Shims to Adopt the PythonRunner Changes| +|[#10225](https://github.com/NVIDIA/spark-rapids/pull/10225)|Download Maven from apache.org archives [skip ci]| +|[#10210](https://github.com/NVIDIA/spark-rapids/pull/10210)|Add string parameter support for unix_timestamp for non-UTC time zones| +|[#10223](https://github.com/NVIDIA/spark-rapids/pull/10223)|Fix to_utc_timestamp and from_utc_timestamp fallback when TZ is supported time zone| +|[#10205](https://github.com/NVIDIA/spark-rapids/pull/10205)|Deterministic ordering in window tests| +|[#10204](https://github.com/NVIDIA/spark-rapids/pull/10204)|Further prevent degenerative joins in dpp_test| +|[#10156](https://github.com/NVIDIA/spark-rapids/pull/10156)|Update string to float compatibility doc[skip ci]| +|[#10193](https://github.com/NVIDIA/spark-rapids/pull/10193)|Fix explode with carry-along columns on GpuExplode single row retry handling| +|[#10191](https://github.com/NVIDIA/spark-rapids/pull/10191)|Updating the config documentation for filecache configs [skip ci]| +|[#10131](https://github.com/NVIDIA/spark-rapids/pull/10131)|With a single row GpuExplode tries to split the generator array| +|[#10179](https://github.com/NVIDIA/spark-rapids/pull/10179)|Fix build regression against Spark 3.2.x| +|[#10189](https://github.com/NVIDIA/spark-rapids/pull/10189)|test needs marks for non-UTC and for non_supported timezones| +|[#10176](https://github.com/NVIDIA/spark-rapids/pull/10176)|Fix format_number NaN symbol in high jdk version| +|[#10074](https://github.com/NVIDIA/spark-rapids/pull/10074)|Update the legacy mode check: only take effect when reading date/timestamp column| +|[#10167](https://github.com/NVIDIA/spark-rapids/pull/10167)|Defined Shims Should Be Declared In POM | +|[#10168](https://github.com/NVIDIA/spark-rapids/pull/10168)|Prevent a degenerative join in test_dpp_reuse_broadcast_exchange| +|[#10171](https://github.com/NVIDIA/spark-rapids/pull/10171)|Fix `test_cast_timestamp_to_date` 
when running in a DST time zone| +|[#9975](https://github.com/NVIDIA/spark-rapids/pull/9975)|Improve dateFormat support in GpuJsonScan and make tests consistent with GpuStructsToJson| +|[#9790](https://github.com/NVIDIA/spark-rapids/pull/9790)|Support float case of format_number with format_float kernel| +|[#10144](https://github.com/NVIDIA/spark-rapids/pull/10144)|Support to_utc_timestamp| +|[#10162](https://github.com/NVIDIA/spark-rapids/pull/10162)|Fix Spark 334 Build| +|[#10146](https://github.com/NVIDIA/spark-rapids/pull/10146)|Refactor the window code so it is not mostly kept in a few very large files| +|[#10155](https://github.com/NVIDIA/spark-rapids/pull/10155)|Install procps tools for rocky docker images [skip ci]| +|[#10153](https://github.com/NVIDIA/spark-rapids/pull/10153)|Disable multi-threaded Maven | +|[#10100](https://github.com/NVIDIA/spark-rapids/pull/10100)|Enable to_date (via gettimestamp and casting timestamp to date) for non-UTC time zones| +|[#10140](https://github.com/NVIDIA/spark-rapids/pull/10140)|Removed Unnecessary Whitespaces From Spark 3.3.4 Shim [skip ci]| +|[#10148](https://github.com/NVIDIA/spark-rapids/pull/10148)|fix test_hash_agg_with_nan_keys floating point sum failure| +|[#10150](https://github.com/NVIDIA/spark-rapids/pull/10150)|Increase timeouts in HostAllocSuite to avoid timeout failures on slow machines| +|[#10143](https://github.com/NVIDIA/spark-rapids/pull/10143)|Fix `test_window_aggs_for_batched_finite_row_windows_partitioned` fail| +|[#9887](https://github.com/NVIDIA/spark-rapids/pull/9887)|Reduce time-consuming of pre-merge| +|[#10130](https://github.com/NVIDIA/spark-rapids/pull/10130)|Change unit tests that force ooms to specify the oom type (gpu|cpu)| +|[#10138](https://github.com/NVIDIA/spark-rapids/pull/10138)|Update copyright dates in NOTICE files [skip ci]| +|[#10139](https://github.com/NVIDIA/spark-rapids/pull/10139)|Add Delta Lake 2.3.0 to list of versions to test for Spark 3.3.x| +|[#10135](https://github.com/NVIDIA/spark-rapids/pull/10135)|Fix CI: can't find script when there is pushd in script [skip ci]| +|[#10137](https://github.com/NVIDIA/spark-rapids/pull/10137)|Fix the canonicalizing for GPU file scan| +|[#10132](https://github.com/NVIDIA/spark-rapids/pull/10132)|Disable collect_list and collect_set for window by default| +|[#10084](https://github.com/NVIDIA/spark-rapids/pull/10084)|Refactor GpuJsonToStruct to reduce code duplication and manage resources more efficiently| +|[#10087](https://github.com/NVIDIA/spark-rapids/pull/10087)|Additional unit tests for GeneratedInternalRowToCudfRowIterator| +|[#10082](https://github.com/NVIDIA/spark-rapids/pull/10082)|Add Spark 3.3.4 Shim| +|[#10054](https://github.com/NVIDIA/spark-rapids/pull/10054)|Support Ascii function for ascii and latin-1| +|[#10127](https://github.com/NVIDIA/spark-rapids/pull/10127)|Fix merge conflict with branch-23.12| +|[#10097](https://github.com/NVIDIA/spark-rapids/pull/10097)|[DOC] Update docs for 23.12.1 release [skip ci]| +|[#10109](https://github.com/NVIDIA/spark-rapids/pull/10109)|Fixes a bug where datagen seed overrides were sticky and adds datagen_seed_override_disabled| +|[#10093](https://github.com/NVIDIA/spark-rapids/pull/10093)|Fix test_unsupported_fallback_regexp_replace| +|[#10119](https://github.com/NVIDIA/spark-rapids/pull/10119)|Fix from_utc_timestamp case failure on Cloudera when TZ is Iran| +|[#10106](https://github.com/NVIDIA/spark-rapids/pull/10106)|Add `isFailed()` to MockTaskContext and Remove MockTaskContextBase.scala| 
+|[#10112](https://github.com/NVIDIA/spark-rapids/pull/10112)|Remove datagen seed override for test_conditional_with_side_effects_cast| +|[#10104](https://github.com/NVIDIA/spark-rapids/pull/10104)|[DOC] Add in docs about memory debugging [skip ci]| +|[#9925](https://github.com/NVIDIA/spark-rapids/pull/9925)|Use threads, cache Scala compiler in GH mvn workflow| +|[#9967](https://github.com/NVIDIA/spark-rapids/pull/9967)|Added Spark-3.4.2 Shims| +|[#10061](https://github.com/NVIDIA/spark-rapids/pull/10061)|Use parse_url kernel for QUERY parsing| +|[#10101](https://github.com/NVIDIA/spark-rapids/pull/10101)|[DOC] Add column order error docs [skip ci]| +|[#10078](https://github.com/NVIDIA/spark-rapids/pull/10078)|Add perf test for non-UTC operators| +|[#10096](https://github.com/NVIDIA/spark-rapids/pull/10096)|Shim MockTaskContext to fix Spark 3.5.1 build| +|[#10092](https://github.com/NVIDIA/spark-rapids/pull/10092)|Implement Math.round using floor on GPU| +|[#10085](https://github.com/NVIDIA/spark-rapids/pull/10085)|Update tests that originally restricted the Spark timestamp range| +|[#10090](https://github.com/NVIDIA/spark-rapids/pull/10090)|Replace GPU-unsupported `\z` with an alternative RLIKE expression| +|[#10095](https://github.com/NVIDIA/spark-rapids/pull/10095)|Temporarily fix date format failed cases for non-UTC time zone.| +|[#9999](https://github.com/NVIDIA/spark-rapids/pull/9999)|Add some odd time zones for timezone transition tests| +|[#9962](https://github.com/NVIDIA/spark-rapids/pull/9962)|Add 3.5.1-SNAPSHOT Shim| +|[#10071](https://github.com/NVIDIA/spark-rapids/pull/10071)|Cleanup usage of non-utc configuration here| +|[#10057](https://github.com/NVIDIA/spark-rapids/pull/10057)|Add support for StringConcatFactory.makeConcatWithConstants (#9555)| +|[#9996](https://github.com/NVIDIA/spark-rapids/pull/9996)|Test full timestamp output range in PySpark| +|[#10081](https://github.com/NVIDIA/spark-rapids/pull/10081)|Add a fallback Cloudera Maven repo URL [skip ci]| +|[#10065](https://github.com/NVIDIA/spark-rapids/pull/10065)|Improve host memory spill interfaces| +|[#10069](https://github.com/NVIDIA/spark-rapids/pull/10069)|Revert "Support split broadcast join condition into ast and non-ast […| +|[#10070](https://github.com/NVIDIA/spark-rapids/pull/10070)|Fix 332db build failure| +|[#10060](https://github.com/NVIDIA/spark-rapids/pull/10060)|Fix failed cases for non-utc time zone| +|[#10038](https://github.com/NVIDIA/spark-rapids/pull/10038)|Remove spark.rapids.sql.nonUTC.enabled configuration option| +|[#10059](https://github.com/NVIDIA/spark-rapids/pull/10059)|Fixed Failing ToPrettyStringSuite Test for 3.5.0| +|[#10013](https://github.com/NVIDIA/spark-rapids/pull/10013)|Extended configuration of OOM injection mode| +|[#10052](https://github.com/NVIDIA/spark-rapids/pull/10052)|Set seed=0 for some integration test cases| +|[#10053](https://github.com/NVIDIA/spark-rapids/pull/10053)|Remove invalid user from CODEOWNER file [skip ci]| +|[#10049](https://github.com/NVIDIA/spark-rapids/pull/10049)|Fix out of range error from pySpark in test_timestamp_millis and other two integration test cases| +|[#9721](https://github.com/NVIDIA/spark-rapids/pull/9721)|Support date_format via Gpu for non-UTC time zone| +|[#9470](https://github.com/NVIDIA/spark-rapids/pull/9470)|Use float to string kernel| +|[#9845](https://github.com/NVIDIA/spark-rapids/pull/9845)|Use parse_url kernel for HOST parsing| +|[#10024](https://github.com/NVIDIA/spark-rapids/pull/10024)|Support hour minute second for non-UTC 
time zone| +|[#9973](https://github.com/NVIDIA/spark-rapids/pull/9973)|Batching support for row-based bounded window functions | +|[#10042](https://github.com/NVIDIA/spark-rapids/pull/10042)|Update tests to not have hard coded fallback when not needed| +|[#9816](https://github.com/NVIDIA/spark-rapids/pull/9816)|Support unix_timestamp and to_unix_timestamp with non-UTC timezones (non-DST)| +|[#9902](https://github.com/NVIDIA/spark-rapids/pull/9902)|Some refactor for the Python UDF code| +|[#10023](https://github.com/NVIDIA/spark-rapids/pull/10023)|GPU supports `yyyyMMdd` format by post process for the `from_unixtime` function| +|[#10033](https://github.com/NVIDIA/spark-rapids/pull/10033)|Remove GpuToTimestampImproved and spark.rapids.sql.improvedTimeOps.enabled| +|[#10016](https://github.com/NVIDIA/spark-rapids/pull/10016)|Fix infinite loop in test_str_to_map_expr_random_delimiters| +|[#9481](https://github.com/NVIDIA/spark-rapids/pull/9481)|Use parse_url kernel for PROTOCOL parsing| +|[#10030](https://github.com/NVIDIA/spark-rapids/pull/10030)|Update links in shims.md| +|[#10015](https://github.com/NVIDIA/spark-rapids/pull/10015)|Fix array_transform to not recompute the argument| +|[#10011](https://github.com/NVIDIA/spark-rapids/pull/10011)|Add cpu oom retry split handling to InternalRowToColumnarBatchIterator| +|[#10019](https://github.com/NVIDIA/spark-rapids/pull/10019)|Fix auto merge conflict 10010 [skip ci]| +|[#9760](https://github.com/NVIDIA/spark-rapids/pull/9760)|Support split broadcast join condition into ast and non-ast| +|[#9827](https://github.com/NVIDIA/spark-rapids/pull/9827)|Enable ORC timestamp and decimal predicate push down tests| +|[#10002](https://github.com/NVIDIA/spark-rapids/pull/10002)|Use Spark 3.3.3 instead of 3.3.2 for Scala 2.13 premerge builds| +|[#10000](https://github.com/NVIDIA/spark-rapids/pull/10000)|Optimize from_unixtime| +|[#10003](https://github.com/NVIDIA/spark-rapids/pull/10003)|Fix merge conflict with branch-23.12| +|[#9984](https://github.com/NVIDIA/spark-rapids/pull/9984)|Fix 340+(including DB341+) does not support casting date to integral/float| +|[#9972](https://github.com/NVIDIA/spark-rapids/pull/9972)|Fix year 0 is out of range in test_from_json_struct_timestamp | +|[#9814](https://github.com/NVIDIA/spark-rapids/pull/9814)|Support from_unixtime via Gpu for non-UTC time zone| +|[#9929](https://github.com/NVIDIA/spark-rapids/pull/9929)|Add host memory retries for GeneratedInternalRowToCudfRowIterator| +|[#9957](https://github.com/NVIDIA/spark-rapids/pull/9957)|Update cases for cast between integral and (date/time)| +|[#9959](https://github.com/NVIDIA/spark-rapids/pull/9959)|Append new authorized user to blossom-ci whitelist [skip ci]| +|[#9942](https://github.com/NVIDIA/spark-rapids/pull/9942)|Fix a potential data corruption for Pandas UDF| +|[#9922](https://github.com/NVIDIA/spark-rapids/pull/9922)|Fix `allowMultipleJars` recommend setting message| +|[#9947](https://github.com/NVIDIA/spark-rapids/pull/9947)|Fix merge conflict with branch-23.12| +|[#9908](https://github.com/NVIDIA/spark-rapids/pull/9908)|Register default allocator for host memory| +|[#9944](https://github.com/NVIDIA/spark-rapids/pull/9944)|Fix Java OOM caused by incorrect state of shouldCapture when exception occurred| +|[#9937](https://github.com/NVIDIA/spark-rapids/pull/9937)|Refactor to use CLASSIFIER instead of CUDA_CLASSIFIER [skip ci]| +|[#9904](https://github.com/NVIDIA/spark-rapids/pull/9904)|Params for build and test CI scripts on Databricks| 
+|[#9719](https://github.com/NVIDIA/spark-rapids/pull/9719)|Support fine grained timezone checker instead of type based| +|[#9918](https://github.com/NVIDIA/spark-rapids/pull/9918)|Prevent generation of 'year 0 is out of range' strings in IT| +|[#9852](https://github.com/NVIDIA/spark-rapids/pull/9852)|Avoid generating duplicate nan keys with MapGen(FloatGen)| +|[#9674](https://github.com/NVIDIA/spark-rapids/pull/9674)|Add cache action to speed up mvn workflow [skip ci]| +|[#9900](https://github.com/NVIDIA/spark-rapids/pull/9900)|Revert "Remove Databricks 13.3 from release 23.12 (#9890)"| +|[#9889](https://github.com/NVIDIA/spark-rapids/pull/9889)|Fix test_cast_string_ts_valid_format test| +|[#9888](https://github.com/NVIDIA/spark-rapids/pull/9888)|Update nightly build and deploy script for arm artifacts [skip ci]| +|[#9833](https://github.com/NVIDIA/spark-rapids/pull/9833)|Fix a hang for Pandas UDFs on DB 13.3| +|[#9656](https://github.com/NVIDIA/spark-rapids/pull/9656)|Update for new retry state machine JNI APIs| +|[#9654](https://github.com/NVIDIA/spark-rapids/pull/9654)|Detect multiple jars on the classpath when init plugin| +|[#9857](https://github.com/NVIDIA/spark-rapids/pull/9857)|Skip redundant steps in nightly build [skip ci]| +|[#9812](https://github.com/NVIDIA/spark-rapids/pull/9812)|Update JNI and private dep version to 24.02.0-SNAPSHOT| +|[#9716](https://github.com/NVIDIA/spark-rapids/pull/9716)|Initiate project version 24.02.0-SNAPSHOT| + diff --git a/docs/download.md b/docs/download.md index a7b6bd23a4a..f786f5a217d 100644 --- a/docs/download.md +++ b/docs/download.md @@ -18,7 +18,7 @@ cuDF jar, that is either preinstalled in the Spark classpath on all nodes or sub that uses the RAPIDS Accelerator For Apache Spark. See the [getting-started guide](https://docs.nvidia.com/spark-rapids/user-guide/latest/getting-started/overview.html) for more details. -## Release v24.04.1 +## Release v24.06.0 ### Hardware Requirements: The plugin is tested on the following architectures: @@ -41,7 +41,7 @@ The plugin is tested on the following architectures: Supported Spark versions: Apache Spark 3.2.0, 3.2.1, 3.2.2, 3.2.3, 3.2.4 Apache Spark 3.3.0, 3.3.1, 3.3.2, 3.3.3, 3.3.4 - Apache Spark 3.4.0, 3.4.1, 3.4.2 + Apache Spark 3.4.0, 3.4.1, 3.4.2, 3.4.3 Apache Spark 3.5.0, 3.5.1 Supported Databricks runtime versions for Azure and AWS: @@ -57,6 +57,7 @@ The plugin is tested on the following architectures: Spark runtime 1.1 LTS Spark runtime 2.0 Spark runtime 2.1 + Spark runtime 2.2 *Some hardware may have a minimum driver version greater than R470. Check the GPU spec sheet for your hardware's minimum driver version. @@ -67,14 +68,14 @@ for your hardware's minimum driver version. 
### RAPIDS Accelerator's Support Policy for Apache Spark The RAPIDS Accelerator maintains support for Apache Spark versions available for download from [Apache Spark](https://spark.apache.org/downloads.html) -### Download RAPIDS Accelerator for Apache Spark v24.04.1 +### Download RAPIDS Accelerator for Apache Spark v24.06.0 | Processor | Scala Version | Download Jar | Download Signature | |-----------|---------------|--------------|--------------------| -| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.04.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.04.1/rapids-4-spark_2.12-24.04.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.04.1/rapids-4-spark_2.12-24.04.1.jar.asc) | -| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.04.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.04.1/rapids-4-spark_2.13-24.04.1.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.04.1/rapids-4-spark_2.13-24.04.1.jar.asc) | -| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.04.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.04.1/rapids-4-spark_2.12-24.04.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.04.1/rapids-4-spark_2.12-24.04.1-cuda11-arm64.jar.asc) | -| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.04.1](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.04.1/rapids-4-spark_2.13-24.04.1-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.04.1/rapids-4-spark_2.13-24.04.1-cuda11-arm64.jar.asc) | +| x86_64 | Scala 2.12 | [RAPIDS Accelerator v24.06.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.06.0/rapids-4-spark_2.12-24.06.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.06.0/rapids-4-spark_2.12-24.06.0.jar.asc) | +| x86_64 | Scala 2.13 | [RAPIDS Accelerator v24.06.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.06.0/rapids-4-spark_2.13-24.06.0.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.06.0/rapids-4-spark_2.13-24.06.0.jar.asc) | +| arm64 | Scala 2.12 | [RAPIDS Accelerator v24.06.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.06.0/rapids-4-spark_2.12-24.06.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/24.06.0/rapids-4-spark_2.12-24.06.0-cuda11-arm64.jar.asc) | +| arm64 | Scala 2.13 | [RAPIDS Accelerator v24.06.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.06.0/rapids-4-spark_2.13-24.06.0-cuda11-arm64.jar) | [Signature](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.13/24.06.0/rapids-4-spark_2.13-24.06.0-cuda11-arm64.jar.asc) | This package is built against CUDA 11.8. It is tested on V100, T4, A10, A100, L4 and H100 GPUs with CUDA 11.8 through CUDA 12.0. @@ -83,24 +84,20 @@ CUDA 11.8 through CUDA 12.0. * Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com). 
* Import the public key: `gpg --import PUB_KEY` * Verify the signature for Scala 2.12 jar: - `gpg --verify rapids-4-spark_2.12-24.04.1.jar.asc rapids-4-spark_2.12-24.04.1.jar` + `gpg --verify rapids-4-spark_2.12-24.06.0.jar.asc rapids-4-spark_2.12-24.06.0.jar` * Verify the signature for Scala 2.13 jar: - `gpg --verify rapids-4-spark_2.13-24.04.1.jar.asc rapids-4-spark_2.13-24.04.1.jar` + `gpg --verify rapids-4-spark_2.13-24.06.0.jar.asc rapids-4-spark_2.13-24.06.0.jar` The output of signature verify: gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) " ### Release Notes -* New functionality and performance improvements for this release include: -* Performance improvements for S3 reading. -Refer to perfio.s3.enabled in [advanced_configs](./additional-functionality/advanced_configs.md) for more details. -* Performance improvements when doing a joins on unique keys. -* Enhanced decompression kernels for zstd and snappy. -* Enhanced Parquet reading performance with modular kernels. -* Added compatibility with Spark version 3.5.1. -* Deprecated support for Databricks 10.4 ML LTS. -* For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases). +* Improved support for Unity Catalog on Databricks +* Added support for parse_url PATH +* Added support for array_filter +* Added support for Spark 3.4.3 +* For updates on RAPIDS Accelerator Tools, please visit [this link](https://github.com/NVIDIA/spark-rapids-tools/releases) For a detailed list of changes, please refer to the [CHANGELOG](https://github.com/NVIDIA/spark-rapids/blob/main/CHANGELOG.md). diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh index dec93e6f22a..18c26aa26e7 100755 --- a/integration_tests/run_pyspark_from_build.sh +++ b/integration_tests/run_pyspark_from_build.sh @@ -171,11 +171,16 @@ else TEST_TYPE_PARAM="--test_type $TEST_TYPE" fi + # We found that test speed degrades as parallelism increases beyond 8, so we set the default maximum parallelism to 8. + # Note that MAX_PARALLEL varies with the hardware, OS, and test case. Please override it with an appropriate value if needed.
+ MAX_PARALLEL=${MAX_PARALLEL:-8} if [[ ${TEST_PARALLEL} -lt 2 ]]; then # With xdist 0 and 1 are the same parallelism but # 0 is more efficient TEST_PARALLEL_OPTS=() + elif [[ ${TEST_PARALLEL} -gt ${MAX_PARALLEL} ]]; then + TEST_PARALLEL_OPTS=("-n" "$MAX_PARALLEL") else TEST_PARALLEL_OPTS=("-n" "$TEST_PARALLEL") fi @@ -245,6 +250,12 @@ else DRIVER_EXTRA_JAVA_OPTIONS="-ea -Duser.timezone=$TZ -Ddelta.log.cacheSize=$deltaCacheSize" export PYSP_TEST_spark_driver_extraJavaOptions="$DRIVER_EXTRA_JAVA_OPTIONS $COVERAGE_SUBMIT_FLAGS" export PYSP_TEST_spark_executor_extraJavaOptions="-ea -Duser.timezone=$TZ" + + # Set driver memory to speed up tests such as deltalake + if [[ -n "${DRIVER_MEMORY}" ]]; then + export PYSP_TEST_spark_driver_memory="${DRIVER_MEMORY}" + fi + export PYSP_TEST_spark_ui_showConsoleProgress='false' export PYSP_TEST_spark_sql_session_timeZone=$TZ export PYSP_TEST_spark_sql_shuffle_partitions='4' diff --git a/integration_tests/src/main/python/data_gen.py b/integration_tests/src/main/python/data_gen.py index 2e6c36b77d9..fb1627af75b 100644 --- a/integration_tests/src/main/python/data_gen.py +++ b/integration_tests/src/main/python/data_gen.py @@ -159,7 +159,8 @@ def __repr__(self): return super().__repr__() + '(' + str(self._child_gen) + ')' def _cache_repr(self): - return super()._cache_repr() + '(' + self._child_gen._cache_repr() + ')' + return (super()._cache_repr() + '(' + self._child_gen._cache_repr() + + ',' + str(self._func.__code__) + ')' ) def start(self, rand): self._child_gen.start(rand) @@ -667,7 +668,10 @@ def __repr__(self): return super().__repr__() + '(' + str(self._child_gen) + ')' def _cache_repr(self): - return super()._cache_repr() + '(' + self._child_gen._cache_repr() + ')' + return (super()._cache_repr() + '(' + self._child_gen._cache_repr() + + ',' + str(self._min_length) + ',' + str(self._max_length) + ',' + + str(self.all_null) + ',' + str(self.convert_to_tuple) + ')') + def start(self, rand): self._child_gen.start(rand) @@ -701,7 +705,8 @@ def __repr__(self): return super().__repr__() + '(' + str(self._key_gen) + ',' + str(self._value_gen) + ')' def _cache_repr(self): - return super()._cache_repr() + '(' + self._key_gen._cache_repr() + ',' + self._value_gen._cache_repr() + ')' + return (super()._cache_repr() + '(' + self._key_gen._cache_repr() + ',' + self._value_gen._cache_repr() + + ',' + str(self._min_length) + ',' + str(self._max_length) + ')') def start(self, rand): self._key_gen.start(rand) @@ -769,12 +774,13 @@ def __init__(self, min_value=MIN_DAY_TIME_INTERVAL, max_value=MAX_DAY_TIME_INTER self._min_micros = (math.floor(min_value.total_seconds()) * 1000000) + min_value.microseconds self._max_micros = (math.floor(max_value.total_seconds()) * 1000000) + max_value.microseconds fields = ["day", "hour", "minute", "second"] - start_index = fields.index(start_field) - end_index = fields.index(end_field) - if start_index > end_index: + self._start_index = fields.index(start_field) + self._end_index = fields.index(end_field) + if self._start_index > self._end_index: raise RuntimeError('Start field {}, end field {}, valid fields is {}, start field index should <= end ' 'field index'.format(start_field, end_field, fields)) - super().__init__(DayTimeIntervalType(start_index, end_index), nullable=nullable, special_cases=special_cases) + super().__init__(DayTimeIntervalType(self._start_index, self._end_index), nullable=nullable, + special_cases=special_cases) def _gen_random(self, rand): micros = rand.randint(self._min_micros, self._max_micros) @@ -784,7 
+790,8 @@ def _gen_random(self, rand): return timedelta(microseconds=micros) def _cache_repr(self): - return super()._cache_repr() + '(' + str(self._min_micros) + ',' + str(self._max_micros) + ')' + return (super()._cache_repr() + '(' + str(self._min_micros) + ',' + str(self._max_micros) + + ',' + str(self._start_index) + ',' + str(self._end_index) + ')') def start(self, rand): self._start(rand, lambda: self._gen_random(rand)) diff --git a/integration_tests/src/main/python/fastparquet_compatibility_test.py b/integration_tests/src/main/python/fastparquet_compatibility_test.py index 53a99d32bd2..4b0fc2827f4 100644 --- a/integration_tests/src/main/python/fastparquet_compatibility_test.py +++ b/integration_tests/src/main/python/fastparquet_compatibility_test.py @@ -30,6 +30,8 @@ def fastparquet_unavailable(): return False except ImportError: return True + except ValueError: # TODO: remove when https://github.com/NVIDIA/spark-rapids/issues/11070 is fixed + return True rebase_write_corrected_conf = { diff --git a/integration_tests/src/main/python/hive_parquet_write_test.py b/integration_tests/src/main/python/hive_parquet_write_test.py new file mode 100644 index 00000000000..f62439a39af --- /dev/null +++ b/integration_tests/src/main/python/hive_parquet_write_test.py @@ -0,0 +1,176 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from asserts import assert_gpu_and_cpu_sql_writes_are_equal_collect +from conftest import is_databricks_runtime +from data_gen import * +from hive_write_test import _restricted_timestamp +from marks import allow_non_gpu, ignore_order +from spark_session import with_cpu_session, is_before_spark_320, is_spark_350_or_later + +# Disable the meta conversion from Hive write to FrameData write in Spark, to test +# "GpuInsertIntoHiveTable" for Parquet write. 
+_write_to_hive_conf = {"spark.sql.hive.convertMetastoreParquet": False} + +_hive_basic_gens = [ + byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen, string_gen, boolean_gen, + DateGen(start=date(1590, 1, 1)), _restricted_timestamp(), + DecimalGen(precision=19, scale=1, nullable=True), + DecimalGen(precision=23, scale=5, nullable=True), + DecimalGen(precision=36, scale=3, nullable=True)] + +_hive_basic_struct_gen = StructGen( + [['c'+str(ind), c_gen] for ind, c_gen in enumerate(_hive_basic_gens)]) + +_hive_struct_gens = [ + _hive_basic_struct_gen, + StructGen([['child0', byte_gen], ['child1', _hive_basic_struct_gen]]), + StructGen([['child0', ArrayGen(short_gen)], ['child1', double_gen]])] + +_hive_array_gens = [ArrayGen(sub_gen) for sub_gen in _hive_basic_gens] + [ + ArrayGen(ArrayGen(short_gen, max_length=10), max_length=10), + ArrayGen(ArrayGen(string_gen, max_length=10), max_length=10), + ArrayGen(StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]]))] + +_hive_map_gens = [simple_string_to_string_map_gen] + [MapGen(f(nullable=False), f()) for f in [ + BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, + lambda nullable=True: _restricted_timestamp(nullable=nullable), + lambda nullable=True: DateGen(start=date(1590, 1, 1), nullable=nullable), + lambda nullable=True: DecimalGen(precision=19, scale=1, nullable=nullable), + lambda nullable=True: DecimalGen(precision=36, scale=5, nullable=nullable)]] + +_hive_write_gens = [_hive_basic_gens, _hive_struct_gens, _hive_array_gens, _hive_map_gens] + +# ProjectExec falls back on databricks due to no GPU version of "MapFromArrays". +fallback_nodes = ['ProjectExec'] if is_databricks_runtime() or is_spark_350_or_later() else [] + + +@allow_non_gpu(*(non_utc_allow + fallback_nodes)) +@ignore_order(local=True) +@pytest.mark.parametrize("is_ctas", [True, False], ids=['CTAS', 'CTTW']) +@pytest.mark.parametrize("gens", _hive_write_gens, ids=idfn) +def test_write_parquet_into_hive_table(spark_tmp_table_factory, is_ctas, gens): + + def gen_table(spark): + gen_list = [('_c' + str(i), gen) for i, gen in enumerate(gens)] + types_sql_str = ','.join('{} {}'.format( + name, gen.data_type.simpleString()) for name, gen in gen_list) + data_table = spark_tmp_table_factory.get() + gen_df(spark, gen_list).createOrReplaceTempView(data_table) + return data_table, types_sql_str + + (input_table, input_schema) = with_cpu_session(gen_table) + + def write_to_hive_sql(spark, output_table): + if is_ctas: + # Create Table As Select + return [ + "CREATE TABLE {} STORED AS PARQUET AS SELECT * FROM {}".format( + output_table, input_table) + ] + else: + # Create Table Then Write + return [ + "CREATE TABLE {} ({}) STORED AS PARQUET".format(output_table, input_schema), + "INSERT OVERWRITE TABLE {} SELECT * FROM {}".format(output_table, input_table) + ] + + assert_gpu_and_cpu_sql_writes_are_equal_collect( + spark_tmp_table_factory, + write_to_hive_sql, + _write_to_hive_conf) + + +@allow_non_gpu(*non_utc_allow) +@ignore_order(local=True) +@pytest.mark.parametrize("is_static", [True, False], ids=['Static_Partition', 'Dynamic_Partition']) +def test_write_parquet_into_partitioned_hive_table(spark_tmp_table_factory, is_static): + # Generate hive table in Parquet format + def gen_table(spark): + # gen_list = [('_c' + str(i), gen) for i, gen in enumerate(gens)] + dates = [date(2024, 2, 28), date(2024, 2, 27), date(2024, 2, 26)] + gen_list = [('a', int_gen), + ('b', long_gen), + ('c', short_gen), + ('d', string_gen), + ('part', 
SetValuesGen(DateType(), dates))] + data_table = spark_tmp_table_factory.get() + gen_df(spark, gen_list).createOrReplaceTempView(data_table) + return data_table + + input_table = with_cpu_session(gen_table) + + def partitioned_write_to_hive_sql(spark, output_table): + sql_create_part_table = ( + "CREATE TABLE {} (a INT, b LONG, c SHORT, d STRING) " + "PARTITIONED BY (part DATE) STORED AS PARQUET" + ).format(output_table) + if is_static: + return [ + # sql_1: Create partitioned hive table + sql_create_part_table, + # sql_2: Static partition write only to the partition part='2024-02-25' + "INSERT OVERWRITE TABLE {} PARTITION (part='2024-02-25') " + "SELECT a, b, c, d FROM {}".format(output_table, input_table) + ] + else: + return [ + # sql_1: Create partitioned hive table + sql_create_part_table, + # sql_2: Dynamic partition write + "INSERT OVERWRITE TABLE {} SELECT * FROM {}".format(output_table, input_table) + ] + all_confs = copy_and_update(_write_to_hive_conf, { + "hive.exec.dynamic.partition.mode": "nonstrict"}) + assert_gpu_and_cpu_sql_writes_are_equal_collect( + spark_tmp_table_factory, + partitioned_write_to_hive_sql, + all_confs) + + +zstd_param = pytest.param('ZSTD', + marks=pytest.mark.skipif(is_before_spark_320(), reason="zstd is not supported before 320")) + +@allow_non_gpu(*(non_utc_allow + fallback_nodes)) +@ignore_order(local=True) +@pytest.mark.parametrize("comp_type", ['UNCOMPRESSED', 'SNAPPY', zstd_param]) +def test_write_compressed_parquet_into_hive_table(spark_tmp_table_factory, comp_type): + # Generate hive table in Parquet format + def gen_table(spark): + gens = _hive_basic_gens + _hive_struct_gens + _hive_array_gens + _hive_map_gens + gen_list = [('_c' + str(i), gen) for i, gen in enumerate(gens)] + types_sql_str = ','.join('{} {}'.format( + name, gen.data_type.simpleString()) for name, gen in gen_list) + data_table = spark_tmp_table_factory.get() + gen_df(spark, gen_list).createOrReplaceTempView(data_table) + return data_table, types_sql_str + + input_table, schema_str = with_cpu_session(gen_table) + + def write_to_hive_sql(spark, output_table): + return [ + # Create table with compression type + "CREATE TABLE {} ({}) STORED AS PARQUET " + "TBLPROPERTIES ('parquet.compression'='{}')".format( + output_table, schema_str, comp_type), + # Insert into table + "INSERT OVERWRITE TABLE {} SELECT * FROM {}".format(output_table, input_table) + ] + + assert_gpu_and_cpu_sql_writes_are_equal_collect( + spark_tmp_table_factory, + write_to_hive_sql, + _write_to_hive_conf) diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py index 99a2d4241e8..38dab9e84a4 100644 --- a/integration_tests/src/main/python/parquet_write_test.py +++ b/integration_tests/src/main/python/parquet_write_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License.
@@ -224,6 +224,10 @@ def test_all_null_int96(spark_tmp_path): class AllNullTimestampGen(TimestampGen): def start(self, rand): self._start(rand, lambda : None) + + def _cache_repr(self): + return super()._cache_repr() + '(all_nulls)' + data_path = spark_tmp_path + '/PARQUET_DATA' confs = copy_and_update(writer_confs, {'spark.sql.parquet.outputTimestampType': 'INT96'}) assert_gpu_and_cpu_writes_are_equal_collect( diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py index 89929eb6762..18a83870d83 100644 --- a/integration_tests/src/main/python/regexp_test.py +++ b/integration_tests/src/main/python/regexp_test.py @@ -454,6 +454,7 @@ def test_rlike_rewrite_optimization(): 'rlike(a, "(.*)(abb)(.*)")', 'rlike(a, "^(abb)(.*)")', 'rlike(a, "^abb")', + 'rlike(a, "^.*(aaa)")', 'rlike(a, "\\\\A(abb)(.*)")', 'rlike(a, "\\\\Aabb")', 'rlike(a, "^(abb)\\\\Z")', @@ -466,7 +467,12 @@ def test_rlike_rewrite_optimization(): 'rlike(a, "ab[a-c]{3}")', 'rlike(a, "a[a-c]{1,3}")', 'rlike(a, "a[a-c]{1,}")', - 'rlike(a, "a[a-c]+")'), + 'rlike(a, "a[a-c]+")', + 'rlike(a, "(aaa|bbb|ccc)")', + 'rlike(a, ".*.*(aaa|bbb).*.*")', + 'rlike(a, "^.*(aaa|bbb|ccc)")', + 'rlike(a, "aaa|bbb")', + 'rlike(a, "aaa|(bbb|ccc)")'), conf=_regexp_conf) def test_regexp_replace_character_set_negated(): diff --git a/integration_tests/src/main/python/spark_session.py b/integration_tests/src/main/python/spark_session.py index 78e0b08a651..c55f1976497 100644 --- a/integration_tests/src/main/python/spark_session.py +++ b/integration_tests/src/main/python/spark_session.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -220,6 +220,9 @@ def is_spark_341(): def is_spark_350_or_later(): return spark_version() >= "3.5.0" +def is_spark_351_or_later(): + return spark_version() >= "3.5.1" + def is_spark_330(): return spark_version() == "3.3.0" diff --git a/jenkins/Jenkinsfile-blossom.premerge b/jenkins/Jenkinsfile-blossom.premerge index e7bb8af2cdd..d61638d901a 100755 --- a/jenkins/Jenkinsfile-blossom.premerge +++ b/jenkins/Jenkinsfile-blossom.premerge @@ -57,7 +57,8 @@ pipeline { } parameters { - string(name: 'REF', defaultValue: '', + // Put a default value for REF to avoid errors when running the pipeline manually + string(name: 'REF', defaultValue: 'main', description: 'Merged commit of specific PR') string(name: 'GITHUB_DATA', defaultValue: '', description: 'Json-formatted github data from upstream blossom-ci') @@ -273,7 +274,7 @@ git --no-pager diff --name-only HEAD \$BASE -- ${PREMERGE_DOCKERFILE} || true""" } } // end of Unit Test stage - stage('Databricks') { + stage('Databricks IT part1') { when { expression { db_build } } @@ -284,17 +285,42 @@ git --no-pager diff --name-only HEAD \$BASE -- ${PREMERGE_DOCKERFILE} || true""" propagate: false, wait: true, parameters: [ string(name: 'REF', value: params.REF), - string(name: 'GITHUB_DATA', value: params.GITHUB_DATA) + string(name: 'GITHUB_DATA', value: params.GITHUB_DATA), + string(name: 'TEST_MODE', value: 'CI_PART1') ]) if ( DBJob.result != 'SUCCESS' ) { // Output Databricks failure logs to uploaded onto the pre-merge PR print(DBJob.getRawBuild().getLog()) // Fail the pipeline - error "Databricks build result : " + DBJob.result + error "Databricks part1 result : " + DBJob.result } } } - } // end of Databricks + } // end of Databricks IT part1 + + stage('Databricks IT part2') { + when { + expression { db_build } + } + steps { + script { + githubHelper.updateCommitStatus("", "Running - includes databricks", GitHubCommitState.PENDING) + def DBJob = build(job: 'rapids-databricks_premerge-github', + propagate: false, wait: true, + parameters: [ + string(name: 'REF', value: params.REF), + string(name: 'GITHUB_DATA', value: params.GITHUB_DATA), + string(name: 'TEST_MODE', value: 'CI_PART2') + ]) + if ( DBJob.result != 'SUCCESS' ) { + // Output Databricks failure logs to be uploaded onto the pre-merge PR + print(DBJob.getRawBuild().getLog()) + // Fail the pipeline + error "Databricks part2 result : " + DBJob.result + } + } + } + } // end of Databricks IT part2 stage('Dummy stage: blue ocean log view') { steps { diff --git a/jenkins/Jenkinsfile-blossom.premerge-databricks b/jenkins/Jenkinsfile-blossom.premerge-databricks index a13170f7162..5b0a2bf1226 100644 --- a/jenkins/Jenkinsfile-blossom.premerge-databricks +++ b/jenkins/Jenkinsfile-blossom.premerge-databricks @@ -46,10 +46,13 @@ pipeline { } parameters { - string(name: 'REF', defaultValue: '', + // Put a default value for REF to avoid errors when running the pipeline manually + string(name: 'REF', defaultValue: 'main', description: 'Merged commit of specific PR') string(name: 'GITHUB_DATA', defaultValue: '', description: 'Json-formatted github data from upstream blossom-ci') + choice(name: 'TEST_MODE', choices: ['CI_PART1', 'CI_PART2'], + description: 'Separate integration tests into 2 parts, and run each part in parallel') } environment { @@ -177,7 +180,7 @@ void databricksBuild() { container('cpu') { try { withCredentials([file(credentialsId: 'SPARK_DATABRICKS_PRIVKEY', variable: 'DATABRICKS_PRIVKEY')]) { - def TEST_PARAMS = " 
-w $DATABRICKS_HOST -t $DATABRICKS_TOKEN -c $CLUSTER_ID -e TEST_MODE=$TEST_MODE" + " -p $DATABRICKS_PRIVKEY -l ./jenkins/databricks/test.sh -v $BASE_SPARK_VERSION -d /home/ubuntu/test.sh" if (params.SPARK_CONF) { TEST_PARAMS += " -f ${params.SPARK_CONF}" diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh index d5c440bfbb2..3c3e73ab582 100755 --- a/jenkins/databricks/init_cudf_udf.sh +++ b/jenkins/databricks/init_cudf_udf.sh @@ -20,7 +20,7 @@ set -ex -CUDF_VER=${CUDF_VER:-24.06} # TODO: https://github.com/NVIDIA/spark-rapids/issues/ +CUDF_VER=${CUDF_VER:-24.08} CUDA_VER=${CUDA_VER:-11.8} # Need to explicitly add conda into PATH environment, to activate conda environment. diff --git a/jenkins/databricks/install_deps.py b/jenkins/databricks/install_deps.py index be5cb9bc040..8d21a4f9556 100644 --- a/jenkins/databricks/install_deps.py +++ b/jenkins/databricks/install_deps.py @@ -115,8 +115,10 @@ def define_deps(spark_version, scala_version): f'{prefix_ws_sp_mvn_hadoop}--org.json4s--json4s-jackson_{scala_version}--org.json4s__json4s-jackson_{scala_version}__*.jar'), Artifact('org.javaassist', 'javaassist', f'{prefix_ws_sp_mvn_hadoop}--org.javassist--javassist--org.javassist__javassist__*.jar'), - Artifact('com.fasterxml.jackson.core', 'jackson-core', + Artifact('com.fasterxml.jackson.core', 'jackson-databind', f'{prefix_ws_sp_mvn_hadoop}--com.fasterxml.jackson.core--jackson-databind--com.fasterxml.jackson.core__jackson-databind__*.jar'), + Artifact('com.fasterxml.jackson.core', 'jackson-core', + f'{prefix_ws_sp_mvn_hadoop}--com.fasterxml.jackson.core--jackson-core--com.fasterxml.jackson.core__jackson-core__*.jar'), Artifact('com.fasterxml.jackson.core', 'jackson-annotations', f'{prefix_ws_sp_mvn_hadoop}--com.fasterxml.jackson.core--jackson-annotations--com.fasterxml.jackson.core__jackson-annotations__*.jar'), Artifact('org.apache.spark', f'spark-avro_{scala_version}', diff --git a/jenkins/databricks/test.sh b/jenkins/databricks/test.sh index 404dcd97578..c966d5a92f7 100755 --- a/jenkins/databricks/test.sh +++ b/jenkins/databricks/test.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -59,15 +59,13 @@ IS_SPARK_321_OR_LATER=0 # - DELTA_LAKE_ONLY: delta_lake tests only # - MULTITHREADED_SHUFFLE: shuffle tests only # - PYARROW_ONLY: pyarrow tests only +# - CI_PART1 or CI_PART2 : part1 or part2 of the tests run in parallel from CI TEST_MODE=${TEST_MODE:-'DEFAULT'} # Classloader config is here to work around classloader issues with # --packages in distributed setups, should be fixed by # https://github.com/NVIDIA/spark-rapids/pull/5646 -# Increase driver memory as Delta Lake tests can slowdown with default 1G (possibly due to caching?) 
-DELTA_LAKE_CONFS="--driver-memory 2g" - # Enable event log for qualification & profiling tools testing export PYSP_TEST_spark_eventLog_enabled=true mkdir -p /tmp/spark-events @@ -89,32 +87,30 @@ run_pyarrow_tests() { bash integration_tests/run_pyspark_from_build.sh -m pyarrow_test --pyarrow_test --runtime_env="databricks" --test_type=$TEST_TYPE } -## limit parallelism to avoid OOM kill -export TEST_PARALLEL=${TEST_PARALLEL:-4} - -if [[ $TEST_MODE == "DEFAULT" ]]; then +## Separate the integration tests into "CI_PART1" and "CI_PART2", run each part in parallel on separate Databricks clusters to speed up the testing process. +if [[ $TEST_MODE == "DEFAULT" || $TEST_MODE == "CI_PART1" ]]; then bash integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE +fi +## Run tests with jars building from the spark-rapids source code +if [[ "$(pwd)" == "$SOURCE_PATH" ]]; then ## Run cache tests - if [[ "$IS_SPARK_321_OR_LATER" -eq "1" ]]; then + if [[ "$IS_SPARK_321_OR_LATER" -eq "1" && ("$TEST_MODE" == "DEFAULT" || $TEST_MODE == "CI_PART2") ]]; then PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \ bash integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test fi -fi -## Run tests with jars building from the spark-rapids source code -if [ "$(pwd)" == "$SOURCE_PATH" ]; then - if [[ "$TEST_MODE" == "DEFAULT" || "$TEST_MODE" == "DELTA_LAKE_ONLY" ]]; then + if [[ "$TEST_MODE" == "DEFAULT" || $TEST_MODE == "CI_PART2" || "$TEST_MODE" == "DELTA_LAKE_ONLY" ]]; then ## Run Delta Lake tests - SPARK_SUBMIT_FLAGS="$SPARK_CONF $DELTA_LAKE_CONFS" TEST_PARALLEL=1 \ + DRIVER_MEMORY="4g" \ bash integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "delta_lake" --delta_lake --test_type=$TEST_TYPE fi - if [[ "$TEST_MODE" == "DEFAULT" || "$TEST_MODE" == "MULTITHREADED_SHUFFLE" ]]; then + if [[ "$TEST_MODE" == "DEFAULT" || $TEST_MODE == "CI_PART2" || "$TEST_MODE" == "MULTITHREADED_SHUFFLE" ]]; then ## Mutithreaded Shuffle test rapids_shuffle_smoke_test fi - if [[ "$TEST_MODE" == "DEFAULT" || "$TEST_MODE" == "PYARROW_ONLY" ]]; then + if [[ "$TEST_MODE" == "DEFAULT" || $TEST_MODE == "CI_PART2" || "$TEST_MODE" == "PYARROW_ONLY" ]]; then # Pyarrow tests run_pyarrow_tests fi diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index 883b3f3acfc..697722c0138 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -78,7 +78,7 @@ mvn_verify() { # Here run Python integration tests tagged with 'premerge_ci_1' only, that would help balance test duration and memory # consumption from two k8s pods running in parallel, which executes 'mvn_verify()' and 'ci_2()' respectively. $MVN_CMD -B $MVN_URM_MIRROR $PREMERGE_PROFILES clean verify -Dpytest.TEST_TAGS="premerge_ci_1" \ - -Dpytest.TEST_TYPE="pre-commit" -Dpytest.TEST_PARALLEL=4 -Dcuda.version=$CLASSIFIER + -Dpytest.TEST_TYPE="pre-commit" -Dcuda.version=$CLASSIFIER # The jacoco coverage should have been collected, but because of how the shade plugin # works and jacoco we need to clean some things up so jacoco will only report for the @@ -162,7 +162,6 @@ ci_2() { $MVN_CMD -U -B $MVN_URM_MIRROR clean package $MVN_BUILD_ARGS -DskipTests=true export TEST_TAGS="not premerge_ci_1" export TEST_TYPE="pre-commit" - export TEST_PARALLEL=5 # Download a Scala 2.12 build of spark prepare_spark $SPARK_VER 2.12 @@ -206,7 +205,6 @@ ci_scala213() { cd .. 
# Run integration tests in the project root dir to leverage test cases and resource files export TEST_TAGS="not premerge_ci_1" export TEST_TYPE="pre-commit" - export TEST_PARALLEL=5 # SPARK_HOME (and related) must be set to a Spark built with Scala 2.13 SPARK_HOME=$SPARK_HOME PYTHONPATH=$PYTHONPATH \ ./integration_tests/run_pyspark_from_build.sh diff --git a/jenkins/version-def.sh b/jenkins/version-def.sh index d3c01e1eba4..dbad6d6fd94 100755 --- a/jenkins/version-def.sh +++ b/jenkins/version-def.sh @@ -27,7 +27,7 @@ done IFS=$PRE_IFS -CUDF_VER=${CUDF_VER:-"24.06.0-SNAPSHOT"} # TODO: https://github.com/NVIDIA/spark-rapids/issues/ +CUDF_VER=${CUDF_VER:-"24.08.0-SNAPSHOT"} CUDA_CLASSIFIER=${CUDA_CLASSIFIER:-"cuda11"} CLASSIFIER=${CLASSIFIER:-"$CUDA_CLASSIFIER"} # default as CUDA_CLASSIFIER for compatibility PROJECT_VER=${PROJECT_VER:-"24.08.0-SNAPSHOT"} diff --git a/pom.xml b/pom.xml index df010a7589e..06947857521 100644 --- a/pom.xml +++ b/pom.xml @@ -719,9 +719,8 @@ spark${buildver} cuda11 ${cuda.version} - - 24.06.0-SNAPSHOT - 24.06.0-SNAPSHOT + 24.08.0-SNAPSHOT + 24.08.0-SNAPSHOT 2.12 2.8.0 incremental @@ -887,6 +886,7 @@ 340, 341, 342, + 343, 350, 351 diff --git a/scala2.13/pom.xml b/scala2.13/pom.xml index 711872e8d54..cbc4aecbd26 100644 --- a/scala2.13/pom.xml +++ b/scala2.13/pom.xml @@ -719,9 +719,8 @@ spark${buildver} cuda11 ${cuda.version} - - 24.06.0-SNAPSHOT - 24.06.0-SNAPSHOT + 24.08.0-SNAPSHOT + 24.08.0-SNAPSHOT 2.13 2.8.0 incremental @@ -887,6 +886,7 @@ 340, 341, 342, + 343, 350, 351 diff --git a/scala2.13/shim-deps/databricks/pom.xml b/scala2.13/shim-deps/databricks/pom.xml index b342f381c71..a0459901079 100644 --- a/scala2.13/shim-deps/databricks/pom.xml +++ b/scala2.13/shim-deps/databricks/pom.xml @@ -105,6 +105,12 @@ ${spark.version} compile + + com.fasterxml.jackson.core + jackson-databind + ${spark.version} + compile + com.fasterxml.jackson.core jackson-annotations @@ -286,4 +292,4 @@ compile - \ No newline at end of file + diff --git a/shim-deps/databricks/pom.xml b/shim-deps/databricks/pom.xml index bef8a90d227..22842b0f7c0 100644 --- a/shim-deps/databricks/pom.xml +++ b/shim-deps/databricks/pom.xml @@ -105,6 +105,12 @@ ${spark.version} compile + + com.fasterxml.jackson.core + jackson-databind + ${spark.version} + compile + com.fasterxml.jackson.core jackson-annotations @@ -286,4 +292,4 @@ compile - \ No newline at end of file + diff --git a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala index 36abc75ba87..2d7a51c4e43 100644 --- a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala +++ b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimLoader.scala @@ -40,19 +40,19 @@ import org.apache.spark.util.MutableURLClassLoader "parallel worlds" in the JDK's com.sun.istack.internal.tools.ParallelWorldClassLoader parlance 1. a few publicly documented classes in the conventional layout at the top 2. a large fraction of classes whose bytecode is identical under all supported Spark versions - in spark3xx-common + in spark-shared 3. 
a smaller fraction of classes that differ under one of the supported Spark versions com/nvidia/spark/SQLPlugin.class - spark3xx-common/com/nvidia/spark/rapids/CastExprMeta.class + spark-shared/com/nvidia/spark/rapids/CastExprMeta.class spark311/org/apache/spark/sql/rapids/GpuUnaryMinus.class spark320/org/apache/spark/sql/rapids/GpuUnaryMinus.class Each shim can see a consistent parallel world without conflicts by referencing only one conflicting directory. E.g., Spark 3.2.0 Shim will use - jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark3xx-common/ + jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark-shared/ jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark320/ Spark 3.1.1 will use - jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark3xx-common/ + jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark-shared/ jar:file:/home/spark/rapids-4-spark_2.12-24.08.0.jar!/spark311/ Using these Jar URL's allows referencing different bytecode produced from identical sources by incompatible Scala / Spark dependencies. @@ -67,7 +67,7 @@ object ShimLoader extends Logging { new URL(rootUrlStr) } - private val shimCommonURL = new URL(s"${shimRootURL.toString}spark3xx-common/") + private val shimCommonURL = new URL(s"${shimRootURL.toString}spark-shared/") @volatile private var shimProviderClass: String = _ @volatile private var shimProvider: SparkShimServiceProvider = _ @volatile private var shimURL: URL = _ diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/CudfUnsafeRow.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/CudfUnsafeRow.java deleted file mode 100644 index d25500a77b2..00000000000 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/CudfUnsafeRow.java +++ /dev/null @@ -1,400 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.nvidia.spark.rapids; - -import org.apache.spark.sql.catalyst.InternalRow; -import org.apache.spark.sql.catalyst.expressions.Attribute; -import org.apache.spark.sql.catalyst.expressions.SpecializedGettersReader; -import org.apache.spark.sql.catalyst.util.ArrayData; -import org.apache.spark.sql.catalyst.util.MapData; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.Decimal; -import org.apache.spark.unsafe.Platform; -import org.apache.spark.unsafe.array.ByteArrayMethods; -import org.apache.spark.unsafe.hash.Murmur3_x86_32; -import org.apache.spark.unsafe.types.CalendarInterval; -import org.apache.spark.unsafe.types.UTF8String; - -import java.util.Arrays; - -/** - * This is an InternalRow implementation based off of UnsafeRow, but follows a format for use with - * the row format supported by cudf. In this format each column is padded to match the alignment - * needed by it, and validity is placed at the end one byte at a time. 
- * - * It also supports remapping the columns so that if the columns were re-ordered to reduce packing - * in the format, then they can be mapped back to their original positions. - * - * This class is likely to go away once we move to code generation when going directly to an - * UnsafeRow through code generation. This is rather difficult because of some details in how - * UnsafeRow works. - */ -public final class CudfUnsafeRow extends InternalRow { - public static int alignOffset(int offset, int alignment) { - return (offset + alignment - 1) & -alignment; - } - - public static int calculateBitSetWidthInBytes(int numFields) { - return (numFields + 7)/ 8; - } - - public static int getRowSizeEstimate(Attribute[] attributes) { - // This needs to match what is in cudf and what is in the constructor. - int offset = 0; - for (Attribute attr : attributes) { - int length = GpuColumnVector.getNonNestedRapidsType(attr.dataType()).getSizeInBytes(); - offset = alignOffset(offset, length); - offset += length; - } - int bitSetWidthInBytes = calculateBitSetWidthInBytes(attributes.length); - // Each row is 64-bit aligned - return alignOffset(offset + bitSetWidthInBytes, 8); - } - - ////////////////////////////////////////////////////////////////////////////// - // Private fields and methods - ////////////////////////////////////////////////////////////////////////////// - - /** - * Address of where the row is stored in off heap memory. - */ - private long address; - - /** - * For each column the starting location to read from. The index to the is the position in - * the row bytes, not the user faceing ordinal. - */ - private int[] startOffsets; - - /** - * At what point validity data starts. - */ - private int fixedWidthSizeInBytes; - - /** - * The size of this row's backing data, in bytes. - */ - private int sizeInBytes; - - /** - * A mapping from the user facing ordinal to the index in the underlying row. - */ - private int[] remapping; - - /** - * Get the address where a field is stored. - * @param ordinal the user facing ordinal. - * @return the address of the field. - */ - private long getFieldAddressFromOrdinal(int ordinal) { - assertIndexIsValid(ordinal); - int i = remapping[ordinal]; - return address + startOffsets[i]; - } - - /** - * Verify that index is valid for this row. - * @param index in this case the index can be either the user facing ordinal or the index into the - * row. - */ - private void assertIndexIsValid(int index) { - assert index >= 0 : "index (" + index + ") should >= 0"; - assert index < startOffsets.length : "index (" + index + ") should < " + startOffsets.length; - } - - ////////////////////////////////////////////////////////////////////////////// - // Public methods - ////////////////////////////////////////////////////////////////////////////// - - /** - * Construct a new Row. The resulting row won't be usable until `pointTo()` has been called, - * since the value returned by this constructor is equivalent to a null pointer. - * - * @param attributes the schema of what this will hold. This is the schema of the underlying - * row, so if columns were re-ordered it is the attributes of the reordered - * data. - * @param remapping a mapping from the user requested column to the underlying column in the - * backing row. 
- */ - public CudfUnsafeRow(Attribute[] attributes, int[] remapping) { - int offset = 0; - startOffsets = new int[attributes.length]; - for (int i = 0; i < attributes.length; i++) { - Attribute attr = attributes[i]; - int length = GpuColumnVector.getNonNestedRapidsType(attr.dataType()).getSizeInBytes(); - assert length > 0 : "Only fixed width types are currently supported."; - offset = alignOffset(offset, length); - startOffsets[i] = offset; - offset += length; - } - fixedWidthSizeInBytes = offset; - this.remapping = remapping; - assert startOffsets.length == remapping.length; - } - - // for serializer - public CudfUnsafeRow() {} - - @Override - public int numFields() { return startOffsets.length; } - - /** - * Update this CudfUnsafeRow to point to different backing data. - * - * @param address the address in host memory for this. We should change this to be a - * MemoryBuffer class or something like that. - * @param sizeInBytes the size of this row's backing data, in bytes - */ - public void pointTo(long address, int sizeInBytes) { - assert startOffsets != null && startOffsets.length > 0 : "startOffsets not properly initialized"; - assert sizeInBytes % 8 == 0 : "sizeInBytes (" + sizeInBytes + ") should be a multiple of 8"; - this.address = address; - this.sizeInBytes = sizeInBytes; - } - - @Override - public void update(int ordinal, Object value) { - throw new UnsupportedOperationException(); - } - - @Override - public Object get(int ordinal, DataType dataType) { - // Don't remap the ordinal because it will be remapped in each of the other backing APIs - return SpecializedGettersReader.read(this, ordinal, dataType, true, true); - } - - @Override - public boolean isNullAt(int ordinal) { - int i = remapping[ordinal]; - assertIndexIsValid(i); - int validByteIndex = i / 8; - int validBitIndex = i % 8; - byte b = Platform.getByte(null, address + fixedWidthSizeInBytes + validByteIndex); - return ((1 << validBitIndex) & b) == 0; - } - - @Override - public void setNullAt(int ordinal) { - int i = remapping[ordinal]; - assertIndexIsValid(i); - int validByteIndex = i / 8; - int validBitIndex = i % 8; - byte b = Platform.getByte(null, address + fixedWidthSizeInBytes + validByteIndex); - b = (byte)((b & ~(1 << validBitIndex)) & 0xFF); - Platform.putByte(null, address + fixedWidthSizeInBytes + validByteIndex, b); - } - - @Override - public boolean getBoolean(int ordinal) { - return Platform.getBoolean(null, getFieldAddressFromOrdinal(ordinal)); - } - - @Override - public byte getByte(int ordinal) { - return Platform.getByte(null, getFieldAddressFromOrdinal(ordinal)); - } - - @Override - public short getShort(int ordinal) { - return Platform.getShort(null, getFieldAddressFromOrdinal(ordinal)); - } - - @Override - public int getInt(int ordinal) { - return Platform.getInt(null, getFieldAddressFromOrdinal(ordinal)); - } - - @Override - public long getLong(int ordinal) { - return Platform.getLong(null, getFieldAddressFromOrdinal(ordinal)); - } - - @Override - public float getFloat(int ordinal) { - return Platform.getFloat(null, getFieldAddressFromOrdinal(ordinal)); - } - - @Override - public double getDouble(int ordinal) { - return Platform.getDouble(null, getFieldAddressFromOrdinal(ordinal)); - } - - @Override - public Decimal getDecimal(int ordinal, int precision, int scale) { - if (isNullAt(ordinal)) { - return null; - } - if (precision <= Decimal.MAX_INT_DIGITS()) { - return Decimal.createUnsafe(getInt(ordinal), precision, scale); - } else if (precision <= Decimal.MAX_LONG_DIGITS()) { - return 
Decimal.createUnsafe(getLong(ordinal), precision, scale); - } else { - throw new IllegalArgumentException("NOT IMPLEMENTED YET"); -// byte[] bytes = getBinary(ordinal); -// BigInteger bigInteger = new BigInteger(bytes); -// BigDecimal javaDecimal = new BigDecimal(bigInteger, scale); -// return Decimal.apply(javaDecimal, precision, scale); - } - } - - @Override - public UTF8String getUTF8String(int ordinal) { -// if (isNullAt(ordinal)) return null; -// final long offsetAndSize = getLong(ordinal); -// final int offset = (int) (offsetAndSize >> 32); -// final int size = (int) offsetAndSize; -// return UTF8String.fromAddress(null, address + offset, size); - throw new IllegalArgumentException("NOT IMPLEMENTED YET"); - } - - @Override - public byte[] getBinary(int ordinal) { -// if (isNullAt(ordinal)) { -// return null; -// } else { -// final long offsetAndSize = getLong(ordinal); -// final int offset = (int) (offsetAndSize >> 32); -// final int size = (int) offsetAndSize; -// final byte[] bytes = new byte[size]; -// Platform.copyMemory( -// null, -// address + offset, -// bytes, -// Platform.BYTE_ARRAY_OFFSET, -// size -// ); -// return bytes; -// } - throw new IllegalArgumentException("NOT IMPLEMENTED YET"); - } - - @Override - public CalendarInterval getInterval(int ordinal) { -// if (isNullAt(ordinal)) { -// return null; -// } else { -// final long offsetAndSize = getLong(ordinal); -// final int offset = (int) (offsetAndSize >> 32); -// final int months = Platform.getInt(baseObject, address + offset); -// final int days = Platform.getInt(baseObject, address + offset + 4); -// final long microseconds = Platform.getLong(baseObject, address + offset + 8); -// return new CalendarInterval(months, days, microseconds); -// } - throw new IllegalArgumentException("NOT IMPLEMENTED YET"); - } - - @Override - public CudfUnsafeRow getStruct(int ordinal, int numFields) { -// if (isNullAt(ordinal)) { -// return null; -// } else { -// final long offsetAndSize = getLong(ordinal); -// final int offset = (int) (offsetAndSize >> 32); -// final int size = (int) offsetAndSize; -// final UnsafeRow row = new UnsafeRow(numFields); -// row.pointTo(baseObject, address + offset, size); -// return row; -// } - throw new IllegalArgumentException("NOT IMPLEMENTED YET"); - } - - @Override - public ArrayData getArray(int ordinal) { -// if (isNullAt(ordinal)) { -// return null; -// } else { -// final long offsetAndSize = getLong(ordinal); -// final int offset = (int) (offsetAndSize >> 32); -// final int size = (int) offsetAndSize; -// final UnsafeArrayData array = new UnsafeArrayData(); -// array.pointTo(baseObject, address + offset, size); -// return array; -// } - throw new IllegalArgumentException("NOT IMPLEMENTED YET"); - } - - @Override - public MapData getMap(int ordinal) { -// if (isNullAt(ordinal)) { -// return null; -// } else { -// final long offsetAndSize = getLong(ordinal); -// final int offset = (int) (offsetAndSize >> 32); -// final int size = (int) offsetAndSize; -// final UnsafeMapData map = new UnsafeMapData(); -// map.pointTo(baseObject, address + offset, size); -// return map; -// } - throw new IllegalArgumentException("NOT IMPLEMENTED YET"); - } - - /** - * Copies this row, returning a self-contained UnsafeRow that stores its data in an internal - * byte array rather than referencing data stored in a data page. 
- */ - @Override - public CudfUnsafeRow copy() { -// UnsafeRow rowCopy = new UnsafeRow(numFields); -// final byte[] rowDataCopy = new byte[sizeInBytes]; -// Platform.copyMemory( -// baseObject, -// address, -// rowDataCopy, -// Platform.BYTE_ARRAY_OFFSET, -// sizeInBytes -// ); -// rowCopy.pointTo(rowDataCopy, Platform.BYTE_ARRAY_OFFSET, sizeInBytes); -// return rowCopy; - throw new IllegalArgumentException("NOT IMPLEMENTED YET"); - } - - @Override - public int hashCode() { - return Murmur3_x86_32.hashUnsafeWords(null, address, sizeInBytes, 42); - } - - @Override - public boolean equals(Object other) { - if (other instanceof CudfUnsafeRow) { - CudfUnsafeRow o = (CudfUnsafeRow) other; - return (sizeInBytes == o.sizeInBytes) && - ByteArrayMethods.arrayEquals(null, address, null, o.address, sizeInBytes) && - Arrays.equals(remapping, o.remapping); - } - return false; - } - - // This is for debugging - @Override - public String toString() { - StringBuilder build = new StringBuilder("["); - for (int i = 0; i < sizeInBytes; i += 8) { - if (i != 0) build.append(','); - build.append(java.lang.Long.toHexString(Platform.getLong(null, address + i))); - } - build.append(']'); - build.append(" remapped with "); - build.append(Arrays.toString(remapping)); - return build.toString(); - } - - @Override - public boolean anyNull() { - throw new IllegalArgumentException("NOT IMPLEMENTED YET"); -// return BitSetMethods.anySet(baseObject, address, bitSetWidthInBytes / 8); - } -} \ No newline at end of file diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/InternalRowToColumnarBatchIterator.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/InternalRowToColumnarBatchIterator.java index 9e532ba394a..0aa3f0978e9 100644 --- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/InternalRowToColumnarBatchIterator.java +++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/InternalRowToColumnarBatchIterator.java @@ -35,6 +35,7 @@ import ai.rapids.cudf.NvtxRange; import ai.rapids.cudf.Table; import com.nvidia.spark.rapids.jni.RowConversion; +import com.nvidia.spark.rapids.shims.CudfUnsafeRow; import org.apache.spark.TaskContext; import org.apache.spark.sql.catalyst.InternalRow; diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarToRowExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarToRowExec.scala index 38b235a36f6..694d6dabbd6 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarToRowExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuColumnarToRowExec.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
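// Sketch (not part of the patch): the CudfUnsafeRow removed above is replaced by
// com.nvidia.spark.rapids.shims.CudfUnsafeRow, which the import hunks below pick up.
// The fixed-width row layout the removed class documents is restated here in Scala for
// reference; the shim implementation may differ in detail, and `fieldWidths` is a
// hypothetical stand-in for the per-column byte widths derived from the schema.
object CudfRowLayoutSketch {
  // Round `offset` up to the next multiple of `alignment` (alignment is a power of two).
  def alignOffset(offset: Int, alignment: Int): Int =
    (offset + alignment - 1) & -alignment

  // Columns are packed at their own alignment, validity follows at one byte per
  // eight fields, and the whole row is padded to a multiple of 8 bytes.
  def rowSizeEstimate(fieldWidths: Seq[Int]): Int = {
    val dataEnd = fieldWidths.foldLeft(0)((offset, width) => alignOffset(offset, width) + width)
    val validityBytes = (fieldWidths.length + 7) / 8
    alignOffset(dataEnd + validityBytes, 8)
  }
}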
@@ -25,7 +25,7 @@ import com.nvidia.spark.rapids.RapidsPluginImplicits._ import com.nvidia.spark.rapids.RmmRapidsRetryIterator.{splitSpillableInHalfByRows, withRetryNoSplit} import com.nvidia.spark.rapids.ScalableTaskCompletion.onTaskCompletion import com.nvidia.spark.rapids.jni.RowConversion -import com.nvidia.spark.rapids.shims.ShimUnaryExecNode +import com.nvidia.spark.rapids.shims.{CudfUnsafeRow, ShimUnaryExecNode} import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDataWritingCommandExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDataWritingCommandExec.scala index 5a54d0b2f66..019f9b2e6b0 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDataWritingCommandExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuDataWritingCommandExec.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,7 +31,7 @@ import org.apache.spark.sql.execution.command.DataWritingCommand import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.rapids.GpuWriteJobStatsTracker -import org.apache.spark.sql.rapids.execution.TrampolineUtil +import org.apache.spark.sql.rapids.shims.RapidsErrorUtils import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.SerializableConfiguration @@ -84,10 +84,9 @@ object GpuDataWritingCommand { if (fs.exists(filePath) && fs.getFileStatus(filePath).isDirectory && fs.listStatus(filePath).length != 0) { - TrampolineUtil.throwAnalysisException( - s"CREATE-TABLE-AS-SELECT cannot create table with location to a non-empty directory " + - s"${tablePath} . To allow overwriting the existing non-empty directory, " + - s"set '$allowNonEmptyLocationInCTASKey' to true.") + throw RapidsErrorUtils. 
+ createTableAsSelectWithNonEmptyDirectoryError(tablePath.toString, + allowNonEmptyLocationInCTASKey) } } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 1a799b43d1b..ee66b4d19df 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -3797,14 +3797,6 @@ object GpuOverrides extends Logging { TypeSig.ARRAY.nested(TypeSig.all)), (e, conf, p, r) => new GpuGetArrayStructFieldsMeta(e, conf, p, r) ), - expr[RaiseError]( - "Throw an exception", - ExprChecks.unaryProject( - TypeSig.NULL, TypeSig.NULL, - TypeSig.STRING, TypeSig.STRING), - (a, conf, p, r) => new UnaryExprMeta[RaiseError](a, conf, p, r) { - override def convertToGpu(child: Expression): GpuExpression = GpuRaiseError(child) - }), expr[DynamicPruningExpression]( "Dynamic pruning expression marker", ExprChecks.unaryProject(TypeSig.all, TypeSig.all, TypeSig.BOOLEAN, TypeSig.BOOLEAN), @@ -3820,7 +3812,8 @@ object GpuOverrides extends Logging { val expressions: Map[Class[_ <: Expression], ExprRule[_ <: Expression]] = commonExpressions ++ TimeStamp.getExprs ++ GpuHiveOverrides.exprs ++ ZOrderRules.exprs ++ DecimalArithmeticOverrides.exprs ++ - BloomFilterShims.exprs ++ InSubqueryShims.exprs ++ SparkShimImpl.getExprs + BloomFilterShims.exprs ++ InSubqueryShims.exprs ++ RaiseErrorShim.exprs ++ + SparkShimImpl.getExprs def wrapScan[INPUT <: Scan]( scan: INPUT, diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala index e8ae977b1f6..25105386b3d 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
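// The GpuParquetWriter hunk just below switches from new ColumnarBatch(cols) to
// new ColumnarBatch(cols, batch.numRows()). A minimal sketch of why, using only
// Spark's public ColumnarBatch API (`cols` and `numRows` are hypothetical inputs,
// not names from the plugin): the single-argument constructor leaves the row count
// at 0 until setNumRows is called, so a batch rebuilt from transformed columns must
// carry the count across explicitly.
import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch}

object RebatchSketch {
  def rebuild(cols: Array[ColumnVector], numRows: Int): ColumnarBatch =
    new ColumnarBatch(cols, numRows) // keeps the row count even when no column is consulted
}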
@@ -321,7 +321,7 @@ class GpuParquetWriter( new GpuColumnVector(cv.dataType, deepTransformColumn(cv.getBase, cv.dataType)) .asInstanceOf[org.apache.spark.sql.vectorized.ColumnVector] } - new ColumnarBatch(transformedCols) + new ColumnarBatch(transformedCols, batch.numRows()) } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuRowToColumnarExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuRowToColumnarExec.scala index 99f17cf341a..51b6645d7b7 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuRowToColumnarExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuRowToColumnarExec.scala @@ -19,7 +19,7 @@ package com.nvidia.spark.rapids import ai.rapids.cudf.{NvtxColor, NvtxRange} import com.nvidia.spark.rapids.Arm.withResource import com.nvidia.spark.rapids.GpuColumnVector.GpuColumnarBatchBuilder -import com.nvidia.spark.rapids.shims.{GpuTypeShims, ShimUnaryExecNode} +import com.nvidia.spark.rapids.shims.{CudfUnsafeRow, GpuTypeShims, ShimUnaryExecNode} import org.apache.spark.TaskContext import org.apache.spark.broadcast.Broadcast diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuRunnableCommandExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuRunnableCommandExec.scala index e3869960fc4..43bd593c0b5 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuRunnableCommandExec.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuRunnableCommandExec.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,7 +31,7 @@ import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.rapids.GpuWriteJobStatsTracker -import org.apache.spark.sql.rapids.execution.TrampolineUtil +import org.apache.spark.sql.rapids.shims.RapidsErrorUtils import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.SerializableConfiguration @@ -82,10 +82,9 @@ object GpuRunnableCommand { if (fs.exists(filePath) && fs.getFileStatus(filePath).isDirectory && fs.listStatus(filePath).length != 0) { - TrampolineUtil.throwAnalysisException( - s"CREATE-TABLE-AS-SELECT cannot create table with location to a non-empty directory " + - s"${tablePath} . To allow overwriting the existing non-empty directory, " + - s"set '$allowNonEmptyLocationInCTASKey' to true.") + throw RapidsErrorUtils. 
+ createTableAsSelectWithNonEmptyDirectoryError(tablePath.toString, + allowNonEmptyLocationInCTASKey) } } } diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index 45d5e07dd73..1ca155f8a52 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -17,7 +17,6 @@ package com.nvidia.spark.rapids import java.sql.SQLException -import scala.collection import scala.collection.mutable.ListBuffer import com.nvidia.spark.rapids.GpuOverrides.regexMetaChars @@ -73,7 +72,7 @@ class RegexParser(pattern: String) { sequence } - def parseReplacementBase(): RegexAST = { + private def parseReplacementBase(): RegexAST = { consume() match { case '\\' => parseBackrefOrEscaped() @@ -782,6 +781,7 @@ class CudfRegexTranspiler(mode: RegexMode) { } } + @scala.annotation.tailrec private def isRepetition(e: RegexAST, checkZeroLength: Boolean): Boolean = { e match { case RegexRepetition(_, _) if !checkZeroLength => true @@ -1648,6 +1648,7 @@ class CudfRegexTranspiler(mode: RegexMode) { } } + @scala.annotation.tailrec private def isEntirely(regex: RegexAST, f: RegexAST => Boolean): Boolean = { regex match { case RegexSequence(parts) if parts.nonEmpty => @@ -1672,6 +1673,7 @@ class CudfRegexTranspiler(mode: RegexMode) { }) } + @scala.annotation.tailrec private def beginsWith(regex: RegexAST, f: RegexAST => Boolean): Boolean = { regex match { case RegexSequence(parts) if parts.nonEmpty => @@ -1687,6 +1689,7 @@ class CudfRegexTranspiler(mode: RegexMode) { } + @scala.annotation.tailrec private def endsWith(regex: RegexAST, f: RegexAST => Boolean): Boolean = { regex match { case RegexSequence(parts) if parts.nonEmpty => @@ -1760,7 +1763,7 @@ sealed case class RegexSequence(parts: ListBuffer[RegexAST]) extends RegexAST { } sealed case class RegexGroup(capture: Boolean, term: RegexAST, - val lookahead: Option[RegexLookahead]) + lookahead: Option[RegexLookahead]) extends RegexAST { def this(capture: Boolean, term: RegexAST) = { this(capture, term, None) @@ -2023,11 +2026,13 @@ object RegexOptimizationType { case class Contains(literal: String) extends RegexOptimizationType case class PrefixRange(literal: String, length: Int, rangeStart: Int, rangeEnd: Int) extends RegexOptimizationType + case class MultipleContains(literals: Seq[String]) extends RegexOptimizationType case object NoOptimization extends RegexOptimizationType } object RegexRewrite { + @scala.annotation.tailrec private def removeBrackets(astLs: collection.Seq[RegexAST]): collection.Seq[RegexAST] = { astLs match { case collection.Seq(RegexGroup(_, term, None)) => removeBrackets(term.children()) @@ -2044,7 +2049,7 @@ object RegexRewrite { */ private def getPrefixRangePattern(astLs: collection.Seq[RegexAST]): Option[(String, Int, Int, Int)] = { - val haveLiteralPrefix = isliteralString(astLs.dropRight(1)) + val haveLiteralPrefix = isLiteralString(astLs.dropRight(1)) val endsWithRange = astLs.lastOption match { case Some(RegexRepetition( RegexCharacterClass(false, ListBuffer(RegexCharacterRange(a,b))), @@ -2080,13 +2085,27 @@ object RegexRewrite { } } - private def isliteralString(astLs: collection.Seq[RegexAST]): Boolean = { + private def isLiteralString(astLs: collection.Seq[RegexAST]): Boolean = { removeBrackets(astLs).forall { - case RegexChar(ch) if !regexMetaChars.contains(ch) => true + case RegexChar(ch) => !regexMetaChars.contains(ch) case _ => false } } + private 
def getMultipleContainsLiterals(ast: RegexAST): Seq[String] = { + ast match { + case RegexGroup(_, term, _) => getMultipleContainsLiterals(term) + case RegexChoice(RegexSequence(parts), ls) if isLiteralString(parts) => { + getMultipleContainsLiterals(ls) match { + case Seq() => Seq.empty + case literals => RegexCharsToString(parts) +: literals + } + } + case RegexSequence(parts) if (isLiteralString(parts)) => Seq(RegexCharsToString(parts)) + case _ => Seq.empty + } + } + private def isWildcard(ast: RegexAST): Boolean = { ast match { case RegexRepetition(RegexChar('.'), SimpleQuantifier('*')) => true @@ -2097,11 +2116,8 @@ object RegexRewrite { } private def stripLeadingWildcards(astLs: collection.Seq[RegexAST]): - collection.Seq[RegexAST] = astLs match { - case (RegexChar('^') | RegexEscaped('A')) :: tail => - // if the pattern starts with ^ or \A, strip it too - tail.dropWhile(isWildcard) - case _ => astLs.dropWhile(isWildcard) + collection.Seq[RegexAST] = { + astLs.dropWhile(isWildcard) } private def stripTailingWildcards(astLs: collection.Seq[RegexAST]): @@ -2120,30 +2136,48 @@ object RegexRewrite { * Matches the given regex ast to a regex optimization type for regex rewrite * optimization. * - * @param ast The Abstract Syntax Tree parsed from a regex pattern. + * @param ast Abstract Syntax Tree parsed from a regex pattern. * @return The `RegexOptimizationType` for the given pattern. */ def matchSimplePattern(ast: RegexAST): RegexOptimizationType = { - ast.children() match { - case (RegexChar('^') | RegexEscaped('A')) :: ast - if isliteralString(stripTailingWildcards(ast)) => { - // ^literal.* => startsWith literal - RegexOptimizationType.StartsWith(RegexCharsToString(stripTailingWildcards(ast))) + val astLs = ast match { + case RegexSequence(_) => ast.children() + case _ => Seq(ast) + } + val noTailingWildcards = stripTailingWildcards(astLs) + if (noTailingWildcards.headOption.exists( + ast => ast == RegexChar('^') || ast == RegexEscaped('A'))) { + val possibleLiteral = noTailingWildcards.drop(1) + if (isLiteralString(possibleLiteral)) { + return RegexOptimizationType.StartsWith(RegexCharsToString(possibleLiteral)) } - case astLs => { - val noStartsWithAst = stripTailingWildcards(stripLeadingWildcards(astLs)) - val prefixRangeInfo = getPrefixRangePattern(noStartsWithAst) - if (prefixRangeInfo.isDefined) { - val (prefix, length, start, end) = prefixRangeInfo.get - // (literal[a-b]{x,y}) => prefix range pattern - RegexOptimizationType.PrefixRange(prefix, length, start, end) - } else if (isliteralString(noStartsWithAst)) { - // literal.* or (literal).* => contains literal - RegexOptimizationType.Contains(RegexCharsToString(noStartsWithAst)) - } else { - RegexOptimizationType.NoOptimization - } + } + + val noStartsWithAst = stripLeadingWildcards(noTailingWildcards) + + // Check if the pattern is a contains literal pattern + if (isLiteralString(noStartsWithAst)) { + // literal or .*(literal).* => contains literal + return RegexOptimizationType.Contains(RegexCharsToString(noStartsWithAst)) + } + + // Check if the pattern is a multiple contains literal pattern (e.g. "abc|def|ghi") + if (noStartsWithAst.length == 1) { + val containsLiterals = getMultipleContainsLiterals(noStartsWithAst.head) + if (!containsLiterals.isEmpty) { + return RegexOptimizationType.MultipleContains(containsLiterals) } } + + // Check if the pattern is a prefix range pattern (e.g. 
"abc[a-z]{3}") + val prefixRangeInfo = getPrefixRangePattern(noStartsWithAst) + if (prefixRangeInfo.isDefined) { + val (prefix, length, start, end) = prefixRangeInfo.get + // (literal[a-b]{x,y}) => prefix range pattern + return RegexOptimizationType.PrefixRange(prefix, length, start, end) + } + + // Return NoOptimization if the pattern is not a simple pattern, so cuDF regex is used instead + RegexOptimizationType.NoOptimization } -} \ No newline at end of file +} diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/GpuHiveTextFileFormat.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/GpuHiveFileFormat.scala similarity index 54% rename from sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/GpuHiveTextFileFormat.scala rename to sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/GpuHiveFileFormat.scala index 4595ea87ed3..21437a64481 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/GpuHiveTextFileFormat.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/GpuHiveFileFormat.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,8 +17,9 @@ package org.apache.spark.sql.hive.rapids import java.nio.charset.Charset +import java.util.Locale -import ai.rapids.cudf.{CSVWriterOptions, DType, QuoteStyle, Scalar, Table, TableWriter => CudfTableWriter} +import ai.rapids.cudf.{CompressionType, CSVWriterOptions, DType, ParquetWriterOptions, QuoteStyle, Scalar, Table, TableWriter => CudfTableWriter} import com.google.common.base.Charsets import com.nvidia.spark.rapids._ import com.nvidia.spark.rapids.Arm.withResource @@ -27,14 +28,85 @@ import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.hive.rapids.GpuHiveTextFileUtils._ +import org.apache.spark.sql.execution.datasources.parquet.ParquetOptions import org.apache.spark.sql.hive.rapids.shims.GpuInsertIntoHiveTableMeta -import org.apache.spark.sql.types.{DataType, StringType, StructType} +import org.apache.spark.sql.rapids.execution.TrampolineUtil +import org.apache.spark.sql.types.{DataType, Decimal, DecimalType, StringType, StructType} import org.apache.spark.sql.vectorized.ColumnarBatch -object GpuHiveTextFileFormat extends Logging { +object GpuHiveFileFormat extends Logging { + private val parquetOutputFormatClass = + "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat" + private val parquetSerdeClass = + "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe" - private def checkIfEnabled(meta: GpuInsertIntoHiveTableMeta): Unit = { + def tagGpuSupport(meta: GpuInsertIntoHiveTableMeta): Option[ColumnarFileFormat] = { + val insertCmd = meta.wrapped + // Bucketing write + if (insertCmd.table.bucketSpec.isDefined) { + meta.willNotWorkOnGpu("bucketed tables are not supported yet") + } + + // Infer the file format from the serde string, similar to what Spark does in + // RelationConversions for Hive. 
+ val serde = insertCmd.table.storage.serde.getOrElse("").toLowerCase(Locale.ROOT) + val tempFileFormat = if (serde.contains("parquet")) { + // Parquet specific tagging + tagGpuSupportForParquet(meta) + } else { + // Default to text file format + tagGpuSupportForText(meta) + } + + if (meta.canThisBeReplaced) { + Some(tempFileFormat) + } else { + None + } + } + + private def tagGpuSupportForParquet(meta: GpuInsertIntoHiveTableMeta): ColumnarFileFormat = { + val insertCmd = meta.wrapped + val storage = insertCmd.table.storage + + if (storage.outputFormat.getOrElse("") != parquetOutputFormatClass) { + meta.willNotWorkOnGpu(s"unsupported output format found: ${storage.outputFormat}, " + + s"only $parquetOutputFormatClass is currently supported for Parquet") + } + if (storage.serde.getOrElse("") != parquetSerdeClass) { + meta.willNotWorkOnGpu(s"unsupported serde found: ${storage.serde}, " + + s"only $parquetSerdeClass is currently supported for Parquet") + } + + // Decimal type check + val hasIntOrLongBackedDec = insertCmd.query.schema.exists { field => + TrampolineUtil.dataTypeExistsRecursively(field.dataType, { + case dec: DecimalType if dec.precision <= Decimal.MAX_LONG_DIGITS => true + case _ => false + }) + } + if (hasIntOrLongBackedDec) { + meta.willNotWorkOnGpu("decimals that fit in a long are not supported " + + s"for Parquet. Hive always writes decimals as binary arrays but the GPU writes them " + + s"as integral types") + } + + FileFormatChecks.tag(meta, insertCmd.table.schema, ParquetFormatType, WriteFileOp) + + // Compression type + val parquetOptions = new ParquetOptions(insertCmd.table.properties, insertCmd.conf) + val compressionType = + GpuParquetFileFormat.parseCompressionType(parquetOptions.compressionCodecClassName) + .getOrElse { + meta.willNotWorkOnGpu("compression codec " + + s"${parquetOptions.compressionCodecClassName} is not supported for Parquet") + CompressionType.NONE + } + new GpuHiveParquetFileFormat(compressionType) + } + + private def tagGpuSupportForText(meta: GpuInsertIntoHiveTableMeta): ColumnarFileFormat = { + import org.apache.spark.sql.hive.rapids.GpuHiveTextFileUtils._ if (!meta.conf.isHiveDelimitedTextEnabled) { meta.willNotWorkOnGpu("Hive text I/O has been disabled. 
To enable this, " + s"set ${RapidsConf.ENABLE_HIVE_TEXT} to true") @@ -43,21 +115,16 @@ object GpuHiveTextFileFormat extends Logging { meta.willNotWorkOnGpu("writing Hive delimited text tables has been disabled, " + s"to enable this, set ${RapidsConf.ENABLE_HIVE_TEXT_WRITE} to true") } - } - - def tagGpuSupport(meta: GpuInsertIntoHiveTableMeta) - : Option[ColumnarFileFormat] = { - checkIfEnabled(meta) val insertCommand = meta.wrapped val storage = insertCommand.table.storage if (storage.outputFormat.getOrElse("") != textOutputFormat) { meta.willNotWorkOnGpu(s"unsupported output-format found: ${storage.outputFormat}, " + - s"only $textOutputFormat is currently supported") + s"only $textOutputFormat is currently supported for text") } if (storage.serde.getOrElse("") != lazySimpleSerDe) { meta.willNotWorkOnGpu(s"unsupported serde found: ${storage.serde}, " + - s"only $lazySimpleSerDe is currently supported") + s"only $lazySimpleSerDe is currently supported for text") } val serializationFormat = storage.properties.getOrElse(serializationKey, "1") @@ -86,28 +153,60 @@ object GpuHiveTextFileFormat extends Logging { meta.willNotWorkOnGpu("only UTF-8 is supported as the charset") } - if (insertCommand.table.bucketSpec.isDefined) { - meta.willNotWorkOnGpu("bucketed tables are not supported") - } - - if (insertCommand.conf.getConfString("hive.exec.compress.output", "false").toLowerCase - != "false") { + if (insertCommand.conf.getConfString("hive.exec.compress.output", "false").toBoolean) { meta.willNotWorkOnGpu("compressed output is not supported, " + "set hive.exec.compress.output to false to enable writing Hive text via GPU") } - FileFormatChecks.tag(meta, - insertCommand.table.schema, - HiveDelimitedTextFormatType, - WriteFileOp) + FileFormatChecks.tag(meta, insertCommand.table.schema, HiveDelimitedTextFormatType, + WriteFileOp) - Some(new GpuHiveTextFileFormat()) + new GpuHiveTextFileFormat() } } +class GpuHiveParquetFileFormat(compType: CompressionType) extends ColumnarFileFormat { + + override def prepareWrite(sparkSession: SparkSession, job: Job, + options: Map[String, String], dataSchema: StructType): ColumnarOutputWriterFactory = { + + // Avoid referencing the outer object. 
+ val compressionType = compType + new ColumnarOutputWriterFactory { + override def getFileExtension(context: TaskAttemptContext): String = + compressionType match { + case CompressionType.NONE => ".parquet" + case ct => s".${ct.name().toLowerCase(Locale.ROOT)}.parquet" + } + + override def newInstance(path: String, + dataSchema: StructType, + context: TaskAttemptContext): ColumnarOutputWriter = { + new GpuHiveParquetWriter(path, dataSchema, context, compressionType) + } + } + } +} + +class GpuHiveParquetWriter(override val path: String, dataSchema: StructType, + context: TaskAttemptContext, compType: CompressionType) + extends ColumnarOutputWriter(context, dataSchema, "HiveParquet", true) { + + override protected val tableWriter: CudfTableWriter = { + val optionsBuilder = SchemaUtils + .writerOptionsFromSchema(ParquetWriterOptions.builder(), dataSchema, + writeInt96 = true, // Hive 1.2 write timestamp as INT96 + parquetFieldIdEnabled = false) + .withCompressionType(compType) + Table.writeParquetChunked(optionsBuilder.build(), this) + } + +} + class GpuHiveTextFileFormat extends ColumnarFileFormat with Logging { - override def supportDataType(dataType: DataType): Boolean = isSupportedType(dataType) + override def supportDataType(dataType: DataType): Boolean = + GpuHiveTextFileUtils.isSupportedType(dataType) override def prepareWrite(sparkSession: SparkSession, job: Job, diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/RapidsHiveErrors.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/RapidsHiveErrors.scala index 259a04ec318..40cac90680f 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/RapidsHiveErrors.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/hive/rapids/RapidsHiveErrors.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,9 @@ package org.apache.spark.sql.hive.rapids import org.apache.hadoop.fs.Path import org.apache.spark.SparkException -import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.rapids.shims.RapidsErrorUtils import org.apache.spark.sql.types.{DataType, DoubleType, FloatType, StringType} object RapidsHiveErrors { @@ -53,8 +53,7 @@ object RapidsHiveErrors { } def cannotResolveAttributeError(name: String, outputStr: String): Throwable = { - new AnalysisException( - s"Unable to resolve $name given [$outputStr]") + throw RapidsErrorUtils.cannotResolveAttributeError(name, outputStr) } def writePartitionExceedConfigSizeWhenDynamicPartitionError( diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuDataSourceBase.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuDataSourceBase.scala index 0ec720733e8..5589bca0435 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuDataSourceBase.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuDataSourceBase.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
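// The GpuDataSourceBase hunks below (like the RapidsHiveErrors change above) stop
// constructing AnalysisException inline and delegate to RapidsErrorUtils. A hedged
// sketch of that indirection; ErrorBuilderSketch, SketchErrorUtils and the message
// text are illustrative assumptions, not the plugin's real shim API.
trait ErrorBuilderSketch {
  def dataPathNotExistError(path: String): Throwable
}

object SketchErrorUtils extends ErrorBuilderSketch {
  // A real per-Spark-version shim would build the version-appropriate exception here.
  override def dataPathNotExistError(path: String): Throwable =
    new IllegalArgumentException(s"Path does not exist: $path")
}
// Call sites then read: throw SketchErrorUtils.dataPathNotExistError(path.toString)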
@@ -44,7 +44,7 @@ import org.apache.spark.sql.execution.datasources.v2.orc.OrcDataSourceV2 import org.apache.spark.sql.execution.streaming._ import org.apache.spark.sql.execution.streaming.sources.{RateStreamProvider, TextSocketSourceProvider} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.shims.SchemaUtilsShims +import org.apache.spark.sql.rapids.shims.{RapidsErrorUtils, SchemaUtilsShims} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.util.{HadoopFSUtils, ThreadUtils, Utils} @@ -144,8 +144,8 @@ abstract class GpuDataSourceBase( } inferredOpt }.getOrElse { - throw new AnalysisException(s"Failed to resolve the schema for $format for " + - s"the partition column: $partitionColumn. It must be specified manually.") + throw RapidsErrorUtils. + partitionColumnNotSpecifiedError(format.toString, partitionColumn) } } StructType(partitionFields) @@ -162,8 +162,7 @@ abstract class GpuDataSourceBase( caseInsensitiveOptions - "path", SparkShimImpl.filesFromFileIndex(tempFileIndex)) }.getOrElse { - throw new AnalysisException( - s"Unable to infer schema for $format. It must be specified manually.") + throw RapidsErrorUtils.dataSchemaNotSpecifiedError(format.toString) } // We just print a waring message if the data schema and partition schema have the duplicate @@ -201,17 +200,13 @@ abstract class GpuDataSourceBase( case (dataSource: RelationProvider, None) => dataSource.createRelation(sparkSession.sqlContext, caseInsensitiveOptions) case (_: SchemaRelationProvider, None) => - throw new AnalysisException(s"A schema needs to be specified when using $className.") + throw RapidsErrorUtils.schemaNotSpecifiedForSchemaRelationProviderError(className) case (dataSource: RelationProvider, Some(schema)) => val baseRelation = dataSource.createRelation(sparkSession.sqlContext, caseInsensitiveOptions) if (!DataType.equalsIgnoreCompatibleNullability(baseRelation.schema, schema)) { - throw new AnalysisException( - "The user-specified schema doesn't match the actual schema: " + - s"user-specified: ${schema.toDDL}, actual: ${baseRelation.schema.toDDL}. If " + - "you're using DataFrameReader.schema API or creating a table, please do not " + - "specify the schema. Or if you're scanning an existed table, please drop " + - "it and re-create it.") + throw RapidsErrorUtils.userSpecifiedSchemaMismatchActualSchemaError(schema, + baseRelation.schema) } baseRelation @@ -233,9 +228,8 @@ abstract class GpuDataSourceBase( caseInsensitiveOptions - "path", SparkShimImpl.filesFromFileIndex(fileCatalog)) }.getOrElse { - throw new AnalysisException( - s"Unable to infer schema for $format at ${fileCatalog.allFiles().mkString(",")}. " + - "It must be specified manually") + throw RapidsErrorUtils. + dataSchemaNotSpecifiedError(format.toString, fileCatalog.allFiles().mkString(",")) } HadoopFsRelation( @@ -276,8 +270,7 @@ abstract class GpuDataSourceBase( caseInsensitiveOptions)(sparkSession) case _ => - throw new AnalysisException( - s"$className is not a valid Spark SQL Data Source.") + throw RapidsErrorUtils.invalidDataSourceError(className) } relation match { @@ -411,22 +404,13 @@ object GpuDataSourceBase extends Logging { dataSource case Failure(error) => if (provider1.startsWith("org.apache.spark.sql.hive.orc")) { - throw new AnalysisException( - "Hive built-in ORC data source must be used with Hive support enabled. 
" + - "Please use the native ORC data source by setting 'spark.sql.orc.impl' to " + - "'native'") + throw RapidsErrorUtils.orcNotUsedWithHiveEnabledError() } else if (provider1.toLowerCase(Locale.ROOT) == "avro" || provider1 == "com.databricks.spark.avro" || provider1 == "org.apache.spark.sql.avro") { - throw new AnalysisException( - s"Failed to find data source: $provider1. Avro is built-in but external data " + - "source module since Spark 2.4. Please deploy the application as per " + - "the deployment section of \"Apache Avro Data Source Guide\".") + throw RapidsErrorUtils.failedToFindAvroDataSourceError(provider1) } else if (provider1.toLowerCase(Locale.ROOT) == "kafka") { - throw new AnalysisException( - s"Failed to find data source: $provider1. Please deploy the application as " + - "per the deployment section of " + - "\"Structured Streaming + Kafka Integration Guide\".") + throw RapidsErrorUtils.failedToFindKafkaDataSourceError(provider1) } else { throw new ClassNotFoundException( s"Failed to find data source: $provider1. Please find packages at " + @@ -459,8 +443,7 @@ object GpuDataSourceBase extends Logging { s"defaulting to the internal datasource (${internalSources.head.getClass.getName}).") internalSources.head.getClass } else { - throw new AnalysisException(s"Multiple sources found for $provider1 " + - s"(${sourceNames.mkString(", ")}), please specify the fully qualified class name.") + throw RapidsErrorUtils.findMultipleDataSourceError(provider1, sourceNames) } } } catch { @@ -513,7 +496,7 @@ object GpuDataSourceBase extends Logging { } if (checkEmptyGlobPath && globResult.isEmpty) { - throw new AnalysisException(s"Path does not exist: $globPath") + throw RapidsErrorUtils.dataPathNotExistError(globPath.toString) } globResult @@ -527,7 +510,7 @@ object GpuDataSourceBase extends Logging { ThreadUtils.parmap(nonGlobPaths, "checkPathsExist", numThreads) { path => val fs = path.getFileSystem(hadoopConf) if (!fs.exists(path)) { - throw new AnalysisException(s"Path does not exist: $path") + throw RapidsErrorUtils.dataPathNotExistError(path.toString) } } } catch { diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuInsertIntoHadoopFsRelationCommand.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuInsertIntoHadoopFsRelationCommand.scala index 2b7974fd1a6..ece5ef5acf5 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuInsertIntoHadoopFsRelationCommand.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/GpuInsertIntoHadoopFsRelationCommand.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,7 +22,7 @@ import com.nvidia.spark.rapids.{ColumnarFileFormat, GpuDataWritingCommand} import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.internal.io.FileCommitProtocol -import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession} +import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable, CatalogTablePartition} import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.getPartitionPathString @@ -33,7 +33,7 @@ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command.{AlterTableAddPartitionCommand, AlterTableDropPartitionCommand, CommandUtils} import org.apache.spark.sql.execution.datasources.{FileFormatWriter, FileIndex, PartitioningUtils} import org.apache.spark.sql.internal.SQLConf.PartitionOverwriteMode -import org.apache.spark.sql.rapids.shims.SchemaUtilsShims +import org.apache.spark.sql.rapids.shims.{RapidsErrorUtils, SchemaUtilsShims} import org.apache.spark.sql.vectorized.ColumnarBatch case class GpuInsertIntoHadoopFsRelationCommand( @@ -121,7 +121,7 @@ case class GpuInsertIntoHadoopFsRelationCommand( val pathExists = fs.exists(qualifiedOutputPath) (mode, pathExists) match { case (SaveMode.ErrorIfExists, true) => - throw new AnalysisException(s"path $qualifiedOutputPath already exists.") + throw RapidsErrorUtils.outputPathAlreadyExistsError(qualifiedOutputPath) case (SaveMode.Overwrite, true) => if (ifPartitionNotExists && matchingPartitions.nonEmpty) { false diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/catalyst/expressions/GpuRandomExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/catalyst/expressions/GpuRandomExpressions.scala index 6675f678f6d..f9d0be81505 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/catalyst/expressions/GpuRandomExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/catalyst/expressions/GpuRandomExpressions.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,8 +23,8 @@ import com.nvidia.spark.rapids.Arm.withResource import com.nvidia.spark.rapids.shims.ShimUnaryExpression import org.apache.spark.TaskContext -import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, ExpressionWithRandomSeed} +import org.apache.spark.sql.rapids.execution.RapidsAnalysisException import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.Utils @@ -52,7 +52,7 @@ case class GpuRand(child: Expression) extends ShimUnaryExpression with GpuExpres @transient protected lazy val seed: Long = child match { case GpuLiteral(s, IntegerType) => s.asInstanceOf[Int] case GpuLiteral(s, LongType) => s.asInstanceOf[Long] - case _ => throw new AnalysisException( + case _ => throw new RapidsAnalysisException( s"Input argument to $prettyName must be an integer, long or null literal.") } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala index 7f0a82517c3..41c2e5e3776 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala @@ -49,8 +49,8 @@ case class GpuConcat(children: Seq[Expression]) extends GpuComplexTypeMergingExp override def columnarEval(batch: ColumnarBatch): GpuColumnVector = { val res = dataType match { - // Explicitly return null for empty concat as Spark, since cuDF doesn't support empty concat. - case dt if children.isEmpty => GpuScalar.from(null, dt) + // in Spark concat() will be considered as an empty string here + case dt if children.isEmpty => GpuScalar("", dt) // For single column concat, we pass the result of child node to avoid extra cuDF call. case _ if children.length == 1 => children.head.columnarEval(batch) case StringType => stringConcat(batch) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/TrampolineUtil.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/TrampolineUtil.scala index 5ffe08348f1..8a88cc4024d 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/TrampolineUtil.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/TrampolineUtil.scala @@ -157,9 +157,6 @@ object TrampolineUtil { TaskContext.get.taskMemoryManager() } - /** Throw a Spark analysis exception */ - def throwAnalysisException(msg: String) = throw new AnalysisException(msg) - /** Set the task context for the current thread */ def setTaskContext(tc: TaskContext): Unit = TaskContext.setTaskContext(tc) @@ -241,4 +238,13 @@ object TrampolineUtil { } def getSparkHadoopUtilConf: Configuration = SparkHadoopUtil.get.conf + } + +/** + * This class is to only be used to throw errors specific to the + * RAPIDS Accelerator or errors mirroring Spark where a raw + * AnalysisException is thrown directly rather than via an error + * utility class (this should be rare). 
+ */ +class RapidsAnalysisException(msg: String) extends AnalysisException(msg) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuAggregateInPandasExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuAggregateInPandasExec.scala index bc2f30dff2f..639a39bcd38 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuAggregateInPandasExec.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuAggregateInPandasExec.scala @@ -75,15 +75,15 @@ case class GpuAggregateInPandasExec( } private def collectFunctions(udf: GpuPythonFunction): - (ChainedPythonFunctions, Seq[Expression]) = { + ((ChainedPythonFunctions, Long), Seq[Expression]) = { udf.children match { case Seq(u: GpuPythonFunction) => - val (chained, children) = collectFunctions(u) - (ChainedPythonFunctions(chained.funcs ++ Seq(udf.func)), children) + val ((chained, _), children) = collectFunctions(u) + ((ChainedPythonFunctions(chained.funcs ++ Seq(udf.func)), udf.resultId.id), children) case children => // There should not be any other UDFs, or the children can't be evaluated directly. assert(children.forall(_.find(_.isInstanceOf[GpuPythonFunction]).isEmpty)) - (ChainedPythonFunctions(Seq(udf.func)), udf.children) + ((ChainedPythonFunctions(Seq(udf.func)), udf.resultId.id), udf.children) } } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowEvalPythonExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowEvalPythonExec.scala index 182d7d1b6c6..c99d0403ed0 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowEvalPythonExec.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuArrowEvalPythonExec.scala @@ -362,15 +362,16 @@ case class GpuArrowEvalPythonExec( override def producedAttributes: AttributeSet = AttributeSet(resultAttrs) - private def collectFunctions(udf: GpuPythonUDF): (ChainedPythonFunctions, Seq[Expression]) = { + private def collectFunctions( + udf: GpuPythonUDF): ((ChainedPythonFunctions, Long), Seq[Expression]) = { udf.children match { case Seq(u: GpuPythonUDF) => - val (chained, children) = collectFunctions(u) - (ChainedPythonFunctions(chained.funcs ++ Seq(udf.func)), children) + val ((chained, _), children) = collectFunctions(u) + ((ChainedPythonFunctions(chained.funcs ++ Seq(udf.func)), udf.resultId.id), children) case children => // There should not be any other UDFs, or the children can't be evaluated directly. assert(children.forall(_.find(_.isInstanceOf[GpuPythonUDF]).isEmpty)) - (ChainedPythonFunctions(Seq(udf.func)), udf.children) + ((ChainedPythonFunctions(Seq(udf.func)), udf.resultId.id), udf.children) } } diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuFlatMapCoGroupsInPandasExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuFlatMapCoGroupsInPandasExec.scala index b8fa3c1ab69..2e90765e40e 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuFlatMapCoGroupsInPandasExec.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuFlatMapCoGroupsInPandasExec.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -107,8 +107,8 @@ case class GpuFlatMapCoGroupsInPandasExec( private val sessionLocalTimeZone = conf.sessionLocalTimeZone private val pythonRunnerConf = ArrowUtilsShim.getPythonRunnerConfMap(conf) - private val pandasFunction = udf.asInstanceOf[GpuPythonUDF].func - private val chainedFunc = Seq(ChainedPythonFunctions(Seq(pandasFunction))) + private val pyUDF = udf.asInstanceOf[GpuPythonUDF] + private val chainedFunc = Seq((ChainedPythonFunctions(Seq(pyUDF.func)), pyUDF.resultId.id)) override def producedAttributes: AttributeSet = AttributeSet(output) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuFlatMapGroupsInPandasExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuFlatMapGroupsInPandasExec.scala index 4a24a449b24..f1596ae7a74 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuFlatMapGroupsInPandasExec.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuFlatMapGroupsInPandasExec.scala @@ -98,7 +98,7 @@ case class GpuFlatMapGroupsInPandasExec( override def requiredChildOrdering: Seq[Seq[SortOrder]] = Seq(groupingAttributes.map(SortOrder(_, Ascending))) - private val pandasFunction = func.asInstanceOf[GpuPythonUDF].func + private val udf = func.asInstanceOf[GpuPythonUDF] // One batch as input to keep the integrity for each group override def childrenCoalesceGoal: Seq[CoalesceGoal] = Seq(RequireSingleBatch) @@ -111,7 +111,7 @@ case class GpuFlatMapGroupsInPandasExec( val (mNumInputRows, mNumInputBatches, mNumOutputRows, mNumOutputBatches) = commonGpuMetrics() lazy val isPythonOnGpuEnabled = GpuPythonHelper.isPythonOnGpuEnabled(conf) - val chainedFunc = Seq(ChainedPythonFunctions(Seq(pandasFunction))) + val chainedFunc = Seq((ChainedPythonFunctions(Seq(udf.func)), udf.resultId.id)) val localOutput = output val localChildOutput = child.output // Python wraps the resulting columns in a single struct column. 
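The Python exec changes above all follow one pattern: each ChainedPythonFunctions is now paired with the resultId.id of the UDF that owns it, so the runners can thread the result identifier through alongside the chained functions. A minimal sketch of that pairing is shown below, using simplified stand-in types; PyFunc, Chained and PyUDF are illustrative names, not the plugin's ChainedPythonFunctions or GpuPythonFunction classes.

// Illustrative sketch: mirrors the (ChainedPythonFunctions, Long) pairing
// introduced in collectFunctions, with simplified stand-in types.
object CollectFunctionsSketch {
  case class PyFunc(name: String)
  case class Chained(funcs: Seq[PyFunc])
  case class PyUDF(func: PyFunc, resultId: Long, children: Seq[Any])

  // Walk a possibly nested UDF and return its chained functions paired with
  // the outermost UDF's result id, plus the non-UDF children to evaluate.
  def collectFunctions(udf: PyUDF): ((Chained, Long), Seq[Any]) = udf.children match {
    case Seq(u: PyUDF) =>
      // Nested UDF: extend the chain but keep the result id of the outer UDF.
      val ((chained, _), grandChildren) = collectFunctions(u)
      ((Chained(chained.funcs :+ udf.func), udf.resultId), grandChildren)
    case children =>
      ((Chained(Seq(udf.func)), udf.resultId), children)
  }

  def main(args: Array[String]): Unit = {
    val inner = PyUDF(PyFunc("inner"), resultId = 1L, children = Seq("col_a"))
    val outer = PyUDF(PyFunc("outer"), resultId = 2L, children = Seq(inner))
    // Chain is built inner-to-outer and the id comes from the outermost UDF:
    // ((Chained(List(PyFunc(inner), PyFunc(outer))),2),List(col_a))
    println(collectFunctions(outer))
  }
}
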
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuMapInBatchExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuMapInBatchExec.scala index 4d41cd32e4f..57c1c7f7114 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuMapInBatchExec.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuMapInBatchExec.scala @@ -46,7 +46,7 @@ trait GpuMapInBatchExec extends ShimUnaryExecNode with GpuPythonExecBase { protected val isBarrier: Boolean - private val pandasFunction = func.asInstanceOf[GpuPythonUDF].func + private val udf = func.asInstanceOf[GpuPythonUDF] override def producedAttributes: AttributeSet = AttributeSet(output) @@ -58,7 +58,7 @@ trait GpuMapInBatchExec extends ShimUnaryExecNode with GpuPythonExecBase { val (numInputRows, numInputBatches, numOutputRows, numOutputBatches) = commonGpuMetrics() val pyInputTypes = child.schema - val chainedFunc = Seq(ChainedPythonFunctions(Seq(pandasFunction))) + val chainedFunc = Seq((ChainedPythonFunctions(Seq(udf.func)), udf.resultId.id)) val sessionLocalTimeZone = conf.sessionLocalTimeZone val pythonRunnerConf = ArrowUtilsShim.getPythonRunnerConfMap(conf) val isPythonOnGpuEnabled = GpuPythonHelper.isPythonOnGpuEnabled(conf) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuPythonHelper.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuPythonHelper.scala index 451ae401891..8564018ad3b 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuPythonHelper.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuPythonHelper.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,11 +86,12 @@ object GpuPythonHelper extends Logging { } // Called in each task at the executor side - def injectGpuInfo(funcs: Seq[ChainedPythonFunctions], isPythonOnGpuEnabled: Boolean): Unit = { + def injectGpuInfo(funcs: Seq[(ChainedPythonFunctions, Long)], + isPythonOnGpuEnabled: Boolean): Unit = { // Insert GPU related env(s) into `envVars` for all the PythonFunction(s). // Yes `PythonRunner` will only use the first one, but just make sure it will // take effect no matter the order changes or not. 
- funcs.foreach(_.funcs.foreach { pyF => + funcs.foreach(_._1.funcs.foreach { pyF => pyF.envVars.put("CUDA_VISIBLE_DEVICES", gpuId) pyF.envVars.put("RAPIDS_PYTHON_ENABLED", isPythonOnGpuEnabled.toString) pyF.envVars.put("RAPIDS_UVM_ENABLED", isPythonUvmEnabled) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuPythonUDF.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuPythonUDF.scala index 6cb955a6db8..04367d9f29f 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuPythonUDF.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuPythonUDF.scala @@ -64,7 +64,7 @@ abstract class GpuPythonFunction( children: Seq[Expression], evalType: Int, udfDeterministic: Boolean, - resultId: ExprId = NamedExpression.newExprId) + val resultId: ExprId = NamedExpression.newExprId) extends Expression with GpuUnevaluable with NonSQLExpression with UserDefinedExpression with GpuAggregateWindowFunction with Serializable { @@ -94,7 +94,7 @@ case class GpuPythonUDF( children: Seq[Expression], evalType: Int, udfDeterministic: Boolean, - resultId: ExprId = NamedExpression.newExprId) + override val resultId: ExprId = NamedExpression.newExprId) extends GpuPythonFunction(name, func, dataType, children, evalType, udfDeterministic, resultId) { override lazy val canonicalized: Expression = { val canonicalizedChildren = children.map(_.canonicalized) @@ -110,7 +110,7 @@ case class GpuPythonUDAF( children: Seq[Expression], evalType: Int, udfDeterministic: Boolean, - resultId: ExprId = NamedExpression.newExprId) + override val resultId: ExprId = NamedExpression.newExprId) extends GpuPythonFunction(name, func, dataType, children, evalType, udfDeterministic, resultId) with GpuAggregateFunction { override lazy val canonicalized: Expression = { diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuWindowInPandasExecBase.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuWindowInPandasExecBase.scala index 3bc91cd6338..fcf9570a9f7 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuWindowInPandasExecBase.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/python/GpuWindowInPandasExecBase.scala @@ -235,16 +235,16 @@ trait GpuWindowInPandasExecBase extends ShimUnaryExecNode with GpuPythonExecBase protected val windowBoundTypeConf = "pandas_window_bound_types" - protected def collectFunctions(udf: GpuPythonFunction): - (ChainedPythonFunctions, Seq[Expression]) = { + protected def collectFunctions( + udf: GpuPythonFunction): ((ChainedPythonFunctions, Long), Seq[Expression]) = { udf.children match { case Seq(u: GpuPythonFunction) => - val (chained, children) = collectFunctions(u) - (ChainedPythonFunctions(chained.funcs ++ Seq(udf.func)), children) + val ((chained, _), children) = collectFunctions(u) + ((ChainedPythonFunctions(chained.funcs ++ Seq(udf.func)), udf.resultId.id), children) case children => // There should not be any other UDFs, or the children can't be evaluated directly. 
assert(children.forall(_.find(_.isInstanceOf[GpuPythonFunction]).isEmpty)) - (ChainedPythonFunctions(Seq(udf.func)), udf.children) + ((ChainedPythonFunctions(Seq(udf.func)), udf.resultId.id), udf.children) } } @@ -396,7 +396,7 @@ trait GpuWindowInPandasExecBase extends ShimUnaryExecNode with GpuPythonExecBase } }.toArray val dataCVs = GpuColumnVector.extractColumns(batch) - new ColumnarBatch(boundsCVs ++ dataCVs.map(_.incRefCount()), numRows) + new ColumnarBatch((boundsCVs ++ dataCVs.map(_.incRefCount())).toArray, numRows) } override protected def internalDoExecuteColumnar(): RDD[ColumnarBatch] = { diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index b875c84edbf..dc2845e4461 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -1097,6 +1097,7 @@ class GpuRLikeMeta( } case StartsWith(s) => GpuStartsWith(lhs, GpuLiteral(s, StringType)) case Contains(s) => GpuContains(lhs, GpuLiteral(s, StringType)) + case MultipleContains(ls) => GpuMultipleContains(lhs, ls) case PrefixRange(s, length, start, end) => GpuLiteralRangePattern(lhs, GpuLiteral(s, StringType), length, start, end) case _ => throw new IllegalStateException("Unexpected optimization type") @@ -1126,6 +1127,33 @@ case class GpuRLike(left: Expression, right: Expression, pattern: String) override def dataType: DataType = BooleanType } +case class GpuMultipleContains(input: Expression, searchList: Seq[String]) + extends GpuUnaryExpression with ImplicitCastInputTypes with NullIntolerant { + + override def dataType: DataType = BooleanType + + override def child: Expression = input + + override def inputTypes: Seq[AbstractDataType] = Seq(StringType) + + override def doColumnar(input: GpuColumnVector): ColumnVector = { + assert(searchList.length > 1) + val accInit = withResource(Scalar.fromString(searchList.head)) { searchScalar => + input.getBase.stringContains(searchScalar) + } + searchList.tail.foldLeft(accInit) { (acc, search) => + val containsSearch = withResource(Scalar.fromString(search)) { searchScalar => + input.getBase.stringContains(searchScalar) + } + withResource(acc) { _ => + withResource(containsSearch) { _ => + acc.or(containsSearch) + } + } + } + } +} + case class GpuLiteralRangePattern(left: Expression, right: Expression, length: Int, start: Int, end: Int) extends GpuBinaryExpressionArgsAnyScalar with ImplicitCastInputTypes with NullIntolerant { diff --git a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/CudfUnsafeRow.scala b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/CudfUnsafeRow.scala new file mode 100644 index 00000000000..c04d3b2db29 --- /dev/null +++ b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/CudfUnsafeRow.scala @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +/*** spark-rapids-shim-json-lines +{"spark": "311"} +{"spark": "312"} +{"spark": "313"} +{"spark": "320"} +{"spark": "321"} +{"spark": "321cdh"} +{"spark": "322"} +{"spark": "323"} +{"spark": "324"} +{"spark": "330"} +{"spark": "330cdh"} +{"spark": "330db"} +{"spark": "331"} +{"spark": "332"} +{"spark": "332cdh"} +{"spark": "332db"} +{"spark": "333"} +{"spark": "334"} +{"spark": "340"} +{"spark": "341"} +{"spark": "341db"} +{"spark": "342"} +{"spark": "343"} +{"spark": "350"} +{"spark": "351"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import org.apache.spark.sql.catalyst.expressions.Attribute + +final class CudfUnsafeRow( + attributes: Array[Attribute], + remapping: Array[Int]) extends CudfUnsafeRowBase(attributes, remapping) + +object CudfUnsafeRow extends CudfUnsafeRowTrait \ No newline at end of file diff --git a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/CudfUnsafeRowBase.scala b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/CudfUnsafeRowBase.scala new file mode 100644 index 00000000000..e5e0bbd3dc6 --- /dev/null +++ b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/CudfUnsafeRowBase.scala @@ -0,0 +1,245 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +/*** spark-rapids-shim-json-lines +{"spark": "311"} +{"spark": "312"} +{"spark": "313"} +{"spark": "320"} +{"spark": "321"} +{"spark": "321cdh"} +{"spark": "322"} +{"spark": "323"} +{"spark": "324"} +{"spark": "330"} +{"spark": "330cdh"} +{"spark": "330db"} +{"spark": "331"} +{"spark": "332"} +{"spark": "332cdh"} +{"spark": "332db"} +{"spark": "333"} +{"spark": "334"} +{"spark": "340"} +{"spark": "341"} +{"spark": "341db"} +{"spark": "342"} +{"spark": "343"} +{"spark": "350"} +{"spark": "351"} +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import java.util.Arrays + +import com.nvidia.spark.rapids.GpuColumnVector + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.expressions.SpecializedGettersReader +import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.catalyst.util.MapData +import org.apache.spark.sql.types.DataType +import org.apache.spark.sql.types.Decimal +import org.apache.spark.unsafe.Platform +import org.apache.spark.unsafe.array.ByteArrayMethods +import org.apache.spark.unsafe.hash.Murmur3_x86_32 +import org.apache.spark.unsafe.types.CalendarInterval +import org.apache.spark.unsafe.types.UTF8String + +abstract class CudfUnsafeRowBase( + protected val attributes: Array[Attribute], + protected val remapping: Array[Int]) extends InternalRow { + protected var address: Long = _ + private var startOffsets: Array[Int] = _ + private var fixedWidthSizeInBytes: Int = _ + protected var sizeInBytes: Int = _ + + def this() = this(null, null) + + init(attributes, remapping) + + private def init(attributes: Array[Attribute], remapping: Array[Int]): Unit = { + var offset = 0 + startOffsets = new Array[Int](attributes.length) + for (i <- attributes.indices) { + val attr = attributes(i) + val length = GpuColumnVector.getNonNestedRapidsType(attr.dataType).getSizeInBytes + assert(length > 0, "Only fixed width types are currently supported.") + offset = CudfUnsafeRow.alignOffset(offset, length) + startOffsets(i) = offset + offset += length + } + fixedWidthSizeInBytes = offset + assert(startOffsets.length == remapping.length) + } + + override def numFields: Int = startOffsets.length + + def pointTo(address: Long, sizeInBytes: Int): Unit = { + assert(startOffsets != null && startOffsets.length > 0, "startOffsets not properly initialized") + assert(sizeInBytes % 8 == 0, s"sizeInBytes ($sizeInBytes) should be a multiple of 8") + this.address = address + this.sizeInBytes = sizeInBytes + } + + override def update(ordinal: Int, value: Any): Unit = throw new UnsupportedOperationException() + + override def get(ordinal: Int, dataType: DataType): Object = { + SpecializedGettersReader.read(this, ordinal, dataType, true, true) + } + + override def isNullAt(ordinal: Int): Boolean = { + val i = remapping(ordinal) + assertIndexIsValid(i) + val validByteIndex = i / 8 + val validBitIndex = i % 8 + val b = Platform.getByte(null, address + fixedWidthSizeInBytes + validByteIndex) + ((1 << validBitIndex) & b) == 0 + } + + override def setNullAt(ordinal: Int): Unit = { + val i = remapping(ordinal) + assertIndexIsValid(i) + val validByteIndex = i / 8 + val validBitIndex = i % 8 + var b = Platform.getByte(null, address + fixedWidthSizeInBytes + validByteIndex) + b = (b & ~(1 << validBitIndex)).toByte + Platform.putByte(null, address + fixedWidthSizeInBytes + validByteIndex, b) + } + + override def getBoolean(ordinal: Int): Boolean = { + 
Platform.getBoolean(null, getFieldAddressFromOrdinal(ordinal)) + } + + override def getByte(ordinal: Int): Byte = { + Platform.getByte(null, getFieldAddressFromOrdinal(ordinal)) + } + + override def getShort(ordinal: Int): Short = { + Platform.getShort(null, getFieldAddressFromOrdinal(ordinal)) + } + + override def getInt(ordinal: Int): Int = { + Platform.getInt(null, getFieldAddressFromOrdinal(ordinal)) + } + + override def getLong(ordinal: Int): Long = { + Platform.getLong(null, getFieldAddressFromOrdinal(ordinal)) + } + + override def getFloat(ordinal: Int): Float = { + Platform.getFloat(null, getFieldAddressFromOrdinal(ordinal)) + } + + override def getDouble(ordinal: Int): Double = { + Platform.getDouble(null, getFieldAddressFromOrdinal(ordinal)) + } + + override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = { + if (isNullAt(ordinal)) { + null + } else if (precision <= Decimal.MAX_INT_DIGITS) { + Decimal.createUnsafe(getInt(ordinal), precision, scale) + } else if (precision <= Decimal.MAX_LONG_DIGITS) { + Decimal.createUnsafe(getLong(ordinal), precision, scale) + } else { + throw new IllegalArgumentException("NOT IMPLEMENTED YET") + } + } + + override def getUTF8String(ordinal: Int): UTF8String = { + throw new IllegalArgumentException("NOT IMPLEMENTED YET") + } + + override def getBinary(ordinal: Int): Array[Byte] = { + throw new IllegalArgumentException("NOT IMPLEMENTED YET") + } + + override def getInterval(ordinal: Int): CalendarInterval = { + throw new IllegalArgumentException("NOT IMPLEMENTED YET") + } + + override def getStruct(ordinal: Int, numFields: Int): CudfUnsafeRow = { + throw new IllegalArgumentException("NOT IMPLEMENTED YET") + } + + override def getArray(ordinal: Int): ArrayData = { + throw new IllegalArgumentException("NOT IMPLEMENTED YET") + } + + override def getMap(ordinal: Int): MapData = { + throw new IllegalArgumentException("NOT IMPLEMENTED YET") + } + + override def copy(): CudfUnsafeRow = { + throw new IllegalArgumentException("NOT IMPLEMENTED YET") + } + + override def hashCode(): Int = { + Murmur3_x86_32.hashUnsafeWords(null, address, sizeInBytes, 42) + } + + override def equals(other: Any): Boolean = other match { + case o: CudfUnsafeRow => + sizeInBytes == o.sizeInBytes && + ByteArrayMethods.arrayEquals(null, address, null, o.address, sizeInBytes) && + Arrays.equals(this.remapping, o.remapping) + case _ => false + } + + override def toString: String = { + val build = new StringBuilder("[") + for (i <- 0 until sizeInBytes by 8) { + if (i != 0) build.append(',') + build.append(java.lang.Long.toHexString(Platform.getLong(null, address + i))) + } + build.append(']') + build.append(" remapped with ") + build.append(Arrays.toString(remapping)) + build.toString() + } + + override def anyNull(): Boolean = throw new IllegalArgumentException("NOT IMPLEMENTED YET") + + private def getFieldAddressFromOrdinal(ordinal: Int): Long = { + assertIndexIsValid(ordinal) + val i = remapping(ordinal) + address + startOffsets(i) + } + + private def assertIndexIsValid(index: Int): Unit = { + assert(index >= 0, s"index ($index) should >= 0") + assert(index < startOffsets.length, s"index ($index) should < ${startOffsets.length}") + } +} + +trait CudfUnsafeRowTrait { + def alignOffset(offset: Int, alignment: Int): Int = (offset + alignment - 1) & -alignment + + def calculateBitSetWidthInBytes(numFields: Int): Int = (numFields + 7) / 8 + + def getRowSizeEstimate(attributes: Array[Attribute]): Int = { + var offset = 0 + for (attr <- attributes) { + val length = 
GpuColumnVector.getNonNestedRapidsType(attr.dataType).getSizeInBytes + offset = alignOffset(offset, length) + offset += length + } + val bitSetWidthInBytes = calculateBitSetWidthInBytes(attributes.length) + alignOffset(offset + bitSetWidthInBytes, 8) + } +} \ No newline at end of file diff --git a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/ParquetSchemaClipShims.scala b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/ParquetSchemaClipShims.scala index fd48b8b6375..4d6d4967a80 100644 --- a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/ParquetSchemaClipShims.scala +++ b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/ParquetSchemaClipShims.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,8 @@ import org.apache.parquet.schema.OriginalType._ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.execution.TrampolineUtil +import org.apache.spark.sql.rapids.execution.RapidsAnalysisException +import org.apache.spark.sql.rapids.shims.RapidsErrorUtils import org.apache.spark.sql.types._ object ParquetSchemaClipShims { @@ -64,13 +65,13 @@ object ParquetSchemaClipShims { if (originalType == null) s"$typeName" else s"$typeName ($originalType)" def typeNotSupported() = - TrampolineUtil.throwAnalysisException(s"Parquet type not supported: $typeString") + throw new RapidsAnalysisException(s"Parquet type not supported: $typeString") def typeNotImplemented() = - TrampolineUtil.throwAnalysisException(s"Parquet type not yet supported: $typeString") + throw RapidsErrorUtils.parquetTypeUnsupportedYetError(typeString) def illegalType() = - TrampolineUtil.throwAnalysisException(s"Illegal Parquet type: $typeString") + throw RapidsErrorUtils.illegalParquetTypeError(typeString) // When maxPrecision = -1, we skip precision range check, and always respect the precision // specified in field.getDecimalMetadata. This is useful when interpreting decimal types stored @@ -80,8 +81,7 @@ object ParquetSchemaClipShims { val scale = field.getDecimalMetadata.getScale if (!(maxPrecision == -1 || 1 <= precision && precision <= maxPrecision)) { - TrampolineUtil.throwAnalysisException( - s"Invalid decimal precision: $typeName " + + throw new RapidsAnalysisException(s"Invalid decimal precision: $typeName " + s"cannot store $precision digits (max $maxPrecision)") } @@ -121,7 +121,7 @@ object ParquetSchemaClipShims { case INT96 => if (!SQLConf.get.isParquetINT96AsTimestamp) { - TrampolineUtil.throwAnalysisException( + throw new RapidsAnalysisException( "INT96 is not supported unless it's interpreted as timestamp. " + s"Please try to set ${SQLConf.PARQUET_INT96_AS_TIMESTAMP.key} to true.") } diff --git a/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/RaiseErrorShim.scala b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/RaiseErrorShim.scala new file mode 100644 index 00000000000..de433d5f270 --- /dev/null +++ b/sql-plugin/src/main/spark311/scala/com/nvidia/spark/rapids/shims/RaiseErrorShim.scala @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/*** spark-rapids-shim-json-lines +{"spark": "311"} +{"spark": "312"} +{"spark": "313"} +{"spark": "320"} +{"spark": "321"} +{"spark": "321cdh"} +{"spark": "322"} +{"spark": "323"} +{"spark": "324"} +{"spark": "330"} +{"spark": "330cdh"} +{"spark": "330db"} +{"spark": "331"} +{"spark": "332"} +{"spark": "332cdh"} +{"spark": "332db"} +{"spark": "333"} +{"spark": "334"} +{"spark": "340"} +{"spark": "341"} +{"spark": "341db"} +{"spark": "342"} +{"spark": "343"} +{"spark": "350"} +{"spark": "351"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import com.nvidia.spark.rapids.{ExprRule, GpuOverrides} +import com.nvidia.spark.rapids.{ExprChecks, GpuExpression, TypeSig, UnaryExprMeta} + +import org.apache.spark.sql.catalyst.expressions.{Expression, RaiseError} +import org.apache.spark.sql.rapids.shims.GpuRaiseError + +object RaiseErrorShim { + val exprs: Map[Class[_ <: Expression], ExprRule[_ <: Expression]] = { + Seq(GpuOverrides.expr[RaiseError]( + "Throw an exception", + ExprChecks.unaryProject( + TypeSig.NULL, TypeSig.NULL, + TypeSig.STRING, TypeSig.STRING), + (a, conf, p, r) => new UnaryExprMeta[RaiseError](a, conf, p, r) { + override def convertToGpu(child: Expression): GpuExpression = GpuRaiseError(child) + })).map(r => (r.getClassFor.asSubclass(classOf[Expression]), r)).toMap + } +} diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/rapids/shims/GpuShuffleExchangeExec.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/rapids/shims/GpuShuffleExchangeExec.scala index d94c8e54683..2dcad0d4226 100644 --- a/sql-plugin/src/main/spark311/scala/org/apache/spark/rapids/shims/GpuShuffleExchangeExec.scala +++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/rapids/shims/GpuShuffleExchangeExec.scala @@ -58,6 +58,7 @@ case class GpuShuffleExchangeExec( cpuOutputPartitioning: Partitioning) extends GpuShuffleExchangeExecBaseWithMetrics(gpuOutputPartitioning, child) with ShuffleExchangeLike { + def shuffleId: Int = shuffleDependencyColumnar.shuffleId override def otherCopyArgs: Seq[AnyRef] = cpuOutputPartitioning :: Nil diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/CommandUtilsShim.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/CommandUtilsShim.scala new file mode 100644 index 00000000000..1e1ac57aa60 --- /dev/null +++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/CommandUtilsShim.scala @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "311"} +{"spark": "312"} +{"spark": "313"} +{"spark": "320"} +{"spark": "321"} +{"spark": "321cdh"} +{"spark": "322"} +{"spark": "323"} +{"spark": "324"} +{"spark": "330"} +{"spark": "330cdh"} +{"spark": "330db"} +{"spark": "331"} +{"spark": "332"} +{"spark": "332cdh"} +{"spark": "332db"} +{"spark": "333"} +{"spark": "334"} +{"spark": "340"} +{"spark": "341"} +{"spark": "341db"} +{"spark": "342"} +{"spark": "343"} +{"spark": "350"} +{"spark": "351"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.hive.rapids.shims + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.execution.command.CommandUtils + +object CommandUtilsShim { + + // Shim for CommandUtils.uncacheTableOrView, whose signature changed in Apache Spark 4.0. + def uncacheTableOrView(sparkSession: SparkSession, tableId: TableIdentifier): Unit = { + CommandUtils.uncacheTableOrView(sparkSession, tableId.quotedString) + } + +} \ No newline at end of file diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/GpuInsertIntoHiveTable.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/GpuInsertIntoHiveTable.scala index 92fb72801c8..2ea0301fa2c 100644 --- a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/GpuInsertIntoHiveTable.scala +++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/hive/rapids/shims/GpuInsertIntoHiveTable.scala @@ -45,7 +45,7 @@ import org.apache.hadoop.hive.ql.ErrorMsg import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.spark.SparkException -import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, ExternalCatalog, ExternalCatalogUtils, ExternalCatalogWithListener} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -57,7 +57,8 @@ import org.apache.spark.sql.hive.HiveShim.{ShimFileSinkDesc => FileSinkDesc} import org.apache.spark.sql.hive.client.HiveClientImpl import org.apache.spark.sql.hive.client.hive._ import org.apache.spark.sql.hive.execution.InsertIntoHiveTable -import org.apache.spark.sql.hive.rapids.{GpuHiveTextFileFormat, GpuSaveAsHiveFile, RapidsHiveErrors} +import org.apache.spark.sql.hive.rapids.{GpuHiveFileFormat, GpuSaveAsHiveFile, RapidsHiveErrors} +import org.apache.spark.sql.rapids.shims.RapidsErrorUtils import org.apache.spark.sql.vectorized.ColumnarBatch final class GpuInsertIntoHiveTableMeta(cmd: InsertIntoHiveTable, @@ -69,16 +70,17 @@ final class GpuInsertIntoHiveTableMeta(cmd: InsertIntoHiveTable, private var fileFormat: Option[ColumnarFileFormat] = None override def tagSelfForGpuInternal(): Unit = { - // Only Hive delimited text writes are currently supported. - // Check whether that is the format currently in play. 
- fileFormat = GpuHiveTextFileFormat.tagGpuSupport(this) + fileFormat = GpuHiveFileFormat.tagGpuSupport(this) } override def convertToGpu(): GpuDataWritingCommand = { + val format = fileFormat.getOrElse( + throw new IllegalStateException("fileFormat missing, tagSelfForGpu not called?")) + GpuInsertIntoHiveTable( table = wrapped.table, partition = wrapped.partition, - fileFormat = this.fileFormat.get, + fileFormat = format, query = wrapped.query, overwrite = wrapped.overwrite, ifPartitionNotExists = wrapped.ifPartitionNotExists, @@ -137,7 +139,7 @@ case class GpuInsertIntoHiveTable( } // un-cache this table. - CommandUtils.uncacheTableOrView(sparkSession, table.identifier.quotedString) + CommandUtilsShim.uncacheTableOrView(sparkSession, table.identifier) sparkSession.sessionState.catalog.refreshTable(table.identifier) CommandUtils.updateTableStats(sparkSession, table) @@ -192,7 +194,7 @@ case class GpuInsertIntoHiveTable( // Report error if any static partition appears after a dynamic partition val isDynamic = partitionColumnNames.map(partitionSpec(_).isEmpty) if (isDynamic.init.zip(isDynamic.tail).contains((true, false))) { - throw new AnalysisException(ErrorMsg.PARTITION_DYN_STA_ORDER.getMsg) + throw RapidsErrorUtils.dynamicPartitionParentError } } @@ -326,8 +328,10 @@ case class GpuInsertIntoHiveTable( if (!fs.delete(path, true)) { throw RapidsHiveErrors.cannotRemovePartitionDirError(path) } - // Don't let Hive do overwrite operation since it is slower. - doHiveOverwrite = false + // Don't let Hive do overwrite operation since it is slower. But still give a + // chance to forcely override this for some customized cases when this + // operation is optimized. + doHiveOverwrite = hadoopConf.getBoolean("hive.movetask.enable.dir.move", false) } } } diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuArrowPythonRunner.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuArrowPythonRunner.scala index 761d84b4667..977c755712a 100644 --- a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuArrowPythonRunner.scala +++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuArrowPythonRunner.scala @@ -49,7 +49,6 @@ import com.nvidia.spark.rapids.GpuSemaphore import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.api.python.ChainedPythonFunctions -import org.apache.spark.sql.execution.python.PythonUDFRunner import org.apache.spark.sql.rapids.execution.python.{GpuArrowPythonWriter, GpuPythonRunnerCommon} import org.apache.spark.sql.rapids.shims.ArrowUtilsShim import org.apache.spark.sql.types.StructType @@ -60,7 +59,7 @@ import org.apache.spark.util.Utils * Similar to `PythonUDFRunner`, but exchange data with Python worker via Arrow stream. 
*/ class GpuArrowPythonRunner( - funcs: Seq[ChainedPythonFunctions], + funcs: Seq[(ChainedPythonFunctions, Long)], evalType: Int, argOffsets: Array[Array[Int]], pythonInSchema: StructType, @@ -69,8 +68,8 @@ class GpuArrowPythonRunner( maxBatchSize: Long, override val pythonOutSchema: StructType, jobArtifactUUID: Option[String] = None) - extends GpuBasePythonRunner[ColumnarBatch](funcs, evalType, argOffsets, jobArtifactUUID) - with GpuArrowPythonOutput with GpuPythonRunnerCommon { + extends GpuBasePythonRunner[ColumnarBatch](funcs.map(_._1), evalType, argOffsets, + jobArtifactUUID) with GpuArrowPythonOutput with GpuPythonRunnerCommon { protected override def newWriterThread( env: SparkEnv, @@ -82,7 +81,7 @@ class GpuArrowPythonRunner( val arrowWriter = new GpuArrowPythonWriter(pythonInSchema, maxBatchSize) { override protected def writeUDFs(dataOut: DataOutputStream): Unit = { - PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets) + WritePythonUDFUtils.writeUDFs(dataOut, funcs, argOffsets) } } val isInputNonEmpty = inputIterator.nonEmpty diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala index adb28725ba1..68112676a2b 100644 --- a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala +++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala @@ -50,7 +50,6 @@ import com.nvidia.spark.rapids.GpuSemaphore import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.api.python.{ChainedPythonFunctions, PythonRDD} -import org.apache.spark.sql.execution.python.PythonUDFRunner import org.apache.spark.sql.rapids.execution.python.{GpuArrowWriter, GpuPythonRunnerCommon} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -63,7 +62,7 @@ import org.apache.spark.util.Utils * and receive it back in JVM as batches of single DataFrame. 
*/ class GpuCoGroupedArrowPythonRunner( - funcs: Seq[ChainedPythonFunctions], + funcs: Seq[(ChainedPythonFunctions, Long)], evalType: Int, argOffsets: Array[Array[Int]], leftSchema: StructType, @@ -73,7 +72,7 @@ class GpuCoGroupedArrowPythonRunner( batchSize: Int, override val pythonOutSchema: StructType, jobArtifactUUID: Option[String] = None) - extends GpuBasePythonRunner[(ColumnarBatch, ColumnarBatch)](funcs, evalType, + extends GpuBasePythonRunner[(ColumnarBatch, ColumnarBatch)](funcs.map(_._1), evalType, argOffsets, jobArtifactUUID) with GpuArrowPythonOutput with GpuPythonRunnerCommon { protected override def newWriterThread( @@ -90,7 +89,7 @@ class GpuCoGroupedArrowPythonRunner( PythonRDD.writeUTF(k, dataOut) PythonRDD.writeUTF(v, dataOut) } - PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets) + WritePythonUDFUtils.writeUDFs(dataOut, funcs, argOffsets) } protected override def writeIteratorToStream(dataOut: DataOutputStream): Unit = { diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupedPythonRunnerFactory.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupedPythonRunnerFactory.scala index eba0286e181..9df93a9d11b 100644 --- a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupedPythonRunnerFactory.scala +++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupedPythonRunnerFactory.scala @@ -48,7 +48,7 @@ import org.apache.spark.sql.vectorized.ColumnarBatch case class GpuGroupedPythonRunnerFactory( conf: org.apache.spark.sql.internal.SQLConf, - chainedFunc: Seq[ChainedPythonFunctions], + chainedFunc: Seq[(ChainedPythonFunctions, Long)], argOffsets: Array[Array[Int]], dedupAttrs: StructType, pythonOutputSchema: StructType, diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/WritePythonUDFUtils.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/WritePythonUDFUtils.scala new file mode 100644 index 00000000000..aacf972e7e0 --- /dev/null +++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/execution/python/shims/WritePythonUDFUtils.scala @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "311"} +{"spark": "312"} +{"spark": "313"} +{"spark": "320"} +{"spark": "321"} +{"spark": "321cdh"} +{"spark": "322"} +{"spark": "323"} +{"spark": "324"} +{"spark": "330"} +{"spark": "330cdh"} +{"spark": "330db"} +{"spark": "331"} +{"spark": "332"} +{"spark": "332cdh"} +{"spark": "332db"} +{"spark": "333"} +{"spark": "334"} +{"spark": "340"} +{"spark": "341"} +{"spark": "341db"} +{"spark": "342"} +{"spark": "343"} +{"spark": "350"} +{"spark": "351"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.execution.python.shims + +import java.io.DataOutputStream + +import org.apache.spark.api.python.ChainedPythonFunctions +import org.apache.spark.sql.execution.python.PythonUDFRunner + +object WritePythonUDFUtils { + def writeUDFs( + dataOut: DataOutputStream, + funcs: Seq[(ChainedPythonFunctions, Long)], + argOffsets: Array[Array[Int]], + profiler: Option[String] = None): Unit = { + PythonUDFRunner.writeUDFs(dataOut, funcs.map(_._1), argOffsets) + } +} diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala index f23229e0956..7fa269db71a 100644 --- a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala +++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.trees.Origin import org.apache.spark.sql.types.{DataType, Decimal, DecimalType} -object RapidsErrorUtils { +object RapidsErrorUtils extends RapidsQueryErrorUtils { def invalidArrayIndexError(index: Int, numElements: Int, isElementAtF: Boolean = false): ArrayIndexOutOfBoundsException = { // Follow the Spark string format before 3.3.0 diff --git a/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/shims/RapidsQueryErrorUtils.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/shims/RapidsQueryErrorUtils.scala new file mode 100644 index 00000000000..266cb4ef54f --- /dev/null +++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/shims/RapidsQueryErrorUtils.scala @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "311"} +{"spark": "312"} +{"spark": "313"} +spark-rapids-shim-json-lines ***/ + +package org.apache.spark.sql.rapids.shims + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.hive.ql.ErrorMsg + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.rapids.execution.RapidsAnalysisException +import org.apache.spark.sql.types.StructType + +trait RapidsQueryErrorUtils { + + def outputPathAlreadyExistsError(qualifiedOutputPath: Path): Throwable = { + new AnalysisException(s"path $qualifiedOutputPath already exists.") + } + + def createTableAsSelectWithNonEmptyDirectoryError(tablePath: String, conf: String): Throwable = { + new AnalysisException(s"CREATE-TABLE-AS-SELECT cannot create table with location to a " + + s"non-empty directory $tablePath. To allow overwriting the existing non-empty directory, " + + s"set '$conf' to true.") + } + + def cannotResolveAttributeError(name: String, outputStr: String): Throwable = { + new AnalysisException(s"Unable to resolve $name given [$outputStr]") + } + + def partitionColumnNotSpecifiedError(format: String, partitionColumn: String): Throwable = { + new AnalysisException(s"Failed to resolve the schema for $format for the partition column: " + + s"$partitionColumn. It must be specified manually.") + } + + def dataSchemaNotSpecifiedError(format: String): Throwable = { + new AnalysisException(s"Unable to infer schema for $format. It must be specified manually.") + } + + def schemaNotSpecifiedForSchemaRelationProviderError(className: String): Throwable = { + new AnalysisException(s"A schema needs to be specified when using $className.") + } + + def userSpecifiedSchemaMismatchActualSchemaError( + schema: StructType, + actualSchema: StructType): Throwable = { + new AnalysisException("The user-specified schema doesn't match the actual schema: " + + s"user-specified: ${schema.toDDL}, actual: ${actualSchema.toDDL}. If " + + "you're using DataFrameReader.schema API or creating a table, please do not " + + "specify the schema. Or if you're scanning an existed table, please drop " + + "it and re-create it.") + } + + def dataSchemaNotSpecifiedError(format: String, fileCatalog: String): Throwable = { + new AnalysisException(s"Unable to infer schema for $format at $fileCatalog. " + + "It must be specified manually") + } + + def invalidDataSourceError(className: String): Throwable = { + new AnalysisException(s"$className is not a valid Spark SQL Data Source.") + } + + def orcNotUsedWithHiveEnabledError(): Throwable = { + new AnalysisException( + s"Hive built-in ORC data source must be used with Hive support enabled. " + + s"Please use the native ORC data source by setting 'spark.sql.orc.impl' to 'native'.") + } + + def failedToFindAvroDataSourceError(provider: String): Throwable = { + new AnalysisException( + s"Failed to find data source: $provider. Avro is built-in but external data " + + "source module since Spark 2.4. Please deploy the application as per " + + "the deployment section of \"Apache Avro Data Source Guide\".") + } + + def failedToFindKafkaDataSourceError(provider: String): Throwable = { + new AnalysisException( + s"Failed to find data source: $provider. 
Please deploy the application as " + + "per the deployment section of " + + "\"Structured Streaming + Kafka Integration Guide\".") + } + + def findMultipleDataSourceError(provider: String, sourceNames: Seq[String]): Throwable = { + new AnalysisException( + s"Multiple sources found for $provider " + + s"(${sourceNames.mkString(", ")}), please specify the fully qualified class name.") + } + + def dataPathNotExistError(path: String): Throwable = { + new AnalysisException(s"Path does not exist: $path") + } + + def dynamicPartitionParentError: Throwable = { + throw new RapidsAnalysisException(ErrorMsg.PARTITION_DYN_STA_ORDER.getMsg) + } + + def tableOrViewAlreadyExistsError(tableName: String): Throwable = { + new AnalysisException(s"Table $tableName already exists. You need to drop it first.") + } + + def parquetTypeUnsupportedYetError(parquetType: String): Throwable = { + new AnalysisException(s"Parquet type not yet supported: $parquetType.") + } + + def illegalParquetTypeError(parquetType: String): Throwable = { + new AnalysisException(s"Illegal Parquet type: $parquetType.") + } +} \ No newline at end of file diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/misc.scala b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/shims/misc.scala similarity index 75% rename from sql-plugin/src/main/scala/org/apache/spark/sql/rapids/misc.scala rename to sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/shims/misc.scala index b32bdfa207c..1ab58ddcbb6 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/misc.scala +++ b/sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/shims/misc.scala @@ -13,10 +13,36 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +/*** spark-rapids-shim-json-lines +{"spark": "311"} +{"spark": "312"} +{"spark": "313"} +{"spark": "320"} +{"spark": "321"} +{"spark": "321cdh"} +{"spark": "322"} +{"spark": "323"} +{"spark": "324"} +{"spark": "330"} +{"spark": "330cdh"} +{"spark": "330db"} +{"spark": "331"} +{"spark": "332"} +{"spark": "332cdh"} +{"spark": "332db"} +{"spark": "333"} +{"spark": "334"} +{"spark": "340"} +{"spark": "341"} +{"spark": "341db"} +{"spark": "342"} +{"spark": "343"} +{"spark": "350"} +{"spark": "351"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.shims -package org.apache.spark.sql.rapids - -import ai.rapids.cudf.{ColumnVector} +import ai.rapids.cudf.ColumnVector import com.nvidia.spark.rapids.{GpuColumnVector, GpuUnaryExpression} import com.nvidia.spark.rapids.Arm.withResource diff --git a/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/GpuOrcDataReader.scala b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/GpuOrcDataReader.scala index 8c82074b8f5..aec35945b4e 100644 --- a/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/GpuOrcDataReader.scala +++ b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/GpuOrcDataReader.scala @@ -33,101 +33,17 @@ {"spark": "343"} {"spark": "350"} {"spark": "351"} -{"spark": "400"} spark-rapids-shim-json-lines ***/ package com.nvidia.spark.rapids.shims -import java.io.EOFException -import java.nio.ByteBuffer -import java.nio.channels.SeekableByteChannel - -import ai.rapids.cudf.HostMemoryBuffer -import com.nvidia.spark.rapids.Arm.closeOnExcept import com.nvidia.spark.rapids.GpuMetric -import com.nvidia.spark.rapids.filecache.FileCache import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.hive.common.io.DiskRangeList -import org.apache.orc.OrcProto -import org.apache.orc.impl.{BufferChunk, BufferChunkList, DataReaderProperties, InStream, OrcCodecPool} +import org.apache.orc.impl.DataReaderProperties class GpuOrcDataReader( props: DataReaderProperties, conf: Configuration, - metrics: Map[String, GpuMetric]) extends GpuOrcDataReaderBase(props, conf, metrics) { - - private class BufferChunkLoader(useDirect: Boolean) extends BlockLoader { - override def loadRemoteBlocks( - baseOffset: Long, - first: DiskRangeList, - last: DiskRangeList, - data: ByteBuffer): DiskRangeList = { - var current = first - val offset = current.getOffset - while (current ne last.next) { - val buffer = if (current eq last) data else data.duplicate() - buffer.position((current.getOffset - offset).toInt) - buffer.limit((current.getEnd - offset).toInt) - current.asInstanceOf[BufferChunk].setChunk(buffer) - // see if the filecache wants any of this data - val cacheToken = FileCache.get.startDataRangeCache(filePathString, - baseOffset + current.getOffset, current.getLength, conf) - cacheToken.foreach { token => - val hmb = closeOnExcept(HostMemoryBuffer.allocate(current.getLength, false)) { hmb => - hmb.setBytes(0, buffer.array(), - buffer.arrayOffset() + buffer.position(), current.getLength) - hmb - } - token.complete(hmb) - } - current = current.next - } - current - } - - override def loadCachedBlock( - chunk: DiskRangeList, - channel: SeekableByteChannel): DiskRangeList = { - val buffer = if (useDirect) { - ByteBuffer.allocateDirect(chunk.getLength) - } else { - ByteBuffer.allocate(chunk.getLength) - } - while (buffer.remaining() > 0) { - if (channel.read(buffer) < 0) { - throw new EOFException(s"Unexpected EOF while reading cached block for $filePathString") - } - } - buffer.flip() - 
chunk.asInstanceOf[BufferChunk].setChunk(buffer) - chunk - } - } - - override protected def parseStripeFooter(buf: ByteBuffer, size: Int): OrcProto.StripeFooter = { - OrcProto.StripeFooter.parseFrom( - InStream.createCodedInputStream(InStream.create("footer", - new BufferChunk(buf, 0), 0, size, compression))) - } - - override def getCompressionOptions: InStream.StreamOptions = compression - - override def readFileData(chunks: BufferChunkList, forceDirect: Boolean): BufferChunkList = { - if (chunks != null) { - readDiskRanges(chunks.get, 0, new BufferChunkLoader(forceDirect)) - } - chunks - } - - override def close(): Unit = { - if (compression.getCodec != null) { - if (compression.getCodec != null) { - OrcCodecPool.returnCodec(compression.getCodec.getKind, compression.getCodec) - compression.withCodec(null) - } - } - super.close() - } -} + metrics: Map[String, GpuMetric]) extends GpuOrcDataReader320Plus(props, conf, metrics) object GpuOrcDataReader { // File cache is being used, so we want read ranges that can be cached separately diff --git a/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/GpuOrcDataReader320Plus.scala b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/GpuOrcDataReader320Plus.scala new file mode 100644 index 00000000000..e28f7001a2b --- /dev/null +++ b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/GpuOrcDataReader320Plus.scala @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
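For context on the headers that appear throughout these files: each shim source carries a `spark-rapids-shim-json-lines` comment listing the Spark builds it participates in, and the build tooling appears to use these JSON lines to decide which Spark versions a file is compiled for. Several hunks in this patch only add or drop entries such as `{"spark": "400"}` from that list. The snippet below reproduces the header format as it appears in this patch (version values shown are just examples taken from nearby files).

```scala
/*** spark-rapids-shim-json-lines
{"spark": "350"}
{"spark": "351"}
{"spark": "400"}
spark-rapids-shim-json-lines ***/
```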
+ */ +/*** spark-rapids-shim-json-lines +{"spark": "320"} +{"spark": "321"} +{"spark": "322"} +{"spark": "323"} +{"spark": "324"} +{"spark": "330"} +{"spark": "330db"} +{"spark": "331"} +{"spark": "332"} +{"spark": "332db"} +{"spark": "333"} +{"spark": "334"} +{"spark": "340"} +{"spark": "341"} +{"spark": "341db"} +{"spark": "342"} +{"spark": "343"} +{"spark": "350"} +{"spark": "351"} +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import java.io.EOFException +import java.nio.ByteBuffer +import java.nio.channels.SeekableByteChannel + +import ai.rapids.cudf.HostMemoryBuffer +import com.nvidia.spark.rapids.Arm.closeOnExcept +import com.nvidia.spark.rapids.GpuMetric +import com.nvidia.spark.rapids.filecache.FileCache +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.hive.common.io.DiskRangeList +import org.apache.orc.OrcProto +import org.apache.orc.impl.{BufferChunk, BufferChunkList, DataReaderProperties, InStream, OrcCodecPool} + +abstract class GpuOrcDataReader320Plus( + props: DataReaderProperties, + conf: Configuration, + metrics: Map[String, GpuMetric]) extends GpuOrcDataReaderBase(props, conf, metrics) { + + private class BufferChunkLoader(useDirect: Boolean) extends BlockLoader { + override def loadRemoteBlocks( + baseOffset: Long, + first: DiskRangeList, + last: DiskRangeList, + data: ByteBuffer): DiskRangeList = { + var current = first + val offset = current.getOffset + while (current ne last.next) { + val buffer = if (current eq last) data else data.duplicate() + buffer.position((current.getOffset - offset).toInt) + buffer.limit((current.getEnd - offset).toInt) + current.asInstanceOf[BufferChunk].setChunk(buffer) + // see if the filecache wants any of this data + val cacheToken = FileCache.get.startDataRangeCache(filePathString, + baseOffset + current.getOffset, current.getLength, conf) + cacheToken.foreach { token => + val hmb = closeOnExcept(HostMemoryBuffer.allocate(current.getLength, false)) { hmb => + hmb.setBytes(0, buffer.array(), + buffer.arrayOffset() + buffer.position(), current.getLength) + hmb + } + token.complete(hmb) + } + current = current.next + } + current + } + + override def loadCachedBlock( + chunk: DiskRangeList, + channel: SeekableByteChannel): DiskRangeList = { + val buffer = if (useDirect) { + ByteBuffer.allocateDirect(chunk.getLength) + } else { + ByteBuffer.allocate(chunk.getLength) + } + while (buffer.remaining() > 0) { + if (channel.read(buffer) < 0) { + throw new EOFException(s"Unexpected EOF while reading cached block for $filePathString") + } + } + buffer.flip() + chunk.asInstanceOf[BufferChunk].setChunk(buffer) + chunk + } + } + + override protected def parseStripeFooter(buf: ByteBuffer, size: Int): OrcProto.StripeFooter = { + OrcProto.StripeFooter.parseFrom( + InStream.createCodedInputStream(InStream.create("footer", + new BufferChunk(buf, 0), 0, size, compression))) + } + + override def getCompressionOptions: InStream.StreamOptions = compression + + override def readFileData(chunks: BufferChunkList, forceDirect: Boolean): BufferChunkList = { + if (chunks != null) { + readDiskRanges(chunks.get, 0, new BufferChunkLoader(forceDirect)) + } + chunks + } + + override def close(): Unit = { + if (compression.getCodec != null) { + if (compression.getCodec != null) { + OrcCodecPool.returnCodec(compression.getCodec.getKind, compression.getCodec) + compression.withCodec(null) + } + } + super.close() + } +} \ No newline at end of file diff --git 
a/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/ParquetSchemaClipShims.scala b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/ParquetSchemaClipShims.scala index c3152a8a235..bba205f267f 100644 --- a/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/ParquetSchemaClipShims.scala +++ b/sql-plugin/src/main/spark320/scala/com/nvidia/spark/rapids/shims/ParquetSchemaClipShims.scala @@ -29,7 +29,8 @@ import org.apache.parquet.schema.LogicalTypeAnnotation._ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.execution.TrampolineUtil +import org.apache.spark.sql.rapids.execution.RapidsAnalysisException +import org.apache.spark.sql.rapids.shims.RapidsErrorUtils import org.apache.spark.sql.types._ object ParquetSchemaClipShims { @@ -67,10 +68,10 @@ object ParquetSchemaClipShims { if (typeAnnotation == null) s"$typeName" else s"$typeName ($typeAnnotation)" def typeNotImplemented() = - TrampolineUtil.throwAnalysisException(s"Parquet type not yet supported: $typeString") + throw RapidsErrorUtils.parquetTypeUnsupportedYetError(typeString) def illegalType() = - TrampolineUtil.throwAnalysisException(s"Illegal Parquet type: $typeString") + throw RapidsErrorUtils.illegalParquetTypeError(typeString) // When maxPrecision = -1, we skip precision range check, and always respect the precision // specified in field.getDecimalMetadata. This is useful when interpreting decimal types stored @@ -82,7 +83,7 @@ object ParquetSchemaClipShims { val scale = decimalLogicalTypeAnnotation.getScale if (!(maxPrecision == -1 || 1 <= precision && precision <= maxPrecision)) { - TrampolineUtil.throwAnalysisException( + throw new RapidsAnalysisException( s"Invalid decimal precision: $typeName " + s"cannot store $precision digits (max $maxPrecision)") } @@ -143,14 +144,14 @@ object ParquetSchemaClipShims { TimestampType case timestamp: TimestampLogicalTypeAnnotation if timestamp.getUnit == TimeUnit.NANOS && ParquetLegacyNanoAsLongShims.legacyParquetNanosAsLong => - TrampolineUtil.throwAnalysisException( + throw new RapidsAnalysisException( "GPU does not support spark.sql.legacy.parquet.nanosAsLong") case _ => illegalType() } case INT96 => if (!SQLConf.get.isParquetINT96AsTimestamp) { - TrampolineUtil.throwAnalysisException( + throw new RapidsAnalysisException( "INT96 is not supported unless it's interpreted as timestamp. " + s"Please try to set ${SQLConf.PARQUET_INT96_AS_TIMESTAMP.key} to true.") } diff --git a/sql-plugin/src/main/spark320/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark320/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala index b301397255a..68a6ce30569 100644 --- a/sql-plugin/src/main/spark320/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala +++ b/sql-plugin/src/main/spark320/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,7 +29,7 @@ import org.apache.spark.sql.catalyst.trees.Origin import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.types.{DataType, Decimal, DecimalType} -object RapidsErrorUtils { +object RapidsErrorUtils extends RapidsQueryErrorUtils { def invalidArrayIndexError(index: Int, numElements: Int, isElementAtF: Boolean = false): ArrayIndexOutOfBoundsException = { // Follow the Spark string format before 3.3.0 diff --git a/sql-plugin/src/main/spark320/scala/org/apache/spark/sql/rapids/shims/RapidsQueryErrorUtils.scala b/sql-plugin/src/main/spark320/scala/org/apache/spark/sql/rapids/shims/RapidsQueryErrorUtils.scala new file mode 100644 index 00000000000..dbc4145ee54 --- /dev/null +++ b/sql-plugin/src/main/spark320/scala/org/apache/spark/sql/rapids/shims/RapidsQueryErrorUtils.scala @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "320"} +{"spark": "321"} +{"spark": "321cdh"} +{"spark": "322"} +{"spark": "323"} +{"spark": "324"} +{"spark": "330"} +{"spark": "330cdh"} +{"spark": "330db"} +{"spark": "331"} +{"spark": "332"} +{"spark": "332cdh"} +{"spark": "332db"} +{"spark": "333"} +{"spark": "334"} +{"spark": "340"} +{"spark": "341"} +{"spark": "341db"} +{"spark": "342"} +{"spark": "343"} +{"spark": "350"} +{"spark": "351"} +{"spark": "400"} +spark-rapids-shim-json-lines ***/ + +package org.apache.spark.sql.rapids.shims + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.hive.ql.ErrorMsg + +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.rapids.execution.RapidsAnalysisException +import org.apache.spark.sql.types.StructType + +trait RapidsQueryErrorUtils { + + def outputPathAlreadyExistsError(qualifiedOutputPath: Path): Throwable = { + QueryCompilationErrors.outputPathAlreadyExistsError(qualifiedOutputPath) + } + + def createTableAsSelectWithNonEmptyDirectoryError(tablePath: String, conf: String): Throwable = { + QueryCompilationErrors.createTableAsSelectWithNonEmptyDirectoryError(tablePath) + } + + def cannotResolveAttributeError(name: String, outputStr: String): Throwable = { + QueryCompilationErrors.cannotResolveAttributeError(name, outputStr) + } + + def partitionColumnNotSpecifiedError(format: String, partitionColumn: String): Throwable = { + QueryCompilationErrors.partitionColumnNotSpecifiedError(format, partitionColumn) + } + + def dataSchemaNotSpecifiedError(format: String): Throwable = { + QueryCompilationErrors.dataSchemaNotSpecifiedError(format) + } + + def schemaNotSpecifiedForSchemaRelationProviderError(className: String): Throwable = { + QueryCompilationErrors.schemaNotSpecifiedForSchemaRelationProviderError(className) + } + + def userSpecifiedSchemaMismatchActualSchemaError( + schema: StructType, + actualSchema: StructType): Throwable = { + QueryCompilationErrors.userSpecifiedSchemaMismatchActualSchemaError(schema, actualSchema) + } + + def 
dataSchemaNotSpecifiedError(format: String, fileCatalog: String): Throwable = { + QueryCompilationErrors.dataSchemaNotSpecifiedError(format, fileCatalog) + } + + def invalidDataSourceError(className: String): Throwable = { + QueryCompilationErrors.invalidDataSourceError(className) + } + + def orcNotUsedWithHiveEnabledError(): Throwable = { + QueryCompilationErrors.orcNotUsedWithHiveEnabledError() + } + + def failedToFindAvroDataSourceError(provider: String): Throwable = { + QueryCompilationErrors.failedToFindAvroDataSourceError(provider) + } + + def failedToFindKafkaDataSourceError(provider: String): Throwable = { + QueryCompilationErrors.failedToFindKafkaDataSourceError(provider) + } + + def findMultipleDataSourceError(provider: String, sourceNames: Seq[String]): Throwable = { + QueryCompilationErrors.findMultipleDataSourceError(provider, sourceNames) + } + + def dataPathNotExistError(path: String): Throwable = { + QueryCompilationErrors.dataPathNotExistError(path) + } + + def tableOrViewAlreadyExistsError(tableName: String): Throwable = { + QueryCompilationErrors.tableOrViewAlreadyExistsError(tableName) + } + + def parquetTypeUnsupportedYetError(parquetType: String): Throwable = { + QueryCompilationErrors.parquetTypeUnsupportedYetError(parquetType) + } + + def illegalParquetTypeError(parquetType: String): Throwable = { + QueryCompilationErrors.illegalParquetTypeError(parquetType) + } + + def dynamicPartitionParentError: Throwable = { + throw new RapidsAnalysisException(ErrorMsg.PARTITION_DYN_STA_ORDER.getMsg) + } +} \ No newline at end of file diff --git a/sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/ParquetSchemaClipShims.scala b/sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/ParquetSchemaClipShims.scala index 56708017a23..8c395274e07 100644 --- a/sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/ParquetSchemaClipShims.scala +++ b/sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/ParquetSchemaClipShims.scala @@ -44,7 +44,8 @@ import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ import org.apache.spark.sql.execution.datasources.parquet.ParquetReadSupport.containsFieldIds import org.apache.spark.sql.execution.datasources.parquet.ParquetUtils import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.execution.TrampolineUtil +import org.apache.spark.sql.rapids.execution.RapidsAnalysisException +import org.apache.spark.sql.rapids.shims.RapidsErrorUtils import org.apache.spark.sql.types._ object ParquetSchemaClipShims { @@ -109,10 +110,11 @@ object ParquetSchemaClipShims { if (typeAnnotation == null) s"$typeName" else s"$typeName ($typeAnnotation)" def typeNotImplemented() = - TrampolineUtil.throwAnalysisException(s"Parquet type not yet supported: $typeString") + throw RapidsErrorUtils.parquetTypeUnsupportedYetError(typeString) def illegalType() = - TrampolineUtil.throwAnalysisException(s"Illegal Parquet type: $parquetType") + throw RapidsErrorUtils.illegalParquetTypeError(typeString) + // When maxPrecision = -1, we skip precision range check, and always respect the precision // specified in field.getDecimalMetadata. 
This is useful when interpreting decimal types stored @@ -124,7 +126,7 @@ object ParquetSchemaClipShims { val scale = decimalLogicalTypeAnnotation.getScale if (!(maxPrecision == -1 || 1 <= precision && precision <= maxPrecision)) { - TrampolineUtil.throwAnalysisException(s"Invalid decimal precision: $typeName " + + throw new RapidsAnalysisException(s"Invalid decimal precision: $typeName " + s"cannot store $precision digits (max $maxPrecision)") } @@ -183,14 +185,14 @@ object ParquetSchemaClipShims { ParquetTimestampAnnotationShims.timestampTypeForMillisOrMicros(timestamp) case timestamp: TimestampLogicalTypeAnnotation if timestamp.getUnit == TimeUnit.NANOS && ParquetLegacyNanoAsLongShims.legacyParquetNanosAsLong => - TrampolineUtil.throwAnalysisException( + throw new RapidsAnalysisException( "GPU does not support spark.sql.legacy.parquet.nanosAsLong") case _ => illegalType() } case INT96 => if (!SQLConf.get.isParquetINT96AsTimestamp) { - TrampolineUtil.throwAnalysisException( + throw new RapidsAnalysisException( "INT96 is not supported unless it's interpreted as timestamp. " + s"Please try to set ${SQLConf.PARQUET_INT96_AS_TIMESTAMP.key} to true.") } diff --git a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/GpuPythonMapInArrowExec.scala b/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/GpuPythonMapInArrowExec.scala index 768261cbc89..5118c21ff2e 100644 --- a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/GpuPythonMapInArrowExec.scala +++ b/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/GpuPythonMapInArrowExec.scala @@ -31,7 +31,6 @@ {"spark": "343"} {"spark": "350"} {"spark": "351"} -{"spark": "400"} spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.shims diff --git a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala index bb28c370749..e5cdcd43568 100644 --- a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala +++ b/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
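The `ParquetSchemaClipShims` hunks above replace `TrampolineUtil.throwAnalysisException(...)` (a helper that throws internally) with either `throw RapidsErrorUtils.<error>(...)` or `throw new RapidsAnalysisException(...)`. The sketch below is a self-contained illustration of that control-flow shift only; `ThrowHelperDemo`, `FactoryDemo`, and `ParquetClipDemo` are hypothetical stand-ins, and plain JDK exceptions are used so the example compiles without Spark on the classpath.

```scala
// Old style: the helper itself throws, so nothing useful is returned to the caller.
object ThrowHelperDemo {
  def throwAnalysisExceptionDemo(msg: String): Unit = throw new IllegalArgumentException(msg)
}

// New style: the factory returns a Throwable and the call site throws it explicitly,
// which keeps `throw` visible where it happens and lets each shim choose the exception type.
object FactoryDemo {
  def illegalParquetTypeErrorDemo(parquetType: String): Throwable =
    new IllegalArgumentException(s"Illegal Parquet type: $parquetType.")
}

object ParquetClipDemo {
  def illegalTypeDemo(typeString: String): Nothing =
    throw FactoryDemo.illegalParquetTypeErrorDemo(typeString)
}
```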
@@ -31,7 +31,7 @@ import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, Decimal, DecimalType} -object RapidsErrorUtils extends RapidsErrorUtilsFor330plus { +object RapidsErrorUtils extends RapidsErrorUtilsFor330plus with RapidsQueryErrorUtils { def mapKeyNotExistError( key: String, diff --git a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala b/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala index cb8eef809f3..a6338e7adc5 100644 --- a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala +++ b/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala @@ -30,7 +30,6 @@ import com.nvidia.spark.rapids.GpuSemaphore import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.api.python._ -import org.apache.spark.sql.execution.python.PythonUDFRunner import org.apache.spark.sql.rapids.execution.python.{GpuArrowPythonWriter, GpuPythonRunnerCommon} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -50,7 +49,7 @@ import org.apache.spark.util.Utils * more data being sent. */ class GpuGroupUDFArrowPythonRunner( - funcs: Seq[ChainedPythonFunctions], + funcs: Seq[(ChainedPythonFunctions, Long)], evalType: Int, argOffsets: Array[Array[Int]], pythonInSchema: StructType, @@ -59,8 +58,8 @@ class GpuGroupUDFArrowPythonRunner( maxBatchSize: Long, override val pythonOutSchema: StructType, jobArtifactUUID: Option[String] = None) - extends GpuBasePythonRunner[ColumnarBatch](funcs, evalType, argOffsets, jobArtifactUUID) - with GpuArrowPythonOutput with GpuPythonRunnerCommon { + extends GpuBasePythonRunner[ColumnarBatch](funcs.map(_._1), evalType, argOffsets, + jobArtifactUUID) with GpuArrowPythonOutput with GpuPythonRunnerCommon { protected override def newWriterThread( env: SparkEnv, @@ -72,7 +71,7 @@ class GpuGroupUDFArrowPythonRunner( val arrowWriter = new GpuArrowPythonWriter(pythonInSchema, maxBatchSize) { override protected def writeUDFs(dataOut: DataOutputStream): Unit = { - PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets) + WritePythonUDFUtils.writeUDFs(dataOut, funcs, argOffsets) } } diff --git a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupedPythonRunnerFactory.scala b/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupedPythonRunnerFactory.scala index 451de0a2527..313ea6c20a2 100644 --- a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupedPythonRunnerFactory.scala +++ b/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupedPythonRunnerFactory.scala @@ -27,7 +27,7 @@ import org.apache.spark.sql.vectorized.ColumnarBatch case class GpuGroupedPythonRunnerFactory( conf: org.apache.spark.sql.internal.SQLConf, - chainedFunc: Seq[ChainedPythonFunctions], + chainedFunc: Seq[(ChainedPythonFunctions, Long)], argOffsets: Array[Array[Int]], dedupAttrs: StructType, pythonOutputSchema: StructType, diff --git a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala 
index 1012b28d8b7..7e58a54c921 100644 --- a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala +++ b/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,7 +22,7 @@ package org.apache.spark.sql.rapids.shims import org.apache.spark.sql.errors.QueryExecutionErrors -object RapidsErrorUtils extends RapidsErrorUtilsBase { +object RapidsErrorUtils extends RapidsErrorUtilsBase with RapidsQueryErrorUtils { def sqlArrayIndexNotStartAtOneError(): RuntimeException = { QueryExecutionErrors.elementAtByIndexZeroError(context = null) } diff --git a/sql-plugin/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/GpuInsertIntoHiveTable.scala b/sql-plugin/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/GpuInsertIntoHiveTable.scala index 9105ab50e1e..42fd5941025 100644 --- a/sql-plugin/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/GpuInsertIntoHiveTable.scala +++ b/sql-plugin/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/GpuInsertIntoHiveTable.scala @@ -37,7 +37,7 @@ import org.apache.hadoop.hive.ql.ErrorMsg import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.spark.SparkException -import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, ExternalCatalog, ExternalCatalogUtils, ExternalCatalogWithListener} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan @@ -47,7 +47,8 @@ import org.apache.spark.sql.execution.command.CommandUtils import org.apache.spark.sql.hive.HiveExternalCatalog import org.apache.spark.sql.hive.client.HiveClientImpl import org.apache.spark.sql.hive.execution.InsertIntoHiveTable -import org.apache.spark.sql.hive.rapids.{GpuHiveTextFileFormat, GpuSaveAsHiveFile, RapidsHiveErrors} +import org.apache.spark.sql.hive.rapids.{GpuHiveFileFormat, GpuSaveAsHiveFile, RapidsHiveErrors} +import org.apache.spark.sql.rapids.shims.RapidsErrorUtils import org.apache.spark.sql.vectorized.ColumnarBatch final class GpuInsertIntoHiveTableMeta(cmd: InsertIntoHiveTable, @@ -59,16 +60,17 @@ final class GpuInsertIntoHiveTableMeta(cmd: InsertIntoHiveTable, private var fileFormat: Option[ColumnarFileFormat] = None override def tagSelfForGpuInternal(): Unit = { - // Only Hive delimited text writes are currently supported. - // Check whether that is the format currently in play. - fileFormat = GpuHiveTextFileFormat.tagGpuSupport(this) + fileFormat = GpuHiveFileFormat.tagGpuSupport(this) } override def convertToGpu(): GpuDataWritingCommand = { + val format = fileFormat.getOrElse( + throw new IllegalStateException("fileFormat missing, tagSelfForGpu not called?")) + GpuInsertIntoHiveTable( table = wrapped.table, partition = wrapped.partition, - fileFormat = this.fileFormat.get, + fileFormat = format, query = wrapped.query, overwrite = wrapped.overwrite, ifPartitionNotExists = wrapped.ifPartitionNotExists, @@ -127,7 +129,7 @@ case class GpuInsertIntoHiveTable( } // un-cache this table. 
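One detail worth noting in the `GpuInsertIntoHiveTableMeta` hunk above: `convertToGpu` no longer calls `.get` on the `fileFormat` option but resolves it with `getOrElse` and a descriptive `IllegalStateException`. A minimal sketch of that option-handling shape follows; `MetaDemo` and its members are hypothetical, not the real plugin types.

```scala
// Hypothetical sketch: fail fast with a descriptive message when the tagging step
// did not populate the option, instead of surfacing a bare NoSuchElementException.
final case class MetaDemo(fileFormat: Option[String]) {
  def convertDemo(): String = {
    val format = fileFormat.getOrElse(
      throw new IllegalStateException("fileFormat missing, tagSelfForGpu not called?"))
    s"converted with $format"
  }
}
```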
- CommandUtils.uncacheTableOrView(sparkSession, table.identifier.quotedString) + CommandUtilsShim.uncacheTableOrView(sparkSession, table.identifier) sparkSession.sessionState.catalog.refreshTable(table.identifier) CommandUtils.updateTableStats(sparkSession, table) @@ -181,7 +183,7 @@ case class GpuInsertIntoHiveTable( // Report error if any static partition appears after a dynamic partition val isDynamic = partitionColumnNames.map(partitionSpec(_).isEmpty) if (isDynamic.init.zip(isDynamic.tail).contains((true, false))) { - throw new AnalysisException(ErrorMsg.PARTITION_DYN_STA_ORDER.getMsg) + throw RapidsErrorUtils.dynamicPartitionParentError } } @@ -315,8 +317,10 @@ case class GpuInsertIntoHiveTable( if (!fs.delete(path, true)) { throw RapidsHiveErrors.cannotRemovePartitionDirError(path) } - // Don't let Hive do overwrite operation since it is slower. - doHiveOverwrite = false + // Don't let Hive do overwrite operation since it is slower. But still give a + // chance to forcely override this for some customized cases when this + // operation is optimized. + doHiveOverwrite = hadoopConf.getBoolean("hive.movetask.enable.dir.move", false) } } } diff --git a/sql-plugin/src/main/spark332db/scala/org/apache/spark/sql/rapids/GpuFileFormatWriter.scala b/sql-plugin/src/main/spark332db/scala/org/apache/spark/sql/rapids/GpuFileFormatWriter.scala index 78daa0bf6f1..e7b3561f5fd 100644 --- a/sql-plugin/src/main/spark332db/scala/org/apache/spark/sql/rapids/GpuFileFormatWriter.scala +++ b/sql-plugin/src/main/spark332db/scala/org/apache/spark/sql/rapids/GpuFileFormatWriter.scala @@ -42,7 +42,7 @@ import org.apache.spark.{SparkException, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.internal.io.{FileCommitProtocol, SparkHadoopWriterUtils} import org.apache.spark.shuffle.FetchFailedException -import org.apache.spark.sql.{AnalysisException, SparkSession} +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, AttributeSet, Expression, SortOrder} @@ -51,6 +51,7 @@ import org.apache.spark.sql.connector.write.WriterCommitMessage import org.apache.spark.sql.execution.{SparkPlan, SQLExecution} import org.apache.spark.sql.execution.datasources.{GpuWriteFiles, GpuWriteFilesExec, GpuWriteFilesSpec, WriteTaskResult, WriteTaskStats} import org.apache.spark.sql.execution.datasources.FileFormatWriter.OutputSpec +import org.apache.spark.sql.rapids.execution.RapidsAnalysisException import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.util.{SerializableConfiguration, Utils} @@ -61,7 +62,7 @@ object GpuFileFormatWriter extends Logging { private def verifySchema(format: ColumnarFileFormat, schema: StructType): Unit = { schema.foreach { field => if (!format.supportDataType(field.dataType)) { - throw new AnalysisException( + throw new RapidsAnalysisException( s"$format data source does not support ${field.dataType.catalogString} data type.") } } diff --git a/sql-plugin/src/main/spark332db/scala/org/apache/spark/sql/rapids/shims/GpuCreateDataSourceTableAsSelectCommandShims.scala b/sql-plugin/src/main/spark332db/scala/org/apache/spark/sql/rapids/shims/GpuCreateDataSourceTableAsSelectCommandShims.scala index 9e36cf41fad..6308f24c552 100644 --- a/sql-plugin/src/main/spark332db/scala/org/apache/spark/sql/rapids/shims/GpuCreateDataSourceTableAsSelectCommandShims.scala +++ 
b/sql-plugin/src/main/spark332db/scala/org/apache/spark/sql/rapids/shims/GpuCreateDataSourceTableAsSelectCommandShims.scala @@ -64,7 +64,7 @@ case class GpuCreateDataSourceTableAsSelectCommand( s"Expect the table $tableName has been dropped when the save mode is Overwrite") if (mode == SaveMode.ErrorIfExists) { - throw new AnalysisException(s"Table $tableName already exists. You need to drop it first.") + throw RapidsErrorUtils.tableOrViewAlreadyExistsError(tableName) } if (mode == SaveMode.Ignore) { // Since the table already exists and the save mode is Ignore, we will just return. diff --git a/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/GpuBatchScanExec.scala b/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/GpuBatchScanExec.scala index 39f42d8b833..5fb252524fd 100644 --- a/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/GpuBatchScanExec.scala +++ b/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/GpuBatchScanExec.scala @@ -22,7 +22,6 @@ {"spark": "343"} {"spark": "350"} {"spark": "351"} -{"spark": "400"} spark-rapids-shim-json-lines ***/ package com.nvidia.spark.rapids.shims diff --git a/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala b/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala index ca2fa215892..62fe32ae8db 100644 --- a/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala +++ b/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala @@ -19,27 +19,7 @@ {"spark": "341"} {"spark": "342"} {"spark": "343"} -{"spark": "350"} -{"spark": "351"} -{"spark": "400"} spark-rapids-shim-json-lines ***/ package com.nvidia.spark.rapids.shims -import org.apache.spark.paths.SparkPath -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.datasources.PartitionedFile - -object PartitionedFileUtilsShim { - // Wrapper for case class constructor so Java code can access - // the default values across Spark versions. - def newPartitionedFile( - partitionValues: InternalRow, - filePath: String, - start: Long, - length: Long): PartitionedFile = PartitionedFile(partitionValues, - SparkPath.fromPathString(filePath), start, length) - - def withNewLocations(pf: PartitionedFile, locations: Seq[String]): PartitionedFile = { - pf.copy(locations = locations.toArray) - } -} +object PartitionedFileUtilsShim extends PartitionedFileUtilsShimBase diff --git a/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShimBase.scala b/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShimBase.scala new file mode 100644 index 00000000000..a94c76dc083 --- /dev/null +++ b/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShimBase.scala @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "340"} +{"spark": "341"} +{"spark": "342"} +{"spark": "343"} +{"spark": "350"} +{"spark": "351"} +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import org.apache.spark.paths.SparkPath +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.datasources.PartitionedFile + +trait PartitionedFileUtilsShimBase { + + // Wrapper for case class constructor so Java code can access + // the default values across Spark versions. + def newPartitionedFile(partitionValues: InternalRow, + filePath: String, + start: Long, + length: Long): PartitionedFile = PartitionedFile(partitionValues, + SparkPath.fromPathString(filePath), start, length) + + def withNewLocations(pf: PartitionedFile, locations: Seq[String]): PartitionedFile = { + pf.copy(locations = locations.toArray) + } +} diff --git a/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala index 8ee0485ab36..e6f8886f19c 100644 --- a/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala +++ b/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -31,7 +31,7 @@ import org.apache.spark.sql.errors.QueryExecutionErrors import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, Decimal, DecimalType} -object RapidsErrorUtils extends RapidsErrorUtilsFor330plus { +object RapidsErrorUtils extends RapidsErrorUtilsFor330plus with RapidsQueryErrorUtils { def mapKeyNotExistError( key: String, diff --git a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala index 249502f1b49..0f1bdafde7a 100644 --- a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala +++ b/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,8 +20,10 @@ spark-rapids-shim-json-lines ***/ package com.nvidia.spark.rapids.shims import org.apache.spark.paths.SparkPath +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.execution.datasources.PartitionedFile +import org.apache.spark.sql.execution.PartitionedFileUtil +import org.apache.spark.sql.execution.datasources.{FileStatusWithMetadata, PartitionedFile} object PartitionedFileUtilsShim { // Wrapper for case class constructor so Java code can access @@ -37,4 +39,14 @@ object PartitionedFileUtilsShim { def withNewLocations(pf: PartitionedFile, locations: Seq[String]): PartitionedFile = { pf.copy(locations = locations) } + + // In Spark 4.0, PartitionedFileUtil.splitFiles lost its `sparkSession` parameter. + // This pre-Spark-4.0 shim keeps the `sparkSession` parameter. 
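The `PartitionedFileUtilsShim` changes in this patch push shared behaviour into a `PartitionedFileUtilsShimBase` trait and route callers (see the `SplitFiles` hunk further below) through `PartitionedFileUtilsShim.splitFiles` rather than Spark's `PartitionedFileUtil.splitFiles`, since the comment above notes the `sparkSession` parameter goes away in Spark 4.0. The following self-contained sketch mirrors that facade pattern with hypothetical names; it is not the real shim API.

```scala
// Shared behaviour lives in a base trait; each Spark build supplies one facade object.
trait FileShimBaseDemo {
  def newPartitionedFileDemo(path: String, start: Long, length: Long): String =
    s"$path@$start+$length"
}

// A pre-4.0-style facade that still accepts the extra "session" argument. Callers always go
// through the facade, so only this object changes when the upstream signature changes.
object FileShimDemo extends FileShimBaseDemo {
  def splitFilesDemo(session: String, path: String, maxSplitBytes: Long): Seq[String] =
    Seq(newPartitionedFileDemo(path, 0L, maxSplitBytes)) // placeholder single split
}
```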
+ def splitFiles(sparkSession: SparkSession, + file: FileStatusWithMetadata, + isSplitable: Boolean, + maxSplitBytes: Long, + partitionValues: InternalRow): Seq[PartitionedFile] = { + PartitionedFileUtil.splitFiles(sparkSession, file, isSplitable, maxSplitBytes, partitionValues) + } } diff --git a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/execution/rapids/shims/SplitFiles.scala b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/execution/rapids/shims/SplitFiles.scala index 3b94d5a5201..1934cb6af9f 100644 --- a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/execution/rapids/shims/SplitFiles.scala +++ b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/execution/rapids/shims/SplitFiles.scala @@ -23,12 +23,12 @@ spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.execution.rapids.shims +import com.nvidia.spark.rapids.shims.PartitionedFileUtilsShim import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.io.compress.{CompressionCodecFactory, SplittableCompressionCodec} import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.execution.PartitionedFileUtil import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, PartitionDirectory, PartitionedFile} trait SplitFiles { @@ -49,7 +49,7 @@ trait SplitFiles { selectedPartitions.flatMap { partition => partition.files.flatMap { f => - PartitionedFileUtil.splitFiles( + PartitionedFileUtilsShim.splitFiles( sparkSession, f, isSplitable = canBeSplit(f.getPath, hadoopConf), @@ -71,7 +71,7 @@ trait SplitFiles { val filePath = file.getPath val isSplitable = relation.fileFormat.isSplitable( relation.sparkSession, relation.options, filePath) - PartitionedFileUtil.splitFiles( + PartitionedFileUtilsShim.splitFiles( sparkSession = relation.sparkSession, file = file, isSplitable = isSplitable, diff --git a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuArrowPythonRunner.scala b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuArrowPythonRunner.scala index ac58baa2eb7..50c5e280e9c 100644 --- a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuArrowPythonRunner.scala +++ b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuArrowPythonRunner.scala @@ -25,7 +25,6 @@ import com.nvidia.spark.rapids.GpuSemaphore import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.api.python._ -import org.apache.spark.sql.execution.python.PythonUDFRunner import org.apache.spark.sql.rapids.execution.python.{GpuArrowPythonWriter, GpuPythonRunnerCommon} import org.apache.spark.sql.rapids.shims.ArrowUtilsShim import org.apache.spark.sql.types.StructType @@ -35,7 +34,7 @@ import org.apache.spark.sql.vectorized.ColumnarBatch * Similar to `PythonUDFRunner`, but exchange data with Python worker via Arrow stream. 
*/ class GpuArrowPythonRunner( - funcs: Seq[ChainedPythonFunctions], + funcs: Seq[(ChainedPythonFunctions, Long)], evalType: Int, argOffsets: Array[Array[Int]], pythonInSchema: StructType, @@ -44,8 +43,8 @@ class GpuArrowPythonRunner( maxBatchSize: Long, override val pythonOutSchema: StructType, jobArtifactUUID: Option[String] = None) - extends GpuBasePythonRunner[ColumnarBatch](funcs, evalType, argOffsets, jobArtifactUUID) - with GpuArrowPythonOutput with GpuPythonRunnerCommon { + extends GpuBasePythonRunner[ColumnarBatch](funcs.map(_._1), evalType, argOffsets, + jobArtifactUUID) with GpuArrowPythonOutput with GpuPythonRunnerCommon { protected override def newWriter( env: SparkEnv, @@ -57,7 +56,7 @@ class GpuArrowPythonRunner( val arrowWriter = new GpuArrowPythonWriter(pythonInSchema, maxBatchSize) { override protected def writeUDFs(dataOut: DataOutputStream): Unit = { - PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets) + WritePythonUDFUtils.writeUDFs(dataOut, funcs, argOffsets) } } val isInputNonEmpty = inputIterator.nonEmpty diff --git a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala index aad1eb52c02..0317a89009e 100644 --- a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala +++ b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuCoGroupedArrowPythonRunner.scala @@ -27,7 +27,6 @@ import com.nvidia.spark.rapids.GpuSemaphore import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.api.python.{ChainedPythonFunctions, PythonRDD, PythonWorker} -import org.apache.spark.sql.execution.python.PythonUDFRunner import org.apache.spark.sql.rapids.execution.python.{GpuArrowWriter, GpuPythonRunnerCommon} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -39,7 +38,7 @@ import org.apache.spark.sql.vectorized.ColumnarBatch * and receive it back in JVM as batches of single DataFrame. 
*/ class GpuCoGroupedArrowPythonRunner( - funcs: Seq[ChainedPythonFunctions], + funcs: Seq[(ChainedPythonFunctions, Long)], evalType: Int, argOffsets: Array[Array[Int]], leftSchema: StructType, @@ -49,7 +48,7 @@ class GpuCoGroupedArrowPythonRunner( batchSize: Int, override val pythonOutSchema: StructType, jobArtifactUUID: Option[String] = None) - extends GpuBasePythonRunner[(ColumnarBatch, ColumnarBatch)](funcs, evalType, + extends GpuBasePythonRunner[(ColumnarBatch, ColumnarBatch)](funcs.map(_._1), evalType, argOffsets, jobArtifactUUID) with GpuArrowPythonOutput with GpuPythonRunnerCommon { protected override def newWriter( @@ -67,7 +66,7 @@ class GpuCoGroupedArrowPythonRunner( PythonRDD.writeUTF(k, dataOut) PythonRDD.writeUTF(v, dataOut) } - PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets) + WritePythonUDFUtils.writeUDFs(dataOut, funcs, argOffsets) } override def writeNextInputToStream(dataOut: DataOutputStream): Boolean = { diff --git a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala index 4393c8b7057..42c6178ff83 100644 --- a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala +++ b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupUDFArrowPythonRunner.scala @@ -28,7 +28,6 @@ import com.nvidia.spark.rapids.GpuSemaphore import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.api.python._ -import org.apache.spark.sql.execution.python.PythonUDFRunner import org.apache.spark.sql.rapids.execution.python.{GpuArrowPythonWriter, GpuPythonRunnerCommon} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.vectorized.ColumnarBatch @@ -47,7 +46,7 @@ import org.apache.spark.sql.vectorized.ColumnarBatch * more data being sent. 
*/ class GpuGroupUDFArrowPythonRunner( - funcs: Seq[ChainedPythonFunctions], + funcs: Seq[(ChainedPythonFunctions, Long)], evalType: Int, argOffsets: Array[Array[Int]], pythonInSchema: StructType, @@ -56,8 +55,8 @@ class GpuGroupUDFArrowPythonRunner( batchSize: Long, override val pythonOutSchema: StructType, jobArtifactUUID: Option[String] = None) - extends GpuBasePythonRunner[ColumnarBatch](funcs, evalType, argOffsets, jobArtifactUUID) - with GpuArrowPythonOutput with GpuPythonRunnerCommon { + extends GpuBasePythonRunner[ColumnarBatch](funcs.map(_._1), evalType, argOffsets, + jobArtifactUUID) with GpuArrowPythonOutput with GpuPythonRunnerCommon { protected override def newWriter( env: SparkEnv, @@ -69,7 +68,7 @@ class GpuGroupUDFArrowPythonRunner( val arrowWriter = new GpuArrowPythonWriter(pythonInSchema, batchSize) { override protected def writeUDFs(dataOut: DataOutputStream): Unit = { - PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets) + WritePythonUDFUtils.writeUDFs(dataOut, funcs, argOffsets) } } diff --git a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupedPythonRunnerFactory.scala b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupedPythonRunnerFactory.scala index b1dabbf5b5e..63a4289c5b0 100644 --- a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupedPythonRunnerFactory.scala +++ b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/execution/python/shims/GpuGroupedPythonRunnerFactory.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.vectorized.ColumnarBatch case class GpuGroupedPythonRunnerFactory( conf: org.apache.spark.sql.internal.SQLConf, - chainedFunc: Seq[ChainedPythonFunctions], + chainedFunc: Seq[(ChainedPythonFunctions, Long)], argOffsets: Array[Array[Int]], dedupAttrs: StructType, pythonOutputSchema: StructType, diff --git a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala index a0ba17f9bd4..9b800d4e51a 100644 --- a/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala +++ b/sql-plugin/src/main/spark341db/scala/org/apache/spark/sql/rapids/shims/RapidsErrorUtils.scala @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
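A pattern repeated across the Python-runner hunks above: each runner's `funcs` parameter changes from `Seq[ChainedPythonFunctions]` to `Seq[(ChainedPythonFunctions, Long)]`, the base runner still receives only the function part via `funcs.map(_._1)`, and UDF serialization goes through `WritePythonUDFUtils.writeUDFs` instead of `PythonUDFRunner.writeUDFs`. The sketch below is a self-contained mirror of that signature change using hypothetical stand-in types.

```scala
// Hypothetical stand-in for ChainedPythonFunctions.
final case class ChainedFuncDemo(name: String)

// The base class keeps its original contract: it only sees the function part.
class BaseRunnerDemo(funcs: Seq[ChainedFuncDemo]) {
  def describeDemo(): String = funcs.map(_.name).mkString(",")
}

// The subclass carries the extra Long alongside each function and strips it for the base.
class RunnerWithIdsDemo(funcs: Seq[(ChainedFuncDemo, Long)])
    extends BaseRunnerDemo(funcs.map(_._1)) {
  // The Long component stays available where the UDFs are written out.
  def writeUdfsDemo(): Seq[String] = funcs.map { case (f, id) => s"${f.name}:$id" }
}
```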
@@ -21,7 +21,7 @@ package org.apache.spark.sql.rapids.shims import org.apache.spark.sql.errors.QueryExecutionErrors -object RapidsErrorUtils extends RapidsErrorUtilsBase { +object RapidsErrorUtils extends RapidsErrorUtilsBase with RapidsQueryErrorUtils { def sqlArrayIndexNotStartAtOneError(): RuntimeException = { QueryExecutionErrors.invalidIndexOfZeroError(context = null) } diff --git a/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/BatchScanExecMeta.scala b/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/BatchScanExecMeta.scala index 4bbc4644241..4b29de25bf0 100644 --- a/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/BatchScanExecMeta.scala +++ b/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/BatchScanExecMeta.scala @@ -17,68 +17,18 @@ /*** spark-rapids-shim-json-lines {"spark": "350"} {"spark": "351"} -{"spark": "400"} spark-rapids-shim-json-lines ***/ package com.nvidia.spark.rapids.shims import com.nvidia.spark.rapids._ -import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.datasources.v2.BatchScanExec class BatchScanExecMeta(p: BatchScanExec, conf: RapidsConf, parent: Option[RapidsMeta[_, _, _]], rule: DataFromReplacementRule) - extends SparkPlanMeta[BatchScanExec](p, conf, parent, rule) { - // Replaces SubqueryBroadcastExec inside dynamic pruning filters with GPU counterpart - // if possible. Instead regarding filters as childExprs of current Meta, we create - // a new meta for SubqueryBroadcastExec. The reason is that the GPU replacement of - // BatchScanExec is independent from the replacement of the runtime filters. It is - // possible that the BatchScanExec is on the CPU, while the dynamic runtime filters - // are on the GPU. And vice versa. 
- private lazy val runtimeFilters = { - val convertBroadcast = (bc: SubqueryBroadcastExec) => { - val meta = GpuOverrides.wrapAndTagPlan(bc, conf) - meta.tagForExplain() - meta.convertIfNeeded().asInstanceOf[BaseSubqueryExec] - } - wrapped.runtimeFilters.map { filter => - filter.transformDown { - case dpe @ DynamicPruningExpression(inSub: InSubqueryExec) => - inSub.plan match { - case bc: SubqueryBroadcastExec => - dpe.copy(inSub.copy(plan = convertBroadcast(bc))) - case reuse @ ReusedSubqueryExec(bc: SubqueryBroadcastExec) => - dpe.copy(inSub.copy(plan = reuse.copy(convertBroadcast(bc)))) - case _ => - dpe - } - } - } - } - - override val childExprs: Seq[BaseExprMeta[_]] = { - // We want to leave the runtime filters as CPU expressions - p.output.map(GpuOverrides.wrapExpr(_, conf, Some(this))) - } - - override val childScans: scala.Seq[ScanMeta[_]] = - Seq(GpuOverrides.wrapScan(p.scan, conf, Some(this))) - - override def tagPlanForGpu(): Unit = { - if (!p.runtimeFilters.isEmpty && !childScans.head.supportsRuntimeFilters) { - willNotWorkOnGpu("runtime filtering (DPP) is not supported for this scan") - } - } - - override def convertToCpu(): SparkPlan = { - val cpu = wrapped.copy(runtimeFilters = runtimeFilters) - cpu.copyTagsFrom(wrapped) - cpu - } - + extends BatchScanExecMetaBase(p, conf, parent, rule) { override def convertToGpu(): GpuExec = { val spj = p.spjParams GpuBatchScanExec(p.output, childScans.head.convertToGpu(), runtimeFilters, diff --git a/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/BatchScanExecMetaBase.scala b/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/BatchScanExecMetaBase.scala new file mode 100644 index 00000000000..914702a289c --- /dev/null +++ b/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/BatchScanExecMetaBase.scala @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "350"} +{"spark": "351"} +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import com.nvidia.spark.rapids._ + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.datasources.v2.BatchScanExec + +abstract class BatchScanExecMetaBase(p: BatchScanExec, + conf: RapidsConf, + parent: Option[RapidsMeta[_, _, _]], + rule: DataFromReplacementRule) + extends SparkPlanMeta[BatchScanExec](p, conf, parent, rule) { + // Replaces SubqueryBroadcastExec inside dynamic pruning filters with GPU counterpart + // if possible. Instead regarding filters as childExprs of current Meta, we create + // a new meta for SubqueryBroadcastExec. The reason is that the GPU replacement of + // BatchScanExec is independent from the replacement of the runtime filters. It is + // possible that the BatchScanExec is on the CPU, while the dynamic runtime filters + // are on the GPU. And vice versa. 
+ protected lazy val runtimeFilters = { + val convertBroadcast = (bc: SubqueryBroadcastExec) => { + val meta = GpuOverrides.wrapAndTagPlan(bc, conf) + meta.tagForExplain() + meta.convertIfNeeded().asInstanceOf[BaseSubqueryExec] + } + wrapped.runtimeFilters.map { filter => + filter.transformDown { + case dpe @ DynamicPruningExpression(inSub: InSubqueryExec) => + inSub.plan match { + case bc: SubqueryBroadcastExec => + dpe.copy(inSub.copy(plan = convertBroadcast(bc))) + case reuse @ ReusedSubqueryExec(bc: SubqueryBroadcastExec) => + dpe.copy(inSub.copy(plan = reuse.copy(convertBroadcast(bc)))) + case _ => + dpe + } + } + } + } + + override val childExprs: Seq[BaseExprMeta[_]] = { + // We want to leave the runtime filters as CPU expressions + p.output.map(GpuOverrides.wrapExpr(_, conf, Some(this))) + } + + override val childScans: scala.Seq[ScanMeta[_]] = + Seq(GpuOverrides.wrapScan(p.scan, conf, Some(this))) + + override def tagPlanForGpu(): Unit = { + if (!p.runtimeFilters.isEmpty && !childScans.head.supportsRuntimeFilters) { + willNotWorkOnGpu("runtime filtering (DPP) is not supported for this scan") + } + } + + override def convertToCpu(): SparkPlan = { + val cpu = wrapped.copy(runtimeFilters = runtimeFilters) + cpu.copyTagsFrom(wrapped) + cpu + } +} diff --git a/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala b/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala new file mode 100644 index 00000000000..71ad5ae1a0f --- /dev/null +++ b/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "350"} +{"spark": "351"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.PartitionedFileUtil +import org.apache.spark.sql.execution.datasources.{FileStatusWithMetadata, PartitionedFile} + +object PartitionedFileUtilsShim extends PartitionedFileUtilsShimBase { + // In Spark 4.0, PartitionedFileUtil.splitFiles lost its `sparkSession` parameter. + // This pre-Spark-4.0 shim keeps the `sparkSession` parameter. 
+ def splitFiles(sparkSession: SparkSession, + file: FileStatusWithMetadata, + isSplitable: Boolean, + maxSplitBytes: Long, + partitionValues: InternalRow): Seq[PartitionedFile] = { + PartitionedFileUtil.splitFiles(sparkSession, file, isSplitable, maxSplitBytes, partitionValues) + } +} diff --git a/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PythonMapInArrowExecShims.scala b/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PythonMapInArrowExecShims.scala index 833767558c6..8f9bc5c1573 100644 --- a/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PythonMapInArrowExecShims.scala +++ b/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/PythonMapInArrowExecShims.scala @@ -17,7 +17,6 @@ /*** spark-rapids-shim-json-lines {"spark": "350"} {"spark": "351"} -{"spark": "400"} spark-rapids-shim-json-lines ***/ package com.nvidia.spark.rapids.shims diff --git a/sql-plugin/src/main/spark350/scala/org/apache/spark/sql/rapids/shims/GpuPythonMapInArrowExecMeta.scala b/sql-plugin/src/main/spark350/scala/org/apache/spark/sql/rapids/shims/GpuPythonMapInArrowExecMeta.scala index a08211f3795..c27f4824c4a 100644 --- a/sql-plugin/src/main/spark350/scala/org/apache/spark/sql/rapids/shims/GpuPythonMapInArrowExecMeta.scala +++ b/sql-plugin/src/main/spark350/scala/org/apache/spark/sql/rapids/shims/GpuPythonMapInArrowExecMeta.scala @@ -17,7 +17,6 @@ /*** spark-rapids-shim-json-lines {"spark": "350"} {"spark": "351"} -{"spark": "400"} spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.shims diff --git a/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/BatchScanExecMeta.scala b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/BatchScanExecMeta.scala new file mode 100644 index 00000000000..e6c26eb65b8 --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/BatchScanExecMeta.scala @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import com.nvidia.spark.rapids._ + +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.datasources.v2.BatchScanExec + +class BatchScanExecMeta(p: BatchScanExec, + conf: RapidsConf, + parent: Option[RapidsMeta[_, _, _]], + rule: DataFromReplacementRule) + extends BatchScanExecMetaBase(p, conf, parent, rule) { + override def convertToGpu(): GpuExec = { + val spj = p.spjParams + GpuBatchScanExec(p.output, childScans.head.convertToGpu(), runtimeFilters, + p.ordering, p.table, spj) + } +} diff --git a/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/CudfUnsafeRow.scala b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/CudfUnsafeRow.scala new file mode 100644 index 00000000000..623005654fc --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/CudfUnsafeRow.scala @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.unsafe.types.VariantVal + + +final class CudfUnsafeRow( + attributes: Array[Attribute], + remapping: Array[Int]) extends CudfUnsafeRowBase(attributes, remapping) { + def getVariant(ordinal: Int) = { + throw new UnsupportedOperationException("VariantVal is not supported") + } +} + +object CudfUnsafeRow extends CudfUnsafeRowTrait \ No newline at end of file diff --git a/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/GpuBatchScanExec.scala b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/GpuBatchScanExec.scala new file mode 100644 index 00000000000..4fc62d82df3 --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/GpuBatchScanExec.scala @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
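The new Spark 4.0 `BatchScanExecMeta` above extends the `BatchScanExecMetaBase` introduced earlier in this patch, so the version-specific shim is reduced to the `convertToGpu` override while tagging, runtime-filter handling, and CPU fallback stay in the shared base. A minimal mirror of that base/subclass split follows; the names are hypothetical, not the real `BatchScanExecMetaBase`/`GpuBatchScanExec` API.

```scala
// Shared base owns the common logic; each Spark build only overrides the conversion step.
abstract class ScanMetaBaseDemo(scanName: String) {
  def tagDemo(): Unit = println(s"tagging $scanName") // shared tagging logic lives here
  def convertDemo(): String                           // the version-specific piece
}

class ScanMeta400Demo(scanName: String) extends ScanMetaBaseDemo(scanName) {
  override def convertDemo(): String = s"GpuScan($scanName)" // only what differs per version
}
```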
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import com.google.common.base.Objects +import com.nvidia.spark.rapids.GpuScan + +import org.apache.spark.SparkException +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, DynamicPruningExpression, Expression, Literal, RowOrdering, SortOrder} +import org.apache.spark.sql.catalyst.plans.QueryPlan +import org.apache.spark.sql.catalyst.plans.physical.{KeyGroupedPartitioning, KeyGroupedShuffleSpec, Partitioning, SinglePartition} +import org.apache.spark.sql.catalyst.util.{truncatedString, InternalRowComparableWrapper} +import org.apache.spark.sql.connector.catalog.Table +import org.apache.spark.sql.connector.read._ +import org.apache.spark.sql.execution.datasources.rapids.DataSourceStrategyUtils +import org.apache.spark.sql.execution.datasources.v2.{DataSourceRDD, StoragePartitionJoinParams} +import org.apache.spark.sql.internal.SQLConf + +case class GpuBatchScanExec( + output: Seq[AttributeReference], + @transient scan: GpuScan, + runtimeFilters: Seq[Expression] = Seq.empty, + ordering: Option[Seq[SortOrder]] = None, + @transient table: Table, + spjParams: StoragePartitionJoinParams = StoragePartitionJoinParams() + ) extends GpuBatchScanExecBase(scan, runtimeFilters) { + + @transient override lazy val batch: Batch = if (scan == null) null else scan.toBatch + // TODO: unify the equal/hashCode implementation for all data source v2 query plans. + override def equals(other: Any): Boolean = other match { + case other: GpuBatchScanExec => + this.batch != null && this.batch == other.batch && + this.runtimeFilters == other.runtimeFilters && + this.spjParams == other.spjParams + case _ => + false + } + + override def hashCode(): Int = Objects.hashCode(batch, runtimeFilters) + + @transient override lazy val inputPartitions: Seq[InputPartition] = + batch.planInputPartitions() + + @transient override protected lazy val filteredPartitions: Seq[Seq[InputPartition]] = { + val dataSourceFilters = runtimeFilters.flatMap { + case DynamicPruningExpression(e) => DataSourceStrategyUtils.translateRuntimeFilter(e) + case _ => None + } + + if (dataSourceFilters.nonEmpty) { + val originalPartitioning = outputPartitioning + + // the cast is safe as runtime filters are only assigned if the scan can be filtered + val filterableScan = scan.asInstanceOf[SupportsRuntimeV2Filtering] + filterableScan.filter(dataSourceFilters.toArray) + + // call toBatch again to get filtered partitions + val newPartitions = scan.toBatch.planInputPartitions() + + originalPartitioning match { + case p: KeyGroupedPartitioning => + if (newPartitions.exists(!_.isInstanceOf[HasPartitionKey])) { + throw new SparkException("Data source must have preserved the original partitioning " + + "during runtime filtering: not all partitions implement HasPartitionKey after " + + "filtering") + } + + val newPartitionValues = newPartitions.map(partition => + InternalRowComparableWrapper(partition.asInstanceOf[HasPartitionKey], p.expressions)) + .toSet + val oldPartitionValues = p.partitionValues + .map(partition => InternalRowComparableWrapper(partition, p.expressions)).toSet + // We require the new number of partition values to be equal or less than the old number + // of partition values here. 
In the case of less than, empty partitions will be added for + // those missing values that are not present in the new input partitions. + if (oldPartitionValues.size < newPartitionValues.size) { + throw new SparkException("During runtime filtering, data source must either report " + + "the same number of partition values, or a subset of partition values from the " + + s"original. Before: ${oldPartitionValues.size} partition values. " + + s"After: ${newPartitionValues.size} partition values") + } + + if (!newPartitionValues.forall(oldPartitionValues.contains)) { + throw new SparkException("During runtime filtering, data source must not report new " + + "partition values that are not present in the original partitioning.") + } + groupPartitions(newPartitions) + .map(_.groupedParts.map(_.parts)).getOrElse(Seq.empty) + + case _ => + // no validation is needed as the data source did not report any specific partitioning + newPartitions.map(Seq(_)) + } + + } else { + partitions + } + } + + override def outputPartitioning: Partitioning = { + super.outputPartitioning match { + case k: KeyGroupedPartitioning if spjParams.commonPartitionValues.isDefined => + // We allow duplicated partition values if + // `spark.sql.sources.v2.bucketing.partiallyClusteredDistribution.enabled` is true + val newPartValues = spjParams.commonPartitionValues.get.flatMap { + case (partValue, numSplits) => Seq.fill(numSplits)(partValue) + } + val expressions = spjParams.joinKeyPositions match { + case Some(projectionPositions) => projectionPositions.map(i => k.expressions(i)) + case _ => k.expressions + } + k.copy(expressions = expressions, numPartitions = newPartValues.length, + partitionValues = newPartValues) + case p => p + } + } + + override lazy val readerFactory: PartitionReaderFactory = batch.createReaderFactory() + + override lazy val inputRDD: RDD[InternalRow] = { + val rdd = if (filteredPartitions.isEmpty && outputPartitioning == SinglePartition) { + // return an empty RDD with 1 partition if dynamic filtering removed the only split + sparkContext.parallelize(Array.empty[InternalRow], 1) + } else { + val finalPartitions = outputPartitioning match { + case p: KeyGroupedPartitioning => + assert(spjParams.keyGroupedPartitioning.isDefined) + val expressions = spjParams.keyGroupedPartitioning.get + + // Re-group the input partitions if we are projecting on a subset of join keys + val (groupedPartitions, partExpressions) = spjParams.joinKeyPositions match { + case Some(projectPositions) => + val projectedExpressions = projectPositions.map(i => expressions(i)) + val parts = filteredPartitions.flatten.groupBy(part => { + val row = part.asInstanceOf[HasPartitionKey].partitionKey() + val projectedRow = KeyGroupedPartitioning.project( + expressions, projectPositions, row) + InternalRowComparableWrapper(projectedRow, projectedExpressions) + }).map { case (wrapper, splits) => (wrapper.row, splits) }.toSeq + (parts, projectedExpressions) + case _ => + val groupedParts = filteredPartitions.map(splits => { + assert(splits.nonEmpty && splits.head.isInstanceOf[HasPartitionKey]) + (splits.head.asInstanceOf[HasPartitionKey].partitionKey(), splits) + }) + (groupedParts, expressions) + } + + // Also re-group the partitions if we are reducing compatible partition expressions + val finalGroupedPartitions = spjParams.reducers match { + case Some(reducers) => + val result = groupedPartitions.groupBy { case (row, _) => + KeyGroupedShuffleSpec.reducePartitionValue(row, partExpressions, reducers) + }.map { case (wrapper, splits) => 
(wrapper.row, splits.flatMap(_._2)) }.toSeq + val rowOrdering = RowOrdering.createNaturalAscendingOrdering( + partExpressions.map(_.dataType)) + result.sorted(rowOrdering.on((t: (InternalRow, _)) => t._1)) + case _ => groupedPartitions + } + + // When partially clustered, the input partitions are not grouped by partition + // values. Here we'll need to check `commonPartitionValues` and decide how to group + // and replicate splits within a partition. + if (spjParams.commonPartitionValues.isDefined && spjParams.applyPartialClustering) { + // A mapping from the common partition values to how many splits the partition + // should contain. + val commonPartValuesMap = spjParams.commonPartitionValues + .get + .map(t => (InternalRowComparableWrapper(t._1, partExpressions), t._2)) + .toMap + val nestGroupedPartitions = finalGroupedPartitions.map { case (partValue, splits) => + // `commonPartValuesMap` should contain the part value since it's the super set. + val numSplits = commonPartValuesMap + .get(InternalRowComparableWrapper(partValue, partExpressions)) + assert(numSplits.isDefined, s"Partition value $partValue does not exist in " + + "common partition values from Spark plan") + + val newSplits = if (spjParams.replicatePartitions) { + // We need to also replicate partitions according to the other side of join + Seq.fill(numSplits.get)(splits) + } else { + // Not grouping by partition values: this could be the side with partially + // clustered distribution. Because of dynamic filtering, we'll need to check if + // the final number of splits of a partition is smaller than the original + // number, and fill with empty splits if so. This is necessary so that both + // sides of a join will have the same number of partitions & splits. + splits.map(Seq(_)).padTo(numSplits.get, Seq.empty) + } + (InternalRowComparableWrapper(partValue, partExpressions), newSplits) + } + + // Now fill missing partition keys with empty partitions + val partitionMapping = nestGroupedPartitions.toMap + spjParams.commonPartitionValues.get.flatMap { + case (partValue, numSplits) => + // Use empty partition for those partition values that are not present. + partitionMapping.getOrElse( + InternalRowComparableWrapper(partValue, partExpressions), + Seq.fill(numSplits)(Seq.empty)) + } + } else { + // either `commonPartitionValues` is not defined, or it is defined but + // `applyPartialClustering` is false. + val partitionMapping = finalGroupedPartitions.map { case (partValue, splits) => + InternalRowComparableWrapper(partValue, partExpressions) -> splits + }.toMap + + // In case `commonPartitionValues` is not defined (e.g., SPJ is not used), there + // could exist duplicated partition values, as partition grouping is not done + // at the beginning and postponed to this method. It is important to use unique + // partition values here so that grouped partitions won't get duplicated. 
+ p.uniquePartitionValues.map { partValue => + // Use empty partition for those partition values that are not present + partitionMapping.getOrElse( + InternalRowComparableWrapper(partValue, partExpressions), Seq.empty) + } + } + + case _ => filteredPartitions + } + + new DataSourceRDD( + sparkContext, finalPartitions, readerFactory, supportsColumnar, customMetrics) + } + postDriverMetrics() + rdd + } + + override def keyGroupedPartitioning: Option[Seq[Expression]] = + spjParams.keyGroupedPartitioning + + override def doCanonicalize(): GpuBatchScanExec = { + this.copy( + output = output.map(QueryPlan.normalizeExpressions(_, output)), + runtimeFilters = QueryPlan.normalizePredicates( + runtimeFilters.filterNot(_ == DynamicPruningExpression(Literal.TrueLiteral)), + output)) + } + + override def simpleString(maxFields: Int): String = { + val truncatedOutputString = truncatedString(output, "[", ", ", "]", maxFields) + val runtimeFiltersString = s"RuntimeFilters: ${runtimeFilters.mkString("[", ",", "]")}" + val result = s"$nodeName$truncatedOutputString ${scan.description()} $runtimeFiltersString" + redact(result) + } + + override def nodeName: String = { + s"GpuBatchScan ${table.name()}".trim + } +} diff --git a/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/GpuOrcDataReader.scala b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/GpuOrcDataReader.scala new file mode 100644 index 00000000000..8c6a9c793f2 --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/GpuOrcDataReader.scala @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import com.nvidia.spark.rapids.GpuMetric +import org.apache.hadoop.conf.Configuration +import org.apache.orc.impl.DataReaderProperties + +class GpuOrcDataReader( + props: DataReaderProperties, + conf: Configuration, + metrics: Map[String, GpuMetric]) extends GpuOrcDataReader320Plus(props, conf, metrics) { + override def releaseAllBuffers(): Unit = { + throw new IllegalStateException("should not be trying to release buffers") + } +} + + +object GpuOrcDataReader { + // File cache is being used, so we want read ranges that can be cached separately + val shouldMergeDiskRanges: Boolean = false +} diff --git a/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/MapInArrowExecShims.scala b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/MapInArrowExecShims.scala new file mode 100644 index 00000000000..4a1998fa88d --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/MapInArrowExecShims.scala @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import com.nvidia.spark.rapids._ + +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.python.MapInArrowExec +import org.apache.spark.sql.rapids.shims.GpuMapInArrowExecMeta + +object PythonMapInArrowExecShims { + + def execs: Map[Class[_ <: SparkPlan], ExecRule[_ <: SparkPlan]] = Seq( + GpuOverrides.exec[MapInArrowExec]( + "The backend for Map Arrow Iterator UDF. Accelerates the data transfer between the" + + " Java process and the Python process. It also supports scheduling GPU resources" + + " for the Python process when enabled.", + ExecChecks((TypeSig.commonCudfTypes + TypeSig.ARRAY + TypeSig.STRUCT).nested(), + TypeSig.all), + (mapPy, conf, p, r) => new GpuMapInArrowExecMeta(mapPy, conf, p, r)) + ).map(r => (r.getClassFor.asSubclass(classOf[SparkPlan]), r)).toMap + +} \ No newline at end of file diff --git a/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala new file mode 100644 index 00000000000..de8e98962a7 --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/PartitionedFileUtilsShim.scala @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.execution.PartitionedFileUtil +import org.apache.spark.sql.execution.datasources.{FileStatusWithMetadata, PartitionedFile} + +object PartitionedFileUtilsShim extends PartitionedFileUtilsShimBase { + + // In Spark 4.0, PartitionedFileUtil.splitFiles lost its `sparkSession` parameter. + // This Spark-4.0+ shim ignores the `sparkSession` parameter. 
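+  // Keeping `sparkSession` in the shim signature lets call sites stay version-agnostic; +  // only the delegation below drops it when calling the Spark 4.0 API.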
+ def splitFiles(sparkSession: SparkSession, + file: FileStatusWithMetadata, + isSplitable: Boolean, + maxSplitBytes: Long, + partitionValues: InternalRow): Seq[PartitionedFile] = { + PartitionedFileUtil.splitFiles(file, isSplitable, maxSplitBytes, partitionValues) + } + +} // object PartitionFileUtilsShim; diff --git a/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/RaiseErrorShim.scala b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/RaiseErrorShim.scala new file mode 100644 index 00000000000..70d40fc19a0 --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/RaiseErrorShim.scala @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package com.nvidia.spark.rapids.shims + +import com.nvidia.spark.rapids.{ExprRule, GpuOverrides} +import com.nvidia.spark.rapids.{ExprChecks, GpuExpression, TypeSig, UnaryExprMeta} + +import org.apache.spark.sql.catalyst.expressions.{Expression, RaiseError} + +object RaiseErrorShim { + val exprs: Map[Class[_ <: Expression], ExprRule[_ <: Expression]] = Map.empty +} diff --git a/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/hive/rapids/shims/CommandUtilsShim.scala b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/hive/rapids/shims/CommandUtilsShim.scala new file mode 100644 index 00000000000..f5858e4cfd6 --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/hive/rapids/shims/CommandUtilsShim.scala @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.hive.rapids.shims + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.execution.command.CommandUtils + +object CommandUtilsShim { + + // Shim for CommandUtils.uncacheTableOrView, whose signature changed in Apache Spark 4.0. 
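+  // Callers go through this shim so the version-specific call is kept in one place.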
+ def uncacheTableOrView(sparkSession: SparkSession, tableId: TableIdentifier): Unit = { + CommandUtils.uncacheTableOrView(sparkSession, tableId) + } + +} \ No newline at end of file diff --git a/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/execution/python/shims/WritePythonUDFUtils.scala b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/execution/python/shims/WritePythonUDFUtils.scala new file mode 100644 index 00000000000..4650d998fd7 --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/execution/python/shims/WritePythonUDFUtils.scala @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.execution.python.shims + +import java.io.DataOutputStream + +import org.apache.spark.api.python.ChainedPythonFunctions +import org.apache.spark.sql.execution.python.PythonUDFRunner + +object WritePythonUDFUtils { + def writeUDFs( + dataOut: DataOutputStream, + funcs: Seq[(ChainedPythonFunctions, Long)], + argOffsets: Array[Array[Int]], + profiler: Option[String] = None): Unit = { + PythonUDFRunner.writeUDFs(dataOut, funcs, argOffsets, profiler) + } +} diff --git a/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/GpuMapInArrowExecMeta.scala b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/GpuMapInArrowExecMeta.scala new file mode 100644 index 00000000000..f7010099813 --- /dev/null +++ b/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/GpuMapInArrowExecMeta.scala @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/*** spark-rapids-shim-json-lines +{"spark": "400"} +spark-rapids-shim-json-lines ***/ +package org.apache.spark.sql.rapids.shims + +import com.nvidia.spark.rapids._ + +import org.apache.spark.api.python.PythonEvalType +import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PythonUDF} +import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.python.MapInArrowExec +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.rapids.execution.TrampolineUtil +import org.apache.spark.sql.rapids.execution.python.GpuMapInBatchExec +import org.apache.spark.sql.types.{BinaryType, StringType} + +class GpuMapInArrowExecMeta( +    mapArrow: MapInArrowExec, +    conf: RapidsConf, +    parent: Option[RapidsMeta[_, _, _]], +    rule: DataFromReplacementRule) +  extends SparkPlanMeta[MapInArrowExec](mapArrow, conf, parent, rule) { +  override def replaceMessage: String = "partially run on GPU" + +  override def noReplacementPossibleMessage(reasons: String): String = +    s"cannot run even partially on the GPU because $reasons" + +  protected val udf: BaseExprMeta[PythonUDF] = GpuOverrides.wrapExpr( +    mapArrow.func.asInstanceOf[PythonUDF], conf, Some(this)) +  protected val resultAttrs: Seq[BaseExprMeta[Attribute]] = +    mapArrow.output.map(GpuOverrides.wrapExpr(_, conf, Some(this))) + +  override val childExprs: Seq[BaseExprMeta[_]] = resultAttrs :+ udf + +  override def tagPlanForGpu(): Unit = { +    super.tagPlanForGpu() +    if (SQLConf.get.getConf(SQLConf.ARROW_EXECUTION_USE_LARGE_VAR_TYPES)) { + +      val inputTypes = mapArrow.child.schema.fields.map(_.dataType) +      val outputTypes = mapArrow.output.map(_.dataType) + +      val hasStringOrBinaryTypes = (inputTypes ++ outputTypes).exists(dataType => +        TrampolineUtil.dataTypeExistsRecursively(dataType, +          dt => dt == StringType || dt == BinaryType)) + +      if (hasStringOrBinaryTypes) { +        willNotWorkOnGpu(s"${SQLConf.ARROW_EXECUTION_USE_LARGE_VAR_TYPES.key} is " + +          s"enabled and the schema contains string or binary types. This is not " + +          s"supported on the GPU.") +      } +    } +  } + +  override def convertToGpu(): GpuExec = +    GpuMapInArrowExec( +      udf.convertToGpu(), +      resultAttrs.map(_.convertToGpu()).asInstanceOf[Seq[Attribute]], +      childPlans.head.convertIfNeeded(), +      isBarrier = mapArrow.isBarrier, +    ) +} + +/* + * A relation produced by applying a function that takes an iterator of PyArrow's record + * batches and outputs an iterator of PyArrow's record batches. + * + * This GpuMapInArrowExec aims at accelerating the data transfer between + * JVM and Python, and scheduling GPU resources for its Python processes.
+ * + */ +case class GpuMapInArrowExec( + func: Expression, + output: Seq[Attribute], + child: SparkPlan, + override val isBarrier: Boolean) extends GpuMapInBatchExec { + + override protected val pythonEvalType: Int = PythonEvalType.SQL_MAP_ARROW_ITER_UDF +} diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionRewriteSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionRewriteSuite.scala index a9ef6364aac..7626c1450c1 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionRewriteSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionRewriteSuite.scala @@ -40,9 +40,9 @@ class RegularExpressionRewriteSuite extends AnyFunSuite { test("regex rewrite contains") { import RegexOptimizationType._ val patterns = Seq(".*abc.*", ".*(abc).*", "^.*(abc).*$", "^.*(.*)(abc).*.*", - raw".*\w.*\Z", raw".*..*\Z") - val excepted = Seq(Contains("abc"), Contains("abc"), NoOptimization, Contains("abc"), - NoOptimization, NoOptimization) + raw".*\w.*\Z", raw".*..*\Z", "^(.*)(abc)") + val excepted = Seq(Contains("abc"), Contains("abc"), NoOptimization, NoOptimization, + NoOptimization, NoOptimization, NoOptimization) verifyRewritePattern(patterns, excepted) } @@ -53,12 +53,42 @@ class RegularExpressionRewriteSuite extends AnyFunSuite { "(.*)abc[0-9a-z]{1,3}(.*)", "(.*)abc[0-9]{2}.*", "^abc[0-9]{1,3}", - "火花急流[\u4e00-\u9fa5]{1}") - val excepted = Seq(PrefixRange("abc", 1, 48, 57), - NoOptimization, - PrefixRange("abc", 2, 48, 57), + "火花急流[\u4e00-\u9fa5]{1}", + "^[0-9]{6}", + "^[0-9]{3,10}", + "^.*[0-9]{6}", + "^(.*)[0-9]{3,10}" + ) + val excepted = Seq( PrefixRange("abc", 1, 48, 57), - PrefixRange("火花急流", 1, 19968, 40869)) + NoOptimization, // prefix followed by a multi-range not supported + PrefixRange("abc", 2, 48, 57), + NoOptimization, // starts with PrefixRange not supported + PrefixRange("火花急流", 1, 19968, 40869), + NoOptimization, // starts with PrefixRange not supported + NoOptimization, // starts with PrefixRange not supported + NoOptimization, // .* can't match line break so can't be optimized + NoOptimization // .* can't match line break so can't be optimized + ) + verifyRewritePattern(patterns, excepted) + } + + test("regex rewrite multiple contains") { + import RegexOptimizationType._ + val patterns = Seq( + "(abc|def).*", + ".*(abc|def|ghi).*", + "((abc)|(def))", + "(abc)|(def)", + "(火花|急流)" + ) + val excepted = Seq( + MultipleContains(Seq("abc", "def")), + MultipleContains(Seq("abc", "def", "ghi")), + MultipleContains(Seq("abc", "def")), + MultipleContains(Seq("abc", "def")), + MultipleContains(Seq("火花", "急流")) + ) verifyRewritePattern(patterns, excepted) } } diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/GpuInSubqueryExecSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/GpuInSubqueryExecSuite.scala index 82ce1073e13..a606dba0572 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/GpuInSubqueryExecSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/GpuInSubqueryExecSuite.scala @@ -65,7 +65,7 @@ class GpuInSubqueryExecSuite extends SparkQueryCompareTestSuite { private def buildCpuInSubqueryPlan( spark: SparkSession, - shouldBroadcast: Boolean): SparkPlan = { + shouldBroadcastOrDpp: Boolean): SparkPlan = { val df1ReadExec = readToPhysicalPlan(nullableStringsIntsDf(spark)) val df2ReadExec = readToPhysicalPlan(subqueryTable(spark)) val inSubquery = InSubqueryExec( @@ -73,16 +73,19 @@ class GpuInSubqueryExecSuite extends 
SparkQueryCompareTestSuite { SubqueryExec("sbe", ProjectExec(Seq(df2ReadExec.output.head), df2ReadExec)), ExprId(7), - shouldBroadcast=shouldBroadcast) + shouldBroadcastOrDpp) FilterExec(DynamicPruningExpression(inSubquery), df1ReadExec) } - for (shouldBroadcast <- Seq(false, true)) { - test(s"InSubqueryExec shouldBroadcast=$shouldBroadcast") { + /** + * The named parameter shouldBroadcast was renamed to isDynamicPruning in Spark 4.0.0+ + */ + for (shouldBroadcastOrDpp <- Seq(false, true)) { + test(s"InSubqueryExec shouldBroadcastOrDpp=$shouldBroadcastOrDpp") { val gpuResults = withGpuSparkSession({ spark => val overrides = new GpuOverrides() val transitionOverrides = new GpuTransitionOverrides() - val cpuPlan = buildCpuInSubqueryPlan(spark, shouldBroadcast) + val cpuPlan = buildCpuInSubqueryPlan(spark, shouldBroadcastOrDpp) val gpuPlan = transitionOverrides(overrides(cpuPlan)) gpuPlan.execute().collect() }) diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsDataFrameAggregateSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsDataFrameAggregateSuite.scala index 5a394a5b0e8..dba811c073c 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsDataFrameAggregateSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsDataFrameAggregateSuite.scala @@ -19,12 +19,67 @@ spark-rapids-shim-json-lines ***/ package org.apache.spark.sql.rapids.suites -import org.apache.spark.sql.DataFrameAggregateSuite +import org.apache.spark.sql.{DataFrameAggregateSuite, Row} +import org.apache.spark.sql.functions._ import org.apache.spark.sql.rapids.utils.RapidsSQLTestsTrait +import org.apache.spark.sql.types._ class RapidsDataFrameAggregateSuite extends DataFrameAggregateSuite with RapidsSQLTestsTrait { - // example to show how to replace the logic of an excluded test case in Vanilla Spark - testRapids("collect functions" ) { // "collect functions" was excluded at RapidsTestSettings - // println("...") + import testImplicits._ + + testRapids("collect functions") { + val df = Seq((1, 2), (2, 2), (3, 4)).toDF("a", "b") + checkAnswer( + df.select(sort_array(collect_list($"a")), sort_array(collect_list($"b"))), + Seq(Row(Seq(1, 2, 3), Seq(2, 2, 4))) + ) + checkAnswer( + df.select(sort_array(collect_set($"a")), sort_array(collect_set($"b"))), + Seq(Row(Seq(1, 2, 3), Seq(2, 4))) + ) + + checkDataset( + df.select(sort_array(collect_set($"a")).as("aSet")).as[Set[Int]], + Set(1, 2, 3)) + checkDataset( + df.select(sort_array(collect_set($"b")).as("bSet")).as[Set[Int]], + Set(2, 4)) + checkDataset( + df.select(sort_array(collect_set($"a")), sort_array(collect_set($"b"))) + .as[(Set[Int], Set[Int])], Seq(Set(1, 2, 3) -> Set(2, 4)): _*) + } + + testRapids("collect functions structs") { + val df = Seq((1, 2, 2), (2, 2, 2), (3, 4, 1)) + .toDF("a", "x", "y") + .select($"a", struct($"x", $"y").as("b")) + checkAnswer( + df.select(sort_array(collect_list($"a")), sort_array(collect_list($"b"))), + Seq(Row(Seq(1, 2, 3), Seq(Row(2, 2), Row(2, 2), Row(4, 1)))) + ) + checkAnswer( + df.select(sort_array(collect_set($"a")), sort_array(collect_set($"b"))), + Seq(Row(Seq(1, 2, 3), Seq(Row(2, 2), Row(4, 1)))) + ) + } + + testRapids("SPARK-17641: collect functions should not collect null values") { + val df = Seq(("1", 2), (null, 2), ("1", 4)).toDF("a", "b") + checkAnswer( + df.select(sort_array(collect_list($"a")), sort_array(collect_list($"b"))), + Seq(Row(Seq("1", "1"), Seq(2, 2, 4))) + ) + checkAnswer( + 
df.select(sort_array(collect_set($"a")), sort_array(collect_set($"b"))), + Seq(Row(Seq("1"), Seq(2, 4))) + ) + } + + testRapids("collect functions should be able to cast to array type with no null values") { + val df = Seq(1, 2).toDF("a") + checkAnswer(df.select(sort_array(collect_list("a")) cast ArrayType(IntegerType, false)), + Seq(Row(Seq(1, 2)))) + checkAnswer(df.select(sort_array(collect_set("a")) cast ArrayType(FloatType, false)), + Seq(Row(Seq(1.0, 2.0)))) } } diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonSuite.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonSuite.scala index 3e9f685dfdc..ef9ae630dfd 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonSuite.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/suites/RapidsJsonSuite.scala @@ -31,10 +31,6 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap class RapidsJsonSuite extends JsonSuite with RapidsSQLTestsBaseTrait with RapidsJsonConfTrait { - /** Returns full path to the given file in the resource folder */ - override protected def testFile(fileName: String): String = { - getWorkspaceFilePath("sql", "core", "src", "test", "resources").toString + "/" + fileName - } } class RapidsJsonV1Suite extends RapidsJsonSuite with RapidsSQLTestsBaseTrait { diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/BackendTestSettings.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/BackendTestSettings.scala index 83396e977fa..e1aec1ffebc 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/BackendTestSettings.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/BackendTestSettings.scala @@ -83,6 +83,7 @@ abstract class BackendTestSettings { // or a description like "This simply can't work on GPU". // It should never be "unknown" or "need investigation" case class KNOWN_ISSUE(reason: String) extends ExcludeReason + case class ADJUST_UT(reason: String) extends ExcludeReason case class WONT_FIX_ISSUE(reason: String) extends ExcludeReason diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsSQLTestsBaseTrait.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsSQLTestsBaseTrait.scala index 6db9e8b71a6..f8b9d21d169 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsSQLTestsBaseTrait.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsSQLTestsBaseTrait.scala @@ -21,27 +21,36 @@ package org.apache.spark.sql.rapids.utils import java.util.{Locale, TimeZone} +import org.apache.hadoop.fs.FileUtil import org.scalactic.source.Position import org.scalatest.Tag import org.apache.spark.SparkConf +import org.apache.spark.internal.Logging import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.sql.DataFrame import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, ShuffleQueryStageExec} +import org.apache.spark.sql.rapids.execution.TrampolineUtil import org.apache.spark.sql.rapids.utils.RapidsTestConstants.RAPIDS_TEST import org.apache.spark.sql.test.SharedSparkSession /** Basic trait for Rapids SQL test cases. 
*/ trait RapidsSQLTestsBaseTrait extends SharedSparkSession with RapidsTestsBaseTrait { - protected override def afterAll(): Unit = { // SparkFunSuite will set this to true, and forget to reset to false System.clearProperty(IS_TESTING.key) super.afterAll() } + override protected def testFile(fileName: String): String = { + import RapidsSQLTestsBaseTrait.sparkTestResourcesDir + + java.nio.file.Paths.get(sparkTestResourcesDir(getClass).toString, fileName) + .toString + } + protected def testRapids(testName: String, testTag: Tag*)(testFun: => Any)(implicit pos: Position): Unit = { test(RAPIDS_TEST + testName, testTag: _*)(testFun) @@ -107,7 +116,40 @@ trait RapidsSQLTestsBaseTrait extends SharedSparkSession with RapidsTestsBaseTra } } -object RapidsSQLTestsBaseTrait { +object RapidsSQLTestsBaseTrait extends Logging { + private val resourceMap = scala.collection.mutable.Map.empty[String, java.nio.file.Path] + private val testJarUrlRegex = raw"jar:file:(/.*-tests.jar)!.*".r + TrampolineUtil.addShutdownHook(10000, () => { + resourceMap.valuesIterator.foreach { dirPath => + logWarning(s"Deleting expanded test jar dir $dirPath") + FileUtil.fullyDelete(dirPath.toFile) + } + }) + + private def expandJar(jarPath: String): java.nio.file.Path = { + val jarFile = new java.io.File(jarPath) + val destDir = java.nio.file.Files.createTempDirectory(jarFile.getName + ".expanded") + logWarning(s"Registering $destDir for deletion on exit") + FileUtil.unZip(jarFile, destDir.toFile) + destDir + } + + def sparkTestResourcesDir(testClass: Class[_]): java.nio.file.Path = { + var sparkTestClass = testClass + while (sparkTestClass.getName.contains("rapids")) { + sparkTestClass = sparkTestClass.getSuperclass + } + val sparkTestClassResource = "/" + sparkTestClass.getName.replace(".", "/") + ".class" + val resourceURL = sparkTestClass.getResource(sparkTestClassResource).toString + val resourceJar = resourceURL match { + case testJarUrlRegex(testJarPath) => testJarPath + case _ => sys.error(s"Could not extract tests jar path from $resourceURL") + } + this.synchronized { + resourceMap.getOrElseUpdate(resourceJar, expandJar(resourceJar)) + } + } + def nativeSparkConf(origin: SparkConf, warehouse: String): SparkConf = { // Timezone is fixed to UTC to allow timestamps to work by default TimeZone.setDefault(TimeZone.getTimeZone("UTC")) diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala index 3ccd478d368..4cf155041d9 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestSettings.scala @@ -34,12 +34,14 @@ class RapidsTestSettings extends BackendTestSettings { .exclude("SPARK-35735: Take into account day-time interval fields in cast", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10771")) .exclude("casting to fixed-precision decimals", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10771")) .exclude("SPARK-32828: cast from a derived user-defined type to a base type", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10771")) + .exclude("cast string to timestamp", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10771")) + .exclude("cast string to date", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10771")) enableSuite[RapidsDataFrameAggregateSuite] - .exclude("collect functions", 
KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772")) - .exclude("collect functions structs", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772")) - .exclude("collect functions should be able to cast to array type with no null values", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772")) - .exclude("SPARK-17641: collect functions should not collect null values", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772")) - .exclude("SPARK-19471: AggregationIterator does not initialize the generated result projection before using it", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772")) + .exclude("collect functions", ADJUST_UT("order of elements in the array is non-deterministic in collect")) + .exclude("collect functions structs", ADJUST_UT("order of elements in the array is non-deterministic in collect")) + .exclude("collect functions should be able to cast to array type with no null values", ADJUST_UT("order of elements in the array is non-deterministic in collect")) + .exclude("SPARK-17641: collect functions should not collect null values", ADJUST_UT("order of elements in the array is non-deterministic in collect")) + .exclude("SPARK-19471: AggregationIterator does not initialize the generated result projection before using it", WONT_FIX_ISSUE("Codegen related UT, not applicable for GPU")) .exclude("SPARK-24788: RelationalGroupedDataset.toString with unresolved exprs should not fail", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10801")) enableSuite[RapidsJsonExpressionsSuite] .exclude("from_json - invalid data", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849")) @@ -66,20 +68,10 @@ class RapidsTestSettings extends BackendTestSettings { enableSuite[RapidsJsonSuite] .exclude("Casting long as timestamp", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) .exclude("Write timestamps correctly with timestampFormat option and timeZone option", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) - .exclude("SPARK-23723: json in UTF-16 with BOM", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) - .exclude("SPARK-23723: multi-line json in UTF-32BE with BOM", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) - .exclude("SPARK-23723: Use user's encoding in reading of multi-line json in UTF-16LE", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) - .exclude("SPARK-23723: Unsupported encoding name", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) - .exclude("SPARK-23723: checking that the encoding option is case agnostic", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) - .exclude("SPARK-23723: specified encoding is not matched to actual encoding", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) - .exclude("SPARK-23724: lineSep should be set if encoding if different from UTF-8", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) - .exclude("SPARK-31716: inferring should handle malformed input", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) - .exclude("SPARK-24190: restrictions for JSONOptions in read", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) .exclude("exception mode for parsing date/timestamp string", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10773")) enableSuite[RapidsMathFunctionsSuite] enableSuite[RapidsRegexpExpressionsSuite] enableSuite[RapidsStringExpressionsSuite] - 
.exclude("concat", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("string substring_index function", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("SPARK-22498: Concat should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) .exclude("SPARK-22549: ConcatWs should not generate codes beyond 64KB", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10775")) diff --git a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala index bcac0b8fe2d..69bd4532c71 100644 --- a/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala +++ b/tests/src/test/spark330/scala/org/apache/spark/sql/rapids/utils/RapidsTestsTrait.scala @@ -110,6 +110,30 @@ trait RapidsTestsTrait extends RapidsTestsCommonTrait { // .config("spark.rapids.sql.test.enabled", "true") // .config("spark.rapids.sql.test.allowedNonGpu", // "SerializeFromObjectExec,DeserializeToObjectExec,ExternalRDDScanExec") + .config("spark.rapids.sql.castStringToTimestamp.enabled", "true") + .config("spark.rapids.sql.csv.read.decimal.enabled", "true") + .config("spark.rapids.sql.format.avro.enabled", "true") + .config("spark.rapids.sql.format.avro.read.enabled", "true") + .config("spark.rapids.sql.format.hive.text.write.enabled", "true") + .config("spark.rapids.sql.format.json.enabled", "true") + .config("spark.rapids.sql.format.json.read.enabled", "true") + .config("spark.rapids.sql.incompatibleDateFormats.enabled", "true") + .config("spark.rapids.sql.python.gpu.enabled", "true") + .config("spark.rapids.sql.rowBasedUDF.enabled", "true") + .config("spark.rapids.sql.window.collectList.enabled", "true") + .config("spark.rapids.sql.window.collectSet.enabled", "true") + .config("spark.rapids.sql.window.range.byte.enabled", "true") + .config("spark.rapids.sql.window.range.short.enabled", "true") + .config("spark.rapids.sql.expression.Ascii", "true") + .config("spark.rapids.sql.expression.Conv", "true") + .config("spark.rapids.sql.expression.GetJsonObject", "true") + .config("spark.rapids.sql.expression.JsonToStructs", "true") + .config("spark.rapids.sql.expression.JsonTuple", "true") + .config("spark.rapids.sql.expression.StructsToJson", "true") + .config("spark.rapids.sql.exec.CollectLimitExec", "true") + .config("spark.rapids.sql.exec.FlatMapCoGroupsInPandasExec", "true") + .config("spark.rapids.sql.exec.WindowInPandasExec", "true") + .config("spark.rapids.sql.hasExtendedYearValues", "false") .appName("rapids spark plugin running Vanilla Spark UT") _spark = sparkBuilder