merge jdk steps into one #1

Status: Open. Wants to merge 22 commits into base branch `branch-23.06`.
67 changes: 21 additions & 46 deletions .github/workflows/mvn-verify-check.yml
@@ -23,14 +23,20 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true

env:
COMMON_MVN_FLAGS: >
-Ddist.jar.compress=false
-DskipTests
-Dskip
-Dmaven.javadoc.skip

jobs:
get-shim-versions-from-dist:
runs-on: ubuntu-latest
outputs:
sparkHeadVersion: ${{ steps.allShimVersionsStep.outputs.headVersion }}
sparkTailVersions: ${{ steps.allShimVersionsStep.outputs.tailVersions }}
sparkJDK11Versions: ${{ steps.allShimVersionsStep.outputs.jdk11Versions }}
sparkJDK17Versions: ${{ steps.allShimVersionsStep.outputs.jdk17Versions }}
sparkJDKVersions: ${{ steps.allShimVersionsStep.outputs.jdkVersions }}
steps:
- uses: actions/checkout@v3 # refs/pull/:prNumber/merge

@@ -59,15 +65,14 @@ jobs:
echo "headVersion=$SPARK_BASE_SHIM_VERSION" >> $GITHUB_OUTPUT
echo "tailVersions=$svJsonStr" >> $GITHUB_OUTPUT
# jdk11
jdkVersionArrBody=$(printf ",{\"spark-version\":\"%s\"}" "${SPARK_SHIM_VERSIONS_JDK11[@]}")
jdkVersionArrBody=${jdkVersionArrBody:1}
jdkVersionJsonStr=$(printf {\"include\":[%s]} $jdkVersionArrBody)
echo "jdk11Versions=$jdkVersionJsonStr" >> $GITHUB_OUTPUT
jdk11VersionArrBody=$(printf ",{\"spark-version\":\"%s\",\"java-version\":11}" "${SPARK_SHIM_VERSIONS_JDK11[@]}")
# jdk17
jdkVersionArrBody=$(printf ",{\"spark-version\":\"%s\"}" "${SPARK_SHIM_VERSIONS_JDK17[@]}")
jdk17VersionArrBody=$(printf ",{\"spark-version\":\"%s\",\"java-version\":17}" "${SPARK_SHIM_VERSIONS_JDK17[@]}")
# jdk
jdkVersionArrBody=$jdk11VersionArrBody$jdk17VersionArrBody
jdkVersionArrBody=${jdkVersionArrBody:1}
jdkVersionJsonStr=$(printf {\"include\":[%s]} $jdkVersionArrBody)
echo "jdk17Versions=$jdkVersionJsonStr" >> $GITHUB_OUTPUT
echo "jdkVersions=$jdkVersionJsonStr" >> $GITHUB_OUTPUT

package-tests:
needs: get-shim-versions-from-dist
@@ -91,11 +96,9 @@ jobs:
-pl integration_tests,tests -am
-P 'individual,pre-merge'
-Dbuildver=${{ matrix.spark-version }}
-DskipTests
-Dskip
-Dmaven.javadoc.skip
-Dmaven.scalastyle.skip=true
-Drat.skip=true
$COMMON_MVN_FLAGS


verify-all-modules-with-headSparkVersion:
@@ -116,53 +119,25 @@ jobs:
mvn -Dmaven.wagon.http.retryHandler.count=3 -B verify
-P 'individual,pre-merge'
-Dbuildver=${{ needs.get-shim-versions-from-dist.outputs.sparkHeadVersion }}
-DskipTests
-Dskip
-Dmaven.javadoc.skip
$COMMON_MVN_FLAGS

verify-modules-with-jdk11:
verify-modules-with-jdk:
needs: get-shim-versions-from-dist
runs-on: ubuntu-latest
strategy:
matrix: ${{ fromJSON(needs.get-shim-versions-from-dist.outputs.sparkJDK11Versions) }}
matrix: ${{ fromJSON(needs.get-shim-versions-from-dist.outputs.sparkJDKVersions) }}
steps:
- uses: actions/checkout@v3 # refs/pull/:prNumber/merge

- name: Setup Java and Maven Env
uses: actions/setup-java@v3
with:
distribution: adopt
java-version: 11
java-version: ${{ matrix.java-version }}

- name: Build JDK11
run: >
mvn -Dmaven.wagon.http.retryHandler.count=3 -B verify
-P 'individual,pre-merge,jdk11'
-Dbuildver=${{ matrix.spark-version }}
-DskipTests
-Dskip
-Dmaven.javadoc.skip

# TODO: use matrix to combine all jdk* jobs
verify-modules-with-jdk17:
needs: get-shim-versions-from-dist
runs-on: ubuntu-latest
strategy:
matrix: ${{ fromJSON(needs.get-shim-versions-from-dist.outputs.sparkJDK17Versions) }}
steps:
- uses: actions/checkout@v3 # refs/pull/:prNumber/merge

- name: Setup Java and Maven Env
uses: actions/setup-java@v3
with:
distribution: adopt
java-version: 17

- name: Build JDK17
- name: Build JDK
run: >
mvn -Dmaven.wagon.http.retryHandler.count=3 -B verify
-P 'individual,pre-merge,jdk17'
-P "individual,pre-merge,jdk${{ matrix.java-version }}"
-Dbuildver=${{ matrix.spark-version }}
-DskipTests
-Dskip
-Dmaven.javadoc.skip
$COMMON_MVN_FLAGS
3 changes: 1 addition & 2 deletions docs/compatibility.md
@@ -435,8 +435,7 @@ These are the known edge cases where running on the GPU will produce different r

The following regular expression patterns are not yet supported on the GPU and will fall back to the CPU.

- Line anchor `^` is not supported in some contexts, such as when combined with a choice (`^|a`).
- Line anchor `$` is not supported in some rare contexts.
- Line anchors `^` and `$` are not supported in some contexts, such as when combined with a choice (`^|a` or `$|a`).
- String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts.
- String anchor `\z` is not supported
- Patterns containing an end of line or string anchor immediately next to a newline or repetition that produces zero
5 changes: 4 additions & 1 deletion docs/spark-profiling-tool.md
@@ -654,10 +654,13 @@ Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
Starting with release _22.10_, the Profiling tool provides a new _Auto-Tuner_ that aims to optimize
Apache Spark applications by recommending a set of configurations to tune the performance of the
RAPIDS Accelerator.

Currently, the _Auto-Tuner_ calculates a set of configurations that impact the performance of Apache
Spark apps executing on GPU. Those calculations can leverage cluster information
(e.g. memory, cores, Spark default configurations) as well as information processed in the
application event logs.
application event logs. Note that the tool will also recommend settings for the application, assuming
that the job can use all of the cluster resources (CPU and GPU) while it is running.

The values loaded from the app logs have higher precedence than the default configs.
Please refer to [Understanding the Profiling tool output](#d-recommended-configuration) for
more details on the output of the _Auto-Tuner_.
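
As a rough sketch of how this might be invoked (the `--auto-tuner` flag, the `ProfileMain` entry point,
the heap size, and the event log path are assumptions or placeholders rather than taken verbatim from this guide):

```bash
# Hypothetical invocation: profile an event log and ask the Auto-Tuner for recommendations
java -Xmx10g \
 -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
 com.nvidia.spark.rapids.tool.profiling.ProfileMain \
 --auto-tuner /path/to/spark-eventlog
```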
141 changes: 72 additions & 69 deletions docs/spark-qualification-tool.md
@@ -16,15 +16,18 @@ that could not run on GPU because they are unsupported operators or not SQL/Data

This tool is intended to give the users a starting point and does not guarantee the
queries or applications with the highest _recommendation_ will actually be accelerated the most. Currently,
it reports by looking at the amount of time spent in tasks of SQL Dataframe operations.
it reports by looking at the amount of time spent in tasks of SQL Dataframe operations. Note that the qualification
tool estimates assume that the application is run on a dedicated cluster where it can use all of the available
Spark resources.

The estimations for GPU duration are available for different environments and are based on benchmarks run in the
applicable environments. Here is the cluster information for the ETL benchmarks used for the estimates:

| Environment | CPU Cluster | GPU Cluster |
|------------------|-------------------|--------------------------------|
| On-prem | 8x 128-core | 8x 128-core + 8x A100 40 GB |
| Dataproc | 4x n1-standard-32 | 4x n1-standard-32 + 8x T4 16GB |
| Dataproc (T4) | 4x n1-standard-32 | 4x n1-standard-32 + 8x T4 16GB |
| Dataproc (L4) | 8x n1-standard-16 | 8x g2-standard-16 |
| EMR | 8x m5d.8xlarge | 4x g4dn.12xlarge |
| Databricks AWS | 8x m6gd.8xlarge | 8x g5.8xlarge |
| Databricks Azure | 8x E8ds_v4 | 8x NC8as_T4_v3 |
@@ -132,15 +135,15 @@ any machine and include the jars in the classpath.
multiple event log files or directories containing Spark event logs in the local filesystem, HDFS, S3 or mixed.

```bash
Usage: java ${QUALIFICATION_HEAP}
Usage: java ${QUALIFICATION_HEAP} \
-cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.qualification.QualificationMain [options]
<eventlogs | eventlog directories ...>
```

```bash
Sample: java ${QUALIFICATION_HEAP} \
-cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
-cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.qualification.QualificationMain /usr/logs/app-name1
```
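
If the CPU logs come from one of the environments in the table above, the `--platform` option described
in the help output below can be added so the estimate is based on the matching benchmark. A hedged
variation of the sample above, using the `dataproc-t4` value listed there:

```bash
Sample: java ${QUALIFICATION_HEAP} \
-cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/* \
com.nvidia.spark.rapids.tool.qualification.QualificationMain \
--platform dataproc-t4 /usr/logs/app-name1
```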

@@ -242,8 +245,9 @@ Usage: java -cp rapids-4-spark-tools_2.12-<version>.jar:$SPARK_HOME/jars/*
the same name.
-p, --per-sql Report at the individual SQL query level.
--platform <arg> Cluster platform where Spark CPU workloads were
executed. Options include onprem, dataproc, emr
databricks-aws, and databricks-azure.
executed. Options include onprem, dataproc-t4,
dataproc-l4, emr, databricks-aws, and
databricks-azure.
Default is onprem.
-r, --report-read-schema Whether to output the read formats and
datatypes to the CSV file. This can be very
@@ -374,6 +378,67 @@ For information on the files content and processing the Qualification report and
to [Understanding the Qualification tool output](#understanding-the-qualification-tool-output) and
[Output Formats](#output-formats) sections below.

## Running using a Spark Listener

We provide a Spark Listener that can be installed at application start and will produce output
for each SQL query in the running application, indicating whether that query is a good fit to try
with the Rapids Accelerator for Spark.

### Prerequisites
- Java 8 or above, Spark 3.0.1+

### Download the tools jar
- Download the latest jar from [Maven repository](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/)

### Configuration

Add the `RunningQualificationEventProcessor` to the Spark listeners configuration:
`spark.extraListeners=org.apache.spark.sql.rapids.tool.qualification.RunningQualificationEventProcessor`

The user should specify the output directory if they want the output to go to separate
files; otherwise it goes to the Spark driver log. If the output directory is specified, the listener writes
two different files, one CSV and one pretty-printed log file. The output directory can be a local directory
or point to a distributed file system or blobstore like S3.
- `spark.rapids.qualification.outputDir`

By default, this will output results for 10 SQL queries per file and will
keep 100 files. This behavior exists because many blob stores don't show files until
they are fully written, so you would not be able to see the results for a running
application until it finishes the configured number of SQL queries per file. This behavior
can be configured with the following configs.
- `spark.rapids.qualification.output.numSQLQueriesPerFile` - default 10
- `spark.rapids.qualification.output.maxNumFiles` - default 100

### Run the Spark application

Run the application and include the tools jar, the `spark.extraListeners` config, and optionally the other
configs that control the tool's behavior.

For example:

```bash
$SPARK_HOME/bin/spark-shell \
--jars rapids-4-spark-tools_2.12-<version>.jar \
--conf spark.extraListeners=org.apache.spark.sql.rapids.tool.qualification.RunningQualificationEventProcessor \
--conf spark.rapids.qualification.outputDir=/tmp/qualPerSqlOutput \
--conf spark.rapids.qualification.output.numSQLQueriesPerFile=5 \
--conf spark.rapids.qualification.output.maxNumFiles=10
```

After running some SQL queries you can look in the output directory and see files like:

```
rapids_4_spark_qualification_output_persql_0.csv
rapids_4_spark_qualification_output_persql_0.log
rapids_4_spark_qualification_output_persql_1.csv
rapids_4_spark_qualification_output_persql_1.log
rapids_4_spark_qualification_output_persql_2.csv
rapids_4_spark_qualification_output_persql_2.log
```

See the [Understanding the Qualification tool output](#understanding-the-qualification-tool-output)
section for details on the file contents.
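
While the application is still running, a quick way to check what has been written so far (assuming the
per-SQL files land directly under the configured output directory) is:

```bash
# Directory from the example configuration above; the flat layout is an assumption
ls /tmp/qualPerSqlOutput
# Peek at the most recently written pretty-printed per-SQL report
tail -n 20 "$(ls -t /tmp/qualPerSqlOutput/rapids_4_spark_qualification_output_persql_*.log | head -n 1)"
```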

## Running the Qualification tool inside a running Spark application using the API

### Prerequisites
@@ -384,8 +449,7 @@

### Modify your application code to call the APIs

Currently only Scala api's are supported. Note this does not support reporting at the per sql level currently. This can be done
manually by just wrapping and reporting around those queries instead of the entire application.
Currently only Scala APIs are supported. Note that this does not currently support reporting at the per-SQL level; that can be done manually by wrapping and reporting around those queries instead of the entire application.

Create the `RunningQualificationApp`:

@@ -457,67 +521,6 @@ For example, if running the spark-shell:
```
$SPARK_HOME/bin/spark-shell --jars rapids-4-spark-tools_2.12-<version>.jar
```


## Understanding the Qualification tool output

For each processed Spark application, the Qualification tool generates two main fields to help quantify the expected
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,10 +16,9 @@

package com.nvidia.spark.rapids.tests.udf.hive;

import org.apache.hadoop.hive.ql.exec.UDF;

/** An empty Hive simple UDF returning the first input directly for row-based UDF test only. */
public class EmptyHiveSimpleUDF extends UDF {
@SuppressWarnings("deprecation")
public class EmptyHiveSimpleUDF extends org.apache.hadoop.hive.ql.exec.UDF {
public String evaluate(String in, String in2) {
return in;
}
23 changes: 22 additions & 1 deletion integration_tests/src/main/python/avro_test.py
@@ -1,4 +1,4 @@
# Copyright (c) 2022, NVIDIA CORPORATION.
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -160,3 +160,24 @@ def test_read_count(spark_tmp_path, v1_enabled_list, reader_type, batch_size_row
assert_gpu_and_cpu_row_counts_equal(
lambda spark: spark.read.format("avro").load(data_path),
conf=all_confs)

@pytest.mark.parametrize('col_name', ['K0', 'k0', 'K3', 'k3', 'V0', 'v0'], ids=idfn)
@ignore_order
def test_read_case_col_name(spark_tmp_path, col_name):
gen_list =[('k0', LongGen(nullable=False, min_val=0, max_val=0)),
('k1', LongGen(nullable=False, min_val=1, max_val=1)),
('k2', LongGen(nullable=False, min_val=2, max_val=2)),
('k3', LongGen(nullable=False, min_val=3, max_val=3)),
('v0', LongGen()),
('v1', LongGen()),
('v2', LongGen()),
('v3', LongGen())]

gen = StructGen(gen_list, nullable=False)
data_path = spark_tmp_path + '/AVRO_DATA'
with_cpu_session(
lambda spark : gen_df(spark, gen).write.partitionBy('k0', 'k1', 'k2', 'k3').format('avro').save(data_path))

assert_gpu_and_cpu_are_equal_collect(
lambda spark : spark.read.format('avro').load(data_path).selectExpr(col_name),
conf=_enable_all_types_conf)
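
A hedged way to run just this new test locally (the `run_pyspark_from_build.sh` helper and passing a
pytest `-k` filter through it are assumptions about the repository's integration-test harness):

```bash
# Assumes the plugin has been built and the usual integration-test prerequisites are in place
cd integration_tests
./run_pyspark_from_build.sh -k test_read_case_col_name
```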