
Commit 7d7d57c

Merge remote-tracking branch 'origin/branch-25.02' into hivehash-nested-support
ustcfy committed Dec 11, 2024
2 parents 67125c0 + e22a7ca commit 7d7d57c
Showing 50 changed files with 783 additions and 341 deletions.
58 changes: 58 additions & 0 deletions .github/workflows/license-header-check.yml
@@ -0,0 +1,58 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# A workflow to check copyright/license header
name: license header check

on:
  pull_request:
    types: [opened, synchronize, reopened]

jobs:
  license-header-check:
    runs-on: ubuntu-latest
    if: "!contains(github.event.pull_request.title, '[bot]')"
    steps:
      - name: Get checkout depth
        run: |
          echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: ${{ env.PR_FETCH_DEPTH }}

      - name: license-header-check
        uses: NVIDIA/spark-rapids-common/license-header-check@main
        with:
          included_file_patterns: |
            *.yml,
            *.yaml,
            *.sh,
            *.xml,
            *.properties,
            *.scala,
            *.py,
            build/*,
            *.cpp,
            *Dockerfile*,
            *Jenkinsfile*,
            *.ini,
            *.java,
            *.fbs
          excluded_file_patterns: |
            *target/*,
            thirdparty/*,
            sql-plugin/src/main/java/com/nvidia/spark/rapids/format/*
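The check itself delegates to the shared NVIDIA/spark-rapids-common action. As a rough mental model — this is a hypothetical sketch, not the action's real implementation, and the pattern lists are abbreviated — the logic amounts to:

# Hypothetical sketch of a license-header check, NOT the actual
# NVIDIA/spark-rapids-common action. It scans files matching the include
# patterns (minus the exclude patterns) and fails if the first 20 lines
# contain no Apache-2.0 header marker.
import fnmatch
import pathlib
import sys

INCLUDED = ["*.yml", "*.yaml", "*.sh", "*.py", "*.scala", "*.java"]  # abbreviated
EXCLUDED = ["*target/*", "thirdparty/*"]
HEADER_MARK = "Licensed under the Apache License, Version 2.0"

def matches(path: str, patterns: list[str]) -> bool:
    return any(fnmatch.fnmatch(path, p) for p in patterns)

def main() -> int:
    bad = []
    for f in pathlib.Path(".").rglob("*"):
        p = str(f)
        if not f.is_file() or not matches(p, INCLUDED) or matches(p, EXCLUDED):
            continue
        head = "".join(f.open(errors="ignore").readlines()[:20])
        if HEADER_MARK not in head:
            bad.append(p)
    for p in bad:
        print(f"missing license header: {p}")
    return 1 if bad else 0

if __name__ == "__main__":
    sys.exit(main())

The fetch-depth arithmetic in the workflow (PR commit count plus a margin of 10) presumably keeps the checkout deep enough for the action to inspect every commit in the pull request.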
6 changes: 4 additions & 2 deletions .github/workflows/mvn-verify-check.yml
@@ -53,7 +53,8 @@ jobs:
id: generateCacheKey
run: |
set -x
cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')"
depsSHA1=$(. .github/workflows/mvn-verify-check/get-deps-sha1.sh 2.12)
cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-${depsSHA1}"
echo "dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT
- name: Cache local Maven repository
id: cache
@@ -165,7 +166,8 @@ jobs:
id: generateCacheKey
run: |
set -x
cacheKey="${{ runner.os }}-maven-scala213-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-$(date +'%Y-%m-%d')"
depsSHA1=$(. .github/workflows/mvn-verify-check/get-deps-sha1.sh 2.13)
cacheKey="${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ github.event.pull_request.base.ref }}-${depsSHA1}"
echo "scala213dailyCacheKey=$cacheKey" | tee $GITHUB_ENV $GITHUB_OUTPUT
- name: Cache local Maven repository
id: cache
36 changes: 36 additions & 0 deletions .github/workflows/mvn-verify-check/get-deps-sha1.sh
@@ -0,0 +1,36 @@
#!/bin/bash

# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -e

scala_ver=${1:-"2.12"}
base_URL="https://oss.sonatype.org/service/local/artifact/maven/resolve"
project_jni="spark-rapids-jni"
project_private="rapids-4-spark-private_${scala_ver}"

jni_ver=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-jni.version -DforceStdout)
private_ver=$(mvn help:evaluate -q -pl dist -Dexpression=spark-rapids-private.version -DforceStdout)

jni_sha1=$(curl -s -H "Accept: application/json" \
"${base_URL}?r=snapshots&g=com.nvidia&a=${project_jni}&v=${jni_ver}&c=&e=jar&wt=json" \
| jq .data.sha1) || $(date +'%Y-%m-%d')
private_sha1=$(curl -s -H "Accept: application/json" \
"${base_URL}?r=snapshots&g=com.nvidia&a=${project_private}&v=${private_ver}&c=&e=jar&wt=json" \
| jq .data.sha1) || $(date +'%Y-%m-%d')

sha1md5=$(echo -n "${jni_sha1}_${private_sha1}" | md5sum | awk '{print $1}')

echo $sha1md5
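Replacing the date suffix in the Maven cache keys above with this fingerprint means the cache only rotates when the spark-rapids-jni or rapids-4-spark-private snapshot jars actually change; if the Sonatype lookup fails, it degrades to the current date. A hedged Python equivalent of the fingerprint (assuming the resolve endpoint returns JSON with a data.sha1 field, as the script's jq query implies):

# Hedged Python equivalent of get-deps-sha1.sh: fingerprint the snapshot
# jars of spark-rapids-jni and rapids-4-spark-private so the Maven cache
# key only rotates when those dependencies actually change.
import datetime
import hashlib
import json
import urllib.request

BASE_URL = "https://oss.sonatype.org/service/local/artifact/maven/resolve"

def artifact_sha1(artifact: str, version: str) -> str:
    query = f"r=snapshots&g=com.nvidia&a={artifact}&v={version}&c=&e=jar&wt=json"
    try:
        with urllib.request.urlopen(f"{BASE_URL}?{query}", timeout=30) as resp:
            return json.load(resp)["data"]["sha1"]
    except Exception:
        # Fall back to today's date so the cache still rotates daily.
        return datetime.date.today().isoformat()

def deps_fingerprint(jni_ver: str, private_ver: str) -> str:
    jni = artifact_sha1("spark-rapids-jni", jni_ver)
    private = artifact_sha1("rapids-4-spark-private_2.12", private_ver)
    return hashlib.md5(f"{jni}_{private}".encode()).hexdigest()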
21 changes: 12 additions & 9 deletions .github/workflows/mvn-verify-check/populate-daily-cache.sh
@@ -14,27 +14,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-set -x
-max_retry=3; delay=30; i=1
+set -e
+set -o pipefail

 if [[ $SCALA_VER == '2.12' ]]; then
     pom='pom.xml'
 elif [[ $SCALA_VER == '2.13' ]]; then
     pom='scala2.13/pom.xml'
 fi

+max_retry=3; delay=30; i=1
 while true; do
+    buildvers=($(python build/get_buildvers.py no_snapshots $pom | tr -d ',')) &&
     {
-        python build/get_buildvers.py "no_snapshots.buildvers" $pom | tr -d ',' | \
-        xargs -n 1 -I {} bash -c \
-            "mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver={} de.qaware.maven:go-offline-maven-plugin:resolve-dependencies"
-
+        for buildver in "${buildvers[@]}"; do
+            mvn $COMMON_MVN_FLAGS --file $pom -Dbuildver=$buildver de.qaware.maven:go-offline-maven-plugin:resolve-dependencies
+        done
     } && {
         # compile base versions to cache scala compiler and compiler bridge
-        mvn $COMMON_MVN_FLAGS --file $pom \
-            process-test-resources -pl sql-plugin-api -am
+        mvn $COMMON_MVN_FLAGS --file $pom process-test-resources -pl sql-plugin-api -am
     } && break || {
         if [[ $i -le $max_retry ]]; then
             echo "mvn command failed. Retry $i/$max_retry."; ((i++)); sleep $delay; ((delay=delay*2))
         else
             echo "mvn command failed. Exit 1"; exit 1
         fi
     }
-done
+done
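The loop around the mvn invocations is a bounded retry with exponential backoff (up to 3 retries, 30-second initial delay, doubling after each failure). The same pattern rendered in Python, for illustration only:

# Illustrative version of the script's retry loop: one initial attempt
# plus max_retry retries, doubling the delay after each failure.
import subprocess
import time

def run_with_retries(cmd: list[str], max_retry: int = 3, delay: int = 30) -> None:
    for attempt in range(1, max_retry + 2):  # first try + max_retry retries
        if subprocess.run(cmd).returncode == 0:
            return
        if attempt > max_retry:
            raise RuntimeError(f"{cmd[0]} failed after {max_retry} retries")
        print(f"{cmd[0]} failed. Retry {attempt}/{max_retry}.")
        time.sleep(delay)
        delay *= 2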
@@ -24,6 +24,6 @@ import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.internal.ExpressionUtils.{column, expression}

object DataGenExprShims {
-  def columnToExpr(c: Column): Expression = c
-  def exprToColumn(e: Expression): Column = e
+  def columnToExpr(c: Column): Expression = expression(c)
+  def exprToColumn(e: Expression): Column = column(e)
}
@@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.{CoalescedPartitionSpec, ShufflePartitionS
import org.apache.spark.sql.execution.exchange.Exchange
import org.apache.spark.sql.execution.metric.{SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter}
import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBase, ShuffledBatchRDD}
+import org.apache.spark.sql.rapids.execution.GpuShuffleExchangeExecBase.createAdditionalExchangeMetrics
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.util.ThreadUtils

@@ -71,22 +72,11 @@ case class GpuOptimizeWriteExchangeExec(
private[sql] lazy val readMetrics =
SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext)

-  override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
-    "dataSize" -> createSizeMetric(ESSENTIAL_LEVEL, "data size"),
-    "dataReadSize" -> createSizeMetric(MODERATE_LEVEL, "data read size"),
-    "rapidsShuffleSerializationTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. serialization time"),
-    "rapidsShuffleDeserializationTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. deserialization time"),
-    "rapidsShuffleWriteTime" ->
-      createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle write time"),
-    "rapidsShuffleCombineTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle combine time"),
-    "rapidsShuffleWriteIoTime" ->
-      createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle write io time"),
-    "rapidsShuffleReadTime" ->
-      createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle read time")
-  ) ++ GpuMetric.wrap(readMetrics) ++ GpuMetric.wrap(writeMetrics)
+  override lazy val additionalMetrics : Map[String, GpuMetric] = {
+    createAdditionalExchangeMetrics(this) ++
+      GpuMetric.wrap(readMetrics) ++
+      GpuMetric.wrap(writeMetrics)
+  }

override lazy val allMetrics: Map[String, GpuMetric] = {
Map(
@@ -98,7 +88,7 @@ case class GpuOptimizeWriteExchangeExec(
}

private lazy val serializer: Serializer =
-    new GpuColumnarBatchSerializer(gpuLongMetric("dataSize"),
+    new GpuColumnarBatchSerializer(allMetrics,
child.output.map(_.dataType).toArray,
RapidsConf.SHUFFLE_KUDO_SERIALIZER_ENABLED.get(child.conf))

16 changes: 16 additions & 0 deletions docs/dev/idea-code-style-settings.xml
@@ -1,3 +1,19 @@
+<!--
+  Copyright (c) 2024, NVIDIA CORPORATION.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->

<code_scheme name="Default" version="173">
<option name="SOFT_MARGINS" value="100" />
<JavaCodeStyleSettings>
8 changes: 5 additions & 3 deletions integration_tests/src/main/python/datasourcev2_write_test.py
@@ -18,7 +18,7 @@
from data_gen import gen_df, decimal_gens, non_utc_allow, StructGen, ArrayGen, string_gen
from marks import *
from spark_session import is_hive_available, is_spark_330_or_later, with_cpu_session, with_gpu_session
-from hive_parquet_write_test import _hive_bucket_gens
+from hive_parquet_write_test import _hive_bucket_gens_sans_bools
from hive_parquet_write_test import read_single_bucket

_hive_write_conf = {
@@ -33,9 +33,11 @@
@allow_non_gpu(*non_utc_allow)
def test_write_hive_bucketed_table(spark_tmp_table_factory, file_format):
num_rows = 2048

+    # Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
+    # https://github.com/rapidsai/cudf/issues/6763 .
+    # Once the first issue is fixed, add back boolean_gen
def gen_table(spark):
-        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens)]
+        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens_sans_bools)]
types_sql_str = ','.join('{} {}'.format(
name, gen.data_type.simpleString()) for name, gen in gen_list)
col_names_str = ','.join(name for name, gen in gen_list)
12 changes: 6 additions & 6 deletions integration_tests/src/main/python/date_time_test.py
@@ -139,34 +139,34 @@ def test_datediff(data_gen):

hms_fallback = ['ProjectExec'] if not is_supported_time_zone() else []

-@allow_non_gpu(*hms_fallback)
+@allow_non_gpu(*non_utc_tz_allow)
def test_months_between():
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, timestamp_gen).selectExpr('months_between(a, b, false)'))

-@allow_non_gpu(*hms_fallback)
+@allow_non_gpu(*non_utc_tz_allow)
def test_months_between_first_day():
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2024-01-01", false)'))

-@allow_non_gpu(*hms_fallback)
+@allow_non_gpu(*non_utc_tz_allow)
def test_months_between_last_day():
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2023-12-31", false)'))

-@allow_non_gpu(*hms_fallback)
+@allow_non_gpu(*non_utc_tz_allow)
@approximate_float()
def test_months_between_round():
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, timestamp_gen).selectExpr('months_between(a, b, true)'))

-@allow_non_gpu(*hms_fallback)
+@allow_non_gpu(*non_utc_tz_allow)
@approximate_float()
def test_months_between_first_day_round():
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, timestamp_gen).selectExpr('months_between(a, timestamp"2024-01-01", true)'))

-@allow_non_gpu(*hms_fallback)
+@allow_non_gpu(*non_utc_tz_allow)
@approximate_float()
def test_months_between_last_day_round():
assert_gpu_and_cpu_are_equal_collect(
13 changes: 7 additions & 6 deletions integration_tests/src/main/python/hive_parquet_write_test.py
@@ -25,24 +25,25 @@
# "GpuInsertIntoHiveTable" for Parquet write.
_write_to_hive_conf = {"spark.sql.hive.convertMetastoreParquet": False}

-_hive_bucket_basic_gens = [
-    boolean_gen, byte_gen, short_gen, int_gen, long_gen, string_gen, float_gen, double_gen,
+_hive_bucket_basic_gens_sans_bools = [
+    byte_gen, short_gen, int_gen, long_gen, string_gen, float_gen, double_gen,
     DateGen(start=date(1590, 1, 1)), _restricted_timestamp()]
+_hive_bucket_basic_gens = [boolean_gen] + _hive_bucket_basic_gens_sans_bools

_hive_bucket_basic_struct_gen = StructGen(
-    [['c'+str(ind), c_gen] for ind, c_gen in enumerate(_hive_bucket_basic_gens)])
+    [['c'+str(ind), c_gen] for ind, c_gen in enumerate(_hive_bucket_basic_gens_sans_bools)])

_hive_bucket_struct_gens = [
_hive_bucket_basic_struct_gen,
StructGen([['child0', byte_gen], ['child1', _hive_bucket_basic_struct_gen]]),
StructGen([['child0', ArrayGen(short_gen)], ['child1', double_gen]])]

-_hive_bucket_array_gens = [ArrayGen(sub_gen) for sub_gen in _hive_bucket_basic_gens] + [
+_hive_bucket_array_gens = [ArrayGen(sub_gen) for sub_gen in _hive_bucket_basic_gens_sans_bools] + [
ArrayGen(ArrayGen(short_gen, max_length=10), max_length=10),
ArrayGen(ArrayGen(string_gen, max_length=10), max_length=10),
ArrayGen(StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]]))]

-_hive_bucket_gens = _hive_bucket_basic_gens + _hive_bucket_struct_gens + _hive_bucket_array_gens
+_hive_bucket_gens_sans_bools = _hive_bucket_basic_gens_sans_bools + _hive_bucket_struct_gens + _hive_bucket_array_gens

_hive_basic_gens = _hive_bucket_basic_gens + [
DecimalGen(precision=19, scale=1, nullable=True),
@@ -211,7 +212,7 @@ def test_insert_hive_bucketed_table(spark_tmp_table_factory):
num_rows = 2048

def gen_table(spark):
-        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens)]
+        gen_list = [('_c' + str(i), gen) for i, gen in enumerate(_hive_bucket_gens_sans_bools)]
types_sql_str = ','.join('{} {}'.format(
name, gen.data_type.simpleString()) for name, gen in gen_list)
col_names_str = ','.join(name for name, gen in gen_list)
10 changes: 8 additions & 2 deletions integration_tests/src/main/python/hive_write_test.py
@@ -29,8 +29,11 @@ def _restricted_timestamp(nullable=True):
end=datetime(2262, 4, 11, tzinfo=timezone.utc),
nullable=nullable)

+# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
+# https://github.com/rapidsai/cudf/issues/6763 .
+# Once the first issue is fixed, add back boolean_gen
 _basic_gens = [byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
-               string_gen, boolean_gen, DateGen(start=date(1590, 1, 1)),
+               string_gen, DateGen(start=date(1590, 1, 1)),
_restricted_timestamp()
] + decimal_gens

@@ -45,8 +48,11 @@ def _restricted_timestamp(nullable=True):
ArrayGen(ArrayGen(string_gen, max_length=10), max_length=10),
ArrayGen(StructGen([['child0', byte_gen], ['child1', string_gen], ['child2', float_gen]]))]

+# Use every type except boolean, see https://github.com/NVIDIA/spark-rapids/issues/11762 and
+# https://github.com/rapidsai/cudf/issues/6763 .
+# Once the first issue is fixed, add back boolean_gen
 _map_gens = [simple_string_to_string_map_gen] + [MapGen(f(nullable=False), f()) for f in [
-    BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen,
+    ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen,
lambda nullable=True: _restricted_timestamp(nullable=nullable),
lambda nullable=True: DateGen(start=date(1590, 1, 1), nullable=nullable),
lambda nullable=True: DecimalGen(precision=15, scale=1, nullable=nullable),
3 changes: 2 additions & 1 deletion integration_tests/src/main/python/join_test.py
@@ -22,7 +22,8 @@
from marks import ignore_order, allow_non_gpu, incompat, validate_execs_in_gpu_plan
from spark_session import with_cpu_session, is_before_spark_330, is_databricks_runtime

-pytestmark = [pytest.mark.nightly_resource_consuming_test]
+# mark this test as ci_1 for mvn verify sanity check in pre-merge CI
+pytestmark = [pytest.mark.nightly_resource_consuming_test, pytest.mark.premerge_ci_1]

all_non_sized_join_types = ['LeftSemi', 'LeftAnti', 'Cross']
all_symmetric_sized_join_types = ['Inner', 'FullOuter']
3 changes: 3 additions & 0 deletions integration_tests/src/main/python/json_test.py
@@ -23,6 +23,9 @@
from marks import approximate_float, allow_non_gpu, ignore_order, datagen_overrides
from spark_session import *

+# mark this test as ci_1 for mvn verify sanity check in pre-merge CI
+pytestmark = [pytest.mark.premerge_ci_1]

TEXT_INPUT_EXEC='FileSourceScanExec'

# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653'
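The premerge_ci_1 marker added to join_test.py and json_test.py lets the pre-merge pipeline carve these long-running suites into a dedicated shard. Assuming the pipeline selects on the marker name (a hypothetical invocation; the real job wiring lives in the CI scripts), the selection is equivalent to:

# Hypothetical shard selection: run only suites marked premerge_ci_1,
# equivalent to `pytest -m premerge_ci_1 ...` on the command line.
import sys
import pytest

if __name__ == "__main__":
    sys.exit(pytest.main(["-m", "premerge_ci_1", "integration_tests/src/main/python"]))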
