Merge branch 'branch-25.02' into db-script-optimize
NvTimLiu authored Dec 6, 2024
2 parents 2d3ca0a + 89984d0 commit a002c5f
Showing 27 changed files with 380 additions and 120 deletions.
58 changes: 58 additions & 0 deletions .github/workflows/license-header-check.yml
@@ -0,0 +1,58 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# A workflow to check copyright/license header
name: license header check

on:
pull_request:
types: [opened, synchronize, reopened]

jobs:
license-header-check:
runs-on: ubuntu-latest
if: "!contains(github.event.pull_request.title, '[bot]')"
steps:
- name: Get checkout depth
run: |
echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: ${{ env.PR_FETCH_DEPTH }}

- name: license-header-check
uses: NVIDIA/spark-rapids-common/license-header-check@main
with:
included_file_patterns: |
*.yml,
*.yaml,
*.sh,
*.xml,
*.properties,
*.scala,
*.py,
build/*,
*.cpp,
*Dockerfile*,
*Jenkinsfile*,
*.ini,
*.java,
*.fbs
excluded_file_patterns: |
*target/*,
thirdparty/*,
sql-plugin/src/main/java/com/nvidia/spark/rapids/format/*
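As context for the workflow above: the actual pattern matching happens inside the NVIDIA/spark-rapids-common action, which is not part of this diff. Below is a minimal, hypothetical sketch of how comma-separated include/exclude globs like these could be applied to a list of changed files; the function name `select_files` and the sample paths are illustrative assumptions, not the action's implementation.

```python
# Hypothetical sketch: applying include/exclude glob patterns, similar to the
# workflow inputs above, to a list of changed files. NOT the implementation of
# the NVIDIA/spark-rapids-common license-header-check action.
from fnmatch import fnmatch

def select_files(changed_files, included_patterns, excluded_patterns):
    """Return files matching at least one include pattern and no exclude pattern."""
    selected = []
    for path in changed_files:
        if not any(fnmatch(path, pat) for pat in included_patterns):
            continue  # not covered by any include pattern
        if any(fnmatch(path, pat) for pat in excluded_patterns):
            continue  # explicitly excluded
        selected.append(path)
    return selected

if __name__ == "__main__":
    included = ["*.yml", "*.py", "*.scala", "build/*"]
    excluded = ["*target/*", "thirdparty/*"]
    changed = ["jenkins/databricks/params.py",   # made-up example paths
               "thirdparty/foo/bar.py",
               "docs/readme.md"]
    print(select_files(changed, included, excluded))
    # -> ['jenkins/databricks/params.py']
```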
@@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.{CoalescedPartitionSpec, ShufflePartitionS
import org.apache.spark.sql.execution.exchange.Exchange
import org.apache.spark.sql.execution.metric.{SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter}
import org.apache.spark.sql.rapids.execution.{GpuShuffleExchangeExecBase, ShuffledBatchRDD}
import org.apache.spark.sql.rapids.execution.GpuShuffleExchangeExecBase.createAdditionalExchangeMetrics
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.util.ThreadUtils

@@ -71,22 +72,11 @@ case class GpuOptimizeWriteExchangeExec(
private[sql] lazy val readMetrics =
SQLShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext)

override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
"dataSize" -> createSizeMetric(ESSENTIAL_LEVEL, "data size"),
"dataReadSize" -> createSizeMetric(MODERATE_LEVEL, "data read size"),
"rapidsShuffleSerializationTime" ->
createNanoTimingMetric(DEBUG_LEVEL, "rs. serialization time"),
"rapidsShuffleDeserializationTime" ->
createNanoTimingMetric(DEBUG_LEVEL, "rs. deserialization time"),
"rapidsShuffleWriteTime" ->
createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle write time"),
"rapidsShuffleCombineTime" ->
createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle combine time"),
"rapidsShuffleWriteIoTime" ->
createNanoTimingMetric(DEBUG_LEVEL, "rs. shuffle write io time"),
"rapidsShuffleReadTime" ->
createNanoTimingMetric(ESSENTIAL_LEVEL, "rs. shuffle read time")
) ++ GpuMetric.wrap(readMetrics) ++ GpuMetric.wrap(writeMetrics)
override lazy val additionalMetrics : Map[String, GpuMetric] = {
createAdditionalExchangeMetrics(this) ++
GpuMetric.wrap(readMetrics) ++
GpuMetric.wrap(writeMetrics)
}

override lazy val allMetrics: Map[String, GpuMetric] = {
Map(
@@ -98,7 +88,7 @@
}

private lazy val serializer: Serializer =
new GpuColumnarBatchSerializer(gpuLongMetric("dataSize"),
new GpuColumnarBatchSerializer(allMetrics,
child.output.map(_.dataType).toArray,
RapidsConf.SHUFFLE_KUDO_SERIALIZER_ENABLED.get(child.conf))

16 changes: 16 additions & 0 deletions docs/dev/idea-code-style-settings.xml
@@ -1,3 +1,19 @@
<!--
Copyright (c) 2024, NVIDIA CORPORATION.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<code_scheme name="Default" version="173">
<option name="SOFT_MARGINS" value="100" />
<JavaCodeStyleSettings>
3 changes: 2 additions & 1 deletion integration_tests/src/main/python/join_test.py
@@ -22,7 +22,8 @@
from marks import ignore_order, allow_non_gpu, incompat, validate_execs_in_gpu_plan
from spark_session import with_cpu_session, is_before_spark_330, is_databricks_runtime

pytestmark = [pytest.mark.nightly_resource_consuming_test]
# mark this test as ci_1 for mvn verify sanity check in pre-merge CI
pytestmark = [pytest.mark.nightly_resource_consuming_test, pytest.mark.premerge_ci_1]

all_non_sized_join_types = ['LeftSemi', 'LeftAnti', 'Cross']
all_symmetric_sized_join_types = ['Inner', 'FullOuter']
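The `premerge_ci_1` mark added above (and to the json, parquet, and window-function suites below) lets the pre-merge pipeline run this subset as a separate stage. A brief, illustrative sketch of how such a custom mark is usually registered and selected follows; the `pytest.ini` contents and the command lines are assumptions, not taken from this repository.

```python
# Illustrative only: registering and selecting a custom pytest mark such as
# 'premerge_ci_1'. Not the actual spark-rapids CI configuration.
#
# pytest.ini (assumed):
#   [pytest]
#   markers =
#       premerge_ci_1: subset run in the first pre-merge CI stage
#
# Selecting the subset on the command line (assumed invocations):
#   pytest -m premerge_ci_1          # only tests carrying the mark
#   pytest -m "not premerge_ci_1"    # everything else, e.g. for a second stage
import pytest

# A module-level pytestmark list applies the mark to every test in the file,
# which is what the added lines in these test modules do.
pytestmark = [pytest.mark.premerge_ci_1]
```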
3 changes: 3 additions & 0 deletions integration_tests/src/main/python/json_test.py
@@ -23,6 +23,9 @@
from marks import approximate_float, allow_non_gpu, ignore_order, datagen_overrides
from spark_session import *

# mark this test as ci_1 for mvn verify sanity check in pre-merge CI
pytestmark = [pytest.mark.premerge_ci_1]

TEXT_INPUT_EXEC='FileSourceScanExec'

# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653'
2 changes: 2 additions & 0 deletions integration_tests/src/main/python/parquet_test.py
@@ -28,6 +28,8 @@
from spark_session import *
from conftest import is_databricks_runtime, is_dataproc_runtime

# mark this test as ci_1 for mvn verify sanity check in pre-merge CI
pytestmark = [pytest.mark.premerge_ci_1]

def read_parquet_df(data_path):
return lambda spark : spark.read.parquet(data_path)
10 changes: 7 additions & 3 deletions integration_tests/src/main/python/regexp_test.py
@@ -1012,14 +1012,16 @@ def test_regexp_replace_simple(regexp_enabled):
'REGEXP_REPLACE(a, "ab", "PROD")',
'REGEXP_REPLACE(a, "ae", "PROD")',
'REGEXP_REPLACE(a, "bc", "PROD")',
'REGEXP_REPLACE(a, "fa", "PROD")'
'REGEXP_REPLACE(a, "fa", "PROD")',
'REGEXP_REPLACE(a, "a\n", "PROD")',
'REGEXP_REPLACE(a, "\n", "PROD")'
),
conf=conf
)

@pytest.mark.parametrize("regexp_enabled", ['true', 'false'])
def test_regexp_replace_multi_optimization(regexp_enabled):
gen = mk_str_gen('[abcdef]{0,2}')
gen = mk_str_gen('[abcdef\t\n\a]{0,3}')

conf = { 'spark.rapids.sql.regexp.enabled': regexp_enabled }

@@ -1032,7 +1034,9 @@ def test_regexp_replace_multi_optimization(regexp_enabled):
'REGEXP_REPLACE(a, "aa|bb|cc|dd", "PROD")',
'REGEXP_REPLACE(a, "(aa|bb)|(cc|dd)", "PROD")',
'REGEXP_REPLACE(a, "aa|bb|cc|dd|ee", "PROD")',
'REGEXP_REPLACE(a, "aa|bb|cc|dd|ee|ff", "PROD")'
'REGEXP_REPLACE(a, "aa|bb|cc|dd|ee|ff", "PROD")',
'REGEXP_REPLACE(a, "a\n|b\a|c\t", "PROD")',
'REGEXP_REPLACE(a, "a\ta|b\nb", "PROD")'
),
conf=conf
)
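The new cases extend the multi-pattern optimization coverage to alternations whose branches contain control characters (newline, tab, bell). A small CPU-only illustration of the regex semantics being exercised, using plain Python `re` rather than the plugin; the sample strings are made up.

```python
# CPU-side illustration (plain `re`, not the RAPIDS plugin) of the semantics the
# new test cases exercise: alternations whose branches contain control characters.
import re

pattern = "a\n|b\a|c\t"                    # alternation like the one added above
samples = ["a\nx", "b\ay", "c\tz", "abc"]  # made-up inputs

for s in samples:
    print(repr(s), "->", repr(re.sub(pattern, "PROD", s)))
# 'a\nx'    -> 'PRODx'
# 'b\x07y'  -> 'PRODy'
# 'c\tz'    -> 'PRODz'
# 'abc'     -> 'abc'
```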
3 changes: 3 additions & 0 deletions integration_tests/src/main/python/window_function_test.py
@@ -24,6 +24,9 @@
from spark_session import is_before_spark_320, is_databricks113_or_later, is_databricks133_or_later, is_spark_350_or_later, spark_version, with_cpu_session
import warnings

# mark this test as ci_1 for mvn verify sanity check in pre-merge CI
pytestmark = [pytest.mark.premerge_ci_1]

_grpkey_longs_with_no_nulls = [
('a', RepeatSeqGen(LongGen(nullable=False), length=20)),
('b', IntegerGen()),
2 changes: 2 additions & 0 deletions jenkins/Jenkinsfile-blossom.premerge-databricks
@@ -68,6 +68,8 @@ pipeline {
DATABRICKS_PUBKEY = credentials("SPARK_DATABRICKS_PUBKEY")
DATABRICKS_DRIVER = DbUtils.getDriver("$DB_TYPE")
DATABRICKS_WORKER = DbUtils.getWorker("$DB_TYPE")
INIT_SCRIPTS_DIR = "/databricks/init_scripts/${BUILD_TAG}"
TEST_TYPE = 'pre-commit'
}

stages {
15 changes: 11 additions & 4 deletions jenkins/databricks/params.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -33,6 +33,8 @@
spark_conf = ''
# can take comma separated environments, e.g., foo=abc,bar=123,...'
extra_envs = ''
# 'nightly' is for nightly CI, 'pre-commit' is for the pre-merge CI
test_type = 'nightly'


def usage():
@@ -51,11 +53,12 @@ def usage():
' -n <skipstartingcluster>'
' -f <sparkconf>'
' -i <sparkinstallver>'
' -e <extraenvs>')
' -e <extraenvs>'
' -m <testtype>')


try:
opts, script_args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:i:e:',
opts, script_args = getopt.getopt(sys.argv[1:], 'hw:t:c:p:l:d:z:m:v:b:j:f:i:e:m:',
['workspace=',
'token=',
'clusterid=',
@@ -68,7 +71,8 @@ def usage():
'jarpath=',
'sparkconf=',
'sparkinstallver=',
'extraenvs='])
'extraenvs=',
'testtype='])
except getopt.GetoptError:
usage()
sys.exit(2)
@@ -103,6 +107,8 @@ def usage():
base_spark_version_to_install_databricks_jars = arg
elif opt in ('-e', '--extraenvs'):
extra_envs = arg
elif opt in ('-m', '--testtype'):
test_type = arg

print('-w is ' + workspace)
print('-c is ' + clusterid)
@@ -116,3 +122,4 @@ def usage():
print('-f is ' + spark_conf)
print('-i is ' + base_spark_version_to_install_databricks_jars)
print('-e is ' + extra_envs)
print('-m is ' + test_type)
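A minimal, self-contained sketch of the option-parsing pattern above, reduced to just the new `-m/--testtype` flag; the script name, usage text, and defaults are illustrative, not the actual jenkins/databricks/params.py.

```python
# Sketch of parsing a -m/--testtype flag with getopt, mirroring the pattern in
# the diff above. Illustrative only.
import getopt
import sys

test_type = 'nightly'  # 'nightly' for nightly CI, 'pre-commit' for pre-merge CI

try:
    opts, script_args = getopt.getopt(sys.argv[1:], 'hm:', ['help', 'testtype='])
except getopt.GetoptError:
    print('usage: demo.py [-m <testtype>]')
    sys.exit(2)

for opt, arg in opts:
    if opt in ('-h', '--help'):
        print('usage: demo.py [-m <testtype>]')
        sys.exit(0)
    elif opt in ('-m', '--testtype'):
        test_type = arg

print('-m is ' + test_type)
# e.g. `python demo.py -m pre-commit` or `python demo.py --testtype pre-commit`
# both print: -m is pre-commit
```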
12 changes: 7 additions & 5 deletions jenkins/databricks/run-build.py
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Copyright (c) 2021-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -46,10 +46,12 @@ def main():
print("ssh command: %s" % ssh_command)
subprocess.check_call(ssh_command, shell = True)

print("Copying built tarball back")
rsync_command = "rsync -I -Pave \"ssh %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (ssh_args, master_addr)
print("rsync command to get built tarball: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)
# Only the nightly build needs to copy the spark-rapids-built.tgz back
if params.test_type == 'nightly':
print("Copying built tarball back")
rsync_command = "rsync -I -Pave \"ssh %s\" ubuntu@%s:/home/ubuntu/spark-rapids-built.tgz ./" % (ssh_args, master_addr)
print("rsync command to get built tarball: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

if __name__ == '__main__':
main()
6 changes: 3 additions & 3 deletions jenkins/databricks/run-tests.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -37,9 +37,9 @@ def main():
subprocess.check_call(rsync_command, shell=True)

ssh_command = "ssh %s ubuntu@%s " % (ssh_args, master_addr) + \
"'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s EXTRA_ENVS=%s bash %s %s 2>&1 | tee testout; " \
"'LOCAL_JAR_PATH=%s SPARK_CONF=%s BASE_SPARK_VERSION=%s EXTRA_ENVS=%s TEST_TYPE=%s bash %s %s 2>&1 | tee testout; " \
"if [ ${PIPESTATUS[0]} -ne 0 ]; then false; else true; fi'" % \
(params.jar_path, params.spark_conf, params.base_spark_pom_version, params.extra_envs,
(params.jar_path, params.spark_conf, params.base_spark_pom_version, params.extra_envs, params.test_type,
params.script_dest, ' '.join(params.script_args))
print("ssh command: %s" % ssh_command)
try:
7 changes: 4 additions & 3 deletions pom.xml
@@ -936,13 +936,14 @@
Build and run unit tests on one specific version for each sub-version (e.g. 320, 330)
Base shim version (320 currently) should be covered in default mvn verify command of premerge script,
so base shim version is removed from the premergeUT list.
Separate the versions to two parts (premergeUT1, premergeUT2) for balancing the duration
Separate the versions to two parts: premergeUT1(2 shims' UT + 1/3 of the integration tests)
and premergeUT2(1 shim's UT + 2/3 of the integration tests), for balancing the duration
-->
<premergeUT1.buildvers>
320
320,
330
</premergeUT1.buildvers>
<premergeUT2.buildvers>
330,
340
</premergeUT2.buildvers>
<premergeUTF8.buildvers>
1 change: 1 addition & 0 deletions python/rapids/daemon.py
@@ -1,3 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
1 change: 1 addition & 0 deletions python/rapids/daemon_databricks.py
@@ -1,3 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
7 changes: 4 additions & 3 deletions scala2.13/pom.xml
@@ -936,13 +936,14 @@
Build and run unit tests on one specific version for each sub-version (e.g. 320, 330)
Base shim version (320 currently) should be covered in default mvn verify command of premerge script,
so base shim version is removed from the premergeUT list.
Separate the versions to two parts (premergeUT1, premergeUT2) for balancing the duration
Separate the versions to two parts: premergeUT1(2 shims' UT + 1/3 of the integration tests)
and premergeUT2(1 shim's UT + 2/3 of the integration tests), for balancing the duration
-->
<premergeUT1.buildvers>
320
320,
330
</premergeUT1.buildvers>
<premergeUT2.buildvers>
330,
340
</premergeUT2.buildvers>
<premergeUTF8.buildvers>
(Diffs for the remaining changed files did not load on this page.)
