From 2f3c0c276dba3e8870f3e3bfa9287e644809f4f6 Mon Sep 17 00:00:00 2001
From: Tim Liu
Date: Fri, 14 Jun 2024 19:47:56 +0800
Subject: [PATCH] Calculate parallelism to speed up pre-merge CI (#11046)

* Calculate parallelism to speed up pre-merge CI

Calculate test parallelism based on GPU memory so the pre-merge CI runs
with an appropriate amount of parallelism. However, once TEST_PARALLEL
exceeds 8, the integration tests run more slowly as parallelism
increases, so we cap TEST_PARALLEL at 8.

With this change, pre-merge CI runs on powerful nodes finished about
1 hour faster than on common nodes:

    16 CPU / 128G Mem / 24G GPU: ~2 hours  vs.  8 CPU / 64G Mem / 16G GPU: ~3 hours

Note: currently we only have 3 fixed powerful nodes for the pre-merge CI
job, so only one pre-merge CI run can be sped up at a time.

Signed-off-by: Tim Liu

* Add a variable to set the maximum test parallelism for the integration tests

Signed-off-by: Tim Liu

* Fix typo

Signed-off-by: Tim Liu

---------

Signed-off-by: Tim Liu
---
 integration_tests/run_pyspark_from_build.sh | 5 +++++
 jenkins/spark-premerge-build.sh             | 4 +---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/integration_tests/run_pyspark_from_build.sh b/integration_tests/run_pyspark_from_build.sh
index 8b10b3debac..18c26aa26e7 100755
--- a/integration_tests/run_pyspark_from_build.sh
+++ b/integration_tests/run_pyspark_from_build.sh
@@ -171,11 +171,16 @@ else
         TEST_TYPE_PARAM="--test_type $TEST_TYPE"
     fi
 
+    # We found that when parallelism exceeds 8, the tests run more slowly as it increases, so we set the default maximum parallelism to 8.
+    # Note that MAX_PARALLEL varies with the hardware, OS, and test case. Please override it with an appropriate value if needed.
+    MAX_PARALLEL=${MAX_PARALLEL:-8}
     if [[ ${TEST_PARALLEL} -lt 2 ]];
     then
         # With xdist 0 and 1 are the same parallelism but
         # 0 is more efficient
         TEST_PARALLEL_OPTS=()
+    elif [[ ${TEST_PARALLEL} -gt ${MAX_PARALLEL} ]]; then
+        TEST_PARALLEL_OPTS=("-n" "$MAX_PARALLEL")
     else
         TEST_PARALLEL_OPTS=("-n" "$TEST_PARALLEL")
     fi
diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh
index 883b3f3acfc..697722c0138 100755
--- a/jenkins/spark-premerge-build.sh
+++ b/jenkins/spark-premerge-build.sh
@@ -78,7 +78,7 @@ mvn_verify() {
     # Here run Python integration tests tagged with 'premerge_ci_1' only, that would help balance test duration and memory
     # consumption from two k8s pods running in parallel, which executes 'mvn_verify()' and 'ci_2()' respectively.
     $MVN_CMD -B $MVN_URM_MIRROR $PREMERGE_PROFILES clean verify -Dpytest.TEST_TAGS="premerge_ci_1" \
-        -Dpytest.TEST_TYPE="pre-commit" -Dpytest.TEST_PARALLEL=4 -Dcuda.version=$CLASSIFIER
+        -Dpytest.TEST_TYPE="pre-commit" -Dcuda.version=$CLASSIFIER
 
     # The jacoco coverage should have been collected, but because of how the shade plugin
     # works and jacoco we need to clean some things up so jacoco will only report for the
@@ -162,7 +162,6 @@ ci_2() {
     $MVN_CMD -U -B $MVN_URM_MIRROR clean package $MVN_BUILD_ARGS -DskipTests=true
     export TEST_TAGS="not premerge_ci_1"
     export TEST_TYPE="pre-commit"
-    export TEST_PARALLEL=5
     # Download a Scala 2.12 build of spark
     prepare_spark $SPARK_VER 2.12
 
@@ -206,7 +205,6 @@ ci_scala213() {
     cd ..
     # Run integration tests in the project root dir to leverage test cases and resource files
     export TEST_TAGS="not premerge_ci_1"
     export TEST_TYPE="pre-commit"
-    export TEST_PARALLEL=5
     # SPARK_HOME (and related) must be set to a Spark built with Scala 2.13
     SPARK_HOME=$SPARK_HOME PYTHONPATH=$PYTHONPATH \
         ./integration_tests/run_pyspark_from_build.sh
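
Note: the GPU-memory-based calculation mentioned in the commit message is not part of this
patch; TEST_PARALLEL is supplied by the CI environment, and run_pyspark_from_build.sh now
only clamps it to MAX_PARALLEL. A minimal sketch of how a caller might derive TEST_PARALLEL
from GPU memory before invoking the script — the 4 GiB-per-worker figure and the single-GPU
assumption are illustrative, not taken from this patch:

    #!/bin/bash
    # Hypothetical wrapper: pick TEST_PARALLEL from the total memory of GPU 0.
    # Assumes ~4 GiB of GPU memory per pytest-xdist worker (illustrative figure).
    GPU_MEM_MIB=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n 1)
    TEST_PARALLEL=$(( GPU_MEM_MIB / 4096 ))
    # Always run at least one worker.
    [[ ${TEST_PARALLEL} -lt 1 ]] && TEST_PARALLEL=1
    # run_pyspark_from_build.sh will still cap this at MAX_PARALLEL (default 8).
    export TEST_PARALLEL
    ./integration_tests/run_pyspark_from_build.sh

On hardware where more than 8 workers do help, the new variable can raise the cap
explicitly, e.g. MAX_PARALLEL=12 TEST_PARALLEL=12 ./integration_tests/run_pyspark_from_build.sh.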