Support running Databricks CI_PART2 integration tests with JARs built by CI_PART1

To fix: #11838

The CI_PART1 job uploads the built Spark Rapids tar file to Databricks DBFS storage.

The CI_PART2 job retrieves the built tar file from DBFS storage and runs integration tests against it.

The CI_PART2 job then no longer needs to duplicate the Spark Rapids jar build, saving about 1 hour of Databricks time.
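A minimal sketch of the hand-off as it could look from a Jenkins stage (illustrative only; in this change the actual copy and restore are done by jenkins/databricks/build.sh and test.sh on the cluster, where dbfs:/ is FUSE-mounted at /dbfs/):

    // Hypothetical sketch, not the actual scripts:
    // CI_PART1, after the build, backs the tar up through the /dbfs FUSE mount
    sh "mkdir -p /dbfs/cicd/${BUILD_TAG}/${DB_RUNTIME}"
    sh "cp spark-rapids-built.tgz /dbfs/cicd/${BUILD_TAG}/${DB_RUNTIME}/"
    // CI_PART2, before the tests, restores it from the same DBFS path
    sh "tar -zxf /dbfs/cicd/${BUILD_TAG}/${DB_RUNTIME}/spark-rapids-built.tgz -C ."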

Signed-off-by: timl <[email protected]>
NvTimLiu committed Dec 9, 2024
1 parent 0dbef90 commit 730b28b
Showing 2 changed files with 60 additions and 13 deletions.
37 changes: 35 additions & 2 deletions jenkins/Jenkinsfile-blossom.premerge
@@ -39,6 +39,8 @@ def skipped = false
def db_build = false
def sourcePattern = 'shuffle-plugin/src/main/scala/,udf-compiler/src/main/scala/,' +
'sql-plugin/src/main/java/,sql-plugin/src/main/scala/'
// The path where the CI_PART1 job shares the built rapids plugin tars with the CI_PART2 job
def plugin_built_dir = "dbfs:/cicd/$BUILD_TAG"
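// e.g. dbfs:/cicd/jenkins-<job-name>-<build-number>, since BUILD_TAG is Jenkins' built-in jenkins-${JOB_NAME}-${BUILD_NUMBER}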

pipeline {
agent {
@@ -281,12 +283,14 @@ git --no-pager diff --name-only HEAD \$BASE -- ${PREMERGE_DOCKERFILE} || true"""
steps {
script {
githubHelper.updateCommitStatus("", "Running - includes databricks", GitHubCommitState.PENDING)
// CI_PART1 uploads the built plugin tars to PLUGIN_BUILT_DIR for CI_PART2
def DBJob = build(job: 'rapids-databricks_premerge-github',
propagate: false, wait: true,
parameters: [
string(name: 'REF', value: params.REF),
string(name: 'GITHUB_DATA', value: params.GITHUB_DATA),
string(name: 'TEST_MODE', value: 'CI_PART1')
string(name: 'TEST_MODE', value: 'CI_PART1'),
string(name: 'PLUGIN_BUILT_DIR', value: "$plugin_built_dir"),
])
if ( DBJob.result != 'SUCCESS' ) {
// Output Databricks failure logs to be uploaded onto the pre-merge PR
@@ -304,13 +308,18 @@ }
}
steps {
script {
container('cpu') {
waitForPluginBuiltTar(plugin_built_dir)
}

githubHelper.updateCommitStatus("", "Running - includes databricks", GitHubCommitState.PENDING)
def DBJob = build(job: 'rapids-databricks_premerge-github',
propagate: false, wait: true,
parameters: [
string(name: 'REF', value: params.REF),
string(name: 'GITHUB_DATA', value: params.GITHUB_DATA),
string(name: 'TEST_MODE', value: 'CI_PART2')
string(name: 'TEST_MODE', value: 'CI_PART2'),
string(name: 'PLUGIN_BUILT_DIR', value: "$plugin_built_dir"),
])
if ( DBJob.result != 'SUCCESS' ) {
// Output Databricks failure logs to be uploaded onto the pre-merge PR
@@ -435,3 +444,27 @@ boolean databricksCodeChanged() {
}
return false
}

// Wait for the rapids plugin built tars to be ready
def waitForPluginBuiltTar(String tar_path, String db_type='aws', int timeout=60) {
// get DB runtimes from Jenkinsfile: "DB_RUNTIME [space]+ values 'x.y', 'a.b'"
def dbJenkinsfile = 'jenkins/Jenkinsfile-blossom.premerge-databricks'
def DBRuntimes = sh(script: "grep -Pzo 'DB_RUNTIME.*\\n\\s*values.*' $dbJenkinsfile", returnStdout: true)
// DB_RUNTIME\n values 'x.y', 'a.b' --> x.y,a.b
DBRuntimes = DBRuntimes.replaceAll("DB_RUNTIME.*|.*values|[\"']|\\s+", '').trim()
// x.y,a.b --> /path/x.y/,/path/a.b/

def DBFiles = DBRuntimes.split(',').collect { "$tar_path/$it/" }.join(',')
def databricks_host = DbUtils.getHost("$db_type")
def dbTokenId = DbUtils.getToken("$db_type")
withCredentials([string(credentialsId: dbTokenId, variable: 'DATABRICKS_TOKEN')]) {
withEnv(["DATABRICKS_HOST=$databricks_host"]) {
// wait for all the rapids plugin tars built in CI_PART1 to be ready
if (DbUtils.allFilesExist(this, DBFiles, timeout)) {
println('Rapids plugin built tars are ready for CI_PART2')
} else {
error "Timeout, rapids plugin built tars are not ready for CI_PART2"
}
}
}
}
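
DbUtils.allFilesExist comes from the shared Jenkins library rather than this diff; a rough Groovy sketch of the polling it implies (the method body and CLI usage below are assumptions, not the library's actual code):

    // Hypothetical stand-in for the shared-library helper: returns true once every
    // DBFS path in the comma-separated list exists, polling once a minute.
    boolean allFilesExist(def pipeline, String csvPaths, int timeoutMin) {
        def paths = csvPaths.split(',')
        for (int i = 0; i < timeoutMin; i++) {
            // `databricks fs ls` exits non-zero while the path does not exist yet
            def missing = paths.findAll { p ->
                pipeline.sh(script: "databricks fs ls $p", returnStatus: true) != 0
            }
            if (!missing) { return true }
            pipeline.sleep(time: 1, unit: 'MINUTES')
        }
        return false
    }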
36 changes: 25 additions & 11 deletions jenkins/Jenkinsfile-blossom.premerge-databricks
@@ -53,6 +53,8 @@ pipeline {
description: 'Json-formatted github data from upstream blossom-ci')
choice(name: 'TEST_MODE', choices: ['CI_PART1', 'CI_PART2'],
description: 'Separate integration tests into 2 parts, and run each part in parallel')
string(name: 'PLUGIN_BUILT_DIR', defaultValue: 'dbfs:/cicd',
description: 'CI_PART1 uploads spark-rapids built tgz for CI_PART2')
}

environment {
@@ -77,7 +79,7 @@ script {
script {
githubHelper = GithubHelper.getInstance("${GITHUB_TOKEN}", params.GITHUB_DATA)
// desc contains the PR ID and can be accessed from different builds
currentBuild.description = githubHelper.getBuildDescription()
currentBuild.description = githubHelper.getBuildDescription() + " | $TEST_MODE"
checkoutCode(githubHelper.getCloneUrl(), githubHelper.getMergedSHA())
}
}
@@ -138,6 +140,11 @@ void databricksBuild() {
def CLUSTER_ID = ''
def SPARK_MAJOR = BASE_SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS.replace('.', '')
def dbStep = ''
def pluginBuiltTar = "$PLUGIN_BUILT_DIR/$DB_RUNTIME/spark-rapids-built.tgz"
// Map DBFS path to the local path into the cluster
def buildArgs = (params.TEST_MODE == 'CI_PART1') ? pluginBuiltTar.replace('dbfs:/', '/dbfs/') : ''
def testArgs = (params.TEST_MODE == 'CI_PART2') ? pluginBuiltTar.replace('dbfs:/', '/dbfs/') : ''
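// Illustrative values (assumed): with PLUGIN_BUILT_DIR='dbfs:/cicd/jenkins-premerge-1234' and
// DB_RUNTIME='13.3', CI_PART1 gets buildArgs='/dbfs/cicd/jenkins-premerge-1234/13.3/spark-rapids-built.tgz'
// and an empty testArgs; for CI_PART2 the two are swapped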

try {
stage("Create $SPARK_MAJOR DB") {
dbStep = 'CREATE'
@@ -148,17 +155,21 @@
echo CLUSTER_ID
}

stage("Build against $SPARK_MAJOR DB") {
sh "rm -rf spark-rapids-ci.tgz"
sh "tar -zcf spark-rapids-ci.tgz * .git"
dbStep = 'BUILD'
withCredentials([file(credentialsId: 'SPARK_DATABRICKS_PRIVKEY', variable: 'DATABRICKS_PRIVKEY')]) {
def BUILD_PARAMS = DbUtils.getParameters(this, dbStep, "-c $CLUSTER_ID")
retry(3) {
sh "python3 ./jenkins/databricks/run-build.py $BUILD_PARAMS"
if (params.TEST_MODE == 'CI_PART1') {
stage("Build against $SPARK_MAJOR DB") {
sh "rm -rf spark-rapids-ci.tgz"
sh "tar -zcf spark-rapids-ci.tgz * .git"
dbStep = 'BUILD'
withCredentials([file(credentialsId: 'SPARK_DATABRICKS_PRIVKEY', variable: 'DATABRICKS_PRIVKEY')]) {
def BUILD_PARAMS = DbUtils.getParameters(this, dbStep, "-c $CLUSTER_ID")
retry(3) {
// Back-up built tar to the path "$buildArgs" on Databricks cluster
// Refer to https://github.com/NvTimLiu/spark-rapids/blob/d030630c1/jenkins/databricks/build.sh#L190-L194
sh "python3 ./jenkins/databricks/run-build.py $BUILD_PARAMS $buildArgs"
}
}
sh "rm spark-rapids-ci.tgz"
}
sh "rm spark-rapids-ci.tgz"
}

// TODO: Temporarily skip tests on Databricks 14.3 until the test failures are fixed
@@ -167,14 +178,17 @@
dbStep = 'TEST'
withCredentials([file(credentialsId: 'SPARK_DATABRICKS_PRIVKEY', variable: 'DATABRICKS_PRIVKEY')]) {
def TEST_PARAMS = DbUtils.getParameters(this, dbStep, "-c $CLUSTER_ID")
sh "python3 ./jenkins/databricks/run-tests.py $TEST_PARAMS"
// Get built tar from the path "$testArgs" on Databricks cluster
// Refer to https://github.com/NvTimLiu/spark-rapids/blob/d030630c1/jenkins/databricks/test.sh#L52-L55
sh "python3 ./jenkins/databricks/run-tests.py $TEST_PARAMS $testArgs"
}
}
}
} finally {
if (CLUSTER_ID) {
(dbStep == 'TEST') ? common.publishPytestResult(this, "Test against $SPARK_MAJOR DB") : ''
retry(3) {
params.TEST_MODE == 'CI_PART1' ? DbUtils.cleanUp(this, "$PLUGIN_BUILT_DIR/$DB_RUNTIME") : ''
env.INIT_SCRIPTS ? DbUtils.cleanUp(this, env.INIT_SCRIPTS_DIR) : ''
sh "python3 ./jenkins/databricks/shutdown.py -s $DATABRICKS_HOST -t $DATABRICKS_TOKEN -c $CLUSTER_ID -d"
}
