From 41f25c7ef8c4cfe79552589c7f6fad53fb803365 Mon Sep 17 00:00:00 2001 From: liyuan <84758614+nvliyuan@users.noreply.github.com> Date: Fri, 28 Oct 2022 15:20:25 +0800 Subject: [PATCH 1/8] merge dev-2210 branch to Main branch (#237) * Init 22.10.0-SNAPSHOT (#214) Signed-off-by: Peixin Li Signed-off-by: Peixin Li * update version and fix some document errors, add more comments for running xgboost notebooks on GCP (#215) (#222) Signed-off-by: liyuan Signed-off-by: liyuan Signed-off-by: liyuan * update version and fix some document errors, add more comments for running xgboost notebooks on GCP (#215) (#224) Signed-off-by: liyuan Signed-off-by: liyuan Signed-off-by: liyuan * Update default cmake to 3.23.X in udf example dockerfile (#227) Signed-off-by: Peixin Li Signed-off-by: Peixin Li * [xgboost] Remove default parameters (#226) * remove the default parameters for xgboost examples * remove the default parameters Signed-off-by: Bobby Wang * remove unused variables for mortgage-ETL Signed-off-by: Bobby Wang * add more details/notes for the mortgage performance tests (#229) * add more details/notes for the mortgage performance tests Signed-off-by: liyuan * Update examples/XGBoost-Examples/README.md Co-authored-by: Hao Zhu <9665750+viadea@users.noreply.github.com> * Update examples/XGBoost-Examples/README.md Co-authored-by: Hao Zhu <9665750+viadea@users.noreply.github.com> * Update examples/XGBoost-Examples/README.md Co-authored-by: Hao Zhu <9665750+viadea@users.noreply.github.com> Signed-off-by: liyuan Co-authored-by: Hao Zhu <9665750+viadea@users.noreply.github.com> * Enable automerge from 22.10 to 22.12 (#230) Signed-off-by: Peixin Li Signed-off-by: Peixin Li * update versions for v22.10 release (#235) Signed-off-by: liyuan Signed-off-by: liyuan Signed-off-by: Peixin Li Signed-off-by: liyuan Signed-off-by: Bobby Wang Co-authored-by: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Co-authored-by: Peixin Co-authored-by: Bobby Wang Co-authored-by: Hao Zhu <9665750+viadea@users.noreply.github.com> --- .github/workflows/auto-merge.yml | 8 +++---- .../generate-init-script-10.4.ipynb | 6 ++--- .../csp/databricks/generate-init-script.ipynb | 6 ++--- .../on-prem-cluster/kubernetes-scala.md | 2 +- .../preparation-python.md | 4 ++-- .../prepare-package-data/preparation-scala.md | 4 ++-- .../ML+DL-Examples/Spark-cuML/pca/Dockerfile | 2 +- .../ML+DL-Examples/Spark-cuML/pca/README.md | 4 ++-- .../ML+DL-Examples/Spark-cuML/pca/pom.xml | 4 ++-- .../Spark-cuML/pca/spark-submit.sh | 6 ++--- .../notebooks/micro-benchmarks-gpu.ipynb | 2 +- .../RAPIDS-accelerated-UDFs/Dockerfile | 2 +- .../RAPIDS-accelerated-UDFs/README.md | 2 +- .../RAPIDS-accelerated-UDFs/pom.xml | 4 ++-- .../src/main/cpp/CMakeLists.txt | 10 ++++----- .../UDF-Examples/Spark-cuSpatial/Dockerfile | 2 +- .../Spark-cuSpatial/Dockerfile.awsdb | 2 +- .../UDF-Examples/Spark-cuSpatial/README.md | 6 ++--- .../UDF-Examples/Spark-cuSpatial/gpu-run.sh | 2 +- .../cuspatial_sample_standalone.ipynb | 2 +- examples/UDF-Examples/Spark-cuSpatial/pom.xml | 4 ++-- .../src/main/native/CMakeLists.txt | 2 +- examples/XGBoost-Examples/README.md | 4 ++++ .../nvidia/spark/examples/agaricus/Main.scala | 3 --- .../python/MortgageETL+XGBoost.ipynb | 2 +- .../notebooks/python/MortgageETL.ipynb | 6 ++--- .../notebooks/scala/mortgage-ETL.ipynb | 22 ++++--------------- .../spark/examples/mortgage/Mortgage.scala | 11 ---------- .../taxi/notebooks/python/taxi-ETL.ipynb | 4 ++-- .../taxi/notebooks/scala/taxi-ETL.ipynb | 4 ++-- 
.../com/nvidia/spark/examples/taxi/Taxi.scala | 5 +---- 31 files changed, 60 insertions(+), 87 deletions(-) diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml index b292dca00..33bf4c5c6 100644 --- a/.github/workflows/auto-merge.yml +++ b/.github/workflows/auto-merge.yml @@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE on: pull_request_target: branches: - - branch-22.08 + - branch-22.10 types: [closed] jobs: @@ -29,13 +29,13 @@ jobs: steps: - uses: actions/checkout@v2 with: - ref: branch-22.08 # force to fetch from latest upstream instead of PR ref + ref: branch-22.10 # force to fetch from latest upstream instead of PR ref - name: auto-merge job uses: ./.github/workflows/auto-merge env: OWNER: NVIDIA REPO_NAME: spark-rapids-examples - HEAD: branch-22.08 - BASE: branch-22.10 + HEAD: branch-22.10 + BASE: branch-22.12 AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR diff --git a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb index 3be77a4b5..f056dfdf9 100644 --- a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb +++ b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script-10.4.ipynb @@ -24,7 +24,7 @@ "source": [ "%sh\n", "cd ../../dbfs/FileStore/jars/\n", - "sudo wget -O rapids-4-spark_2.12-22.08.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar\n", + "sudo wget -O rapids-4-spark_2.12-22.10.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar\n", "sudo wget -O xgboost4j-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.6.1/xgboost4j-gpu_2.12-1.6.1.jar\n", "sudo wget -O xgboost4j-spark-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.6.1/xgboost4j-spark-gpu_2.12-1.6.1.jar\n", "ls -ltr\n", @@ -60,7 +60,7 @@ "sudo rm -f /databricks/jars/spark--maven-trees--ml--10.x--xgboost-gpu--ml.dmlc--xgboost4j-spark-gpu_2.12--ml.dmlc__xgboost4j-spark-gpu_2.12__1.5.2.jar\n", "\n", "sudo cp /dbfs/FileStore/jars/xgboost4j-gpu_2.12-1.6.1.jar /databricks/jars/\n", - "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.08.0.jar /databricks/jars/\n", + "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.10.0.jar /databricks/jars/\n", "sudo cp /dbfs/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar /databricks/jars/\"\"\", True)" ] }, @@ -133,7 +133,7 @@ "1. Edit your cluster, adding an initialization script from `dbfs:/databricks/init_scripts/init.sh` in the \"Advanced Options\" under \"Init Scripts\" tab\n", "2. Reboot the cluster\n", "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", - "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.08/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", + "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", "5. 
Inside the mortgage example notebook, update the data paths\n", " `train_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-train.csv')`\n", " `trans_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-trans.csv')`" diff --git a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb index 540132062..772453e39 100644 --- a/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb +++ b/docs/get-started/xgboost-examples/csp/databricks/generate-init-script.ipynb @@ -24,7 +24,7 @@ "source": [ "%sh\n", "cd ../../dbfs/FileStore/jars/\n", - "sudo wget -O rapids-4-spark_2.12-22.08.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar\n", + "sudo wget -O rapids-4-spark_2.12-22.10.0.jar https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar\n", "sudo wget -O xgboost4j-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-gpu_2.12/1.6.1/xgboost4j-gpu_2.12-1.6.1.jar\n", "sudo wget -O xgboost4j-spark-gpu_2.12-1.6.1.jar https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark-gpu_2.12/1.6.1/xgboost4j-spark-gpu_2.12-1.6.1.jar\n", "ls -ltr\n", @@ -60,7 +60,7 @@ "sudo rm -f /databricks/jars/spark--maven-trees--ml--9.x--xgboost-gpu--ml.dmlc--xgboost4j-spark-gpu_2.12--ml.dmlc__xgboost4j-spark-gpu_2.12__1.4.1.jar\n", "\n", "sudo cp /dbfs/FileStore/jars/xgboost4j-gpu_2.12-1.6.1.jar /databricks/jars/\n", - "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.08.0.jar /databricks/jars/\n", + "sudo cp /dbfs/FileStore/jars/rapids-4-spark_2.12-22.10.0.jar /databricks/jars/\n", "sudo cp /dbfs/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar /databricks/jars/\"\"\", True)" ] }, @@ -133,7 +133,7 @@ "1. Edit your cluster, adding an initialization script from `dbfs:/databricks/init_scripts/init.sh` in the \"Advanced Options\" under \"Init Scripts\" tab\n", "2. Reboot the cluster\n", "3. Go to \"Libraries\" tab under your cluster and install `dbfs:/FileStore/jars/xgboost4j-spark-gpu_2.12-1.6.1.jar` in your cluster by selecting the \"DBFS\" option for installing jars\n", - "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.08/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", + "4. Import the mortgage example notebook from `https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb`\n", "5. 
Inside the mortgage example notebook, update the data paths\n", " `train_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-train.csv')`\n", " `trans_data = reader.schema(schema).option('header', True).csv('/data/mortgage/csv/small-trans.csv')`" diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md b/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md index 11d1fb4dd..54a251fd1 100644 --- a/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md +++ b/docs/get-started/xgboost-examples/on-prem-cluster/kubernetes-scala.md @@ -40,7 +40,7 @@ export SPARK_DOCKER_IMAGE= export SPARK_DOCKER_TAG= pushd ${SPARK_HOME} -wget https://github.com/NVIDIA/spark-rapids-examples/raw/branch-22.08/dockerfile/Dockerfile +wget https://github.com/NVIDIA/spark-rapids-examples/raw/branch-22.10/dockerfile/Dockerfile # Optionally install additional jars into ${SPARK_HOME}/jars/ diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md index 6f511be5b..ca9442f44 100644 --- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md +++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-python.md @@ -9,7 +9,7 @@ For simplicity export the location to these jars. All examples assume the packag * [XGBoost4j-Spark Package](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/) 2. Download the RAPIDS Accelerator for Apache Spark plugin jar - * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar) + * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) ### Build XGBoost Python Examples @@ -26,7 +26,7 @@ You need to copy the dataset to `/opt/xgboost`. Use the following links to downl ``` bash export SPARK_XGBOOST_DIR=/opt/xgboost -export RAPIDS_JAR=${SPARK_XGBOOST_DIR}/rapids-4-spark_2.12-22.08.0.jar +export RAPIDS_JAR=${SPARK_XGBOOST_DIR}/rapids-4-spark_2.12-22.10.0.jar export XGBOOST4J_JAR=${SPARK_XGBOOST_DIR}/xgboost4j_3.0-1.4.2-0.3.0.jar export XGBOOST4J_SPARK_JAR=${SPARK_XGBOOST_DIR}/xgboost4j-spark_3.0-1.4.2-0.3.0.jar export SAMPLE_ZIP=${SPARK_XGBOOST_DIR}/samples.zip diff --git a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md index e5bf88571..5bdc4f7cc 100644 --- a/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md +++ b/docs/get-started/xgboost-examples/prepare-package-data/preparation-scala.md @@ -5,7 +5,7 @@ For simplicity export the location to these jars. All examples assume the packag ### Download the jars 1. Download the RAPIDS Accelerator for Apache Spark plugin jar - * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar) + * [RAPIDS Spark Package](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) ### Build XGBoost Scala Examples @@ -22,6 +22,6 @@ You need to copy the dataset to `/opt/xgboost`. 
Use the following links to downl ``` bash export SPARK_XGBOOST_DIR=/opt/xgboost -export RAPIDS_JAR=${SPARK_XGBOOST_DIR}/rapids-4-spark_2.12-22.08.0.jar +export RAPIDS_JAR=${SPARK_XGBOOST_DIR}/rapids-4-spark_2.12-22.10.0.jar export SAMPLE_JAR=${SPARK_XGBOOST_DIR}/sample_xgboost_apps-0.2.3-jar-with-dependencies.jar ``` diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile b/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile index ea40e1ed4..ba511c45f 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile +++ b/examples/ML+DL-Examples/Spark-cuML/pca/Dockerfile @@ -17,7 +17,7 @@ ARG CUDA_VER=11.5.1 FROM nvidia/cuda:${CUDA_VER}-devel-ubuntu20.04 -ARG BRANCH_VER=22.08 +ARG BRANCH_VER=22.10 RUN apt-get update RUN apt-get install -y wget ninja-build git diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/README.md b/examples/ML+DL-Examples/Spark-cuML/pca/README.md index 4c1d9e861..1086c6907 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/README.md +++ b/examples/ML+DL-Examples/Spark-cuML/pca/README.md @@ -12,7 +12,7 @@ User can also download the release jar from Maven central: [rapids-4-spark-ml_2.12-22.02.0-cuda11.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-ml_2.12/22.02.0/rapids-4-spark-ml_2.12-22.02.0-cuda11.jar) -[rapids-4-spark_2.12-22.08.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar) +[rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) ## Sample code @@ -48,7 +48,7 @@ It is assumed that a Standalone Spark cluster has been set up, the `SPARK_MASTER ``` bash RAPIDS_ML_JAR=PATH_TO_rapids-4-spark-ml_2.12-22.02.0-cuda11.jar - PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-22.08.0.jar + PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-22.10.0.jar jupyter toree install \ --spark_home=${SPARK_HOME} \ diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml index 7a7b399d5..875ada38a 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml +++ b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml @@ -21,7 +21,7 @@ com.nvidia PCAExample jar - 22.08.0-SNAPSHOT + 22.10.0-SNAPSHOT 8 @@ -51,7 +51,7 @@ com.nvidia rapids-4-spark-ml_2.12 - 22.08.0-SNAPSHOT + 22.10.0-SNAPSHOT diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh index 03381d8e9..a167ad0cc 100755 --- a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh +++ b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh @@ -15,8 +15,8 @@ # limitations under the License. 
# -ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/22.08.0-SNAPSHOT/rapids-4-spark-ml_2.12-22.08.0-SNAPSHOT.jar -PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/22.08.0-SNAPSHOT/rapids-4-spark_2.12-22.08.0-SNAPSHOT.jar +ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/22.10.0-SNAPSHOT/rapids-4-spark-ml_2.12-22.10.0-SNAPSHOT.jar +PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/22.10.0-SNAPSHOT/rapids-4-spark_2.12-22.10.0-SNAPSHOT.jar $SPARK_HOME/bin/spark-submit \ --master spark://127.0.0.1:7077 \ @@ -38,4 +38,4 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.network.timeout=1000s \ --jars $ML_JAR,$PLUGIN_JAR \ --class com.nvidia.spark.examples.pca.Main \ -/workspace/target/PCAExample-22.08.0-SNAPSHOT.jar +/workspace/target/PCAExample-22.10.0-SNAPSHOT.jar diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb index cc1d11331..d5249e8fd 100644 --- a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb +++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb @@ -22,7 +22,7 @@ "import os\n", "# Change to your cluster ip:port and directories\n", "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"spark:your-ip:port\")\n", - "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-22.08.0.jar\")\n" + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-22.10.0.jar\")\n" ] }, { diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/Dockerfile b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/Dockerfile index 6e83e421b..63c11815a 100644 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/Dockerfile +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/Dockerfile @@ -58,7 +58,7 @@ CUDA_VERSION_MINOR=$(echo $CUDA_VERSION | tr -d '.' | cut -c 3); \ # Set JDK8 as the default Java && update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java -ARG CMAKE_VERSION=3.20.5 +ARG CMAKE_VERSION=3.23.3 # Install CMake RUN cd /tmp \ diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md index 242719b2e..40d3d6cdd 100644 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/README.md @@ -108,7 +108,7 @@ See above Prerequisites section First finish the steps in "Building with Native Code Examples and run test cases" section, then do the following in the docker. 
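As a rough sketch of where the following steps lead: once the plugin jar below is downloaded, a local mode `spark-shell` session with the RAPIDS Accelerator enabled might be launched as follows. The jar location and the `local[*]` master are assumptions for illustration, not part of this patch.

```bash
# Minimal sketch, assuming the plugin jar was downloaded into the current
# directory inside the docker container; adjust paths for your environment.
export RAPIDS_JAR=$(pwd)/rapids-4-spark_2.12-22.10.0.jar

# spark.plugins=com.nvidia.spark.SQLPlugin enables the RAPIDS Accelerator.
$SPARK_HOME/bin/spark-shell \
  --master 'local[*]' \
  --jars ${RAPIDS_JAR} \
  --conf spark.plugins=com.nvidia.spark.SQLPlugin
```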
### Get jars from Maven Central -[rapids-4-spark_2.12-22.08.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar) +[rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) ### Launch a local mode Spark diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml index 95f18cbc4..252d3abc3 100644 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml @@ -25,7 +25,7 @@ user defined functions for use with the RAPIDS Accelerator for Apache Spark - 22.08.0-SNAPSHOT + 22.10.0-SNAPSHOT 1.8 @@ -37,7 +37,7 @@ cuda11 2.12 - 22.08.0 + 22.10.0 3.1.1 2.12.15 ${project.build.directory}/cpp-build diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt index b9b4929d5..6ec503c13 100755 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/src/main/cpp/CMakeLists.txt @@ -14,9 +14,9 @@ # limitations under the License. #============================================================================= -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) -file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.08/RAPIDS.cmake +file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/RAPIDS.cmake ${CMAKE_BINARY_DIR}/RAPIDS.cmake) include(${CMAKE_BINARY_DIR}/RAPIDS.cmake) @@ -32,7 +32,7 @@ if(DEFINED GPU_ARCHS) endif() rapids_cuda_init_architectures(UDFEXAMPLESJNI) -project(UDFEXAMPLESJNI VERSION 22.08.0 LANGUAGES C CXX CUDA) +project(UDFEXAMPLESJNI VERSION 22.10.0 LANGUAGES C CXX CUDA) option(PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) option(BUILD_UDF_BENCHMARKS "Build the benchmarks" OFF) @@ -84,10 +84,10 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -w --expt-extended-lambda --expt-relax set(CUDA_USE_STATIC_CUDA_RUNTIME OFF) rapids_cpm_init() -rapids_cpm_find(cudf 22.08.00 +rapids_cpm_find(cudf 22.10.00 CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/cudf.git - GIT_TAG branch-22.08 + GIT_TAG branch-22.10 GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "BUILD_TESTS OFF" diff --git a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile index 6d81a260c..f9bbff653 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile +++ b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile @@ -39,7 +39,7 @@ RUN conda --version RUN conda install -c conda-forge openjdk=8 maven=3.8.1 -y # install cuDF dependency. 
-RUN conda install -c rapidsai -c nvidia -c conda-forge -c defaults libcuspatial=22.08 python=3.8 -y +RUN conda install -c rapidsai -c nvidia -c conda-forge -c defaults libcuspatial=22.10 python=3.8 -y RUN wget --quiet \ https://github.com/Kitware/CMake/releases/download/v3.21.3/cmake-3.21.3-linux-x86_64.tar.gz \ diff --git a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb index 98839d1ed..a054441a4 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb +++ b/examples/UDF-Examples/Spark-cuSpatial/Dockerfile.awsdb @@ -48,7 +48,7 @@ RUN wget -q https://repo.continuum.io/miniconda/Miniconda3-py38_4.9.2-Linux-x86_ conda config --system --set always_yes True && \ conda clean --all -RUN conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.08 +RUN conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.10 RUN conda install -c conda-forge libgdal==3.3.1 RUN pip install jupyter ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64 diff --git a/examples/UDF-Examples/Spark-cuSpatial/README.md b/examples/UDF-Examples/Spark-cuSpatial/README.md index b90b34be7..3828a8177 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/README.md +++ b/examples/UDF-Examples/Spark-cuSpatial/README.md @@ -65,9 +65,9 @@ Note: The docker env is just for building the jar, not for running the applicati 4. [cuspatial](https://github.com/rapidsai/cuspatial): install libcuspatial ```Bash # Install libcuspatial from conda - conda install -c rapidsai -c nvidia -c conda-forge -c defaults libcuspatial=22.06 + conda install -c rapidsai -c nvidia -c conda-forge -c defaults libcuspatial=22.10 # or below command for the nightly (aka SNAPSHOT) version. - conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.08 + conda install -c rapidsai-nightly -c nvidia -c conda-forge -c defaults libcuspatial=22.10 ``` 5. Build the JAR using `mvn package`. ```Bash @@ -86,7 +86,7 @@ Note: The docker env is just for building the jar, not for running the applicati 2. Set up [a standalone cluster](/docs/get-started/xgboost-examples/on-prem-cluster/standalone-scala.md) of Spark. Make sure the conda/lib is included in LD_LIBRARY_PATH, so that spark executors can load libcuspatial.so. 3. Download Spark RAPIDS JAR - * [Spark RAPIDS JAR v22.08.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar) or above + * [Spark RAPIDS JAR v22.10.0](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar) or above 4. Prepare sample dataset and JARs. Copy the [sample dataset](../../../datasets/cuspatial_data.tar.gz) to `/data/cuspatial_data/`. Copy Spark RAPIDS JAR and `spark-cuspatial-.jar` to `/data/cuspatial_data/jars/`. 
If you build the `spark-cuspatial-.jar` in docker, please copy the jar from docker to local: diff --git a/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh b/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh index 987a3ea52..fead762aa 100755 --- a/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh +++ b/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh @@ -31,7 +31,7 @@ rm -rf $DATA_OUT_PATH # the path to keep the jars of spark-rapids & spark-cuspatial JARS=$ROOT_PATH/jars -JARS_PATH=${JARS_PATH:-$JARS/rapids-4-spark_2.12-22.08.0.jar,$JARS/spark-cuspatial-22.08.0-SNAPSHOT.jar} +JARS_PATH=${JARS_PATH:-$JARS/rapids-4-spark_2.12-22.10.0.jar,$JARS/spark-cuspatial-22.10.0-SNAPSHOT.jar} $SPARK_HOME/bin/spark-submit --master spark://$HOSTNAME:7077 \ --name "Gpu Spatial Join UDF" \ diff --git a/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb b/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb index f13889ed1..04f77452f 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb +++ b/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb @@ -9,7 +9,7 @@ "source": [ "from pyspark.sql import SparkSession\n", "import os\n", - "jarsPath = os.getenv(\"JARS_PATH\", \"/data/cuspatial_data/jars/rapids-4-spark_2.12-22.08.0.jar,/data/cuspatial_data/jars/spark-cuspatial-22.08.0-SNAPSHOT.jar\")\n", + "jarsPath = os.getenv(\"JARS_PATH\", \"/data/cuspatial_data/jars/rapids-4-spark_2.12-22.10.0.jar,/data/cuspatial_data/jars/spark-cuspatial-22.10.0-SNAPSHOT.jar\")\n", "spark = SparkSession.builder \\\n", " .config(\"spark.jars\", jarsPath) \\\n", " .config(\"spark.sql.adaptive.enabled\", \"false\") \\\n", diff --git a/examples/UDF-Examples/Spark-cuSpatial/pom.xml b/examples/UDF-Examples/Spark-cuSpatial/pom.xml index 6f8f08a49..100cc3f1d 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/pom.xml +++ b/examples/UDF-Examples/Spark-cuSpatial/pom.xml @@ -24,13 +24,13 @@ UDF of the cuSpatial case for the RAPIDS Accelerator The RAPIDS accelerated user defined function of the cuSpatial case for use with the RAPIDS Accelerator for Apache Spark - 22.08.0-SNAPSHOT + 22.10.0-SNAPSHOT 1.8 1.8 8 - 22.08.0 + 22.10.0 2.12 3.2.0 ${project.build.directory}/cpp-build diff --git a/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt b/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt index 40eff8c31..50675a42a 100755 --- a/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt +++ b/examples/UDF-Examples/Spark-cuSpatial/src/main/native/CMakeLists.txt @@ -16,7 +16,7 @@ cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) -project(SPATIALUDJNI VERSION 22.08.0 LANGUAGES C CXX CUDA) +project(SPATIALUDJNI VERSION 22.10.0 LANGUAGES C CXX CUDA) ################################################################################################### # - build type ------------------------------------------------------------------------------------ diff --git a/examples/XGBoost-Examples/README.md b/examples/XGBoost-Examples/README.md index f4e654530..69a831af0 100644 --- a/examples/XGBoost-Examples/README.md +++ b/examples/XGBoost-Examples/README.md @@ -12,6 +12,10 @@ In the public cloud, better performance can lead to significantly lower costs as ![mortgage-speedup](/docs/img/guides/mortgage-perf.png) +Note that the test result is based on 21 years of [Fannie Mae Single-Family Loan Performance 
Data](https://capitalmarkets.fanniemae.com/credit-risk-transfer/single-family-credit-risk-transfer/fannie-mae-single-family-loan-performance-data) +on a cluster with 4 A100 GPUs and 512 CPU vcores; performance is affected by many factors, +including data size and GPU type. + In this folder, there are three blueprints for users to learn about using Spark XGBoost and RAPIDS Accelerator on GPUs: diff --git a/examples/XGBoost-Examples/agaricus/scala/src/com/nvidia/spark/examples/agaricus/Main.scala b/examples/XGBoost-Examples/agaricus/scala/src/com/nvidia/spark/examples/agaricus/Main.scala index a2c0f6b04..d81f38a43 100644 --- a/examples/XGBoost-Examples/agaricus/scala/src/com/nvidia/spark/examples/agaricus/Main.scala +++ b/examples/XGBoost-Examples/agaricus/scala/src/com/nvidia/spark/examples/agaricus/Main.scala @@ -63,9 +63,6 @@ object Main { val xgbClassificationModel = if (xgboostArgs.isToTrain) { // build XGBoost classifier val paramMap = xgboostArgs.xgboostParams(Map( - "eta" -> 0.1, - "missing" -> 0.0, - "max_depth" -> 2, "objective" -> "binary:logistic", "eval_sets" -> datasets(1).map(ds => Map("eval" -> ds)).getOrElse(Map.empty) )) diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb index 63a11ccfd..974b6094d 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb @@ -6,7 +6,7 @@ "source": [ "# Dataset\n", "\n", - "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.08/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", + "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", "\n", "# ETL + XGBoost train & transform\n", "\n", diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb index 433d35880..93dd98866 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL.ipynb @@ -6,10 +6,10 @@ "source": [ "## Prerequirement\n", "### 1. Download data\n", - "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.08/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", + "Dataset is derived from Fannie Mae’s [Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. 
Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", "\n", "### 2. Download needed jars\n", - "* [rapids-4-spark_2.12-22.08.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar)\n", + "* [rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar)\n", "\n", "\n", "### 3. Start Spark Standalone\n", @@ -17,7 +17,7 @@ "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-22.08.0.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-22.10.0.jar\n", "$ export PYSPARK_DRIVER_PYTHON=jupyter \n", "$ export PYSPARK_DRIVER_PYTHON_OPTS=notebook\n", "```\n", diff --git a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb index 4a9bee1ec..794a0fa36 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb @@ -16,18 +16,18 @@ "source": [ "## Prerequirement\n", "### 1. Download data\n", - "\n", - "Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.08/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", + "\n", + "Refer to these [instructions](https://github.com/NVIDIA/spark-rapids-examples/blob/branch-22.10/docs/get-started/xgboost-examples/dataset/mortgage.md) to download the dataset.\n", "\n", "### 2. Download needed jars\n", - "* [rapids-4-spark_2.12-22.08.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar)\n", + "* [rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar)\n", "\n", "### 3. Start Spark Standalone\n", "Before Running the script, please setup Spark standalone mode\n", "\n", "### 4. 
Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-22.08.0.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-22.10.0.jar\n", "\n", "```\n", "\n", @@ -462,20 +462,6 @@ " (labelColName, IntegerType)\n", " )\n", "\n", - "val commParamMap = Map(\n", - " \"eta\" -> 0.1,\n", - " \"gamma\" -> 0.1,\n", - " \"missing\" -> 0.0,\n", - " \"max_depth\" -> 10,\n", - " \"max_leaves\" -> 256,\n", - " \"objective\" -> \"binary:logistic\",\n", - " \"grow_policy\" -> \"depthwise\",\n", - " \"min_child_weight\" -> 30,\n", - " \"lambda\" -> 1,\n", - " \"scale_pos_weight\" -> 2,\n", - " \"subsample\" -> 1,\n", - " \"nthread\" -> 1,\n", - " \"num_round\" -> 100)\n", "var cachedDictDF: DataFrame = _" ] }, diff --git a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Mortgage.scala b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Mortgage.scala index c051cff07..89b32f76c 100644 --- a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Mortgage.scala +++ b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Mortgage.scala @@ -60,17 +60,6 @@ private[mortgage] trait Mortgage { lazy val featureNames = schema.filter(_.name != labelColName).map(_.name).toArray val commParamMap = Map( - "eta" -> 0.1, - "gamma" -> 0.1, - "missing" -> 0.0, - "max_depth" -> 10, - "max_leaves" -> 256, "objective" -> "binary:logistic", - "grow_policy" -> "depthwise", - "min_child_weight" -> 30, - "lambda" -> 1, - "scale_pos_weight" -> 2, - "subsample" -> 1, - "nthread" -> 1, "num_round" -> 100) } diff --git a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb index 171f47f4c..54b181513 100644 --- a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb +++ b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb @@ -19,14 +19,14 @@ "All data could be found at https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page\n", "\n", "### 2. Download needed jars\n", - "* [rapids-4-spark_2.12-22.08.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar)\n", + "* [rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar)\n", "\n", "### 3. Start Spark Standalone\n", "Before running the script, please setup Spark standalone mode\n", "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-22.08.0.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-22.10.0.jar\n", "$ export PYSPARK_DRIVER_PYTHON=jupyter \n", "$ export PYSPARK_DRIVER_PYTHON_OPTS=notebook\n", "```\n", diff --git a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb index 0f14cdc65..9b1d891ce 100644 --- a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb +++ b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb @@ -19,14 +19,14 @@ "All data could be found at https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page\n", "\n", "### 2. Download needed jar\n", - "* [rapids-4-spark_2.12-22.08.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.08.0/rapids-4-spark_2.12-22.08.0.jar)\n", + "* [rapids-4-spark_2.12-22.10.0.jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.10.0/rapids-4-spark_2.12-22.10.0.jar)\n", "\n", "### 3. 
Start Spark Standalone\n", "Before running the script, please setup Spark standalone mode\n", "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-22.08.0.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-22.10.0.jar\n", "\n", "```\n", "\n", diff --git a/examples/XGBoost-Examples/taxi/scala/src/com/nvidia/spark/examples/taxi/Taxi.scala b/examples/XGBoost-Examples/taxi/scala/src/com/nvidia/spark/examples/taxi/Taxi.scala index 8d0a248e8..2de25acbd 100644 --- a/examples/XGBoost-Examples/taxi/scala/src/com/nvidia/spark/examples/taxi/Taxi.scala +++ b/examples/XGBoost-Examples/taxi/scala/src/com/nvidia/spark/examples/taxi/Taxi.scala @@ -27,10 +27,7 @@ private[taxi] trait Taxi { lazy val featureNames = etledSchema.filter(_.name != labelColName).map(_.name).toArray lazy val commParamMap = Map( - "learning_rate" -> 0.05, - "max_depth" -> 8, - "subsample" -> 0.8, - "gamma" -> 1 + "num_round" -> 100 ) val rawSchema = StructType(Seq( From 5f7707079e47db921d0c874d820b473f64253e18 Mon Sep 17 00:00:00 2001 From: Matt Ahrens Date: Sun, 20 Nov 2022 19:44:44 -0600 Subject: [PATCH 2/8] Adding Databricks tool demo notebooks for qualification and profiling (#249) * Adding Databricks tool demo notebooks for basic qualification and profiling Signed-off-by: Matt Ahrens * Adding README updates for the databricks tool notebooks Signed-off-by: Matt Ahrens * Fixing typo in OUTPUT_DIR in qual notebook Signed-off-by: Matt Ahrens Signed-off-by: Matt Ahrens --- README.md | 1 + tools/databricks/README.md | 12 ++++++++++++ ...che Spark] Profiling Tool Notebook Template.ipynb | 1 + ...Spark] Qualification Tool Notebook Template.ipynb | 1 + 4 files changed, 15 insertions(+) create mode 100644 tools/databricks/README.md create mode 100644 tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb create mode 100644 tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb diff --git a/README.md b/README.md index a84a738eb..6c4df4ca5 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ There are broadly four categories of examples in this repo: 2. [Spark XGBoost](./examples/XGBoost-Examples) 3. [Deep Learning/Machine Learning](./examples/ML+DL-Examples) 4. [RAPIDS UDF](./examples/UDF-Examples) +5. [Databricks Tools demo notebooks](./tools/databricks) For more information on each of the examples please look into respective categories. diff --git a/tools/databricks/README.md b/tools/databricks/README.md new file mode 100644 index 000000000..467c5d277 --- /dev/null +++ b/tools/databricks/README.md @@ -0,0 +1,12 @@ +# Databricks Tools Demo Notebooks + +The RAPIDS Accelerator for Apache Spark includes two key tools for understanding the benefits of +GPU acceleration as well as analyzing GPU Spark jobs. For customers on Databricks, the demo +notebooks offer a simple interface for running the tools given a set of Spark event logs from +CPU (qualification) or GPU (profiling) application runs. + +To use a demo notebook, you can import the notebook in the Databricks Notebook UI via File->Import Notebook. + +Once the demo notebook is imported, you can select run to attach the notebook to an available compute +cluster. Once the notebook is attached, you can enter the log path location in the text widget at the +top of the notebook. After that, select *Run all* to execute the tools for the specific logs in the log path. 
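For reference, the demo notebooks added in this patch drive the tools by downloading the tools jar from Maven Central and invoking it with `java`, as visible in the notebook sources below. A minimal command line sketch of that same flow, in which the `/tmp` locations and the `/dbfs/...` event log paths are placeholders to adjust for your workspace, looks like:

```bash
# Fetch the tools jar used by the notebooks.
wget -O /tmp/rapids-4-spark-tools.jar \
  https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/22.10.0/rapids-4-spark-tools_2.12-22.10.0.jar

# Qualification tool: estimates GPU speed-up from CPU event logs.
java -Xmx10g -cp '/tmp/rapids-4-spark-tools.jar:/databricks/jars/*' \
  com.nvidia.spark.rapids.tool.qualification.QualificationMain \
  -o /tmp /dbfs/path/to/cpu-eventlogs

# Profiling tool: analyzes GPU event logs and emits tuning recommendations.
java -Xmx10g -cp '/tmp/rapids-4-spark-tools.jar:/databricks/jars/*' \
  com.nvidia.spark.rapids.tool.profiling.ProfileMain \
  --csv --auto-tuner -o /tmp /dbfs/path/to/gpu-eventlogs
```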
diff --git a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb new file mode 100644 index 000000000..4e0815095 --- /dev/null +++ b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","source":["# Welcome to the Profiling Tool for the RAPIDS Accelerator for Apache Spark\nTo run the tool, you need to enter a log path that represents the DBFS location for your Spark GPU event logs. Then you can select \"Run all\" to execute the notebook. After the notebook completes, you will see various output tables show up below.\n\n## GPU Job Tuning Recommendations\nThis has general suggestions for tuning your applications to run optimally on GPUs.\n\n## Per-Job Profile\nThe profiler output includes information about the application, data sources, executors, SQL stages, Spark properties, and key application metrics at the job and stage levels."],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"5156a76c-7af7-465d-aff4-41a2e54e3595","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["import json\nimport requests\nimport base64\nimport shlex\nimport subprocess\nimport pandas as pd\n\nTOOL_JAR_URL = 'https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/22.10.0/rapids-4-spark-tools_2.12-22.10.0.jar'\nTOOL_JAR_LOCAL_PATH = '/tmp/rapids-4-spark-tools.jar'\n\n# Profiling tool output directory.\nOUTPUT_DIR = '/tmp' \n\nresponse = requests.get(TOOL_JAR_URL)\nopen(TOOL_JAR_LOCAL_PATH, \"wb\").write(response.content)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"53b4d770-9db6-4bd7-9b93-d036d375eac5","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dbutils.widgets.text(\"log_path\", \"\")"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"f0e4371a-d2d9-4449-81ed-8f6c61ae8f80","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["eventlog_string=dbutils.widgets.get(\"log_path\") \n\nq_command_string=\"java -Xmx10g -cp /tmp/rapids-4-spark-tools.jar:/databricks/jars/* com.nvidia.spark.rapids.tool.profiling.ProfileMain --csv --auto-tuner -o {} \".format(OUTPUT_DIR) + eventlog_string\nargs = shlex.split(q_command_string)\ncmd_out = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n\nif cmd_out.returncode != 0:\n dbutils.notebook.exit(\"Profiling Tool failed with stderr:\" + cmd_out.stderr)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"e9e7cecf-c2dc-4a0f-aea1-61a323e4ccc4","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["import os\n\napp_df = pd.DataFrame(columns = ['appId', 'appName'])\n\nfor x in os.scandir(OUTPUT_DIR + \"/rapids_4_spark_profile/\"):\n tmp_df = pd.read_csv(x.path + \"/application_information.csv\")\n app_df = app_df.append(tmp_df[['appId', 'appName']])"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"be0a2da7-1ee3-475e-96f9-303779edfd85","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## GPU Job Tuning 
Recommendations"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"a1e326ec-5701-4b08-ae0f-7df0c8440038","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["app_list = app_df[\"appId\"].tolist()\napp_recommendations = pd.DataFrame(columns=['app', 'recommendations'])\n\nfor app in app_list:\n app_file = open(OUTPUT_DIR + \"/rapids_4_spark_profile/\" + app + \"/profile.log\")\n recommendations_start = 0\n recommendations_str = \"\"\n for line in app_file:\n if recommendations_start == 1:\n recommendations_str = recommendations_str + line\n if \"### D. Recommended Configuration ###\" in line:\n recommendations_start = 1\n app_recommendations = app_recommendations.append({'app': app, 'recommendations': recommendations_str}, ignore_index=True)\n \ndisplay(app_recommendations)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"4979f78c-44a0-4e54-b803-e5e194b71104","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Per-App Profile"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"1d4f9927-e9d8-4897-b604-f7832dc634aa","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["for x in os.scandir(OUTPUT_DIR + \"/rapids_4_spark_profile/\"):\n print(\"APPLICATION ID = \" + str(x))\n log = open(x.path + \"/profile.log\")\n print(log.read())"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"9a8f1a58-e86f-4bd0-a245-878186feb8b9","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template","dashboards":[{"elements":[{"elementNUID":"be0a2da7-1ee3-475e-96f9-303779edfd85","dashboardResultIndex":0,"guid":"05eef9d3-7c55-4e26-8d1f-fa80338359e6","resultIndex":null,"options":null,"position":{"x":0,"y":0,"height":6,"width":24,"z":null},"elementType":"command"}],"guid":"a9ea7799-040a-484e-a59d-c3cdf5072953","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"91c1bfb2-695a-4e5c-8a25-848a433108dc","origId":2690941040041430,"title":"Executive View","width":1600,"globalVars":{}},{"elements":[],"guid":"0896a45f-af1b-4849-b6c2-2b6abcb8b97b","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"62243296-4562-4f06-90ac-d7a609f19c16","origId":2690941040041431,"title":"App View","width":1920,"globalVars":{}}],"notebookMetadata":{"pythonIndentUnit":2,"widgetLayout":[{"name":"log_path","width":576,"breakBefore":false},{"name":"Apps","width":494,"breakBefore":false}]},"language":"python","widgets":{"log_path":{"nuid":"c7ce3870-db19-4813-b1cb-cead3f4c36f1","currentValue":"/dbfs/","widgetInfo":{"widgetType":"text","name":"log_path","defaultValue":"","label":null,"options":{"widgetType":"text","validationRegex":null}}}},"notebookOrigID":2690941040041407}},"nbformat":4,"nbformat_minor":0} diff --git a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb new file mode 100644 index 000000000..d2fb9823c --- /dev/null +++ b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","source":["# Welcome to the Qualification Tool for the RAPIDS Accelerator for Apache 
Spark\nTo run the tool, you need to enter a log path that represents the DBFS location for your Spark CPU event logs. Then you can select \"Run all\" to execute the notebook. After the notebook completes, you will see various output tables show up below.\n\n## Summary Output\nThe report represents the entire app execution, including unsupported operators and non-SQL operations. By default, the applications and queries are sorted in descending order by the following fields:\n- Recommendation;\n- Estimated GPU Speed-up;\n- Estimated GPU Time Saved; and\n- End Time.\n\n## Stages Output\nFor each stage used in SQL operations, the Qualification tool generates the following information:\n1. App ID\n1. Stage ID\n1. Average Speedup Factor: the average estimated speed-up of all the operators in the given stage.\n1. Stage Task Duration: amount of time spent in tasks of SQL Dataframe operations for the given stage.\n1. Unsupported Task Duration: sum of task durations for the unsupported operators. For more details, see Supported Operators.\n1. Stage Estimated: True or False indicates if we had to estimate the stage duration.\n\n## Execs Output\nThe Qualification tool generates a report of the “Exec” in the “SparkPlan” or “Executor Nodes” along with the estimated acceleration on the GPU. Please refer to the Supported Operators guide for more details on limitations on UDFs and unsupported operators.\n1. App ID\n1. SQL ID\n1. Exec Name: example Filter, HashAggregate\n1. Expression Name\n1. Task Speedup Factor: it is simply the average acceleration of the operators based on the original CPU duration of the operator divided by the GPU duration. The tool uses historical queries and benchmarks to estimate a speed-up at an individual operator level to calculate how much a specific operator would accelerate on GPU.\n1. Exec Duration: wall-Clock time measured since the operator starts till it is completed.\n1. SQL Node Id\n1. Exec Is Supported: whether the Exec is supported by RAPIDS or not. Please refer to the Supported Operators section.\n1. Exec Stages: an array of stage IDs\n1. Exec Children\n1. Exec Children Node Ids\n1. 
Exec Should Remove: whether the Op is removed from the migrated plan."],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"df33c614-2ecc-47a0-8600-bc891681997f","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["import json\nimport requests\nimport base64\nimport shlex\nimport subprocess\nimport pandas as pd\n\nTOOL_JAR_URL = 'https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/22.10.0/rapids-4-spark-tools_2.12-22.10.0.jar'\nTOOL_JAR_LOCAL_PATH = '/tmp/rapids-4-spark-tools.jar'\n\n# Qualification tool output directory.\nOUTPUT_DIR = '/tmp/'\n\nresponse = requests.get(TOOL_JAR_URL)\nopen(TOOL_JAR_LOCAL_PATH, \"wb\").write(response.content)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"53b4d770-9db6-4bd7-9b93-d036d375eac5","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dbutils.widgets.text(\"log_path\", \"\")\neventlog_string=dbutils.widgets.get(\"log_path\")\n\nq_command_string=\"java -Xmx10g -cp /tmp/rapids-4-spark-tools.jar:/databricks/jars/* com.nvidia.spark.rapids.tool.qualification.QualificationMain -o {} \".format(OUTPUT_DIR) + eventlog_string\nargs = shlex.split(q_command_string)\ncmd_out = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n\n\nif cmd_out.returncode != 0:\n dbutils.notebook.exit(\"Qualification Tool failed with stderr:\" + cmd_out.stderr)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"e9e7cecf-c2dc-4a0f-aea1-61a323e4ccc4","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Summary Output"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"bbe50fde-0bd6-4281-95fd-6a1ec6f17ab2","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["summary_output=pd.read_csv(OUTPUT_DIR + \"rapids_4_spark_qualification_output/rapids_4_spark_qualification_output.csv\")\ndisplay(summary_output)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"fb8edb26-e173-47ff-92a1-463baec7c06b","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Stages Output"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"6756159b-30ca-407a-ab6b-9c29ced01ea6","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["stages_output=pd.read_csv(OUTPUT_DIR + \"rapids_4_spark_qualification_output/rapids_4_spark_qualification_output_stages.csv\")\ndisplay(stages_output)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"cdde6177-db5f-434a-995b-776678a64a3a","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Execs Output"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"4d7ce219-ae75-4a0c-a78c-4e7f25b8cd6f","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["execs_output=pd.read_csv(OUTPUT_DIR + \"rapids_4_spark_qualification_output/rapids_4_spark_qualification_output_execs.csv\")\ndisplay(execs_output)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"998b0c51-0cb6-408e-a01a-d1f5b1a61e1f","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"[RAPIDS 
Accelerator for Apache Spark] Qualification Tool Notebook Template","dashboards":[{"elements":[],"guid":"0ed3c80b-b2f6-4c89-9a92-1af2f168d5ea","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"91c1bfb2-695a-4e5c-8a25-848a433108dc","origId":2721260844584915,"title":"Executive View","width":1600,"globalVars":{}},{"elements":[],"guid":"ab4cecf9-0471-4fee-aa33-8927bb7e1bb1","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"62243296-4562-4f06-90ac-d7a609f19c16","origId":2721260844584916,"title":"App View","width":1920,"globalVars":{}}],"notebookMetadata":{"pythonIndentUnit":2,"widgetLayout":[{"name":"log_path","width":1152,"breakBefore":false}]},"language":"python","widgets":{"log_path":{"nuid":"88986aa6-6e67-4d09-aeeb-7c96ea1ea8f1","currentValue":"/dbfs/","widgetInfo":{"widgetType":"text","name":"log_path","defaultValue":"","label":null,"options":{"widgetType":"text","validationRegex":null}}}},"notebookOrigID":2721260844584890}},"nbformat":4,"nbformat_minor":0} From c1af0cd600115741687fcdc90a75f18e61b22c3b Mon Sep 17 00:00:00 2001 From: liyuan <84758614+nvliyuan@users.noreply.github.com> Date: Mon, 21 Nov 2022 11:33:28 +0800 Subject: [PATCH 3/8] Revert "Adding Databricks tool demo notebooks for qualification and profiling (#249)" (#251) This reverts commit 5f7707079e47db921d0c874d820b473f64253e18. --- README.md | 1 - tools/databricks/README.md | 12 ------------ ...che Spark] Profiling Tool Notebook Template.ipynb | 1 - ...Spark] Qualification Tool Notebook Template.ipynb | 1 - 4 files changed, 15 deletions(-) delete mode 100644 tools/databricks/README.md delete mode 100644 tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb delete mode 100644 tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb diff --git a/README.md b/README.md index 6c4df4ca5..a84a738eb 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,6 @@ There are broadly four categories of examples in this repo: 2. [Spark XGBoost](./examples/XGBoost-Examples) 3. [Deep Learning/Machine Learning](./examples/ML+DL-Examples) 4. [RAPIDS UDF](./examples/UDF-Examples) -5. [Databricks Tools demo notebooks](./tools/databricks) For more information on each of the examples please look into respective categories. diff --git a/tools/databricks/README.md b/tools/databricks/README.md deleted file mode 100644 index 467c5d277..000000000 --- a/tools/databricks/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# Databricks Tools Demo Notebooks - -The RAPIDS Accelerator for Apache Spark includes two key tools for understanding the benefits of -GPU acceleration as well as analyzing GPU Spark jobs. For customers on Databricks, the demo -notebooks offer a simple interface for running the tools given a set of Spark event logs from -CPU (qualification) or GPU (profiling) application runs. - -To use a demo notebook, you can import the notebook in the Databricks Notebook UI via File->Import Notebook. - -Once the demo notebook is imported, you can select run to attach the notebook to an available compute -cluster. Once the notebook is attached, you can enter the log path location in the text widget at the -top of the notebook. After that, select *Run all* to execute the tools for the specific logs in the log path. 
diff --git a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb deleted file mode 100644 index 4e0815095..000000000 --- a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"cells":[{"cell_type":"markdown","source":["# Welcome to the Profiling Tool for the RAPIDS Accelerator for Apache Spark\nTo run the tool, you need to enter a log path that represents the DBFS location for your Spark GPU event logs. Then you can select \"Run all\" to execute the notebook. After the notebook completes, you will see various output tables show up below.\n\n## GPU Job Tuning Recommendations\nThis has general suggestions for tuning your applications to run optimally on GPUs.\n\n## Per-Job Profile\nThe profiler output includes information about the application, data sources, executors, SQL stages, Spark properties, and key application metrics at the job and stage levels."],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"5156a76c-7af7-465d-aff4-41a2e54e3595","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["import json\nimport requests\nimport base64\nimport shlex\nimport subprocess\nimport pandas as pd\n\nTOOL_JAR_URL = 'https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/22.10.0/rapids-4-spark-tools_2.12-22.10.0.jar'\nTOOL_JAR_LOCAL_PATH = '/tmp/rapids-4-spark-tools.jar'\n\n# Profiling tool output directory.\nOUTPUT_DIR = '/tmp' \n\nresponse = requests.get(TOOL_JAR_URL)\nopen(TOOL_JAR_LOCAL_PATH, \"wb\").write(response.content)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"53b4d770-9db6-4bd7-9b93-d036d375eac5","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dbutils.widgets.text(\"log_path\", \"\")"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"f0e4371a-d2d9-4449-81ed-8f6c61ae8f80","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["eventlog_string=dbutils.widgets.get(\"log_path\") \n\nq_command_string=\"java -Xmx10g -cp /tmp/rapids-4-spark-tools.jar:/databricks/jars/* com.nvidia.spark.rapids.tool.profiling.ProfileMain --csv --auto-tuner -o {} \".format(OUTPUT_DIR) + eventlog_string\nargs = shlex.split(q_command_string)\ncmd_out = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n\nif cmd_out.returncode != 0:\n dbutils.notebook.exit(\"Profiling Tool failed with stderr:\" + cmd_out.stderr)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"e9e7cecf-c2dc-4a0f-aea1-61a323e4ccc4","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["import os\n\napp_df = pd.DataFrame(columns = ['appId', 'appName'])\n\nfor x in os.scandir(OUTPUT_DIR + \"/rapids_4_spark_profile/\"):\n tmp_df = pd.read_csv(x.path + \"/application_information.csv\")\n app_df = app_df.append(tmp_df[['appId', 'appName']])"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"be0a2da7-1ee3-475e-96f9-303779edfd85","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## GPU Job Tuning 
Recommendations"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"a1e326ec-5701-4b08-ae0f-7df0c8440038","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["app_list = app_df[\"appId\"].tolist()\napp_recommendations = pd.DataFrame(columns=['app', 'recommendations'])\n\nfor app in app_list:\n app_file = open(OUTPUT_DIR + \"/rapids_4_spark_profile/\" + app + \"/profile.log\")\n recommendations_start = 0\n recommendations_str = \"\"\n for line in app_file:\n if recommendations_start == 1:\n recommendations_str = recommendations_str + line\n if \"### D. Recommended Configuration ###\" in line:\n recommendations_start = 1\n app_recommendations = app_recommendations.append({'app': app, 'recommendations': recommendations_str}, ignore_index=True)\n \ndisplay(app_recommendations)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"4979f78c-44a0-4e54-b803-e5e194b71104","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Per-App Profile"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"1d4f9927-e9d8-4897-b604-f7832dc634aa","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["for x in os.scandir(OUTPUT_DIR + \"/rapids_4_spark_profile/\"):\n print(\"APPLICATION ID = \" + str(x))\n log = open(x.path + \"/profile.log\")\n print(log.read())"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"9a8f1a58-e86f-4bd0-a245-878186feb8b9","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template","dashboards":[{"elements":[{"elementNUID":"be0a2da7-1ee3-475e-96f9-303779edfd85","dashboardResultIndex":0,"guid":"05eef9d3-7c55-4e26-8d1f-fa80338359e6","resultIndex":null,"options":null,"position":{"x":0,"y":0,"height":6,"width":24,"z":null},"elementType":"command"}],"guid":"a9ea7799-040a-484e-a59d-c3cdf5072953","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"91c1bfb2-695a-4e5c-8a25-848a433108dc","origId":2690941040041430,"title":"Executive View","width":1600,"globalVars":{}},{"elements":[],"guid":"0896a45f-af1b-4849-b6c2-2b6abcb8b97b","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"62243296-4562-4f06-90ac-d7a609f19c16","origId":2690941040041431,"title":"App View","width":1920,"globalVars":{}}],"notebookMetadata":{"pythonIndentUnit":2,"widgetLayout":[{"name":"log_path","width":576,"breakBefore":false},{"name":"Apps","width":494,"breakBefore":false}]},"language":"python","widgets":{"log_path":{"nuid":"c7ce3870-db19-4813-b1cb-cead3f4c36f1","currentValue":"/dbfs/","widgetInfo":{"widgetType":"text","name":"log_path","defaultValue":"","label":null,"options":{"widgetType":"text","validationRegex":null}}}},"notebookOrigID":2690941040041407}},"nbformat":4,"nbformat_minor":0} diff --git a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb deleted file mode 100644 index d2fb9823c..000000000 --- a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb +++ /dev/null @@ -1 +0,0 @@ -{"cells":[{"cell_type":"markdown","source":["# Welcome to the Qualification Tool for the RAPIDS Accelerator for Apache 
Spark\nTo run the tool, you need to enter a log path that represents the DBFS location for your Spark CPU event logs. Then you can select \"Run all\" to execute the notebook. After the notebook completes, you will see various output tables show up below.\n\n## Summary Output\nThe report represents the entire app execution, including unsupported operators and non-SQL operations. By default, the applications and queries are sorted in descending order by the following fields:\n- Recommendation;\n- Estimated GPU Speed-up;\n- Estimated GPU Time Saved; and\n- End Time.\n\n## Stages Output\nFor each stage used in SQL operations, the Qualification tool generates the following information:\n1. App ID\n1. Stage ID\n1. Average Speedup Factor: the average estimated speed-up of all the operators in the given stage.\n1. Stage Task Duration: amount of time spent in tasks of SQL Dataframe operations for the given stage.\n1. Unsupported Task Duration: sum of task durations for the unsupported operators. For more details, see Supported Operators.\n1. Stage Estimated: True or False indicates if we had to estimate the stage duration.\n\n## Execs Output\nThe Qualification tool generates a report of the “Exec” in the “SparkPlan” or “Executor Nodes” along with the estimated acceleration on the GPU. Please refer to the Supported Operators guide for more details on limitations on UDFs and unsupported operators.\n1. App ID\n1. SQL ID\n1. Exec Name: example Filter, HashAggregate\n1. Expression Name\n1. Task Speedup Factor: it is simply the average acceleration of the operators based on the original CPU duration of the operator divided by the GPU duration. The tool uses historical queries and benchmarks to estimate a speed-up at an individual operator level to calculate how much a specific operator would accelerate on GPU.\n1. Exec Duration: wall-Clock time measured since the operator starts till it is completed.\n1. SQL Node Id\n1. Exec Is Supported: whether the Exec is supported by RAPIDS or not. Please refer to the Supported Operators section.\n1. Exec Stages: an array of stage IDs\n1. Exec Children\n1. Exec Children Node Ids\n1. 
Exec Should Remove: whether the Op is removed from the migrated plan."],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"df33c614-2ecc-47a0-8600-bc891681997f","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["import json\nimport requests\nimport base64\nimport shlex\nimport subprocess\nimport pandas as pd\n\nTOOL_JAR_URL = 'https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark-tools_2.12/22.10.0/rapids-4-spark-tools_2.12-22.10.0.jar'\nTOOL_JAR_LOCAL_PATH = '/tmp/rapids-4-spark-tools.jar'\n\n# Qualification tool output directory.\nOUTPUT_DIR = '/tmp/'\n\nresponse = requests.get(TOOL_JAR_URL)\nopen(TOOL_JAR_LOCAL_PATH, \"wb\").write(response.content)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"53b4d770-9db6-4bd7-9b93-d036d375eac5","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"code","source":["dbutils.widgets.text(\"log_path\", \"\")\neventlog_string=dbutils.widgets.get(\"log_path\")\n\nq_command_string=\"java -Xmx10g -cp /tmp/rapids-4-spark-tools.jar:/databricks/jars/* com.nvidia.spark.rapids.tool.qualification.QualificationMain -o {} \".format(OUTPUT_DIR) + eventlog_string\nargs = shlex.split(q_command_string)\ncmd_out = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n\n\nif cmd_out.returncode != 0:\n dbutils.notebook.exit(\"Qualification Tool failed with stderr:\" + cmd_out.stderr)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"e9e7cecf-c2dc-4a0f-aea1-61a323e4ccc4","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Summary Output"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"bbe50fde-0bd6-4281-95fd-6a1ec6f17ab2","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["summary_output=pd.read_csv(OUTPUT_DIR + \"rapids_4_spark_qualification_output/rapids_4_spark_qualification_output.csv\")\ndisplay(summary_output)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"fb8edb26-e173-47ff-92a1-463baec7c06b","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Stages Output"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"6756159b-30ca-407a-ab6b-9c29ced01ea6","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["stages_output=pd.read_csv(OUTPUT_DIR + \"rapids_4_spark_qualification_output/rapids_4_spark_qualification_output_stages.csv\")\ndisplay(stages_output)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"cdde6177-db5f-434a-995b-776678a64a3a","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0},{"cell_type":"markdown","source":["## Execs Output"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"4d7ce219-ae75-4a0c-a78c-4e7f25b8cd6f","inputWidgets":{},"title":""}}},{"cell_type":"code","source":["execs_output=pd.read_csv(OUTPUT_DIR + \"rapids_4_spark_qualification_output/rapids_4_spark_qualification_output_execs.csv\")\ndisplay(execs_output)"],"metadata":{"application/vnd.databricks.v1+cell":{"showTitle":false,"cellMetadata":{},"nuid":"998b0c51-0cb6-408e-a01a-d1f5b1a61e1f","inputWidgets":{},"title":""}},"outputs":[],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"[RAPIDS 
Accelerator for Apache Spark] Qualification Tool Notebook Template","dashboards":[{"elements":[],"guid":"0ed3c80b-b2f6-4c89-9a92-1af2f168d5ea","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"91c1bfb2-695a-4e5c-8a25-848a433108dc","origId":2721260844584915,"title":"Executive View","width":1600,"globalVars":{}},{"elements":[],"guid":"ab4cecf9-0471-4fee-aa33-8927bb7e1bb1","layoutOption":{"stack":true,"grid":true},"version":"DashboardViewV1","nuid":"62243296-4562-4f06-90ac-d7a609f19c16","origId":2721260844584916,"title":"App View","width":1920,"globalVars":{}}],"notebookMetadata":{"pythonIndentUnit":2,"widgetLayout":[{"name":"log_path","width":1152,"breakBefore":false}]},"language":"python","widgets":{"log_path":{"nuid":"88986aa6-6e67-4d09-aeeb-7c96ea1ea8f1","currentValue":"/dbfs/","widgetInfo":{"widgetType":"text","name":"log_path","defaultValue":"","label":null,"options":{"widgetType":"text","validationRegex":null}}}},"notebookOrigID":2721260844584890}},"nbformat":4,"nbformat_minor":0} From 79c0601a6eac1b60aea8604c1efadd61b7b3a2f8 Mon Sep 17 00:00:00 2001 From: liyuan Date: Mon, 20 Feb 2023 10:29:02 +0800 Subject: [PATCH 4/8] remove snapshot Signed-off-by: liyuan --- examples/ML+DL-Examples/Spark-cuML/pca/README.md | 2 +- examples/ML+DL-Examples/Spark-cuML/pca/pom.xml | 4 ++-- examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh | 6 +++--- .../micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb | 2 +- examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml | 4 ++-- .../RAPIDS-accelerated-UDFs/run_pyspark_from_build.sh | 2 +- examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh | 2 +- .../notebooks/cuspatial_sample_standalone.ipynb | 2 +- examples/UDF-Examples/Spark-cuSpatial/pom.xml | 4 ++-- examples/XGBoost-Examples/agaricus/pom.xml | 2 +- examples/XGBoost-Examples/aggregator/pom.xml | 2 +- .../mortgage/notebooks/python/cv-mortgage-gpu.ipynb | 2 +- .../mortgage/notebooks/scala/mortgage-ETL.ipynb | 2 +- examples/XGBoost-Examples/mortgage/pom.xml | 2 +- examples/XGBoost-Examples/pom.xml | 2 +- .../XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb | 2 +- .../XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb | 2 +- examples/XGBoost-Examples/taxi/pom.xml | 2 +- examples/XGBoost-Examples/utility/pom.xml | 2 +- 19 files changed, 24 insertions(+), 24 deletions(-) diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/README.md b/examples/ML+DL-Examples/Spark-cuML/pca/README.md index d0185f7d0..e83e40683 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/README.md +++ b/examples/ML+DL-Examples/Spark-cuML/pca/README.md @@ -48,7 +48,7 @@ It is assumed that a Standalone Spark cluster has been set up, the `SPARK_MASTER ``` bash RAPIDS_ML_JAR=PATH_TO_rapids-4-spark-ml_2.12-22.02.0-cuda11.jar - PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-23.02.0-SNAPSHOT.jar + PLUGIN_JAR=PATH_TO_rapids-4-spark_2.12-23.02.0.jar jupyter toree install \ --spark_home=${SPARK_HOME} \ diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml index bcdc8192f..a1182028d 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml +++ b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml @@ -21,7 +21,7 @@ com.nvidia PCAExample jar - 23.02.0-SNAPSHOT + 23.02.0 8 @@ -51,7 +51,7 @@ com.nvidia rapids-4-spark-ml_2.12 - 23.02.0-SNAPSHOT + 23.02.0 diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh index e0991aa36..9d5b25a9f 100755 --- a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh +++ 
b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh @@ -15,8 +15,8 @@ # limitations under the License. # -ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/23.02.0-SNAPSHOT/rapids-4-spark-ml_2.12-23.02.0-SNAPSHOT.jar -PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.02.0-SNAPSHOT/rapids-4-spark_2.12-23.02.0-SNAPSHOT.jar +ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/23.02.0/rapids-4-spark-ml_2.12-23.02.0.jar +PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar $SPARK_HOME/bin/spark-submit \ --master spark://127.0.0.1:7077 \ @@ -38,4 +38,4 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.network.timeout=1000s \ --jars $ML_JAR,$PLUGIN_JAR \ --class com.nvidia.spark.examples.pca.Main \ -/workspace/target/PCAExample-23.02.0-SNAPSHOT.jar +/workspace/target/PCAExample-23.02.0.jar diff --git a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb index d900a2ec3..0a672cc09 100644 --- a/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb +++ b/examples/SQL+DF-Examples/micro-benchmarks/notebooks/micro-benchmarks-gpu.ipynb @@ -22,7 +22,7 @@ "import os\n", "# Change to your cluster ip:port and directories\n", "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"spark:your-ip:port\")\n", - "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-23.02.0-SNAPSHOT.jar\")\n" + "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-path/rapids-4-spark_2.12-23.02.0.jar\")\n" ] }, { diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml index ea9a25dac..92593984e 100644 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/pom.xml @@ -25,7 +25,7 @@ user defined functions for use with the RAPIDS Accelerator for Apache Spark - 23.02.0-SNAPSHOT + 23.02.0 1.8 @@ -37,7 +37,7 @@ cuda11 2.12 - 23.02.0-SNAPSHOT + 23.02.0 3.1.1 2.12.15 ${project.build.directory}/cpp-build diff --git a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/run_pyspark_from_build.sh b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/run_pyspark_from_build.sh index 6cf4f5c70..a802066df 100755 --- a/examples/UDF-Examples/RAPIDS-accelerated-UDFs/run_pyspark_from_build.sh +++ b/examples/UDF-Examples/RAPIDS-accelerated-UDFs/run_pyspark_from_build.sh @@ -33,7 +33,7 @@ else # more lineant configuration, else it will set them to 1 as spurious task failures are not expected # for Spark 3.1.1+ VERSION_STRING=`$SPARK_HOME/bin/pyspark --version 2>&1|grep -v Scala|awk '/version\ [0-9.]+/{print $NF}'` - VERSION_STRING="${VERSION_STRING/-SNAPSHOT/}" + VERSION_STRING="${VERSION_STRING//}" [[ -z $VERSION_STRING ]] && { echo "Unable to detect the Spark version at $SPARK_HOME"; exit 1; } [[ -z $SPARK_SHIM_VER ]] && { SPARK_SHIM_VER="spark${VERSION_STRING//./}"; } diff --git a/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh b/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh index bd675849d..c79947967 100755 --- a/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh +++ b/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh @@ -31,7 +31,7 @@ rm -rf $DATA_OUT_PATH # the path to keep the jars of spark-rapids & spark-cuspatial JARS=$ROOT_PATH/jars -JARS_PATH=${JARS_PATH:-$JARS/rapids-4-spark_2.12-23.02.0-SNAPSHOT.jar,$JARS/spark-cuspatial-23.02.0-SNAPSHOT.jar} 
+JARS_PATH=${JARS_PATH:-$JARS/rapids-4-spark_2.12-23.02.0.jar,$JARS/spark-cuspatial-23.02.0.jar} $SPARK_HOME/bin/spark-submit --master spark://$HOSTNAME:7077 \ --name "Gpu Spatial Join UDF" \ diff --git a/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb b/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb index 1fd1c4855..7c2a13477 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb +++ b/examples/UDF-Examples/Spark-cuSpatial/notebooks/cuspatial_sample_standalone.ipynb @@ -9,7 +9,7 @@ "source": [ "from pyspark.sql import SparkSession\n", "import os\n", - "jarsPath = os.getenv(\"JARS_PATH\", \"/data/cuspatial_data/jars/rapids-4-spark_2.12-23.02.0-SNAPSHOT.jar,/data/cuspatial_data/jars/spark-cuspatial-23.02.0-SNAPSHOT.jar\")\n", + "jarsPath = os.getenv(\"JARS_PATH\", \"/data/cuspatial_data/jars/rapids-4-spark_2.12-23.02.0.jar,/data/cuspatial_data/jars/spark-cuspatial-23.02.0.jar\")\n", "spark = SparkSession.builder \\\n", " .config(\"spark.jars\", jarsPath) \\\n", " .config(\"spark.sql.adaptive.enabled\", \"false\") \\\n", diff --git a/examples/UDF-Examples/Spark-cuSpatial/pom.xml b/examples/UDF-Examples/Spark-cuSpatial/pom.xml index dd81a615a..324236e27 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/pom.xml +++ b/examples/UDF-Examples/Spark-cuSpatial/pom.xml @@ -24,13 +24,13 @@ UDF of the cuSpatial case for the RAPIDS Accelerator The RAPIDS accelerated user defined function of the cuSpatial case for use with the RAPIDS Accelerator for Apache Spark - 23.02.0-SNAPSHOT + 23.02.0 1.8 1.8 8 - 23.02.0-SNAPSHOT + 23.02.0 2.12 3.2.0 ${project.build.directory}/cpp-build diff --git a/examples/XGBoost-Examples/agaricus/pom.xml b/examples/XGBoost-Examples/agaricus/pom.xml index ee30ec285..6b4bab9a8 100644 --- a/examples/XGBoost-Examples/agaricus/pom.xml +++ b/examples/XGBoost-Examples/agaricus/pom.xml @@ -21,7 +21,7 @@ sample_xgboost_examples com.nvidia - 0.2.3-SNAPSHOT + 0.2.3 4.0.0 diff --git a/examples/XGBoost-Examples/aggregator/pom.xml b/examples/XGBoost-Examples/aggregator/pom.xml index 5fb27d872..3485b8d08 100644 --- a/examples/XGBoost-Examples/aggregator/pom.xml +++ b/examples/XGBoost-Examples/aggregator/pom.xml @@ -21,7 +21,7 @@ sample_xgboost_examples com.nvidia - 0.2.3-SNAPSHOT + 0.2.3 4.0.0 diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb index 971ffdc6a..f2675ac93 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb @@ -63,7 +63,7 @@ "Setting default log level to \"WARN\".\n", "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", "2022-11-25 09:34:43,952 WARN resource.ResourceUtils: The configuration of cores (exec = 4 task = 1, runnable tasks = 4) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. 
Please adjust your configuration.\n", - "2022-11-25 09:34:58,155 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 23.02.0-SNAPSHOT using cudf 23.02.0-SNAPSHOT.\n", + "2022-11-25 09:34:58,155 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 23.02.0 using cudf 23.02.0.\n", "2022-11-25 09:34:58,171 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n", "2022-11-25 09:34:58,175 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n", "2022-11-25 09:34:58,175 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n" diff --git a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb index d1b8e5f7e..faaf3245b 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-ETL.ipynb @@ -27,7 +27,7 @@ "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-23.02.0-SNAPSHOT.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-23.02.0.jar\n", "\n", "```\n", "\n", diff --git a/examples/XGBoost-Examples/mortgage/pom.xml b/examples/XGBoost-Examples/mortgage/pom.xml index 1eb2ffa32..ffef6cfd5 100644 --- a/examples/XGBoost-Examples/mortgage/pom.xml +++ b/examples/XGBoost-Examples/mortgage/pom.xml @@ -21,7 +21,7 @@ sample_xgboost_examples com.nvidia - 0.2.3-SNAPSHOT + 0.2.3 4.0.0 diff --git a/examples/XGBoost-Examples/pom.xml b/examples/XGBoost-Examples/pom.xml index d6977f8c5..a0537bf0d 100644 --- a/examples/XGBoost-Examples/pom.xml +++ b/examples/XGBoost-Examples/pom.xml @@ -33,7 +33,7 @@ aggregator - 0.2.3-SNAPSHOT + 0.2.3 sample_xgboost_apps diff --git a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb index ab334a6bf..774ef7185 100644 --- a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb +++ b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-ETL.ipynb @@ -26,7 +26,7 @@ "\n", "### 4. Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-23.02.0-SNAPSHOT.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-23.02.0.jar\n", "$ export PYSPARK_DRIVER_PYTHON=jupyter \n", "$ export PYSPARK_DRIVER_PYTHON_OPTS=notebook\n", "```\n", diff --git a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb index d8395e439..cf47acfd7 100644 --- a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb +++ b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-ETL.ipynb @@ -26,7 +26,7 @@ "\n", "### 4. 
Add ENV\n", "```\n", - "$ export SPARK_JARS=rapids-4-spark_2.12-23.02.0-SNAPSHOT.jar\n", + "$ export SPARK_JARS=rapids-4-spark_2.12-23.02.0.jar\n", "\n", "```\n", "\n", diff --git a/examples/XGBoost-Examples/taxi/pom.xml b/examples/XGBoost-Examples/taxi/pom.xml index e4cbff02d..5fcb7796b 100644 --- a/examples/XGBoost-Examples/taxi/pom.xml +++ b/examples/XGBoost-Examples/taxi/pom.xml @@ -21,7 +21,7 @@ sample_xgboost_examples com.nvidia - 0.2.3-SNAPSHOT + 0.2.3 4.0.0 diff --git a/examples/XGBoost-Examples/utility/pom.xml b/examples/XGBoost-Examples/utility/pom.xml index ca26ca70d..174b509de 100644 --- a/examples/XGBoost-Examples/utility/pom.xml +++ b/examples/XGBoost-Examples/utility/pom.xml @@ -21,7 +21,7 @@ sample_xgboost_examples com.nvidia - 0.2.3-SNAPSHOT + 0.2.3 4.0.0 From 99d0060871579caf578eea40b68ae61bc34ff04a Mon Sep 17 00:00:00 2001 From: liyuan Date: Thu, 23 Feb 2023 11:14:36 +0800 Subject: [PATCH 5/8] we do not release rapids-4-spark-ml_2.12, submit as snapshot Signed-off-by: liyuan --- examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh index 9d5b25a9f..c77f80930 100755 --- a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh +++ b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh @@ -15,7 +15,7 @@ # limitations under the License. # -ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/23.02.0/rapids-4-spark-ml_2.12-23.02.0.jar +ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/23.02.0/rapids-4-spark-ml_2.12-23.02.0-SNAPSHOT.jar PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar $SPARK_HOME/bin/spark-submit \ From 6213dadc97a218cf12a62dfe0d31edf9fbbd6fa1 Mon Sep 17 00:00:00 2001 From: liyuan Date: Thu, 23 Feb 2023 11:19:25 +0800 Subject: [PATCH 6/8] updated version to v22.02 Signed-off-by: liyuan --- examples/ML+DL-Examples/Spark-cuML/pca/pom.xml | 2 +- examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml index a1182028d..a33bfefce 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml +++ b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml @@ -51,7 +51,7 @@ com.nvidia rapids-4-spark-ml_2.12 - 23.02.0 + 22.02.0 diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh index c77f80930..10ca8f4ed 100755 --- a/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh +++ b/examples/ML+DL-Examples/Spark-cuML/pca/spark-submit.sh @@ -15,7 +15,7 @@ # limitations under the License. 
# -ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/23.02.0/rapids-4-spark-ml_2.12-23.02.0-SNAPSHOT.jar +ML_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/22.02.0/rapids-4-spark-ml_2.12-22.02.0.jar PLUGIN_JAR=/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.02.0/rapids-4-spark_2.12-23.02.0.jar $SPARK_HOME/bin/spark-submit \ From 5a692212bd718103b0e6d36c36a750cfdc3a1f6b Mon Sep 17 00:00:00 2001 From: liyuan Date: Thu, 29 Jun 2023 10:47:05 +0800 Subject: [PATCH 7/8] merge dev to main Signed-off-by: liyuan --- examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh | 2 +- examples/UDF-Examples/Spark-cuSpatial/pom.xml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh b/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh index e1d3a4a7f..c79947967 100755 --- a/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh +++ b/examples/UDF-Examples/Spark-cuSpatial/gpu-run.sh @@ -31,7 +31,7 @@ rm -rf $DATA_OUT_PATH # the path to keep the jars of spark-rapids & spark-cuspatial JARS=$ROOT_PATH/jars -JARS_PATH=${JARS_PATH:-$JARS/rapids-4-spark_2.12-23.06.0-SNAPSHOT.jar,$JARS/spark-cuspatial-23.06.0.jar} +JARS_PATH=${JARS_PATH:-$JARS/rapids-4-spark_2.12-23.02.0.jar,$JARS/spark-cuspatial-23.02.0.jar} $SPARK_HOME/bin/spark-submit --master spark://$HOSTNAME:7077 \ --name "Gpu Spatial Join UDF" \ diff --git a/examples/UDF-Examples/Spark-cuSpatial/pom.xml b/examples/UDF-Examples/Spark-cuSpatial/pom.xml index 2048bb6f1..324236e27 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/pom.xml +++ b/examples/UDF-Examples/Spark-cuSpatial/pom.xml @@ -24,13 +24,13 @@ UDF of the cuSpatial case for the RAPIDS Accelerator The RAPIDS accelerated user defined function of the cuSpatial case for use with the RAPIDS Accelerator for Apache Spark - 23.06.0 + 23.02.0 1.8 1.8 8 - 23.06.0 + 23.02.0 2.12 3.2.0 ${project.build.directory}/cpp-build From 8e1701fd823931d4c39644c86b7cbbe8a482008d Mon Sep 17 00:00:00 2001 From: Suraj Aralihalli Date: Fri, 25 Aug 2023 10:23:21 -0700 Subject: [PATCH 8/8] updates ml jars Signed-off-by: Suraj Aralihalli --- examples/ML+DL-Examples/Spark-cuML/pca/pom.xml | 1 - examples/UDF-Examples/Spark-cuSpatial/pom.xml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml index 1e76f45cc..77e99bc31 100644 --- a/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml +++ b/examples/ML+DL-Examples/Spark-cuML/pca/pom.xml @@ -51,7 +51,6 @@ com.nvidia rapids-4-spark-ml_2.12 - 23.02.0 diff --git a/examples/UDF-Examples/Spark-cuSpatial/pom.xml b/examples/UDF-Examples/Spark-cuSpatial/pom.xml index a833ddb9b..324236e27 100644 --- a/examples/UDF-Examples/Spark-cuSpatial/pom.xml +++ b/examples/UDF-Examples/Spark-cuSpatial/pom.xml @@ -24,7 +24,7 @@ UDF of the cuSpatial case for the RAPIDS Accelerator The RAPIDS accelerated user defined function of the cuSpatial case for use with the RAPIDS Accelerator for Apache Spark - 23.08.1 + 23.02.0 1.8
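The last several patches in this series repeatedly re-pin the ML and plugin jar coordinates (release vs. `-SNAPSHOT`, 22.02.0 vs. 23.02.0 vs. 23.06/23.08), and a stale pin only surfaces when `spark-submit` fails at runtime. A small pre-flight check along the following lines can catch that earlier. This is a hypothetical helper, not shipped tooling; the paths reflect the final state of the series and should be adjusted per release.

```python
# Hypothetical pre-flight check for the jar pins these patches settle on:
# fail fast if a pinned artifact is missing from the local Maven repository.
import pathlib
import sys

# Paths mirror the final state of the patch series; adjust for your release.
PINNED_JARS = [
    "/root/.m2/repository/com/nvidia/rapids-4-spark-ml_2.12/22.02.0/"
    "rapids-4-spark-ml_2.12-22.02.0.jar",
    "/root/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.02.0/"
    "rapids-4-spark_2.12-23.02.0.jar",
]

missing = [j for j in PINNED_JARS if not pathlib.Path(j).exists()]
if missing:
    # Exiting with a message keeps the failure visible in CI logs.
    sys.exit("Missing pinned jars:\n" + "\n".join(missing))
print("All pinned jars present; safe to spark-submit.")
```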