Commit

Add cluster startup script
rishic3 committed Nov 25, 2024
1 parent a6281f3 commit 533c590
Showing 2 changed files with 73 additions and 4 deletions.
17 changes: 13 additions & 4 deletions examples/ML+DL-Examples/Optuna-Spark/README.md
@@ -44,12 +44,12 @@ We provide two implementations with differences in how data is passed to workers:
- Make sure your [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/tutorial.html) is configured for your Databricks workspace.
- Copy the desired Python script into your Databricks workspace, for example:
```shell
-databricks workspace import /path/to/directory/in/workspace \
+databricks workspace import /path/in/workspace/to/sparkrapids-xgboost-read-per-worker.py \
--format AUTO --file sparkrapids-xgboost-read-per-worker.py
```
-- Copy the corresponding init script ```databricks/init_optuna.sh``` or ```databricks/init_optuna_xgboost.sh```, for example:
+- Copy the init script ```databricks/init_optuna_xgboost.sh```:
```shell
-databricks workspace import /path/to/directory/in/workspace \
+databricks workspace import /path/in/workspace/to/init_optuna_xgboost.sh \
--format AUTO --file databricks/init_optuna_xgboost.sh
```
- (For XGBOOST example): Upload the [Wine Qualities](https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv) dataset via the Databricks CLI:
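  The exact upload command is collapsed in this diff; a hedged sketch using the Databricks CLI's `fs cp` command (the DBFS destination path here is an assumption, not taken from the repo):
  ```shell
  # Download the dataset locally, then copy it into DBFS.
  # The destination path below is hypothetical; match whatever path the example script reads.
  curl -LO https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
  databricks fs cp winequality-red.csv dbfs:/FileStore/datasets/winequality-red.csv
  ```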
@@ -62,10 +62,19 @@ We provide two implementations with differences in how data is passed to workers:

### 2. Create cluster

Run the cluster startup script, which is configured to create an 8-node GPU cluster:
```shell
export INIT_PATH=/path/in/workspace/to/init_optuna_xgboost.sh
cd databricks
chmod +x start_cluster.sh
./start_cluster.sh
```

Or, create a cluster via the web UI:
- Go to `Compute > Create compute` and set the desired cluster settings.
- Under `Advanced Options > Init Scripts`, upload the init script from your workspace.
- Under `Advanced Options > Spark > Environment variables`, set `LIBCUDF_CUFILE_POLICY=OFF`.
-- For XGBOOST examples: Make sure to use a GPU cluster and include task GPU resources.
+- Make sure to use a GPU cluster and include task GPU resources.

The init script will install the required libraries on all nodes, including rapids/cuml for data operations on GPU. On the driver, it will set up the MySQL server backend and create an Optuna study referencing the server.
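As a rough illustration of those driver-side steps (the study name, credentials, and database name below are assumptions for the sketch, not the init script's actual values), the setup amounts to something like:

```shell
# Hypothetical values for illustration only; the init script defines its own.
STUDY_NAME="optuna-study"
STORAGE_URL="mysql://optuna_user:optuna_pass@localhost/optuna"

# Start the MySQL backend on the driver node.
sudo service mysql start

# Register a study in the MySQL backend; workers reference it via STORAGE_URL.
optuna create-study --study-name "$STUDY_NAME" --storage "$STORAGE_URL" --direction minimize
```

Workers then attach to the same study by loading it from the shared storage URL, which is how trials are distributed across the cluster.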

60 changes: 60 additions & 0 deletions examples/ML+DL-Examples/Optuna-Spark/databricks/start_cluster.sh
@@ -0,0 +1,60 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.

if [[ -z ${INIT_PATH} ]]; then
echo "Please export INIT_PATH per README.md"
exit 1
fi

json_config=$(cat <<EOF
{
"cluster_name": "optuna-xgboost-gpu",
"spark_version": "13.3.x-gpu-ml-scala2.12",
"spark_conf": {
"spark.task.resource.gpu.amount": "1",
"spark.executorEnv.PYTHONPATH": "/databricks/jars/rapids-4-spark_2.12-24.10.1.jar:/databricks/spark/python:/databricks/python3",
"spark.sql.execution.arrow.maxRecordsPerBatch": "10000",
"spark.executor.cores": "8",
"spark.rapids.memory.gpu.minAllocFraction": "0.0001",
"spark.plugins": "com.nvidia.spark.SQLPlugin",
"spark.locality.wait": "0s",
"spark.sql.cache.serializer": "com.nvidia.spark.ParquetCachedBatchSerializer",
"spark.rapids.memory.gpu.pooling.enabled": "false",
"spark.executor.resource.gpu.amount": "1",
"spark.sql.execution.sortBeforeRepartition": "false",
"spark.rapids.sql.python.gpu.enabled": "true",
"spark.rapids.memory.pinnedPool.size": "2G",
"spark.executor.instances": "8",
"spark.task.maxFailures": "1",
"spark.python.daemon.module": "rapids.daemon_databricks",
"spark.rapids.ml.uvm.enabled": "true",
"spark.rapids.sql.batchSizeBytes": "512m",
"spark.sql.adaptive.enabled": "false",
"spark.rapids.sql.format.parquet.reader.type": "MULTITHREADED",
"spark.sql.execution.arrow.pyspark.enabled": "true",
"spark.rapids.sql.format.parquet.multiThreadedRead.maxNumFilesParallel": "20",
"spark.sql.files.maxPartitionBytes": "512m",
"spark.rapids.sql.multiThreadedRead.numThreads": "20",
"spark.rapids.sql.concurrentGpuTasks": "2"
},
"node_type_id": "Standard_NC8as_T4_v3",
"driver_node_type_id": "Standard_NC8as_T4_v3",
"spark_env_vars": {
"LIBCUDF_CUFILE_POLICY": "OFF"
},
"autotermination_minutes": 60,
"enable_elastic_disk": true,
"init_scripts": [
{
"workspace": {
"destination": "${INIT_PATH}"
}
}
],
"runtime_engine": "STANDARD",
"num_workers": 8
}
EOF
)

databricks clusters create --json "$json_config"
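Because the config is assembled as a here-document, a stray trailing comma or unexpanded variable silently produces invalid JSON that the API rejects. A quick local sanity check before submitting (a sketch, assuming `python3` is on the PATH; the trimmed-down config stands in for the full one above):

```shell
# Validate a JSON config string locally before calling the Databricks API.
json_config='{"cluster_name": "optuna-xgboost-gpu", "num_workers": 8}'
if echo "$json_config" | python3 -m json.tool > /dev/null 2>&1; then
    echo "config OK"
else
    echo "invalid JSON" >&2
fi
```

Running the same check against a string with a trailing comma (e.g. `'{"num_workers": 8,}'`) reports `invalid JSON`, which is the failure mode this guards against.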
