diff --git a/docs/get-started/xgboost-examples/csp/aws/ec2.md b/docs/get-started/xgboost-examples/csp/aws/ec2.md
index b64fa7a77..0565ce601 100644
--- a/docs/get-started/xgboost-examples/csp/aws/ec2.md
+++ b/docs/get-started/xgboost-examples/csp/aws/ec2.md
@@ -177,8 +177,8 @@ spark-submit --master spark://$HOSTNAME:7077 \
         ${SAMPLE_JAR} \
         -num_workers=${NUM_EXECUTORS} \
         -format=csv \
-        -dataPath="train::s3a://spark-xgboost-mortgage-dataset/csv/train/2000Q1" \
-        -dataPath="trans::s3a://spark-xgboost-mortgage-dataset/csv/eval/2000Q1" \
+        -dataPath="train::your-train-data-path" \
+        -dataPath="trans::your-eval-data-path" \
         -numRound=100 -max_depth=8 -nthread=$NUM_EXECUTOR_CORES -showFeatures=0 \
         -tree_method=gpu_hist
 ```
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md
index 6132a7563..b41824fe2 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/standalone-python.md
@@ -12,11 +12,13 @@ Prerequisites
   * Multi-node clusters with homogenous GPU configuration
 * Software Requirements
   * Ubuntu 18.04, 20.04/CentOS7, CentOS8
-  * CUDA 11.0+
+  * CUDA 11.5+
   * NVIDIA driver compatible with your CUDA
   * NCCL 2.7.8+
-  * Python 3.6+
+  * Python 3.8 or 3.9
   * NumPy
+  * XGBoost 1.7.0+
+  * cudf-cu11  
 
 The number of GPUs in each host dictates the number of Spark executors that can run there.
 Additionally, cores per Spark executor and cores per Spark task must match, such that each executor can run 1 task at any given time.
@@ -47,6 +49,14 @@ And here are the steps to enable the GPU resources discovery for Spark 3.1+.
     spark.worker.resource.gpu.amount 1
     spark.worker.resource.gpu.discoveryScript ${SPARK_HOME}/examples/src/main/scripts/getGpusResources.sh
     ```
+3. Install the XGBoost, cudf-cu11, NumPy and scikit-learn libraries on all nodes before running the XGBoost application.
+
+``` bash
+pip install xgboost
+pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+pip install numpy
+pip install scikit-learn
+```
 
 Get Application Files, Jar and Dataset
 -------------------------------
@@ -182,6 +192,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.gpu_main
 
 # tree construction algorithm
 export TREE_METHOD=gpu_hist
+
+# if you enable archive python environment
+export PYSPARK_DRIVER_PYTHON=python
+export PYSPARK_PYTHON=./environment/bin/python
 ```
 
 Run spark-submit:
@@ -197,8 +211,9 @@ ${SPARK_HOME}/bin/spark-submit
  --driver-memory ${SPARK_DRIVER_MEMORY}                                         \
  --executor-memory ${SPARK_EXECUTOR_MEMORY}                                     \
  --conf spark.cores.max=${TOTAL_CORES}                                          \
- --jars ${RAPIDS_JAR},${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR}     \
- --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP}                   \
+ --archives your_pyspark_venv.tar.gz#environment `# only if you enabled the archived Python environment` \
+ --jars ${RAPIDS_JAR}    \
+ --py-files ${SAMPLE_ZIP}                   \
  ${MAIN_PY}                                                     \
  --mainClass=${EXAMPLE_CLASS}                                                   \
  --dataPath=train::${SPARK_XGBOOST_DIR}/mortgage/output/train/      \
@@ -261,6 +276,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.cpu_main
 
 # tree construction algorithm
 export TREE_METHOD=hist
+
+# if you enable archive python environment
+export PYSPARK_DRIVER_PYTHON=python
+export PYSPARK_PYTHON=./environment/bin/python
 ```
 
 This is the same command as for the GPU example, repeated for convenience:
@@ -271,8 +290,9 @@ ${SPARK_HOME}/bin/spark-submit
  --driver-memory ${SPARK_DRIVER_MEMORY}                                         \
  --executor-memory ${SPARK_EXECUTOR_MEMORY}                                     \
  --conf spark.cores.max=${TOTAL_CORES}                                          \
- --jars ${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR}       \
- --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP}                       \
+ --archives your_pyspark_venv.tar.gz#environment `# only if you enabled the archived Python environment` \
+ --jars ${RAPIDS_JAR}     \
+ --py-files ${SAMPLE_ZIP}                       \
  ${SPARK_PYTHON_ENTRYPOINT}                                                     \
  --mainClass=${EXAMPLE_CLASS}                                                   \
  --dataPath=train::${DATA_PATH}/mortgage/output/train/      \
diff --git a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
index 9d92da01a..f2bff0fdd 100644
--- a/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
+++ b/docs/get-started/xgboost-examples/on-prem-cluster/yarn-python.md
@@ -12,12 +12,14 @@ Prerequisites
   * Multi-node clusters with homogenous GPU configuration
 * Software Requirements
   * Ubuntu 18.04, 20.04/CentOS7, CentOS8
-  * CUDA 11.0+
+  * CUDA 11.5+
   * NVIDIA driver compatible with your CUDA
   * NCCL 2.7.8+
-  * Python 3.6+
+  * Python 3.8 or 3.9
   * NumPy
-
+  * XGBoost 1.7.0+
+  * cudf-cu11  
+  
 The number of GPUs per NodeManager dictates the number of Spark executors that can run in that NodeManager. 
 Additionally, cores per Spark executor and cores per Spark task must match, such that each executor can run 1 task at any given time.
 
@@ -32,6 +34,32 @@ We use `SPARK_HOME` environment variable to point to the Apache Spark cluster.
 And as to how to enable GPU scheduling and isolation for Yarn,
 please refer to [here](https://hadoop.apache.org/docs/r3.1.0/hadoop-yarn/hadoop-yarn-site/UsingGpus.html).
 
+Please make sure to install the XGBoost, cudf-cu11, NumPy and scikit-learn libraries on all nodes before running the XGBoost application.
+``` bash
+pip install xgboost
+pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+pip install numpy
+pip install scikit-learn
+```
+You can also create an isolated Python environment by using [Virtualenv](https://virtualenv.pypa.io/en/latest/),
+and then directly pass/unpack the archive file and enable the environment on executors
+by leveraging the --archives option or spark.archives configuration.
+``` bash
+# create an isolated python environment and install libraries
+python -m venv pyspark_venv
+source pyspark_venv/bin/activate
+pip install xgboost
+pip install cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
+pip install numpy
+pip install scikit-learn venv-pack
+venv-pack -o pyspark_venv.tar.gz
+
+# enable archive python environment on executors
+export PYSPARK_DRIVER_PYTHON=python # Do not set in cluster modes.
+export PYSPARK_PYTHON=./environment/bin/python
+spark-submit --archives pyspark_venv.tar.gz#environment app.py
+```
+
 Get Application Files, Jar and Dataset
 -------------------------------
 
@@ -114,6 +142,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.gpu_main
 
 # tree construction algorithm
 export TREE_METHOD=gpu_hist
+
+# if you enable archive python environment
+export PYSPARK_DRIVER_PYTHON=python
+export PYSPARK_PYTHON=./environment/bin/python
 ```
 
 Run spark-submit:
@@ -129,11 +161,12 @@ ${SPARK_HOME}/bin/spark-submit
  --files ${SPARK_HOME}/examples/src/main/scripts/getGpusResources.sh            \
  --master yarn                                                                  \
  --deploy-mode ${SPARK_DEPLOY_MODE}                                             \
+ --archives your_pyspark_venv.tar.gz#environment `# only if you enabled the archived Python environment` \
  --num-executors ${SPARK_NUM_EXECUTORS}                                         \
  --driver-memory ${SPARK_DRIVER_MEMORY}                                         \
  --executor-memory ${SPARK_EXECUTOR_MEMORY}                                     \
- --jars ${RAPIDS_JAR},${XGBOOST4J_JAR}        \
- --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP}                   \
+ --jars ${RAPIDS_JAR}        \
+ --py-files ${SAMPLE_ZIP}                   \
  ${MAIN_PY}                                                     \
  --mainClass=${EXAMPLE_CLASS}                                                   \
  --dataPath=train::${DATA_PATH}/mortgage/out/train/      \
@@ -190,6 +223,10 @@ export EXAMPLE_CLASS=com.nvidia.spark.examples.mortgage.cpu_main
 
 # tree construction algorithm
 export TREE_METHOD=hist
+
+# if you enable archive python environment
+export PYSPARK_DRIVER_PYTHON=python
+export PYSPARK_PYTHON=./environment/bin/python
 ```
 
 This is the same command as for the GPU example, repeated for convenience:
@@ -197,12 +234,13 @@ This is the same command as for the GPU example, repeated for convenience:
 ``` bash
 ${SPARK_HOME}/bin/spark-submit                                                  \
  --master yarn                                                                  \
+ --archives your_pyspark_venv.tar.gz#environment `# only if you enabled the archived Python environment` \
  --deploy-mode ${SPARK_DEPLOY_MODE}                                             \
  --num-executors ${SPARK_NUM_EXECUTORS}                                         \
  --driver-memory ${SPARK_DRIVER_MEMORY}                                         \
  --executor-memory ${SPARK_EXECUTOR_MEMORY}                                     \
- --jars ${XGBOOST4J_JAR},${XGBOOST4J_SPARK_JAR}                                 \
- --py-files ${XGBOOST4J_SPARK_JAR},${SAMPLE_ZIP}                                  \
+ --jars ${RAPIDS_JAR}        \
+ --py-files ${SAMPLE_ZIP}                                  \
  ${MAIN_PY}                                                     \
  --mainClass=${EXAMPLE_CLASS}                                                   \
  --dataPath=train::${DATA_PATH}/mortgage/output/train/       \
diff --git a/docs/img/guides/mortgage-perf.png b/docs/img/guides/mortgage-perf.png
index 23715ce9a..11c94865a 100644
Binary files a/docs/img/guides/mortgage-perf.png and b/docs/img/guides/mortgage-perf.png differ
diff --git a/examples/XGBoost-Examples/README.md b/examples/XGBoost-Examples/README.md
index 69a831af0..5d38f816f 100644
--- a/examples/XGBoost-Examples/README.md
+++ b/examples/XGBoost-Examples/README.md
@@ -1,19 +1,18 @@
 # Spark XGBoost Examples
 
-Spark XGBoost examples here showcase the need for end-to-end GPU acceleration.
+Spark XGBoost examples here showcase the need for ETL+Training pipeline GPU acceleration.
 The Scala based XGBoost examples here use [DMLC’s version](https://repo1.maven.org/maven2/ml/dmlc/xgboost4j-spark_2.12/).
-For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that
-uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/).
+The PySpark-based XGBoost examples require [installing RAPIDS via pip](https://rapids.ai/pip.html#install).
 Most data scientists spend a lot of time not only on
 Training models but also processing the large amounts of data needed to train these models.
-As you can see below, XGBoost training on GPUs can be up to 10X and data processing using
-RAPIDS Accelerator can also be accelerated with an end-to-end speed-up of 7X on GPU compared to CPU.
+As you can see below, PySpark+XGBoost training on GPUs can be up to 13X faster, and data processing using
+RAPIDS Accelerator can also be accelerated, with an end-to-end speed-up of 11X on GPU compared to CPU.
 In the public cloud, better performance can lead to significantly lower costs as demonstrated in this [blog](https://developer.nvidia.com/blog/gpu-accelerated-spark-xgboost/).
 
 ![mortgage-speedup](/docs/img/guides/mortgage-perf.png)
 
-Note that the test result is based on 21 years [Fannie Mea Single-Family Loan Performance Data](https://capitalmarkets.fanniemae.com/credit-risk-transfer/single-family-credit-risk-transfer/fannie-mae-single-family-loan-performance-data) 
-with a 4 A100 GPU and 512 CPU vcores cluster, the performance is affected by many aspects, 
+Note that the training test result is based on 4 years of [Fannie Mae Single-Family Loan Performance Data](https://capitalmarkets.fanniemae.com/credit-risk-transfer/single-family-credit-risk-transfer/fannie-mae-single-family-loan-performance-data) 
+with an 8 A100 GPU and 1024 CPU vcores cluster; the performance is affected by many aspects, 
 including data size and type of GPU. 
 
 In this folder, there are three blue prints for users to learn about using 
@@ -94,6 +93,9 @@ Please follow below steps to run the example notebooks in different notebook env
     - [Jupyter Notebook for Python](/docs/get-started/xgboost-examples/notebook/python-notebook.md)
     
 Note: 
+Updating the default value of `spark.sql.execution.arrow.maxRecordsPerBatch` to a larger number (such as 200000) will
+significantly improve performance by accelerating data transfer between the JVM and the Python process.
+
 For the CrossValidator job, we need to set `spark.task.resource.gpu.amount=1` to allow only 1 training task running on 1 GPU(executor),
 otherwise the customized CrossValidator may schedule more than 1 xgboost training tasks into one executor simultaneously and trigger 
 [issue-131](https://github.com/NVIDIA/spark-rapids-examples/issues/131).
diff --git a/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb b/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb
index 9d1b1e311..ad529fb37 100644
--- a/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb
+++ b/examples/XGBoost-Examples/agaricus/notebooks/python/agaricus-gpu.ipynb
@@ -9,16 +9,12 @@
     "Agaricus is an example of xgboost classifier for multiple classification. This notebook will show you how to load data, train the xgboost model.\n",
     "\n",
     "A few libraries required for this notebook:\n",
-    "  1. NumPy\n",
-    "  2. cudf jar\n",
-    "  3. xgboost4j jar\n",
-    "  4. xgboost4j-spark jar\n",
-    "  5. rapids-4-spark.jar\n",
+    "  1. cudf-cu11\n",
+    "  2. xgboost\n",
+    "  3. scikit-learn\n",
+    "  4. numpy\n",
     "  \n",
-    "This notebook also illustrates the ease of porting a sample CPU based Spark xgboost4j code into GPU. There is only one change required for running Spark XGBoost on GPU. That is replacing the API `setFeaturesCol(feature)` on CPU with the new API `setFeaturesCols(features)`. This also eliminates the need for vectorization (assembling multiple feature columns in to one column) since we can read multiple columns.\n",
-    "\n",
-    "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n",
-    "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)."
+    "This notebook also illustrates the ease of porting a sample CPU based Spark xgboost4j code into GPU. There is no change required for running Spark XGBoost on GPU because both CPU and GPU call the same API. For CPU run, we need to vectorize the trained dataset before fitting data to classifier."
    ]
   },
   {
@@ -34,12 +30,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier\n",
+    "from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel\n",
     "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
     "from pyspark.sql import SparkSession\n",
     "from pyspark.sql.types import FloatType, StructField, StructType\n",
     "from time import time\n",
-    "import os"
+    "from pyspark.conf import SparkConf\n",
+    "import os\n",
+    "# if you pass/unpack the archive file and enable the environment\n",
+    "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
+    "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\""
    ]
   },
   {
@@ -64,9 +64,66 @@
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 06:57:40,306 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
+      "2022-11-30 06:57:40,550 WARN resource.ResourceUtils: The configuration of cores (exec = 2 task = 1, runnable tasks = 2) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n",
+      "2022-11-30 06:57:54,195 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 22.10.0 using cudf 22.10.0.\n",
+      "2022-11-30 06:57:54,210 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n",
+      "2022-11-30 06:57:54,214 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n",
+      "2022-11-30 06:57:54,214 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n",
+      "2022-11-30 06:57:54,685 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n"
+     ]
+    }
+   ],
    "source": [
-    "spark = SparkSession.builder.getOrCreate()\n",
+    "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n",
+    "\n",
+    "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n",
+    "\n",
+    "# You need to update with your real hardware resource \n",
+    "driverMem = os.getenv(\"DRIVER_MEM\", \"2g\")\n",
+    "executorMem = os.getenv(\"EXECUTOR_MEM\", \"2g\")\n",
+    "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n",
+    "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n",
+    "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"2\"))\n",
+    "# Common spark settings\n",
+    "conf = SparkConf()\n",
+    "conf.setMaster(SPARK_MASTER_URL)\n",
+    "conf.setAppName(\"Microbenchmark on GPU\")\n",
+    "conf.set(\"spark.executor.instances\",\"1\")\n",
+    "conf.set(\"spark.driver.memory\", driverMem)\n",
+    "## The tasks will run on GPU memory, so there is no need to set a high host memory\n",
+    "conf.set(\"spark.executor.memory\", executorMem)\n",
+    "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n",
+    "conf.set(\"spark.executor.cores\", executorCores)\n",
+    "\n",
+    "\n",
+    "# Plugin settings\n",
+    "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n",
+    "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n",
+    "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n",
+    "# since pyspark and xgboost share the same GPU, we need to allocate some memory to xgboost to avoid GPU OOM while training \n",
+    "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.7\")\n",
+    "conf.set(\"spark.locality.wait\",\"0\")\n",
+    "##############note: only support value=1 https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n",
+    "conf.set(\"spark.task.resource.gpu.amount\", 1) \n",
+    "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n",
+    "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n",
+    "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n",
+    "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n",
+    "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n",
+    "\n",
+    "# if you pass/unpack the archive file and enable the environment\n",
+    "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n",
+    "# Create spark session\n",
+    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
+    "\n",
     "reader = spark.read"
    ]
   },
@@ -89,8 +146,17 @@
     "\n",
     "# You need to update them to your real paths!\n",
     "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
-    "train_data = reader.schema(schema).option('header', True).csv(dataRoot + '/agaricus/csv/train')\n",
-    "trans_data = reader.schema(schema).option('header', True).csv(dataRoot + '/agaricus/csv/test')"
+    "train_path = dataRoot + \"/agaricus/csv/train\"\n",
+    "eval_path = dataRoot + \"/agaricus/csv/eval\"\n",
+    "\n",
+    "data_format = 'csv'\n",
+    "has_header = 'true'\n",
+    "if data_format == 'csv':\n",
+    "    train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n",
+    "    trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n",
+    "else :\n",
+    "    train_data = reader.load(train_path)\n",
+    "    trans_data = reader.load(eval_path)"
    ]
   },
   {
@@ -127,28 +193,34 @@
    "outputs": [],
    "source": [
     "params = { \n",
-    "    'eta': 0.1,\n",
-    "    'missing': 0.0,\n",
-    "    'treeMethod': 'gpu_hist',\n",
-    "    'maxDepth': 2,\n",
-    "    'numWorkers': 1,\n",
-    "    'numRound' : 100,\n",
+    "    \"tree_method\": \"gpu_hist\",\n",
+    "    \"grow_policy\": \"depthwise\",\n",
+    "    \"num_workers\": 1,\n",
+    "    \"use_gpu\": \"true\",\n",
     "}\n",
-    "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)"
+    "params['features_col'] = features\n",
+    "params['label_col'] = label\n",
+    "    \n",
+    "classifier = SparkXGBClassifier(**params)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The CPU version classifier provides the API `setFeaturesCol` which only accepts a single column name, so vectorization for multiple feature columns is required.\n",
-    "```Python\n",
-    "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCol('features')\n",
-    "```\n",
-    "\n",
     "The parameter `num_workers` should be set to the number of GPUs in Spark cluster for GPU version, while for CPU version it is usually equal to the number of the CPU cores.\n",
     "\n",
-    "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training."
+    "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training.\n",
+    "\n",
+    "An example of CPU classifier:\n",
+    "```\n",
+    "classifier = SparkXGBClassifier(\n",
+    "  features_col=features,\n",
+    "  label_col=label,  \n",
+    "  num_workers=1024,\n",
+    "  use_gpu=False,\n",
+    ")\n",
+    "```"
    ]
   },
   {
@@ -163,11 +235,30 @@
    "execution_count": 5,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-30 07:00:45,526 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n",
+      "[Stage 5:>                                                          (0 + 1) / 1]\r"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Training takes 27.95 seconds\n"
+      "Training takes 13.92 seconds\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r",
+      "                                                                                \r",
+      "/data/home/yuanli/work/reviews/pr252/pyspark_venv_20221125/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n",
+      "  warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n"
      ]
     }
    ],
@@ -192,10 +283,26 @@
    "cell_type": "code",
    "execution_count": 6,
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.write().overwrite().save(dataRoot + '/model/agaricus')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
    "outputs": [],
    "source": [
-    "model.write().overwrite().save(dataRoot + '/new-model-path')\n",
-    "loaded_model = XGBoostClassificationModel().load(dataRoot + '/new-model-path')"
+    "loaded_model = SparkXGBClassifierModel().load(dataRoot + '/model/agaricus')"
    ]
   },
   {
@@ -207,22 +314,330 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 07:01:07,030 WARN rapids.GpuOverrides: \n",
+      "!Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#798, probability#1062]\n",
+      "  @Expression <AttributeReference> label#254 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_0#255 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_1#256 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_2#257 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_3#258 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_4#259 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_5#260 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_6#261 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_7#262 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_8#263 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_9#264 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_10#265 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_11#266 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_12#267 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_13#268 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_14#269 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_15#270 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_16#271 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_17#272 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_18#273 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_19#274 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_20#275 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_21#276 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_22#277 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_23#278 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_24#279 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_25#280 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_26#281 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_27#282 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_28#283 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_29#284 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_30#285 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_31#286 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_32#287 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_33#288 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_34#289 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_35#290 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_36#291 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_37#292 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_38#293 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_39#294 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_40#295 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_41#296 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_42#297 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_43#298 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_44#299 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_45#300 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_46#301 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_47#302 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_48#303 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_49#304 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_50#305 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_51#306 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_52#307 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_53#308 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_54#309 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_55#310 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_56#311 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_57#312 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_58#313 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_59#314 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_60#315 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_61#316 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_62#317 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_63#318 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_64#319 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_65#320 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_66#321 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_67#322 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_68#323 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_69#324 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_70#325 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_71#326 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_72#327 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_73#328 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_74#329 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_75#330 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_76#331 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_77#332 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_78#333 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_79#334 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_80#335 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_81#336 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_82#337 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_83#338 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_84#339 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_85#340 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_86#341 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_87#342 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_88#343 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_89#344 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_90#345 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_91#346 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_92#347 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_93#348 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_94#349 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_95#350 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_96#351 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_97#352 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_98#353 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_99#354 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_100#355 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_101#356 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_102#357 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_103#358 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_104#359 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_105#360 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_106#361 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_107#362 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_108#363 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_109#364 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_110#365 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_111#366 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_112#367 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_113#368 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_114#369 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_115#370 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_116#371 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_117#372 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_118#373 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_119#374 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_120#375 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_121#376 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_122#377 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_123#378 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_124#379 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_125#380 could run on GPU\n",
+      "  !Expression <Alias> UDF(pythonUDF0#1327.rawPrediction) AS rawPrediction#798 cannot run on GPU because expression Alias UDF(pythonUDF0#1327.rawPrediction) AS rawPrediction#798 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#1327.rawPrediction) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n",
+      "    !Expression <ScalaUDF> UDF(pythonUDF0#1327.rawPrediction) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3659/488666387 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#1327.rawPrediction) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "      @Expression <GetStructField> pythonUDF0#1327.rawPrediction could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#1327 could run on GPU\n",
+      "  @Expression <Alias> pythonUDF0#1327.prediction AS prediction#931 could run on GPU\n",
+      "    @Expression <GetStructField> pythonUDF0#1327.prediction could run on GPU\n",
+      "      @Expression <AttributeReference> pythonUDF0#1327 could run on GPU\n",
+      "  !Expression <Alias> UDF(pythonUDF0#1327.probability) AS probability#1062 cannot run on GPU because expression Alias UDF(pythonUDF0#1327.probability) AS probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#1327.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n",
+      "    !Expression <ScalaUDF> UDF(pythonUDF0#1327.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3659/488666387 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#1327.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "      @Expression <GetStructField> pythonUDF0#1327.probability could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#1327 could run on GPU\n",
+      "\n",
+      "2022-11-30 07:01:07,071 WARN rapids.GpuOverrides: \n",
+      "!Exec <InMemoryTableScanExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#798, probability#1062]\n",
+      "  @Expression <AttributeReference> label#254 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_0#255 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_1#256 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_2#257 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_3#258 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_4#259 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_5#260 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_6#261 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_7#262 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_8#263 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_9#264 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_10#265 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_11#266 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_12#267 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_13#268 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_14#269 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_15#270 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_16#271 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_17#272 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_18#273 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_19#274 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_20#275 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_21#276 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_22#277 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_23#278 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_24#279 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_25#280 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_26#281 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_27#282 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_28#283 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_29#284 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_30#285 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_31#286 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_32#287 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_33#288 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_34#289 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_35#290 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_36#291 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_37#292 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_38#293 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_39#294 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_40#295 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_41#296 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_42#297 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_43#298 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_44#299 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_45#300 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_46#301 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_47#302 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_48#303 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_49#304 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_50#305 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_51#306 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_52#307 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_53#308 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_54#309 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_55#310 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_56#311 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_57#312 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_58#313 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_59#314 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_60#315 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_61#316 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_62#317 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_63#318 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_64#319 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_65#320 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_66#321 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_67#322 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_68#323 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_69#324 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_70#325 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_71#326 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_72#327 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_73#328 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_74#329 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_75#330 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_76#331 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_77#332 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_78#333 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_79#334 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_80#335 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_81#336 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_82#337 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_83#338 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_84#339 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_85#340 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_86#341 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_87#342 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_88#343 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_89#344 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_90#345 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_91#346 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_92#347 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_93#348 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_94#349 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_95#350 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_96#351 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_97#352 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_98#353 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_99#354 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_100#355 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_101#356 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_102#357 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_103#358 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_104#359 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_105#360 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_106#361 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_107#362 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_108#363 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_109#364 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_110#365 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_111#366 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_112#367 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_113#368 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_114#369 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_115#370 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_116#371 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_117#372 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_118#373 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_119#374 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_120#375 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_121#376 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_122#377 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_123#378 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_124#379 could run on GPU\n",
+      "  @Expression <AttributeReference> feature_125#380 could run on GPU\n",
+      "  !Expression <AttributeReference> rawPrediction#798 cannot run on GPU because expression AttributeReference rawPrediction#798 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  @Expression <AttributeReference> prediction#931 could run on GPU\n",
+      "  !Expression <AttributeReference> probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 07:01:09,857 WARN rapids.GpuOverrides:                               \n",
+      "!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n",
+      "  @Partitioning <SinglePartition$> could run on GPU\n",
+      "  !Exec <ProjectExec> cannot run on GPU because unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062, rawPrediction#798]; not all expressions can be replaced\n",
+      "    @Expression <Alias> cast(label#254 as string) AS label#3936 could run on GPU\n",
+      "      @Expression <Cast> cast(label#254 as string) could run on GPU\n",
+      "        @Expression <AttributeReference> label#254 could run on GPU\n",
+      "    @Expression <Alias> cast(rawPrediction#798 as string) AS rawPrediction#3937 could run on GPU\n",
+      "      !Expression <Cast> cast(rawPrediction#798 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n",
+      "        !Expression <AttributeReference> rawPrediction#798 cannot run on GPU because expression AttributeReference rawPrediction#798 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "    @Expression <Alias> cast(probability#1062 as string) AS probability#3938 could run on GPU\n",
+      "      !Expression <Cast> cast(probability#1062 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n",
+      "        !Expression <AttributeReference> probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "    @Expression <Alias> cast(prediction#931 as string) AS prediction#3939 could run on GPU\n",
+      "      @Expression <Cast> cast(prediction#931 as string) could run on GPU\n",
+      "        @Expression <AttributeReference> prediction#931 could run on GPU\n",
+      "    !Exec <InMemoryTableScanExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062, rawPrediction#798]; not all expressions can be replaced\n",
+      "      @Expression <AttributeReference> label#254 could run on GPU\n",
+      "      @Expression <AttributeReference> prediction#931 could run on GPU\n",
+      "      !Expression <AttributeReference> probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "      !Expression <AttributeReference> rawPrediction#798 cannot run on GPU because expression AttributeReference rawPrediction#798 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Transformation takes 2.63 seconds\n",
+      "Transformation takes 3.26 seconds\n",
       "+-----+--------------------+--------------------+----------+\n",
       "|label|       rawPrediction|         probability|prediction|\n",
       "+-----+--------------------+--------------------+----------+\n",
-      "|  1.0|[-0.9667757749557...|[0.03322422504425...|       1.0|\n",
-      "|  0.0|[-0.0080436170101...|[0.99195638298988...|       0.0|\n",
-      "|  0.0|[-0.0080436170101...|[0.99195638298988...|       0.0|\n",
-      "|  0.0|[-0.1416745483875...|[0.85832545161247...|       0.0|\n",
-      "|  0.0|[-0.0747678577899...|[0.92523214221000...|       0.0|\n",
+      "|  1.0|[-9.6646747589111...|[6.35385513305664...|       1.0|\n",
+      "|  0.0|[-8.3923015594482...|[2.26557254791259...|       1.0|\n",
+      "|  0.0|[-8.0568389892578...|[3.16858291625976...|       1.0|\n",
+      "|  0.0|[1.91234850883483...|[0.87128275632858...|       0.0|\n",
+      "|  0.0|[-8.5582475662231...|[1.91867351531982...|       1.0|\n",
       "+-----+--------------------+--------------------+----------+\n",
       "only showing top 5 rows\n",
       "\n"
@@ -247,15 +662,54 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 07:01:10,292 WARN rapids.GpuOverrides: \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#931, label#5899, 1.0#5900, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(label,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#931 could run on GPU\n",
+      "    @Expression <AttributeReference> label#5899 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#5900 could run on GPU\n",
+      "    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n",
+      "      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n",
+      "      !Expression <AttributeReference> probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  !Expression <AttributeReference> obj#5905 cannot run on GPU because expression AttributeReference obj#5905 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "  !Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062]; unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062]\n",
+      "    @Expression <AttributeReference> prediction#931 could run on GPU\n",
+      "    @Expression <Alias> cast(label#254 as double) AS label#5899 could run on GPU\n",
+      "      @Expression <Cast> cast(label#254 as double) could run on GPU\n",
+      "        @Expression <AttributeReference> label#254 could run on GPU\n",
+      "    @Expression <Alias> 1.0 AS 1.0#5900 could run on GPU\n",
+      "      @Expression <Literal> 1.0 could run on GPU\n",
+      "    !Expression <AttributeReference> probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "    !Exec <InMemoryTableScanExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#1062]\n",
+      "      @Expression <AttributeReference> label#254 could run on GPU\n",
+      "      @Expression <AttributeReference> prediction#931 could run on GPU\n",
+      "      !Expression <AttributeReference> probability#1062 cannot run on GPU because expression AttributeReference probability#1062 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Evaluation takes 0.29 seconds\n",
-      "Accuracy is 0.9987577063864658\n"
+      "Evaluation takes 1.0 seconds\n",
+      "Accuracy is 0.9069677632722861\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r",
+      "[Stage 12:>                                                         (0 + 1) / 1]\r",
+      "\r",
+      "                                                                                \r"
      ]
     }
    ],
@@ -275,7 +729,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb
index 56463f90a..40c71669d 100644
--- a/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb
+++ b/examples/XGBoost-Examples/mortgage/notebooks/python/MortgageETL+XGBoost.ipynb
@@ -10,16 +10,13 @@
     "\n",
     "# ETL + XGBoost train & transform\n",
     "\n",
-    "This notebook is an end-to-end example of ETL + XGBoost Train & Transform by using [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) and [XGBoost](https://github.com/nvidia/spark-xgboost) with GPU accelerated.\n",
+    "This notebook is an end-to-end example of ETL + XGBoost Train & Transform using [Spark-Rapids](https://github.com/NVIDIA/spark-rapids) and [XGBoost](https://github.com/dmlc/xgboost) with GPU acceleration.\n",
     "<br>The main steps:\n",
     "1. Run ETL to generate 2 datasets for train and test<br>\n",
     "   You can choose to save the datasets or not by setting \"is_save_dataset\" to True or False.<br>\n",
     "   It means you don't need to save the dataset to disk after ETL and directly feed the dataframe to XGBoost train or transform.\n",
     "2. Run XGBoost train with the train dataset\n",
-    "3. Run XGBoost transform with the test dataset\n",
-    "\n",
-    "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n",
-    "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)."
+    "3. Run XGBoost transform with the test dataset"
    ]
   },
   {
@@ -31,10 +28,13 @@
     "import time\n",
     "import os\n",
     "from pyspark import broadcast\n",
+    "from pyspark.conf import SparkConf\n",
     "from pyspark.sql import SparkSession\n",
     "from pyspark.sql.functions import *\n",
     "from pyspark.sql.types import *\n",
-    "from pyspark.sql.window import Window"
+    "from pyspark.sql.window import Window\n",
+    "# Uncomment the line below if you ship a packed Python environment archive and unpack it for use as the PySpark interpreter\n",
+    "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\""
    ]
   },
   {
@@ -64,10 +64,47 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "spark = (SparkSession\n",
-    "    .builder\n",
-    "    .appName(\"MortgageETL+XGBoost\")\n",
-    "    .getOrCreate())"
+    "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n",
+    "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n",
+    "\n",
+    "# Update these values to match your actual hardware resources\n",
+    "driverMem = os.getenv(\"DRIVER_MEM\", \"10g\")\n",
+    "executorMem = os.getenv(\"EXECUTOR_MEM\", \"10g\")\n",
+    "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n",
+    "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n",
+    "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"4\"))\n",
+    "\n",
+    "# Common spark settings\n",
+    "conf = SparkConf()\n",
+    "conf.setMaster(SPARK_MASTER_URL)\n",
+    "conf.setAppName(\"Microbenchmark on GPU\")\n",
+    "conf.set(\"spark.driver.memory\", driverMem)\n",
+    "## The tasks will run on GPU memory, so there is no need to set a high host memory\n",
+    "conf.set(\"spark.executor.memory\", executorMem)\n",
+    "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n",
+    "conf.set(\"spark.executor.cores\", executorCores)\n",
+    "\n",
+    "# Plugin settings\n",
+    "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n",
+    "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n",
+    "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n",
+    "# Note: only a value of 1 is supported, see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n",
+    "conf.set(\"spark.task.resource.gpu.amount\", 1) \n",
+    "# Since PySpark and XGBoost share the same GPU, we need to leave some GPU memory for XGBoost to avoid GPU OOM during training\n",
+    "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.6\")\n",
+    "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n",
+    "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n",
+    "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n",
+    "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n",
+    "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n",
+    "conf.set(\"spark.jars\", RAPIDS_JAR)\n",
+    "\n",
+    "# Uncomment the line below if you ship a packed Python environment archive to be unpacked and used as the Python environment\n",
+    "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n",
+    "\n",
+    "# Create spark session\n",
+    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
+    "reader = spark.read"
    ]
   },
   {
@@ -737,9 +774,7 @@
     "spark.conf.set(\"spark.rapids.sql.explain\", \"ALL\")\n",
     "spark.conf.set(\"spark.rapids.sql.batchSizeBytes\", \"512M\")\n",
     "spark.conf.set(\"spark.rapids.sql.reader.batchSizeBytes\", \"768M\")\n",
-    "spark.conf.set(\"spark.rapids.sql.hasNans\", \"false\")\n",
-    "# use GPU to read CSV\n",
-    "spark.conf.set(\"spark.rapids.sql.csv.read.double.enabled\", \"true\")"
+    "spark.conf.set(\"spark.rapids.sql.hasNans\", \"false\")"
    ]
   },
   {
@@ -805,7 +840,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier\n",
+    "from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel\n",
     "from pyspark.ml.evaluation import MulticlassClassificationEvaluator"
    ]
   },
@@ -893,16 +928,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# This sample uses 2 workers(GPUs) to run XGBoost training \n",
+    "# This sample uses 1 worker (GPU) to run XGBoost training; adjust num_workers according to your GPU resources\n",
     "params = { \n",
-    "    \"treeMethod\": \"gpu_hist\",\n",
-    "    \"objective\":\"binary:logistic\",\n",
-    "    \"growPolicy\": \"depthwise\",\n",
-    "    \"nthread\": 1,\n",
-    "    \"numRound\": 100,\n",
-    "    \"numWorkers\": 1,\n",
+    "    \"tree_method\": \"gpu_hist\",\n",
+    "    \"grow_policy\": \"depthwise\",\n",
+    "    \"num_workers\": 1,\n",
+    "    \"use_gpu\": \"true\",\n",
     "}\n",
-    "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)"
+    "params['features_col'] = features\n",
+    "params['label_col'] = label\n",
+    "    \n",
+    "classifier = SparkXGBClassifier(**params)"
    ]
   },
   {
@@ -934,8 +970,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model.write().overwrite().save(output_path_model)\n",
-    "loaded_model = XGBoostClassificationModel().load(output_path_model)"
+    "model.write().overwrite().save(output_path_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loaded_model = SparkXGBClassifierModel().load(output_path_model)"
    ]
   },
   {
diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb
index 94a682cef..ac4e24dbe 100644
--- a/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb
+++ b/examples/XGBoost-Examples/mortgage/notebooks/python/cv-mortgage-gpu.ipynb
@@ -11,13 +11,10 @@
     "Here takes the application 'Mortgage' as an example.\n",
     "\n",
     "A few libraries are required for this notebook:\n",
-    "  1. NumPy\n",
-    "  2. cudf jar\n",
-    "  2. xgboost4j jar\n",
-    "  3. xgboost4j-spark jar\n",
-    "  \n",
-    "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n",
-    "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)."
+    "  1. cudf-cu11\n",
+    "  2. xgboost\n",
+    "  3. scikit-learn\n",
+    "  4. numpy"
    ]
   },
   {
@@ -33,21 +30,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier\n",
-    "from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator\n",
+    "from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel\n",
+    "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n",
     "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
-    "from pyspark.ml.tuning import ParamGridBuilder\n",
     "from pyspark.sql import SparkSession\n",
     "from pyspark.sql.types import FloatType, IntegerType, StructField, StructType, DoubleType\n",
+    "from pyspark.conf import SparkConf\n",
     "from time import time\n",
-    "import os"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "As shown above, here `CrossValidator` is imported from package `ml.dmlc.xgboost4j.scala.spark.rapids`, not the spark's `tuning.CrossValidator`."
+    "import os\n",
+    "# Uncomment the lines below if you distribute a packed Python environment archive that is unpacked as ./environment\n",
+    "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
+    "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\""
    ]
   },
   {
@@ -61,9 +54,62 @@
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-25 09:34:43,524 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
+      "2022-11-25 09:34:43,952 WARN resource.ResourceUtils: The configuration of cores (exec = 4 task = 1, runnable tasks = 4) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n",
+      "2022-11-25 09:34:58,155 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 22.10.0-SNAPSHOT using cudf 22.10.0-SNAPSHOT.\n",
+      "2022-11-25 09:34:58,171 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n",
+      "2022-11-25 09:34:58,175 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n",
+      "2022-11-25 09:34:58,175 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n"
+     ]
+    }
+   ],
    "source": [
-    "spark = SparkSession.builder.appName(\"mortgage-cv-gpu-python\").getOrCreate()"
+    "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n",
+    "\n",
+    "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n",
+    "\n",
+    "# Update the values below to match your actual hardware resources\n",
+    "driverMem = os.getenv(\"DRIVER_MEM\", \"2g\")\n",
+    "executorMem = os.getenv(\"EXECUTOR_MEM\", \"2g\")\n",
+    "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n",
+    "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n",
+    "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"4\"))\n",
+    "# Common spark settings\n",
+    "conf = SparkConf()\n",
+    "conf.setMaster(SPARK_MASTER_URL)\n",
+    "conf.setAppName(\"Microbenchmark on GPU\")\n",
+    "conf.set(\"spark.driver.memory\", driverMem)\n",
+    "## The tasks will run on GPU memory, so there is no need to set a high host memory\n",
+    "conf.set(\"spark.executor.memory\", executorMem)\n",
+    "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n",
+    "conf.set(\"spark.executor.cores\", executorCores)\n",
+    "\n",
+    "# Plugin settings\n",
+    "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n",
+    "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n",
+    "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n",
+    "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.7\")\n",
+    "conf.set(\"spark.locality.wait\",\"0\")\n",
+    "# NOTE: only value=1 is supported for spark.task.resource.gpu.amount; see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n",
+    "conf.set(\"spark.task.resource.gpu.amount\", 1) \n",
+    "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n",
+    "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n",
+    "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n",
+    "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n",
+    "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n",
+    "# Uncomment the line below if you distribute a packed Python environment archive to the cluster\n",
+    "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n",
+    "# Create spark session\n",
+    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
+    "\n",
+    "reader = spark.read"
    ]
   },
   {
@@ -117,8 +163,14 @@
     "train_path = dataRoot + \"/mortgage/output/train\"\n",
     "eval_path = dataRoot + \"/mortgage/output/eval\"\n",
     "\n",
-    "train_data = spark.read.parquet(train_path)\n",
-    "trans_data = spark.read.parquet(eval_path)"
+    "data_format = 'parquet'\n",
+    "has_header = 'true'\n",
+    "if data_format == 'csv':\n",
+    "    train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n",
+    "    trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n",
+    "else :\n",
+    "    train_data = reader.load(train_path)\n",
+    "    trans_data = reader.load(eval_path)"
    ]
   },
   {
@@ -134,38 +186,31 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# First build a classifier of GPU version using *setFeaturesCols* to set feature columns\n",
     "params = { \n",
-    "    'eta': 0.1,\n",
-    "    'gamma': 0.1,\n",
-    "    'missing': 0.0,\n",
-    "    'treeMethod': 'gpu_hist',\n",
-    "    'maxDepth': 10, \n",
-    "    'maxLeaves': 256,\n",
-    "    'growPolicy': 'depthwise',\n",
-    "    'objective': 'binary:logistic',\n",
-    "    'minChildWeight': 30.0,\n",
-    "    'lambda_': 1.0,\n",
-    "    'scalePosWeight': 2.0,\n",
-    "    'subsample': 1.0,\n",
-    "    'nthread': 1,\n",
-    "    'numRound': 100,\n",
-    "    'numWorkers': 1,\n",
+    "    \"tree_method\": \"gpu_hist\",\n",
+    "    \"grow_policy\": \"depthwise\",\n",
+    "    \"num_workers\": 1,\n",
+    "    \"use_gpu\": \"true\",\n",
     "}\n",
-    "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)\n",
+    "\n",
+    "params['features_col'] = features\n",
+    "params['label_col'] = label\n",
+    "    \n",
+    "classifier = SparkXGBClassifier(**params)\n",
+    "\n",
     "# Then build the evaluator and the hyperparameters\n",
     "evaluator = (MulticlassClassificationEvaluator()\n",
     "    .setLabelCol(label))\n",
     "param_grid = (ParamGridBuilder()\n",
-    "    .addGrid(classifier.maxDepth, [3, 6])\n",
-    "    .addGrid(classifier.numRound, [100, 200])\n",
+    "    .addGrid(classifier.max_depth, [3, 6])\n",
+    "    .addGrid(classifier.n_estimators, [100, 200])\n",
     "    .build())\n",
     "# Finally the corss validator\n",
     "cross_validator = (CrossValidator()\n",
     "    .setEstimator(classifier)\n",
     "    .setEvaluator(evaluator)\n",
     "    .setEstimatorParamMaps(param_grid)\n",
-    "    .setNumFolds(3))"
+    "    .setNumFolds(2))"
    ]
   },
   {
@@ -180,11 +225,242 @@
    "execution_count": 5,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-25 09:35:01,049 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "/data/home/yuanli/work/reviews/pr252/pyspark_venv_20221125/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n",
+      "  warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n",
+      "2022-11-25 09:35:26,758 WARN rapids.GpuOverrides: \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#2153, delinquency_12#2255, 1.0#2256, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#2153 could run on GPU\n",
+      "    @Expression <AttributeReference> delinquency_12#2255 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#2256 could run on GPU\n",
+      "    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n",
+      "      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n",
+      "      !Expression <AttributeReference> probability#2186 cannot run on GPU because expression AttributeReference probability#2186 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  !Expression <AttributeReference> obj#2261 cannot run on GPU because expression AttributeReference obj#2261 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "  !Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#2186]\n",
+      "    @Expression <Alias> pythonUDF0#2552.prediction AS prediction#2153 could run on GPU\n",
+      "      @Expression <GetStructField> pythonUDF0#2552.prediction could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#2552 could run on GPU\n",
+      "    @Expression <Alias> cast(delinquency_12#27 as double) AS delinquency_12#2255 could run on GPU\n",
+      "      @Expression <Cast> cast(delinquency_12#27 as double) could run on GPU\n",
+      "        @Expression <AttributeReference> delinquency_12#27 could run on GPU\n",
+      "    @Expression <Alias> 1.0 AS 1.0#2256 could run on GPU\n",
+      "      @Expression <Literal> 1.0 could run on GPU\n",
+      "    !Expression <Alias> UDF(pythonUDF0#2552.probability) AS probability#2186 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#2552.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#2552.probability) AS probability#2186 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "      !Expression <ScalaUDF> UDF(pythonUDF0#2552.probability) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#2552.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n",
+      "        @Expression <GetStructField> pythonUDF0#2552.probability could run on GPU\n",
+      "          @Expression <AttributeReference> pythonUDF0#2552 could run on GPU\n",
+      "\n",
+      "If features_cols param set, then features_col param is ignored.                 \n",
+      "2022-11-25 09:35:34,074 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#4415, delinquency_12#4517, 1.0#4518, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#4415 could run on GPU\n",
+      "    @Expression <AttributeReference> delinquency_12#4517 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#4518 could run on GPU\n",
+      "    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n",
+      "      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n",
+      "      !Expression <AttributeReference> probability#4448 cannot run on GPU because expression AttributeReference probability#4448 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  !Expression <AttributeReference> obj#4523 cannot run on GPU because expression AttributeReference obj#4523 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "  !Exec <ProjectExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#4448]; not all expressions can be replaced\n",
+      "    @Expression <Alias> pythonUDF0#4814.prediction AS prediction#4415 could run on GPU\n",
+      "      @Expression <GetStructField> pythonUDF0#4814.prediction could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#4814 could run on GPU\n",
+      "    @Expression <Alias> cast(delinquency_12#27 as double) AS delinquency_12#4517 could run on GPU\n",
+      "      @Expression <Cast> cast(delinquency_12#27 as double) could run on GPU\n",
+      "        @Expression <AttributeReference> delinquency_12#27 could run on GPU\n",
+      "    @Expression <Alias> 1.0 AS 1.0#4518 could run on GPU\n",
+      "      @Expression <Literal> 1.0 could run on GPU\n",
+      "    !Expression <Alias> UDF(pythonUDF0#4814.probability) AS probability#4448 cannot run on GPU because expression Alias UDF(pythonUDF0#4814.probability) AS probability#4448 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#4814.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n",
+      "      !Expression <ScalaUDF> UDF(pythonUDF0#4814.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#4814.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "        @Expression <GetStructField> pythonUDF0#4814.probability could run on GPU\n",
+      "          @Expression <AttributeReference> pythonUDF0#4814 could run on GPU\n",
+      "\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-25 09:35:37,859 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#6677, delinquency_12#6779, 1.0#6780, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#6677 could run on GPU\n",
+      "    @Expression <AttributeReference> delinquency_12#6779 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#6780 could run on GPU\n",
+      "    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n",
+      "      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n",
+      "      !Expression <AttributeReference> probability#6710 cannot run on GPU because expression AttributeReference probability#6710 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  !Expression <AttributeReference> obj#6785 cannot run on GPU because expression AttributeReference obj#6785 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "  !Exec <ProjectExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#6710]; not all expressions can be replaced\n",
+      "    @Expression <Alias> pythonUDF0#7076.prediction AS prediction#6677 could run on GPU\n",
+      "      @Expression <GetStructField> pythonUDF0#7076.prediction could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#7076 could run on GPU\n",
+      "    @Expression <Alias> cast(delinquency_12#27 as double) AS delinquency_12#6779 could run on GPU\n",
+      "      @Expression <Cast> cast(delinquency_12#27 as double) could run on GPU\n",
+      "        @Expression <AttributeReference> delinquency_12#27 could run on GPU\n",
+      "    @Expression <Alias> 1.0 AS 1.0#6780 could run on GPU\n",
+      "      @Expression <Literal> 1.0 could run on GPU\n",
+      "    !Expression <Alias> UDF(pythonUDF0#7076.probability) AS probability#6710 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#7076.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#7076.probability) AS probability#6710 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "      !Expression <ScalaUDF> UDF(pythonUDF0#7076.probability) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#7076.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n",
+      "        @Expression <GetStructField> pythonUDF0#7076.probability could run on GPU\n",
+      "          @Expression <AttributeReference> pythonUDF0#7076 could run on GPU\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-25 09:35:41,551 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#8939, delinquency_12#9041, 1.0#9042, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#8939 could run on GPU\n",
+      "    @Expression <AttributeReference> delinquency_12#9041 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#9042 could run on GPU\n",
+      "    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n",
+      "      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n",
+      "      !Expression <AttributeReference> probability#8972 cannot run on GPU because expression AttributeReference probability#8972 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  !Expression <AttributeReference> obj#9047 cannot run on GPU because expression AttributeReference obj#9047 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "  !Exec <ProjectExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#8972]; not all expressions can be replaced\n",
+      "    @Expression <Alias> pythonUDF0#9338.prediction AS prediction#8939 could run on GPU\n",
+      "      @Expression <GetStructField> pythonUDF0#9338.prediction could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#9338 could run on GPU\n",
+      "    @Expression <Alias> cast(delinquency_12#27 as double) AS delinquency_12#9041 could run on GPU\n",
+      "      @Expression <Cast> cast(delinquency_12#27 as double) could run on GPU\n",
+      "        @Expression <AttributeReference> delinquency_12#27 could run on GPU\n",
+      "    @Expression <Alias> 1.0 AS 1.0#9042 could run on GPU\n",
+      "      @Expression <Literal> 1.0 could run on GPU\n",
+      "    !Expression <Alias> UDF(pythonUDF0#9338.probability) AS probability#8972 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#9338.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#9338.probability) AS probability#8972 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "      !Expression <ScalaUDF> UDF(pythonUDF0#9338.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#9338.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "        @Expression <GetStructField> pythonUDF0#9338.probability could run on GPU\n",
+      "          @Expression <AttributeReference> pythonUDF0#9338 could run on GPU\n",
+      "\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-25 09:35:45,231 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#11491, delinquency_12#11593, 1.0#11594, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#11491 could run on GPU\n",
+      "    @Expression <AttributeReference> delinquency_12#11593 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#11594 could run on GPU\n",
+      "    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n",
+      "      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n",
+      "      !Expression <AttributeReference> probability#11524 cannot run on GPU because expression AttributeReference probability#11524 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  !Expression <AttributeReference> obj#11599 cannot run on GPU because expression AttributeReference obj#11599 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "  !Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#11524]\n",
+      "    @Expression <Alias> pythonUDF0#11890.prediction AS prediction#11491 could run on GPU\n",
+      "      @Expression <GetStructField> pythonUDF0#11890.prediction could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#11890 could run on GPU\n",
+      "    @Expression <Alias> cast(delinquency_12#27 as double) AS delinquency_12#11593 could run on GPU\n",
+      "      @Expression <Cast> cast(delinquency_12#27 as double) could run on GPU\n",
+      "        @Expression <AttributeReference> delinquency_12#27 could run on GPU\n",
+      "    @Expression <Alias> 1.0 AS 1.0#11594 could run on GPU\n",
+      "      @Expression <Literal> 1.0 could run on GPU\n",
+      "    !Expression <Alias> UDF(pythonUDF0#11890.probability) AS probability#11524 cannot run on GPU because expression Alias UDF(pythonUDF0#11890.probability) AS probability#11524 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#11890.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n",
+      "      !Expression <ScalaUDF> UDF(pythonUDF0#11890.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#11890.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "        @Expression <GetStructField> pythonUDF0#11890.probability could run on GPU\n",
+      "          @Expression <AttributeReference> pythonUDF0#11890 could run on GPU\n",
+      "\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-25 09:35:49,003 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#13753, delinquency_12#13855, 1.0#13856, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#13753 could run on GPU\n",
+      "    @Expression <AttributeReference> delinquency_12#13855 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#13856 could run on GPU\n",
+      "    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n",
+      "      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n",
+      "      !Expression <AttributeReference> probability#13786 cannot run on GPU because expression AttributeReference probability#13786 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  !Expression <AttributeReference> obj#13861 cannot run on GPU because expression AttributeReference obj#13861 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "  !Exec <ProjectExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#13786]; not all expressions can be replaced\n",
+      "    @Expression <Alias> pythonUDF0#14152.prediction AS prediction#13753 could run on GPU\n",
+      "      @Expression <GetStructField> pythonUDF0#14152.prediction could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#14152 could run on GPU\n",
+      "    @Expression <Alias> cast(delinquency_12#27 as double) AS delinquency_12#13855 could run on GPU\n",
+      "      @Expression <Cast> cast(delinquency_12#27 as double) could run on GPU\n",
+      "        @Expression <AttributeReference> delinquency_12#27 could run on GPU\n",
+      "    @Expression <Alias> 1.0 AS 1.0#13856 could run on GPU\n",
+      "      @Expression <Literal> 1.0 could run on GPU\n",
+      "    !Expression <Alias> UDF(pythonUDF0#14152.probability) AS probability#13786 cannot run on GPU because expression Alias UDF(pythonUDF0#14152.probability) AS probability#13786 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#14152.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n",
+      "      !Expression <ScalaUDF> UDF(pythonUDF0#14152.probability) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#14152.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n",
+      "        @Expression <GetStructField> pythonUDF0#14152.probability could run on GPU\n",
+      "          @Expression <AttributeReference> pythonUDF0#14152 could run on GPU\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-25 09:35:52,578 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#16015, delinquency_12#16117, 1.0#16118, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#16015 could run on GPU\n",
+      "    @Expression <AttributeReference> delinquency_12#16117 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#16118 could run on GPU\n",
+      "    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n",
+      "      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n",
+      "      !Expression <AttributeReference> probability#16048 cannot run on GPU because expression AttributeReference probability#16048 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  !Expression <AttributeReference> obj#16123 cannot run on GPU because expression AttributeReference obj#16123 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "  !Exec <ProjectExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#16048]; not all expressions can be replaced\n",
+      "    @Expression <Alias> pythonUDF0#16414.prediction AS prediction#16015 could run on GPU\n",
+      "      @Expression <GetStructField> pythonUDF0#16414.prediction could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#16414 could run on GPU\n",
+      "    @Expression <Alias> cast(delinquency_12#27 as double) AS delinquency_12#16117 could run on GPU\n",
+      "      @Expression <Cast> cast(delinquency_12#27 as double) could run on GPU\n",
+      "        @Expression <AttributeReference> delinquency_12#27 could run on GPU\n",
+      "    @Expression <Alias> 1.0 AS 1.0#16118 could run on GPU\n",
+      "      @Expression <Literal> 1.0 could run on GPU\n",
+      "    !Expression <Alias> UDF(pythonUDF0#16414.probability) AS probability#16048 cannot run on GPU because expression Alias UDF(pythonUDF0#16414.probability) AS probability#16048 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#16414.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n",
+      "      !Expression <ScalaUDF> UDF(pythonUDF0#16414.probability) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#16414.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n",
+      "        @Expression <GetStructField> pythonUDF0#16414.probability could run on GPU\n",
+      "          @Expression <AttributeReference> pythonUDF0#16414 could run on GPU\n",
+      "\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-25 09:35:56,267 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#18277, delinquency_12#18379, 1.0#18380, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#18277 could run on GPU\n",
+      "    @Expression <AttributeReference> delinquency_12#18379 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#18380 could run on GPU\n",
+      "    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n",
+      "      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n",
+      "      !Expression <AttributeReference> probability#18310 cannot run on GPU because expression AttributeReference probability#18310 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  !Expression <AttributeReference> obj#18385 cannot run on GPU because expression AttributeReference obj#18385 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "  !Exec <ProjectExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18310]; not all expressions can be replaced\n",
+      "    @Expression <Alias> pythonUDF0#18676.prediction AS prediction#18277 could run on GPU\n",
+      "      @Expression <GetStructField> pythonUDF0#18676.prediction could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#18676 could run on GPU\n",
+      "    @Expression <Alias> cast(delinquency_12#27 as double) AS delinquency_12#18379 could run on GPU\n",
+      "      @Expression <Cast> cast(delinquency_12#27 as double) could run on GPU\n",
+      "        @Expression <AttributeReference> delinquency_12#27 could run on GPU\n",
+      "    @Expression <Alias> 1.0 AS 1.0#18380 could run on GPU\n",
+      "      @Expression <Literal> 1.0 could run on GPU\n",
+      "    !Expression <Alias> UDF(pythonUDF0#18676.probability) AS probability#18310 cannot run on GPU because expression Alias UDF(pythonUDF0#18676.probability) AS probability#18310 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#18676.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n",
+      "      !Expression <ScalaUDF> UDF(pythonUDF0#18676.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#18676.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "        @Expression <GetStructField> pythonUDF0#18676.probability could run on GPU\n",
+      "          @Expression <AttributeReference> pythonUDF0#18676 could run on GPU\n",
+      "\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "[Stage 69:>                                                         (0 + 1) / 1]\r"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Cross-Validation takes 88.53 seconds\n"
+      "Cross-Validation takes 59.46 seconds\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r",
+      "                                                                                \r"
      ]
     }
    ],
@@ -207,22 +483,126 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-25 09:35:59,886 WARN rapids.GpuOverrides: \n",
+      "!Exec <ProjectExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#18908, probability#18974]; not all expressions can be replaced\n",
+      "  @Expression <AttributeReference> orig_channel#56 could run on GPU\n",
+      "  @Expression <AttributeReference> first_home_buyer#57 could run on GPU\n",
+      "  @Expression <AttributeReference> loan_purpose#58 could run on GPU\n",
+      "  @Expression <AttributeReference> property_type#59 could run on GPU\n",
+      "  @Expression <AttributeReference> occupancy_status#60 could run on GPU\n",
+      "  @Expression <AttributeReference> property_state#61 could run on GPU\n",
+      "  @Expression <AttributeReference> product_type#62 could run on GPU\n",
+      "  @Expression <AttributeReference> relocation_mortgage_indicator#63 could run on GPU\n",
+      "  @Expression <AttributeReference> seller_name#64 could run on GPU\n",
+      "  @Expression <AttributeReference> mod_flag#65 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_interest_rate#66 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_upb#67 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_loan_term#68 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_ltv#69 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_cltv#70 could run on GPU\n",
+      "  @Expression <AttributeReference> num_borrowers#71 could run on GPU\n",
+      "  @Expression <AttributeReference> dti#72 could run on GPU\n",
+      "  @Expression <AttributeReference> borrower_credit_score#73 could run on GPU\n",
+      "  @Expression <AttributeReference> num_units#74 could run on GPU\n",
+      "  @Expression <AttributeReference> zip#75 could run on GPU\n",
+      "  @Expression <AttributeReference> mortgage_insurance_percent#76 could run on GPU\n",
+      "  @Expression <AttributeReference> current_loan_delinquency_status#77 could run on GPU\n",
+      "  @Expression <AttributeReference> current_actual_upb#78 could run on GPU\n",
+      "  @Expression <AttributeReference> interest_rate#79 could run on GPU\n",
+      "  @Expression <AttributeReference> loan_age#80 could run on GPU\n",
+      "  @Expression <AttributeReference> msa#81 could run on GPU\n",
+      "  @Expression <AttributeReference> non_interest_bearing_upb#82 could run on GPU\n",
+      "  @Expression <AttributeReference> delinquency_12#83 could run on GPU\n",
+      "  !Expression <Alias> UDF(pythonUDF0#19041.rawPrediction) AS rawPrediction#18908 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#19041.rawPrediction) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#19041.rawPrediction) AS rawPrediction#18908 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "    !Expression <ScalaUDF> UDF(pythonUDF0#19041.rawPrediction) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#19041.rawPrediction) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "      @Expression <GetStructField> pythonUDF0#19041.rawPrediction could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#19041 could run on GPU\n",
+      "  @Expression <Alias> pythonUDF0#19041.prediction AS prediction#18942 could run on GPU\n",
+      "    @Expression <GetStructField> pythonUDF0#19041.prediction could run on GPU\n",
+      "      @Expression <AttributeReference> pythonUDF0#19041 could run on GPU\n",
+      "  !Expression <Alias> UDF(pythonUDF0#19041.probability) AS probability#18974 cannot run on GPU because expression Alias UDF(pythonUDF0#19041.probability) AS probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; input expression ScalaUDF UDF(pythonUDF0#19041.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported)\n",
+      "    !Expression <ScalaUDF> UDF(pythonUDF0#19041.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3606/1625633331 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#19041.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "      @Expression <GetStructField> pythonUDF0#19041.probability could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#19041 could run on GPU\n",
+      "\n",
+      "2022-11-25 09:35:59,893 WARN rapids.GpuOverrides: \n",
+      "!Exec <InMemoryTableScanExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#18908, probability#18974]; not all expressions can be replaced\n",
+      "  @Expression <AttributeReference> orig_channel#56 could run on GPU\n",
+      "  @Expression <AttributeReference> first_home_buyer#57 could run on GPU\n",
+      "  @Expression <AttributeReference> loan_purpose#58 could run on GPU\n",
+      "  @Expression <AttributeReference> property_type#59 could run on GPU\n",
+      "  @Expression <AttributeReference> occupancy_status#60 could run on GPU\n",
+      "  @Expression <AttributeReference> property_state#61 could run on GPU\n",
+      "  @Expression <AttributeReference> product_type#62 could run on GPU\n",
+      "  @Expression <AttributeReference> relocation_mortgage_indicator#63 could run on GPU\n",
+      "  @Expression <AttributeReference> seller_name#64 could run on GPU\n",
+      "  @Expression <AttributeReference> mod_flag#65 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_interest_rate#66 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_upb#67 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_loan_term#68 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_ltv#69 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_cltv#70 could run on GPU\n",
+      "  @Expression <AttributeReference> num_borrowers#71 could run on GPU\n",
+      "  @Expression <AttributeReference> dti#72 could run on GPU\n",
+      "  @Expression <AttributeReference> borrower_credit_score#73 could run on GPU\n",
+      "  @Expression <AttributeReference> num_units#74 could run on GPU\n",
+      "  @Expression <AttributeReference> zip#75 could run on GPU\n",
+      "  @Expression <AttributeReference> mortgage_insurance_percent#76 could run on GPU\n",
+      "  @Expression <AttributeReference> current_loan_delinquency_status#77 could run on GPU\n",
+      "  @Expression <AttributeReference> current_actual_upb#78 could run on GPU\n",
+      "  @Expression <AttributeReference> interest_rate#79 could run on GPU\n",
+      "  @Expression <AttributeReference> loan_age#80 could run on GPU\n",
+      "  @Expression <AttributeReference> msa#81 could run on GPU\n",
+      "  @Expression <AttributeReference> non_interest_bearing_upb#82 could run on GPU\n",
+      "  @Expression <AttributeReference> delinquency_12#83 could run on GPU\n",
+      "  !Expression <AttributeReference> rawPrediction#18908 cannot run on GPU because expression AttributeReference rawPrediction#18908 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  @Expression <AttributeReference> prediction#18942 could run on GPU\n",
+      "  !Expression <AttributeReference> probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "\n",
+      "2022-11-25 09:36:00,975 WARN rapids.GpuOverrides:                               \n",
+      "!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n",
+      "  @Partitioning <SinglePartition$> could run on GPU\n",
+      "  !Exec <ProjectExec> cannot run on GPU because unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974, rawPrediction#18908]; not all expressions can be replaced\n",
+      "    @Expression <Alias> cast(delinquency_12#83 as string) AS delinquency_12#19670 could run on GPU\n",
+      "      @Expression <Cast> cast(delinquency_12#83 as string) could run on GPU\n",
+      "        @Expression <AttributeReference> delinquency_12#83 could run on GPU\n",
+      "    @Expression <Alias> cast(rawPrediction#18908 as string) AS rawPrediction#19671 could run on GPU\n",
+      "      !Expression <Cast> cast(rawPrediction#18908 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n",
+      "        !Expression <AttributeReference> rawPrediction#18908 cannot run on GPU because expression AttributeReference rawPrediction#18908 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "    @Expression <Alias> cast(probability#18974 as string) AS probability#19672 could run on GPU\n",
+      "      !Expression <Cast> cast(probability#18974 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n",
+      "        !Expression <AttributeReference> probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "    @Expression <Alias> cast(prediction#18942 as string) AS prediction#19673 could run on GPU\n",
+      "      @Expression <Cast> cast(prediction#18942 as string) could run on GPU\n",
+      "        @Expression <AttributeReference> prediction#18942 could run on GPU\n",
+      "    !Exec <InMemoryTableScanExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974, rawPrediction#18908]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced\n",
+      "      @Expression <AttributeReference> delinquency_12#83 could run on GPU\n",
+      "      @Expression <AttributeReference> prediction#18942 could run on GPU\n",
+      "      !Expression <AttributeReference> probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "      !Expression <AttributeReference> rawPrediction#18908 cannot run on GPU because expression AttributeReference rawPrediction#18908 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Transforming takes 3.13 seconds\n",
+      "Transforming takes 1.15 seconds\n",
       "+--------------+--------------------+--------------------+----------+\n",
       "|delinquency_12|       rawPrediction|         probability|prediction|\n",
       "+--------------+--------------------+--------------------+----------+\n",
-      "|             0|[2.57163572311401...|[0.92901364713907...|       0.0|\n",
-      "|             0|[2.63977861404418...|[0.93337820470333...|       0.0|\n",
-      "|             0|[2.50156974792480...|[0.92425179481506...|       0.0|\n",
-      "|             0|[2.63977861404418...|[0.93337820470333...|       0.0|\n",
-      "|             0|[2.09173870086669...|[0.89009761810302...|       0.0|\n",
+      "|             0|[10.2152490615844...|[0.99996340274810...|       0.0|\n",
+      "|             0|[8.85215473175048...|[0.99985694885253...|       0.0|\n",
+      "|             0|[8.85215473175048...|[0.99985694885253...|       0.0|\n",
+      "|             0|[8.85215473175048...|[0.99985694885253...|       0.0|\n",
+      "|             0|[10.2152490615844...|[0.99996340274810...|       0.0|\n",
       "+--------------+--------------------+--------------------+----------+\n",
       "only showing top 5 rows\n",
       "\n"
@@ -247,15 +627,53 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-25 09:36:01,155 WARN rapids.GpuOverrides: \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#18942, delinquency_12#20148, 1.0#20149, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#18942 could run on GPU\n",
+      "    @Expression <AttributeReference> delinquency_12#20148 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#20149 could run on GPU\n",
+      "    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n",
+      "      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n",
+      "      !Expression <AttributeReference> probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  !Expression <AttributeReference> obj#20154 cannot run on GPU because expression AttributeReference obj#20154 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "  !Exec <ProjectExec> cannot run on GPU because unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974]\n",
+      "    @Expression <AttributeReference> prediction#18942 could run on GPU\n",
+      "    @Expression <Alias> cast(delinquency_12#83 as double) AS delinquency_12#20148 could run on GPU\n",
+      "      @Expression <Cast> cast(delinquency_12#83 as double) could run on GPU\n",
+      "        @Expression <AttributeReference> delinquency_12#83 could run on GPU\n",
+      "    @Expression <Alias> 1.0 AS 1.0#20149 could run on GPU\n",
+      "      @Expression <Literal> 1.0 could run on GPU\n",
+      "    !Expression <AttributeReference> probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "    !Exec <InMemoryTableScanExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#18974]\n",
+      "      @Expression <AttributeReference> delinquency_12#83 could run on GPU\n",
+      "      @Expression <AttributeReference> prediction#18942 could run on GPU\n",
+      "      !Expression <AttributeReference> probability#18974 cannot run on GPU because expression AttributeReference probability#18974 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "\n",
+      "[Stage 72:>                                                         (0 + 1) / 1]\r"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Evaluation takes 0.29 seconds\n",
-      "Accuracy is 0.9868033296704449\n"
+      "Evaluation takes 1.41 seconds\n",
+      "Accuracy is 1.0\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r",
+      "                                                                                \r"
      ]
     }
    ],
@@ -268,7 +686,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
diff --git a/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb
index e2c64c15e..a91911b9b 100644
--- a/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb
+++ b/examples/XGBoost-Examples/mortgage/notebooks/python/mortgage-gpu.ipynb
@@ -9,16 +9,12 @@
     "The goal of this notebook is to show how to train a XGBoost Model with Spark RAPIDS XGBoost library on GPUs. The dataset used with this notebook is derived from Fannie Mae’s Single-Family Loan Performance Data with all rights reserved by Fannie Mae. This processed dataset is redistributed with permission and consent from Fannie Mae. This notebook uses XGBoost to train 12-month mortgage loan delinquency prediction model .\n",
     "\n",
     "A few libraries required for this notebook:\n",
-    "  1. NumPy\n",
-    "  2. cudf jar\n",
-    "  3. xgboost4j jar\n",
-    "  4. xgboost4j-spark jar\n",
-    "  5. rapids-4-spark.jar\n",
+    "  1. cudf-cu11\n",
+    "  2. xgboost\n",
+    "  3. scikit-learn\n",
+    "  4. numpy\n",
     "\n",
-    "This notebook also illustrates the ease of porting a sample CPU based Spark xgboost4j code into GPU. There is only one change required for running Spark XGBoost on GPU. That is replacing the API `setFeaturesCol(feature)` on CPU with the new API `setFeaturesCols(features)`. This also eliminates the need for vectorization (assembling multiple feature columns in to one column) since we can read multiple columns.\n",
-    "\n",
-    "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n",
-    "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)."
+    "This notebook also illustrates the ease of porting sample CPU-based Spark xgboost4j code to GPU. No change is required to run Spark XGBoost on GPU, because both CPU and GPU call the same API. For a CPU run, the training dataset must be vectorized before it is fitted to the classifier."
    ]
   },
   {
@@ -34,12 +30,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier\n",
+    "import os\n",
+    "\n",
+    "# Uncomment the line below if you pass/unpack the archive file and enable the environment\n",
+    "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from xgboost.spark import SparkXGBClassifier, SparkXGBClassifierModel\n",
     "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
     "from pyspark.sql import SparkSession\n",
     "from pyspark.sql.types import FloatType, IntegerType, StructField, StructType, DoubleType\n",
-    "from time import time\n",
-    "import os"
+    "from pyspark.conf import SparkConf\n",
+    "from time import time"
    ]
   },
   {
@@ -62,11 +70,68 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
+      "22/11/24 06:14:05 WARN org.apache.spark.resource.ResourceUtils: The configuration of cores (exec = 4 task = 1, runnable tasks = 4) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n",
+      "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker\n",
+      "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster\n",
+      "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat\n",
+      "22/11/24 06:14:06 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator\n",
+      "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: RAPIDS Accelerator 22.08.0 using cudf 22.08.0.\n",
+      "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n",
+      "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n",
+      "22/11/24 06:14:07 WARN com.nvidia.spark.rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n"
+     ]
+    }
+   ],
    "source": [
-    "spark = SparkSession.builder.getOrCreate()\n",
+    "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n",
+    "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n",
+    "\n",
+    "# Update these values to match your actual hardware resources\n",
+    "driverMem = os.getenv(\"DRIVER_MEM\", \"10g\")\n",
+    "executorMem = os.getenv(\"EXECUTOR_MEM\", \"10g\")\n",
+    "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n",
+    "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n",
+    "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"4\"))\n",
+    "\n",
+    "# Common spark settings\n",
+    "conf = SparkConf()\n",
+    "conf.setMaster(SPARK_MASTER_URL)\n",
+    "conf.setAppName(\"Microbenchmark on GPU\")\n",
+    "conf.set(\"spark.driver.memory\", driverMem)\n",
+    "## The tasks will run on GPU memory, so there is no need to set a high host memory\n",
+    "conf.set(\"spark.executor.memory\", executorMem)\n",
+    "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n",
+    "conf.set(\"spark.executor.cores\", executorCores)\n",
+    "\n",
+    "# Plugin settings\n",
+    "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n",
+    "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n",
+    "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n",
+    "# Note: XGBoost only supports spark.task.resource.gpu.amount=1; see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n",
+    "conf.set(\"spark.task.resource.gpu.amount\", 1) \n",
+    "# PySpark and XGBoost share the same GPU, so we leave some GPU memory for XGBoost to avoid GPU OOM during training\n",
+    "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.6\")\n",
+    "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n",
+    "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n",
+    "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n",
+    "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n",
+    "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n",
+    "conf.set(\"spark.jars\", RAPIDS_JAR)\n",
+    "\n",
+    "# Uncomment the line below if you distribute a packed Python environment archive to be unpacked as ./environment\n",
+    "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n",
+    "\n",
+    "# Create spark session\n",
+    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
     "reader = spark.read"
    ]
   },
@@ -79,7 +144,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -121,8 +186,15 @@
     "train_path = dataRoot + \"/mortgage/output/train\"\n",
     "eval_path = dataRoot + \"/mortgage/output/eval\"\n",
     "\n",
-    "train_data = reader.parquet(train_path)\n",
-    "trans_data = reader.parquet(eval_path)"
+    "data_format = 'parquet'\n",
+    "has_header = 'true'\n",
+    "if data_format == 'csv':\n",
+    "    train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n",
+    "    trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n",
+    "else :\n",
+    "    train_data = reader.load(train_path)\n",
+    "    trans_data = reader.load(eval_path)\n",
+    "  "
    ]
   },
   {
@@ -154,42 +226,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
     "params = { \n",
-    "    'eta': 0.1,\n",
-    "    'gamma': 0.1,\n",
-    "    'missing': 0.0,\n",
-    "    'treeMethod': 'gpu_hist',\n",
-    "    'maxDepth': 10, \n",
-    "    'maxLeaves': 256,\n",
-    "    'objective':'binary:logistic',\n",
-    "    'growPolicy': 'depthwise',\n",
-    "    'minChildWeight': 30.0,\n",
-    "    'lambda_': 1.0,\n",
-    "    'scalePosWeight': 2.0,\n",
-    "    'subsample': 1.0,\n",
-    "    'nthread': 1,\n",
-    "    'numRound': 100,\n",
-    "    'numWorkers': 1,\n",
+    "    \"tree_method\": \"gpu_hist\",\n",
+    "    \"grow_policy\": \"depthwise\",\n",
+    "    \"num_workers\": 1,\n",
+    "    \"use_gpu\": \"true\",\n",
     "}\n",
-    "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCols(features)"
+    "params['features_col'] = features\n",
+    "params['label_col'] = label\n",
+    "    \n",
+    "classifier = SparkXGBClassifier(**params)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The CPU version classifier provides the API `setFeaturesCol` which only accepts a single column name, so vectorization for multiple feature columns is required.\n",
-    "```Python\n",
-    "classifier = XGBoostClassifier(**params).setLabelCol(label).setFeaturesCol('features')\n",
-    "```\n",
-    "\n",
     "The parameter `num_workers` should be set to the number of GPUs in Spark cluster for GPU version, while for CPU version it is usually equal to the number of the CPU cores.\n",
     "\n",
-    "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training."
+    "Concerning the tree method, the GPU version currently supports only `gpu_hist`, while `hist` is designed and used here for CPU training.\n",
+    "\n",
+    "An example of CPU classifier:\n",
+    "```\n",
+    "classifier = SparkXGBClassifier(\n",
+    "  features_col=features,\n",
+    "  label_col=label,  \n",
+    "  num_workers=1024,\n",
+    "  use_gpu=False,\n",
+    ")\n",
+    "```"
    ]
   },
   {
@@ -201,14 +270,42 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "22/11/24 06:14:44 WARN org.apache.spark.sql.catalyst.util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n",
+      "[Stage 12:>                                                         (0 + 1) / 1]\r"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Training takes 25.67 seconds\n"
+      "[06:15:10] WARNING: ../src/learner.cc:553: \n",
+      "  If you are loading a serialized model (like pickle in Python, RDS in R) generated by\n",
+      "  older XGBoost, please export the model by calling `Booster.save_model` from that version\n",
+      "  first, then load it back in current version. See:\n",
+      "\n",
+      "    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html\n",
+      "\n",
+      "  for more details about differences between saving model and serializing.\n",
+      "\n",
+      "Training takes 28.6 seconds\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r",
+      "                                                                                \r",
+      "/home/yuali_nvidia_com/.local/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n",
+      "  warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n"
      ]
     }
    ],
@@ -231,12 +328,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "                                                                                \r"
+     ]
+    }
+   ],
+   "source": [
+    "model.write().overwrite().save(dataRoot + '/model/mortgage')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
-    "model.write().overwrite().save(dataRoot + '/mortgage/model')\n",
-    "loaded_model = XGBoostClassificationModel().load(dataRoot + '/mortgage/model')"
+    "loaded_model = SparkXGBClassifierModel().load(dataRoot + '/model/mortgage')"
    ]
   },
   {
@@ -248,22 +362,126 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "22/11/24 06:15:13 WARN com.nvidia.spark.rapids.GpuOverrides: \n",
+      "!Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#209, probability#275]\n",
+      "  @Expression <AttributeReference> orig_channel#56 could run on GPU\n",
+      "  @Expression <AttributeReference> first_home_buyer#57 could run on GPU\n",
+      "  @Expression <AttributeReference> loan_purpose#58 could run on GPU\n",
+      "  @Expression <AttributeReference> property_type#59 could run on GPU\n",
+      "  @Expression <AttributeReference> occupancy_status#60 could run on GPU\n",
+      "  @Expression <AttributeReference> property_state#61 could run on GPU\n",
+      "  @Expression <AttributeReference> product_type#62 could run on GPU\n",
+      "  @Expression <AttributeReference> relocation_mortgage_indicator#63 could run on GPU\n",
+      "  @Expression <AttributeReference> seller_name#64 could run on GPU\n",
+      "  @Expression <AttributeReference> mod_flag#65 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_interest_rate#66 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_upb#67 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_loan_term#68 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_ltv#69 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_cltv#70 could run on GPU\n",
+      "  @Expression <AttributeReference> num_borrowers#71 could run on GPU\n",
+      "  @Expression <AttributeReference> dti#72 could run on GPU\n",
+      "  @Expression <AttributeReference> borrower_credit_score#73 could run on GPU\n",
+      "  @Expression <AttributeReference> num_units#74 could run on GPU\n",
+      "  @Expression <AttributeReference> zip#75 could run on GPU\n",
+      "  @Expression <AttributeReference> mortgage_insurance_percent#76 could run on GPU\n",
+      "  @Expression <AttributeReference> current_loan_delinquency_status#77 could run on GPU\n",
+      "  @Expression <AttributeReference> current_actual_upb#78 could run on GPU\n",
+      "  @Expression <AttributeReference> interest_rate#79 could run on GPU\n",
+      "  @Expression <AttributeReference> loan_age#80 could run on GPU\n",
+      "  @Expression <AttributeReference> msa#81 could run on GPU\n",
+      "  @Expression <AttributeReference> non_interest_bearing_upb#82 could run on GPU\n",
+      "  @Expression <AttributeReference> delinquency_12#83 could run on GPU\n",
+      "  !Expression <Alias> UDF(pythonUDF0#342.rawPrediction) AS rawPrediction#209 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#342.rawPrediction) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#342.rawPrediction) AS rawPrediction#209 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "    !Expression <ScalaUDF> UDF(pythonUDF0#342.rawPrediction) cannot run on GPU because expression ScalaUDF UDF(pythonUDF0#342.rawPrediction) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7; neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3898/645590696 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled\n",
+      "      @Expression <GetStructField> pythonUDF0#342.rawPrediction could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#342 could run on GPU\n",
+      "  @Expression <Alias> pythonUDF0#342.prediction AS prediction#243 could run on GPU\n",
+      "    @Expression <GetStructField> pythonUDF0#342.prediction could run on GPU\n",
+      "      @Expression <AttributeReference> pythonUDF0#342 could run on GPU\n",
+      "  !Expression <Alias> UDF(pythonUDF0#342.probability) AS probability#275 cannot run on GPU because input expression ScalaUDF UDF(pythonUDF0#342.probability) (org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 is not supported); expression Alias UDF(pythonUDF0#342.probability) AS probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "    !Expression <ScalaUDF> UDF(pythonUDF0#342.probability) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.functions$$$Lambda$3898/645590696 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled; expression ScalaUDF UDF(pythonUDF0#342.probability) produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "      @Expression <GetStructField> pythonUDF0#342.probability could run on GPU\n",
+      "        @Expression <AttributeReference> pythonUDF0#342 could run on GPU\n",
+      "\n",
+      "22/11/24 06:15:13 WARN com.nvidia.spark.rapids.GpuOverrides: \n",
+      "!Exec <InMemoryTableScanExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; not all expressions can be replaced; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction#209, probability#275]\n",
+      "  @Expression <AttributeReference> orig_channel#56 could run on GPU\n",
+      "  @Expression <AttributeReference> first_home_buyer#57 could run on GPU\n",
+      "  @Expression <AttributeReference> loan_purpose#58 could run on GPU\n",
+      "  @Expression <AttributeReference> property_type#59 could run on GPU\n",
+      "  @Expression <AttributeReference> occupancy_status#60 could run on GPU\n",
+      "  @Expression <AttributeReference> property_state#61 could run on GPU\n",
+      "  @Expression <AttributeReference> product_type#62 could run on GPU\n",
+      "  @Expression <AttributeReference> relocation_mortgage_indicator#63 could run on GPU\n",
+      "  @Expression <AttributeReference> seller_name#64 could run on GPU\n",
+      "  @Expression <AttributeReference> mod_flag#65 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_interest_rate#66 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_upb#67 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_loan_term#68 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_ltv#69 could run on GPU\n",
+      "  @Expression <AttributeReference> orig_cltv#70 could run on GPU\n",
+      "  @Expression <AttributeReference> num_borrowers#71 could run on GPU\n",
+      "  @Expression <AttributeReference> dti#72 could run on GPU\n",
+      "  @Expression <AttributeReference> borrower_credit_score#73 could run on GPU\n",
+      "  @Expression <AttributeReference> num_units#74 could run on GPU\n",
+      "  @Expression <AttributeReference> zip#75 could run on GPU\n",
+      "  @Expression <AttributeReference> mortgage_insurance_percent#76 could run on GPU\n",
+      "  @Expression <AttributeReference> current_loan_delinquency_status#77 could run on GPU\n",
+      "  @Expression <AttributeReference> current_actual_upb#78 could run on GPU\n",
+      "  @Expression <AttributeReference> interest_rate#79 could run on GPU\n",
+      "  @Expression <AttributeReference> loan_age#80 could run on GPU\n",
+      "  @Expression <AttributeReference> msa#81 could run on GPU\n",
+      "  @Expression <AttributeReference> non_interest_bearing_upb#82 could run on GPU\n",
+      "  @Expression <AttributeReference> delinquency_12#83 could run on GPU\n",
+      "  !Expression <AttributeReference> rawPrediction#209 cannot run on GPU because expression AttributeReference rawPrediction#209 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  @Expression <AttributeReference> prediction#243 could run on GPU\n",
+      "  !Expression <AttributeReference> probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "\n",
+      "22/11/24 06:15:28 WARN com.nvidia.spark.rapids.GpuOverrides:                    \n",
+      "!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n",
+      "  @Partitioning <SinglePartition$> could run on GPU\n",
+      "  !Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced; unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275, rawPrediction#209]\n",
+      "    @Expression <Alias> cast(delinquency_12#83 as string) AS delinquency_12#971 could run on GPU\n",
+      "      @Expression <Cast> cast(delinquency_12#83 as string) could run on GPU\n",
+      "        @Expression <AttributeReference> delinquency_12#83 could run on GPU\n",
+      "    @Expression <Alias> cast(rawPrediction#209 as string) AS rawPrediction#972 could run on GPU\n",
+      "      !Expression <Cast> cast(rawPrediction#209 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n",
+      "        !Expression <AttributeReference> rawPrediction#209 cannot run on GPU because expression AttributeReference rawPrediction#209 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "    @Expression <Alias> cast(probability#275 as string) AS probability#973 could run on GPU\n",
+      "      !Expression <Cast> cast(probability#275 as string) cannot run on GPU because Cast from org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 to StringType is not supported\n",
+      "        !Expression <AttributeReference> probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "    @Expression <Alias> cast(prediction#243 as string) AS prediction#974 could run on GPU\n",
+      "      @Expression <Cast> cast(prediction#243 as string) could run on GPU\n",
+      "        @Expression <AttributeReference> prediction#243 could run on GPU\n",
+      "    !Exec <InMemoryTableScanExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275, rawPrediction#209]; not all expressions can be replaced\n",
+      "      @Expression <AttributeReference> delinquency_12#83 could run on GPU\n",
+      "      @Expression <AttributeReference> prediction#243 could run on GPU\n",
+      "      !Expression <AttributeReference> probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "      !Expression <AttributeReference> rawPrediction#209 cannot run on GPU because expression AttributeReference rawPrediction#209 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Transformation takes 11.39 seconds\n",
+      "Transformation takes 15.62 seconds\n",
       "+--------------+--------------------+--------------------+----------+\n",
       "|delinquency_12|       rawPrediction|         probability|prediction|\n",
       "+--------------+--------------------+--------------------+----------+\n",
-      "|             0|[7.76566505432128...|[0.99957613222068...|       0.0|\n",
-      "|             0|[4.50240230560302...|[0.98903913144022...|       0.0|\n",
-      "|             0|[4.50240230560302...|[0.98903913144022...|       0.0|\n",
-      "|             0|[4.50240230560302...|[0.98903913144022...|       0.0|\n",
-      "|             0|[4.50240230560302...|[0.98903913144022...|       0.0|\n",
+      "|             0|[8.84631538391113...|[0.99985611438751...|       0.0|\n",
+      "|             0|[9.41864871978759...|[0.99991881847381...|       0.0|\n",
+      "|             0|[9.41864871978759...|[0.99991881847381...|       0.0|\n",
+      "|             0|[9.41864871978759...|[0.99991881847381...|       0.0|\n",
+      "|             0|[8.84631538391113...|[0.99985611438751...|       0.0|\n",
       "+--------------+--------------------+--------------------+----------+\n",
       "only showing top 5 rows\n",
       "\n"
@@ -288,40 +506,83 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def check_classification_accuracy(data_frame, label):\n",
+    "    accuracy = (MulticlassClassificationEvaluator()\n",
+    "                .setLabelCol(label)\n",
+    "                .evaluate(data_frame))\n",
+    "    print('-' * 100)\n",
+    "    print('Accuracy is ' + str(accuracy))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "22/11/24 06:15:28 WARN com.nvidia.spark.rapids.GpuOverrides: \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#243, delinquency_12#1450, 1.0#1449, newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize, StructField(prediction,DoubleType,true), StructField(delinquency_12,DoubleType,true), StructField(1.0,DoubleType,false), StructField(probability,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#243 could run on GPU\n",
+      "    @Expression <AttributeReference> delinquency_12#1450 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#1449 could run on GPU\n",
+      "    ! <Invoke> newInstance(class org.apache.spark.ml.linalg.VectorUDT).deserialize cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke\n",
+      "      ! <NewInstance> newInstance(class org.apache.spark.ml.linalg.VectorUDT) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.NewInstance\n",
+      "      !Expression <AttributeReference> probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "  !Expression <AttributeReference> obj#1455 cannot run on GPU because expression AttributeReference obj#1455 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "  !Exec <ProjectExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275]; unsupported data types in input: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275]; not all expressions can be replaced\n",
+      "    @Expression <AttributeReference> prediction#243 could run on GPU\n",
+      "    @Expression <Alias> cast(delinquency_12#83 as double) AS delinquency_12#1450 could run on GPU\n",
+      "      @Expression <Cast> cast(delinquency_12#83 as double) could run on GPU\n",
+      "        @Expression <AttributeReference> delinquency_12#83 could run on GPU\n",
+      "    @Expression <Alias> 1.0 AS 1.0#1449 could run on GPU\n",
+      "      @Expression <Literal> 1.0 could run on GPU\n",
+      "    !Expression <AttributeReference> probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "    !Exec <InMemoryTableScanExec> cannot run on GPU because unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [rawPrediction, probability]; unsupported data types in output: org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7 [probability#275]; not all expressions can be replaced\n",
+      "      @Expression <AttributeReference> delinquency_12#83 could run on GPU\n",
+      "      @Expression <AttributeReference> prediction#243 could run on GPU\n",
+      "      !Expression <AttributeReference> probability#275 cannot run on GPU because expression AttributeReference probability#275 produces an unsupported type org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7\n",
+      "\n",
+      "[Stage 19:>                                                         (0 + 1) / 1]\r"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Evaluation takes 1.03 seconds\n",
-      "Accuracy is 0.9876786703104035\n"
+      "----------------------------------------------------------------------------------------------------\n",
+      "Accuracy is 1.0\n",
+      "Evaluation takes 2.29 seconds\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r",
+      "                                                                                \r"
      ]
     }
    ],
    "source": [
-    "accuracy = with_benchmark(\n",
-    "    'Evaluation',\n",
-    "    lambda: MulticlassClassificationEvaluator().setLabelCol(label).evaluate(result))\n",
-    "print('Accuracy is ' + str(accuracy))"
+    "with_benchmark('Evaluation', lambda: check_classification_accuracy(result, label))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
     "spark.stop()"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/examples/XGBoost-Examples/taxi/notebooks/python/cv-taxi-gpu.ipynb b/examples/XGBoost-Examples/taxi/notebooks/python/cv-taxi-gpu.ipynb
index f7530c133..3eb2d41dc 100644
--- a/examples/XGBoost-Examples/taxi/notebooks/python/cv-taxi-gpu.ipynb
+++ b/examples/XGBoost-Examples/taxi/notebooks/python/cv-taxi-gpu.ipynb
@@ -11,13 +11,10 @@
     "Here takes the application 'Taxi' as an example.\n",
     "\n",
     "A few libraries are required for this notebook:\n",
-    "  1. NumPy\n",
-    "  2. cudf jar\n",
-    "  2. xgboost4j jar\n",
-    "  3. xgboost4j-spark jar\n",
-    "  \n",
-    "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n",
-    "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)."
+    "  1. cudf-cu11\n",
+    "  2. xgboost\n",
+    "  3. scikit-learn\n",
+    "  4. numpy"
    ]
   },
   {
@@ -33,21 +30,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from ml.dmlc.xgboost4j.scala.spark import XGBoostRegressionModel, XGBoostRegressor\n",
-    "from ml.dmlc.xgboost4j.scala.spark.rapids import CrossValidator\n",
+    "from xgboost.spark import SparkXGBRegressor, SparkXGBRegressorModel\n",
+    "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n",
     "from pyspark.ml.evaluation import RegressionEvaluator\n",
-    "from pyspark.ml.tuning import ParamGridBuilder\n",
     "from pyspark.sql import SparkSession\n",
     "from pyspark.sql.types import FloatType, IntegerType, StructField, StructType\n",
     "from time import time\n",
-    "import os"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "As shown above, here `CrossValidator` is imported from package `ml.dmlc.xgboost4j.scala.spark.rapids`, not the spark's `tuning.CrossValidator`."
+    "from pyspark.conf import SparkConf\n",
+    "import os\n",
+    "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
+    "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\""
    ]
   },
   {
@@ -61,9 +53,64 @@
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 08:02:09,748 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
+      "2022-11-30 08:02:10,103 WARN resource.ResourceUtils: The configuration of cores (exec = 2 task = 1, runnable tasks = 2) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n",
+      "2022-11-30 08:02:23,737 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 22.10.0 using cudf 22.10.0.\n",
+      "2022-11-30 08:02:23,752 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n",
+      "2022-11-30 08:02:23,756 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n",
+      "2022-11-30 08:02:23,757 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n",
+      "2022-11-30 08:02:24,226 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n"
+     ]
+    }
+   ],
    "source": [
-    "spark = SparkSession.builder.appName(\"taxi-cv-gpu-python\").getOrCreate()"
+    "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n",
+    "\n",
+    "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n",
+    "\n",
+    "# You need to update with your real hardware resource \n",
+    "driverMem = os.getenv(\"DRIVER_MEM\", \"2g\")\n",
+    "executorMem = os.getenv(\"EXECUTOR_MEM\", \"2g\")\n",
+    "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n",
+    "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n",
+    "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"2\"))\n",
+    "# Common spark settings\n",
+    "conf = SparkConf()\n",
+    "conf.setMaster(SPARK_MASTER_URL)\n",
+    "conf.setAppName(\"Microbenchmark on GPU\")\n",
+    "conf.set(\"spark.executor.instances\",\"1\")\n",
+    "conf.set(\"spark.driver.memory\", driverMem)\n",
+    "## The tasks will run on GPU memory, so there is no need to set a high host memory\n",
+    "conf.set(\"spark.executor.memory\", executorMem)\n",
+    "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n",
+    "conf.set(\"spark.executor.cores\", executorCores)\n",
+    "\n",
+    "# Plugin settings\n",
+    "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n",
+    "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n",
+    "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n",
+    "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.7\")\n",
+    "conf.set(\"spark.locality.wait\",\"0\")\n",
+    "# Note: spark.task.resource.gpu.amount only supports a value of 1, see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n",
+    "conf.set(\"spark.task.resource.gpu.amount\", 1) \n",
+    "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n",
+    "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n",
+    "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n",
+    "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n",
+    "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n",
+    "# if you pass/unpack the archive file and enable the environment\n",
+    "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n",
+    "# Create spark session\n",
+    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
+    "\n",
+    "reader = spark.read"
    ]
   },
   {
@@ -103,8 +150,17 @@
     "\n",
     "# You need to update them to your real paths!\n",
     "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
-    "train_data = spark.read.parquet(dataRoot + '/taxi/parquet/train')\n",
-    "trans_data = spark.read.parquet(dataRoot + '/taxi/parquet/eval')"
+    "train_path = dataRoot + \"/taxi/csv/train\"\n",
+    "eval_path = dataRoot + \"/taxi/csv/test\"\n",
+    "\n",
+    "data_format = 'csv'\n",
+    "has_header = 'true'\n",
+    "if data_format == 'csv':\n",
+    "    train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n",
+    "    trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n",
+    "else :\n",
+    "    train_data = reader.load(train_path)\n",
+    "    trans_data = reader.load(eval_path)"
    ]
   },
   {
@@ -121,29 +177,29 @@
    "outputs": [],
    "source": [
     "# First build a regressor of GPU version using *setFeaturesCols* to set feature columns\n",
-    "params = {\n",
-    "    'eta': 0.05,\n",
-    "    'maxDepth': 8,\n",
-    "    'subsample': 0.8,\n",
-    "    'gamma': 1.0,\n",
-    "    'numRound': 100,\n",
-    "    'numWorkers': 1,\n",
-    "    'treeMethod': 'gpu_hist',\n",
+    "params = { \n",
+    "    \"tree_method\": \"gpu_hist\",\n",
+    "    \"grow_policy\": \"depthwise\",\n",
+    "    \"num_workers\": 1,\n",
+    "    \"use_gpu\": \"true\",\n",
     "}\n",
-    "regressor = XGBoostRegressor(**params).setLabelCol(label).setFeaturesCols(features)\n",
+    "params['features_col'] = features\n",
+    "params['label_col'] = label\n",
+    "\n",
+    "regressor = SparkXGBRegressor(**params)\n",
     "# Then build the evaluator and the hyperparameters\n",
     "evaluator = (RegressionEvaluator()\n",
     "    .setLabelCol(label))\n",
     "param_grid = (ParamGridBuilder()\n",
-    "    .addGrid(regressor.maxDepth, [3, 6])\n",
-    "    .addGrid(regressor.numRound, [100, 200])\n",
+    "    .addGrid(regressor.max_depth, [3, 6])\n",
+    "    .addGrid(regressor.n_estimators, [100, 200])\n",
     "    .build())\n",
     "# Finally the corss validator\n",
     "cross_validator = (CrossValidator()\n",
     "    .setEstimator(regressor)\n",
     "    .setEvaluator(evaluator)\n",
     "    .setEstimatorParamMaps(param_grid)\n",
-    "    .setNumFolds(3))"
+    "    .setNumFolds(2))"
    ]
   },
   {
@@ -158,11 +214,108 @@
    "execution_count": 5,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "/data/home/yuanli/work/reviews/pr252/pyspark_venv_20221125/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n",
+      "  warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n",
+      "2022-11-30 08:03:14,308 WARN rapids.GpuOverrides: \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#889, fare_amount#890, 1.0#891, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#889 could run on GPU\n",
+      "    @Expression <AttributeReference> fare_amount#890 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#891 could run on GPU\n",
+      "  !Expression <AttributeReference> obj#895 cannot run on GPU because expression AttributeReference obj#895 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n",
+      "2022-11-30 08:03:14,317 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-30 08:03:20,073 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#1789, fare_amount#1790, 1.0#1791, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#1789 could run on GPU\n",
+      "    @Expression <AttributeReference> fare_amount#1790 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#1791 could run on GPU\n",
+      "  !Expression <AttributeReference> obj#1795 cannot run on GPU because expression AttributeReference obj#1795 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-30 08:03:23,687 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#2689, fare_amount#2690, 1.0#2691, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#2689 could run on GPU\n",
+      "    @Expression <AttributeReference> fare_amount#2690 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#2691 could run on GPU\n",
+      "  !Expression <AttributeReference> obj#2695 cannot run on GPU because expression AttributeReference obj#2695 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-30 08:03:27,457 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#3589, fare_amount#3590, 1.0#3591, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#3589 could run on GPU\n",
+      "    @Expression <AttributeReference> fare_amount#3590 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#3591 could run on GPU\n",
+      "  !Expression <AttributeReference> obj#3595 cannot run on GPU because expression AttributeReference obj#3595 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-30 08:03:30,964 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#4659, fare_amount#4660, 1.0#4661, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#4659 could run on GPU\n",
+      "    @Expression <AttributeReference> fare_amount#4660 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#4661 could run on GPU\n",
+      "  !Expression <AttributeReference> obj#4665 cannot run on GPU because expression AttributeReference obj#4665 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-30 08:03:34,524 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#5559, fare_amount#5560, 1.0#5561, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#5559 could run on GPU\n",
+      "    @Expression <AttributeReference> fare_amount#5560 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#5561 could run on GPU\n",
+      "  !Expression <AttributeReference> obj#5565 cannot run on GPU because expression AttributeReference obj#5565 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-30 08:03:38,067 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#6459, fare_amount#6460, 1.0#6461, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#6459 could run on GPU\n",
+      "    @Expression <AttributeReference> fare_amount#6460 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#6461 could run on GPU\n",
+      "  !Expression <AttributeReference> obj#6465 cannot run on GPU because expression AttributeReference obj#6465 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n",
+      "If features_cols param set, then features_col param is ignored.\n",
+      "2022-11-30 08:03:41,793 WARN rapids.GpuOverrides:                               \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#7359, fare_amount#7360, 1.0#7361, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#7359 could run on GPU\n",
+      "    @Expression <AttributeReference> fare_amount#7360 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#7361 could run on GPU\n",
+      "  !Expression <AttributeReference> obj#7365 cannot run on GPU because expression AttributeReference obj#7365 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "[Stage 34:>                                                         (0 + 1) / 1]\r"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Cross-Validation takes 73.77 seconds\n"
+      "Cross-Validation takes 55.19 seconds\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r",
+      "                                                                                \r"
      ]
     }
    ],
@@ -192,16 +345,32 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Transforming takes 1.33 seconds\n",
-      "+-----------+-----------------+\n",
-      "|fare_amount|       prediction|\n",
-      "+-----------+-----------------+\n",
-      "|        2.5|34.38509750366211|\n",
-      "|       45.0|37.97528839111328|\n",
-      "|        2.5|28.55727195739746|\n",
-      "|       45.0|40.39316177368164|\n",
-      "|       45.0|36.12188720703125|\n",
-      "+-----------+-----------------+\n",
+      "Transforming takes 0.23 seconds\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 08:03:45,503 WARN rapids.GpuOverrides: \n",
+      "!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n",
+      "  @Partitioning <SinglePartition$> could run on GPU\n",
+      "\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "+-----------+-----------+\n",
+      "|fare_amount| prediction|\n",
+      "+-----------+-----------+\n",
+      "|        5.0| 5.01032114|\n",
+      "|       34.0|  31.134758|\n",
+      "|       10.0|9.288980484|\n",
+      "|       16.5|15.33446312|\n",
+      "|        7.0|8.197098732|\n",
+      "+-----------+-----------+\n",
       "only showing top 5 rows\n",
       "\n"
      ]
@@ -232,8 +401,22 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Evaluation takes 0.26 seconds\n",
-      "RMSE is 3.5167114187894883\n"
+      "Evaluation takes 0.05 seconds\n",
+      "RMSE is 2.055690464034438\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 08:03:45,728 WARN rapids.GpuOverrides: \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#7645, fare_amount#8271, 1.0#8272, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#7645 could run on GPU\n",
+      "    @Expression <AttributeReference> fare_amount#8271 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#8272 could run on GPU\n",
+      "  !Expression <AttributeReference> obj#8276 cannot run on GPU because expression AttributeReference obj#8276 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n"
      ]
     }
    ],
diff --git a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb
index 3fdfa540a..da2c41c3f 100644
--- a/examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb
+++ b/examples/XGBoost-Examples/taxi/notebooks/python/taxi-gpu.ipynb
@@ -4,21 +4,17 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Introduction to XGBoost Spark3.0 with GPU\n",
+    "# Introduction to XGBoost Spark3.1 with GPU\n",
     "\n",
     "Taxi is an example of xgboost regressor. This notebook will show you how to load data, train the xgboost model and use this model to predict \"fare_amount\" of your taxi trip.\n",
     "\n",
     "A few libraries required for this notebook:\n",
-    "  1. NumPy\n",
-    "  2. cudf jar\n",
-    "  3. xgboost4j jar\n",
-    "  4. xgboost4j-spark jar\n",
-    "  5. rapids-4-spark.jar  \n",
+    "  1. cudf-cu11\n",
+    "  2. xgboost\n",
+    "  3. scikit-learn\n",
+    "  4. numpy\n",
     "\n",
-    "This notebook also illustrates the ease of porting a sample CPU based Spark xgboost4j code into GPU. There is only one change required for running Spark XGBoost on GPU. That is replacing the API `setFeaturesCol(feature)` on CPU with the new API `setFeaturesCols(features)`. This also eliminates the need for vectorization (assembling multiple feature columns in to one column) since we can read multiple columns.\n",
-    "\n",
-    "Note: For PySpark based XGBoost, please refer to the [Spark-RAPIDS-examples 22.04 branch](https://github.com/NVIDIA/spark-rapids-examples/tree/branch-22.04) that\n",
-    "uses [NVIDIA’s Spark XGBoost version](https://repo1.maven.org/maven2/com/nvidia/xgboost4j-spark_3.0/1.4.2-0.3.0/)."
+    "This notebook also illustrates the ease of porting sample CPU-based Spark xgboost4j code to the GPU. No change is required to run Spark XGBoost on the GPU because the CPU and GPU versions call the same API. For a CPU run, the training dataset must be vectorized before it is fitted to the regressor."
    ]
   },
   {
@@ -34,12 +30,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from ml.dmlc.xgboost4j.scala.spark import XGBoostRegressionModel, XGBoostRegressor\n",
+    "from xgboost.spark import SparkXGBRegressor, SparkXGBRegressorModel\n",
     "from pyspark.ml.evaluation import RegressionEvaluator\n",
     "from pyspark.sql import SparkSession\n",
     "from pyspark.sql.types import FloatType, IntegerType, StructField, StructType\n",
     "from time import time\n",
-    "import os"
+    "from pyspark.conf import SparkConf\n",
+    "import os\n",
+    "# if you pass/unpack the archive file and enable the environment\n",
+    "# os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
+    "# os.environ['PYSPARK_DRIVER_PYTHON'] = \"./environment/bin/python\""
    ]
   },
   {
@@ -62,11 +62,67 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 07:51:19,104 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
+      "Setting default log level to \"WARN\".\n",
+      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
+      "2022-11-30 07:51:19,480 WARN resource.ResourceUtils: The configuration of cores (exec = 2 task = 1, runnable tasks = 2) will result in wasted resources due to resource gpu limiting the number of runnable tasks per executor to: 1. Please adjust your configuration.\n",
+      "2022-11-30 07:51:33,277 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator 22.10.0 using cudf 22.10.0.\n",
+      "2022-11-30 07:51:33,292 WARN rapids.RapidsPluginUtils: spark.rapids.sql.multiThreadedRead.numThreads is set to 20.\n",
+      "2022-11-30 07:51:33,295 WARN rapids.RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.\n",
+      "2022-11-30 07:51:33,295 WARN rapids.RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the GPU.\n",
+      "2022-11-30 07:51:33,798 WARN yarn.Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.\n"
+     ]
+    }
+   ],
    "source": [
-    "spark = SparkSession.builder.getOrCreate()\n",
+    "SPARK_MASTER_URL = os.getenv(\"SPARK_MASTER_URL\", \"/your-url\")\n",
+    "\n",
+    "RAPIDS_JAR = os.getenv(\"RAPIDS_JAR\", \"/your-jar-path\")\n",
+    "\n",
+    "# You need to update with your real hardware resource \n",
+    "driverMem = os.getenv(\"DRIVER_MEM\", \"2g\")\n",
+    "executorMem = os.getenv(\"EXECUTOR_MEM\", \"2g\")\n",
+    "pinnedPoolSize = os.getenv(\"PINNED_POOL_SIZE\", \"2g\")\n",
+    "concurrentGpuTasks = os.getenv(\"CONCURRENT_GPU_TASKS\", \"2\")\n",
+    "executorCores = int(os.getenv(\"EXECUTOR_CORES\", \"2\"))\n",
+    "# Common spark settings\n",
+    "conf = SparkConf()\n",
+    "conf.setMaster(SPARK_MASTER_URL)\n",
+    "conf.setAppName(\"Microbenchmark on GPU\")\n",
+    "conf.set(\"spark.executor.instances\",\"1\")\n",
+    "conf.set(\"spark.driver.memory\", driverMem)\n",
+    "## The tasks will run on GPU memory, so there is no need to set a high host memory\n",
+    "conf.set(\"spark.executor.memory\", executorMem)\n",
+    "## The tasks will run on GPU cores, so there is no need to use many cpu cores\n",
+    "conf.set(\"spark.executor.cores\", executorCores)\n",
+    "\n",
+    "# Plugin settings\n",
+    "conf.set(\"spark.executor.resource.gpu.amount\", \"1\")\n",
+    "conf.set(\"spark.rapids.sql.concurrentGpuTasks\", concurrentGpuTasks)\n",
+    "conf.set(\"spark.rapids.memory.pinnedPool.size\", pinnedPoolSize)\n",
+    "# since pyspark and xgboost share the same GPU, we need to allocate some memory to xgboost to avoid GPU OOM while training \n",
+    "conf.set(\"spark.rapids.memory.gpu.allocFraction\",\"0.7\")\n",
+    "conf.set(\"spark.locality.wait\",\"0\")\n",
+    "# Note: spark.task.resource.gpu.amount only supports a value of 1, see https://github.com/dmlc/xgboost/blame/master/python-package/xgboost/spark/core.py#L370-L374\n",
+    "conf.set(\"spark.task.resource.gpu.amount\", 1) \n",
+    "conf.set(\"spark.rapids.sql.enabled\", \"true\") \n",
+    "conf.set(\"spark.plugins\", \"com.nvidia.spark.SQLPlugin\")\n",
+    "conf.set(\"spark.sql.cache.serializer\",\"com.nvidia.spark.ParquetCachedBatchSerializer\")\n",
+    "conf.set(\"spark.driver.extraClassPath\", RAPIDS_JAR)\n",
+    "conf.set(\"spark.executor.extraClassPath\", RAPIDS_JAR)\n",
+    "\n",
+    "# if you pass/unpack the archive file and enable the environment\n",
+    "# conf.set(\"spark.yarn.dist.archives\", \"your_pyspark_venv.tar.gz#environment\")\n",
+    "# Create spark session\n",
+    "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
+    "\n",
     "reader = spark.read"
    ]
   },
@@ -79,7 +135,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -106,8 +162,17 @@
     "\n",
     "# You need to update them to your real paths!\n",
     "dataRoot = os.getenv(\"DATA_ROOT\", \"/data\")\n",
-    "train_data = reader.schema(schema).option('header', True).csv(dataRoot + '/taxi/csv/train')\n",
-    "trans_data  = reader.schema(schema).option('header', True).csv(dataRoot + '/taxi/csv/test')"
+    "train_path = dataRoot + \"/taxi/csv/train\"\n",
+    "eval_path = dataRoot + \"/taxi/csv/test\"\n",
+    "\n",
+    "data_format = 'csv'\n",
+    "has_header = 'true'\n",
+    "if data_format == 'csv':\n",
+    "    train_data = reader.schema(schema).option('header',has_header).csv(train_path)\n",
+    "    trans_data = reader.schema(schema).option('header',has_header).csv(eval_path)\n",
+    "else :\n",
+    "    train_data = reader.load(train_path)\n",
+    "    trans_data = reader.load(eval_path)"
    ]
   },
   {
@@ -139,34 +204,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "params = { \n",
-    "    'eta': 0.05,\n",
-    "    'treeMethod': 'gpu_hist',\n",
-    "    'maxDepth': 8,\n",
-    "    'subsample': 0.8,\n",
-    "    'gamma': 1.0,\n",
-    "    'numRound': 100,\n",
-    "    'numWorkers': 1,\n",
+    "    \"tree_method\": \"gpu_hist\",\n",
+    "    \"grow_policy\": \"depthwise\",\n",
+    "    \"num_workers\": 1,\n",
+    "    \"use_gpu\": \"true\",\n",
     "}\n",
-    "regressor = XGBoostRegressor(**params).setLabelCol(label).setFeaturesCols(features)"
+    "params['features_col'] = features\n",
+    "params['label_col'] = label\n",
+    "    \n",
+    "regressor = SparkXGBRegressor(**params)"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The CPU version regressor provides the API `setFeaturesCol` which only accepts a single column name, so vectorization for multiple feature columns is required.\n",
-    "```Python\n",
-    "regressor = XGBoostRegressor(**params).setLabelCol(label).setFeaturesCol('features')\n",
-    "```\n",
-    "\n",
     "The parameter `num_workers` should be set to the number of GPUs in Spark cluster for GPU version, while for CPU version it is usually equal to the number of the CPU cores.\n",
     "\n",
-    "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training.\n"
+    "Concerning the tree method, GPU version only supports `gpu_hist` currently, while `hist` is designed and used here for CPU training.\n",
+    "\n",
+    "An example of CPU classifier:\n",
+    "```\n",
+    "classifier = SparkXGBClassifier(\n",
+    "  features_col=features,\n",
+    "  label_col=label,  \n",
+    "  num_workers=1024,\n",
+    "  use_gpu=False,\n",
+    ")\n",
+    "```"
    ]
   },
   {
@@ -178,16 +248,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
    "metadata": {
     "scrolled": true
    },
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n",
+      "[Stage 2:>                                                          (0 + 1) / 1]\r"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Training takes 17.73 seconds\n"
+      "Training takes 24.08 seconds\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\r",
+      "                                                                                \r",
+      "/data/home/yuanli/work/reviews/pr252/pyspark_venv_20221125/lib/python3.8/site-packages/xgboost/sklearn.py:808: UserWarning: Loading a native XGBoost model with Scikit-Learn interface.\n",
+      "  warnings.warn(\"Loading a native XGBoost model with Scikit-Learn interface.\")\n"
      ]
     }
    ],
@@ -210,12 +298,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "If features_cols param set, then features_col param is ignored.\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.write().overwrite().save(dataRoot + '/model/taxi')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
-    "model.write().overwrite().save(dataRoot + '/new-model-path')\n",
-    "loaded_model = XGBoostRegressionModel().load(dataRoot + '/new-model-path')"
+    "loaded_model = SparkXGBRegressorModel().load(dataRoot + '/model/taxi')"
    ]
   },
   {
@@ -227,25 +331,48 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
    "metadata": {
     "scrolled": false
    },
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 07:52:27,357 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Transformation takes 0.93 seconds\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 07:52:28,189 WARN rapids.GpuOverrides: \n",
+      "!Exec <CollectLimitExec> cannot run on GPU because the Exec CollectLimitExec has been disabled, and is disabled by default because Collect Limit replacement can be slower on the GPU, if huge number of rows in a batch it could help by limiting the number of rows transferred from GPU to CPU. Set spark.rapids.sql.exec.CollectLimitExec to true if you wish to enable it\n",
+      "  @Partitioning <SinglePartition$> could run on GPU\n",
+      "\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Transformation takes 2.55 seconds\n",
-      "+------------+---------------+-------------+-----------+------------------+\n",
-      "|   vendor_id|passenger_count|trip_distance|fare_amount|        prediction|\n",
-      "+------------+---------------+-------------+-----------+------------------+\n",
-      "|1.55973043E9|            1.0|          1.1|        6.2| 5.670516490936279|\n",
-      "|1.55973043E9|            4.0|          2.7|        9.4|10.054250717163086|\n",
-      "|1.55973043E9|            1.0|          1.5|        6.1|  7.01417350769043|\n",
-      "|1.55973043E9|            1.0|          4.1|       12.6|14.309316635131836|\n",
-      "|1.55973043E9|            1.0|          4.6|       13.4|13.990922927856445|\n",
-      "+------------+---------------+-------------+-----------+------------------+\n",
+      "+--------------+---------------+-------------+-----------+-----------+\n",
+      "|     vendor_id|passenger_count|trip_distance|fare_amount| prediction|\n",
+      "+--------------+---------------+-------------+-----------+-----------+\n",
+      "|1.559730432E09|            2.0|  0.699999988|        5.0|5.046935558|\n",
+      "|1.559730432E09|            3.0|  10.69999981|       34.0|31.72706413|\n",
+      "|1.559730432E09|            1.0|  2.299999952|       10.0|9.294451714|\n",
+      "|1.559730432E09|            1.0|  4.400000095|       16.5|15.05233097|\n",
+      "|1.559730432E09|            1.0|          1.5|        7.0|8.995832443|\n",
+      "+--------------+---------------+-------------+-----------+-----------+\n",
       "only showing top 5 rows\n",
       "\n"
      ]
@@ -276,7 +403,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 9,
    "metadata": {
     "scrolled": true
    },
@@ -285,8 +412,22 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Evaluation takes 0.45 seconds\n",
-      "RMSE is 3.3195416959403032\n"
+      "Evaluation takes 0.22 seconds\n",
+      "RMSE is 1.9141528471228921\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2022-11-30 07:52:28,580 WARN rapids.GpuOverrides: \n",
+      "! <DeserializeToObjectExec> cannot run on GPU because not all expressions can be replaced; GPU does not currently support the operator class org.apache.spark.sql.execution.DeserializeToObjectExec\n",
+      "  ! <CreateExternalRow> createexternalrow(prediction#87, fare_amount#728, 1.0#729, StructField(prediction,DoubleType,true), StructField(fare_amount,DoubleType,true), StructField(1.0,DoubleType,false)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow\n",
+      "    @Expression <AttributeReference> prediction#87 could run on GPU\n",
+      "    @Expression <AttributeReference> fare_amount#728 could run on GPU\n",
+      "    @Expression <AttributeReference> 1.0#729 could run on GPU\n",
+      "  !Expression <AttributeReference> obj#733 cannot run on GPU because expression AttributeReference obj#733 produces an unsupported type ObjectType(interface org.apache.spark.sql.Row)\n",
+      "\n"
      ]
     }
    ],
@@ -306,7 +447,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [