From d02d0431fb9356d5144ec9f80cf646fcbf6f84c5 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Mon, 21 Oct 2024 15:35:40 +0800 Subject: [PATCH] [xgboost] Fix eval dataset issues --- .../notebooks/scala/agaricus-gpu.ipynb | 20 ++++---------- .../nvidia/spark/examples/agaricus/Main.scala | 3 ++- .../notebooks/scala/mortgage-gpu.ipynb | 20 ++++---------- .../nvidia/spark/examples/mortgage/Main.scala | 7 +++-- .../taxi/notebooks/scala/taxi-gpu.ipynb | 22 +++++----------- .../com/nvidia/spark/examples/taxi/Main.scala | 7 +++-- .../spark/examples/utility/XGBoostArgs.scala | 26 +++++++++---------- 7 files changed, 37 insertions(+), 68 deletions(-) diff --git a/examples/XGBoost-Examples/agaricus/notebooks/scala/agaricus-gpu.ipynb b/examples/XGBoost-Examples/agaricus/notebooks/scala/agaricus-gpu.ipynb index 06efa2ba9..eb4cae8e5 100644 --- a/examples/XGBoost-Examples/agaricus/notebooks/scala/agaricus-gpu.ipynb +++ b/examples/XGBoost-Examples/agaricus/notebooks/scala/agaricus-gpu.ipynb @@ -316,23 +316,13 @@ "## Benchmark and train\n", "The object `benchmark` is used to compute the elapsed time of some operations.\n", "\n", - "Training with evaluation sets is also supported in 2 ways, the same as CPU version's behavior:\n", + "Training with evaluation dataset is also supported, the same as CPU version's behavior:\n", "\n", - "* Call API `setEvalSets` after initializing an XGBoostClassifier\n", + "* Call API `setEvalDataset` after initializing an XGBoostClassifier\n", "\n", "```scala\n", - "xgbClassifier.setEvalSets(Map(\"eval\" -> evalSet))\n", - "\n", - "```\n", - "\n", - "* Use parameter `eval_sets` when initializing an XGBoostClassifier\n", - "\n", - "```scala\n", - "val paramMapWithEval = paramMap + (\"eval_sets\" -> Map(\"eval\" -> evalSet))\n", - "val xgbClassifierWithEval = new XGBoostClassifier(paramMapWithEval)\n", - "```\n", - "\n", - "Here chooses the API way to set evaluation sets." 
+ "xgbClassifier.setEvalDataset(evalSet)\n", + "```" ] }, { @@ -352,7 +342,7 @@ } ], "source": [ - "xgbClassifier.setEvalSets(Map(\"eval\" -> evalSet))" + "xgbClassifier.setEvalDataset(evalSet)" ] }, { diff --git a/examples/XGBoost-Examples/agaricus/scala/src/com/nvidia/spark/examples/agaricus/Main.scala b/examples/XGBoost-Examples/agaricus/scala/src/com/nvidia/spark/examples/agaricus/Main.scala index d81f38a43..b9baa8548 100644 --- a/examples/XGBoost-Examples/agaricus/scala/src/com/nvidia/spark/examples/agaricus/Main.scala +++ b/examples/XGBoost-Examples/agaricus/scala/src/com/nvidia/spark/examples/agaricus/Main.scala @@ -64,13 +64,14 @@ object Main { // build XGBoost classifier val paramMap = xgboostArgs.xgboostParams(Map( "objective" -> "binary:logistic", - "eval_sets" -> datasets(1).map(ds => Map("eval" -> ds)).getOrElse(Map.empty) )) val xgbClassifier = new XGBoostClassifier(paramMap) .setLabelCol(labelName) // === diff === .setFeaturesCol(featureCols) + datasets(1).foreach(ds => xgbClassifier.setEvalDataset(ds)) + println("\n------ Training ------") val (model, _) = benchmark.time("train") { xgbClassifier.fit(datasets(0).get) diff --git a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb index cf61395c9..57ee76235 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb @@ -304,23 +304,13 @@ "## Benchmark and train\n", "The object `benchmark` is used to compute the elapsed time of some operations.\n", "\n", - "Training with evaluation sets is also supported in 2 ways, the same as CPU version's behavior:\n", + "Training with evaluation dataset is also supported, the same as CPU version's behavior:\n", "\n", - "* Call API `setEvalSets` after initializing an XGBoostClassifier\n", + "* Call API `setEvalDataset` after initializing an XGBoostClassifier\n", "\n", "```scala\n",
- "xgbClassifier.setEvalSets(Map(\"eval\" -> evalSet))\n", - "\n", - "```\n", - "\n", - "* Use parameter `eval_sets` when initializing an XGBoostClassifier\n", - "\n", - "```scala\n", - "val paramMapWithEval = paramMap + (\"eval_sets\" -> Map(\"eval\" -> evalSet))\n", - "val xgbClassifierWithEval = new XGBoostClassifier(paramMapWithEval)\n", - "```\n", - "\n", - "Here chooses the API way to set evaluation sets." + "xgbClassifier.setEvalDataset(evalSet)\n", + "```" ] }, { @@ -340,7 +330,7 @@ } ], "source": [ - "xgbClassifier.setEvalSets(Map(\"eval\" -> evalSet))" + "xgbClassifier.setEvalDataset(evalSet)" ] }, { diff --git a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Main.scala b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Main.scala index 32c11eea6..edd273aa6 100644 --- a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Main.scala +++ b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Main.scala @@ -52,14 +52,13 @@ object Main extends Mortgage { val xgbClassificationModel = if (appArgs.isToTrain) { // build XGBoost classifier - val xgbParamFinal = appArgs.xgboostParams(commParamMap + - // Add train-eval dataset if specified - ("eval_sets" -> datasets(1).map(ds => Map("eval" -> ds)).getOrElse(Map.empty)) - ) + val xgbParamFinal = appArgs.xgboostParams(commParamMap) val xgbClassifier = new XGBoostClassifier(xgbParamFinal) .setLabelCol(labelColName) .setFeaturesCol(featureNames) + datasets(1).foreach(ds => xgbClassifier.setEvalDataset(ds)) + // Start training println("\n------ Training ------") // Shall we not log the time if it is abnormal, which is usually caused by training failure diff --git a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-gpu.ipynb b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-gpu.ipynb index 58dd84eb0..2a6253a8c 100644 --- a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-gpu.ipynb +++
b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-gpu.ipynb @@ -320,23 +320,13 @@ "## Benchmark and train\n", "The object `benchmark` is used to compute the elapsed time of some operations.\n", "\n", - "Training with evaluation sets is also supported in 2 ways, the same as CPU version's behavior:\n", + "Training with evaluation dataset is also supported, the same as CPU version's behavior:\n", "\n", - "* Call API `setEvalSets` after initializing an XGBoostRegressor\n", + "* Call API `setEvalDataset` after initializing an XGBoostRegressor\n", "\n", "```scala\n", - "xgbRegressor.setEvalSets(Map(\"eval\" -> evalSet))\n", - "\n", - "```\n", - "\n", - "* Use parameter `eval_sets` when initializing an XGBoostRegressor\n", - "\n", - "```scala\n", - "val paramMapWithEval = paramMap + (\"eval_sets\" -> Map(\"eval\" -> evalSet))\n", - "val xgbRegressorWithEval = new XGBoostRegressor(paramMapWithEval)\n", - "```\n", - "\n", - "Here chooses the API way to set evaluation sets." + "xgbRegressor.setEvalDataset(evalSet)\n", + "```" ] }, { @@ -356,7 +346,7 @@ } ], "source": [ - "xgbRegressor.setEvalSets(Map(\"eval\" -> evalSet))" + "xgbRegressor.setEvalDataset(evalSet)" ] }, { @@ -609,4 +599,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/XGBoost-Examples/taxi/scala/src/com/nvidia/spark/examples/taxi/Main.scala b/examples/XGBoost-Examples/taxi/scala/src/com/nvidia/spark/examples/taxi/Main.scala index 698d8cf56..e05017f79 100644 --- a/examples/XGBoost-Examples/taxi/scala/src/com/nvidia/spark/examples/taxi/Main.scala +++ b/examples/XGBoost-Examples/taxi/scala/src/com/nvidia/spark/examples/taxi/Main.scala @@ -58,14 +58,13 @@ object Main extends Taxi { val xgbRegressionModel = if (xgboostArgs.isToTrain) { // build XGBoost XGBoostRegressor - val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap + - // Add train-eval dataset if specified - ("eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty)) - ) + val
xgbParamFinal = xgboostArgs.xgboostParams(commParamMap) val xgbRegressor = new XGBoostRegressor(xgbParamFinal) .setLabelCol(labelColName) .setFeaturesCol(featureNames) + datasets(1).foreach(ds => xgbRegressor.setEvalDataset(ds)) + println("\n------ Training ------") // Shall we not log the time if it is abnormal, which is usually caused by training failure val (model, _) = benchmark.time("train") { diff --git a/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala b/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala index 2e8cf0fc8..a47256601 100644 --- a/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala +++ b/examples/XGBoost-Examples/utility/scala/src/com/nvidia/spark/examples/utility/XGBoostArgs.scala @@ -191,19 +191,6 @@ class XGBoostArgs private[utility] ( def numFold: Int = appArgsMap.get("numFold").asInstanceOf[Option[Int]].getOrElse(3) - def xgboostParams(otherParams: Map[String, Any] = Map.empty): Map[String, Any] = { - val params = otherParams ++ xgbArgsMap.map{ - case (name, value) if !name.contains('_') => - (CaseFormat.LOWER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, name), value) - case (name, value) => (name, value) - } - - val hostIp = params.getOrElse("rabit_tracker_host", "").toString - if (!hostIp.isEmpty) { - params ++ Map("rabitTrackerHostIp" -> hostIp) - } else params - } - /** * getDataPaths check and get train/eval/transform paths * @return Array(train_paths, eval_paths, transform_paths) @@ -239,4 +226,17 @@ class XGBoostArgs private[utility] ( evalPaths.map(_.stripPrefix(prefixes(1))), transformPaths.map(_.stripPrefix(prefixes(2)))) } + + def xgboostParams(otherParams: Map[String, Any] = Map.empty): Map[String, Any] = { + val params = otherParams ++ xgbArgsMap.map{ + case (name, value) if !name.contains('_') => + (CaseFormat.LOWER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, name), value) + case (name, value) => (name,
value) + } + + val hostIp = params.getOrElse("rabit_tracker_host", "").toString + if (!hostIp.isEmpty) { + params ++ Map("rabitTrackerHostIp" -> hostIp) + } else params + } }