From 8015ba40c7ef71c976e73007c32248a37232bb82 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Tue, 22 Oct 2024 22:08:48 +0800 Subject: [PATCH] [xgboost] Fix eval dataset issues (#450) Signed-off-by: Bobby Wang --- .../notebooks/scala/agaricus-gpu.ipynb | 20 +++++------------ .../nvidia/spark/examples/agaricus/Main.scala | 3 ++- .../notebooks/scala/mortgage-gpu.ipynb | 20 +++++------------ .../nvidia/spark/examples/mortgage/Main.scala | 7 +++--- .../taxi/notebooks/scala/taxi-gpu.ipynb | 22 +++++-------------- .../com/nvidia/spark/examples/taxi/Main.scala | 7 +++--- 6 files changed, 24 insertions(+), 55 deletions(-) diff --git a/examples/XGBoost-Examples/agaricus/notebooks/scala/agaricus-gpu.ipynb b/examples/XGBoost-Examples/agaricus/notebooks/scala/agaricus-gpu.ipynb index 06efa2ba9..eb4cae8e5 100644 --- a/examples/XGBoost-Examples/agaricus/notebooks/scala/agaricus-gpu.ipynb +++ b/examples/XGBoost-Examples/agaricus/notebooks/scala/agaricus-gpu.ipynb @@ -316,23 +316,13 @@ "## Benchmark and train\n", "The object `benchmark` is used to compute the elapsed time of some operations.\n", "\n", - "Training with evaluation sets is also supported in 2 ways, the same as CPU version's behavior:\n", + "Training with evaluation dataset is also supported, the same as CPU version's behavior:\n", "\n", - "* Call API `setEvalSets` after initializing an XGBoostClassifier\n", + "* Call API `setEvalDataset` after initializing an XGBoostClassifier\n", "\n", "```scala\n", - "xgbClassifier.setEvalSets(Map(\"eval\" -> evalSet))\n", - "\n", - "```\n", - "\n", - "* Use parameter `eval_sets` when initializing an XGBoostClassifier\n", - "\n", - "```scala\n", - "val paramMapWithEval = paramMap + (\"eval_sets\" -> Map(\"eval\" -> evalSet))\n", - "val xgbClassifierWithEval = new XGBoostClassifier(paramMapWithEval)\n", - "```\n", - "\n", - "Here chooses the API way to set evaluation sets." 
+ "xgbClassifier.setEvalDataset(evalSet)\n", + "```" ] }, { @@ -352,7 +342,7 @@ } ], "source": [ - "xgbClassifier.setEvalSets(Map(\"eval\" -> evalSet))" + "xgbClassifier.setEvalDataset(evalSet)" ] }, { diff --git a/examples/XGBoost-Examples/agaricus/scala/src/com/nvidia/spark/examples/agaricus/Main.scala b/examples/XGBoost-Examples/agaricus/scala/src/com/nvidia/spark/examples/agaricus/Main.scala index d81f38a43..b9baa8548 100644 --- a/examples/XGBoost-Examples/agaricus/scala/src/com/nvidia/spark/examples/agaricus/Main.scala +++ b/examples/XGBoost-Examples/agaricus/scala/src/com/nvidia/spark/examples/agaricus/Main.scala @@ -64,13 +64,14 @@ object Main { // build XGBoost classifier val paramMap = xgboostArgs.xgboostParams(Map( "objective" -> "binary:logistic", - "eval_sets" -> datasets(1).map(ds => Map("eval" -> ds)).getOrElse(Map.empty) )) val xgbClassifier = new XGBoostClassifier(paramMap) .setLabelCol(labelName) // === diff === .setFeaturesCol(featureCols) + datasets(1).foreach(ds => xgbClassifier.setEvalDataset(ds)) + println("\n------ Training ------") val (model, _) = benchmark.time("train") { xgbClassifier.fit(datasets(0).get) diff --git a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb index cf61395c9..57ee76235 100644 --- a/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb +++ b/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage-gpu.ipynb @@ -304,23 +304,13 @@ "## Benchmark and train\n", "The object `benchmark` is used to compute the elapsed time of some operations.\n", "\n", - "Training with evaluation sets is also supported in 2 ways, the same as CPU version's behavior:\n", + "Training with evaluation dataset is also supported, the same as CPU version's behavior:\n", "\n", - "* Call API `setEvalSets` after initializing an XGBoostClassifier\n", + "* Call API `setEvalDataset` after initializing an XGBoostClassifier\n", "\n", "```scala\n",
- "xgbClassifier.setEvalSets(Map(\"eval\" -> evalSet))\n", - "\n", - "```\n", - "\n", - "* Use parameter `eval_sets` when initializing an XGBoostClassifier\n", - "\n", - "```scala\n", - "val paramMapWithEval = paramMap + (\"eval_sets\" -> Map(\"eval\" -> evalSet))\n", - "val xgbClassifierWithEval = new XGBoostClassifier(paramMapWithEval)\n", - "```\n", - "\n", - "Here chooses the API way to set evaluation sets." + "xgbClassifier.setEvalDataset(evalSet)\n", + "```" ] }, { @@ -340,7 +330,7 @@ } ], "source": [ - "xgbClassifier.setEvalSets(Map(\"eval\" -> evalSet))" + "xgbClassifier.setEvalDataset(evalSet)" ] }, { diff --git a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Main.scala b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Main.scala index 32c11eea6..edd273aa6 100644 --- a/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Main.scala +++ b/examples/XGBoost-Examples/mortgage/scala/src/com/nvidia/spark/examples/mortgage/Main.scala @@ -52,14 +52,13 @@ object Main extends Mortgage { val xgbClassificationModel = if (appArgs.isToTrain) { // build XGBoost classifier - val xgbParamFinal = appArgs.xgboostParams(commParamMap + - // Add train-eval dataset if specified - ("eval_sets" -> datasets(1).map(ds => Map("eval" -> ds)).getOrElse(Map.empty)) - ) + val xgbParamFinal = appArgs.xgboostParams(commParamMap) val xgbClassifier = new XGBoostClassifier(xgbParamFinal) .setLabelCol(labelColName) .setFeaturesCol(featureNames) + datasets(1).foreach(ds => xgbClassifier.setEvalDataset(ds)) + // Start training println("\n------ Training ------") // Shall we not log the time if it is abnormal, which is usually caused by training failure diff --git a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-gpu.ipynb b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-gpu.ipynb index 58dd84eb0..2a6253a8c 100644 --- a/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-gpu.ipynb +++
b/examples/XGBoost-Examples/taxi/notebooks/scala/taxi-gpu.ipynb @@ -320,23 +320,13 @@ "## Benchmark and train\n", "The object `benchmark` is used to compute the elapsed time of some operations.\n", "\n", - "Training with evaluation sets is also supported in 2 ways, the same as CPU version's behavior:\n", + "Training with evaluation dataset is also supported, the same as CPU version's behavior:\n", "\n", - "* Call API `setEvalSets` after initializing an XGBoostRegressor\n", + "* Call API `setEvalDataset` after initializing an XGBoostRegressor\n", "\n", "```scala\n", - "xgbRegressor.setEvalSets(Map(\"eval\" -> evalSet))\n", - "\n", - "```\n", - "\n", - "* Use parameter `eval_sets` when initializing an XGBoostRegressor\n", - "\n", - "```scala\n", - "val paramMapWithEval = paramMap + (\"eval_sets\" -> Map(\"eval\" -> evalSet))\n", - "val xgbRegressorWithEval = new XGBoostRegressor(paramMapWithEval)\n", - "```\n", - "\n", - "Here chooses the API way to set evaluation sets." + "xgbRegressor.setEvalDataset(evalSet)\n", + "```" ] }, { @@ -356,7 +346,7 @@ } ], "source": [ - "xgbRegressor.setEvalSets(Map(\"eval\" -> evalSet))" + "xgbRegressor.setEvalDataset(evalSet)" ] }, { @@ -609,4 +599,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/examples/XGBoost-Examples/taxi/scala/src/com/nvidia/spark/examples/taxi/Main.scala b/examples/XGBoost-Examples/taxi/scala/src/com/nvidia/spark/examples/taxi/Main.scala index 698d8cf56..e05017f79 100644 --- a/examples/XGBoost-Examples/taxi/scala/src/com/nvidia/spark/examples/taxi/Main.scala +++ b/examples/XGBoost-Examples/taxi/scala/src/com/nvidia/spark/examples/taxi/Main.scala @@ -58,14 +58,13 @@ object Main extends Taxi { val xgbRegressionModel = if (xgboostArgs.isToTrain) { // build XGBoost XGBoostRegressor - val xgbParamFinal = xgboostArgs.xgboostParams(commParamMap + - // Add train-eval dataset if specified - ("eval_sets" -> datasets(1).map(ds => Map("test" -> ds)).getOrElse(Map.empty)) - ) + val
xgbParamFinal = xgboostArgs.xgboostParams(commParamMap) val xgbRegressor = new XGBoostRegressor(xgbParamFinal) .setLabelCol(labelColName) .setFeaturesCol(featureNames) + datasets(1).foreach(ds => xgbRegressor.setEvalDataset(ds)) + println("\n------ Training ------") // Shall we not log the time if it is abnormal, which is usually caused by training failure val (model, _) = benchmark.time("train") {